Skip to content
This repository was archived by the owner on May 29, 2023. It is now read-only.

Handle fancy escapes in character classes #12

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 35 additions & 10 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

//! A regex parser yielding an AST.

use regex::escape;
use bit_set::BitSet;
use std::str::FromStr;
use std::usize;
Expand Down Expand Up @@ -199,7 +200,13 @@ impl<'a> Parser<'a> {
}
)),
b'(' => self.parse_group(ix, depth),
b'\\' => self.parse_escape(ix),
b'\\' => {
let (next, expr) = try!(self.parse_escape(ix));
if let Expr::Backref(group) = expr {
self.backrefs.insert(group);
}
Ok((next, expr))
},
b'+' | b'*' | b'?' | b'|' | b')' =>
Ok((ix, Expr::Empty)),
b'[' => self.parse_class(ix),
Expand All @@ -221,7 +228,7 @@ impl<'a> Parser<'a> {
}

// ix points to \ character
fn parse_escape(&mut self, ix: usize) -> Result<(usize, Expr)> {
fn parse_escape(&self, ix: usize) -> Result<(usize, Expr)> {
if ix + 1 == self.re.len() {
return Err(Error::TrailingBackslash);
}
Expand All @@ -233,7 +240,6 @@ impl<'a> Parser<'a> {
if let Some((end, group)) = parse_decimal(self.re, ix + 1) {
// protect BitSet against unreasonably large value
if group < self.re.len() / 2 {
self.backrefs.insert(group);
return Ok((end, Expr::Backref(group)));
}
}
Expand Down Expand Up @@ -331,9 +337,9 @@ impl<'a> Parser<'a> {
fn parse_class(&self, ix: usize) -> Result<(usize, Expr)> {
let bytes = self.re.as_bytes();
let mut ix = ix + 1; // skip opening '['
let mut inner = String::new();
let mut class = String::new();
let mut nest = 1;
inner.push('[');
class.push('[');
loop {
ix = self.optional_whitespace(ix);
if ix == self.re.len() {
Expand All @@ -344,27 +350,46 @@ impl<'a> Parser<'a> {
if ix + 1 == self.re.len() {
return Err(Error::InvalidClass);
}
ix + 1 + codepoint_len(bytes[ix + 1])

// We support more escapes than regex, so parse it ourselves before delegating.
let (end, expr) = try!(self.parse_escape(ix));
match expr {
Expr::Literal { val, .. } => {
class.push_str(&escape(&val));
}
Expr::Delegate { inner, .. } => {
class.push_str(&inner);
}
_ => {
return Err(Error::InvalidClass);
}
}
end
}
b'[' => {
nest += 1;
class.push('[');
ix + 1
}
b']' => {
nest -= 1;
if nest == 0 {
break;
}
class.push(']');
ix + 1
}
b => ix + codepoint_len(b)
b => {
let end = ix + codepoint_len(b);
class.push_str(&self.re[ix..end]);
end
}
};
inner.push_str(&self.re[ix..end]);
ix = end;
}
inner.push(']');
class.push(']');
let ix = ix + 1; // skip closing ']'
Ok((ix, Expr::Delegate { inner: inner, size: 1 }))
Ok((ix, Expr::Delegate { inner: class, size: 1 }))
}

fn parse_group(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
Expand Down
19 changes: 19 additions & 0 deletions tests/matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@ fn control_character_escapes() {
assert_matches(r"\v", "\x0B");
}

#[test]
fn character_class_escapes() {
    // Table of (pattern, text) pairs; every pattern is a character class
    // containing a single escape, and must match the paired text.
    let cases = [
        // Escapes of characters that appear literally inside the class.
        (r"[\[]", "["),
        (r"[\^]", "^"),
        // The regex crate would reject the next three because escaping these
        // characters is unnecessary. Other engines allow escaping any
        // non-alphanumeric character.
        (r"[\<]", "<"),
        (r"[\>]", ">"),
        (r"[\.]", "."),
        // Character class escape nested inside a bracketed class.
        (r"[\d]", "1"),
        // Control character escapes.
        (r"[\e]", "\x1B"),
        (r"[\n]", "\x0A"),
    ];

    // Cases run in declaration order, so the first failing pair is the one reported.
    for &(pattern, text) in cases.iter() {
        assert_matches(pattern, text);
    }
}


fn assert_matches(re: &str, text: &str) {
let parse_result = Regex::new(re);
Expand Down