Skip to content

Commit 7ffd92e

Browse files
committed
Escapes are done.
Char class handling seems OK now.
1 parent e6359f4 commit 7ffd92e

File tree

2 files changed

+258
-9
lines changed

2 files changed

+258
-9
lines changed

regex_syntax/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,12 +356,14 @@ pub enum ErrorKind {
356356
UnclosedHex,
357357
UnclosedParen,
358358
UnclosedRepeat,
359+
UnclosedUnicodeName,
359360
UnexpectedEscapeEof,
360361
UnexpectedFlagEof,
361362
UnexpectedTwoDigitHexEof,
362363
UnopenedParen,
363364
UnrecognizedEscape(char),
364365
UnrecognizedFlag(char),
366+
UnrecognizedUnicodeClass(String),
365367
}
366368

367369
/// Returns the Unicode *simple* case folding of `c`.

regex_syntax/src/parser.rs

Lines changed: 256 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ impl Parser {
6464
while !self.eof() {
6565
let c = self.cur();
6666
let build_expr = match c {
67-
'\\' => try!(self.parse_escape()),
67+
'\\' => {
68+
let bexpr = try!(self.parse_escape());
69+
self.maybe_class_case_fold(bexpr)
70+
}
6871
'|' => { let e = try!(self.alternate()); self.bump(); e }
6972
'?' => try!(self.parse_simple_repeat(Repeater::ZeroOrOne)),
7073
'*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)),
@@ -150,8 +153,7 @@ impl Parser {
150153
}
151154
'd'|'s'|'w'|'D'|'S'|'W' => {
152155
self.bump();
153-
self.parse_perl_class(c, c == 'D' || c == 'S' || c == 'W')
154-
.map(|cls| Build::Expr(Expr::Class(cls)))
156+
Ok(Build::Expr(Expr::Class(self.parse_perl_class(c))))
155157
}
156158
c => Err(self.err(ErrorKind::UnrecognizedEscape(c))),
157159
}
@@ -463,11 +465,43 @@ impl Parser {
463465
//
464466
// `neg` is true when the class name is used with `\P`.
465467
fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> {
466-
Ok(CharClass::new(vec![]))
468+
let name =
469+
if self.bump_if('{') {
470+
let n = self.bump_get(|c| c != '}').unwrap_or("".into());
471+
if n.is_empty() || !self.bump_if('}') {
472+
// e.g., \p{Greek (missing `}`) or \p{} (empty name)
473+
return Err(self.err(ErrorKind::UnclosedUnicodeName));
474+
}
475+
n
476+
} else {
477+
if self.eof() {
478+
// e.g., \p
479+
return Err(self.err(ErrorKind::UnexpectedEscapeEof));
480+
}
481+
self.bump().to_string()
482+
};
483+
match unicode_class(&name) {
484+
None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))),
485+
Some(cls) => if neg { Ok(cls.negate()) } else { Ok(cls) },
486+
}
467487
}
468488

469-
fn parse_perl_class(&mut self, c: char, neg: bool) -> Result<CharClass> {
470-
Ok(CharClass::new(vec![]))
489+
// Parses a perl character class with Unicode support.
490+
//
491+
// `name` must be one of d, s, w, D, S, W. If not, this function panics.
492+
//
493+
// No parser state is changed.
494+
fn parse_perl_class(&mut self, name: char) -> CharClass {
495+
use unicode::regex::{PERLD, PERLS, PERLW};
496+
match name {
497+
'd' => raw_class_to_expr(PERLD),
498+
'D' => raw_class_to_expr(PERLD).negate(),
499+
's' => raw_class_to_expr(PERLS),
500+
'S' => raw_class_to_expr(PERLS).negate(),
501+
'w' => raw_class_to_expr(PERLW),
502+
'W' => raw_class_to_expr(PERLW).negate(),
503+
_ => unreachable!(),
504+
}
471505
}
472506

473507
// Always bump to the next input and return the given expression as a
@@ -532,6 +566,22 @@ impl Parser {
532566
Some(Build::Expr(e)) => Ok(e),
533567
}
534568
}
569+
570+
// If the current context calls for case insensitivity and if the expr
571+
// given is a character class, do case folding on it and return the new
572+
// class.
573+
//
574+
// Otherwise, return the expression unchanged.
575+
fn maybe_class_case_fold(&mut self, bexpr: Build) -> Build {
576+
match bexpr {
577+
Build::Expr(Expr::Class(cls)) => {
578+
Build::Expr(Expr::Class(
579+
if self.flags.casei { cls.case_fold() } else { cls }
580+
))
581+
}
582+
bexpr => bexpr,
583+
}
584+
}
535585
}
536586

537587
// Auxiliary methods for manipulating the expression stack.
@@ -774,14 +824,19 @@ fn is_hex(c: char) -> bool {
774824

775825
fn unicode_class(name: &str) -> Option<CharClass> {
776826
UNICODE_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| {
777-
let range = |&(s, e)| ClassRange { start: s, end: e };
778-
CharClass::new(UNICODE_CLASSES[i].1.iter().map(range).collect())
827+
raw_class_to_expr(UNICODE_CLASSES[i].1)
779828
})
780829
}
781830

831+
fn raw_class_to_expr(raw: &[(char, char)]) -> CharClass {
832+
let range = |&(s, e)| ClassRange { start: s, end: e };
833+
CharClass::new(raw.iter().map(range).collect())
834+
}
835+
782836
#[cfg(test)]
783837
mod tests {
784-
use { Expr, Repeater, Error, ErrorKind };
838+
use { CharClass, ClassRange, Expr, Repeater, Error, ErrorKind };
839+
use unicode::regex::{PERLD, PERLS, PERLW};
785840
use super::Parser;
786841

787842
fn p(s: &str) -> Expr { Parser::parse(s).unwrap() }
@@ -791,6 +846,26 @@ mod tests {
791846
fn b<T>(v: T) -> Box<T> { Box::new(v) }
792847
fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) }
793848

849+
fn class(ranges: &[(char, char)]) -> CharClass {
850+
CharClass::new(ranges.iter().cloned().map(|(c1, c2)| {
851+
if c1 <= c2 {
852+
ClassRange::new(c1, c2)
853+
} else {
854+
ClassRange::new(c2, c1)
855+
}
856+
}).collect())
857+
}
858+
859+
fn classi(ranges: &[(char, char)]) -> CharClass {
860+
CharClass::new_casei(ranges.iter().cloned().map(|(c1, c2)| {
861+
if c1 <= c2 {
862+
ClassRange::new(c1, c2)
863+
} else {
864+
ClassRange::new(c2, c1)
865+
}
866+
}).collect())
867+
}
868+
794869
#[test]
795870
fn empty() {
796871
assert_eq!(p(""), Expr::Empty);
@@ -1201,6 +1276,137 @@ mod tests {
12011276
assert_eq!(p(r"\x{2603}"), lit('\u{2603}'));
12021277
}
12031278

1279+
#[test]
1280+
fn escape_unicode_name() {
1281+
assert_eq!(p(r"\p{Yi}"), Expr::Class(class(&[
1282+
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}')
1283+
])));
1284+
}
1285+
1286+
#[test]
1287+
fn escape_unicode_letter() {
1288+
assert_eq!(p(r"\pZ"), Expr::Class(class(&[
1289+
('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
1290+
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
1291+
('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
1292+
('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
1293+
])));
1294+
}
1295+
1296+
#[test]
1297+
fn escape_unicode_name_case_fold() {
1298+
assert_eq!(p(r"(?i)\p{Yi}"), Expr::Class(class(&[
1299+
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}')
1300+
]).case_fold()));
1301+
}
1302+
1303+
#[test]
1304+
fn escape_unicode_letter_case_fold() {
1305+
assert_eq!(p(r"(?i)\pZ"), Expr::Class(class(&[
1306+
('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
1307+
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
1308+
('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
1309+
('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
1310+
]).case_fold()));
1311+
}
1312+
1313+
#[test]
1314+
fn escape_unicode_name_negate() {
1315+
assert_eq!(p(r"\P{Yi}"), Expr::Class(class(&[
1316+
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}')
1317+
]).negate()));
1318+
}
1319+
1320+
#[test]
1321+
fn escape_unicode_letter_negate() {
1322+
assert_eq!(p(r"\PZ"), Expr::Class(class(&[
1323+
('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
1324+
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
1325+
('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
1326+
('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
1327+
]).negate()));
1328+
}
1329+
1330+
#[test]
1331+
fn escape_unicode_name_negate_case_fold() {
1332+
assert_eq!(p(r"(?i)\P{Yi}"), Expr::Class(class(&[
1333+
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}')
1334+
]).negate().case_fold()));
1335+
}
1336+
1337+
#[test]
1338+
fn escape_unicode_letter_negate_case_fold() {
1339+
assert_eq!(p(r"(?i)\PZ"), Expr::Class(class(&[
1340+
('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
1341+
('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
1342+
('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
1343+
('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
1344+
]).negate().case_fold()));
1345+
}
1346+
1347+
#[test]
1348+
fn escape_perl_d() {
1349+
assert_eq!(p(r"\d"), Expr::Class(class(PERLD)));
1350+
}
1351+
1352+
#[test]
1353+
fn escape_perl_s() {
1354+
assert_eq!(p(r"\s"), Expr::Class(class(PERLS)));
1355+
}
1356+
1357+
#[test]
1358+
fn escape_perl_w() {
1359+
assert_eq!(p(r"\w"), Expr::Class(class(PERLW)));
1360+
}
1361+
1362+
#[test]
1363+
fn escape_perl_d_negate() {
1364+
assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate()));
1365+
}
1366+
1367+
#[test]
1368+
fn escape_perl_s_negate() {
1369+
assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate()));
1370+
}
1371+
1372+
#[test]
1373+
fn escape_perl_w_negate() {
1374+
assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate()));
1375+
}
1376+
1377+
#[test]
1378+
fn escape_perl_d_case_fold() {
1379+
assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold()));
1380+
}
1381+
1382+
#[test]
1383+
fn escape_perl_s_case_fold() {
1384+
assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold()));
1385+
}
1386+
1387+
#[test]
1388+
fn escape_perl_w_case_fold() {
1389+
assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold()));
1390+
}
1391+
1392+
#[test]
1393+
fn escape_perl_d_case_fold_negate() {
1394+
assert_eq!(p(r"(?i)\D"),
1395+
Expr::Class(class(PERLD).negate().case_fold()));
1396+
}
1397+
1398+
#[test]
1399+
fn escape_perl_s_case_fold_negate() {
1400+
assert_eq!(p(r"(?i)\S"),
1401+
Expr::Class(class(PERLS).negate().case_fold()));
1402+
}
1403+
1404+
#[test]
1405+
fn escape_perl_w_case_fold_negate() {
1406+
assert_eq!(p(r"(?i)\W"),
1407+
Expr::Class(class(PERLW).negate().case_fold()));
1408+
}
1409+
12041410
/******************************************************/
12051411
// Test every single possible error case.
12061412
/******************************************************/
@@ -1610,4 +1816,45 @@ mod tests {
16101816
kind: ErrorKind::InvalidBase16("9999999999".into()),
16111817
});
16121818
}
1819+
1820+
#[test]
1821+
fn error_unicode_unclosed() {
1822+
assert_eq!(perr(r"\p{"), Error {
1823+
pos: 3,
1824+
surround: r"\p{".into(),
1825+
kind: ErrorKind::UnclosedUnicodeName,
1826+
});
1827+
assert_eq!(perr(r"\p{Greek"), Error {
1828+
pos: 8,
1829+
surround: r"Greek".into(),
1830+
kind: ErrorKind::UnclosedUnicodeName,
1831+
});
1832+
}
1833+
1834+
#[test]
1835+
fn error_unicode_no_letter() {
1836+
assert_eq!(perr(r"\p"), Error {
1837+
pos: 2,
1838+
surround: r"\p".into(),
1839+
kind: ErrorKind::UnexpectedEscapeEof,
1840+
});
1841+
}
1842+
1843+
#[test]
1844+
fn error_unicode_unknown_letter() {
1845+
assert_eq!(perr(r"\pA"), Error {
1846+
pos: 3,
1847+
surround: r"\pA".into(),
1848+
kind: ErrorKind::UnrecognizedUnicodeClass("A".into()),
1849+
});
1850+
}
1851+
1852+
#[test]
1853+
fn error_unicode_unknown_name() {
1854+
assert_eq!(perr(r"\p{Yii}"), Error {
1855+
pos: 7,
1856+
surround: r"{Yii}".into(),
1857+
kind: ErrorKind::UnrecognizedUnicodeClass("Yii".into()),
1858+
});
1859+
}
16131860
}

0 commit comments

Comments
 (0)