Skip to content

Commit 85ec3c8

Browse files
fuzz: Add a roundtrip regex fuzz harness
This change adds an optional dependency on 'arbitrary' to regex-syntax, which allows us to generate arbitrary high-level intermediate representations (HIR). Each generated HIR is printed back to a regex string, which is then used to exercise the regex matching code under src. With this approach we can generate arbitrary well-formed regex strings, allowing the fuzzer to penetrate deeper into the regex code.
1 parent a9b2e02 commit 85ec3c8

File tree

5 files changed

+198
-11
lines changed

5 files changed

+198
-11
lines changed

fuzz/Cargo.toml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
cargo-features = ['named-profiles']
2-
31
[package]
42
name = "regex-fuzz"
53
version = "0.0.0"
@@ -11,11 +9,16 @@ edition = "2018"
119
cargo-fuzz = true
1210

1311
[dependencies]
12+
arbitrary = { version = "1.2.3", features = ["derive"] }
1413
libfuzzer-sys = "0.4.1"
1514

1615
[dependencies.regex]
1716
path = ".."
1817

18+
[dependencies.regex-syntax]
19+
path = "../regex-syntax"
20+
features = ["arbitrary"]
21+
1922
# Prevent this from interfering with workspaces
2023
[workspace]
2124
members = ["."]
@@ -24,6 +27,12 @@ members = ["."]
2427
name = "fuzz_regex_match"
2528
path = "fuzz_targets/fuzz_regex_match.rs"
2629

30+
[[bin]]
31+
name = "fuzz_regex"
32+
path = "fuzz_targets/fuzz_regex.rs"
33+
test = false
34+
doc = false
35+
2736
[profile.release]
2837
opt-level = 3
2938
debug = true

fuzz/fuzz_targets/fuzz_regex.rs

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#![no_main]
2+
3+
use arbitrary::Arbitrary;
4+
use libfuzzer_sys::fuzz_target;
5+
use regex_syntax::hir::print::Printer;
6+
use regex_syntax::hir::Hir;
7+
use std::{convert::TryFrom, hint::black_box};
8+
9+
#[derive(Arbitrary, Debug, Clone)]
10+
enum Pattern {
11+
WellFormed(Hir),
12+
Random(String),
13+
}
14+
15+
impl TryFrom<Pattern> for String {
16+
type Error = std::fmt::Error;
17+
18+
fn try_from(pattern: Pattern) -> Result<Self, Self::Error> {
19+
match pattern {
20+
Pattern::WellFormed(hir) => {
21+
let mut printer = Printer::new();
22+
let mut dst = String::new();
23+
printer.print(&hir, &mut dst)?;
24+
return Ok(dst);
25+
}
26+
Pattern::Random(s) => {
27+
return Ok(s);
28+
}
29+
}
30+
}
31+
}
32+
33+
#[derive(Arbitrary, Debug)]
34+
struct Data<'a> {
35+
pattern: Pattern,
36+
replacen: (usize, &'a str),
37+
replacen_bytes: (usize, &'a [u8]),
38+
input: &'a str,
39+
input_bytes: &'a [u8],
40+
pattern_set: Vec<Pattern>,
41+
set_input: &'a str,
42+
set_input_bytes: &'a [u8],
43+
}
44+
45+
fn fuzz_regex(
46+
pattern: &Pattern,
47+
input: &str,
48+
replacen: &(usize, &str),
49+
) -> Result<(), Box<dyn std::error::Error>> {
50+
let re = regex::Regex::new(&String::try_from(pattern.clone())?)?;
51+
_ = black_box(re.is_match(&input));
52+
_ = black_box(re.captures_iter(&input).collect::<Vec<regex::Captures>>());
53+
_ = black_box(re.split(&input).collect::<Vec<&str>>());
54+
55+
let (limit, replace) = *replacen;
56+
_ = black_box(re.replacen(&input, limit, replace));
57+
58+
_ = black_box(re.find(&input));
59+
_ = black_box(re.shortest_match(&input));
60+
Ok(())
61+
}
62+
63+
fn fuzz_regex_bytes(
64+
pattern: &Pattern,
65+
input: &[u8],
66+
replacen: &(usize, &[u8]),
67+
) -> Result<(), Box<dyn std::error::Error>> {
68+
let re = regex::bytes::Regex::new(&String::try_from(pattern.clone())?)?;
69+
_ = black_box(re.is_match(&input));
70+
_ = black_box(
71+
re.captures_iter(&input).collect::<Vec<regex::bytes::Captures>>(),
72+
);
73+
_ = black_box(re.split(&input).collect::<Vec<&[u8]>>());
74+
75+
let (limit, replace) = *replacen;
76+
_ = black_box(re.replacen(&input, limit, replace));
77+
78+
_ = black_box(re.find(&input));
79+
_ = black_box(re.shortest_match(&input));
80+
Ok(())
81+
}
82+
83+
fn fuzz_regex_set(
84+
pattern_set: &Vec<Pattern>,
85+
input: &str,
86+
) -> Result<(), Box<dyn std::error::Error>> {
87+
let set = regex::RegexSet::new(
88+
pattern_set
89+
.into_iter()
90+
.filter_map(|x| String::try_from(x.clone()).ok()),
91+
)?;
92+
_ = black_box(set.is_match(&input));
93+
_ = black_box(set.matches(&input).into_iter().collect::<Vec<_>>());
94+
Ok(())
95+
}
96+
97+
fn fuzz_regex_set_bytes(
98+
pattern_set: &Vec<Pattern>,
99+
input: &[u8],
100+
) -> Result<(), Box<dyn std::error::Error>> {
101+
let set = regex::bytes::RegexSet::new(
102+
pattern_set
103+
.into_iter()
104+
.filter_map(|x| String::try_from(x.clone()).ok()),
105+
)?;
106+
_ = black_box(set.is_match(&input));
107+
_ = black_box(set.matches(&input).into_iter().collect::<Vec<_>>());
108+
Ok(())
109+
}
110+
111+
// Fuzz entry point: skips oversized inputs, then runs the four harnesses
// (str regex, bytes regex, str regex set, bytes regex set) in that order.
// Harness errors are formatted and discarded.
fuzz_target!(|data: Data| {
    // Size limits; an input that exceeds any one of them is skipped entirely.
    const MAX_PATTERNS: usize = 10;
    const MAX_REPLACEMENT_LEN: usize = 100;
    const MAX_INPUT_LEN: usize = 500;

    let oversized = data.pattern_set.len() > MAX_PATTERNS
        || data.replacen.1.len() > MAX_REPLACEMENT_LEN
        || data.replacen_bytes.1.len() > MAX_REPLACEMENT_LEN
        || data.set_input.len() > MAX_INPUT_LEN
        || data.set_input_bytes.len() > MAX_INPUT_LEN
        || data.input_bytes.len() > MAX_INPUT_LEN
        || data.input.len() > MAX_INPUT_LEN;
    if oversized {
        return;
    }

    let single =
        black_box(fuzz_regex(&data.pattern, data.input, &data.replacen));
    if let Err(e) = single {
        black_box(format!("{e:?}"));
    }

    let single_bytes = black_box(fuzz_regex_bytes(
        &data.pattern,
        data.input_bytes,
        &data.replacen_bytes,
    ));
    if let Err(e) = single_bytes {
        black_box(format!("{e:?}"));
    }

    let set = black_box(fuzz_regex_set(&data.pattern_set, data.set_input));
    if let Err(e) = set {
        black_box(format!("{e:?}"));
    }

    let set_bytes = black_box(fuzz_regex_set_bytes(
        &data.pattern_set,
        data.set_input_bytes,
    ));
    if let Err(e) = set_bytes {
        black_box(format!("{e:?}"));
    }
});

regex-syntax/Cargo.toml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,14 @@ edition = "2018"
1515
[features]
1616
default = ["unicode"]
1717

18-
unicode = [
19-
"unicode-age",
20-
"unicode-bool",
21-
"unicode-case",
22-
"unicode-gencat",
23-
"unicode-perl",
24-
"unicode-script",
25-
"unicode-segment",
26-
]
18+
unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"]
2719
unicode-age = []
2820
unicode-bool = []
2921
unicode-case = []
3022
unicode-gencat = []
3123
unicode-perl = []
3224
unicode-script = []
3325
unicode-segment = []
26+
27+
[dependencies]
28+
arbitrary = { version = "1.2.3", features = ["derive"], optional = true }

regex-syntax/src/hir/interval.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ use std::slice;
55
use std::u8;
66

77
use crate::unicode;
8+
#[cfg(feature = "arbitrary")]
9+
use arbitrary::Arbitrary;
810

911
// This module contains an *internal* implementation of interval sets.
1012
//
@@ -33,6 +35,7 @@ use crate::unicode;
3335
// Tests on this are relegated to the public API of HIR in src/hir.rs.
3436

3537
#[derive(Clone, Debug, Eq, PartialEq)]
38+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
3639
pub struct IntervalSet<I> {
3740
ranges: Vec<I>,
3841
}

regex-syntax/src/hir/mod.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ use std::fmt;
88
use std::result;
99
use std::u8;
1010

11+
#[cfg(feature = "arbitrary")]
12+
use arbitrary::Arbitrary;
13+
1114
use crate::ast::Span;
1215
use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
1316
use crate::unicode;
@@ -172,6 +175,7 @@ impl fmt::Display for ErrorKind {
172175
/// expression pattern string, and uses constant stack space and heap space
173176
/// proportional to the size of the `Hir`.
174177
#[derive(Clone, Debug, Eq, PartialEq)]
178+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
175179
pub struct Hir {
176180
/// The underlying HIR kind.
177181
kind: HirKind,
@@ -181,6 +185,7 @@ pub struct Hir {
181185

182186
/// The kind of an arbitrary `Hir` expression.
183187
#[derive(Clone, Debug, Eq, PartialEq)]
188+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
184189
pub enum HirKind {
185190
/// The empty regular expression, which matches everything, including the
186191
/// empty string.
@@ -744,6 +749,7 @@ impl fmt::Display for Hir {
744749
/// are preferred whenever possible. In particular, a `Byte` variant is only
745750
/// ever produced when it could match invalid UTF-8.
746751
#[derive(Clone, Debug, Eq, PartialEq)]
752+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
747753
pub enum Literal {
748754
/// A single character represented by a Unicode scalar value.
749755
Unicode(char),
@@ -780,6 +786,7 @@ impl Literal {
780786
/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
781787
/// match the same set of strings.
782788
#[derive(Clone, Debug, Eq, PartialEq)]
789+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
783790
pub enum Class {
784791
/// A set of characters represented by Unicode scalar values.
785792
Unicode(ClassUnicode),
@@ -834,6 +841,7 @@ impl Class {
834841

835842
/// A set of characters represented by Unicode scalar values.
836843
#[derive(Clone, Debug, Eq, PartialEq)]
844+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
837845
pub struct ClassUnicode {
838846
set: IntervalSet<ClassUnicodeRange>,
839847
}
@@ -970,6 +978,7 @@ impl<'a> Iterator for ClassUnicodeIter<'a> {
970978
/// The range is closed. That is, the start and end of the range are included
971979
/// in the range.
972980
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
981+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
973982
pub struct ClassUnicodeRange {
974983
start: char,
975984
end: char,
@@ -1077,6 +1086,7 @@ impl ClassUnicodeRange {
10771086
/// A set of characters represented by arbitrary bytes (where one byte
10781087
/// corresponds to one character).
10791088
#[derive(Clone, Debug, Eq, PartialEq)]
1089+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
10801090
pub struct ClassBytes {
10811091
set: IntervalSet<ClassBytesRange>,
10821092
}
@@ -1187,6 +1197,7 @@ impl<'a> Iterator for ClassBytesIter<'a> {
11871197
/// The range is closed. That is, the start and end of the range are included
11881198
/// in the range.
11891199
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1200+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
11901201
pub struct ClassBytesRange {
11911202
start: u8,
11921203
end: u8,
@@ -1282,6 +1293,7 @@ impl fmt::Debug for ClassBytesRange {
12821293
///
12831294
/// A matching anchor assertion is always zero-length.
12841295
#[derive(Clone, Debug, Eq, PartialEq)]
1296+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
12851297
pub enum Anchor {
12861298
/// Match the beginning of a line or the beginning of text. Specifically,
12871299
/// this matches at the starting position of the input, or at the position
@@ -1303,6 +1315,7 @@ pub enum Anchor {
13031315
///
13041316
/// A matching word boundary assertion is always zero-length.
13051317
#[derive(Clone, Debug, Eq, PartialEq)]
1318+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
13061319
pub enum WordBoundary {
13071320
/// Match a Unicode-aware word boundary. That is, this matches a position
13081321
/// where the left adjacent character and right adjacent character
@@ -1336,6 +1349,7 @@ impl WordBoundary {
13361349
/// 2. A capturing group (e.g., `(expr)`).
13371350
/// 3. A named capturing group (e.g., `(?P<name>expr)`).
13381351
#[derive(Clone, Debug, Eq, PartialEq)]
1352+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
13391353
pub struct Group {
13401354
/// The kind of this group. If it is a capturing group, then the kind
13411355
/// contains the capture group index (and the name, if it is a named
@@ -1347,6 +1361,7 @@ pub struct Group {
13471361

13481362
/// The kind of group.
13491363
#[derive(Clone, Debug, Eq, PartialEq)]
1364+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
13501365
pub enum GroupKind {
13511366
/// A normal unnamed capturing group.
13521367
///
@@ -1368,6 +1383,7 @@ pub enum GroupKind {
13681383
/// A repetition operator permits the repetition of an arbitrary
13691384
/// sub-expression.
13701385
#[derive(Clone, Debug, Eq, PartialEq)]
1386+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
13711387
pub struct Repetition {
13721388
/// The kind of this repetition operator.
13731389
pub kind: RepetitionKind,
@@ -1407,6 +1423,7 @@ impl Repetition {
14071423

14081424
/// The kind of a repetition operator.
14091425
#[derive(Clone, Debug, Eq, PartialEq)]
1426+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
14101427
pub enum RepetitionKind {
14111428
/// Matches a sub-expression zero or one times.
14121429
ZeroOrOne,
@@ -1420,6 +1437,7 @@ pub enum RepetitionKind {
14201437

14211438
/// The kind of a counted repetition operator.
14221439
#[derive(Clone, Debug, Eq, PartialEq)]
1440+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
14231441
pub enum RepetitionRange {
14241442
/// Matches a sub-expression exactly this many times.
14251443
Exactly(u32),
@@ -1477,6 +1495,7 @@ impl Drop for Hir {
14771495
///
14781496
/// These attributes are typically defined inductively on the HIR.
14791497
#[derive(Clone, Debug, Eq, PartialEq)]
1498+
#[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
14801499
struct HirInfo {
14811500
/// Represent yes/no questions by a bitfield to conserve space, since
14821501
/// this is included in every HIR expression.

0 commit comments

Comments
 (0)