Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 00516b1

Browse files
committedApr 30, 2023
api: introduce new regex-lite crate
Closes #961
1 parent 374b329 commit 00516b1

29 files changed

+7719
-0
lines changed
 

‎.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ jobs:
122122
- name: Run subset of regex-automata tests
123123
if: matrix.build != 'win-gnu' # Just horrifically slow.
124124
run: ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET
125+
- name: Run regex-lite tests
126+
run: ${{ env.CARGO }} test --verbose --manifest-path regex-lite/Cargo.toml $TARGET
125127

126128
# This job runs a stripped down version of CI to test the MSRV. The specific
127129
# reason for doing this is that the regex crate's dev-dependencies tend to

‎Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ members = [
2222
"regex-automata",
2323
"regex-capi",
2424
"regex-cli",
25+
"regex-lite",
2526
"regex-syntax",
2627
"regex-test",
2728
]

‎regex-cli/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ log = { version = "0.4.17", features = ["std"] }
3030
memmap2 = "0.5.10"
3131
regex = { path = ".." }
3232
regex-automata = { path = "../regex-automata", features = ["logging"] }
33+
regex-lite = { path = "../regex-lite" }
3334
regex-syntax = { path = "../regex-syntax" }
3435
tabwriter = { version = "1.2.1", features = ["ansi_formatting"] }
3536
textwrap = { version = "0.16.0", default-features = false }

‎regex-cli/args/lite.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
use {
2+
lexopt::{Arg, Parser},
3+
regex_automata::util::syntax,
4+
regex_lite::Regex,
5+
};
6+
7+
use crate::args::{self, Configurable, Usage};
8+
9+
/// Exposes the configuration for the top-level `Regex` API.
10+
#[derive(Debug, Default)]
11+
pub struct Config {
12+
size_limit: Option<usize>,
13+
}
14+
15+
impl Config {
16+
/// Builds a `Regex` from the given syntax configuration and sequence of
17+
/// patterns. This returns an error is `patterns.len() != 1`.
18+
///
19+
/// Note that this also returns an error if any syntax options are set
20+
/// that aren't supported by `regex-lite`.
21+
pub fn from_patterns(
22+
&self,
23+
syntax: &syntax::Config,
24+
patterns: &[String],
25+
) -> anyhow::Result<Regex> {
26+
anyhow::ensure!(
27+
patterns.len() == 1,
28+
"API-level regex requires exactly one pattern, \
29+
but {} were given",
30+
patterns.len(),
31+
);
32+
anyhow::ensure!(
33+
!syntax.get_octal(),
34+
"regex-lite does not support octal mode",
35+
);
36+
anyhow::ensure!(
37+
syntax.get_utf8(),
38+
"regex-lite does not support disabling UTF-8 mode",
39+
);
40+
anyhow::ensure!(
41+
syntax.get_unicode(),
42+
"regex-lite does not support disabling Unicode mode",
43+
);
44+
let mut b = regex_lite::RegexBuilder::new(&patterns[0]);
45+
b.case_insensitive(syntax.get_case_insensitive());
46+
b.multi_line(syntax.get_multi_line());
47+
b.crlf(syntax.get_crlf());
48+
b.dot_matches_new_line(syntax.get_dot_matches_new_line());
49+
b.swap_greed(syntax.get_swap_greed());
50+
b.ignore_whitespace(syntax.get_ignore_whitespace());
51+
b.nest_limit(syntax.get_nest_limit());
52+
b.size_limit(self.size_limit.unwrap_or(usize::MAX));
53+
b.build().map_err(anyhow::Error::from)
54+
}
55+
}
56+
57+
impl Configurable for Config {
58+
fn configure(
59+
&mut self,
60+
p: &mut Parser,
61+
arg: &mut Arg,
62+
) -> anyhow::Result<bool> {
63+
match *arg {
64+
Arg::Long("size-limit") => {
65+
self.size_limit = args::parse_maybe(p, "--size-limit")?;
66+
}
67+
_ => return Ok(false),
68+
}
69+
Ok(true)
70+
}
71+
72+
fn usage(&self) -> &[Usage] {
73+
const USAGES: &'static [Usage] = &[Usage::new(
74+
"--size-limit",
75+
"Set a limit on heap used by a regex.",
76+
r#"
77+
This sets a limit, in bytes, on the heap memory used by a regex.
78+
79+
The special value 'none' indicates that no size limit should be imposed.
80+
"#,
81+
)];
82+
USAGES
83+
}
84+
}

‎regex-cli/args/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ pub mod flags;
1616
pub mod haystack;
1717
pub mod hybrid;
1818
pub mod input;
19+
pub mod lite;
1920
pub mod meta;
2021
pub mod onepass;
2122
pub mod overlapping;

‎regex-cli/cmd/find/capture/mod.rs

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ USAGE:
3232
3333
ENGINES:
3434
backtrack Search with the bounded backtracker regex engine.
35+
lite Search with the regex-lite engine.
3536
meta Search with the meta regex engine.
3637
onepass Search with the one-pass DFA regex engine.
3738
pikevm Search with the PikeVM regex engine.
@@ -40,6 +41,7 @@ ENGINES:
4041
let cmd = args::next_as_command(USAGE, p)?;
4142
match &*cmd {
4243
"backtrack" => nfa::run_backtrack(p),
44+
"lite" => run_lite(p),
4345
"meta" => run_meta(p),
4446
"onepass" => dfa::run_onepass(p),
4547
"pikevm" => nfa::run_pikevm(p),
@@ -219,6 +221,107 @@ OPTIONS:
219221
Ok(())
220222
}
221223

224+
fn run_lite(p: &mut lexopt::Parser) -> anyhow::Result<()> {
225+
const USAGE: &'static str = "\
226+
Executes a search for full matches using the top-level regex-lite engine.
227+
228+
USAGE:
229+
regex-cli find capture lite [-p <pattern> ...] <haystack-path>
230+
regex-cli find capture lite [-p <pattern> ...] -y <haystack>
231+
232+
TIP:
233+
use -h for short docs and --help for long docs
234+
235+
OPTIONS:
236+
%options%
237+
";
238+
239+
let mut common = args::common::Config::default();
240+
let mut patterns = args::patterns::Config::only_flags();
241+
let mut haystack = args::haystack::Config::default();
242+
let mut syntax = args::syntax::Config::default();
243+
let mut lite = args::lite::Config::default();
244+
let mut find = super::Config::default();
245+
args::configure(
246+
p,
247+
USAGE,
248+
&mut [
249+
&mut common,
250+
&mut patterns,
251+
&mut haystack,
252+
&mut syntax,
253+
&mut lite,
254+
&mut find,
255+
],
256+
)?;
257+
258+
let pats = patterns.get()?;
259+
let syn = syntax.syntax()?;
260+
let mut table = Table::empty();
261+
let (re, time) = util::timeitr(|| lite.from_patterns(&syn, &pats))?;
262+
table.add("build regex time", time);
263+
264+
// Check that the haystack is valid UTF-8 since regex-lite doesn't support
265+
// searching arbitrary byte sequences. (At time of writing.)
266+
haystack.get()?.to_str()?;
267+
268+
// The top-level API doesn't support regex-automata's more granular Input
269+
// abstraction.
270+
let input = args::input::Config::default();
271+
// The top-level API also doesn't use 'Captures' from regex-automata
272+
// directly, but we can map between them with some annoyance.
273+
let group_info = GroupInfo::new([re.capture_names()])
274+
.context("could not build capture group info")?;
275+
let mut locs = re.capture_locations();
276+
let search = |input: &Input<'_>, caps: &mut Captures| {
277+
let haystack = input.haystack().to_str().unwrap();
278+
caps.set_pattern(None);
279+
if !re.captures_read_at(&mut locs, haystack, input.start()).is_some() {
280+
return Ok(());
281+
}
282+
caps.set_pattern(Some(PatternID::ZERO));
283+
for i in 0..locs.len() {
284+
use regex_automata::util::primitives::NonMaxUsize;
285+
286+
let slot_start = i * 2;
287+
let slot_end = slot_start + 1;
288+
match locs.get(i) {
289+
None => {
290+
caps.slots_mut()[slot_start] = None;
291+
caps.slots_mut()[slot_end] = None;
292+
}
293+
Some((start, end)) => {
294+
caps.slots_mut()[slot_start] = NonMaxUsize::new(start);
295+
caps.slots_mut()[slot_end] = NonMaxUsize::new(end);
296+
}
297+
}
298+
}
299+
Ok(())
300+
};
301+
if find.count {
302+
run_counts(
303+
&mut table,
304+
&common,
305+
&find,
306+
&input,
307+
&haystack,
308+
&group_info,
309+
search,
310+
)?;
311+
} else {
312+
run_search(
313+
&mut table,
314+
&common,
315+
&find,
316+
&input,
317+
&haystack,
318+
&group_info,
319+
search,
320+
)?;
321+
}
322+
Ok(())
323+
}
324+
222325
/// A function that takes in a bunch of configuration, runs the given search
223326
/// routine, and prints out a table of counts.
224327
fn run_counts(

‎regex-cli/cmd/find/match/mod.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ ENGINES:
2626
backtrack Search with the bounded backtracker regex engine.
2727
dense Search with the dense DFA regex engine.
2828
hybrid Search with the lazy DFA regex engine.
29+
lite Search with the regex-lite engine.
2930
meta Search with the meta regex engine.
3031
onepass Search with the one-pass DFA regex engine.
3132
pikevm Search with the PikeVM regex engine.
@@ -37,6 +38,7 @@ ENGINES:
3738
"backtrack" => nfa::run_backtrack(p),
3839
"dense" => dfa::run_dense(p),
3940
"hybrid" => dfa::run_hybrid(p),
41+
"lite" => run_lite(p),
4042
"meta" => run_meta(p),
4143
"onepass" => dfa::run_onepass(p),
4244
"pikevm" => nfa::run_pikevm(p),
@@ -164,6 +166,71 @@ OPTIONS:
164166
Ok(())
165167
}
166168

169+
fn run_lite(p: &mut lexopt::Parser) -> anyhow::Result<()> {
170+
const USAGE: &'static str = "\
171+
Executes a search for full matches using the top-level regex-lite engine.
172+
173+
Note that since the regex-lite crate doesn't have an API for search arbitrary
174+
byte slices, the haystack must be valid UTF-8. If it isn't, this command will
175+
report an error.
176+
177+
USAGE:
178+
regex-cli find match lite [-p <pattern> ...] <haystack-path>
179+
regex-cli find match lite [-p <pattern> ...] -y <haystack>
180+
181+
TIP:
182+
use -h for short docs and --help for long docs
183+
184+
OPTIONS:
185+
%options%
186+
";
187+
188+
let mut common = args::common::Config::default();
189+
let mut patterns = args::patterns::Config::only_flags();
190+
let mut haystack = args::haystack::Config::default();
191+
let mut syntax = args::syntax::Config::default();
192+
let mut lite = args::lite::Config::default();
193+
let mut find = super::Config::default();
194+
args::configure(
195+
p,
196+
USAGE,
197+
&mut [
198+
&mut common,
199+
&mut patterns,
200+
&mut haystack,
201+
&mut syntax,
202+
&mut lite,
203+
&mut find,
204+
],
205+
)?;
206+
207+
let pats = patterns.get()?;
208+
let syn = syntax.syntax()?;
209+
let mut table = Table::empty();
210+
let (re, time) = util::timeitr(|| lite.from_patterns(&syn, &pats))?;
211+
table.add("build regex time", time);
212+
213+
// Check that the haystack is valid UTF-8 since regex-lite doesn't support
214+
// searching arbitrary byte sequences. (At time of writing.)
215+
haystack.get()?.to_str()?;
216+
217+
// The top-level regex-lite API doesn't support regex-automata's more
218+
// granular Input abstraction.
219+
let input = args::input::Config::default();
220+
let search = |input: &Input<'_>| {
221+
let haystack = input.haystack().to_str().unwrap();
222+
Ok(re
223+
.find_at(haystack, input.start())
224+
.map(|m| Match::new(PatternID::ZERO, m.start()..m.end())))
225+
};
226+
if find.count {
227+
run_counts(&mut table, &common, &find, &input, &haystack, 1, search)?;
228+
} else {
229+
run_search(&mut table, &common, &find, &input, &haystack, search)?;
230+
}
231+
Ok(())
232+
}
233+
167234
/// A function that takes in a bunch of configuration, runs the given search
168235
/// routine, and prints out a table of counts.
169236
fn run_counts(

‎regex-lite/Cargo.toml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
[package]
2+
name = "regex-lite"
3+
version = "0.1.0" #:version
4+
authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"]
5+
license = "MIT OR Apache-2.0"
6+
repository = "https://github.com/rust-lang/regex/tree/master/regex-lite"
7+
documentation = "https://docs.rs/regex-lite"
8+
description = """
9+
A lightweight regex engine that optimizes for binary size and compilation time.
10+
"""
11+
workspace = ".."
12+
edition = "2021"
13+
rust-version = "1.60.0"
14+
autotests = false
15+
16+
# Features are documented in the "Crate features" section of the crate docs:
17+
# https://docs.rs/regex-lite/*/#crate-features
18+
[features]
19+
default = ["std"]
20+
std = []
21+
22+
[dev-dependencies]
23+
anyhow = "1.0.69"
24+
regex-test = { path = "../regex-test", version = "0.1.0" }
25+
26+
[[test]]
27+
path = "tests/lib.rs"
28+
name = "integration"
29+
30+
[package.metadata.docs.rs]
31+
# We want to document all features.
32+
all-features = true
33+
# To test this locally, run:
34+
#
35+
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
36+
rustdoc-args = ["--cfg", "docsrs"]

‎regex-lite/LICENSE-APACHE

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
Apache License
2+
Version 2.0, January 2004
3+
http://www.apache.org/licenses/
4+
5+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6+
7+
1. Definitions.
8+
9+
"License" shall mean the terms and conditions for use, reproduction,
10+
and distribution as defined by Sections 1 through 9 of this document.
11+
12+
"Licensor" shall mean the copyright owner or entity authorized by
13+
the copyright owner that is granting the License.
14+
15+
"Legal Entity" shall mean the union of the acting entity and all
16+
other entities that control, are controlled by, or are under common
17+
control with that entity. For the purposes of this definition,
18+
"control" means (i) the power, direct or indirect, to cause the
19+
direction or management of such entity, whether by contract or
20+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
21+
outstanding shares, or (iii) beneficial ownership of such entity.
22+
23+
"You" (or "Your") shall mean an individual or Legal Entity
24+
exercising permissions granted by this License.
25+
26+
"Source" form shall mean the preferred form for making modifications,
27+
including but not limited to software source code, documentation
28+
source, and configuration files.
29+
30+
"Object" form shall mean any form resulting from mechanical
31+
transformation or translation of a Source form, including but
32+
not limited to compiled object code, generated documentation,
33+
and conversions to other media types.
34+
35+
"Work" shall mean the work of authorship, whether in Source or
36+
Object form, made available under the License, as indicated by a
37+
copyright notice that is included in or attached to the work
38+
(an example is provided in the Appendix below).
39+
40+
"Derivative Works" shall mean any work, whether in Source or Object
41+
form, that is based on (or derived from) the Work and for which the
42+
editorial revisions, annotations, elaborations, or other modifications
43+
represent, as a whole, an original work of authorship. For the purposes
44+
of this License, Derivative Works shall not include works that remain
45+
separable from, or merely link (or bind by name) to the interfaces of,
46+
the Work and Derivative Works thereof.
47+
48+
"Contribution" shall mean any work of authorship, including
49+
the original version of the Work and any modifications or additions
50+
to that Work or Derivative Works thereof, that is intentionally
51+
submitted to Licensor for inclusion in the Work by the copyright owner
52+
or by an individual or Legal Entity authorized to submit on behalf of
53+
the copyright owner. For the purposes of this definition, "submitted"
54+
means any form of electronic, verbal, or written communication sent
55+
to the Licensor or its representatives, including but not limited to
56+
communication on electronic mailing lists, source code control systems,
57+
and issue tracking systems that are managed by, or on behalf of, the
58+
Licensor for the purpose of discussing and improving the Work, but
59+
excluding communication that is conspicuously marked or otherwise
60+
designated in writing by the copyright owner as "Not a Contribution."
61+
62+
"Contributor" shall mean Licensor and any individual or Legal Entity
63+
on behalf of whom a Contribution has been received by Licensor and
64+
subsequently incorporated within the Work.
65+
66+
2. Grant of Copyright License. Subject to the terms and conditions of
67+
this License, each Contributor hereby grants to You a perpetual,
68+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69+
copyright license to reproduce, prepare Derivative Works of,
70+
publicly display, publicly perform, sublicense, and distribute the
71+
Work and such Derivative Works in Source or Object form.
72+
73+
3. Grant of Patent License. Subject to the terms and conditions of
74+
this License, each Contributor hereby grants to You a perpetual,
75+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76+
(except as stated in this section) patent license to make, have made,
77+
use, offer to sell, sell, import, and otherwise transfer the Work,
78+
where such license applies only to those patent claims licensable
79+
by such Contributor that are necessarily infringed by their
80+
Contribution(s) alone or by combination of their Contribution(s)
81+
with the Work to which such Contribution(s) was submitted. If You
82+
institute patent litigation against any entity (including a
83+
cross-claim or counterclaim in a lawsuit) alleging that the Work
84+
or a Contribution incorporated within the Work constitutes direct
85+
or contributory patent infringement, then any patent licenses
86+
granted to You under this License for that Work shall terminate
87+
as of the date such litigation is filed.
88+
89+
4. Redistribution. You may reproduce and distribute copies of the
90+
Work or Derivative Works thereof in any medium, with or without
91+
modifications, and in Source or Object form, provided that You
92+
meet the following conditions:
93+
94+
(a) You must give any other recipients of the Work or
95+
Derivative Works a copy of this License; and
96+
97+
(b) You must cause any modified files to carry prominent notices
98+
stating that You changed the files; and
99+
100+
(c) You must retain, in the Source form of any Derivative Works
101+
that You distribute, all copyright, patent, trademark, and
102+
attribution notices from the Source form of the Work,
103+
excluding those notices that do not pertain to any part of
104+
the Derivative Works; and
105+
106+
(d) If the Work includes a "NOTICE" text file as part of its
107+
distribution, then any Derivative Works that You distribute must
108+
include a readable copy of the attribution notices contained
109+
within such NOTICE file, excluding those notices that do not
110+
pertain to any part of the Derivative Works, in at least one
111+
of the following places: within a NOTICE text file distributed
112+
as part of the Derivative Works; within the Source form or
113+
documentation, if provided along with the Derivative Works; or,
114+
within a display generated by the Derivative Works, if and
115+
wherever such third-party notices normally appear. The contents
116+
of the NOTICE file are for informational purposes only and
117+
do not modify the License. You may add Your own attribution
118+
notices within Derivative Works that You distribute, alongside
119+
or as an addendum to the NOTICE text from the Work, provided
120+
that such additional attribution notices cannot be construed
121+
as modifying the License.
122+
123+
You may add Your own copyright statement to Your modifications and
124+
may provide additional or different license terms and conditions
125+
for use, reproduction, or distribution of Your modifications, or
126+
for any such Derivative Works as a whole, provided Your use,
127+
reproduction, and distribution of the Work otherwise complies with
128+
the conditions stated in this License.
129+
130+
5. Submission of Contributions. Unless You explicitly state otherwise,
131+
any Contribution intentionally submitted for inclusion in the Work
132+
by You to the Licensor shall be under the terms and conditions of
133+
this License, without any additional terms or conditions.
134+
Notwithstanding the above, nothing herein shall supersede or modify
135+
the terms of any separate license agreement you may have executed
136+
with Licensor regarding such Contributions.
137+
138+
6. Trademarks. This License does not grant permission to use the trade
139+
names, trademarks, service marks, or product names of the Licensor,
140+
except as required for reasonable and customary use in describing the
141+
origin of the Work and reproducing the content of the NOTICE file.
142+
143+
7. Disclaimer of Warranty. Unless required by applicable law or
144+
agreed to in writing, Licensor provides the Work (and each
145+
Contributor provides its Contributions) on an "AS IS" BASIS,
146+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147+
implied, including, without limitation, any warranties or conditions
148+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149+
PARTICULAR PURPOSE. You are solely responsible for determining the
150+
appropriateness of using or redistributing the Work and assume any
151+
risks associated with Your exercise of permissions under this License.
152+
153+
8. Limitation of Liability. In no event and under no legal theory,
154+
whether in tort (including negligence), contract, or otherwise,
155+
unless required by applicable law (such as deliberate and grossly
156+
negligent acts) or agreed to in writing, shall any Contributor be
157+
liable to You for damages, including any direct, indirect, special,
158+
incidental, or consequential damages of any character arising as a
159+
result of this License or out of the use or inability to use the
160+
Work (including but not limited to damages for loss of goodwill,
161+
work stoppage, computer failure or malfunction, or any and all
162+
other commercial damages or losses), even if such Contributor
163+
has been advised of the possibility of such damages.
164+
165+
9. Accepting Warranty or Additional Liability. While redistributing
166+
the Work or Derivative Works thereof, You may choose to offer,
167+
and charge a fee for, acceptance of support, warranty, indemnity,
168+
or other liability obligations and/or rights consistent with this
169+
License. However, in accepting such obligations, You may act only
170+
on Your own behalf and on Your sole responsibility, not on behalf
171+
of any other Contributor, and only if You agree to indemnify,
172+
defend, and hold each Contributor harmless for any liability
173+
incurred by, or claims asserted against, such Contributor by reason
174+
of your accepting any such warranty or additional liability.
175+
176+
END OF TERMS AND CONDITIONS
177+
178+
APPENDIX: How to apply the Apache License to your work.
179+
180+
To apply the Apache License to your work, attach the following
181+
boilerplate notice, with the fields enclosed by brackets "[]"
182+
replaced with your own identifying information. (Don't include
183+
the brackets!) The text should be enclosed in the appropriate
184+
comment syntax for the file format. We also recommend that a
185+
file or class name and description of purpose be included on the
186+
same "printed page" as the copyright notice for easier
187+
identification within third-party archives.
188+
189+
Copyright [yyyy] [name of copyright owner]
190+
191+
Licensed under the Apache License, Version 2.0 (the "License");
192+
you may not use this file except in compliance with the License.
193+
You may obtain a copy of the License at
194+
195+
http://www.apache.org/licenses/LICENSE-2.0
196+
197+
Unless required by applicable law or agreed to in writing, software
198+
distributed under the License is distributed on an "AS IS" BASIS,
199+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200+
See the License for the specific language governing permissions and
201+
limitations under the License.

‎regex-lite/LICENSE-MIT

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
Copyright (c) 2014 The Rust Project Developers
2+
3+
Permission is hereby granted, free of charge, to any
4+
person obtaining a copy of this software and associated
5+
documentation files (the "Software"), to deal in the
6+
Software without restriction, including without
7+
limitation the rights to use, copy, modify, merge,
8+
publish, distribute, sublicense, and/or sell copies of
9+
the Software, and to permit persons to whom the Software
10+
is furnished to do so, subject to the following
11+
conditions:
12+
13+
The above copyright notice and this permission notice
14+
shall be included in all copies or substantial portions
15+
of the Software.
16+
17+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19+
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21+
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25+
DEALINGS IN THE SOFTWARE.

‎regex-lite/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
WIP

‎regex-lite/src/error.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/// An error produced by this crate.
///
/// The only payload is a static, human readable message, which is what the
/// `Display` impl prints.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Error {
    /// The message describing what went wrong.
    msg: &'static str,
}

impl Error {
    /// Creates an error that carries the given message.
    pub(crate) fn new(msg: &'static str) -> Error {
        Self { msg }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for Error {}

impl core::fmt::Display for Error {
    /// Writes the message verbatim. Width, fill and precision flags on the
    /// formatter are not applied.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str(self.msg)
    }
}

‎regex-lite/src/hir/mod.rs

Lines changed: 644 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/src/hir/parse.rs

Lines changed: 2123 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/src/int.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
use core::num::NonZeroUsize;
2+
3+
/// Extension methods for the `u32` primitive type.
pub(crate) trait U32 {
    /// Converts this value to a `usize`.
    fn as_usize(self) -> usize;
}

impl U32 for u32 {
    fn as_usize(self) -> usize {
        // This cast never truncates: only 32-bit and 64-bit targets are
        // supported, and on both a usize can represent every u32.
        self as usize
    }
}
15+
16+
/// A `usize` whose value is never `usize::MAX`.
///
/// This mirrors `core::num::NonZeroUsize`, except that the forbidden value
/// is the maximum instead of zero.
///
/// The point is to keep things that hold match offsets small. Rust slices
/// never have a length above `isize::MAX`, so `usize::MAX` is free to act as
/// a "no match" sentinel, and as a result `Option<NonMaxUsize>` occupies
/// exactly as much memory as a plain `usize`.
///
/// This type is `repr(transparent)` over `core::num::NonZeroUsize`, which is
/// itself `repr(transparent)` over `usize`.
#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub(crate) struct NonMaxUsize(NonZeroUsize);

impl NonMaxUsize {
    /// Wraps `value`, returning `None` only when it equals `usize::MAX`.
    pub(crate) fn new(value: usize) -> Option<NonMaxUsize> {
        // The value is stored shifted up by one so that zero is never stored.
        // `usize::MAX` is the one input with no successor, hence `None`.
        value.checked_add(1).and_then(NonZeroUsize::new).map(NonMaxUsize)
    }

    /// Returns the wrapped `usize`, which is never `usize::MAX`.
    pub(crate) fn get(self) -> usize {
        // Undo the shift applied by `new`. The stored value is at least one,
        // so this subtraction cannot underflow.
        self.0.get() - 1
    }
}

// Debug is written by hand so that the logical value is printed rather than
// the shifted internal representation, e.g. just '5' instead of
// 'NonMaxUsize(6)'.
impl core::fmt::Debug for NonMaxUsize {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.write_fmt(format_args!("{:?}", self.get()))
    }
}

‎regex-lite/src/interpolate.rs

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/src/lib.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*!
2+
TODO
3+
*/
4+
5+
#![allow(warnings)]
6+
#![no_std]
7+
#![forbid(unsafe_code)]
8+
// #![deny(missing_docs, rustdoc::broken_intra_doc_links)]
9+
#![warn(missing_debug_implementations)]
10+
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
11+
12+
#[cfg(not(feature = "std"))]
13+
compile_error!("'std' is currently a required feature, please file an issue");
14+
15+
#[cfg(not(any(target_pointer_width = "32", target_pointer_width = "64")))]
16+
compile_error!("not supported on non-{32,64}, please file an issue");
17+
18+
extern crate alloc;
19+
#[cfg(any(test, feature = "std"))]
20+
extern crate std;
21+
22+
pub use self::{error::Error, hir::escape, string::*};
23+
24+
mod error;
25+
mod hir;
26+
mod int;
27+
mod interpolate;
28+
mod nfa;
29+
mod pikevm;
30+
mod pool;
31+
mod string;
32+
mod utf8;

‎regex-lite/src/nfa.rs

Lines changed: 703 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/src/pikevm.rs

Lines changed: 957 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/src/pool.rs

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
use core::{
2+
cell::UnsafeCell,
3+
panic::{RefUnwindSafe, UnwindSafe},
4+
sync::atomic::{AtomicBool, Ordering},
5+
};
6+
7+
use alloc::{boxed::Box, vec, vec::Vec};
8+
9+
use crate::pikevm;
10+
11+
// Literally the only reason that this crate requires 'std' currently.
12+
//
13+
// In regex-automata, we support the no-std use case by rolling our own
14+
// spin-lock based Mutex. That's questionable on its own, but it's not clear if
15+
// we should be doing that here. It will require introducing non-safe code in a
16+
// crate that is otherwise safe. But maybe it's worth doing?
17+
use std::sync::Mutex;
18+
19+
/// A type alias for our pool of meta::Cache that fixes the type parameters to
20+
/// what we use for the meta regex below.
21+
pub(crate) type CachePool = Pool<pikevm::Cache, CachePoolFn>;
22+
23+
/// Same as above, but for the guard returned by a pool.
24+
pub(crate) type CachePoolGuard<'a> = PoolGuard<'a, pikevm::Cache, CachePoolFn>;
25+
26+
/// The type of the closure we use to create new caches. We need to spell out
27+
/// all of the marker traits or else we risk leaking !MARKER impls.
28+
pub(crate) type CachePoolFn =
29+
Box<dyn Fn() -> pikevm::Cache + Send + Sync + UnwindSafe + RefUnwindSafe>;
30+
31+
/// A thread safe pool utilizing alloc-only features.
32+
///
33+
/// Unlike the pool in regex-automata, this has no "fast path." We could add
34+
/// it, but it's more code and requires reasoning about safety.
35+
pub(super) struct Pool<T, F> {
36+
/// A stack of T values to hand out. These are used when a Pool is
37+
/// accessed by a thread that didn't create it.
38+
stack: Mutex<Vec<Box<T>>>,
39+
/// A function to create more T values when stack is empty and a caller
40+
/// has requested a T.
41+
create: F,
42+
}
43+
44+
// If T is UnwindSafe, then since we provide exclusive access to any
45+
// particular value in the pool, it should therefore also be considered
46+
// RefUnwindSafe.
47+
impl<T: UnwindSafe, F: UnwindSafe> RefUnwindSafe for Pool<T, F> {}
48+
49+
impl<T, F> Pool<T, F> {
50+
/// Create a new pool. The given closure is used to create values in
51+
/// the pool when necessary.
52+
pub(super) const fn new(create: F) -> Pool<T, F> {
53+
Pool { stack: Mutex::new(vec![]), create }
54+
}
55+
}
56+
57+
impl<T: Send, F: Fn() -> T> Pool<T, F> {
58+
/// Get a value from the pool. This may block if another thread is also
59+
/// attempting to retrieve a value from the pool.
60+
pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
61+
let mut stack = self.stack.lock().unwrap();
62+
let value = match stack.pop() {
63+
None => Box::new((self.create)()),
64+
Some(value) => value,
65+
};
66+
PoolGuard { pool: self, value: Some(value) }
67+
}
68+
69+
fn put(&self, guard: PoolGuard<'_, T, F>) {
70+
let mut guard = core::mem::ManuallyDrop::new(guard);
71+
if let Some(value) = guard.value.take() {
72+
self.put_value(value);
73+
}
74+
}
75+
76+
/// Puts a value back into the pool. Callers don't need to call this.
77+
/// Once the guard that's returned by 'get' is dropped, it is put back
78+
/// into the pool automatically.
79+
fn put_value(&self, value: Box<T>) {
80+
let mut stack = self.stack.lock().unwrap();
81+
stack.push(value);
82+
}
83+
}
84+
85+
impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
86+
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
87+
f.debug_struct("Pool").field("stack", &self.stack).finish()
88+
}
89+
}
90+
91+
/// A guard that is returned when a caller requests a value from the pool.
92+
pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> {
93+
/// The pool that this guard is attached to.
94+
pool: &'a Pool<T, F>,
95+
/// This is None after the guard has been put back into the pool.
96+
value: Option<Box<T>>,
97+
}
98+
99+
impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
100+
/// Return the underlying value.
101+
pub(super) fn value(&self) -> &T {
102+
self.value.as_deref().unwrap()
103+
}
104+
105+
/// Return the underlying value as a mutable borrow.
106+
pub(super) fn value_mut(&mut self) -> &mut T {
107+
self.value.as_deref_mut().unwrap()
108+
}
109+
110+
/// Consumes this guard and puts it back into the pool.
111+
pub(super) fn put(this: PoolGuard<'_, T, F>) {
112+
// Since this is effectively consuming the guard and putting the
113+
// value back into the pool, there's no reason to run its Drop
114+
// impl after doing this. I don't believe there is a correctness
115+
// problem with doing so, but there's definitely a perf problem
116+
// by redoing this work. So we avoid it.
117+
let mut this = core::mem::ManuallyDrop::new(this);
118+
this.put_imp();
119+
}
120+
121+
/// Puts this guard back into the pool by only borrowing the guard as
122+
/// mutable. This should be called at most once.
123+
#[inline(always)]
124+
fn put_imp(&mut self) {
125+
if let Some(value) = self.value.take() {
126+
self.pool.put_value(value);
127+
}
128+
}
129+
}
130+
131+
impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
132+
fn drop(&mut self) {
133+
self.put_imp();
134+
}
135+
}
136+
137+
impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> {
138+
type Target = T;
139+
140+
fn deref(&self) -> &T {
141+
self.value()
142+
}
143+
}
144+
145+
impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> {
146+
fn deref_mut(&mut self) -> &mut T {
147+
self.value_mut()
148+
}
149+
}
150+
151+
impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug
152+
for PoolGuard<'a, T, F>
153+
{
154+
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
155+
f.debug_struct("PoolGuard")
156+
.field("pool", &self.pool)
157+
.field("value", &self.value)
158+
.finish()
159+
}
160+
}

‎regex-lite/src/string.rs

Lines changed: 1189 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/src/utf8.rs

Lines changed: 445 additions & 0 deletions
Large diffs are not rendered by default.

‎regex-lite/tests/fuzz/mod.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/// Replays a saved fuzzer crash input through `run`. The outcome of `run`
/// is ignored; the test only fails if running the input panics.
#[test]
fn captures_wrong_order() {
    let _ = run(include_bytes!(
        "testdata/crash-a886ce2b0d64963f1232f9b08b8c9ad4740c26f5"
    ));
}
8+
9+
/// Replays a saved, minimized fuzzer input through `run`. The outcome of
/// `run` is ignored; the test only fails if running the input panics.
#[test]
fn captures_wrong_order_min() {
    let _ = run(include_bytes!(
        "testdata/minimized-from-298f84f9dbb2589cb9938a63334fa4083b609f34"
    ));
}
16+
17+
// This is the fuzz target function. We duplicate it here since this is the
18+
// thing we use to interpret the data. It is ultimately what we want to
19+
// succeed.
20+
fn run(data: &[u8]) -> Option<()> {
21+
if data.len() < 2 {
22+
return None;
23+
}
24+
let mut split_at = usize::from(data[0]);
25+
let data = std::str::from_utf8(&data[1..]).ok()?;
26+
// Split data into a regex and haystack to search.
27+
let len = usize::try_from(data.chars().count()).ok()?;
28+
split_at = std::cmp::max(split_at, 1) % len;
29+
let char_index = data.char_indices().nth(split_at)?.0;
30+
let (pattern, input) = data.split_at(char_index);
31+
let re = regex_lite::Regex::new(pattern).ok()?;
32+
re.is_match(input);
33+
Some(())
34+
}
Binary file not shown.

‎regex-lite/tests/lib.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
mod fuzz;
2+
mod string;
3+
4+
/// Test name fragments given to the test runner's `blacklist_iter`, one per
/// unsupported feature.
const BLACKLIST: &[&str] = &[
    // CRLF-aware line anchors aren't supported in regex API yet.
    "crlf",
    // Custom line terminators aren't supported in regex-lite. We could add it,
    // but it didn't seem worth it.
    "line-terminator",
];
11+
12+
fn suite() -> anyhow::Result<regex_test::RegexTests> {
13+
let mut tests = regex_test::RegexTests::new();
14+
macro_rules! load {
15+
($name:expr) => {{
16+
const DATA: &[u8] =
17+
include_bytes!(concat!("../../testdata/", $name, ".toml"));
18+
tests.load_slice($name, DATA)?;
19+
}};
20+
}
21+
22+
load!("anchored");
23+
load!("bytes");
24+
load!("crazy");
25+
load!("crlf");
26+
load!("earliest");
27+
load!("empty");
28+
load!("expensive");
29+
load!("flags");
30+
load!("iter");
31+
load!("leftmost-all");
32+
load!("line-terminator");
33+
load!("misc");
34+
load!("multiline");
35+
load!("no-unicode");
36+
load!("overlapping");
37+
load!("regression");
38+
load!("set");
39+
load!("substring");
40+
load!("unicode");
41+
load!("utf8");
42+
load!("word-boundary");
43+
load!("fowler/basic");
44+
load!("fowler/nullsubexpr");
45+
load!("fowler/repetition");
46+
47+
// Special tests for regex-lite specifically.
48+
load!("regex-lite");
49+
50+
Ok(tests)
51+
}

‎regex-lite/tests/string.rs

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
use {
2+
anyhow::Result,
3+
regex_lite::{Regex, RegexBuilder},
4+
regex_test::{
5+
CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner,
6+
},
7+
};
8+
9+
/// Runs the shared test suite against `regex_lite::Regex`, expanding every
/// test that is expected to compile into `is_match`, `find` and `captures`
/// variants.
#[test]
fn default() -> Result<()> {
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(crate::BLACKLIST)
        .test_iter(crate::suite()?.iter(), compiler)
        .assert();
    Ok(())
}
20+
21+
fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
22+
let hay = match std::str::from_utf8(test.haystack()) {
23+
Ok(hay) => hay,
24+
Err(err) => {
25+
return TestResult::fail(&format!(
26+
"haystack is not valid UTF-8: {}",
27+
err
28+
));
29+
}
30+
};
31+
match test.additional_name() {
32+
"is_match" => TestResult::matched(re.is_match(hay)),
33+
"find" => TestResult::matches(
34+
re.find_iter(hay)
35+
.take(test.match_limit().unwrap_or(std::usize::MAX))
36+
.map(|m| Match {
37+
id: 0,
38+
span: Span { start: m.start(), end: m.end() },
39+
}),
40+
),
41+
"captures" => {
42+
let it = re
43+
.captures_iter(hay)
44+
.take(test.match_limit().unwrap_or(std::usize::MAX))
45+
.map(|caps| testify_captures(&caps));
46+
TestResult::captures(it)
47+
}
48+
name => TestResult::fail(&format!("unrecognized test name: {}", name)),
49+
}
50+
}
51+
52+
/// Converts the given regex test to a closure that searches with a
53+
/// `bytes::Regex`. If the test configuration is unsupported, then a
54+
/// `CompiledRegex` that skips the test is returned.
55+
fn compiler(
56+
test: &RegexTest,
57+
_patterns: &[String],
58+
) -> anyhow::Result<CompiledRegex> {
59+
let Some(pattern) = skip_or_get_pattern(test) else {
60+
return Ok(CompiledRegex::skip());
61+
};
62+
let re = RegexBuilder::new(pattern)
63+
.case_insensitive(test.case_insensitive())
64+
.build()?;
65+
Ok(CompiledRegex::compiled(move |test| run_test(&re, test)))
66+
}
67+
68+
/// Whether we should skip the given test or not. If not, return the single
69+
/// pattern from the given test.
70+
fn skip_or_get_pattern(test: &RegexTest) -> Option<&str> {
71+
// We're only testing Regex here, which supports one pattern only.
72+
let pattern = match test.regexes().len() {
73+
1 => &test.regexes()[0],
74+
_ => return None,
75+
};
76+
// If the test name contains 'regex-lite', then we ALWAYS run it. Because
77+
// those tests are specifically designed for regex-lite. So if they fail,
78+
// then something needs attention.
79+
if test.full_name().contains("regex-lite/") {
80+
return Some(pattern);
81+
}
82+
// If the pattern has a \p in it, then we almost certainly don't support
83+
// it. This probably skips more than we intend, but there are likely very
84+
// few tests that contain a \p that isn't also a Unicode class.
85+
if pattern.contains(r"\p") || pattern.contains(r"\P") {
86+
return None;
87+
}
88+
// Similar deal for Perl classes, but we can abide them if the haystack
89+
// is ASCII-only.
90+
if !test.haystack().is_ascii() {
91+
if pattern.contains(r"\d") || pattern.contains(r"\D") {
92+
return None;
93+
}
94+
if pattern.contains(r"\s") || pattern.contains(r"\S") {
95+
return None;
96+
}
97+
if pattern.contains(r"\w") || pattern.contains(r"\W") {
98+
return None;
99+
}
100+
}
101+
// And also same deal for word boundaries.
102+
if !test.haystack().is_ascii() {
103+
if pattern.contains(r"\b") || pattern.contains(r"\B") {
104+
return None;
105+
}
106+
}
107+
// We only test is_match, find_iter and captures_iter. All of those are
108+
// leftmost searches.
109+
if !matches!(test.search_kind(), regex_test::SearchKind::Leftmost) {
110+
return None;
111+
}
112+
// The top-level single-pattern regex API always uses leftmost-first.
113+
if !matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst) {
114+
return None;
115+
}
116+
// The top-level regex API always runs unanchored searches. ... But we can
117+
// handle tests that are anchored but have only one match.
118+
if test.anchored() && test.match_limit() != Some(1) {
119+
return None;
120+
}
121+
// We don't support tests with explicit search bounds. We could probably
122+
// support this by using the 'find_at' (and such) APIs.
123+
let bounds = test.bounds();
124+
if !(bounds.start == 0 && bounds.end == test.haystack().len()) {
125+
return None;
126+
}
127+
// The Regex API specifically does not support disabling UTF-8 mode because
128+
// it can only search &str which is always valid UTF-8.
129+
if !test.utf8() {
130+
return None;
131+
}
132+
// regex-lite doesn't support Unicode-aware case insensitive matching.
133+
if test.case_insensitive()
134+
&& (!pattern.is_ascii() || !test.haystack().is_ascii())
135+
{
136+
return None;
137+
}
138+
Some(pattern)
139+
}
140+
141+
/// Convert `Captures` into the test suite's capture values.
142+
fn testify_captures(caps: &regex_lite::Captures<'_>) -> regex_test::Captures {
143+
let spans = caps.iter().map(|group| {
144+
group.map(|m| regex_test::Span { start: m.start(), end: m.end() })
145+
});
146+
// This unwrap is OK because we assume our 'caps' represents a match, and
147+
// a match always gives a non-zero number of groups with the first group
148+
// being non-None.
149+
regex_test::Captures::new(0, spans).unwrap()
150+
}

‎testdata/regex-lite.toml

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# These tests are specifically written to test the regex-lite crate. While it
2+
# largely has the same semantics as the regex crate, there are some differences
3+
# around Unicode support and UTF-8.
4+
#
5+
# To be clear, regex-lite supports far fewer patterns because of its lack of
6+
# Unicode support, nested character classes and character class set operations.
7+
# What we're talking about here are the patterns that both crates support but
8+
# where the semantics might differ.
9+
10+
# regex-lite uses ASCII definitions for Perl character classes.
11+
[[test]]
12+
name = "perl-class-decimal"
13+
regex = '\d'
14+
haystack = '᠕'
15+
matches = []
16+
unicode = true
17+
18+
# regex-lite uses ASCII definitions for Perl character classes.
19+
[[test]]
20+
name = "perl-class-space"
21+
regex = '\s'
22+
haystack = "\u2000"
23+
matches = []
24+
unicode = true
25+
26+
# regex-lite uses ASCII definitions for Perl character classes.
27+
[[test]]
28+
name = "perl-class-word"
29+
regex = '\w'
30+
haystack = 'δ'
31+
matches = []
32+
unicode = true
33+
34+
# regex-lite uses the ASCII definition of word for word boundary assertions.
35+
[[test]]
36+
name = "word-boundary"
37+
regex = '\b'
38+
haystack = 'δ'
39+
matches = []
40+
unicode = true
41+
42+
# regex-lite uses the ASCII definition of word for negated word boundary
43+
# assertions. But note that it should still not split codepoints!
44+
[[test]]
45+
name = "word-boundary-negated"
46+
regex = '\B'
47+
haystack = 'δ'
48+
matches = [[0, 0], [2, 2]]
49+
unicode = true
50+
51+
# While we're here, the empty regex---which matches at every
52+
# position---shouldn't split a codepoint either.
53+
[[test]]
54+
name = "empty-no-split-codepoint"
55+
regex = ''
56+
haystack = '💩'
57+
matches = [[0, 0], [4, 4]]
58+
unicode = true
59+
60+
# A dot always matches a full codepoint.
61+
[[test]]
62+
name = "dot-always-matches-codepoint"
63+
regex = '.'
64+
haystack = '💩'
65+
matches = [[0, 4]]
66+
unicode = false
67+
68+
# A negated character class also always matches a full codepoint.
69+
[[test]]
70+
name = "negated-class-always-matches-codepoint"
71+
regex = '[^a]'
72+
haystack = '💩'
73+
matches = [[0, 4]]
74+
unicode = false
75+
76+
# regex-lite only supports ASCII-aware case insensitive matching.
77+
[[test]]
78+
name = "case-insensitive-is-ascii-only"
79+
regex = 's'
80+
haystack = 'ſ'
81+
matches = []
82+
unicode = true
83+
case-insensitive = true
84+
85+
# Negated word boundaries shouldn't split a codepoint, but they will match
86+
# between invalid UTF-8.
87+
#
88+
# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
89+
# regex-lite. This can't happen in the main API because &str can't contain
90+
# invalid UTF-8.
91+
# [[test]]
92+
# name = "word-boundary-invalid-utf8"
93+
# regex = '\B'
94+
# haystack = '\xFF\xFF\xFF\xFF'
95+
# unescape = true
96+
# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
97+
# unicode = true
98+
# utf8 = false

‎testdata/regression.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,3 +662,12 @@ name = "word-boundary-interact-poorly-with-literal-optimizations"
662662
regex = '(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))'
663663
haystack = 'ubi-Darwin-x86_64.tar.gz'
664664
matches = []
665+
666+
# This was found during fuzz testing in regex-lite. The regex crate never
667+
# suffered from this bug, but it causes regex-lite to incorrectly compile
668+
# captures.
669+
[[test]]
670+
name = "captures-wrong-order"
671+
regex = '(a){0}(a)'
672+
haystack = 'a'
673+
matches = [[[0, 1], [], [0, 1]]]

0 commit comments

Comments
 (0)
Please sign in to comment.