Skip to content

Commit 83f375c

Browse files
authored
Revamp sub/3 to resolve most issues with gsub (and sub with "g") (#2641)
The primary purpose of this commit is to rectify most problems with `gsub` (and also `sub` with the `g` option), in particular fix #1425 ('\b'), fix #2354 (lookahead), and fix #2532 (regex == `"^(?!cd ).*$|^cd "`). This commit also partly resolves #2148 and resolves #1206 in that `gsub` no longer loops infinitely; however, because the new `gsub` depends critically on `match/2`, the behavior when regex == `""` is sometimes non-standard. The documentation has been updated to reflect the fact that `sub` and `gsub` are intended to be regular in the second argument, i.e. when the replacement is a stream of strings, a corresponding stream of results is emitted. Also, `_nwise/1` has been tweaked to take advantage of tail-call optimization (TCO).
1 parent edb0d88 commit 83f375c

File tree

5 files changed

+113
-55
lines changed

5 files changed

+113
-55
lines changed

docs/content/manual/manual.yml

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2481,32 +2481,43 @@ sections:
24812481
input: '("ab,cd", "ef, gh")'
24822482
output: ['"ab"', '"cd"', '"ef"', '"gh"']
24832483

2484-
- title: "`sub(regex; tostring)`, `sub(regex; string; flags)`"
2484+
- title: "`sub(regex; tostring)`, `sub(regex; tostring; flags)`"
24852485
body: |
24862486
2487-
Emit the string obtained by replacing the first match of regex in the
2488-
input string with `tostring`, after interpolation. `tostring` should
2489-
be a jq string, and may contain references to named captures. The
2490-
named captures are, in effect, presented as a JSON object (as
2491-
constructed by `capture`) to `tostring`, so a reference to a captured
2492-
variable named "x" would take the form: `"\(.x)"`.
2487+
Emit the string obtained by replacing the first match of
2488+
regex in the input string with `tostring`, after
2489+
interpolation. `tostring` should be a jq string or a stream
2490+
of such strings, each of which may contain references to
2491+
named captures. The named captures are, in effect, presented
2492+
as a JSON object (as constructed by `capture`) to
2493+
`tostring`, so a reference to a captured variable named "x"
2494+
would take the form: `"\(.x)"`.
24932495
24942496
example:
2495-
- program: 'sub("^[^a-z]*(?<x>[a-z]*).*")'
2496-
input: '"123abc456"'
2497-
output: '"ZabcZabc"'
24982497

2498+
- program: 'sub("[^a-z]*(?<x>[a-z]*)"; "Z\(.x)"; "g")'
2499+
input: '"123abc456def"'
2500+
output: ['"ZabcZdef"']
24992501

2500-
- title: "`gsub(regex; string)`, `gsub(regex; string; flags)`"
2502+
- program: '[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]'
2503+
input: '"aB"'
2504+
output: ['["AB","aB"]']
2505+
2506+
- title: "`gsub(regex; tostring)`, `gsub(regex; tostring; flags)`"
25012507
body: |
25022508
25032509
`gsub` is like `sub` but all the non-overlapping occurrences of the regex are
2504-
replaced by the string, after interpolation.
2510+
replaced by `tostring`, after interpolation. If the second argument is a stream
2511+
of jq strings, then `gsub` will produce a corresponding stream of JSON strings.
25052512
25062513
example:
25072514
- program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
25082515
input: '"Abcabc"'
2509-
output: '"+A-+a-"'
2516+
output: ['"+A-+a-"']
2517+
2518+
- program: '[gsub("p"; "a", "b")]'
2519+
input: '"p"'
2520+
output: ['["a","b"]']
25102521

25112522

25122523
- title: Advanced features

src/builtin.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -930,7 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) {
930930
match = jv_object_set(match, jv_string("string"), jv_string(""));
931931
match = jv_object_set(match, jv_string("captures"), jv_array());
932932
result = jv_array_append(result, match);
933-
start += 1;
933+
// ensure '"qux" | match("(?=u)"; "g")' matches just once
934+
start = (const UChar*)(input_string+region->end[0]+1);
934935
continue;
935936
}
936937

src/builtin.jq

Lines changed: 21 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,10 @@ def scan(re):
9999
#
100100
# If input is an array, then emit a stream of successive subarrays of length n (or less),
101101
# and similarly for strings.
102-
def _nwise(a; $n): if a|length <= $n then a else a[0:$n] , _nwise(a[$n:]; $n) end;
103-
def _nwise($n): _nwise(.; $n);
102+
def _nwise($n):
103+
def n: if length <= $n then . else .[0:$n] , (.[$n:] | n) end;
104+
n;
105+
def _nwise(a; $n): a | _nwise($n);
104106
#
105107
# splits/1 produces a stream; split/1 is retained for backward compatibility.
106108
def splits($re; flags): . as $s
@@ -114,47 +116,26 @@ def splits($re): splits($re; null);
114116
# split emits an array for backward compatibility
115117
def split($re; flags): [ splits($re; flags) ];
116118
#
117-
# If s contains capture variables, then create a capture object and pipe it to s
118-
def sub($re; s):
119-
. as $in
120-
| [match($re)]
121-
| if length == 0 then $in
122-
else .[0]
123-
| . as $r
124-
# # create the "capture" object:
125-
| reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
126-
({}; . + $pair)
127-
| $in[0:$r.offset] + s + $in[$r.offset+$r.length:]
128-
end ;
129-
#
130-
# If s contains capture variables, then create a capture object and pipe it to s
131-
def sub($re; s; flags):
132-
def subg: [explode[] | select(. != 103)] | implode;
133-
# "fla" should be flags with all occurrences of g removed; gs should be non-nil if flags has a g
134-
def sub1(fla; gs):
135-
def mysub:
136-
. as $in
137-
| [match($re; fla)]
138-
| if length == 0 then $in
139-
else .[0] as $edit
140-
| ($edit | .offset + .length) as $len
141-
# create the "capture" object:
142-
| reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
143-
({}; . + $pair)
144-
| $in[0:$edit.offset]
145-
+ s
146-
+ ($in[$len:] | if length > 0 and gs then mysub else . end)
147-
end ;
148-
mysub ;
149-
(flags | index("g")) as $gs
150-
| (flags | if $gs then subg else . end) as $fla
151-
| sub1($fla; $gs);
119+
# If s contains capture variables, then create a capture object and pipe it to s, bearing
120+
# in mind that s could be a stream
121+
def sub($re; s; $flags):
122+
. as $in
123+
| (reduce match($re; $flags) as $edit
124+
({result: [], previous: 0};
125+
$in[ .previous: ($edit | .offset) ] as $gap
126+
# create the "capture" objects (one per item in s)
127+
| [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
128+
({}; . + $pair) | s ] as $inserts
129+
| reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix])
130+
| .previous = ($edit | .offset + .length ) )
131+
| .result[] + $in[.previous:] )
132+
// $in;
152133
#
153134
def sub($re; s): sub($re; s; "");
154-
# repeated substitution of re (which may contain named captures)
135+
#
155136
def gsub($re; s; flags): sub($re; s; flags + "g");
156137
def gsub($re; s): sub($re; s; "g");
157-
138+
#
158139
########################################################################
159140
# generic iterator/generator
160141
def while(cond; update):
@@ -206,7 +187,7 @@ def transpose:
206187
| length as $length
207188
| reduce range(0; $max) as $j
208189
([]; . + [reduce range(0;$length) as $i ([]; . + [ $in[$i][$j] ] )] )
209-
end;
190+
end;
210191
def in(xs): . as $x | xs | has($x);
211192
def inside(xs): . as $x | xs | contains($x);
212193
def repeat(exp):
@@ -237,7 +218,6 @@ def tostream:
237218
getpath($p) |
238219
reduce path(.[]?) as $q ([$p, .]; [$p+$q]);
239220

240-
241221
# Assuming the input array is sorted, bsearch/1 returns
242222
# the index of the target if the target is in the input array; and otherwise
243223
# (-1 - ix), where ix is the insertion point that would leave the array sorted.

tests/jq.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1731,3 +1731,4 @@ false
17311731
. |= try . catch .
17321732
1
17331733
1
1734+

tests/onig.test

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,45 @@ gsub( "(.*)"; ""; "x")
7575
""
7676
""
7777

78+
gsub( ""; "a"; "g")
79+
""
80+
"a"
81+
82+
gsub( "^"; ""; "g")
83+
"a"
84+
"a"
85+
86+
87+
# The following is a regression test and should not be construed as a requirement other than that execution should terminate:
88+
gsub( ""; "a"; "g")
89+
"a"
90+
"aa"
91+
92+
gsub( "$"; "a"; "g")
93+
"a"
94+
"aa"
95+
96+
gsub( "^"; "a")
97+
""
98+
"a"
99+
100+
gsub("(?=u)"; "u")
101+
"qux"
102+
"quux"
103+
104+
gsub("^.*a"; "b")
105+
"aaa"
106+
"b"
107+
108+
gsub("^.*?a"; "b")
109+
"aaa"
110+
"baa"
111+
112+
# The following is for regression testing and should not be construed as a requirement:
113+
[gsub("a"; "b", "c")]
114+
"a"
115+
["b","c"]
116+
78117
[.[] | scan(", ")]
79118
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
80119
[", ",", ",", ",", ",", ",", ",", ",", "]
@@ -92,7 +131,33 @@ gsub("(?<x>.)[^a]*"; "+\(.x)-")
92131
"Abcabc"
93132
"+A-+a-"
94133

134+
gsub("(?<x>.)(?<y>[0-9])"; "\(.x|ascii_downcase)\(.y)")
135+
"A1 B2 CD"
136+
"a1 b2 CD"
137+
138+
gsub("\\b(?<x>.)"; "\(.x|ascii_downcase)")
139+
"ABC DEF"
140+
"aBC dEF"
141+
95142
# utf-8
96143
sub("(?<x>.)"; "\(.x)!")
97144
"’"
98145
"’!"
146+
147+
[sub("a"; "b", "c")]
148+
"a"
149+
["b","c"]
150+
151+
[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
152+
"aB"
153+
["AB","aB","cB"]
154+
155+
[gsub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
156+
"aB"
157+
["AB","ab","cc"]
158+
159+
# splits and _nwise
160+
[splits("")]
161+
"ab"
162+
["","a","b"]
163+

0 commit comments

Comments
 (0)