Skip to content

Commit c46559e

Browse files
remove RevString; efficient generic reverseind
These seem unrelated, but they're actually linked: * If you reverse generic strings by wrapping them in `RevString` then then this generic `reverseind` is incorrect. * In order to have a correct generic `reverseind` one needs to assume that `reverse(s)` returns a string of the same type and encoding as `s` with code points in reverse order; one also needs to assume that the code units encoding each character remain the same when reversed. This is a valid assumption for UTF-8, UTF-16 and (trivially) UTF-32. Reverse string search functions are pretty messed up by this and I've fixed them well enough to work but they may be quite inefficient for long strings now. I'm not going to spend too much time on this since there's other work going on to generalize and unify searching APIs. Close #22611 Close #24613 See also: #10593 #23612 #24103
1 parent c2feee7 commit c46559e

File tree

13 files changed

+91
-145
lines changed

13 files changed

+91
-145
lines changed

NEWS.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,12 @@ This section lists changes that do not have deprecation warnings.
295295
`AbstractArray` types that specialized broadcasting using the old internal API will
296296
need to switch to the new API. ([#20740])
297297

298+
* The `RevString` type has been removed from the language; `reverse(::String)` returns
299+
a `String` with code points (or fragments thereof) in reverse order. In general,
300+
`reverse(s)` should return a string of the same type and encoding as `s` with code
301+
points in reverse order; any string type overrides `reverse` to return a different
302+
type of string must also override `reverseind` to compute reversed indices correctly.
303+
298304
Library improvements
299305
--------------------
300306

@@ -397,6 +403,11 @@ Library improvements
397403
The generic definition is constant time but calls `endof(s)` which may be inefficient.
398404
Therefore custom string types may want to define direct `ncodeunits` methods.
399405

406+
* `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so
407+
custom string types do not need to provide their own efficient defintions. The generic
408+
definition relies on `ncodeunits` however, so for optimal performance you may need to
409+
define a custom method for that function.
410+
400411
Compiler/Runtime improvements
401412
-----------------------------
402413

base/exports.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ export
8888
Rational,
8989
Regex,
9090
RegexMatch,
91-
RevString,
9291
RoundFromZero,
9392
RoundDown,
9493
RoundingMode,

base/precompile.jl

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState,
578578
precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState})
579579
precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState})
580580
precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String})
581-
precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64})
582-
precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64})
583-
precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64})
584581
precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64})
585582
precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String})
586583
precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}})

base/repl/REPLCompletions.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ end
225225
# closed start brace from the end of the string.
226226
function find_start_brace(s::AbstractString; c_start='(', c_end=')')
227227
braces = 0
228-
r = RevString(s)
228+
r = reverse(s)
229229
i = start(r)
230230
in_single_quotes = false
231231
in_double_quotes = false
@@ -259,7 +259,7 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')')
259259
braces == 1 && break
260260
end
261261
braces != 1 && return 0:-1, -1
262-
method_name_end = reverseind(r, i)
262+
method_name_end = reverseind(s, i)
263263
startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end))
264264
return (startind:endof(s), method_name_end)
265265
end

base/shell.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true;
1414
special::AbstractString="")
1515
s = lstrip(str)
1616
# strips the end but respects the space when the string ends with "\\ "
17-
r = RevString(s)
17+
r = reverse(s)
1818
i = start(r)
1919
c_old = nothing
2020
while !done(r,i)

base/strings/search.jl

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,6 @@ end
194194
search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i)
195195
search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i)
196196

197-
function rsearch(s::AbstractString, c::Chars)
198-
j = search(RevString(s), c)
199-
j == 0 && return 0
200-
endof(s)-j+1
201-
end
202-
203197
"""
204198
rsearch(s::AbstractString, chars::Chars, [start::Integer])
205199
@@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b")
212206
6:6
213207
```
214208
"""
215-
function rsearch(s::AbstractString, c::Chars, i::Integer)
216-
e = endof(s)
217-
j = search(RevString(s), c, e-i+1)
218-
j == 0 && return 0
219-
e-j+1
209+
function rsearch(s::AbstractString, c::Chars, i::Integer=start(s))
210+
if i < 1
211+
return i == 0 ? 0 : throw(BoundsError(s, i))
212+
end
213+
n = ncodeunits(s)
214+
if i > n
215+
return i == n+1 ? 0 : throw(BoundsError(s, i))
216+
end
217+
# r[reverseind(r,i)] == reverse(r)[i] == s[i]
218+
# s[reverseind(s,j)] == reverse(s)[j] == r[j]
219+
r = reverse(s)
220+
j = search(r, c, reverseind(r, i))
221+
j == 0 ? 0 : reverseind(s, j)
220222
end
221223

222224
function _rsearchindex(s, t, i)
223225
if isempty(t)
224-
return 1 <= i <= nextind(s,endof(s)) ? i :
226+
return 1 <= i <= nextind(s, endof(s)) ? i :
225227
throw(BoundsError(s, i))
226228
end
227-
t = RevString(t)
228-
rs = RevString(s)
229+
t = reverse(t)
230+
rs = reverse(s)
229231
l = endof(s)
230-
t1, j2 = next(t,start(t))
232+
t1, j2 = next(t, start(t))
231233
while true
232-
i = rsearch(s,t1,i)
233-
if i == 0 return 0 end
234-
c, ii = next(rs,l-i+1)
234+
i = rsearch(s, t1, i)
235+
i == 0 && return 0
236+
c, ii = next(rs, reverseind(rs, i))
235237
j = j2; k = ii
236238
matched = true
237-
while !done(t,j)
238-
if done(rs,k)
239+
while !done(t, j)
240+
if done(rs, k)
239241
matched = false
240242
break
241243
end
242-
c, k = next(rs,k)
243-
d, j = next(t,j)
244+
c, k = next(rs, k)
245+
d, j = next(t, j)
244246
if c != d
245247
matched = false
246248
break
247249
end
248250
end
249-
if matched
250-
return nextind(s,l-k+1)
251-
end
252-
i = l-ii+1
251+
matched && return nextind(s, reverseind(s, k))
252+
i = reverseind(s, ii)
253253
end
254254
end
255255

base/strings/string.jl

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,6 @@ function first_utf8_byte(ch::Char)
295295
return b
296296
end
297297

298-
function reverseind(s::String, i::Integer)
299-
j = sizeof(s) + 1 - i
300-
@inbounds while is_valid_continuation(codeunit(s, j))
301-
j -= 1
302-
end
303-
return j
304-
end
305-
306298
## overload methods for efficiency ##
307299

308300
isvalid(s::String, i::Integer) =
@@ -477,38 +469,6 @@ function string(a::Union{String,Char}...)
477469
return out
478470
end
479471

480-
function reverse(s::String)
481-
dat = Vector{UInt8}(s)
482-
n = length(dat)
483-
n <= 1 && return s
484-
buf = StringVector(n)
485-
out = n
486-
pos = 1
487-
@inbounds while out > 0
488-
ch = dat[pos]
489-
if ch > 0xdf
490-
if ch < 0xf0
491-
(out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
492-
buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
493-
pos += 3
494-
else
495-
(out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
496-
buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
497-
pos += 4
498-
end
499-
elseif ch > 0x7f
500-
(out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
501-
buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
502-
pos += 2
503-
else
504-
buf[out] = ch
505-
out -= 1
506-
pos += 1
507-
end
508-
end
509-
String(buf)
510-
end
511-
512472
function repeat(s::String, r::Integer)
513473
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
514474
n = sizeof(s)

base/strings/strings.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

33
include("strings/errors.jl")
4-
include("strings/types.jl")
4+
include("strings/substring.jl")
55
include("strings/basic.jl")
66
include("strings/search.jl")
77
include("strings/util.jl")

base/strings/types.jl renamed to base/strings/substring.jl

Lines changed: 19 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

3-
# SubString and RevString types
4-
5-
## substrings reference original strings ##
6-
73
"""
84
SubString(s::AbstractString, i::Integer, j::Integer=endof(s))
95
SubString(s::AbstractString, r::UnitRange{<:Integer})
@@ -51,6 +47,9 @@ end
5147
SubString(s::AbstractString) = SubString(s, 1, endof(s))
5248
SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s))
5349

50+
convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} =
51+
SubString(convert(S, s))
52+
5453
String(p::SubString{String}) =
5554
unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1)
5655

@@ -123,31 +122,15 @@ function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int
123122
convert(Ptr{R}, pointer(s.string)) + s.offset
124123
end
125124

126-
## reversed strings without data movement ##
127-
128-
struct RevString{T<:AbstractString} <: AbstractString
129-
string::T
130-
end
131-
132-
endof(s::RevString) = endof(s.string)
133-
length(s::RevString) = length(s.string)
134-
sizeof(s::RevString) = sizeof(s.string)
135-
136-
function next(s::RevString, i::Int)
137-
n = endof(s); j = n-i+1
138-
(s.string[j], n-prevind(s.string,j)+1)
139-
end
140-
141125
"""
142126
reverse(s::AbstractString) -> AbstractString
143127
144-
Reverses a string.
145-
146-
Technically, this function reverses the codepoints in a string, and its
128+
Reverses a string. Technically, this function reverses the codepoints in a string and its
147129
main utility is for reversed-order string processing, especially for reversed
148-
regular-expression searches. See also [`reverseind`](@ref) to convert indices
149-
in `s` to indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref)
150-
to operate on user-visible "characters" (graphemes) rather than codepoints.
130+
regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to
131+
indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) to operate on user-visible
132+
"characters" (graphemes) rather than codepoints.
133+
151134
See also [`Iterators.reverse`](@ref) for reverse-order iteration without making a copy.
152135
153136
# Examples
@@ -162,10 +145,16 @@ julia> join(reverse(collect(graphemes("ax̂e")))) # reverses graphemes
162145
"ex̂a"
163146
```
164147
"""
165-
reverse(s::AbstractString) = RevString(s)
166-
reverse(s::RevString) = s.string
167-
168-
## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]
148+
function reverse(s::S)::S where {S<:AbstractString}
149+
# TODO: generic API for sprint to a particular encoding
150+
sprint() do io
151+
i, j = start(s), endof(s)
152+
while i j
153+
c, j = s[j], prevind(s, j)
154+
write(io, c)
155+
end
156+
end
157+
end
169158

170159
"""
171160
reverseind(v, i)
@@ -185,10 +174,7 @@ julia> for i in 1:length(r)
185174
Julia
186175
```
187176
"""
188-
reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i))
189-
reverseind(s::RevString, i::Integer) = endof(s) - i + 1
190-
reverseind(s::SubString{String}, i::Integer) =
191-
reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset
177+
reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
192178

193179
"""
194180
repeat(s::AbstractString, r::Integer)

base/strings/util.jl

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -190,16 +190,15 @@ julia> rstrip(a)
190190
```
191191
"""
192192
function rstrip(s::AbstractString, chars::Chars=_default_delims)
193-
r = RevString(s)
194-
i = start(r)
195-
while !done(r,i)
196-
c, j = next(r,i)
197-
if !(c in chars)
198-
return SubString(s, 1, endof(s)-i+1)
199-
end
193+
a = start(s)
194+
i = endof(s)
195+
while a  i
196+
c = s[i]
197+
j = prevind(s, i)
198+
c in chars || return SubString(s, 1:i)
200199
i = j
201200
end
202-
SubString(s, 1, 0)
201+
SubString(s, a, a-1)
203202
end
204203

205204
"""

test/replcompletions.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
using Base.REPLCompletions
44

55
let ex = quote
6-
module CompletionFoo
6+
module CompletionFoo
77
mutable struct Test_y
88
yy
99
end

test/strings/types.jl

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

3-
## SubString, RevString and Cstring tests ##
3+
## SubString and Cstring tests ##
44

55
## SubString tests ##
66
u8str = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"
@@ -207,31 +207,13 @@ let s = "|η(α)-ϕ(κ)| < ε"
207207
@test length(SubString(s,4,11))==length(s[4:11])
208208
end
209209

210-
## Reverse strings ##
211-
212-
let rs = RevString("foobar")
213-
@test length(rs) == 6
214-
@test sizeof(rs) == 6
215-
@test isascii(rs)
216-
end
217-
218-
# issue #4586
219-
@test rsplit(RevString("ailuj"),'l') == ["ju","ia"]
220-
@test parse(Float64,RevString("64")) === 46.0
221-
222-
# reverseind
223-
for T in (String, GenericString)
210+
@testset "reverseind" for T in (String, SubString, GenericString)
224211
for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
225212
for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
226213
for c in ('X', 'δ', '\U0001d6a5')
227214
s = convert(T, string(prefix, c, suffix))
228215
r = reverse(s)
229216
ri = search(r, c)
230-
@test r == RevString(s)
231-
@test c == s[reverseind(s, ri)] == r[ri]
232-
s = RevString(s)
233-
r = reverse(s)
234-
ri = search(r, c)
235217
@test c == s[reverseind(s, ri)] == r[ri]
236218
s = convert(T, string(prefix, prefix, c, suffix, suffix))
237219
pre = convert(T, prefix)
@@ -244,6 +226,18 @@ for T in (String, GenericString)
244226
end
245227
end
246228

229+
@testset "reverseind of empty strings" begin
230+
for s in ("",
231+
SubString("", 1, 0),
232+
SubString("ab", 1, 0),
233+
SubString("ab", 2, 1),
234+
SubString("ab", 3, 2),
235+
GenericString(""))
236+
@test reverseind(s, 0) == 1
237+
@test reverseind(s, 1) == 0
238+
end
239+
end
240+
247241
## Cstring tests ##
248242

249243
# issue #13974: comparison against pointers

0 commit comments

Comments
 (0)