Skip to content

Commit fdb0b35

Browse files
committed
Fix JuliaLang#10959 bugs with UTF-16 conversions
Rewrote a number of the conversions between ASCIIString, UTF8String, and UTF16String. Rewrote length() for UTF16String. Improved reverse() for UTF16String. Added over 150 lines of testing code to detect the above conversion problems. Added (in a gist) code to show other conversion problems not yet fixed (https://gist.github.com/ScottPJones/4e6e8938f0559998f9fc). Added (in a gist) code to benchmark the performance, to ensure that adding the extra validity checking did not adversely affect performance; in fact, performance was greatly improved (https://gist.github.com/ScottPJones/79ed895f05f85f333d84). Updated based on review comments. Changes to error handling and check_string. Rebased against JuliaLang#11575. Updated comment to go before function, not indented by 4. Updated to use unsafe_checkstring. Removed redundant argument documentation.
1 parent 1e081b7 commit fdb0b35

File tree

4 files changed

+371
-66
lines changed

4 files changed

+371
-66
lines changed

base/utf16.jl

Lines changed: 225 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,42 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3-
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
4-
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
5-
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
6-
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
3+
# Quickly copy and set trailing \0
4+
# Allocate a `len+1` element buffer of `T`, fill it from `dat`, store 0 in the
# final slot and wrap the buffer in `S`.
# `flag == true` copies only `len` elements from `dat`; otherwise `len+1` elements
# are copied before the final slot is overwritten with 0.
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
                      ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
    ncopy = flag ? len : len+1
    buf = Vector{T}(len+1)
    copy!(buf, 1, dat, 1, ncopy)
    buf[len+1] = 0
    S(buf)
end
8+
9+
# Get rest of character ch from 3-byte UTF-8 sequence in dat
10+
@inline function get_utf8_3byte(dat, pos, ch)
    # `ch` is the lead byte; `pos` indexes the last byte of the sequence in `dat`.
    @inbounds begin
        top = (ch & 0xf) << 12                  # low 4 bits of the lead byte
        mid = UInt32(dat[pos-1] & 0x3f) << 6    # 6 payload bits of the 2nd byte
        low = dat[pos] & 0x3f                   # 6 payload bits of the 3rd byte
    end
    return top | mid | low
end
13+
# Get rest of character ch from 4-byte UTF-8 sequence in dat
14+
@inline function get_utf8_4byte(dat, pos, ch)
    # `ch` is the lead byte; `pos` indexes the last byte of the sequence in `dat`.
    @inbounds begin
        b18 = (ch & 0x7) << 18                  # low 3 bits of the lead byte
        b12 = UInt32(dat[pos-2] & 0x3f) << 12   # 6 payload bits of the 2nd byte
        b6  = UInt32(dat[pos-1] & 0x3f) << 6    # 6 payload bits of the 3rd byte
        b0  = dat[pos] & 0x3f                   # 6 payload bits of the 4th byte
    end
    return b18 | b12 | b6 | b0
end
20+
21+
# Output a character as a 4-byte UTF-8 sequence
22+
@inline function output_utf8_4byte!(buf, out, ch)
    # Split `ch` into a lead byte (0xf0 prefix) and three continuation bytes
    # (0x80 prefix, 6 payload bits each), then store them at buf[out+1 .. out+4].
    b1 = 0xf0 | (ch >>> 18)
    b2 = 0x80 | ((ch >>> 12) & 0x3f)
    b3 = 0x80 | ((ch >>> 6) & 0x3f)
    b4 = 0x80 | (ch & 0x3f)
    @inbounds begin
        buf[out + 1] = b1
        buf[out + 2] = b2
        buf[out + 3] = b3
        buf[out + 4] = b4
    end
end
30+
31+
# Shared UTF16String whose data holds a single 0 code unit and nothing else
const empty_utf16 = UTF16String(UInt16[0])
732

833
# Count the code units of `s` (ignoring the final element of `s.data`) that are
# not trail surrogates.
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1
    nunits == 0 && return 0
    nchars = 0
    @inbounds for idx = 1:nunits
        if !is_surrogate_trail(units[idx])
            nchars += 1
        end
    end
    return nchars
end
@@ -20,100 +45,240 @@ function endof(s::UTF16String)
2045
d = s.data
2146
i = length(d) - 1
2247
i == 0 && return i
23-
utf16_is_surrogate(d[i]) ? i-1 : i
48+
return is_surrogate_codeunit(d[i]) ? i-1 : i
2449
end
2550

51+
# Combine a lead/trail surrogate pair into a single code point value.
# The 0xd7f7 bias differs from 0xd800 by 9, and 9 << 10 == 0x2400 == 0x10000 - 0xdc00,
# so the plane offset and the trail's 0xdc00 base are folded into one subtraction.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end
52+
2653
# Iteration step: return the character that starts at code unit index `i`
# together with the index of the code unit following it.
# Throws `UnicodeError` when the unit at `i` is a surrogate that does not begin
# a complete lead/trail pair.
function next(s::UTF16String, i::Int)
    ch = s.data[i]
    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
    # check length, account for terminating \0
    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
    ct = s.data[i+1]
    # Throw a UnicodeError like the other failure paths; a bare tuple was thrown here before
    !is_surrogate_trail(ct) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, i, ch))
    Char(get_supplementary(ch, ct)), i+2
end
3463

3564
# Map index `i` to the mirrored position `length(s.data) - i`; when the unit at
# that position is a trail surrogate, step back one more unit.
function reverseind(s::UTF16String, i::Integer)
    mirrored = length(s.data) - i
    if is_surrogate_trail(s.data[mirrored])
        return mirrored - 1
    else
        return mirrored
    end
end
3968

4069
# Index of the last code unit of `s`
lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
4170

4271
# Build a new UTF16String whose code units are those of `s` in reverse order,
# keeping each lead surrogate ahead of the unit that followed it in `s`.
function reverse(s::UTF16String)
    src = s.data
    out = similar(src)
    out[end] = 0 # NULL termination
    n = length(src)
    @inbounds for i = 1:n-1
        unit = src[n-i]
        if !is_surrogate_lead(unit)
            out[i] = unit
        else
            # the unit that followed this lead in `s` was stored at i-1 on the
            # previous step; move it up and put the lead in front of it
            out[i] = out[i-1]
            out[i-1] = unit
        end
    end
    return UTF16String(out)
end
86+
87+
# Byte size of the code unit data, not counting one `UInt16` (s.data carries a terminator)
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
88+
89+
# Return `true` when `data` contains no unpaired surrogate code units:
# every surrogate must be a lead immediately followed by a trail.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    i = 1
    @inbounds while i < n
        cu = data[i]
        if is_surrogate_lead(cu) && is_surrogate_trail(data[i+1])
            i += 2  # complete pair, skip both units
            continue
        end
        is_surrogate_codeunit(cu) && return false
        i += 1
    end
    # either everything was consumed, or exactly one unit is left to check
    i > n && return true
    return !is_surrogate_codeunit(data[i])
end
55103

56-
# TODO: optimize this
57-
function encode16(s::AbstractString)
58-
buf = UInt16[]
59-
for ch in s
60-
c = reinterpret(UInt32, ch)
104+
"
105+
Converts an `AbstractString` to a `UTF16String`
106+
107+
### Returns:
108+
* `UTF16String`
109+
110+
### Throws:
111+
* `UnicodeError`
112+
"
113+
function convert(::Type{UTF16String}, str::AbstractString)
    # Buffer holds one code unit per `len`, one extra per `num4byte` (these are
    # written as surrogate pairs), plus one for the terminator.
    len, flags, num4byte = unsafe_checkstring(str)
    buf = Vector{UInt16}(len+num4byte+1)
    out = 0
    @inbounds for ch in str
        c = UInt32(ch)
        if c < 0x10000
            buf[out += 1] = UInt16(c)
        else
            # output surrogate pair; do the bit arithmetic on the UInt32 value `c`
            # (as the branch above does), not on the `Char` itself
            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
        end
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF16String(buf)
end
73130

74-
utf16(x) = convert(UTF16String, x)
75-
convert(::Type{UTF16String}, s::UTF16String) = s
76-
convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
77-
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
78-
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
131+
"
132+
Converts a `UTF8String` to a `UTF16String`
79133
80-
# TODO: optimize this
81-
convert(::Type{UTF8String}, s::UTF16String) =
82-
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
134+
### Returns:
135+
* `UTF16String`
83136
84-
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
85-
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
86-
convert(Ptr{T}, pointer(s))
137+
### Throws:
138+
* `UnicodeError`
139+
"
140+
function convert(::Type{UTF16String}, str::UTF8String)
    bytes = str.data
    # empty input: hand back the shared empty string
    sizeof(bytes) == 0 && return empty_utf16
    # validate the UTF-8 data and work out how many 16-bit units are needed
    nchars, flags, num4byte = unsafe_checkstring(bytes)
    nunits = nchars + num4byte
    buf = Vector{UInt16}(nunits+1)
    @inbounds buf[nunits+1] = 0
    # no flags set: copy the bytes straight across
    if flags == 0
        @inbounds return UTF16String(copy!(buf, bytes))
    end
    widx = 0    # last index written in buf
    ridx = 0    # last index read from bytes
    @inbounds while widx < nunits
        ridx += 1
        lead::UInt32 = bytes[ridx]
        if lead <= 0x7f
            # ASCII
            widx += 1
            buf[widx] = lead
        elseif lead < 0xe0
            # 2-byte sequence, range 0x80-0x7ff
            ridx += 1
            widx += 1
            buf[widx] = ((lead & 0x1f) << 6) | (bytes[ridx] & 0x3f)
        elseif lead < 0xf0
            # 3-byte sequence, range 0x800-0xffff
            ridx += 2
            widx += 1
            buf[widx] = get_utf8_3byte(bytes, ridx, lead)
        else
            # 4-byte sequence, range 0x10000-0x10ffff: emit a surrogate pair
            ridx += 3
            lead = get_utf8_4byte(bytes, ridx, lead)
            buf[widx + 1] = UInt16(0xd7c0 + (lead >>> 10))
            buf[widx + 2] = UInt16(0xdc00 + (lead & 0x3ff))
            widx += 2
        end
    end
    return UTF16String(buf)
end
87176

88-
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
89-
i = 1
90-
n = length(data) # this may include NULL termination; that's okay
91-
while i < n # check for unpaired surrogates
92-
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
93-
i += 2
94-
elseif utf16_is_surrogate(data[i])
95-
return false
177+
"
178+
Converts a UTF-16 encoded vector of `UInt16` to a `UTF8String`
179+
180+
### Returns:
181+
* `UTF8String`
182+
183+
### Throws:
184+
* `UnicodeError`
185+
"
186+
function convert(::Type{UTF8String}, dat::Vector{UInt16})
    # `sizeof` is in bytes; it is halved below to get the number of UInt16 units
    len = sizeof(dat)
    # handle zero length string quickly
    # (name fixed: was misspelled `emtpy_utf8`, which is not defined)
    len == 0 && return empty_utf8
    # get number of bytes to allocate
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len>>>1)
    # no flags set: each unit is copied into one byte
    flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
    # output size: one byte per `len`, plus the extra bytes of multi-byte sequences
    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
end
195+
196+
"
197+
Converts a `UTF16String` to a `UTF8String`
198+
199+
### Returns:
200+
* `UTF8String`
201+
202+
### Throws:
203+
* `UnicodeError`
204+
"
205+
function convert(::Type{UTF8String}, str::UTF16String)
    units = str.data
    # number of UInt16 elements in the data, terminator included
    nunits = sizeof(units) >>> 1
    # nothing but (at most) the terminator: hand back the shared empty string
    nunits <= 1 && return empty_utf8
    # validate everything before the terminator and collect the counts used for sizing
    nchars, flags, num4byte, num3byte, num2byte = unsafe_checkstring(units, 1, nunits-1)
    if flags == 0
        # no flags set: each unit is copied into one byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, units, 1, nchars))
    end
    nbytes = nchars + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt16, units, nbytes)
end
215+
216+
"
217+
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
218+
219+
### Input Arguments:
220+
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
221+
* `len` length of output in bytes
222+
223+
### Returns:
224+
* `UTF8String`
225+
"
226+
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
    # Conversion stops once `len` bytes have been produced, so code units past
    # that point in `dat` are never read.
    buf = Vector{UInt8}(len)
    widx = 0    # last index written in buf
    ridx = 0    # last index read from dat
    @inbounds while widx < len
        ridx += 1
        ch::UInt32 = dat[ridx]
        if ch <= 0x7f
            # ASCII: one byte
            widx += 1
            buf[widx] = ch
        elseif ch < 0x800
            # 0x80-0x7ff: two bytes
            buf[widx + 1] = 0xc0 | (ch >>> 6)
            buf[widx + 2] = 0x80 | (ch & 0x3f)
            widx += 2
        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
            # 0x10000-0x10ffff given directly: four bytes
            output_utf8_4byte!(buf, widx, ch)
            widx += 4
        elseif is_surrogate_codeunit(ch)
            # surrogate pair: consume the next unit as well, four bytes
            ridx += 1
            output_utf8_4byte!(buf, widx, get_supplementary(ch, dat[ridx]))
            widx += 4
        else
            # 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters: three bytes
            buf[widx + 1] = 0xe0 | ((ch >>> 12) & 0x3f)
            buf[widx + 2] = 0x80 | ((ch >>> 6) & 0x3f)
            buf[widx + 3] = 0x80 | (ch & 0x3f)
            widx += 3
        end
    end
    return UTF8String(buf)
end
102256

103-
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
104-
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
105-
len = length(data)
106-
d = Array(UInt16, len + 1)
107-
d[end] = 0 # NULL terminate
108-
UTF16String(copy!(d,1, data,1, len))
257+
# Convert an ASCIIString by copying its data, element for element, into a
# terminated UInt16 buffer.
function convert(::Type{UTF16String}, str::ASCIIString)
    bytes = str.data
    nbytes = length(bytes)
    @inbounds return fast_utf_copy(UTF16String, UInt16, nbytes, bytes, true)
end
110261

262+
# Returns the string's own `data` vector (no copy is made here)
convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
263+
# Returns the string's own `data` array (no copy is made here)
convert(::Type{Array{UInt16}}, str::UTF16String) = str.data
264+
265+
# Identity conversion: a UTF16String is returned as-is
convert(::Type{UTF16String}, str::UTF16String) = str
266+
267+
# Produce a `Ptr{T}` (16-bit signed or unsigned) from `pointer(s)`.
function unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end
269+
111270
# Flatten an array of UInt16 to one dimension and convert the result.
function convert(T::Type{UTF16String}, data::AbstractArray{UInt16})
    flat = reshape(data, length(data))
    return convert(T, flat)
end
113272

114273
# Reinterpret Int16 data as UInt16 and convert the result.
function convert(T::Type{UTF16String}, data::AbstractArray{Int16})
    unsigned_units = reinterpret(UInt16, data)
    return convert(T, unsigned_units)
end
116275

276+
# Validate `data` with `isvalid`, then copy it into a new buffer one element
# longer, with a 0 stored in the final slot.
# Throws `UnicodeError` (UTF_ERR_INVALID_16) when validation fails.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
    if !isvalid(UTF16String, data)
        throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
    end
    len = length(data)
    buf = Vector{UInt16}(len+1)
    @inbounds begin
        copy!(buf, 1, data, 1, len)
        buf[len+1] = 0
        return UTF16String(buf)
    end
end
281+
117282
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
118283
isempty(bytes) && return UTF16String(UInt16[0])
119284
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
@@ -136,6 +301,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
136301
UTF16String(d)
137302
end
138303

304+
# Identity conversion: a UTF16String is returned as-is
# NOTE(review): an identical method definition appears earlier in this file; one of the two is redundant
convert(::Type{UTF16String}, str::UTF16String) = str
305+
306+
# Generic entry point: defers to `convert(UTF16String, x)`
utf16(x) = convert(UTF16String, x)
139307
# Pointer plus length: wraps `p` via `pointer_to_array(p, len)` and passes the array to `utf16`
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
140308
# `Int16` pointer variant: converts the pointer to `Ptr{UInt16}` and defers to that method
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
141309
function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})

0 commit comments

Comments
 (0)