Skip to content

Commit 8fb4716

Browse files
committed
Fix JuliaLang#10959 UTF-32 conversion errors
Added new `convert` methods that use the `check_string` function to validate input. Added tests for many sorts of valid/invalid data. Depends on PR JuliaLang#11551 and JuliaLang#11575.
1 parent 70bf53c commit 8fb4716

File tree

2 files changed

+277
-16
lines changed

2 files changed

+277
-16
lines changed

base/utf32.jl

Lines changed: 252 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,264 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
55
endof(s::UTF32String) = length(s.data) - 1
66
length(s::UTF32String) = length(s.data) - 1
77

8+
const empty_utf32 = UTF32String(UInt32[0])
9+
810
utf32(x) = convert(UTF32String, x)
911
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
1012
convert(::Type{UTF32String}, s::UTF32String) = s
1113

12-
function convert(::Type{UTF32String}, s::AbstractString)
13-
a = Array(Char, length(s) + 1)
14-
i = 0
15-
for c in s
16-
a[i += 1] = c
14+
function convert(::Type{UTF32String}, str::AbstractString)
    "
    Converts an AbstractString to a UTF32String

    ### Input Arguments:
    * ::Type{UTF32String}
    * str::AbstractString

    ### Returns:
    * ::UTF32String

    ### Throws:
    * UnicodeError
    "
    # check_string validates the input and reports the character count first;
    # the remaining values it returns are not needed here
    len = check_string(str)[1]
    # one extra slot for the trailing NULL
    buf = Vector{Char}(len+1)
    out = 0
    @inbounds for ch in str
        buf[out += 1] = ch
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    return UTF32String(buf)
end
35+
36+
function convert(::Type{UTF8String}, dat::Vector{UInt32})
    "
    Converts a UTF-32 encoded vector of UInt32 to a UTF8String

    ### Input Arguments:
    * ::Type{UTF8String}
    * dat::Vector{UInt32}

    ### Returns:
    * ::UTF8String

    ### Throws:
    * UnicodeError
    "
    nbytes = sizeof(dat)
    # nothing to convert: hand back the shared empty string
    nbytes == 0 && return empty_utf8
    # validate every 32-bit unit and collect the per-width character counts
    nchars, flags, num4byte, num3byte, num2byte = check_string(dat, nbytes >>> 2)
    if flags == 0
        # no flags raised by check_string: each unit is copied into one byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, dat, 1, nchars))
    end
    # otherwise size the output by adding the extra bytes of the wider characters
    return encode_to_utf8(UInt32, dat, nchars + num2byte + num3byte*2 + num4byte*3)
end
58+
59+
function convert(::Type{UTF8String}, str::UTF32String)
    "
    Converts a UTF32String to a UTF8String

    ### Input Arguments:
    * ::Type{UTF8String}
    * str::UTF32String

    ### Returns:
    * ::UTF8String

    ### Throws:
    * UnicodeError
    "
    units = reinterpret(UInt32, str.data)
    nunits = sizeof(units) >>> 2
    # at most one unit stored: treated as the empty string
    nunits <= 1 && return empty_utf8
    # validate all units except the last one and collect the per-width counts
    nchars, flags, num4byte, num3byte, num2byte = check_string(units, nunits - 1)
    if flags == 0
        # no flags raised by check_string: each unit is copied into one byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, units, 1, nchars))
    end
    # otherwise size the output by adding the extra bytes of the wider characters
    return encode_to_utf8(UInt32, units, nchars + num2byte + num3byte*2 + num4byte*3)
end
82+
83+
function convert(::Type{UTF32String}, str::UTF8String)
    "
    Converts a UTF8String to a UTF32String

    ### Input Arguments:
    * ::Type{UTF32String}
    * str::UTF8String

    ### Returns:
    * ::UTF32String

    ### Throws:
    * UnicodeError
    "
    bytes = str.data
    # no bytes at all: hand back the shared empty string
    sizeof(bytes) == 0 && return empty_utf32
    # validate the UTF-8 data and learn how many characters it holds
    nchars, flags = check_string(bytes)
    total = nchars + 1
    # fast path when check_string raised no flags
    flags == 0 && return fast_utf_copy(UTF32String, Char, total, bytes)
    # slow path: decode multi-byte sequences one character at a time
    chars = Vector{Char}(total)
    @inbounds chars[total] = 0 # NULL termination
    local ch::UInt32, surr::UInt32
    written = 0
    idx = 0
    @inbounds while written < nchars
        ch = bytes[idx += 1]
        # lead bytes <= 0x7f are ASCII and are stored unchanged
        if ch > 0x7f
            if ch < 0xe0
                # two-byte sequence (0x80-0x7ff)
                ch = ((ch & 0x1f) << 6) | (bytes[idx += 1] & 0x3f)
            elseif ch < 0xf0
                # three-byte sequence (0x800-0xffff)
                idx += 2
                ch = get_utf8_3byte(bytes, idx, ch)
                # a lead surrogate here is followed by its trailing surrogate,
                # itself encoded in the next three bytes
                if is_surrogate_lead(ch)
                    idx += 3
                    surr = ((UInt32(bytes[idx-2] & 0xf) << 12)
                            | (UInt32(bytes[idx-1] & 0x3f) << 6)
                            | (bytes[idx] & 0x3f))
                    ch = get_supplementary(ch, surr)
                end
            else
                # four-byte sequence (0x10000-0x10ffff)
                idx += 3
                ch = get_utf8_4byte(bytes, idx, ch)
            end
        end
        chars[written += 1] = ch
    end
    UTF32String(chars)
end
21141

142+
function convert(::Type{UTF32String}, str::UTF16String)
    "
    Converts a UTF16String to UTF32String

    ### Input Arguments:
    * ::Type{UTF32String}
    * str::UTF16String

    ### Returns:
    * ::UTF32String

    ### Throws:
    * UnicodeError
    "
    words = str.data
    nbytes = sizeof(words)
    # a lone trailing \0 word (or no data) means the string is empty
    nbytes <= 2 && return empty_utf32
    # validate every 16-bit word and learn how many Chars to produce
    nchars, flags = check_string(words, nbytes >>> 1)
    if (flags & UTF_UNICODE4) == 0
        # no surrogate pairs flagged: every word widens to exactly one Char
        @inbounds return UTF32String(copy!(Vector{Char}(nchars), words))
    end
    local ch::UInt32
    chars = Vector{Char}(nchars)
    written = 0
    idx = 0
    @inbounds while written < nchars
        ch = words[idx += 1]
        # a lead surrogate consumes the following word as well
        if is_surrogate_lead(ch)
            ch = get_supplementary(ch, words[idx += 1])
        end
        chars[written += 1] = ch
    end
    UTF32String(chars)
end
176+
177+
function convert(::Type{UTF16String}, dat::Vector{UInt32})
    "
    Converts a UTF-32 encoded vector of UInt32 to a UTF16String

    ### Input Arguments:
    * ::Type{UTF16String}
    * dat::Vector{UInt32}

    ### Returns:
    * ::UTF16String

    ### Throws:
    * UnicodeError
    "
    len = sizeof(dat)
    # handle zero length input quickly. Every element of dat is validated below
    # and a slot for the trailing \0 is added afterwards, so a one-element
    # vector (sizeof == 4) is a real one-character string, not an empty one.
    len == 0 && return empty_utf16
    # get number of words to allocate
    len, flags, num4byte = check_string(dat, len>>>2)
    # each 4-byte character needs one extra word, plus one word for the \0
    len += num4byte + 1
    # optimized path, no surrogates
    num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
    return encode_to_utf16(dat, len)
end
201+
202+
function convert(::Type{UTF16String}, str::UTF32String)
    "
    Converts a UTF32String to UTF16String

    ### Input Arguments:
    * ::Type{UTF16String}
    * str::UTF32String

    ### Returns:
    * ::UTF16String

    ### Throws:
    * UnicodeError
    "
    units = reinterpret(UInt32, str.data)
    nbytes = sizeof(units)
    # at most one 32-bit unit stored: treated as the empty string
    nbytes <= 4 && return empty_utf16
    # validate every unit; num4byte counts characters needing a surrogate pair
    nwords, flags, num4byte = check_string(units, nbytes >>> 2)
    if num4byte != 0
        # each such character takes one additional 16-bit word
        return encode_to_utf16(units, nwords + num4byte)
    end
    # no surrogates needed: narrow each unit straight into a 16-bit word
    @inbounds return UTF16String(copy!(Vector{UInt16}(nwords), units))
end
226+
227+
function encode_to_utf16(dat, len)
    "
    Converts an already validated UTF-32 encoded vector of UInt32 to a UTF16String

    ### Input Arguments:
    * dat::Vector{UInt32} UTF-32 encoded data
    * len                 length of output in 16-bit words (including the trailing NULL)

    ### Returns:
    * ::UTF16String
    "
    buf = Vector{UInt16}(len)
    @inbounds buf[len] = 0 # NULL termination
    out = 0
    pos = 0
    # Stop one word short of len: the last word is the terminator written above.
    # Running up to len would read one element past the end of an unterminated
    # dat (under @inbounds) and overwrite the terminator.
    @inbounds while out < len - 1
        ch = UInt32(dat[pos += 1])
        if ch > 0xffff
            # Output surrogate pair for 0x10000-0x10ffff
            # 0xd7c0 == 0xd800 - (0x10000 >>> 10), folding in the 0x10000 offset
            buf[out += 1] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        buf[out += 1] = ch
    end
    return UTF16String(buf)
end
253+
254+
# A Vector{Char} is reinterpreted as UInt32 and handed to the Vector{UInt32}
# conversion methods.
convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat))

convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat))

# convert(::Type{UTF32String}, c::Char) is already defined near the top of this
# file with an identical body, so it is not repeated here.
258+
259+
# Convert an ASCIIString to a UTF32String by copying its bytes; the length
# passed to fast_utf_copy is one more than the number of bytes.
convert(::Type{UTF32String}, str::ASCIIString) =
    fast_utf_copy(UTF32String, Char, length(str.data)+1, str.data)
263+
264+
# Build a UTF32String from a vector of Chars via fast_utf_copy.
# The trailing `true` is fast_utf_copy's optional flag; presumably it requests
# room for the terminator on top of length(dat) -- TODO confirm.
# NOTE(review): a later definition in this file has the same
# (::Type{UTF32String}, ::AbstractVector{Char}) signature and would replace
# this method -- confirm which one is intended to remain.
function convert(::Type{UTF32String}, dat::AbstractVector{Char})
    return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
end
265+
22266
function convert(::Type{UTF32String}, data::AbstractVector{Char})
23267
len = length(data)
24268
d = Array(Char, len + 1)
@@ -51,7 +295,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
51295
convert(Ptr{T}, pointer(s))
52296

53297
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
54-
isempty(bytes) && return UTF32String(Char[0])
298+
isempty(bytes) && return empty_utf32
55299
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
56300
data = reinterpret(Char, bytes)
57301
# check for byte-order mark (BOM):

test/strings.jl

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1820,12 +1820,16 @@ byt = 0x0
18201820
@test_throws UnicodeError Base.check_string(UInt32[0x110000])
18211821

18221822
# issue #11551 (#11004,#10959)
1823-
function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String)
1823+
# Check that the same text, supplied in all three encodings, converts to the
# expected string in each of the other two encodings.
function tstcvt(strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
    # starting from UTF-8
    @test utf16(strUTF8) == strUTF16
    @test utf32(strUTF8) == strUTF32

    # starting from UTF-16
    @test utf8(strUTF16) == strUTF8
    @test utf32(strUTF16) == strUTF32

    # starting from UTF-32
    @test utf8(strUTF32) == strUTF8
    @test utf16(strUTF32) == strUTF16
end
18271831

1828-
# Create some ASCII, UTF8 and UTF16
1832+
# Create some ASCII, UTF8, UTF16, and UTF32 strings
18291833
strAscii = "abcdefgh"
18301834
strA_UTF8 = ("abcdefgh\uff")[1:8]
18311835
strL_UTF8 = "abcdef\uff\uff"
@@ -1844,27 +1848,40 @@ str3_UTF16 = utf16(str3_UTF8)
18441848
str4_UTF16 = utf16(str4_UTF8)
18451849
strS_UTF16 = utf16(strS_UTF8)
18461850

1851+
strA_UTF32 = utf32(strA_UTF8)
1852+
strL_UTF32 = utf32(strL_UTF8)
1853+
str2_UTF32 = utf32(str2_UTF8)
1854+
str3_UTF32 = utf32(str3_UTF8)
1855+
str4_UTF32 = utf32(str4_UTF8)
1856+
strS_UTF32 = utf32(strS_UTF8)
1857+
18471858
@test utf8(strAscii) == strAscii
18481859
@test utf16(strAscii) == strAscii
1860+
@test utf32(strAscii) == strAscii
18491861

1850-
tstcvt(strA_UTF8,strA_UTF16)
1851-
tstcvt(strL_UTF8,strL_UTF16)
1852-
tstcvt(str2_UTF8,str2_UTF16)
1853-
tstcvt(str3_UTF8,str3_UTF16)
1854-
tstcvt(str4_UTF8,str4_UTF16)
1862+
tstcvt(strA_UTF8,strA_UTF16,strA_UTF32)
1863+
tstcvt(strL_UTF8,strL_UTF16,strL_UTF32)
1864+
tstcvt(str2_UTF8,str2_UTF16,str2_UTF32)
1865+
tstcvt(str3_UTF8,str3_UTF16,str3_UTF32)
1866+
tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
18551867

18561868
# Test converting surrogate pairs
18571869
@test utf16(strS_UTF8) == strC_UTF8
1870+
@test utf32(strS_UTF8) == strC_UTF8
18581871
@test utf8(strS_UTF16) == strC_UTF8
1872+
@test utf32(strS_UTF16) == strC_UTF8
1873+
@test utf8(strS_UTF32) == strC_UTF8
1874+
@test utf16(strS_UTF32) == strC_UTF8
18591875

18601876
# Test converting overlong \0
18611877
# @test utf8(strZ_UTF8) == strz_UTF8 # currently broken! (in utf8.jl)
18621878
@test utf16(strZ_UTF8) == strz_UTF8
1879+
@test utf32(strZ_UTF8) == strz_UTF8
18631880

18641881
# Test invalid sequences
18651882

18661883
byt = 0x0
1867-
for T in (UTF16String,) # UTF32String
1884+
for T in (UTF16String, UTF32String)
18681885
try
18691886
# Continuation byte not after lead
18701887
for byt in 0x80:0xbf

0 commit comments

Comments
 (0)