Skip to content

Commit 040b647

Browse files
committed
Add check_string function that is more generic, thanks to Encodings
1 parent a420983 commit 040b647

File tree

3 files changed

+303
-3
lines changed

3 files changed

+303
-3
lines changed

src/CheckStrings.jl

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
4+
# and also to return information necessary to convert to other encodings
5+
6+
module CheckStrings
7+
8+
using Encodings
9+
10+
export check_string
11+
export is_surrogate_lead, is_surrogate_trail, is_surrogate_codeunit, is_valid_continuation
12+
13+
using Base.UTF_ERR_SHORT, Base.UTF_ERR_CONT,Base.UTF_ERR_LONG,
14+
Base.UTF_ERR_NOT_LEAD, Base.UTF_ERR_NOT_TRAIL,
15+
Base.UTF_ERR_SURROGATE, Base.UTF_ERR_MISSING_SURROGATE, Base.UTF_ERR_INVALID
16+
17+
# True when `c` is a UTF-16 leading (high) surrogate, i.e. lies in 0xd800-0xdbff.
# The low 10 bits are shifted away rather than masked with a 32-bit constant, so
# the comparison stays exact for unsigned types wider than 32 bits.
is_surrogate_lead(c::Unsigned) = ((c >> 10) == 0x36)
18+
# True when `c` is a UTF-16 trailing (low) surrogate, i.e. lies in 0xdc00-0xdfff.
# The low 10 bits are shifted away rather than masked with a 32-bit constant, so
# the comparison stays exact for unsigned types wider than 32 bits.
is_surrogate_trail(c::Unsigned) = ((c >> 10) == 0x37)
19+
# True when `c` is any UTF-16 surrogate code unit, i.e. lies in 0xd800-0xdfff.
# The low 11 bits are shifted away rather than masked with a 32-bit constant, so
# the comparison stays exact for unsigned types wider than 32 bits.
is_surrogate_codeunit(c::Unsigned) = ((c >> 11) == 0x1b)
20+
# True when bits 7 and 6 of `c` are `10`, the marker of a UTF-8 continuation byte.
# Only those two bits are inspected; every other bit of `c` is ignored.
function is_valid_continuation(c)
    return 0x80 == (c & 0xc0)
end
21+
22+
## Bit flags combined (with `|`) into the `flags` value returned by check_string

const UTF_LONG      = 1 << 0    ##< Long encodings are present
const UTF_LATIN1    = 1 << 1    ##< characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2    ##< characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3    ##< characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4    ##< non-BMP characters present
const UTF_SURROGATE = 1 << 5    ##< surrogate pairs present
30+
31+
## Fold one UTF-8 continuation byte into a partially decoded character.
#
# `ch`  - bits of the character accumulated so far
# `byt` - the byte that is expected to be a continuation byte
# `pos` - position handed to `UnicodeError` when `byt` is rejected
#
# Returns `ch` shifted left by six bits with the low six bits of `byt` appended.
# Throws `UnicodeError(UTF_ERR_CONT, pos, byt)` if `byt` is not a continuation byte.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    if !is_valid_continuation(byt)
        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
    end
    return (ch << 6) | (byt & 0x3f)
end
36+
37+
# Element types that can serve as code units
const CodeUnitType = Union(UInt8, UInt16, UInt32, Char)

# Containers of code units, one alias per element type
const CodeUnitC  = AbstractArray{Char}
const CodeUnit8  = AbstractArray{UInt8}
const CodeUnit16 = AbstractArray{UInt16}
const CodeUnit32 = AbstractArray{UInt32}

# Everything check_string accepts as its data argument
const CodeUnit = Union(CodeUnit8, CodeUnit16, CodeUnit32, CodeUnitC, AbstractString)
43+
44+
"
45+
Validates and calculates number of characters in UTF-8, UTF-16, or UTF-32 encoded data
46+
47+
### Input Arguments:
48+
* `::Type{Encoding}`
49+
* `dat::CodeUnit` Vector of `UInt8`, `UInt16`, `UInt32` or `Char`, or an `AbstractString`
50+
51+
### Optional Input Arguments:
52+
* `endpos` end position (defaults to `endof(dat)`)
53+
* `pos` start position (defaults to `start(dat)`)
54+
55+
### Keyword Arguments:
56+
* `accept_long_null` = `true` # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
57+
* `accept_surrogates` = `true` # `CESU-8`
58+
* `accept_long_char` = `false` # Accept arbitrary long encodings
59+
60+
### Returns:
61+
* (total characters, flags, 4-byte, 3-byte, 2-byte)
62+
63+
### Throws:
64+
* `UnicodeError`
65+
"
66+
# Walks `dat` from `pos` through `endpos` with `next`, validating each character
# for the encoding `E` and counting how many characters fall into each size class.
#
# Returns the tuple (totalchar, flags, num4byte, num3byte, num2byte), where `flags`
# is an OR of the UTF_* flag constants.  Throws `UnicodeError` on invalid input.
#
# Keyword arguments:
#   accept_long_null  - accept the overlong 2-byte encoding of 0 (0xc0 0x80)
#   accept_surrogates - accept surrogate pairs in UTF-8 (CESU-8) and UTF-32 data
#   accept_long_char  - accept any overlong UTF-8 encoding
function check_string{T <: CodeUnit, E <: Union(UTF8, UTF16, UTF32)}(
        ::Type{E},
        dat::T,
        endpos = endof(dat),
        pos = start(dat);
        accept_long_null = true,
        accept_surrogates = true,
        accept_long_char = false)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos <= endpos
        ch, pos = next(dat, pos)
        totalchar += 1
        if ch > 0x7f
            if E <: UTF8
                # Check UTF-8 encoding
                if ch < 0xc0
                    # 0x80-0xbf can only continue a sequence, never start one;
                    # reported the same way as the other invalid lead bytes below
                    throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
                elseif ch < 0xe0
                    # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                    (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                    byt, pos = next(dat, pos)
                    ch = get_continuation(ch & 0x1f, byt, pos)
                    if ch > 0x7f
                        num2byte += 1
                        flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                    elseif accept_long_char
                        flags |= UTF_LONG
                    elseif (ch == 0) && accept_long_null
                        flags |= UTF_LONG
                    else
                        throw(UnicodeError(UTF_ERR_LONG, pos, ch))
                    end
                elseif ch < 0xf0
                    # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                    (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                    byt, pos = next(dat, pos)
                    ch = get_continuation(ch & 0x0f, byt, pos)
                    byt, pos = next(dat, pos)
                    ch = get_continuation(ch, byt, pos)
                    # check for surrogate pairs, make sure correct
                    if is_surrogate_codeunit(ch)
                        !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                        # next character *must* be a trailing surrogate character
                        (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                        byt, pos = next(dat, pos)
                        (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                        byt, pos = next(dat, pos)
                        surr = get_continuation(0x0000d, byt, pos)
                        byt, pos = next(dat, pos)
                        surr = get_continuation(surr, byt, pos)
                        !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
                        !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                        flags |= UTF_SURROGATE
                        num4byte += 1
                    elseif ch > 0x07ff
                        num3byte += 1
                    elseif accept_long_char
                        # Overlong encoding: count it as a 2-byte character only
                        # when the decoded value is above the ASCII range, as the
                        # 2-byte and 4-byte branches do
                        flags |= UTF_LONG
                        ch > 0x7f && (num2byte += 1)
                    else
                        throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                    end
                elseif ch < 0xf5
                    # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                    byt, pos = next(dat, pos)
                    ch = get_continuation(ch & 0x07, byt, pos)
                    byt, pos = next(dat, pos)
                    ch = get_continuation(ch, byt, pos)
                    byt, pos = next(dat, pos)
                    ch = get_continuation(ch, byt, pos)
                    if ch > 0x10ffff
                        throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
                    elseif ch > 0xffff
                        num4byte += 1
                    elseif is_surrogate_codeunit(ch)
                        throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
                    elseif accept_long_char
                        # This is an overly long encoded character
                        flags |= UTF_LONG
                        if ch > 0x7ff
                            num3byte += 1
                        elseif ch > 0x7f
                            num2byte += 1
                        end
                    else
                        throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                    end
                else
                    throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
                end
            # Handle UTF16 and UTF32 Encodings
            elseif ch < 0x100
                num2byte += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                num2byte += 1
                flags |= UTF_UNICODE2
            elseif ch > 0x0ffff
                (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
                num4byte += 1
            elseif !is_surrogate_codeunit(ch)
                num3byte += 1
            elseif is_surrogate_lead(ch)
                pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
                # next character *must* be a trailing surrogate character
                ch, pos = next(dat, pos)
                !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
                num4byte += 1
                # Surrogate pairs are only optional (and flagged) outside UTF-16
                if !(E <: UTF16)
                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
                    flags |= UTF_SURROGATE
                end
            else
                throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
189+
190+
# Convenience methods that choose the encoding from the code unit type of `dat`:
# `UInt8` data is checked as UTF8, `UInt16` data as UTF16, and `UInt32` data,
# `Char` data and strings as UTF32.  The optional `endpos`/`pos` positions and all
# keyword arguments are handed on unchanged to the method that takes the encoding.
check_string(dat::CodeUnit8, endpos = endof(dat), pos = start(dat); kwargs...) =
    check_string(UTF8, dat, endpos, pos; kwargs...)
check_string(dat::CodeUnit16, endpos = endof(dat), pos = start(dat); kwargs...) =
    check_string(UTF16, dat, endpos, pos; kwargs...)
check_string(dat::Union(CodeUnit32, CodeUnitC, AbstractString),
             endpos = endof(dat), pos = start(dat); kwargs...) =
    check_string(UTF32, dat, endpos, pos; kwargs...)
196+
end

src/Strings.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module Strings
22

3-
using Compat, Mmap, Encodings
3+
using Compat, Mmap, Encodings, CheckStrings
44

55
immutable String{T<:Encoding}
66
ptr::Ptr{UInt8}

test/runtests.jl

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
reload("Strings")
1+
#reload("Strings")
22
using Base.Test
3+
using Encodings
34

45
# write your own tests here
56
s = Strings.String("hey there")
@@ -30,4 +31,107 @@ s = Strings.String("")
3031
space = Strings.String(" ")
3132
@time for i = 1:1000
3233
s = Strings.string(s,space)
33-
end
34+
end
35+
36+
# This is here, unless check_string actually gets merged in to Base
csmod = CheckStrings # (or Base)

# Test invalid sequences
byt = 0x0
# Continuation byte not after lead
for byt in 0x80:0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt])
end

# Test lead bytes
for byt in 0xc0:0xff
    # Single lead byte at end of string
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt])
    # Lead followed by non-continuation character < 0x80
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0])
    # Lead followed by non-continuation character > 0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0xc0])
end

# Test overlong 2-byte
for byt in 0x81:0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xc0,byt])
end
for byt in 0x80:0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xc1,byt])
end

# Test overlong 3-byte
for byt in 0x80:0x9f
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xe0,byt,0x80])
end

# Test overlong 4-byte
# (the lead byte must be 0xf0: 0xef starts a 3-byte sequence, not a 4-byte one)
for byt in 0x80:0x8f
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xf0,byt,0x80,0x80])
end

# Test 4-byte > 0x10ffff
for byt in 0x90:0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xf4,byt,0x80,0x80])
end
for byt in 0xf5:0xf7
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0x80,0x80])
end

# Test 5-byte
for byt in 0xf8:0xfb
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0x80,0x80,0x80])
end

# Test 6-byte
for byt in 0xfc:0xfd
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0x80,0x80,0x80,0x80])
end

# Test 7-byte
@test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])

# Three and above byte sequences
for byt in 0xe0:0xef
    # Lead followed by only 1 continuation byte
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80])
    # Lead ended by non-continuation character < 0x80
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0])
    # Lead ended by non-continuation character > 0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0xc0])
end

# 3-byte encoded surrogate character(s)
# Single surrogate
@test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xed,0xa0,0x80])
# Not followed by surrogate
@test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])
# Trailing surrogate first
@test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])
# Followed by lead surrogate
@test_throws UnicodeError csmod.check_string(UTF8, UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])

# Four byte sequences
for byt in 0xf0:0xf4
    # Lead followed by only 2 continuation bytes
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0x80])
    # Lead followed by non-continuation character < 0x80
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0x80,0])
    # Lead followed by non-continuation character > 0xbf
    @test_throws UnicodeError csmod.check_string(UTF8, UInt8[byt,0x80,0x80,0xc0])
end

# Surrogates
@test_throws UnicodeError csmod.check_string(UTF16, UInt16[0xd800])
@test_throws UnicodeError csmod.check_string(UTF16, UInt16[0xdc00])
@test_throws UnicodeError csmod.check_string(UTF16, UInt16[0xdc00,0xd800])

# Surrogates in UTF-32
@test_throws UnicodeError csmod.check_string(UTF32, UInt32[0xd800])
@test_throws UnicodeError csmod.check_string(UTF32, UInt32[0xdc00])
@test_throws UnicodeError csmod.check_string(UTF32, UInt32[0xdc00,0xd800])

# Characters > 0x10ffff
@test_throws UnicodeError csmod.check_string(UTF32, UInt32[0x110000])

0 commit comments

Comments
 (0)