Skip to content

Commit f1c1583

Browse files
committed
Add Unicode validation function
Add unit tests of all the errors found by check_string
1 parent d6736eb commit f1c1583

File tree

3 files changed

+273
-0
lines changed

3 files changed

+273
-0
lines changed

base/sysimg.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ include("osutils.jl")
8686
# strings & printing
8787
include("utferror.jl")
8888
include("utftypes.jl")
89+
include("utfcheck.jl")
8990
include("char.jl")
9091
include("ascii.jl")
9192
include("utf8.jl")

base/utfcheck.jl

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
4+
# and also to return information necessary to convert to other encodings
5+
6+
# True if c is a UTF-16 lead (high) surrogate, i.e. in the range 0xd800-0xdbff.
# A shift is used instead of a fixed-width mask so that the high bits of wide
# unsigned types (UInt64, UInt128) also take part in the comparison.
is_surrogate_lead(c::Unsigned) = ((c >> 10) == 0x36)
7+
# True if c is a UTF-16 trail (low) surrogate, i.e. in the range 0xdc00-0xdfff.
# A shift is used instead of a fixed-width mask so that the high bits of wide
# unsigned types (UInt64, UInt128) also take part in the comparison.
is_surrogate_trail(c::Unsigned) = ((c >> 10) == 0x37)
8+
# True if c is any surrogate code unit (lead or trail), i.e. in the range 0xd800-0xdfff.
# A shift is used instead of a fixed-width mask so that the high bits of wide
# unsigned types (UInt64, UInt128) also take part in the comparison.
is_surrogate_codeunit(c::Unsigned) = ((c >> 11) == 0x1b)
9+
# True if the two bits selected by the mask 0xc0 are 10, the marker of a
# UTF-8 continuation byte (0x80-0xbf).
function is_valid_continuation(c)
    marker = c & 0xc0
    return marker == 0x80
end
10+
11+
## Options for the check_string functions (combine with | and pass as the `options` keyword)
12+
13+
const UTF_NO_LONG_NULL = 1 ##< don't accept 0xc0 0x80 for '\0'
14+
const UTF_NO_SURROGATES = 2 ##< don't accept surrogate pairs in UTF-8/UTF-32
15+
const UTF_ACCEPT_LONG = 4 ##< accept long encodings (other than long null in UTF-8)
16+
17+
## Result flags, or'ed together into the `flags` value returned by check_string
const UTF_LONG = 1 ##< Long encodings are present
18+
const UTF_LATIN1 = 2 ##< characters in range 0x80-0xFF present
19+
const UTF_UNICODE2 = 4 ##< characters in range 0x100-0x7ff present
20+
const UTF_UNICODE3 = 8 ##< characters in range 0x800-0xd7ff, 0xe000-0xffff present
21+
const UTF_UNICODE4 = 16 ##< non-BMP characters present
22+
const UTF_SURROGATE = 32 ##< surrogate pairs present
23+
24+
## Fold one UTF-8 continuation byte into the character value being assembled
#
# \param[in] ch  bits of the character decoded so far
# \param[in] byt byte that is expected to be a continuation byte
# \param[in] pos position of byt, only used for the error report
#
# \return ch shifted left by 6 bits with the low 6 bits of byt appended
# \throws UnicodeError (UTF_ERR_CONT) if byt is not a valid continuation byte
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    if !is_valid_continuation(byt)
        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
    end
    return (ch << 6) | (byt & 0x3f)
end
29+
30+
##\brief Validates and calculates number of characters in a UTF-8 encoded vector of UInt8
#
# \param[in] dat     Vector of UInt8 holding the UTF-8 encoded data
# \param[in] len     optional index of the last byte to check (default sizeof(dat))
# \param[in] pos     optional offset of the first byte to check, i.e. checking starts
#                    at dat[pos+1] (default 0)
# \param[in] options keyword parameter, flags to determine error handling (default 0)
#
# \return (total characters, flags, 4-byte, 3-byte, 2-byte)
# \throws UnicodeError
#
# The scan runs under @inbounds, so the caller must make sure that pos >= 0 and
# len <= length(dat).
# A pair of 3-byte encoded surrogates is counted as one character and as one
# 4-byte sequence.

function check_string(dat::Vector{UInt8}, len = sizeof(dat), pos = 0 ; options::Integer=0)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos < len
        ch = dat[pos += 1]
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes, they can never start a sequence
                throw(UnicodeError(UTF_ERR_CONT, pos, ch))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                # a 2-byte lead (110xxxxx) carries 5 payload bits
                ch = get_continuation(ch & 0x1f, dat[pos += 1], pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # 0xc0 0x80 (long encoding of '\0') is accepted unless disabled
                    flags |= UTF_LONG
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                    byt = dat[pos += 1]
                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                    # 0xd is the payload of the 0xed lead byte that was just consumed
                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
                    surr = get_continuation(surr, dat[pos += 1], pos)
                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
                    (options & UTF_NO_SURROGATES) != 0 && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                    flags |= UTF_SURROGATE
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    # characters <= 0x7f need a single byte, so they are not counted
                    # (same rule as in the 4-byte branch below)
                    if ch > 0x7f
                        num2byte += 1
                    end
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # pos is now on the last byte, pos-3 is the lead byte of the sequence
                if ch > 0x10ffff
                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-3, ch))
                end
            else
                # 0xf5-0xff can never appear in UTF-8
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
121+
122+
##\brief Validates and calculates number of characters in a UTF-16 or UTF-32 encoded vector/string
#
# \param[in] dat     Union(Vector{UInt16}, Vector{UInt32}, AbstractString) to check
# \param[in] len     optional last index to check (default endof(dat))
# \param[in] pos     optional index to start checking at (default start(dat))
# \param[in] options keyword parameter, flags to determine error handling (default 0)
#
# \return (total characters, flags, 4-byte, 3-byte, 2-byte)
# \throws UnicodeError
#
# The scan runs under @inbounds, so the caller must make sure that pos and len
# are valid indices of dat.
# A surrogate pair is counted as one character and as one 4-byte sequence.
# Error positions refer to the offending code unit/character, not to the index
# that follows it.

function check_string{T <: Union(Vector{UInt16}, Vector{UInt32}, AbstractString)}(
                      dat::T,
                      len = endof(dat),
                      pos = start(dat)
                      ; options::Integer = 0)
    local ch::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos <= len
        # next() moves pos past the element, remember where the element itself is
        chpos = pos
        ch, pos = next(dat, pos)
        totalchar += 1
        if ch > 0x7f
            if ch < 0x100
                num2byte += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                num2byte += 1
                flags |= UTF_UNICODE2
            elseif T != Vector{UInt16} && ch > 0x0ffff
                (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, chpos, ch))
                num4byte += 1
            elseif !is_surrogate_codeunit(ch)
                num3byte += 1
            elseif is_surrogate_lead(ch)
                pos > len && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, chpos, ch))
                # next character *must* be a trailing surrogate character
                chpos = pos
                ch, pos = next(dat, pos)
                !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, chpos, ch))
                num4byte += 1
                # surrogate pairs are the normal encoding in UTF-16, they are only
                # flagged (or rejected) for the other input types
                if T != Vector{UInt16}
                    (options & UTF_NO_SURROGATES) != 0 && throw(UnicodeError(UTF_ERR_SURROGATE, chpos, ch))
                    flags |= UTF_SURROGATE
                end
            else
                # trailing surrogate without a preceding lead surrogate
                throw(UnicodeError(UTF_ERR_NOT_LEAD, chpos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end

test/strings.jl

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1719,3 +1719,102 @@ d = UTF32String(c)
17191719
c[1] = 'A'
17201720
@test d=="A"
17211721

1722+
# Test invalid sequences
# Every input below is malformed, so Base.check_string has to throw a UnicodeError

byt = 0x0
# Continuation byte not after lead
for byt in 0x80:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt])
end

# Test lead bytes
for byt in 0xc0:0xff
    # Single lead byte at end of string
    @test_throws UnicodeError Base.check_string(UInt8[byt])
    # Lead followed by non-continuation character < 0x80
    @test_throws UnicodeError Base.check_string(UInt8[byt,0])
    # Lead followed by non-continuation character > 0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt,0xc0])
end

# Test overlong 2-byte
# (0xc0 0x80, the long encoding of '\0', is accepted by default, so start at 0x81)
for byt in 0x81:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[0xc0,byt])
end
for byt in 0x80:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[0xc1,byt])
end

# Test overlong 3-byte
for byt in 0x80:0x9f
    @test_throws UnicodeError Base.check_string(UInt8[0xe0,byt,0x80])
end

# Test overlong 4-byte
# The lead byte has to be 0xf0: 0xef starts a 3-byte sequence, so with it the
# overlong check of the 4-byte branch would never be reached
for byt in 0x80:0x8f
    @test_throws UnicodeError Base.check_string(UInt8[0xf0,byt,0x80,0x80])
end

# Test 4-byte > 0x10ffff
for byt in 0x90:0xbf
    @test_throws UnicodeError Base.check_string(UInt8[0xf4,byt,0x80,0x80])
end
for byt in 0xf5:0xf7
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80])
end

# Test 5-byte
for byt in 0xf8:0xfb
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80,0x80])
end

# Test 6-byte
for byt in 0xfc:0xfd
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0x80,0x80,0x80])
end

# Test 7-byte
@test_throws UnicodeError Base.check_string(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])

# Three and above byte sequences
for byt in 0xe0:0xef
    # Lead followed by only 1 continuation byte
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80])
    # Lead ended by non-continuation character < 0x80
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0])
    # Lead ended by non-continuation character > 0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0xc0])
end

# 3-byte encoded surrogate character(s)
# Single surrogate
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80])
# Not followed by surrogate
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])
# Trailing surrogate first
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])
# Followed by lead surrogate
@test_throws UnicodeError Base.check_string(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])

# Four byte sequences
for byt in 0xf0:0xf4
    # Lead followed by only 2 continuation bytes
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80])
    # Lead ended by non-continuation character < 0x80
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0])
    # Lead ended by non-continuation character > 0xbf
    @test_throws UnicodeError Base.check_string(UInt8[byt,0x80,0x80,0xc0])
end

# Surrogates
@test_throws UnicodeError Base.check_string(UInt16[0xd800])
@test_throws UnicodeError Base.check_string(UInt16[0xdc00])
@test_throws UnicodeError Base.check_string(UInt16[0xdc00,0xd800])

# Surrogates in UTF-32
@test_throws UnicodeError Base.check_string(UInt32[0xd800])
@test_throws UnicodeError Base.check_string(UInt32[0xdc00])
@test_throws UnicodeError Base.check_string(UInt32[0xdc00,0xd800])

# Characters > 0x10ffff
@test_throws UnicodeError Base.check_string(UInt32[0x110000])

0 commit comments

Comments
 (0)