@@ -5,20 +5,264 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
5
5
# Index of the last character. The final element of `data` is the NUL
# terminator and is not counted as part of the string.
function endof(s::UTF32String)
    return length(s.data) - 1
end

# Number of characters, again excluding the trailing NUL terminator.
function length(s::UTF32String)
    return length(s.data) - 1
end

# Shared empty string: its data holds only the NUL terminator.
const empty_utf32 = UTF32String(UInt32[0])

# Convenience wrapper: `utf32(x)` is `convert(UTF32String, x)`.
function utf32(x)
    return convert(UTF32String, x)
end

# A single character becomes a one-character string followed by the NUL terminator.
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end

# Already a UTF32String: return the same object, no copy is made.
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end
11
13
12
- function convert (:: Type{UTF32String} , s:: AbstractString )
13
- a = Array (Char, length (s) + 1 )
14
- i = 0
15
- for c in s
16
- a[i += 1 ] = c
14
"""
    convert(::Type{UTF32String}, str::AbstractString)

Convert an `AbstractString` to a `UTF32String`.

### Input Arguments:
* `::Type{UTF32String}`
* `str::AbstractString`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF32String}, str::AbstractString)
    # Only the character count is needed here; the flags that check_string
    # also returns are not used by this method.
    len = check_string(str)[1]
    # One extra slot for the NUL terminator
    buf = Vector{Char}(len + 1)
    out = 0
    @inbounds for ch in str
        buf[out += 1] = ch
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    return UTF32String(buf)
end
35
+
36
"""
    convert(::Type{UTF8String}, dat::Vector{UInt32})

Convert a UTF-32 encoded vector of `UInt32` to a `UTF8String`.

### Input Arguments:
* `::Type{UTF8String}`
* `dat::Vector{UInt32}`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, dat::Vector{UInt32})
    len = sizeof(dat)
    # handle zero length string quickly
    len == 0 && return empty_utf8
    # get number of bytes to allocate (sizeof is in bytes, so >>>2 gives 32-bit units)
    len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2)
    # flags == 0: each code unit is copied (narrowed) into exactly one output byte
    if flags == 0
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # Otherwise add the extra bytes needed by 2-, 3- and 4-byte sequences
    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end
58
+
59
"""
    convert(::Type{UTF8String}, str::UTF32String)

Convert a `UTF32String` to a `UTF8String`.

### Input Arguments:
* `::Type{UTF8String}`
* `str::UTF32String`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF8String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    # number of 32-bit units, including the trailing NUL terminator
    len = sizeof(dat) >>> 2
    # handle zero length string quickly (only the terminator is present)
    len <= 1 && return empty_utf8
    # get number of bytes to allocate; the terminator is excluded from the check
    len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
    # flags == 0: each code unit is copied (narrowed) into exactly one output byte
    if flags == 0
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # Otherwise add the extra bytes needed by 2-, 3- and 4-byte sequences
    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
end
82
+
83
"""
    convert(::Type{UTF32String}, str::UTF8String)

Convert a `UTF8String` to a `UTF32String`.

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF8String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF32String}, str::UTF8String)
    dat = str.data
    # handle zero length string quickly
    sizeof(dat) == 0 && return empty_utf32
    # Validate UTF-8 encoding, and get number of words to create
    len, flags = check_string(dat)
    # Optimize case where no characters > 0x7f
    totlen = len + 1
    flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
    # has multi-byte UTF-8 sequences
    buf = Vector{Char}(totlen)
    @inbounds buf[totlen] = 0 # NULL termination
    local ch::UInt32, surr::UInt32
    out = 0   # number of characters written to buf so far
    pos = 0   # index of the last byte consumed from dat
    @inbounds while out < len
        ch = dat[pos += 1]
        # Handle ASCII characters
        if ch <= 0x7f
            buf[out += 1] = ch
        # Handle range 0x80-0x7ff
        elseif ch < 0xe0
            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
        # Handle range 0x800-0xffff
        elseif ch < 0xf0
            pos += 2
            ch = get_utf8_3byte(dat, pos, ch)
            # Handle surrogate pairs (should have been encoded in 4 bytes)
            if is_surrogate_lead(ch)
                # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
                pos += 3
                surr = ((UInt32(dat[pos-2] & 0xf) << 12)
                        | (UInt32(dat[pos-1] & 0x3f) << 6)
                        | (dat[pos] & 0x3f))
                ch = get_supplementary(ch, surr)
            end
            buf[out += 1] = ch
        # Handle range 0x10000-0x10ffff
        else
            pos += 3
            buf[out += 1] = get_utf8_4byte(dat, pos, ch)
        end
    end
    return UTF32String(buf)
end
21
141
142
"""
    convert(::Type{UTF32String}, str::UTF16String)

Convert a `UTF16String` to a `UTF32String`.

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF16String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF32String}, str::UTF16String)
    dat = str.data
    len = sizeof(dat)
    # handle zero length string quickly (account for trailing \0)
    len <= 2 && return empty_utf32
    # get number of words to create (sizeof is in bytes, so >>>1 gives 16-bit words)
    len, flags, num4byte = check_string(dat, len>>>1)
    # No surrogate pairs, do optimized copy
    if (flags & UTF_UNICODE4) == 0
        @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
    end
    local ch::UInt32
    buf = Vector{Char}(len)
    out = 0   # number of characters written to buf so far
    pos = 0   # index of the last 16-bit word consumed from dat
    @inbounds while out < len
        ch = dat[pos += 1]
        # check for surrogate pair: the lead is combined with the following word
        if is_surrogate_lead(ch)
            ch = get_supplementary(ch, dat[pos += 1])
        end
        buf[out += 1] = ch
    end
    return UTF32String(buf)
end
176
+
177
"""
    convert(::Type{UTF16String}, dat::Vector{UInt32})

Convert a UTF-32 encoded vector of `UInt32` to a `UTF16String`.

### Input Arguments:
* `::Type{UTF16String}`
* `dat::Vector{UInt32}`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF16String}, dat::Vector{UInt32})
    len = sizeof(dat)
    # handle zero length string quickly
    # `dat` carries no terminator (room for one is added below), so only a
    # truly empty vector is the empty string; a one-element vector is a
    # one-character string and must not be dropped.
    len == 0 && return empty_utf16
    # get number of words to allocate (sizeof is in bytes, so >>>2 gives 32-bit units)
    len, flags, num4byte = check_string(dat, len>>>2)
    # each 4-byte character needs a second 16-bit word, plus one word for the terminator
    len += num4byte + 1
    # optimized path, no surrogates
    num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
    return encode_to_utf16(dat, len)
end
201
+
202
"""
    convert(::Type{UTF16String}, str::UTF32String)

Convert a `UTF32String` to a `UTF16String`.

### Input Arguments:
* `::Type{UTF16String}`
* `str::UTF32String`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"""
function convert(::Type{UTF16String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    len = sizeof(dat)
    # handle zero length string quickly (4 bytes or fewer is at most one 32-bit unit)
    len <= 4 && return empty_utf16
    # get number of words to allocate; the whole of dat is passed to the check
    len, flags, num4byte = check_string(dat, len>>>2)
    # optimized path, no surrogates: narrow every unit of dat to 16 bits
    if num4byte == 0
        @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
    end
    # each 4-byte character needs one additional 16-bit word
    return encode_to_utf16(dat, len + num4byte)
end
226
+
227
"""
    encode_to_utf16(dat, len)

Convert an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`.

### Input Arguments:
* `dat::Vector{UInt32}` UTF-32 encoded data
* `len` length of output in 16-bit words, including the terminating NUL word

### Returns:
* `::UTF16String`
"""
function encode_to_utf16(dat, len)
    buf = Vector{UInt16}(len)
    @inbounds buf[len] = 0 # NULL termination
    out = 0   # number of 16-bit words written so far
    pos = 0   # index of the last element consumed from dat
    # Slot `len` already holds the terminator, so only `len - 1` words are
    # produced from `dat`. Stopping there avoids reading one element past the
    # end of `dat` when the caller's data carries no terminator of its own.
    @inbounds while out < len - 1
        ch = UInt32(dat[pos += 1])
        if ch > 0xffff
            # Output surrogate pair for 0x10000-0x10ffff
            buf[out += 1] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        buf[out += 1] = ch
    end
    return UTF16String(buf)
end
253
+
254
# A vector of Char is viewed as its UInt32 code units and forwarded to the
# Vector{UInt32} conversion methods.
convert(::Type{UTF8String}, dat::Vector{Char}) =
    convert(UTF8String, reinterpret(UInt32, dat))

convert(::Type{UTF16String}, dat::Vector{Char}) =
    convert(UTF16String, reinterpret(UInt32, dat))

# The identical `convert(::Type{UTF32String}, c::Char)` method that was repeated
# here is already defined near the top of this file, so it is not defined again.
258
+
259
# Copies the string's bytes into a UTF32String through fast_utf_copy. The
# requested length is one more than the byte count (presumably the slot for
# the NUL terminator — confirm against fast_utf_copy).
convert(::Type{UTF32String}, str::ASCIIString) =
    fast_utf_copy(UTF32String, Char, length(str.data) + 1, str.data)
263
+
264
# Builds a UTF32String from a vector of Char through fast_utf_copy, passing
# `true` as its final flag argument.
# NOTE(review): a method with this exact signature is defined again right
# after this one in the file, and the later definition replaces this one —
# confirm which of the two is intended to remain.
function convert(::Type{UTF32String}, dat::AbstractVector{Char})
    return fast_utf_copy(UTF32String, Char, length(dat), dat, true)
end
265
+
22
266
function convert (:: Type{UTF32String} , data:: AbstractVector{Char} )
23
267
len = length (data)
24
268
d = Array (Char, len + 1 )
@@ -51,7 +295,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
51
295
convert (Ptr{T}, pointer (s))
52
296
53
297
function convert (T:: Type{UTF32String} , bytes:: AbstractArray{UInt8} )
54
- isempty (bytes) && return UTF32String (Char[ 0 ])
298
+ isempty (bytes) && return empty_utf32
55
299
length (bytes) & 3 != 0 && throw (UnicodeError (UTF_ERR_ODD_BYTES_32,0 ,0 ))
56
300
data = reinterpret (Char, bytes)
57
301
# check for byte-order mark (BOM):
0 commit comments