1 | /*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== |
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===------------------------------------------------------------------------=*/ |
8 | /* |
9 | * Copyright © 1991-2015 Unicode, Inc. All rights reserved. |
10 | * Distributed under the Terms of Use in |
11 | * http://www.unicode.org/copyright.html. |
12 | * |
13 | * Permission is hereby granted, free of charge, to any person obtaining |
14 | * a copy of the Unicode data files and any associated documentation |
15 | * (the "Data Files") or Unicode software and any associated documentation |
16 | * (the "Software") to deal in the Data Files or Software |
17 | * without restriction, including without limitation the rights to use, |
18 | * copy, modify, merge, publish, distribute, and/or sell copies of |
19 | * the Data Files or Software, and to permit persons to whom the Data Files |
20 | * or Software are furnished to do so, provided that |
21 | * (a) this copyright and permission notice appear with all copies |
22 | * of the Data Files or Software, |
23 | * (b) this copyright and permission notice appear in associated |
24 | * documentation, and |
25 | * (c) there is clear notice in each modified Data File or in the Software |
26 | * as well as in the documentation associated with the Data File(s) or |
27 | * Software that the data or software has been modified. |
28 | * |
29 | * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF |
30 | * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE |
31 | * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
32 | * NONINFRINGEMENT OF THIRD PARTY RIGHTS. |
33 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS |
34 | * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL |
35 | * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
36 | * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
37 | * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
38 | * PERFORMANCE OF THE DATA FILES OR SOFTWARE. |
39 | * |
40 | * Except as contained in this notice, the name of a copyright holder |
41 | * shall not be used in advertising or otherwise to promote the sale, |
42 | * use or other dealings in these Data Files or Software without prior |
43 | * written authorization of the copyright holder. |
44 | */ |
45 | |
46 | /* --------------------------------------------------------------------- |
47 | |
    Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
49 | Author: Mark E. Davis, 1994. |
50 | Rev History: Rick McGowan, fixes & updates May 2001. |
51 | Sept 2001: fixed const & error conditions per |
52 | mods suggested by S. Parent & A. Lillich. |
53 | June 2002: Tim Dodd added detection and handling of incomplete |
54 | source sequences, enhanced error detection, added casts |
55 | to eliminate compiler warnings. |
56 | July 2003: slight mods to back out aggressive FFFE detection. |
57 | Jan 2004: updated switches in from-UTF8 conversions. |
58 | Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. |
59 | |
60 | See the header file "ConvertUTF.h" for complete documentation. |
61 | |
62 | ------------------------------------------------------------------------ */ |
63 | |
64 | #include "llvm/Support/ConvertUTF.h" |
65 | #ifdef CVTUTF_DEBUG |
66 | #include <stdio.h> |
67 | #endif |
68 | #include <assert.h> |
69 | |
/*
 * This code extensively uses fall-through switches.
 * Keep the compiler from warning about that.
 *
 * Two macros are defined here:
 *   ConvertUTF_DISABLE_WARNINGS  - saves the diagnostic state and ignores
 *                                  -Wimplicit-fallthrough (expanded right
 *                                  below these definitions);
 *   ConvertUTF_RESTORE_WARNINGS  - pops the saved diagnostic state (expanded
 *                                  at the end of the file).
 * Clang is handled first (only when it knows the warning), then GCC 7 and
 * newer.  When neither branch defines the macros they fall back to empty
 * expansions so the two uses always compile.
 */
#if defined(__clang__) && defined(__has_warning)
# if __has_warning("-Wimplicit-fallthrough")
# define ConvertUTF_DISABLE_WARNINGS \
_Pragma("clang diagnostic push") \
_Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
# define ConvertUTF_RESTORE_WARNINGS \
_Pragma("clang diagnostic pop")
# endif
#elif defined(__GNUC__) && __GNUC__ > 6
# define ConvertUTF_DISABLE_WARNINGS \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
# define ConvertUTF_RESTORE_WARNINGS \
_Pragma("GCC diagnostic pop")
#endif
/* Fallbacks: no suppression available, so both macros expand to nothing. */
#ifndef ConvertUTF_DISABLE_WARNINGS
# define ConvertUTF_DISABLE_WARNINGS
#endif
#ifndef ConvertUTF_RESTORE_WARNINGS
# define ConvertUTF_RESTORE_WARNINGS
#endif

/* Suppress the fall-through warning for everything that follows. */
ConvertUTF_DISABLE_WARNINGS
97 | |
98 | namespace llvm { |
99 | |
100 | static const int halfShift = 10; /* used for shifting by 10 bits */ |
101 | |
102 | static const UTF32 halfBase = 0x0010000UL; |
103 | static const UTF32 halfMask = 0x3FFUL; |
104 | |
105 | #define UNI_SUR_HIGH_START (UTF32)0xD800 |
106 | #define UNI_SUR_HIGH_END (UTF32)0xDBFF |
107 | #define UNI_SUR_LOW_START (UTF32)0xDC00 |
108 | #define UNI_SUR_LOW_END (UTF32)0xDFFF |
109 | |
110 | /* --------------------------------------------------------------------- */ |
111 | |
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the value is the
 * number of trailing bytes that are supposed to follow that byte.
 * Bytes 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2 and 0xF0-0xF7
 * to 3.  The entries with 4 and 5 trailing bytes (0xF8-0xFF) can never occur
 * in *legal* UTF-8; they are kept for anyone who wants to handle the longer
 * forms that earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
129 | |
130 | /* |
131 | * Magic values subtracted from a buffer value during UTF8 conversion. |
132 | * This table contains as many values as there might be trailing bytes |
133 | * in a UTF-8 sequence. |
134 | */ |
135 | static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, |
136 | 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; |
137 | |
138 | /* |
139 | * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed |
140 | * into the first byte, depending on how many bytes follow. There are |
141 | * as many entries in this table as there are UTF-8 sequence types. |
142 | * (I.e., one byte sequence, two byte... etc.). Remember that sequencs |
143 | * for *legal* UTF-8 will be 4 or fewer bytes total. |
144 | */ |
145 | static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
146 | |
147 | /* --------------------------------------------------------------------- */ |
148 | |
149 | /* The interface converts a whole buffer to avoid function-call overhead. |
150 | * Constants have been gathered. Loops & conditionals have been removed as |
151 | * much as possible for efficiency, in favor of drop-through switches. |
152 | * (See "Note A" at the bottom of the file for equivalent code.) |
153 | * If your compiler supports it, the "isLegalUTF8" call can be turned |
154 | * into an inline function. |
155 | */ |
156 | |
157 | |
158 | /* --------------------------------------------------------------------- */ |
159 | |
160 | ConversionResult ConvertUTF32toUTF16 ( |
161 | const UTF32** sourceStart, const UTF32* sourceEnd, |
162 | UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { |
163 | ConversionResult result = conversionOK; |
164 | const UTF32* source = *sourceStart; |
165 | UTF16* target = *targetStart; |
166 | while (source < sourceEnd) { |
167 | UTF32 ch; |
168 | if (target >= targetEnd) { |
169 | result = targetExhausted; break; |
170 | } |
171 | ch = *source++; |
172 | if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ |
173 | /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ |
174 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { |
175 | if (flags == strictConversion) { |
176 | --source; /* return to the illegal value itself */ |
177 | result = sourceIllegal; |
178 | break; |
179 | } else { |
180 | *target++ = UNI_REPLACEMENT_CHAR; |
181 | } |
182 | } else { |
183 | *target++ = (UTF16)ch; /* normal case */ |
184 | } |
185 | } else if (ch > UNI_MAX_LEGAL_UTF32) { |
186 | if (flags == strictConversion) { |
187 | result = sourceIllegal; |
188 | } else { |
189 | *target++ = UNI_REPLACEMENT_CHAR; |
190 | } |
191 | } else { |
192 | /* target is a character in range 0xFFFF - 0x10FFFF. */ |
193 | if (target + 1 >= targetEnd) { |
194 | --source; /* Back up source pointer! */ |
195 | result = targetExhausted; break; |
196 | } |
197 | ch -= halfBase; |
198 | *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); |
199 | *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); |
200 | } |
201 | } |
202 | *sourceStart = source; |
203 | *targetStart = target; |
204 | return result; |
205 | } |
206 | |
207 | /* --------------------------------------------------------------------- */ |
208 | |
209 | ConversionResult ConvertUTF16toUTF32 ( |
210 | const UTF16** sourceStart, const UTF16* sourceEnd, |
211 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { |
212 | ConversionResult result = conversionOK; |
213 | const UTF16* source = *sourceStart; |
214 | UTF32* target = *targetStart; |
215 | UTF32 ch, ch2; |
216 | while (source < sourceEnd) { |
217 | const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ |
218 | ch = *source++; |
219 | /* If we have a surrogate pair, convert to UTF32 first. */ |
220 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { |
221 | /* If the 16 bits following the high surrogate are in the source buffer... */ |
222 | if (source < sourceEnd) { |
223 | ch2 = *source; |
224 | /* If it's a low surrogate, convert to UTF32. */ |
225 | if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { |
226 | ch = ((ch - UNI_SUR_HIGH_START) << halfShift) |
227 | + (ch2 - UNI_SUR_LOW_START) + halfBase; |
228 | ++source; |
229 | } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ |
230 | --source; /* return to the illegal value itself */ |
231 | result = sourceIllegal; |
232 | break; |
233 | } |
234 | } else { /* We don't have the 16 bits following the high surrogate. */ |
235 | --source; /* return to the high surrogate */ |
236 | result = sourceExhausted; |
237 | break; |
238 | } |
239 | } else if (flags == strictConversion) { |
240 | /* UTF-16 surrogate values are illegal in UTF-32 */ |
241 | if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { |
242 | --source; /* return to the illegal value itself */ |
243 | result = sourceIllegal; |
244 | break; |
245 | } |
246 | } |
247 | if (target >= targetEnd) { |
248 | source = oldSource; /* Back up source pointer! */ |
249 | result = targetExhausted; break; |
250 | } |
251 | *target++ = ch; |
252 | } |
253 | *sourceStart = source; |
254 | *targetStart = target; |
255 | #ifdef CVTUTF_DEBUG |
256 | if (result == sourceIllegal) { |
257 | fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n" , ch, ch2); |
258 | fflush(stderr); |
259 | } |
260 | #endif |
261 | return result; |
262 | } |
263 | ConversionResult ConvertUTF16toUTF8 ( |
264 | const UTF16** sourceStart, const UTF16* sourceEnd, |
265 | UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { |
266 | ConversionResult result = conversionOK; |
267 | const UTF16* source = *sourceStart; |
268 | UTF8* target = *targetStart; |
269 | while (source < sourceEnd) { |
270 | UTF32 ch; |
271 | unsigned short bytesToWrite = 0; |
272 | const UTF32 byteMask = 0xBF; |
273 | const UTF32 byteMark = 0x80; |
274 | const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ |
275 | ch = *source++; |
276 | /* If we have a surrogate pair, convert to UTF32 first. */ |
277 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { |
278 | /* If the 16 bits following the high surrogate are in the source buffer... */ |
279 | if (source < sourceEnd) { |
280 | UTF32 ch2 = *source; |
281 | /* If it's a low surrogate, convert to UTF32. */ |
282 | if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { |
283 | ch = ((ch - UNI_SUR_HIGH_START) << halfShift) |
284 | + (ch2 - UNI_SUR_LOW_START) + halfBase; |
285 | ++source; |
286 | } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ |
287 | --source; /* return to the illegal value itself */ |
288 | result = sourceIllegal; |
289 | break; |
290 | } |
291 | } else { /* We don't have the 16 bits following the high surrogate. */ |
292 | --source; /* return to the high surrogate */ |
293 | result = sourceExhausted; |
294 | break; |
295 | } |
296 | } else if (flags == strictConversion) { |
297 | /* UTF-16 surrogate values are illegal in UTF-32 */ |
298 | if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { |
299 | --source; /* return to the illegal value itself */ |
300 | result = sourceIllegal; |
301 | break; |
302 | } |
303 | } |
304 | /* Figure out how many bytes the result will require */ |
305 | if (ch < (UTF32)0x80) { bytesToWrite = 1; |
306 | } else if (ch < (UTF32)0x800) { bytesToWrite = 2; |
307 | } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; |
308 | } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; |
309 | } else { bytesToWrite = 3; |
310 | ch = UNI_REPLACEMENT_CHAR; |
311 | } |
312 | |
313 | target += bytesToWrite; |
314 | if (target > targetEnd) { |
315 | source = oldSource; /* Back up source pointer! */ |
316 | target -= bytesToWrite; result = targetExhausted; break; |
317 | } |
318 | switch (bytesToWrite) { /* note: everything falls through. */ |
319 | case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; |
320 | case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; |
321 | case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; |
322 | case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); |
323 | } |
324 | target += bytesToWrite; |
325 | } |
326 | *sourceStart = source; |
327 | *targetStart = target; |
328 | return result; |
329 | } |
330 | |
331 | /* --------------------------------------------------------------------- */ |
332 | |
333 | ConversionResult ConvertUTF32toUTF8 ( |
334 | const UTF32** sourceStart, const UTF32* sourceEnd, |
335 | UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { |
336 | ConversionResult result = conversionOK; |
337 | const UTF32* source = *sourceStart; |
338 | UTF8* target = *targetStart; |
339 | while (source < sourceEnd) { |
340 | UTF32 ch; |
341 | unsigned short bytesToWrite = 0; |
342 | const UTF32 byteMask = 0xBF; |
343 | const UTF32 byteMark = 0x80; |
344 | ch = *source++; |
345 | if (flags == strictConversion ) { |
346 | /* UTF-16 surrogate values are illegal in UTF-32 */ |
347 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { |
348 | --source; /* return to the illegal value itself */ |
349 | result = sourceIllegal; |
350 | break; |
351 | } |
352 | } |
353 | /* |
354 | * Figure out how many bytes the result will require. Turn any |
355 | * illegally large UTF32 things (> Plane 17) into replacement chars. |
356 | */ |
357 | if (ch < (UTF32)0x80) { bytesToWrite = 1; |
358 | } else if (ch < (UTF32)0x800) { bytesToWrite = 2; |
359 | } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; |
360 | } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; |
361 | } else { bytesToWrite = 3; |
362 | ch = UNI_REPLACEMENT_CHAR; |
363 | result = sourceIllegal; |
364 | } |
365 | |
366 | target += bytesToWrite; |
367 | if (target > targetEnd) { |
368 | --source; /* Back up source pointer! */ |
369 | target -= bytesToWrite; result = targetExhausted; break; |
370 | } |
371 | switch (bytesToWrite) { /* note: everything falls through. */ |
372 | case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; |
373 | case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; |
374 | case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; |
375 | case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); |
376 | } |
377 | target += bytesToWrite; |
378 | } |
379 | *sourceStart = source; |
380 | *targetStart = target; |
381 | return result; |
382 | } |
383 | |
384 | /* --------------------------------------------------------------------- */ |
385 | |
386 | /* |
387 | * Utility routine to tell whether a sequence of bytes is legal UTF-8. |
388 | * This must be called with the length pre-determined by the first byte. |
389 | * If not calling this from ConvertUTF8to*, then the length can be set by: |
390 | * length = trailingBytesForUTF8[*source]+1; |
391 | * and the sequence is illegal right away if there aren't that many bytes |
392 | * available. |
393 | * If presented with a length > 4, this returns false. The Unicode |
394 | * definition of UTF-8 goes up to 4-byte sequences. |
395 | */ |
396 | |
397 | static Boolean isLegalUTF8(const UTF8 *source, int length) { |
398 | UTF8 a; |
399 | const UTF8 *srcptr = source+length; |
400 | switch (length) { |
401 | default: return false; |
402 | /* Everything else falls through when "true"... */ |
403 | case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
404 | case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
405 | case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
406 | |
407 | switch (*source) { |
408 | /* no fall-through in this inner switch */ |
409 | case 0xE0: if (a < 0xA0) return false; break; |
410 | case 0xED: if (a > 0x9F) return false; break; |
411 | case 0xF0: if (a < 0x90) return false; break; |
412 | case 0xF4: if (a > 0x8F) return false; break; |
413 | default: if (a < 0x80) return false; |
414 | } |
415 | |
416 | case 1: if (*source >= 0x80 && *source < 0xC2) return false; |
417 | } |
418 | if (*source > 0xF4) return false; |
419 | return true; |
420 | } |
421 | |
422 | /* --------------------------------------------------------------------- */ |
423 | |
424 | /* |
425 | * Exported function to return whether a UTF-8 sequence is legal or not. |
426 | * This is not used here; it's just exported. |
427 | */ |
428 | Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { |
429 | int length = trailingBytesForUTF8[*source]+1; |
430 | if (length > sourceEnd - source) { |
431 | return false; |
432 | } |
433 | return isLegalUTF8(source, length); |
434 | } |
435 | |
436 | /* |
437 | * Exported function to return the size of the first utf-8 code unit sequence, |
438 | * Or 0 if the sequence is not valid; |
439 | */ |
440 | unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { |
441 | int length = trailingBytesForUTF8[*source] + 1; |
442 | return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length |
443 | : 0; |
444 | } |
445 | |
446 | /* --------------------------------------------------------------------- */ |
447 | |
448 | static unsigned |
449 | findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, |
450 | const UTF8 *sourceEnd) { |
451 | UTF8 b1, b2, b3; |
452 | |
453 | assert(!isLegalUTF8Sequence(source, sourceEnd)); |
454 | |
455 | /* |
456 | * Unicode 6.3.0, D93b: |
457 | * |
458 | * Maximal subpart of an ill-formed subsequence: The longest code unit |
459 | * subsequence starting at an unconvertible offset that is either: |
460 | * a. the initial subsequence of a well-formed code unit sequence, or |
461 | * b. a subsequence of length one. |
462 | */ |
463 | |
464 | if (source == sourceEnd) |
465 | return 0; |
466 | |
467 | /* |
468 | * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 |
469 | * Byte Sequences. |
470 | */ |
471 | |
472 | b1 = *source; |
473 | ++source; |
474 | if (b1 >= 0xC2 && b1 <= 0xDF) { |
475 | /* |
476 | * First byte is valid, but we know that this code unit sequence is |
477 | * invalid, so the maximal subpart has to end after the first byte. |
478 | */ |
479 | return 1; |
480 | } |
481 | |
482 | if (source == sourceEnd) |
483 | return 1; |
484 | |
485 | b2 = *source; |
486 | ++source; |
487 | |
488 | if (b1 == 0xE0) { |
489 | return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; |
490 | } |
491 | if (b1 >= 0xE1 && b1 <= 0xEC) { |
492 | return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; |
493 | } |
494 | if (b1 == 0xED) { |
495 | return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; |
496 | } |
497 | if (b1 >= 0xEE && b1 <= 0xEF) { |
498 | return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; |
499 | } |
500 | if (b1 == 0xF0) { |
501 | if (b2 >= 0x90 && b2 <= 0xBF) { |
502 | if (source == sourceEnd) |
503 | return 2; |
504 | |
505 | b3 = *source; |
506 | return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; |
507 | } |
508 | return 1; |
509 | } |
510 | if (b1 >= 0xF1 && b1 <= 0xF3) { |
511 | if (b2 >= 0x80 && b2 <= 0xBF) { |
512 | if (source == sourceEnd) |
513 | return 2; |
514 | |
515 | b3 = *source; |
516 | return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; |
517 | } |
518 | return 1; |
519 | } |
520 | if (b1 == 0xF4) { |
521 | if (b2 >= 0x80 && b2 <= 0x8F) { |
522 | if (source == sourceEnd) |
523 | return 2; |
524 | |
525 | b3 = *source; |
526 | return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; |
527 | } |
528 | return 1; |
529 | } |
530 | |
531 | assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); |
532 | /* |
533 | * There are no valid sequences that start with these bytes. Maximal subpart |
534 | * is defined to have length 1 in these cases. |
535 | */ |
536 | return 1; |
537 | } |
538 | |
539 | /* --------------------------------------------------------------------- */ |
540 | |
541 | /* |
542 | * Exported function to return the total number of bytes in a codepoint |
543 | * represented in UTF-8, given the value of the first byte. |
544 | */ |
545 | unsigned getNumBytesForUTF8(UTF8 first) { |
546 | return trailingBytesForUTF8[first] + 1; |
547 | } |
548 | |
549 | /* --------------------------------------------------------------------- */ |
550 | |
551 | /* |
552 | * Exported function to return whether a UTF-8 string is legal or not. |
553 | * This is not used here; it's just exported. |
554 | */ |
555 | Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { |
556 | while (*source != sourceEnd) { |
557 | int length = trailingBytesForUTF8[**source] + 1; |
558 | if (length > sourceEnd - *source || !isLegalUTF8(source: *source, length)) |
559 | return false; |
560 | *source += length; |
561 | } |
562 | return true; |
563 | } |
564 | |
565 | /* --------------------------------------------------------------------- */ |
566 | |
567 | ConversionResult ConvertUTF8toUTF16 ( |
568 | const UTF8** sourceStart, const UTF8* sourceEnd, |
569 | UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { |
570 | ConversionResult result = conversionOK; |
571 | const UTF8* source = *sourceStart; |
572 | UTF16* target = *targetStart; |
573 | while (source < sourceEnd) { |
574 | UTF32 ch = 0; |
575 | unsigned short = trailingBytesForUTF8[*source]; |
576 | if (extraBytesToRead >= sourceEnd - source) { |
577 | result = sourceExhausted; break; |
578 | } |
579 | /* Do this check whether lenient or strict */ |
580 | if (!isLegalUTF8(source, length: extraBytesToRead+1)) { |
581 | result = sourceIllegal; |
582 | break; |
583 | } |
584 | /* |
585 | * The cases all fall through. See "Note A" below. |
586 | */ |
587 | switch (extraBytesToRead) { |
588 | case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ |
589 | case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ |
590 | case 3: ch += *source++; ch <<= 6; |
591 | case 2: ch += *source++; ch <<= 6; |
592 | case 1: ch += *source++; ch <<= 6; |
593 | case 0: ch += *source++; |
594 | } |
595 | ch -= offsetsFromUTF8[extraBytesToRead]; |
596 | |
597 | if (target >= targetEnd) { |
598 | source -= (extraBytesToRead+1); /* Back up source pointer! */ |
599 | result = targetExhausted; break; |
600 | } |
601 | if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ |
602 | /* UTF-16 surrogate values are illegal in UTF-32 */ |
603 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { |
604 | if (flags == strictConversion) { |
605 | source -= (extraBytesToRead+1); /* return to the illegal value itself */ |
606 | result = sourceIllegal; |
607 | break; |
608 | } else { |
609 | *target++ = UNI_REPLACEMENT_CHAR; |
610 | } |
611 | } else { |
612 | *target++ = (UTF16)ch; /* normal case */ |
613 | } |
614 | } else if (ch > UNI_MAX_UTF16) { |
615 | if (flags == strictConversion) { |
616 | result = sourceIllegal; |
617 | source -= (extraBytesToRead+1); /* return to the start */ |
618 | break; /* Bail out; shouldn't continue */ |
619 | } else { |
620 | *target++ = UNI_REPLACEMENT_CHAR; |
621 | } |
622 | } else { |
623 | /* target is a character in range 0xFFFF - 0x10FFFF. */ |
624 | if (target + 1 >= targetEnd) { |
625 | source -= (extraBytesToRead+1); /* Back up source pointer! */ |
626 | result = targetExhausted; break; |
627 | } |
628 | ch -= halfBase; |
629 | *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); |
630 | *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); |
631 | } |
632 | } |
633 | *sourceStart = source; |
634 | *targetStart = target; |
635 | return result; |
636 | } |
637 | |
638 | /* --------------------------------------------------------------------- */ |
639 | |
640 | static ConversionResult ConvertUTF8toUTF32Impl( |
641 | const UTF8** sourceStart, const UTF8* sourceEnd, |
642 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, |
643 | Boolean InputIsPartial) { |
644 | ConversionResult result = conversionOK; |
645 | const UTF8* source = *sourceStart; |
646 | UTF32* target = *targetStart; |
647 | while (source < sourceEnd) { |
648 | UTF32 ch = 0; |
649 | unsigned short = trailingBytesForUTF8[*source]; |
650 | if (extraBytesToRead >= sourceEnd - source) { |
651 | if (flags == strictConversion || InputIsPartial) { |
652 | result = sourceExhausted; |
653 | break; |
654 | } else { |
655 | result = sourceIllegal; |
656 | |
657 | /* |
658 | * Replace the maximal subpart of ill-formed sequence with |
659 | * replacement character. |
660 | */ |
661 | source += findMaximalSubpartOfIllFormedUTF8Sequence(source, |
662 | sourceEnd); |
663 | *target++ = UNI_REPLACEMENT_CHAR; |
664 | continue; |
665 | } |
666 | } |
667 | if (target >= targetEnd) { |
668 | result = targetExhausted; break; |
669 | } |
670 | |
671 | /* Do this check whether lenient or strict */ |
672 | if (!isLegalUTF8(source, length: extraBytesToRead+1)) { |
673 | result = sourceIllegal; |
674 | if (flags == strictConversion) { |
675 | /* Abort conversion. */ |
676 | break; |
677 | } else { |
678 | /* |
679 | * Replace the maximal subpart of ill-formed sequence with |
680 | * replacement character. |
681 | */ |
682 | source += findMaximalSubpartOfIllFormedUTF8Sequence(source, |
683 | sourceEnd); |
684 | *target++ = UNI_REPLACEMENT_CHAR; |
685 | continue; |
686 | } |
687 | } |
688 | /* |
689 | * The cases all fall through. See "Note A" below. |
690 | */ |
691 | switch (extraBytesToRead) { |
692 | case 5: ch += *source++; ch <<= 6; |
693 | case 4: ch += *source++; ch <<= 6; |
694 | case 3: ch += *source++; ch <<= 6; |
695 | case 2: ch += *source++; ch <<= 6; |
696 | case 1: ch += *source++; ch <<= 6; |
697 | case 0: ch += *source++; |
698 | } |
699 | ch -= offsetsFromUTF8[extraBytesToRead]; |
700 | |
701 | if (ch <= UNI_MAX_LEGAL_UTF32) { |
702 | /* |
703 | * UTF-16 surrogate values are illegal in UTF-32, and anything |
704 | * over Plane 17 (> 0x10FFFF) is illegal. |
705 | */ |
706 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { |
707 | if (flags == strictConversion) { |
708 | source -= (extraBytesToRead+1); /* return to the illegal value itself */ |
709 | result = sourceIllegal; |
710 | break; |
711 | } else { |
712 | *target++ = UNI_REPLACEMENT_CHAR; |
713 | } |
714 | } else { |
715 | *target++ = ch; |
716 | } |
717 | } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ |
718 | result = sourceIllegal; |
719 | *target++ = UNI_REPLACEMENT_CHAR; |
720 | } |
721 | } |
722 | *sourceStart = source; |
723 | *targetStart = target; |
724 | return result; |
725 | } |
726 | |
727 | ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, |
728 | const UTF8 *sourceEnd, |
729 | UTF32 **targetStart, |
730 | UTF32 *targetEnd, |
731 | ConversionFlags flags) { |
732 | return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, |
733 | flags, /*InputIsPartial=*/true); |
734 | } |
735 | |
736 | ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, |
737 | const UTF8 *sourceEnd, UTF32 **targetStart, |
738 | UTF32 *targetEnd, ConversionFlags flags) { |
739 | return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, |
740 | flags, /*InputIsPartial=*/false); |
741 | } |
742 | |
743 | /* --------------------------------------------------------------------- |
744 | |
745 | Note A. |
746 | The fall-through switches in UTF-8 reading code save a |
747 | temp variable, some decrements & conditionals. The switches |
748 | are equivalent to the following loop: |
749 | { |
750 | int tmpBytesToRead = extraBytesToRead+1; |
751 | do { |
752 | ch += *source++; |
753 | --tmpBytesToRead; |
754 | if (tmpBytesToRead) ch <<= 6; |
755 | } while (tmpBytesToRead > 0); |
756 | } |
757 | In UTF-8 writing code, the switches on "bytesToWrite" are |
758 | similarly unrolled loops. |
759 | |
760 | --------------------------------------------------------------------- */ |
761 | |
762 | } // namespace llvm |
763 | |
/*
 * Undo ConvertUTF_DISABLE_WARNINGS: pops the diagnostic state on compilers
 * where the suppression is active, expands to nothing elsewhere.
 */
ConvertUTF_RESTORE_WARNINGS
765 | |