ConvertUTF.cpp source code [llvm_projects/llvm/lib/Support/ConvertUTF.cpp]

/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------=*/
/*
 * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
 * Distributed under the Terms of Use in
 * http://www.unicode.org/copyright.html.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of the Unicode data files and any associated documentation
 * (the "Data Files") or Unicode software and any associated documentation
 * (the "Software") to deal in the Data Files or Software
 * without restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, and/or sell copies of
 * the Data Files or Software, and to permit persons to whom the Data Files
 * or Software are furnished to do so, provided that
 * (a) this copyright and permission notice appear with all copies
 * of the Data Files or Software,
 * (b) this copyright and permission notice appear in associated
 * documentation, and
 * (c) there is clear notice in each modified Data File or in the Software
 * as well as in the documentation associated with the Data File(s) or
 * Software that the data or software has been modified.
 *
 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 *
 * Except as contained in this notice, the name of a copyright holder
 * shall not be used in advertising or otherwise to promote the sale,
 * use or other dealings in these Data Files or Software without prior
 * written authorization of the copyright holder.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "ConvertUTF.h" for complete documentation.

------------------------------------------------------------------------ */

64	#include "llvm/Support/ConvertUTF.h"
65	#ifdef CVTUTF_DEBUG
66	#include <stdio.h>
67	#endif
68	#include <assert.h>
69
70	/*
71	* This code extensively uses fall-through switches.
72	* Keep the compiler from warning about that.
73	*/
74	#if defined(__clang__) && defined(__has_warning)
75	# if __has_warning("-Wimplicit-fallthrough")
76	# define ConvertUTF_DISABLE_WARNINGS \
77	_Pragma("clang diagnostic push") \
78	_Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
79	# define ConvertUTF_RESTORE_WARNINGS \
80	_Pragma("clang diagnostic pop")
81	# endif
82	#elif defined(__GNUC__) && __GNUC__ > 6
83	# define ConvertUTF_DISABLE_WARNINGS \
84	_Pragma("GCC diagnostic push") \
85	_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
86	# define ConvertUTF_RESTORE_WARNINGS \
87	_Pragma("GCC diagnostic pop")
88	#endif
89	#ifndef ConvertUTF_DISABLE_WARNINGS
90	# define ConvertUTF_DISABLE_WARNINGS
91	#endif
92	#ifndef ConvertUTF_RESTORE_WARNINGS
93	# define ConvertUTF_RESTORE_WARNINGS
94	#endif
95
96	ConvertUTF_DISABLE_WARNINGS
97
98	namespace llvm {
99
100	static const int halfShift = `10`; / used for shifting by 10 bits /
101
102	static const UTF32 halfBase = `0x0010000UL`;
103	static const UTF32 halfMask = `0x3FFUL`;
104
105	#define UNI_SUR_HIGH_START (UTF32)0xD800
106	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
107	#define UNI_SUR_LOW_START (UTF32)0xDC00
108	#define UNI_SUR_LOW_END (UTF32)0xDFFF
109
/* --------------------------------------------------------------------- */

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
 * left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 *
 * Layout: 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

130	/*
131	* Magic values subtracted from a buffer value during UTF8 conversion.
132	* This table contains as many values as there might be trailing bytes
133	* in a UTF-8 sequence.
134	*/
135	static const UTF32 offsetsFromUTF8[`6`] = { `0x00000000UL`, `0x00003080UL`, `0x000E2080UL`,
136	`0x03C82080UL`, `0xFA082080UL`, `0x82082080UL` };
137
138	/*
139	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
140	* into the first byte, depending on how many bytes follow. There are
141	* as many entries in this table as there are UTF-8 sequence types.
142	* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
143	* for legal UTF-8 will be 4 or fewer bytes total.
144	*/
145	static const UTF8 firstByteMark[`7`] = { `0x00`, `0x00`, `0xC0`, `0xE0`, `0xF0`, `0xF8`, `0xFC` };
146
147	/ --------------------------------------------------------------------- /
148
149	/ The interface converts a whole buffer to avoid function-call overhead.*
150	* Constants have been gathered. Loops & conditionals have been removed as
151	* much as possible for efficiency, in favor of drop-through switches.
152	* (See "Note A" at the bottom of the file for equivalent code.)
153	* If your compiler supports it, the "isLegalUTF8" call can be turned
154	* into an inline function.
155	*/
156
157
158	/ --------------------------------------------------------------------- /
159
160	ConversionResult ConvertUTF32toUTF16 (
161	const UTF32** sourceStart, const UTF32* sourceEnd,
162	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
163	ConversionResult result = conversionOK;
164	const UTF32* source = *sourceStart;
165	UTF16* target = *targetStart;
166	while (source < sourceEnd) {
167	UTF32 ch;
168	if (target >= targetEnd) {
169	result = targetExhausted; break;
170	}
171	ch = *source++;
172	if (ch <= UNI_MAX_BMP) { / Target is a character <= 0xFFFF /
173	/ UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values /
174	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
175	if (flags == strictConversion) {
176	--source; / return to the illegal value itself /
177	result = sourceIllegal;
178	break;
179	} else {
180	*target++ = UNI_REPLACEMENT_CHAR;
181	}
182	} else {
183	target++ = (UTF16)ch; /* normal case /
184	}
185	} else if (ch > UNI_MAX_LEGAL_UTF32) {
186	if (flags == strictConversion) {
187	result = sourceIllegal;
188	} else {
189	*target++ = UNI_REPLACEMENT_CHAR;
190	}
191	} else {
192	/ target is a character in range 0xFFFF - 0x10FFFF. /
193	if (target + `1` >= targetEnd) {
194	--source; / Back up source pointer! /
195	result = targetExhausted; break;
196	}
197	ch -= halfBase;
198	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
199	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
200	}
201	}
202	*sourceStart = source;
203	*targetStart = target;
204	return result;
205	}
206
207	/ --------------------------------------------------------------------- /
208
209	ConversionResult ConvertUTF16toUTF32 (
210	const UTF16** sourceStart, const UTF16* sourceEnd,
211	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
212	ConversionResult result = conversionOK;
213	const UTF16* source = *sourceStart;
214	UTF32* target = *targetStart;
215	UTF32 ch, ch2;
216	while (source < sourceEnd) {
217	const UTF16* oldSource = source; / In case we have to back up because of target overflow. /
218	ch = *source++;
219	/ If we have a surrogate pair, convert to UTF32 first. /
220	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
221	/ If the 16 bits following the high surrogate are in the source buffer... /
222	if (source < sourceEnd) {
223	ch2 = *source;
224	/ If it's a low surrogate, convert to UTF32. /
225	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
226	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
227	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
228	++source;
229	} else if (flags == strictConversion) { / it's an unpaired high surrogate /
230	--source; / return to the illegal value itself /
231	result = sourceIllegal;
232	break;
233	}
234	} else { / We don't have the 16 bits following the high surrogate. /
235	--source; / return to the high surrogate /
236	result = sourceExhausted;
237	break;
238	}
239	} else if (flags == strictConversion) {
240	/ UTF-16 surrogate values are illegal in UTF-32 /
241	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
242	--source; / return to the illegal value itself /
243	result = sourceIllegal;
244	break;
245	}
246	}
247	if (target >= targetEnd) {
248	source = oldSource; / Back up source pointer! /
249	result = targetExhausted; break;
250	}
251	*target++ = ch;
252	}
253	*sourceStart = source;
254	*targetStart = target;
255	#ifdef CVTUTF_DEBUG
256	if (result == sourceIllegal) {
257	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
258	fflush(stderr);
259	}
260	#endif
261	return result;
262	}
263	ConversionResult ConvertUTF16toUTF8 (
264	const UTF16** sourceStart, const UTF16* sourceEnd,
265	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
266	ConversionResult result = conversionOK;
267	const UTF16* source = *sourceStart;
268	UTF8* target = *targetStart;
269	while (source < sourceEnd) {
270	UTF32 ch;
271	unsigned short bytesToWrite = `0`;
272	const UTF32 byteMask = `0xBF`;
273	const UTF32 byteMark = `0x80`;
274	const UTF16* oldSource = source; / In case we have to back up because of target overflow. /
275	ch = *source++;
276	/ If we have a surrogate pair, convert to UTF32 first. /
277	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
278	/ If the 16 bits following the high surrogate are in the source buffer... /
279	if (source < sourceEnd) {
280	UTF32 ch2 = *source;
281	/ If it's a low surrogate, convert to UTF32. /
282	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
283	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
284	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
285	++source;
286	} else if (flags == strictConversion) { / it's an unpaired high surrogate /
287	--source; / return to the illegal value itself /
288	result = sourceIllegal;
289	break;
290	}
291	} else { / We don't have the 16 bits following the high surrogate. /
292	--source; / return to the high surrogate /
293	result = sourceExhausted;
294	break;
295	}
296	} else if (flags == strictConversion) {
297	/ UTF-16 surrogate values are illegal in UTF-32 /
298	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
299	--source; / return to the illegal value itself /
300	result = sourceIllegal;
301	break;
302	}
303	}
304	/ Figure out how many bytes the result will require /
305	if (ch < (UTF32)`0x80`) { bytesToWrite = `1`;
306	} else if (ch < (UTF32)`0x800`) { bytesToWrite = `2`;
307	} else if (ch < (UTF32)`0x10000`) { bytesToWrite = `3`;
308	} else if (ch < (UTF32)`0x110000`) { bytesToWrite = `4`;
309	} else { bytesToWrite = `3`;
310	ch = UNI_REPLACEMENT_CHAR;
311	}
312
313	target += bytesToWrite;
314	if (target > targetEnd) {
315	source = oldSource; / Back up source pointer! /
316	target -= bytesToWrite; result = targetExhausted; break;
317	}
318	switch (bytesToWrite) { / note: everything falls through. /
319	case `4`: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= `6`;
320	case `3`: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= `6`;
321	case `2`: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= `6`;
322	case `1`: *--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
323	}
324	target += bytesToWrite;
325	}
326	*sourceStart = source;
327	*targetStart = target;
328	return result;
329	}
330
331	/ --------------------------------------------------------------------- /
332
333	ConversionResult ConvertUTF32toUTF8 (
334	const UTF32** sourceStart, const UTF32* sourceEnd,
335	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
336	ConversionResult result = conversionOK;
337	const UTF32* source = *sourceStart;
338	UTF8* target = *targetStart;
339	while (source < sourceEnd) {
340	UTF32 ch;
341	unsigned short bytesToWrite = `0`;
342	const UTF32 byteMask = `0xBF`;
343	const UTF32 byteMark = `0x80`;
344	ch = *source++;
345	if (flags == strictConversion ) {
346	/ UTF-16 surrogate values are illegal in UTF-32 /
347	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
348	--source; / return to the illegal value itself /
349	result = sourceIllegal;
350	break;
351	}
352	}
353	/*
354	* Figure out how many bytes the result will require. Turn any
355	* illegally large UTF32 things (> Plane 17) into replacement chars.
356	*/
357	if (ch < (UTF32)`0x80`) { bytesToWrite = `1`;
358	} else if (ch < (UTF32)`0x800`) { bytesToWrite = `2`;
359	} else if (ch < (UTF32)`0x10000`) { bytesToWrite = `3`;
360	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = `4`;
361	} else { bytesToWrite = `3`;
362	ch = UNI_REPLACEMENT_CHAR;
363	result = sourceIllegal;
364	}
365
366	target += bytesToWrite;
367	if (target > targetEnd) {
368	--source; / Back up source pointer! /
369	target -= bytesToWrite; result = targetExhausted; break;
370	}
371	switch (bytesToWrite) { / note: everything falls through. /
372	case `4`: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= `6`;
373	case `3`: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= `6`;
374	case `2`: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= `6`;
375	case `1`: *--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
376	}
377	target += bytesToWrite;
378	}
379	*sourceStart = source;
380	*targetStart = target;
381	return result;
382	}
383
384	/ --------------------------------------------------------------------- /
385
386	/*
387	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
388	* This must be called with the length pre-determined by the first byte.
389	* If not calling this from ConvertUTF8to*, then the length can be set by:
390	* length = trailingBytesForUTF8[*source]+1;
391	* and the sequence is illegal right away if there aren't that many bytes
392	* available.
393	* If presented with a length > 4, this returns false. The Unicode
394	* definition of UTF-8 goes up to 4-byte sequences.
395	*/
396
397	static Boolean isLegalUTF8(const UTF8 source, int* length) {
398	UTF8 a;
399	const UTF8 *srcptr = source+length;
400	switch (length) {
401	default: return false;
402	/ Everything else falls through when "true"... /
403	case `4`: if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
404	case `3`: if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
405	case `2`: if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
406
407	switch (*source) {
408	/ no fall-through in this inner switch /
409	case `0xE0`: if (a < `0xA0`) return false; break;
410	case `0xED`: if (a > `0x9F`) return false; break;
411	case `0xF0`: if (a < `0x90`) return false; break;
412	case `0xF4`: if (a > `0x8F`) return false; break;
413	default: if (a < `0x80`) return false;
414	}
415
416	case `1`: if (source >= `0x80` && source < `0xC2`) return false;
417	}
418	if (source > `0xF4`) return* false;
419	return true;
420	}
421
422	/ --------------------------------------------------------------------- /
423
424	/*
425	* Exported function to return whether a UTF-8 sequence is legal or not.
426	* This is not used here; it's just exported.
427	*/
428	Boolean isLegalUTF8Sequence(const UTF8 source, const* UTF8 *sourceEnd) {
429	int length = trailingBytesForUTF8[*source]+`1`;
430	if (length > sourceEnd - source) {
431	return false;
432	}
433	return isLegalUTF8(source, length);
434	}
435
436	/*
437	* Exported function to return the size of the first utf-8 code unit sequence,
438	* Or 0 if the sequence is not valid;
439	*/
440	unsigned getUTF8SequenceSize(const UTF8 source, const* UTF8 *sourceEnd) {
441	int length = trailingBytesForUTF8[*source] + `1`;
442	return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
443	: `0`;
444	}
445
446	/ --------------------------------------------------------------------- /
447
448	static unsigned
449	findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
450	const UTF8 *sourceEnd) {
451	UTF8 b1, b2, b3;
452
453	assert(!isLegalUTF8Sequence(source, sourceEnd));
454
455	/*
456	* Unicode 6.3.0, D93b:
457	*
458	* Maximal subpart of an ill-formed subsequence: The longest code unit
459	* subsequence starting at an unconvertible offset that is either:
460	* a. the initial subsequence of a well-formed code unit sequence, or
461	* b. a subsequence of length one.
462	*/
463
464	if (source == sourceEnd)
465	return `0`;
466
467	/*
468	* Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
469	* Byte Sequences.
470	*/
471
472	b1 = *source;
473	++source;
474	if (b1 >= `0xC2` && b1 <= `0xDF`) {
475	/*
476	* First byte is valid, but we know that this code unit sequence is
477	* invalid, so the maximal subpart has to end after the first byte.
478	*/
479	return `1`;
480	}
481
482	if (source == sourceEnd)
483	return `1`;
484
485	b2 = *source;
486	++source;
487
488	if (b1 == `0xE0`) {
489	return (b2 >= `0xA0` && b2 <= `0xBF`) ? `2` : `1`;
490	}
491	if (b1 >= `0xE1` && b1 <= `0xEC`) {
492	return (b2 >= `0x80` && b2 <= `0xBF`) ? `2` : `1`;
493	}
494	if (b1 == `0xED`) {
495	return (b2 >= `0x80` && b2 <= `0x9F`) ? `2` : `1`;
496	}
497	if (b1 >= `0xEE` && b1 <= `0xEF`) {
498	return (b2 >= `0x80` && b2 <= `0xBF`) ? `2` : `1`;
499	}
500	if (b1 == `0xF0`) {
501	if (b2 >= `0x90` && b2 <= `0xBF`) {
502	if (source == sourceEnd)
503	return `2`;
504
505	b3 = *source;
506	return (b3 >= `0x80` && b3 <= `0xBF`) ? `3` : `2`;
507	}
508	return `1`;
509	}
510	if (b1 >= `0xF1` && b1 <= `0xF3`) {
511	if (b2 >= `0x80` && b2 <= `0xBF`) {
512	if (source == sourceEnd)
513	return `2`;
514
515	b3 = *source;
516	return (b3 >= `0x80` && b3 <= `0xBF`) ? `3` : `2`;
517	}
518	return `1`;
519	}
520	if (b1 == `0xF4`) {
521	if (b2 >= `0x80` && b2 <= `0x8F`) {
522	if (source == sourceEnd)
523	return `2`;
524
525	b3 = *source;
526	return (b3 >= `0x80` && b3 <= `0xBF`) ? `3` : `2`;
527	}
528	return `1`;
529	}
530
531	assert((b1 >= `0x80` && b1 <= `0xC1`) \|\| b1 >= `0xF5`);
532	/*
533	* There are no valid sequences that start with these bytes. Maximal subpart
534	* is defined to have length 1 in these cases.
535	*/
536	return `1`;
537	}
538
539	/ --------------------------------------------------------------------- /
540
541	/*
542	* Exported function to return the total number of bytes in a codepoint
543	* represented in UTF-8, given the value of the first byte.
544	*/
545	unsigned getNumBytesForUTF8(UTF8 first) {
546	return trailingBytesForUTF8[first] + `1`;
547	}
548
549	/ --------------------------------------------------------------------- /
550
551	/*
552	* Exported function to return whether a UTF-8 string is legal or not.
553	* This is not used here; it's just exported.
554	*/
555	Boolean isLegalUTF8String(const UTF8 *source, const* UTF8 *sourceEnd) {
556	while (*source != sourceEnd) {
557	int length = trailingBytesForUTF8[**source] + `1`;
558	if (length > sourceEnd - source \|\| !isLegalUTF8(source: source, length))
559	return false;
560	*source += length;
561	}
562	return true;
563	}
564
565	/ --------------------------------------------------------------------- /
566
567	ConversionResult ConvertUTF8toUTF16 (
568	const UTF8** sourceStart, const UTF8* sourceEnd,
569	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
570	ConversionResult result = conversionOK;
571	const UTF8* source = *sourceStart;
572	UTF16* target = *targetStart;
573	while (source < sourceEnd) {
574	UTF32 ch = `0`;
575	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
576	if (extraBytesToRead >= sourceEnd - source) {
577	result = sourceExhausted; break;
578	}
579	/ Do this check whether lenient or strict /
580	if (!isLegalUTF8(source, length: extraBytesToRead+`1`)) {
581	result = sourceIllegal;
582	break;
583	}
584	/*
585	* The cases all fall through. See "Note A" below.
586	*/
587	switch (extraBytesToRead) {
588	case `5`: ch += source++; ch <<= `6`; /* remember, illegal UTF-8 /
589	case `4`: ch += source++; ch <<= `6`; /* remember, illegal UTF-8 /
590	case `3`: ch += *source++; ch <<= `6`;
591	case `2`: ch += *source++; ch <<= `6`;
592	case `1`: ch += *source++; ch <<= `6`;
593	case `0`: ch += *source++;
594	}
595	ch -= offsetsFromUTF8[extraBytesToRead];
596
597	if (target >= targetEnd) {
598	source -= (extraBytesToRead+`1`); / Back up source pointer! /
599	result = targetExhausted; break;
600	}
601	if (ch <= UNI_MAX_BMP) { / Target is a character <= 0xFFFF /
602	/ UTF-16 surrogate values are illegal in UTF-32 /
603	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
604	if (flags == strictConversion) {
605	source -= (extraBytesToRead+`1`); / return to the illegal value itself /
606	result = sourceIllegal;
607	break;
608	} else {
609	*target++ = UNI_REPLACEMENT_CHAR;
610	}
611	} else {
612	target++ = (UTF16)ch; /* normal case /
613	}
614	} else if (ch > UNI_MAX_UTF16) {
615	if (flags == strictConversion) {
616	result = sourceIllegal;
617	source -= (extraBytesToRead+`1`); / return to the start /
618	break; / Bail out; shouldn't continue /
619	} else {
620	*target++ = UNI_REPLACEMENT_CHAR;
621	}
622	} else {
623	/ target is a character in range 0xFFFF - 0x10FFFF. /
624	if (target + `1` >= targetEnd) {
625	source -= (extraBytesToRead+`1`); / Back up source pointer! /
626	result = targetExhausted; break;
627	}
628	ch -= halfBase;
629	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
630	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
631	}
632	}
633	*sourceStart = source;
634	*targetStart = target;
635	return result;
636	}
637
638	/ --------------------------------------------------------------------- /
639
640	static ConversionResult ConvertUTF8toUTF32Impl(
641	const UTF8** sourceStart, const UTF8* sourceEnd,
642	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
643	Boolean InputIsPartial) {
644	ConversionResult result = conversionOK;
645	const UTF8* source = *sourceStart;
646	UTF32* target = *targetStart;
647	while (source < sourceEnd) {
648	UTF32 ch = `0`;
649	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
650	if (extraBytesToRead >= sourceEnd - source) {
651	if (flags == strictConversion \|\| InputIsPartial) {
652	result = sourceExhausted;
653	break;
654	} else {
655	result = sourceIllegal;
656
657	/*
658	* Replace the maximal subpart of ill-formed sequence with
659	* replacement character.
660	*/
661	source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
662	sourceEnd);
663	*target++ = UNI_REPLACEMENT_CHAR;
664	continue;
665	}
666	}
667	if (target >= targetEnd) {
668	result = targetExhausted; break;
669	}
670
671	/ Do this check whether lenient or strict /
672	if (!isLegalUTF8(source, length: extraBytesToRead+`1`)) {
673	result = sourceIllegal;
674	if (flags == strictConversion) {
675	/ Abort conversion. /
676	break;
677	} else {
678	/*
679	* Replace the maximal subpart of ill-formed sequence with
680	* replacement character.
681	*/
682	source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
683	sourceEnd);
684	*target++ = UNI_REPLACEMENT_CHAR;
685	continue;
686	}
687	}
688	/*
689	* The cases all fall through. See "Note A" below.
690	*/
691	switch (extraBytesToRead) {
692	case `5`: ch += *source++; ch <<= `6`;
693	case `4`: ch += *source++; ch <<= `6`;
694	case `3`: ch += *source++; ch <<= `6`;
695	case `2`: ch += *source++; ch <<= `6`;
696	case `1`: ch += *source++; ch <<= `6`;
697	case `0`: ch += *source++;
698	}
699	ch -= offsetsFromUTF8[extraBytesToRead];
700
701	if (ch <= UNI_MAX_LEGAL_UTF32) {
702	/*
703	* UTF-16 surrogate values are illegal in UTF-32, and anything
704	* over Plane 17 (> 0x10FFFF) is illegal.
705	*/
706	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
707	if (flags == strictConversion) {
708	source -= (extraBytesToRead+`1`); / return to the illegal value itself /
709	result = sourceIllegal;
710	break;
711	} else {
712	*target++ = UNI_REPLACEMENT_CHAR;
713	}
714	} else {
715	*target++ = ch;
716	}
717	} else { / i.e., ch > UNI_MAX_LEGAL_UTF32 /
718	result = sourceIllegal;
719	*target++ = UNI_REPLACEMENT_CHAR;
720	}
721	}
722	*sourceStart = source;
723	*targetStart = target;
724	return result;
725	}
726
727	ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
728	const UTF8 *sourceEnd,
729	UTF32 **targetStart,
730	UTF32 *targetEnd,
731	ConversionFlags flags) {
732	return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
733	flags, /InputIsPartial=/true);
734	}
735
736	ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
737	const UTF8 sourceEnd, UTF32 *targetStart,
738	UTF32 *targetEnd, ConversionFlags flags) {
739	return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
740	flags, /InputIsPartial=/false);
741	}
742
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */

762	} // namespace llvm
763
764	ConvertUTF_RESTORE_WARNINGS
765

Browse the source code of llvm_projects/llvm/lib/Support/ConvertUTF.cpp