// ConvertUTFWrapper.cpp source code [llvm_projects/llvm/lib/Support/ConvertUTFWrapper.cpp]

//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

namespace llvm {

18	bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
19	char &ResultPtr, const* UTF8 *&ErrorPtr) {
20	assert(WideCharWidth == `1` \|\| WideCharWidth == `2` \|\| WideCharWidth == `4`);
21	ConversionResult result = conversionOK;
22	// Copy the character span over.
23	if (WideCharWidth == `1`) {
24	const UTF8 Pos = reinterpret_cast<const* UTF8*>(Source.begin());
25	if (!isLegalUTF8String(source: &Pos, sourceEnd: reinterpret_cast<const UTF8*>(Source.end()))) {
26	result = sourceIllegal;
27	ErrorPtr = Pos;
28	} else {
29	memcpy(dest: ResultPtr, src: Source.data(), n: Source.size());
30	ResultPtr += Source.size();
31	}
32	} else if (WideCharWidth == `2`) {
33	const UTF8 sourceStart = (const* UTF8*)Source.data();
34	// FIXME: Make the type of the result buffer correct instead of
35	// using reinterpret_cast.
36	UTF16 targetStart = reinterpret_cast<UTF16 >(ResultPtr);
37	ConversionFlags flags = strictConversion;
38	result =
39	ConvertUTF8toUTF16(sourceStart: &sourceStart, sourceEnd: sourceStart + Source.size(),
40	targetStart: &targetStart, targetEnd: targetStart + Source.size(), flags);
41	if (result == conversionOK)
42	ResultPtr = reinterpret_cast<char *>(targetStart);
43	else
44	ErrorPtr = sourceStart;
45	} else if (WideCharWidth == `4`) {
46	const UTF8 sourceStart = (const* UTF8 *)Source.data();
47	// FIXME: Make the type of the result buffer correct instead of
48	// using reinterpret_cast.
49	UTF32 targetStart = reinterpret_cast<UTF32 >(ResultPtr);
50	ConversionFlags flags = strictConversion;
51	result =
52	ConvertUTF8toUTF32(sourceStart: &sourceStart, sourceEnd: sourceStart + Source.size(),
53	targetStart: &targetStart, targetEnd: targetStart + Source.size(), flags);
54	if (result == conversionOK)
55	ResultPtr = reinterpret_cast<char *>(targetStart);
56	else
57	ErrorPtr = sourceStart;
58	}
59	assert((result != targetExhausted) &&
60	"ConvertUTF8toUTFXX exhausted target buffer");
61	return result == conversionOK;
62	}
63
64	bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
65	const UTF32 *SourceStart = &Source;
66	const UTF32 *SourceEnd = SourceStart + `1`;
67	UTF8 TargetStart = reinterpret_cast<UTF8 >(ResultPtr);
68	UTF8 *TargetEnd = TargetStart + `4`;
69	ConversionResult CR = ConvertUTF32toUTF8(
70	sourceStart: &SourceStart, sourceEnd: SourceEnd, targetStart: &TargetStart, targetEnd: TargetEnd, flags: strictConversion);
71	if (CR != conversionOK)
72	return false;
73
74	ResultPtr = reinterpret_cast<char *>(TargetStart);
75	return true;
76	}
77
78	bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
79	return (S.size() >= `2` && ((S [`0`] == `'\xff'` && S [`1`] == `'\xfe'`) \|\|
80	(S [`0`] == `'\xfe'` && S [`1`] == `'\xff'`)));
81	}
82
83	bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
84	assert(Out.empty());
85
86	// Error out on an uneven byte count.
87	if (SrcBytes.size() % `2`)
88	return false;
89
90	// Avoid OOB by returning early on empty input.
91	if (SrcBytes.empty())
92	return true;
93
94	const UTF16 Src = reinterpret_cast<const* UTF16 *>(SrcBytes.begin());
95	const UTF16 SrcEnd = reinterpret_cast<const* UTF16 *>(SrcBytes.end());
96
97	assert((uintptr_t)Src % sizeof(UTF16) == `0`);
98
99	// Byteswap if necessary.
100	std::vector<UTF16> ByteSwapped;
101	if (Src[`0`] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
102	ByteSwapped.insert(position: ByteSwapped.end(), first: Src, last: SrcEnd);
103	for (UTF16 &I : ByteSwapped)
104	I = llvm::byteswap<uint16_t>(V: I);
105	Src = &ByteSwapped [`0`];
106	SrcEnd = &ByteSwapped [ByteSwapped.size() - `1`] + `1`;
107	}
108
109	// Skip the BOM for conversion.
110	if (Src[`0`] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
111	Src++;
112
113	// Just allocate enough space up front. We'll shrink it later. Allocate
114	// enough that we can fit a null terminator without reallocating.
115	Out.resize(n: SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + `1`);
116	UTF8 Dst = reinterpret_cast<UTF8 >(&Out [`0`]);
117	UTF8 *DstEnd = Dst + Out.size();
118
119	ConversionResult CR =
120	ConvertUTF16toUTF8(sourceStart: &Src, sourceEnd: SrcEnd, targetStart: &Dst, targetEnd: DstEnd, flags: strictConversion);
121	assert(CR != targetExhausted);
122
123	if (CR != conversionOK) {
124	Out.clear();
125	return false;
126	}
127
128	Out.resize(n: reinterpret_cast<char *>(Dst) - &Out [`0`]);
129	Out.push_back(c: `0`);
130	Out.pop_back();
131	return true;
132	}
133
134	bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
135	return convertUTF16ToUTF8String(
136	SrcBytes: llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
137	Src.size() * sizeof(UTF16)),
138	Out);
139	}
140
141	bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
142	assert(Out.empty());
143
144	// Error out on an uneven byte count.
145	if (SrcBytes.size() % `4`)
146	return false;
147
148	// Avoid OOB by returning early on empty input.
149	if (SrcBytes.empty())
150	return true;
151
152	const UTF32 Src = reinterpret_cast<const* UTF32 *>(SrcBytes.begin());
153	const UTF32 SrcEnd = reinterpret_cast<const* UTF32 *>(SrcBytes.end());
154
155	assert((uintptr_t)Src % sizeof(UTF32) == `0`);
156
157	// Byteswap if necessary.
158	std::vector<UTF32> ByteSwapped;
159	if (Src[`0`] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) {
160	ByteSwapped.insert(position: ByteSwapped.end(), first: Src, last: SrcEnd);
161	for (UTF32 &I : ByteSwapped)
162	I = llvm::byteswap<uint32_t>(V: I);
163	Src = &ByteSwapped [`0`];
164	SrcEnd = &ByteSwapped [ByteSwapped.size() - `1`] + `1`;
165	}
166
167	// Skip the BOM for conversion.
168	if (Src[`0`] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
169	Src++;
170
171	// Just allocate enough space up front. We'll shrink it later. Allocate
172	// enough that we can fit a null terminator without reallocating.
173	Out.resize(n: SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + `1`);
174	UTF8 Dst = reinterpret_cast<UTF8 >(&Out [`0`]);
175	UTF8 *DstEnd = Dst + Out.size();
176
177	ConversionResult CR =
178	ConvertUTF32toUTF8(sourceStart: &Src, sourceEnd: SrcEnd, targetStart: &Dst, targetEnd: DstEnd, flags: strictConversion);
179	assert(CR != targetExhausted);
180
181	if (CR != conversionOK) {
182	Out.clear();
183	return false;
184	}
185
186	Out.resize(n: reinterpret_cast<char *>(Dst) - &Out [`0`]);
187	Out.push_back(c: `0`);
188	Out.pop_back();
189	return true;
190	}
191
192	bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
193	return convertUTF32ToUTF8String(
194	SrcBytes: llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
195	Src.size() * sizeof(UTF32)),
196	Out);
197	}
198
199	bool convertUTF8ToUTF16String(StringRef SrcUTF8,
200	SmallVectorImpl<UTF16> &DstUTF16) {
201	assert(DstUTF16.empty());
202
203	// Avoid OOB by returning early on empty input.
204	if (SrcUTF8.empty()) {
205	DstUTF16.push_back(Elt: `0`);
206	DstUTF16.pop_back();
207	return true;
208	}
209
210	const UTF8 Src = reinterpret_cast<const* UTF8 *>(SrcUTF8.begin());
211	const UTF8 SrcEnd = reinterpret_cast<const* UTF8 *>(SrcUTF8.end());
212
213	// Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
214	// as UTF-16 should always require the same amount or less code units than the
215	// UTF-8 encoding. Allocate one extra byte for the null terminator though,
216	// so that someone calling DstUTF16.data() gets a null terminated string.
217	// We resize down later so we don't have to worry that this over allocates.
218	DstUTF16.resize(N: SrcUTF8.size()+`1`);
219	UTF16 *Dst = &DstUTF16 [`0`];
220	UTF16 *DstEnd = Dst + DstUTF16.size();
221
222	ConversionResult CR =
223	ConvertUTF8toUTF16(sourceStart: &Src, sourceEnd: SrcEnd, targetStart: &Dst, targetEnd: DstEnd, flags: strictConversion);
224	assert(CR != targetExhausted);
225
226	if (CR != conversionOK) {
227	DstUTF16.clear();
228	return false;
229	}
230
231	DstUTF16.resize(N: Dst - &DstUTF16 [`0`]);
232	DstUTF16.push_back(Elt: `0`);
233	DstUTF16.pop_back();
234	return true;
235	}
236
// The wchar_t conversion helpers in this file dispatch on sizeof(wchar_t) and
// only handle these three widths.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");

241	template <typename TResult>
242	static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
243	TResult &Result) {
244	// Even in the case of UTF-16, the number of bytes in a UTF-8 string is
245	// at least as large as the number of elements in the resulting wide
246	// string, because surrogate pairs take at least 4 bytes in UTF-8.
247	Result.resize(Source.size() + `1`);
248	char ResultPtr = reinterpret_cast<char* *>(&Result[`0`]);
249	const UTF8 *ErrorPtr;
250	if (!ConvertUTF8toWide(WideCharWidth: sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
251	Result.clear();
252	return false;
253	}
254	Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[`0`]);
255	return true;
256	}
257
258	bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
259	return ConvertUTF8toWideInternal(Source, Result);
260	}
261
262	bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
263	if (!Source) {
264	Result.clear();
265	return true;
266	}
267	return ConvertUTF8toWide(Source: llvm::StringRef (Source), Result);
268	}
269
270	bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
271	if (sizeof(wchar_t) == `1`) {
272	const UTF8 Start = reinterpret_cast<const* UTF8 *>(Source.data());
273	const UTF8 *End =
274	reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
275	if (!isLegalUTF8String(source: &Start, sourceEnd: End))
276	return false;
277	Result.resize(n: Source.size());
278	memcpy(dest: &Result [`0`], src: Source.data(), n: Source.size());
279	return true;
280	} else if (sizeof(wchar_t) == `2`) {
281	return convertUTF16ToUTF8String(
282	Src: llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
283	Source.size()),
284	Out&: Result);
285	} else if (sizeof(wchar_t) == `4`) {
286	const UTF32 Start = reinterpret_cast<const* UTF32 *>(Source.data());
287	const UTF32 *End =
288	reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
289	Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
290	UTF8 ResultPtr = reinterpret_cast<UTF8 >(&Result [`0`]);
291	UTF8 ResultEnd = reinterpret_cast<UTF8 >(&Result [`0`] + Result.size());
292	if (ConvertUTF32toUTF8(sourceStart: &Start, sourceEnd: End, targetStart: &ResultPtr, targetEnd: ResultEnd,
293	flags: strictConversion) == conversionOK) {
294	Result.resize(n: reinterpret_cast<char *>(ResultPtr) - &Result [`0`]);
295	return true;
296	} else {
297	Result.clear();
298	return false;
299	}
300	} else {
301	llvm_unreachable(
302	"Control should never reach this point; see static_assert further up");
303	}
304	}
305
/// \returns true if \p V is at most 0x7F, i.e. it is encoded as a single
/// UTF-8 code unit.
bool IsSingleCodeUnitUTF8Codepoint(unsigned V) { return V <= 0x7F; }

/// \returns true if \p V lies in [0, 0xD7FF] or [0xE000, 0xFFFF]: values that
/// fit in one 16-bit code unit and are outside the surrogate range
/// 0xD800-0xDFFF.
bool IsSingleCodeUnitUTF16Codepoint(unsigned V) {
  return V <= 0xD7FF || (V >= 0xE000 && V <= 0xFFFF);
}

/// \returns true if \p V lies in [0, 0xD7FF] or [0xE000, 0x10FFFF]: values up
/// to 0x10FFFF that are outside the surrogate range 0xD800-0xDFFF.
bool IsSingleCodeUnitUTF32Codepoint(unsigned V) {
  return V <= 0xD7FF || (V >= 0xE000 && V <= 0x10FFFF);
}

} // end namespace llvm


// Browse the source code of llvm_projects/llvm/lib/Support/ConvertUTFWrapper.cpp