TextEncoding.cpp source code [llvm_projects/llvm/lib/Support/TextEncoding.cpp]

//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file provides utility classes to convert between different character
11	/// encodings.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/Support/TextEncoding.h"
16	#include "llvm/ADT/SmallString.h"
17	#include "llvm/ADT/SmallVector.h"
18	#include "llvm/ADT/StringExtras.h"
19	#include "llvm/Support/ConvertEBCDIC.h"
20	#include <system_error>
21
22	#if HAVE_ICU
23	#include <unicode/ucnv.h>
24	#elif HAVE_ICONV
25	#include <iconv.h>
26	#endif
27
28	using namespace llvm;
29
30	// Normalize the charset name with the charset alias matching algorithm proposed
31	// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
32	static void normalizeCharSetName(StringRef CSName,
33	SmallVectorImpl<char> &Normalized) {
34	bool PrevDigit = false;
35	for (auto Ch : CSName) {
36	if (isAlnum(C: Ch)) {
37	Ch = toLower(x: Ch);
38	if (Ch != `'0'` \|\| PrevDigit) {
39	PrevDigit = isDigit(C: Ch);
40	Normalized.push_back(Elt: Ch);
41	}
42	}
43	}
44	}
45
46	// Maps the encoding name to enum constant if possible.
47	static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
48	SmallString<`16`> Normalized;
49	normalizeCharSetName(CSName: Name, Normalized);
50	if (Normalized.equals(RHS: "utf8"))
51	return TextEncoding::UTF8;
52	if (Normalized.equals(RHS: "ibm1047"))
53	return TextEncoding::IBM1047;
54	return std::nullopt;
55	}
56
57	LLVM_ATTRIBUTE_UNUSED static void
58	HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
59	SmallVectorImpl<char> &Result) {
60	// No space left in output buffer. Double the size of the underlying
61	// memory in the SmallVectorImpl, adjust pointer and length and continue
62	// the conversion.
63	Capacity =
64	(Capacity < Result.max_size() / `2`) ? `2` * Capacity : Result.max_size();
65	Result.resize(N: `0`);
66	Result.resize_for_overwrite(N: Capacity);
67	Output = static_cast<char *>(Result.data());
68	OutputLength = Capacity;
69	}
70
71	namespace {
// Direction of a table-driven conversion.
enum ConversionType {
  UTF8ToIBM1047, // Source is UTF-8, result is IBM-1047.
  IBM1047ToUTF8, // Source is IBM-1047, result is UTF-8.
};
76
77	// Support conversion between EBCDIC 1047 and UTF-8. This class uses
78	// built-in translation tables that allow for translation between the
79	// aforementioned encodings. The use of tables for conversion is only
80	// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
81	// encodings are not supported.
82	class TextEncodingConverterTable final
83	: public details::TextEncodingConverterImplBase {
84	const ConversionType ConvType;
85
86	public:
87	TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
88
89	std::error_code convertString(StringRef Source,
90	SmallVectorImpl<char> &Result) override;
91
92	void reset() override {}
93	};
94
95	std::error_code
96	TextEncodingConverterTable::convertString(StringRef Source,
97	SmallVectorImpl<char> &Result) {
98	switch (ConvType) {
99	case IBM1047ToUTF8:
100	ConverterEBCDIC::convertToUTF8(Source, Result);
101	return std::error_code ();
102	case UTF8ToIBM1047:
103	return ConverterEBCDIC::convertToEBCDIC(Source, Result);
104	}
105	llvm_unreachable("Invalid ConvType!");
106	return std::error_code ();
107	}
108
109	#if HAVE_ICU
110	struct UConverterDeleter {
111	void operator()(UConverter Converter) const* {
112	if (Converter)
113	ucnv_close(Converter);
114	}
115	};
116	using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
117
118	class TextEncodingConverterICU final
119	: public details::TextEncodingConverterImplBase {
120	UConverterUniquePtr FromConvDesc;
121	UConverterUniquePtr ToConvDesc;
122
123	public:
124	TextEncodingConverterICU(UConverterUniquePtr FromConverter,
125	UConverterUniquePtr ToConverter)
126	: FromConvDesc(std::move(FromConverter)),
127	ToConvDesc(std::move(ToConverter)) {}
128
129	std::error_code convertString(StringRef Source,
130	SmallVectorImpl<char> &Result) override;
131
132	void reset() override;
133	};
134
135	// TODO: The current implementation discards the partial result and restarts the
136	// conversion from the beginning if there is a conversion error due to
137	// insufficient buffer size. In the future, it would better to save the partial
138	// result and resume the conversion for the remaining string.
139	// TODO: Improve translation of ICU errors to error_code
140	std::error_code
141	TextEncodingConverterICU::convertString(StringRef Source,
142	SmallVectorImpl<char> &Result) {
143	// Setup the input in case it has no backing data.
144	size_t InputLength = Source.size();
145	const char In = InputLength ? const_cast<char* *>(Source.data()) : "";
146
147	// Setup the output. We directly write into the SmallVector.
148	size_t Capacity = Result.capacity();
149	size_t OutputLength = Capacity;
150	Result.resize_for_overwrite(Capacity);
151	char *Output;
152	UErrorCode EC = U_ZERO_ERROR;
153
154	ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
155	&EC);
156	ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
157	NULL, &EC);
158	assert(U_SUCCESS(EC));
159
160	do {
161	EC = U_ZERO_ERROR;
162	const char *Input = In;
163
164	Output = InputLength ? static_cast<char >(Result.data()) : nullptr*;
165	ucnv_convertEx(&ToConvDesc, &FromConvDesc, &Output, Result.end(), &Input,
166	In + InputLength, /pivotStart=/NULL,
167	/pivotSource=/NULL, /pivotTarget=/NULL,
168	/pivotLimit=/NULL, /reset=/true,
169	/flush=/true, &EC);
170	if (U_FAILURE(EC)) {
171	if (EC == U_BUFFER_OVERFLOW_ERROR) {
172	if (Capacity < Result.max_size()) {
173	HandleOverflow(Capacity, Output, OutputLength, Result);
174	continue;
175	} else
176	return std::error_code(E2BIG, std::generic_category());
177	}
178	// Some other error occured.
179	Result.resize(Output - Result.data());
180	return std::error_code(EILSEQ, std::generic_category());
181	}
182	break;
183	} while (true);
184
185	Result.resize(Output - Result.data());
186	return std::error_code();
187	}
188
189	void TextEncodingConverterICU::reset() {
190	ucnv_reset(&*FromConvDesc);
191	ucnv_reset(&*ToConvDesc);
192	}
193
194	#elif HAVE_ICONV
195	class TextEncodingConverterIconv final
196	: public details::TextEncodingConverterImplBase {
197	class UniqueIconvT {
198	iconv_t ConvDesc;
199
200	public:
201	operator iconv_t() const { return ConvDesc; }
202	UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
203	~UniqueIconvT() {
204	if (ConvDesc != (iconv_t)-`1`) {
205	iconv_close(ConvDesc);
206	ConvDesc = (iconv_t)-`1`;
207	}
208	}
209	UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
210	Other.ConvDesc = (iconv_t)-`1`;
211	}
212	UniqueIconvT &operator=(UniqueIconvT &&Other) {
213	if (&Other != this) {
214	ConvDesc = Other.ConvDesc;
215	Other.ConvDesc = (iconv_t)-`1`;
216	}
217	return *this;
218	}
219	};
220	UniqueIconvT ConvDesc;
221
222	public:
223	TextEncodingConverterIconv(UniqueIconvT ConvDesc)
224	: ConvDesc(std::move(ConvDesc)) {}
225
226	std::error_code convertString(StringRef Source,
227	SmallVectorImpl<char> &Result) override;
228
229	void reset() override;
230	};
231
232	// TODO: The current implementation discards the partial result and restarts the
233	// conversion from the beginning if there is a conversion error due to
234	// insufficient buffer size. In the future, it would better to save the partial
235	// result and resume the conversion for the remaining string.
236	std::error_code
237	TextEncodingConverterIconv::convertString(StringRef Source,
238	SmallVectorImpl<char> &Result) {
239	// Setup the output. We directly write into the SmallVector.
240	size_t Capacity = Result.capacity();
241	char Output = static_cast<char* *>(Result.data());
242	size_t OutputLength = Capacity;
243	Result.resize_for_overwrite(Capacity);
244
245	size_t Ret;
246	// Handle errors returned from iconv().
247	auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
248	this](size_t Ret) {
249	if (Ret == static_cast<size_t>(-`1`)) {
250	// An error occured. Check if we can gracefully handle it.
251	if (errno == E2BIG && Capacity < Result.max_size()) {
252	HandleOverflow(Capacity, Output, OutputLength, Result);
253	// Reset converter
254	reset();
255	return std::error_code();
256	} else {
257	// Some other error occured.
258	Result.resize(Output - Result.data());
259	return std::error_code(errno, std::generic_category());
260	}
261	} else {
262	// A positive return value indicates that some characters were converted
263	// in a nonreversible way, that is, replaced with a SUB symbol. Returning
264	// an error in this case makes sure that both conversion routines behave
265	// in the same way.
266	return std::make_error_code(std::errc::illegal_byte_sequence);
267	}
268	};
269
270	do {
271	// Setup the input. Use nullptr to reset iconv state if input length is
272	// zero.
273	size_t InputLength = Source.size();
274	char Input = const_cast<char* *>(InputLength ? Source.data() : "");
275	Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
276	if (Ret != `0`) {
277	if (auto EC = HandleError(Ret))
278	return EC;
279	continue;
280	}
281	// Flush the converter
282	Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
283	if (Ret != `0`) {
284	if (auto EC = HandleError(Ret))
285	return EC;
286	continue;
287	}
288	break;
289	} while (true);
290
291	// Re-adjust size to actual size.
292	Result.resize(Output - Result.data());
293	return std::error_code();
294	}
295
296	inline void TextEncodingConverterIconv::reset() {
297	iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
298	}
299
300	#endif // HAVE_ICONV
301	} // namespace
302
303	ErrorOr<TextEncodingConverter>
304	TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
305
306	// Text encodings should be distinct.
307	if (CPFrom == CPTo)
308	return std::make_error_code(e: std::errc::invalid_argument);
309
310	ConversionType Conversion;
311	if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
312	Conversion = UTF8ToIBM1047;
313	else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
314	Conversion = IBM1047ToUTF8;
315	else
316	return std::make_error_code(e: std::errc::invalid_argument);
317
318	return TextEncodingConverter (
319	std::make_unique<TextEncodingConverterTable>(args&: Conversion));
320	}
321
322	ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
323	StringRef To) {
324	std::optional<TextEncoding> FromEncoding = getKnownEncoding(Name: From);
325	std::optional<TextEncoding> ToEncoding = getKnownEncoding(Name: To);
326	if (FromEncoding && ToEncoding) {
327	ErrorOr<TextEncodingConverter> Converter =
328	create(CPFrom: FromEncoding, CPTo: ToEncoding);
329	if (Converter)
330	return Converter;
331	}
332	#if HAVE_ICU
333	UErrorCode EC = U_ZERO_ERROR;
334	UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
335	if (U_FAILURE(EC))
336	return std::make_error_code(std::errc::invalid_argument);
337
338	UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
339	if (U_FAILURE(EC))
340	return std::make_error_code(std::errc::invalid_argument);
341
342	auto Converter = std::make_unique<TextEncodingConverterICU>(
343	std::move(FromConvDesc), std::move(ToConvDesc));
344	return TextEncodingConverter(std::move(Converter));
345	#elif HAVE_ICONV
346	iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
347	if (ConvDesc == (iconv_t)-`1`)
348	return std::make_error_code(std::errc::invalid_argument);
349	return TextEncodingConverter(
350	std::make_unique<TextEncodingConverterIconv>(ConvDesc));
351	#else
352	return std::make_error_code(e: std::errc::invalid_argument);
353	#endif
354	}
355

Browse the source code of llvm_projects/llvm/lib/Support/TextEncoding.cpp