1//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides utility classes to convert between different character
11/// encodings.
12///
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Support/TextEncoding.h"
16#include "llvm/ADT/SmallString.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/StringExtras.h"
19#include "llvm/Support/ConvertEBCDIC.h"
20#include <system_error>
21
22#if HAVE_ICU
23#include <unicode/ucnv.h>
24#elif HAVE_ICONV
25#include <iconv.h>
26#endif
27
28using namespace llvm;
29
30// Normalize the charset name with the charset alias matching algorithm proposed
31// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
32static void normalizeCharSetName(StringRef CSName,
33 SmallVectorImpl<char> &Normalized) {
34 bool PrevDigit = false;
35 for (auto Ch : CSName) {
36 if (isAlnum(C: Ch)) {
37 Ch = toLower(x: Ch);
38 if (Ch != '0' || PrevDigit) {
39 PrevDigit = isDigit(C: Ch);
40 Normalized.push_back(Elt: Ch);
41 }
42 }
43 }
44}
45
46// Maps the encoding name to enum constant if possible.
47static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
48 SmallString<16> Normalized;
49 normalizeCharSetName(CSName: Name, Normalized);
50 if (Normalized.equals(RHS: "utf8"))
51 return TextEncoding::UTF8;
52 if (Normalized.equals(RHS: "ibm1047"))
53 return TextEncoding::IBM1047;
54 return std::nullopt;
55}
56
57LLVM_ATTRIBUTE_UNUSED static void
58HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
59 SmallVectorImpl<char> &Result) {
60 // No space left in output buffer. Double the size of the underlying
61 // memory in the SmallVectorImpl, adjust pointer and length and continue
62 // the conversion.
63 Capacity =
64 (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size();
65 Result.resize(N: 0);
66 Result.resize_for_overwrite(N: Capacity);
67 Output = static_cast<char *>(Result.data());
68 OutputLength = Capacity;
69}
70
71namespace {
/// Direction of a table-driven conversion between UTF-8 and IBM-1047.
enum ConversionType {
  /// Source is UTF-8, result is IBM-1047.
  UTF8ToIBM1047,
  /// Source is IBM-1047, result is UTF-8.
  IBM1047ToUTF8,
};
76
77// Support conversion between EBCDIC 1047 and UTF-8. This class uses
78// built-in translation tables that allow for translation between the
79// aforementioned encodings. The use of tables for conversion is only
80// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
81// encodings are not supported.
82class TextEncodingConverterTable final
83 : public details::TextEncodingConverterImplBase {
84 const ConversionType ConvType;
85
86public:
87 TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {}
88
89 std::error_code convertString(StringRef Source,
90 SmallVectorImpl<char> &Result) override;
91
92 void reset() override {}
93};
94
95std::error_code
96TextEncodingConverterTable::convertString(StringRef Source,
97 SmallVectorImpl<char> &Result) {
98 switch (ConvType) {
99 case IBM1047ToUTF8:
100 ConverterEBCDIC::convertToUTF8(Source, Result);
101 return std::error_code();
102 case UTF8ToIBM1047:
103 return ConverterEBCDIC::convertToEBCDIC(Source, Result);
104 }
105 llvm_unreachable("Invalid ConvType!");
106 return std::error_code();
107}
108
109#if HAVE_ICU
110struct UConverterDeleter {
111 void operator()(UConverter *Converter) const {
112 if (Converter)
113 ucnv_close(Converter);
114 }
115};
116using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
117
118class TextEncodingConverterICU final
119 : public details::TextEncodingConverterImplBase {
120 UConverterUniquePtr FromConvDesc;
121 UConverterUniquePtr ToConvDesc;
122
123public:
124 TextEncodingConverterICU(UConverterUniquePtr FromConverter,
125 UConverterUniquePtr ToConverter)
126 : FromConvDesc(std::move(FromConverter)),
127 ToConvDesc(std::move(ToConverter)) {}
128
129 std::error_code convertString(StringRef Source,
130 SmallVectorImpl<char> &Result) override;
131
132 void reset() override;
133};
134
135// TODO: The current implementation discards the partial result and restarts the
136// conversion from the beginning if there is a conversion error due to
137// insufficient buffer size. In the future, it would better to save the partial
138// result and resume the conversion for the remaining string.
139// TODO: Improve translation of ICU errors to error_code
140std::error_code
141TextEncodingConverterICU::convertString(StringRef Source,
142 SmallVectorImpl<char> &Result) {
143 // Setup the input in case it has no backing data.
144 size_t InputLength = Source.size();
145 const char *In = InputLength ? const_cast<char *>(Source.data()) : "";
146
147 // Setup the output. We directly write into the SmallVector.
148 size_t Capacity = Result.capacity();
149 size_t OutputLength = Capacity;
150 Result.resize_for_overwrite(Capacity);
151 char *Output;
152 UErrorCode EC = U_ZERO_ERROR;
153
154 ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL,
155 &EC);
156 ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
157 NULL, &EC);
158 assert(U_SUCCESS(EC));
159
160 do {
161 EC = U_ZERO_ERROR;
162 const char *Input = In;
163
164 Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
165 ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input,
166 In + InputLength, /*pivotStart=*/NULL,
167 /*pivotSource=*/NULL, /*pivotTarget=*/NULL,
168 /*pivotLimit=*/NULL, /*reset=*/true,
169 /*flush=*/true, &EC);
170 if (U_FAILURE(EC)) {
171 if (EC == U_BUFFER_OVERFLOW_ERROR) {
172 if (Capacity < Result.max_size()) {
173 HandleOverflow(Capacity, Output, OutputLength, Result);
174 continue;
175 } else
176 return std::error_code(E2BIG, std::generic_category());
177 }
178 // Some other error occured.
179 Result.resize(Output - Result.data());
180 return std::error_code(EILSEQ, std::generic_category());
181 }
182 break;
183 } while (true);
184
185 Result.resize(Output - Result.data());
186 return std::error_code();
187}
188
189void TextEncodingConverterICU::reset() {
190 ucnv_reset(&*FromConvDesc);
191 ucnv_reset(&*ToConvDesc);
192}
193
194#elif HAVE_ICONV
195class TextEncodingConverterIconv final
196 : public details::TextEncodingConverterImplBase {
197 class UniqueIconvT {
198 iconv_t ConvDesc;
199
200 public:
201 operator iconv_t() const { return ConvDesc; }
202 UniqueIconvT(iconv_t CD) : ConvDesc(CD) {}
203 ~UniqueIconvT() {
204 if (ConvDesc != (iconv_t)-1) {
205 iconv_close(ConvDesc);
206 ConvDesc = (iconv_t)-1;
207 }
208 }
209 UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) {
210 Other.ConvDesc = (iconv_t)-1;
211 }
212 UniqueIconvT &operator=(UniqueIconvT &&Other) {
213 if (&Other != this) {
214 ConvDesc = Other.ConvDesc;
215 Other.ConvDesc = (iconv_t)-1;
216 }
217 return *this;
218 }
219 };
220 UniqueIconvT ConvDesc;
221
222public:
223 TextEncodingConverterIconv(UniqueIconvT ConvDesc)
224 : ConvDesc(std::move(ConvDesc)) {}
225
226 std::error_code convertString(StringRef Source,
227 SmallVectorImpl<char> &Result) override;
228
229 void reset() override;
230};
231
232// TODO: The current implementation discards the partial result and restarts the
233// conversion from the beginning if there is a conversion error due to
234// insufficient buffer size. In the future, it would better to save the partial
235// result and resume the conversion for the remaining string.
236std::error_code
237TextEncodingConverterIconv::convertString(StringRef Source,
238 SmallVectorImpl<char> &Result) {
239 // Setup the output. We directly write into the SmallVector.
240 size_t Capacity = Result.capacity();
241 char *Output = static_cast<char *>(Result.data());
242 size_t OutputLength = Capacity;
243 Result.resize_for_overwrite(Capacity);
244
245 size_t Ret;
246 // Handle errors returned from iconv().
247 auto HandleError = [&Capacity, &Output, &OutputLength, &Result,
248 this](size_t Ret) {
249 if (Ret == static_cast<size_t>(-1)) {
250 // An error occured. Check if we can gracefully handle it.
251 if (errno == E2BIG && Capacity < Result.max_size()) {
252 HandleOverflow(Capacity, Output, OutputLength, Result);
253 // Reset converter
254 reset();
255 return std::error_code();
256 } else {
257 // Some other error occured.
258 Result.resize(Output - Result.data());
259 return std::error_code(errno, std::generic_category());
260 }
261 } else {
262 // A positive return value indicates that some characters were converted
263 // in a nonreversible way, that is, replaced with a SUB symbol. Returning
264 // an error in this case makes sure that both conversion routines behave
265 // in the same way.
266 return std::make_error_code(std::errc::illegal_byte_sequence);
267 }
268 };
269
270 do {
271 // Setup the input. Use nullptr to reset iconv state if input length is
272 // zero.
273 size_t InputLength = Source.size();
274 char *Input = const_cast<char *>(InputLength ? Source.data() : "");
275 Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
276 if (Ret != 0) {
277 if (auto EC = HandleError(Ret))
278 return EC;
279 continue;
280 }
281 // Flush the converter
282 Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength);
283 if (Ret != 0) {
284 if (auto EC = HandleError(Ret))
285 return EC;
286 continue;
287 }
288 break;
289 } while (true);
290
291 // Re-adjust size to actual size.
292 Result.resize(Output - Result.data());
293 return std::error_code();
294}
295
296inline void TextEncodingConverterIconv::reset() {
297 iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
298}
299
300#endif // HAVE_ICONV
301} // namespace
302
303ErrorOr<TextEncodingConverter>
304TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
305
306 // Text encodings should be distinct.
307 if (CPFrom == CPTo)
308 return std::make_error_code(e: std::errc::invalid_argument);
309
310 ConversionType Conversion;
311 if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047)
312 Conversion = UTF8ToIBM1047;
313 else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
314 Conversion = IBM1047ToUTF8;
315 else
316 return std::make_error_code(e: std::errc::invalid_argument);
317
318 return TextEncodingConverter(
319 std::make_unique<TextEncodingConverterTable>(args&: Conversion));
320}
321
322ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
323 StringRef To) {
324 std::optional<TextEncoding> FromEncoding = getKnownEncoding(Name: From);
325 std::optional<TextEncoding> ToEncoding = getKnownEncoding(Name: To);
326 if (FromEncoding && ToEncoding) {
327 ErrorOr<TextEncodingConverter> Converter =
328 create(CPFrom: *FromEncoding, CPTo: *ToEncoding);
329 if (Converter)
330 return Converter;
331 }
332#if HAVE_ICU
333 UErrorCode EC = U_ZERO_ERROR;
334 UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
335 if (U_FAILURE(EC))
336 return std::make_error_code(std::errc::invalid_argument);
337
338 UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
339 if (U_FAILURE(EC))
340 return std::make_error_code(std::errc::invalid_argument);
341
342 auto Converter = std::make_unique<TextEncodingConverterICU>(
343 std::move(FromConvDesc), std::move(ToConvDesc));
344 return TextEncodingConverter(std::move(Converter));
345#elif HAVE_ICONV
346 iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
347 if (ConvDesc == (iconv_t)-1)
348 return std::make_error_code(std::errc::invalid_argument);
349 return TextEncodingConverter(
350 std::make_unique<TextEncodingConverterIconv>(ConvDesc));
351#else
352 return std::make_error_code(e: std::errc::invalid_argument);
353#endif
354}
355