1/**
2 * @file
3 *
4 * @brief Character encoding conversion functions
5 *
6 * @copyright See Copyright for the status of this software.
7 *
8 * @author Daniel Veillard
9 */
10
11#ifndef __XML_CHAR_ENCODING_H__
12#define __XML_CHAR_ENCODING_H__
13
14#include <libxml/xmlversion.h>
15#include <libxml/xmlerror.h>
16
17#ifdef __cplusplus
18extern "C" {
19#endif
20
/*
 * Backward compatibility: the historical unprefixed names expand to
 * the xml-prefixed conversion functions declared in this header.
 */
/** @cond ignore */
#define UTF8Toisolat1   xmlUTF8ToIsolat1
#define isolat1ToUTF8   xmlIsolat1ToUTF8
/** @endcond */
28
/**
 * Status codes reported by encoding conversion routines.
 *
 * Zero means success; every failure code is negative.
 */
typedef enum {
    /** Conversion succeeded */
    XML_ENC_ERR_SUCCESS  =  0,
    /** Internal or otherwise unclassified failure */
    XML_ENC_ERR_INTERNAL = -1,
    /** Input sequence is invalid or cannot be translated */
    XML_ENC_ERR_INPUT    = -2,
    /** Output buffer has insufficient space */
    XML_ENC_ERR_SPACE    = -3,
    /** Memory allocation failed */
    XML_ENC_ERR_MEMORY   = -4
} xmlCharEncError;
44
/**
 * Identifiers for a set of predefined, standard character encodings.
 */
typedef enum {
    /** Error value: no char encoding detected */
    XML_CHAR_ENCODING_ERROR        = -1,
    /** No char encoding detected */
    XML_CHAR_ENCODING_NONE         = 0,
    /** UTF-8 */
    XML_CHAR_ENCODING_UTF8         = 1,
    /** UTF-16, little endian */
    XML_CHAR_ENCODING_UTF16LE      = 2,
    /** UTF-16, big endian */
    XML_CHAR_ENCODING_UTF16BE      = 3,
    /** UCS-4, little endian */
    XML_CHAR_ENCODING_UCS4LE       = 4,
    /** UCS-4, big endian */
    XML_CHAR_ENCODING_UCS4BE       = 5,
    /** EBCDIC */
    XML_CHAR_ENCODING_EBCDIC       = 6,
    /** UCS-4, unusual byte ordering (2143) */
    XML_CHAR_ENCODING_UCS4_2143    = 7,
    /** UCS-4, unusual byte ordering (3412) */
    XML_CHAR_ENCODING_UCS4_3412    = 8,
    /** UCS-2 */
    XML_CHAR_ENCODING_UCS2         = 9,
    /** ISO-8859-1 (ISO Latin 1) */
    XML_CHAR_ENCODING_8859_1       = 10,
    /** ISO-8859-2 (ISO Latin 2) */
    XML_CHAR_ENCODING_8859_2       = 11,
    /** ISO-8859-3 */
    XML_CHAR_ENCODING_8859_3       = 12,
    /** ISO-8859-4 */
    XML_CHAR_ENCODING_8859_4       = 13,
    /** ISO-8859-5 */
    XML_CHAR_ENCODING_8859_5       = 14,
    /** ISO-8859-6 */
    XML_CHAR_ENCODING_8859_6       = 15,
    /** ISO-8859-7 */
    XML_CHAR_ENCODING_8859_7       = 16,
    /** ISO-8859-8 */
    XML_CHAR_ENCODING_8859_8       = 17,
    /** ISO-8859-9 */
    XML_CHAR_ENCODING_8859_9       = 18,
    /** ISO-2022-JP */
    XML_CHAR_ENCODING_2022_JP      = 19,
    /** Shift_JIS */
    XML_CHAR_ENCODING_SHIFT_JIS    = 20,
    /** EUC-JP */
    XML_CHAR_ENCODING_EUC_JP       = 21,
    /** Pure ASCII */
    XML_CHAR_ENCODING_ASCII        = 22,
    /** UTF-16 in native byte order, available since 2.14 */
    XML_CHAR_ENCODING_UTF16        = 23,
    /** HTML (output only), available since 2.14 */
    XML_CHAR_ENCODING_HTML         = 24,
    /** ISO-8859-10, available since 2.14 */
    XML_CHAR_ENCODING_8859_10      = 25,
    /** ISO-8859-11, available since 2.14 */
    XML_CHAR_ENCODING_8859_11      = 26,
    /** ISO-8859-13, available since 2.14 */
    XML_CHAR_ENCODING_8859_13      = 27,
    /** ISO-8859-14, available since 2.14 */
    XML_CHAR_ENCODING_8859_14      = 28,
    /** ISO-8859-15, available since 2.14 */
    XML_CHAR_ENCODING_8859_15      = 29,
    /** ISO-8859-16, available since 2.14 */
    XML_CHAR_ENCODING_8859_16      = 30,
    /** windows-1252, available since 2.15 */
    XML_CHAR_ENCODING_WINDOWS_1252 = 31
} xmlCharEncoding;
116
/**
 * Bit flags controlling the creation of an encoding converter.
 *
 * Each flag occupies its own bit, so values can be OR-ed together.
 */
typedef enum {
    /** Converter is used for input, i.e. conversion to UTF-8 */
    XML_ENC_INPUT  = 1 << 0,
    /** Converter is used for output, i.e. conversion from UTF-8 */
    XML_ENC_OUTPUT = 1 << 1,
    /** Apply HTML5 mappings */
    XML_ENC_HTML   = 1 << 2
} xmlCharEncFlags;
128
/**
 * Callback that converts characters from some encoding to UTF-8.
 *
 * When the call succeeds, `inlen` holds the number of input bytes
 * that were consumed and `outlen` holds the number of bytes that
 * were produced.
 *
 * @param out  buffer receiving the UTF-8 result
 * @param outlen  the length of `out`
 * @param in  buffer holding chars in the original encoding
 * @param inlen  the length of `in`
 * @returns the number of bytes written or an xmlCharEncError code.
 */
typedef int
(*xmlCharEncodingInputFunc)(unsigned char *out, int *outlen,
                            const unsigned char *in, int *inlen);
143
144
/**
 * Callback that converts characters from UTF-8 to some encoding.
 *
 * When the call succeeds, `inlen` holds the number of input bytes
 * that were consumed and `outlen` holds the number of bytes that
 * were produced.
 *
 * @param out  buffer receiving the converted result
 * @param outlen  the length of `out`
 * @param in  buffer holding UTF-8 chars
 * @param inlen  the length of `in`
 * @returns the number of bytes written or an xmlCharEncError code.
 */
typedef int
(*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
                             const unsigned char *in, int *inlen);
159
160
161/**
162 * Convert between character encodings.
163 *
164 * The value of `inlen` after return is the number of bytes consumed
165 * and `outlen` is the number of bytes produced.
166 *
167 * If the converter can consume partial multi-byte sequences, the
168 * `flush` flag can be used to detect truncated sequences at EOF.
169 * Otherwise, the flag can be ignored.
170 *
171 * @param vctxt conversion context
172 * @param out a pointer to an array of bytes to store the result
173 * @param outlen the length of `out`
174 * @param in a pointer to an array of input bytes
175 * @param inlen the length of `in`
176 * @param flush end of input
177 * @returns an xmlCharEncError code.
178 */
179typedef xmlCharEncError
180(*xmlCharEncConvFunc)(void *vctxt, unsigned char *out, int *outlen,
181 const unsigned char *in, int *inlen, int flush);
182
/**
 * Callback that releases a conversion context.
 *
 * @param vctxt  conversion context to free
 */
typedef void (*xmlCharEncConvCtxtDtor)(void *vctxt);
190
191/** Character encoding converter */
192typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
193typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
194/**
195 * A character encoding conversion handler for non UTF-8 encodings.
196 *
197 * This structure will be made private.
198 */
199struct _xmlCharEncodingHandler {
200 char *name XML_DEPRECATED_MEMBER;
201 union {
202 xmlCharEncConvFunc func;
203 xmlCharEncodingInputFunc legacyFunc;
204 } input XML_DEPRECATED_MEMBER;
205 union {
206 xmlCharEncConvFunc func;
207 xmlCharEncodingOutputFunc legacyFunc;
208 } output XML_DEPRECATED_MEMBER;
209 void *inputCtxt XML_DEPRECATED_MEMBER;
210 void *outputCtxt XML_DEPRECATED_MEMBER;
211 xmlCharEncConvCtxtDtor ctxtDtor XML_DEPRECATED_MEMBER;
212 int flags XML_DEPRECATED_MEMBER;
213};
214
215/**
216 * If this function returns XML_ERR_OK, it must fill the `out`
217 * pointer with an encoding handler. The handler can be obtained
218 * from #xmlCharEncNewCustomHandler.
219 *
220 * `flags` can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both.
221 *
222 * @param vctxt user data
223 * @param name encoding name
224 * @param flags bit mask of flags
225 * @param out pointer to resulting handler
226 * @returns an xmlParserErrors code.
227 */
228typedef xmlParserErrors
229(*xmlCharEncConvImpl)(void *vctxt, const char *name, xmlCharEncFlags flags,
230 xmlCharEncodingHandler **out);
231
232/*
233 * Interfaces for encoding handlers.
234 */
235XML_DEPRECATED
236XMLPUBFUN void
237 xmlInitCharEncodingHandlers (void);
238XML_DEPRECATED
239XMLPUBFUN void
240 xmlCleanupCharEncodingHandlers (void);
241XML_DEPRECATED
242XMLPUBFUN void
243 xmlRegisterCharEncodingHandler (xmlCharEncodingHandler *handler);
244XMLPUBFUN xmlParserErrors
245 xmlLookupCharEncodingHandler (xmlCharEncoding enc,
246 xmlCharEncodingHandler **out);
247XMLPUBFUN xmlParserErrors
248 xmlOpenCharEncodingHandler (const char *name,
249 int output,
250 xmlCharEncodingHandler **out);
251XMLPUBFUN xmlParserErrors
252 xmlCreateCharEncodingHandler (const char *name,
253 xmlCharEncFlags flags,
254 xmlCharEncConvImpl impl,
255 void *implCtxt,
256 xmlCharEncodingHandler **out);
257XMLPUBFUN xmlCharEncodingHandler *
258 xmlGetCharEncodingHandler (xmlCharEncoding enc);
259XMLPUBFUN xmlCharEncodingHandler *
260 xmlFindCharEncodingHandler (const char *name);
261XML_DEPRECATED
262XMLPUBFUN xmlCharEncodingHandler *
263 xmlNewCharEncodingHandler (const char *name,
264 xmlCharEncodingInputFunc input,
265 xmlCharEncodingOutputFunc output);
266XMLPUBFUN xmlParserErrors
267 xmlCharEncNewCustomHandler (const char *name,
268 xmlCharEncConvFunc input,
269 xmlCharEncConvFunc output,
270 xmlCharEncConvCtxtDtor ctxtDtor,
271 void *inputCtxt,
272 void *outputCtxt,
273 xmlCharEncodingHandler **out);
274
275/*
276 * Interfaces for encoding names and aliases.
277 */
278XML_DEPRECATED
279XMLPUBFUN int
280 xmlAddEncodingAlias (const char *name,
281 const char *alias);
282XML_DEPRECATED
283XMLPUBFUN int
284 xmlDelEncodingAlias (const char *alias);
285XML_DEPRECATED
286XMLPUBFUN const char *
287 xmlGetEncodingAlias (const char *alias);
288XML_DEPRECATED
289XMLPUBFUN void
290 xmlCleanupEncodingAliases (void);
291XMLPUBFUN xmlCharEncoding
292 xmlParseCharEncoding (const char *name);
293XMLPUBFUN const char *
294 xmlGetCharEncodingName (xmlCharEncoding enc);
295
296/*
297 * Interfaces directly used by the parsers.
298 */
299XMLPUBFUN xmlCharEncoding
300 xmlDetectCharEncoding (const unsigned char *in,
301 int len);
302
303struct _xmlBuffer;
304XMLPUBFUN int
305 xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
306 struct _xmlBuffer *out,
307 struct _xmlBuffer *in);
308
309XMLPUBFUN int
310 xmlCharEncInFunc (xmlCharEncodingHandler *handler,
311 struct _xmlBuffer *out,
312 struct _xmlBuffer *in);
313XML_DEPRECATED
314XMLPUBFUN int
315 xmlCharEncFirstLine (xmlCharEncodingHandler *handler,
316 struct _xmlBuffer *out,
317 struct _xmlBuffer *in);
318XMLPUBFUN int
319 xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
320
321/*
322 * Export a few useful functions
323 */
324#ifdef LIBXML_OUTPUT_ENABLED
325XMLPUBFUN int
326 xmlUTF8ToIsolat1 (unsigned char *out,
327 int *outlen,
328 const unsigned char *in,
329 int *inlen);
330#endif /* LIBXML_OUTPUT_ENABLED */
331XMLPUBFUN int
332 xmlIsolat1ToUTF8 (unsigned char *out,
333 int *outlen,
334 const unsigned char *in,
335 int *inlen);
336#ifdef __cplusplus
337}
338#endif
339
340#endif /* __XML_CHAR_ENCODING_H__ */
341