| 1 | /** |
| 2 | * @file |
| 3 | * |
| 4 | * @brief Character encoding conversion functions |
| 5 | * |
| 6 | * @copyright See Copyright for the status of this software. |
| 7 | * |
| 8 | * @author Daniel Veillard |
| 9 | */ |
| 10 | |
| 11 | #ifndef __XML_CHAR_ENCODING_H__ |
| 12 | #define __XML_CHAR_ENCODING_H__ |
| 13 | |
| 14 | #include <libxml/xmlversion.h> |
| 15 | #include <libxml/xmlerror.h> |
| 16 | |
| 17 | #ifdef __cplusplus |
| 18 | extern "C" { |
| 19 | #endif |
| 20 | |
/*
 * Backward compatibility: the legacy spellings below expand to the
 * current xmlUTF8ToIsolat1 / xmlIsolat1ToUTF8 names. They are hidden
 * from the generated documentation.
 */
/** @cond ignore */
#define UTF8Toisolat1 xmlUTF8ToIsolat1
#define isolat1ToUTF8 xmlIsolat1ToUTF8
/** @endcond */
| 28 | |
/**
 * Encoding conversion errors.
 *
 * Success is zero; every error code is negative.
 */
typedef enum {
    /** Success */
    XML_ENC_ERR_SUCCESS = 0,
    /** Internal or unclassified error */
    XML_ENC_ERR_INTERNAL = -1,
    /** Invalid or untranslatable input sequence */
    XML_ENC_ERR_INPUT = -2,
    /** Not enough space in output buffer */
    XML_ENC_ERR_SPACE = -3,
    /** Out-of-memory error */
    XML_ENC_ERR_MEMORY = -4
} xmlCharEncError;
| 44 | |
/**
 * Predefined values for some standard encodings.
 *
 * The numeric values are explicit and must not be renumbered.
 */
typedef enum {
    /** Error, no char encoding could be determined */
    XML_CHAR_ENCODING_ERROR = -1,
    /** No char encoding detected */
    XML_CHAR_ENCODING_NONE = 0,
    /** UTF-8 */
    XML_CHAR_ENCODING_UTF8 = 1,
    /** UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16LE = 2,
    /** UTF-16 big endian */
    XML_CHAR_ENCODING_UTF16BE = 3,
    /** UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4LE = 4,
    /** UCS-4 big endian */
    XML_CHAR_ENCODING_UCS4BE = 5,
    /** EBCDIC */
    XML_CHAR_ENCODING_EBCDIC = 6,
    /** UCS-4 unusual ordering (2143) */
    XML_CHAR_ENCODING_UCS4_2143 = 7,
    /** UCS-4 unusual ordering (3412) */
    XML_CHAR_ENCODING_UCS4_3412 = 8,
    /** UCS-2 */
    XML_CHAR_ENCODING_UCS2 = 9,
    /** ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_1 = 10,
    /** ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_2 = 11,
    /** ISO-8859-3 */
    XML_CHAR_ENCODING_8859_3 = 12,
    /** ISO-8859-4 */
    XML_CHAR_ENCODING_8859_4 = 13,
    /** ISO-8859-5 */
    XML_CHAR_ENCODING_8859_5 = 14,
    /** ISO-8859-6 */
    XML_CHAR_ENCODING_8859_6 = 15,
    /** ISO-8859-7 */
    XML_CHAR_ENCODING_8859_7 = 16,
    /** ISO-8859-8 */
    XML_CHAR_ENCODING_8859_8 = 17,
    /** ISO-8859-9 */
    XML_CHAR_ENCODING_8859_9 = 18,
    /** ISO-2022-JP */
    XML_CHAR_ENCODING_2022_JP = 19,
    /** Shift_JIS */
    XML_CHAR_ENCODING_SHIFT_JIS = 20,
    /** EUC-JP */
    XML_CHAR_ENCODING_EUC_JP = 21,
    /** pure ASCII */
    XML_CHAR_ENCODING_ASCII = 22,
    /** UTF-16 native, available since 2.14 */
    XML_CHAR_ENCODING_UTF16 = 23,
    /** HTML (output only), available since 2.14 */
    XML_CHAR_ENCODING_HTML = 24,
    /** ISO-8859-10, available since 2.14 */
    XML_CHAR_ENCODING_8859_10 = 25,
    /** ISO-8859-11, available since 2.14 */
    XML_CHAR_ENCODING_8859_11 = 26,
    /** ISO-8859-13, available since 2.14 */
    XML_CHAR_ENCODING_8859_13 = 27,
    /** ISO-8859-14, available since 2.14 */
    XML_CHAR_ENCODING_8859_14 = 28,
    /** ISO-8859-15, available since 2.14 */
    XML_CHAR_ENCODING_8859_15 = 29,
    /** ISO-8859-16, available since 2.14 */
    XML_CHAR_ENCODING_8859_16 = 30,
    /** windows-1252, available since 2.15 */
    XML_CHAR_ENCODING_WINDOWS_1252 = 31
} xmlCharEncoding;
| 116 | |
/**
 * Encoding conversion flags.
 *
 * Each value occupies its own bit, so flags can be OR-ed together.
 */
typedef enum {
    /** Create converter for input (conversion to UTF-8) */
    XML_ENC_INPUT = (1 << 0),
    /** Create converter for output (conversion from UTF-8) */
    XML_ENC_OUTPUT = (1 << 1),
    /** Use HTML5 mappings */
    XML_ENC_HTML = (1 << 2)
} xmlCharEncFlags;
| 128 | |
/**
 * Convert characters to UTF-8.
 *
 * `outlen` and `inlen` are in/out parameters: on entry they hold the
 * lengths of `out` and `in`; on success, the value of `inlen` after
 * return is the number of bytes consumed and `outlen` is the number of
 * bytes produced.
 *
 * @param out  a pointer to an array of bytes to store the UTF-8 result
 * @param outlen  the length of `out`
 * @param in  a pointer to an array of chars in the original encoding
 * @param inlen  the length of `in`
 * @returns the number of bytes written or an xmlCharEncError code.
 */
typedef int (*xmlCharEncodingInputFunc)(unsigned char *out, int *outlen,
                                        const unsigned char *in, int *inlen);
| 143 | |
| 144 | |
/**
 * Convert characters from UTF-8.
 *
 * `outlen` and `inlen` are in/out parameters: on entry they hold the
 * lengths of `out` and `in`; on success, the value of `inlen` after
 * return is the number of bytes consumed and `outlen` is the number of
 * bytes produced.
 *
 * @param out  a pointer to an array of bytes to store the result
 * @param outlen  the length of `out`
 * @param in  a pointer to an array of UTF-8 chars
 * @param inlen  the length of `in`
 * @returns the number of bytes written or an xmlCharEncError code.
 */
typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
                                         const unsigned char *in, int *inlen);
| 159 | |
| 160 | |
| 161 | /** |
| 162 | * Convert between character encodings. |
| 163 | * |
| 164 | * The value of `inlen` after return is the number of bytes consumed |
| 165 | * and `outlen` is the number of bytes produced. |
| 166 | * |
| 167 | * If the converter can consume partial multi-byte sequences, the |
| 168 | * `flush` flag can be used to detect truncated sequences at EOF. |
| 169 | * Otherwise, the flag can be ignored. |
| 170 | * |
| 171 | * @param vctxt conversion context |
| 172 | * @param out a pointer to an array of bytes to store the result |
| 173 | * @param outlen the length of `out` |
| 174 | * @param in a pointer to an array of input bytes |
| 175 | * @param inlen the length of `in` |
| 176 | * @param flush end of input |
| 177 | * @returns an xmlCharEncError code. |
| 178 | */ |
| 179 | typedef xmlCharEncError |
| 180 | (*xmlCharEncConvFunc)(void *vctxt, unsigned char *out, int *outlen, |
| 181 | const unsigned char *in, int *inlen, int flush); |
| 182 | |
/**
 * Free a conversion context.
 *
 * @param vctxt  conversion context
 */
typedef void
(*xmlCharEncConvCtxtDtor)(void *vctxt);
| 190 | |
| 191 | /** Character encoding converter */ |
| 192 | typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; |
| 193 | typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; |
| 194 | /** |
| 195 | * A character encoding conversion handler for non UTF-8 encodings. |
| 196 | * |
| 197 | * This structure will be made private. |
| 198 | */ |
| 199 | struct _xmlCharEncodingHandler { |
| 200 | char *name XML_DEPRECATED_MEMBER; |
| 201 | union { |
| 202 | xmlCharEncConvFunc func; |
| 203 | xmlCharEncodingInputFunc legacyFunc; |
| 204 | } input XML_DEPRECATED_MEMBER; |
| 205 | union { |
| 206 | xmlCharEncConvFunc func; |
| 207 | xmlCharEncodingOutputFunc legacyFunc; |
| 208 | } output XML_DEPRECATED_MEMBER; |
| 209 | void *inputCtxt XML_DEPRECATED_MEMBER; |
| 210 | void *outputCtxt XML_DEPRECATED_MEMBER; |
| 211 | xmlCharEncConvCtxtDtor ctxtDtor XML_DEPRECATED_MEMBER; |
| 212 | int flags XML_DEPRECATED_MEMBER; |
| 213 | }; |
| 214 | |
| 215 | /** |
| 216 | * If this function returns XML_ERR_OK, it must fill the `out` |
| 217 | * pointer with an encoding handler. The handler can be obtained |
| 218 | * from #xmlCharEncNewCustomHandler. |
| 219 | * |
| 220 | * `flags` can contain XML_ENC_INPUT, XML_ENC_OUTPUT or both. |
| 221 | * |
| 222 | * @param vctxt user data |
| 223 | * @param name encoding name |
| 224 | * @param flags bit mask of flags |
| 225 | * @param out pointer to resulting handler |
| 226 | * @returns an xmlParserErrors code. |
| 227 | */ |
| 228 | typedef xmlParserErrors |
| 229 | (*xmlCharEncConvImpl)(void *vctxt, const char *name, xmlCharEncFlags flags, |
| 230 | xmlCharEncodingHandler **out); |
| 231 | |
| 232 | /* |
| 233 | * Interfaces for encoding handlers. |
| 234 | */ |
| 235 | XML_DEPRECATED |
| 236 | XMLPUBFUN void |
| 237 | xmlInitCharEncodingHandlers (void); |
| 238 | XML_DEPRECATED |
| 239 | XMLPUBFUN void |
| 240 | xmlCleanupCharEncodingHandlers (void); |
| 241 | XML_DEPRECATED |
| 242 | XMLPUBFUN void |
| 243 | xmlRegisterCharEncodingHandler (xmlCharEncodingHandler *handler); |
| 244 | XMLPUBFUN xmlParserErrors |
| 245 | xmlLookupCharEncodingHandler (xmlCharEncoding enc, |
| 246 | xmlCharEncodingHandler **out); |
| 247 | XMLPUBFUN xmlParserErrors |
| 248 | xmlOpenCharEncodingHandler (const char *name, |
| 249 | int output, |
| 250 | xmlCharEncodingHandler **out); |
| 251 | XMLPUBFUN xmlParserErrors |
| 252 | xmlCreateCharEncodingHandler (const char *name, |
| 253 | xmlCharEncFlags flags, |
| 254 | xmlCharEncConvImpl impl, |
| 255 | void *implCtxt, |
| 256 | xmlCharEncodingHandler **out); |
| 257 | XMLPUBFUN xmlCharEncodingHandler * |
| 258 | xmlGetCharEncodingHandler (xmlCharEncoding enc); |
| 259 | XMLPUBFUN xmlCharEncodingHandler * |
| 260 | xmlFindCharEncodingHandler (const char *name); |
| 261 | XML_DEPRECATED |
| 262 | XMLPUBFUN xmlCharEncodingHandler * |
| 263 | xmlNewCharEncodingHandler (const char *name, |
| 264 | xmlCharEncodingInputFunc input, |
| 265 | xmlCharEncodingOutputFunc output); |
| 266 | XMLPUBFUN xmlParserErrors |
| 267 | xmlCharEncNewCustomHandler (const char *name, |
| 268 | xmlCharEncConvFunc input, |
| 269 | xmlCharEncConvFunc output, |
| 270 | xmlCharEncConvCtxtDtor ctxtDtor, |
| 271 | void *inputCtxt, |
| 272 | void *outputCtxt, |
| 273 | xmlCharEncodingHandler **out); |
| 274 | |
| 275 | /* |
| 276 | * Interfaces for encoding names and aliases. |
| 277 | */ |
| 278 | XML_DEPRECATED |
| 279 | XMLPUBFUN int |
| 280 | xmlAddEncodingAlias (const char *name, |
| 281 | const char *alias); |
| 282 | XML_DEPRECATED |
| 283 | XMLPUBFUN int |
| 284 | xmlDelEncodingAlias (const char *alias); |
| 285 | XML_DEPRECATED |
| 286 | XMLPUBFUN const char * |
| 287 | xmlGetEncodingAlias (const char *alias); |
| 288 | XML_DEPRECATED |
| 289 | XMLPUBFUN void |
| 290 | xmlCleanupEncodingAliases (void); |
| 291 | XMLPUBFUN xmlCharEncoding |
| 292 | xmlParseCharEncoding (const char *name); |
| 293 | XMLPUBFUN const char * |
| 294 | xmlGetCharEncodingName (xmlCharEncoding enc); |
| 295 | |
| 296 | /* |
| 297 | * Interfaces directly used by the parsers. |
| 298 | */ |
| 299 | XMLPUBFUN xmlCharEncoding |
| 300 | xmlDetectCharEncoding (const unsigned char *in, |
| 301 | int len); |
| 302 | |
| 303 | struct _xmlBuffer; |
| 304 | XMLPUBFUN int |
| 305 | xmlCharEncOutFunc (xmlCharEncodingHandler *handler, |
| 306 | struct _xmlBuffer *out, |
| 307 | struct _xmlBuffer *in); |
| 308 | |
| 309 | XMLPUBFUN int |
| 310 | xmlCharEncInFunc (xmlCharEncodingHandler *handler, |
| 311 | struct _xmlBuffer *out, |
| 312 | struct _xmlBuffer *in); |
| 313 | XML_DEPRECATED |
| 314 | XMLPUBFUN int |
| 315 | xmlCharEncFirstLine (xmlCharEncodingHandler *handler, |
| 316 | struct _xmlBuffer *out, |
| 317 | struct _xmlBuffer *in); |
| 318 | XMLPUBFUN int |
| 319 | xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); |
| 320 | |
| 321 | /* |
| 322 | * Export a few useful functions |
| 323 | */ |
| 324 | #ifdef LIBXML_OUTPUT_ENABLED |
| 325 | XMLPUBFUN int |
| 326 | xmlUTF8ToIsolat1 (unsigned char *out, |
| 327 | int *outlen, |
| 328 | const unsigned char *in, |
| 329 | int *inlen); |
| 330 | #endif /* LIBXML_OUTPUT_ENABLED */ |
| 331 | XMLPUBFUN int |
| 332 | xmlIsolat1ToUTF8 (unsigned char *out, |
| 333 | int *outlen, |
| 334 | const unsigned char *in, |
| 335 | int *inlen); |
| 336 | #ifdef __cplusplus |
| 337 | } |
| 338 | #endif |
| 339 | |
| 340 | #endif /* __XML_CHAR_ENCODING_H__ */ |
| 341 | |