1/*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <[email protected]>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <[email protected]>
12 Copyright (c) 2002 Greg Stein <[email protected]>
13 Copyright (c) 2002-2016 Karl Waclawek <[email protected]>
14 Copyright (c) 2005-2009 Steven Solie <[email protected]>
15 Copyright (c) 2016-2022 Sebastian Pipping <[email protected]>
16 Copyright (c) 2016 Pascal Cuoq <[email protected]>
17 Copyright (c) 2016 Don Lewis <[email protected]>
18 Copyright (c) 2017 Rhodri James <[email protected]>
19 Copyright (c) 2017 Alexander Bluhm <[email protected]>
20 Copyright (c) 2017 Benbuck Nason <[email protected]>
21 Copyright (c) 2017 José Gutiérrez de la Concha <[email protected]>
22 Copyright (c) 2019 David Loffredo <[email protected]>
23 Copyright (c) 2021 Dong-hee Na <[email protected]>
24 Licensed under the MIT license:
25
26 Permission is hereby granted, free of charge, to any person obtaining
27 a copy of this software and associated documentation files (the
28 "Software"), to deal in the Software without restriction, including
29 without limitation the rights to use, copy, modify, merge, publish,
30 distribute, sublicense, and/or sell copies of the Software, and to permit
31 persons to whom the Software is furnished to do so, subject to the
32 following conditions:
33
34 The above copyright notice and this permission notice shall be included
35 in all copies or substantial portions of the Software.
36
37 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
38 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
39 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
42 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43 USE OR OTHER DEALINGS IN THE SOFTWARE.
44*/
45
46#include <expat_config.h>
47
48#include <stddef.h>
49#include <string.h> /* memcpy */
50#include <stdbool.h>
51
52#ifdef _WIN32
53# include "winconfig.h"
54#endif
55
56#include "expat_external.h"
57#include "internal.h"
58#include "xmltok.h"
59#include "nametab.h"
60
/* Optional trailing entry for the first brace group of VTABLE1.
   When XML_DTD is defined it expands to ", PREFIX(ignoreSectionTok)"
   (the leading comma is part of the expansion); otherwise it expands
   to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif
66
/* Initializer fragment for the function-pointer part of an ENCODING.
   Every entry is named through PREFIX(), so the expansion depends on
   how PREFIX is defined at the point of use (normal_, little2_ or
   big2_ in this file). The two converters are left out so that tables
   can supply their own. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76
/* Name-character lookup for a 16-bit character given as high byte `hi`
   and low byte `lo`. pages[hi] selects a group of 8 words in
   namingBitmap, the top 3 bits of `lo` pick the word and the low
   5 bits pick the bit. The result is non-zero when that bit is set. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
79
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must be indexable as unsigned bytes; the result is non-zero
   when the corresponding bit in namingBitmap is set.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
88
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
   `byte` must be indexable as unsigned bytes; the result is non-zero
   when the corresponding bit in namingBitmap is set.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
100
101/* Detection of invalid UTF-8 sequences is based on Table 3.1B
102 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
103 with the additional restriction of not allowing the Unicode
104 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
105 Implementation details:
106 (A & 0x80) == 0 means A < 0x80
107 and
108 (A & 0xC0) == 0xC0 means A > 0xBF
109*/
110
/* Non-zero when the 2-byte sequence at p is malformed: the lead byte is
   below 0xC2, or the second byte is outside 0x80..0xBF.
   p must point at unsigned bytes. */
#define UTF8_INVALID2(p)                                                       \
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* Non-zero when the 3-byte sequence at p is malformed.
   Third byte: must be 0x80..0xBF, except after EF BF where anything
   above 0xBD is rejected (this excludes EF BF BE and EF BF BF).
   Second byte: 0xA0..0xBF after lead 0xE0, 0x80..0x9F after lead 0xED,
   0x80..0xBF otherwise. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
                                      : ((p)[2] & 0xC0) == 0xC0)               \
   || ((*p) == 0xE0                                                            \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))

/* Non-zero when the 4-byte sequence at p is malformed.
   Third and fourth bytes: must be 0x80..0xBF.
   Second byte: 0x90..0xBF after lead 0xF0, 0x80..0x8F after lead 0xF4,
   0x80..0xBF otherwise. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*p) == 0xF0                                                            \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
130
131static int PTRFASTCALL
132isNever(const ENCODING *enc, const char *p) {
133 UNUSED_P(enc);
134 UNUSED_P(p);
135 return 0;
136}
137
138static int PTRFASTCALL
139utf8_isName2(const ENCODING *enc, const char *p) {
140 UNUSED_P(enc);
141 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142}
143
144static int PTRFASTCALL
145utf8_isName3(const ENCODING *enc, const char *p) {
146 UNUSED_P(enc);
147 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148}
149
/* 4-byte UTF-8 sequences are never reported as name characters. */
#define utf8_isName4 isNever
151
152static int PTRFASTCALL
153utf8_isNmstrt2(const ENCODING *enc, const char *p) {
154 UNUSED_P(enc);
155 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156}
157
158static int PTRFASTCALL
159utf8_isNmstrt3(const ENCODING *enc, const char *p) {
160 UNUSED_P(enc);
161 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162}
163
/* 4-byte UTF-8 sequences are never reported as name-start characters. */
#define utf8_isNmstrt4 isNever
165
166static int PTRFASTCALL
167utf8_isInvalid2(const ENCODING *enc, const char *p) {
168 UNUSED_P(enc);
169 return UTF8_INVALID2((const unsigned char *)p);
170}
171
172static int PTRFASTCALL
173utf8_isInvalid3(const ENCODING *enc, const char *p) {
174 UNUSED_P(enc);
175 return UTF8_INVALID3((const unsigned char *)p);
176}
177
178static int PTRFASTCALL
179utf8_isInvalid4(const ENCODING *enc, const char *p) {
180 UNUSED_P(enc);
181 return UTF8_INVALID4((const unsigned char *)p);
182}
183
/* An ENCODING extended with a per-byte classification table and the
   predicate hooks that the tokenizer macros in this file dispatch
   through. Code in this file casts ENCODING pointers to this type
   (AS_NORMAL_ENCODING, SB_BYTE_TYPE), which relies on `enc` being the
   first member. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (BT_* code) for each possible byte value. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Hooks present only in XML_MIN_SIZE builds; they are reached via
     the BYTE_TYPE, IS_*_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES
     macros and filled in by STANDARD_VTABLE(). */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name / name-start / validity predicates for 2-, 3- and 4-byte
     sequences; filled by NORMAL_VTABLE() or set to NULL by
     NULL_VTABLE. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
204
/* View an ENCODING pointer as the enclosing struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* STANDARD_VTABLE(E): initializers for the XML_MIN_SIZE-only members of
   struct normal_encoding, built from the E-prefixed names. Note the
   trailing comma in the expansion. Expands to nothing when
   XML_MIN_SIZE is not defined. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* NORMAL_VTABLE(E): initializers for the nine multi-byte predicate
   members, built from the E-prefixed names. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* NULL_VTABLE: the same nine members, all set to NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
226
/* Forward declaration, placed ahead of the xmltok_impl includes below.
   NOTE(review): the definition is not in this part of the file;
   presumably the included implementation code calls it - confirm. */
static int FASTCALL checkCharRefNumber(int);
228
229#include "xmltok_impl.h"
230#include "ascii.h"
231
/* In XML_MIN_SIZE builds the single-byte encodings use isNever for the
   isNameMin / isNmstrtMin hooks. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC(enc): read from the encoding at run time in XML_MIN_SIZE
   builds, otherwise the constant 1 for this (single-byte)
   instantiation. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, looked up in the encoding's type[]
   table (enc is treated as a struct normal_encoding). */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
246
/* BYTE_TYPE(enc, p): in XML_MIN_SIZE builds it dispatches through the
   encoding's byteType hook (sb_byteType is the single-byte
   implementation of that hook); otherwise it is the direct table
   lookup SB_BYTE_TYPE. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
256
/* BYTE_TO_ASCII(enc, p): in XML_MIN_SIZE builds it dispatches through
   the encoding's byteToAscii hook (sb_byteToAscii simply returns the
   byte at p); otherwise it is a plain dereference of p. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
267
/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the hook name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 hook. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* IS_INVALID_CHAR: in XML_MIN_SIZE builds the hook may be NULL, so it
   is tested before being called; otherwise it is called
   unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Predicates for characters of exactly MINBPC bytes: dispatched through
   the isNameMin / isNmstrtMin hooks in XML_MIN_SIZE builds, constant 0
   for this instantiation otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
288
/* CHAR_MATCHES(enc, p, c): true when the character at p equals c.
   In XML_MIN_SIZE builds it dispatches through the encoding's
   charMatches hook (sb_charMatches compares the single byte at p);
   otherwise it is a direct byte comparison. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif
301
/* Instantiate the tokenizer implementation with the macros defined
   above; PREFIX gives every generated identifier the "normal_" prefix.
   XML_TOK_IMPL_C is defined only for the duration of the include. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros so that they can be redefined for
   the next instantiation. PREFIX itself is left defined. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
316
/* Marker bits of the first byte of a UTF-8 sequence, by sequence
   length: 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx respectively. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
323
/* Shrink the range [from, *fromLimRef) so that it does not end in the
   middle of a multi-byte UTF-8 character.

   The range is scanned backwards from its end. A 1-byte character
   (0xxxxxxx) ends the scan at once. On a lead byte (110xxxxx,
   1110xxxx or 11110xxx) the number of bytes visited so far, including
   the lead byte, is compared with the length the lead byte announces:
   if enough bytes are present the limit is placed just after that
   character, otherwise the incomplete character is dropped and the
   scan goes on with the preceding bytes. Any other byte is stepped
   over.

   from        start of the range; never modified
   fromLimRef  in: end of the range; out: the possibly lowered end
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *cursor = *fromLimRef;
  size_t seen = 0; /* bytes stepped over since the last reset */

  while (cursor > from) {
    const unsigned char byte = (unsigned char)cursor[-1];
    size_t seqLen = 0; /* announced length; 0 means "not a lead byte" */

    if ((byte & 0x80u) == 0x00u) {
      /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead by 0b11110xxx byte */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead by 0b1110xxxx byte */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead by 0b110xxxxx byte */
    }

    if (seqLen != 0) {
      if (seen + 1 >= seqLen) {
        /* The character is complete: keep all of its bytes. */
        cursor += seqLen - 1;
        break;
      }
      /* Incomplete character: drop it and restart the count. */
      seen = 0;
    }

    cursor--;
    seen++;
  }

  *fromLimRef = cursor;
}
362
363static enum XML_Convert_Result PTRCALL
364utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
365 char **toP, const char *toLim) {
366 bool input_incomplete = false;
367 bool output_exhausted = false;
368
369 /* Avoid copying partial characters (due to limited space). */
370 const ptrdiff_t bytesAvailable = fromLim - *fromP;
371 const ptrdiff_t bytesStorable = toLim - *toP;
372 UNUSED_P(enc);
373 if (bytesAvailable > bytesStorable) {
374 fromLim = *fromP + bytesStorable;
375 output_exhausted = true;
376 }
377
378 /* Avoid copying partial characters (from incomplete input). */
379 {
380 const char *const fromLimBefore = fromLim;
381 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382 if (fromLim < fromLimBefore) {
383 input_incomplete = true;
384 }
385 }
386
387 {
388 const ptrdiff_t bytesToCopy = fromLim - *fromP;
389 memcpy(*toP, *fromP, bytesToCopy);
390 *fromP += bytesToCopy;
391 *toP += bytesToCopy;
392 }
393
394 if (output_exhausted) /* needs to go first */
395 return XML_CONVERT_OUTPUT_EXHAUSTED;
396 else if (input_incomplete)
397 return XML_CONVERT_INPUT_INCOMPLETE;
398 else
399 return XML_CONVERT_COMPLETED;
400}
401
/* Convert UTF-8 input to UTF-16 code units.

   Each lead byte is classified through the encoding's type[] table.
   2- and 3-byte sequences produce one code unit, 4-byte sequences a
   surrogate pair, and any other byte is copied as a single code unit.
   No validation of trail bytes is done here.

   On return *fromP and *toP point just past what was consumed and
   produced. Returns XML_CONVERT_INPUT_INCOMPLETE when a multi-byte
   sequence is cut off by fromLim, XML_CONVERT_OUTPUT_EXHAUSTED when
   output space ran out with input left, else XML_CONVERT_COMPLETED. */
static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
             unsigned short **toP, const unsigned short *toLim) {
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
  unsigned short *to = *toP;
  const char *from = *fromP;
  while (from < fromLim && to < toLim) {
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
    case BT_LEAD2:
      if (fromLim - from < 2) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      /* 5 payload bits from the lead byte, 6 from the trail byte */
      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
      from += 2;
      break;
    case BT_LEAD3:
      if (fromLim - from < 3) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      /* 4 + 6 + 6 payload bits */
      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
                               | (from[2] & 0x3f));
      from += 3;
      break;
    case BT_LEAD4: {
      unsigned long n;
      /* A surrogate pair needs two output units; the output check is
         made before the input check. */
      if (toLim - to < 2) {
        res = XML_CONVERT_OUTPUT_EXHAUSTED;
        goto after;
      }
      if (fromLim - from < 4) {
        res = XML_CONVERT_INPUT_INCOMPLETE;
        goto after;
      }
      /* 3 + 6 + 6 + 6 payload bits give the code point */
      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
      /* Split (code point - 0x10000) into high and low surrogate. */
      n -= 0x10000;
      to[0] = (unsigned short)((n >> 10) | 0xD800);
      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
      to += 2;
      from += 4;
    } break;
    default:
      /* Every other byte type: copy the byte as one code unit. */
      *to++ = *from++;
      break;
    }
  }
  /* Reached only when the loop ended normally: leftover input means
     the output buffer filled up. The goto paths skip this check. */
  if (from < fromLim)
    res = XML_CONVERT_OUTPUT_EXHAUSTED;
after:
  *fromP = from;
  *toP = to;
  return res;
}
457
/* UTF-8 encoding tables. Each one combines the normal_ tokenizer
   functions (VTABLE1), the UTF-8 converters, a 256-entry byte-type
   table assembled from the included *tab.h fragments, and the utf8_
   predicates. The _ns variants exist only when XML_NS is defined.
   The non-_ns variants define BT_COLON as BT_NMSTRT while the ASCII
   table is included.
   NOTE(review): the three integers after the converters initialise the
   remaining ENCODING members declared in xmltok.h (presumably
   minBytesPerChar, isUtf8, isUtf16) - confirm against that header. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* Same as utf8_encoding_ns but built from iasciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* Same as utf8_encoding but built from iasciitab.h. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499
500static enum XML_Convert_Result PTRCALL
501latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
502 char **toP, const char *toLim) {
503 UNUSED_P(enc);
504 for (;;) {
505 unsigned char c;
506 if (*fromP == fromLim)
507 return XML_CONVERT_COMPLETED;
508 c = (unsigned char)**fromP;
509 if (c & 0x80) {
510 if (toLim - *toP < 2)
511 return XML_CONVERT_OUTPUT_EXHAUSTED;
512 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
513 *(*toP)++ = (char)((c & 0x3f) | 0x80);
514 (*fromP)++;
515 } else {
516 if (*toP == toLim)
517 return XML_CONVERT_OUTPUT_EXHAUSTED;
518 *(*toP)++ = *(*fromP)++;
519 }
520 }
521}
522
523static enum XML_Convert_Result PTRCALL
524latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
525 unsigned short **toP, const unsigned short *toLim) {
526 UNUSED_P(enc);
527 while (*fromP < fromLim && *toP < toLim)
528 *(*toP)++ = (unsigned char)*(*fromP)++;
529
530 if ((*toP == toLim) && (*fromP < fromLim))
531 return XML_CONVERT_OUTPUT_EXHAUSTED;
532 else
533 return XML_CONVERT_COMPLETED;
534}
535
/* Latin-1 encoding tables: normal_ tokenizer functions, the Latin-1
   converters, a byte-type table built from asciitab.h and latin1tab.h,
   and NULL multi-byte predicates. The _ns variant exists only when
   XML_NS is defined; the plain variant defines BT_COLON as BT_NMSTRT
   while the ASCII table is included. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
557
558static enum XML_Convert_Result PTRCALL
559ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
560 char **toP, const char *toLim) {
561 UNUSED_P(enc);
562 while (*fromP < fromLim && *toP < toLim)
563 *(*toP)++ = *(*fromP)++;
564
565 if ((*toP == toLim) && (*fromP < fromLim))
566 return XML_CONVERT_OUTPUT_EXHAUSTED;
567 else
568 return XML_CONVERT_COMPLETED;
569}
570
/* ASCII encoding tables: normal_ tokenizer functions, ascii_toUtf8
   together with latin1_toUtf16, and a byte-type table that only lists
   the entries from asciitab.h. The remaining entries are
   zero-initialised, which is BT_NONXML as the comment in the
   initializer notes. The _ns variant exists only when XML_NS is
   defined. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
592
593static int PTRFASTCALL
594unicode_byte_type(char hi, char lo) {
595 switch ((unsigned char)hi) {
596 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
597 case 0xD8:
598 case 0xD9:
599 case 0xDA:
600 case 0xDB:
601 return BT_LEAD4;
602 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
603 case 0xDC:
604 case 0xDD:
605 case 0xDE:
606 case 0xDF:
607 return BT_TRAIL;
608 case 0xFF:
609 switch ((unsigned char)lo) {
610 case 0xFF: /* noncharacter-FFFF */
611 case 0xFE: /* noncharacter-FFFE */
612 return BT_NONXML;
613 }
614 break;
615 }
616 return BT_NONASCII;
617}
618
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which
   converts UTF-16 input to UTF-8. Byte order comes from the GET_LO /
   GET_HI macros in effect where the macro is expanded.

   The generated function first rounds the input length down to an even
   number of bytes and then handles one 16-bit unit per iteration:
     - high byte 0 and low byte < 0x80: 1 output byte;
     - high byte 0x00..0x07 otherwise:  2 output bytes;
     - high byte 0xD8..0xDB:            4 output bytes, consuming the
                                        following unit as well;
     - anything else:                   3 output bytes.
   Before writing, the required output space is checked; if it is
   missing, *fromP is set to the current unit and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned. A 0xD8..0xDB unit without
   a following unit yields XML_CONVERT_INPUT_INCOMPLETE (the output
   check comes first). Note that the `default` label sits before the
   0xD8..0xDB cases; label order inside a switch does not affect which
   case is taken. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
695
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which
   copies UTF-16 input (byte order given by GET_LO / GET_HI at the
   point of expansion) into an array of unsigned short code units.

   The input length is first rounded down to an even number of bytes.
   If the input is longer than the output space and the high byte of
   the final input unit lies in 0xD8..0xDF, that unit is excluded and
   the result defaults to XML_CONVERT_INPUT_INCOMPLETE. Units are then
   copied until either side runs out; XML_CONVERT_OUTPUT_EXHAUSTED is
   returned when the output is full and input remains. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
716
/* Little-endian accessors: byte 0 is the low byte, byte 1 the high
   byte. SET2 stores a 16-bit value, GET_LO / GET_HI read the halves.
   They are in effect only while the little2_ converters are
   generated. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian accessors: byte 0 is the high byte, byte 1 the low byte.
   They are in effect only while the big2_ converters are generated. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
738
/* Character classification helpers for little-endian 2-byte input
   (p[0] = low byte, p[1] = high byte).
   LITTLE2_BYTE_TYPE: when the high byte is 0 the low byte is looked up
   in the encoding's type[] table, otherwise unicode_byte_type()
   decides. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the high byte is 0 and the low byte equals c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == c)
/* Name / name-start lookups via UCS2_GET_NAMING. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
748
/* Little-endian UTF-16 tokenizer support.
   With XML_MIN_SIZE the tokenizer code is not instantiated again:
   thin hook functions wrap the LITTLE2_* macros (for use through
   STANDARD_VTABLE(little2_)) and VTABLE is redefined to pair the
   functions named by VTABLE1 with the little2_ converters.
   Without XML_MIN_SIZE, xmltok_impl.c is included a second time with
   PREFIX producing little2_ names and the character macros bound
   directly to the LITTLE2_* helpers. */
#ifdef XML_MIN_SIZE

/* byteType hook: LITTLE2_BYTE_TYPE of the character at p. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* byteToAscii hook: the low byte if the high byte is 0, else -1. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* charMatches hook: true when the character at p equals c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* isNameMin hook: name-character lookup for the character at p. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* isNmstrtMin hook: name-start lookup for the character at p. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
812
/* Little-endian UTF-16 encoding tables. The byte-type table (built from
   asciitab.h / latin1tab.h, or iasciitab.h for the internal variants)
   is consulted by LITTLE2_BYTE_TYPE only for characters whose high
   byte is 0. The last integer of the ENCODING initializer is 1 when
   BYTEORDER == 1234 and 0 otherwise.
   NOTE(review): the three trailing integers initialise ENCODING members
   declared in xmltok.h (presumably minBytesPerChar, isUtf8, isUtf16) -
   confirm against that header. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

/* The internal little-endian tables are compiled unless BYTEORDER is
   4321. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
872
/* Character classification helpers for big-endian 2-byte input
   (p[0] = high byte, p[1] = low byte).
   BIG2_BYTE_TYPE: when the high byte is 0 the low byte is looked up in
   the encoding's type[] table, otherwise unicode_byte_type()
   decides. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the high byte is 0 and the low byte equals c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == c)
/* Name / name-start lookups via UCS2_GET_NAMING. */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
883
/* Big-endian UTF-16 tokenizer support.
   With XML_MIN_SIZE the tokenizer code is not instantiated again:
   thin hook functions wrap the BIG2_* macros (for use through
   STANDARD_VTABLE(big2_)) and VTABLE is redefined to pair the
   functions named by VTABLE1 with the big2_ converters.
   Without XML_MIN_SIZE, xmltok_impl.c is included again with PREFIX
   producing big2_ names and the character macros bound directly to
   the BIG2_* helpers. */
#ifdef XML_MIN_SIZE

/* byteType hook: BIG2_BYTE_TYPE of the character at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* byteToAscii hook: the low byte if the high byte is 0, else -1. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* charMatches hook: true when the character at p equals c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* isNameMin hook: name-character lookup for the character at p. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* isNmstrtMin hook: name-start lookup for the character at p. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
947
/* Big-endian UTF-16 encoding tables. The byte-type table (built from
   asciitab.h / latin1tab.h, or iasciitab.h for the internal variants)
   is consulted by BIG2_BYTE_TYPE only for characters whose high byte
   is 0. The last integer of the ENCODING initializer is 1 when
   BYTEORDER == 4321 and 0 otherwise.
   NOTE(review): the three trailing integers initialise ENCODING members
   declared in xmltok.h (presumably minBytesPerChar, isUtf8, isUtf16) -
   confirm against that header. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

/* The internal big-endian tables are compiled unless BYTEORDER is
   1234. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

/* All tokenizer instantiations are done; PREFIX is no longer needed. */
#undef PREFIX
1009
1010static int FASTCALL
1011streqci(const char *s1, const char *s2) {
1012 for (;;) {
1013 char c1 = *s1++;
1014 char c2 = *s2++;
1015 if (ASCII_a <= c1 && c1 <= ASCII_z)
1016 c1 += ASCII_A - ASCII_a;
1017 if (ASCII_a <= c2 && c2 <= ASCII_z)
1018 /* The following line will never get executed. streqci() is
1019 * only called from two places, both of which guarantee to put
1020 * upper-case strings into s2.
1021 */
1022 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1023 if (c1 != c2)
1024 return 0;
1025 if (! c1)
1026 break;
1027 }
1028 return 1;
1029}
1030
1031static void PTRCALL
1032initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1033 POSITION *pos) {
1034 UNUSED_P(enc);
1035 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036}
1037
1038static int
1039toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1040 char buf[1];
1041 char *p = buf;
1042 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1043 if (p == buf)
1044 return -1;
1045 else
1046 return buf[0];
1047}
1048
1049static int FASTCALL
1050isSpace(int c) {
1051 switch (c) {
1052 case 0x20:
1053 case 0xD:
1054 case 0xA:
1055 case 0x9:
1056 return 1;
1057 }
1058 return 0;
1059}
1060
/* Return 1 if there's just optional white space or there's an S
   followed by name=val.

   Parses one pseudo-attribute from [ptr, end), stepping by
   enc->minBytesPerChar and reading characters through toAscii().

   On success with nothing left to parse, *namePtr is set to NULL and
   the other outputs are left untouched. On success with an attribute,
   *namePtr / *nameEndPtr delimit the name, *valPtr points at the first
   character after the opening quote, and *nextTokPtr points just past
   the closing quote. On failure 0 is returned and *nextTokPtr points
   at the offending position.
*/
static int
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
                     const char **namePtr, const char **nameEndPtr,
                     const char **valPtr, const char **nextTokPtr) {
  int c;
  char open;
  /* Nothing left at all: success, no attribute. */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* Anything present must start with white space. */
  if (! isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Skip the leading white space. */
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  /* Only white space was left: success, no attribute. */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' (optionally preceded by white space). */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      /* toAscii() produced no character here. */
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      /* After white space only '=' may follow the name. */
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* ptr is now at '='; reject the case where it is also the name start
     (an empty name). */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space after it. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  /* The value must be quoted with " or '. */
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* Scan to the matching quote; the value may only contain ASCII
     letters, digits, '.', '-' and '_'. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  /* Continue after the closing quote. */
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
1140
1141static const char KW_version[]
1142 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1143
1144static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145 ASCII_i, ASCII_n, ASCII_g, '\0'};
1146
1147static const char KW_standalone[]
1148 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1150
1151static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1152
1153static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1154
1155static int
1156doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1157 const char *),
1158 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1159 const char *end, const char **badPtr, const char **versionPtr,
1160 const char **versionEndPtr, const char **encodingName,
1161 const ENCODING **encoding, int *standalone) {
1162 const char *val = NULL;
1163 const char *name = NULL;
1164 const char *nameEnd = NULL;
1165 ptr += 5 * enc->minBytesPerChar;
1166 end -= 2 * enc->minBytesPerChar;
1167 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168 || ! name) {
1169 *badPtr = ptr;
1170 return 0;
1171 }
1172 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173 if (! isGeneralTextEntity) {
1174 *badPtr = name;
1175 return 0;
1176 }
1177 } else {
1178 if (versionPtr)
1179 *versionPtr = val;
1180 if (versionEndPtr)
1181 *versionEndPtr = ptr;
1182 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183 *badPtr = ptr;
1184 return 0;
1185 }
1186 if (! name) {
1187 if (isGeneralTextEntity) {
1188 /* a TextDecl must have an EncodingDecl */
1189 *badPtr = ptr;
1190 return 0;
1191 }
1192 return 1;
1193 }
1194 }
1195 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196 int c = toAscii(enc, val, end);
1197 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198 *badPtr = val;
1199 return 0;
1200 }
1201 if (encodingName)
1202 *encodingName = val;
1203 if (encoding)
1204 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206 *badPtr = ptr;
1207 return 0;
1208 }
1209 if (! name)
1210 return 1;
1211 }
1212 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213 || isGeneralTextEntity) {
1214 *badPtr = name;
1215 return 0;
1216 }
1217 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218 if (standalone)
1219 *standalone = 1;
1220 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221 if (standalone)
1222 *standalone = 0;
1223 } else {
1224 *badPtr = val;
1225 return 0;
1226 }
1227 while (isSpace(toAscii(enc, ptr, end)))
1228 ptr += enc->minBytesPerChar;
1229 if (ptr != end) {
1230 *badPtr = ptr;
1231 return 0;
1232 }
1233 return 1;
1234}
1235
1236static int FASTCALL
1237checkCharRefNumber(int result) {
1238 switch (result >> 8) {
1239 case 0xD8:
1240 case 0xD9:
1241 case 0xDA:
1242 case 0xDB:
1243 case 0xDC:
1244 case 0xDD:
1245 case 0xDE:
1246 case 0xDF:
1247 return -1;
1248 case 0:
1249 if (latin1_encoding.type[result] == BT_NONXML)
1250 return -1;
1251 break;
1252 case 0xFF:
1253 if (result == 0xFFFE || result == 0xFFFF)
1254 return -1;
1255 break;
1256 }
1257 return result;
1258}
1259
1260int FASTCALL
1261XmlUtf8Encode(int c, char *buf) {
1262 enum {
1263 /* minN is minimum legal resulting value for N byte sequence */
1264 min2 = 0x80,
1265 min3 = 0x800,
1266 min4 = 0x10000
1267 };
1268
1269 if (c < 0)
1270 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1271 if (c < min2) {
1272 buf[0] = (char)(c | UTF8_cval1);
1273 return 1;
1274 }
1275 if (c < min3) {
1276 buf[0] = (char)((c >> 6) | UTF8_cval2);
1277 buf[1] = (char)((c & 0x3f) | 0x80);
1278 return 2;
1279 }
1280 if (c < min4) {
1281 buf[0] = (char)((c >> 12) | UTF8_cval3);
1282 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1283 buf[2] = (char)((c & 0x3f) | 0x80);
1284 return 3;
1285 }
1286 if (c < 0x110000) {
1287 buf[0] = (char)((c >> 18) | UTF8_cval4);
1288 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1289 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1290 buf[3] = (char)((c & 0x3f) | 0x80);
1291 return 4;
1292 }
1293 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1294}
1295
1296int FASTCALL
1297XmlUtf16Encode(int charNum, unsigned short *buf) {
1298 if (charNum < 0)
1299 return 0;
1300 if (charNum < 0x10000) {
1301 buf[0] = (unsigned short)charNum;
1302 return 1;
1303 }
1304 if (charNum < 0x110000) {
1305 charNum -= 0x10000;
1306 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1307 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1308 return 2;
1309 }
1310 return 0;
1311}
1312
1313struct unknown_encoding {
1314 struct normal_encoding normal;
1315 CONVERTER convert;
1316 void *userData;
1317 unsigned short utf16[256];
1318 char utf8[256][4];
1319};
1320
1321#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322
1323int
1324XmlSizeOfUnknownEncoding(void) {
1325 return sizeof(struct unknown_encoding);
1326}
1327
1328static int PTRFASTCALL
1329unknown_isName(const ENCODING *enc, const char *p) {
1330 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331 int c = uenc->convert(uenc->userData, p);
1332 if (c & ~0xFFFF)
1333 return 0;
1334 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1335}
1336
1337static int PTRFASTCALL
1338unknown_isNmstrt(const ENCODING *enc, const char *p) {
1339 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340 int c = uenc->convert(uenc->userData, p);
1341 if (c & ~0xFFFF)
1342 return 0;
1343 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1344}
1345
1346static int PTRFASTCALL
1347unknown_isInvalid(const ENCODING *enc, const char *p) {
1348 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349 int c = uenc->convert(uenc->userData, p);
1350 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1351}
1352
1353static enum XML_Convert_Result PTRCALL
1354unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1355 char **toP, const char *toLim) {
1356 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357 char buf[XML_UTF8_ENCODE_MAX];
1358 for (;;) {
1359 const char *utf8;
1360 int n;
1361 if (*fromP == fromLim)
1362 return XML_CONVERT_COMPLETED;
1363 utf8 = uenc->utf8[(unsigned char)**fromP];
1364 n = *utf8++;
1365 if (n == 0) {
1366 int c = uenc->convert(uenc->userData, *fromP);
1367 n = XmlUtf8Encode(c, buf);
1368 if (n > toLim - *toP)
1369 return XML_CONVERT_OUTPUT_EXHAUSTED;
1370 utf8 = buf;
1371 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1372 - (BT_LEAD2 - 2));
1373 } else {
1374 if (n > toLim - *toP)
1375 return XML_CONVERT_OUTPUT_EXHAUSTED;
1376 (*fromP)++;
1377 }
1378 memcpy(*toP, utf8, n);
1379 *toP += n;
1380 }
1381}
1382
1383static enum XML_Convert_Result PTRCALL
1384unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1385 unsigned short **toP, const unsigned short *toLim) {
1386 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387 while (*fromP < fromLim && *toP < toLim) {
1388 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389 if (c == 0) {
1390 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1392 - (BT_LEAD2 - 2));
1393 } else
1394 (*fromP)++;
1395 *(*toP)++ = c;
1396 }
1397
1398 if ((*toP == toLim) && (*fromP < fromLim))
1399 return XML_CONVERT_OUTPUT_EXHAUSTED;
1400 else
1401 return XML_CONVERT_COMPLETED;
1402}
1403
1404ENCODING *
1405XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1406 void *userData) {
1407 int i;
1408 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1409 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410 for (i = 0; i < 128; i++)
1411 if (latin1_encoding.type[i] != BT_OTHER
1412 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413 return 0;
1414 for (i = 0; i < 256; i++) {
1415 int c = table[i];
1416 if (c == -1) {
1417 e->normal.type[i] = BT_MALFORM;
1418 /* This shouldn't really get used. */
1419 e->utf16[i] = 0xFFFF;
1420 e->utf8[i][0] = 1;
1421 e->utf8[i][1] = 0;
1422 } else if (c < 0) {
1423 if (c < -4)
1424 return 0;
1425 /* Multi-byte sequences need a converter function */
1426 if (! convert)
1427 return 0;
1428 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1429 e->utf8[i][0] = 0;
1430 e->utf16[i] = 0;
1431 } else if (c < 0x80) {
1432 if (latin1_encoding.type[c] != BT_OTHER
1433 && latin1_encoding.type[c] != BT_NONXML && c != i)
1434 return 0;
1435 e->normal.type[i] = latin1_encoding.type[c];
1436 e->utf8[i][0] = 1;
1437 e->utf8[i][1] = (char)c;
1438 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1439 } else if (checkCharRefNumber(c) < 0) {
1440 e->normal.type[i] = BT_NONXML;
1441 /* This shouldn't really get used. */
1442 e->utf16[i] = 0xFFFF;
1443 e->utf8[i][0] = 1;
1444 e->utf8[i][1] = 0;
1445 } else {
1446 if (c > 0xFFFF)
1447 return 0;
1448 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1449 e->normal.type[i] = BT_NMSTRT;
1450 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1451 e->normal.type[i] = BT_NAME;
1452 else
1453 e->normal.type[i] = BT_OTHER;
1454 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1455 e->utf16[i] = (unsigned short)c;
1456 }
1457 }
1458 e->userData = userData;
1459 e->convert = convert;
1460 if (convert) {
1461 e->normal.isName2 = unknown_isName;
1462 e->normal.isName3 = unknown_isName;
1463 e->normal.isName4 = unknown_isName;
1464 e->normal.isNmstrt2 = unknown_isNmstrt;
1465 e->normal.isNmstrt3 = unknown_isNmstrt;
1466 e->normal.isNmstrt4 = unknown_isNmstrt;
1467 e->normal.isInvalid2 = unknown_isInvalid;
1468 e->normal.isInvalid3 = unknown_isInvalid;
1469 e->normal.isInvalid4 = unknown_isInvalid;
1470 }
1471 e->normal.enc.utf8Convert = unknown_toUtf8;
1472 e->normal.enc.utf16Convert = unknown_toUtf16;
1473 return &(e->normal.enc);
1474}
1475
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1489
1490static const char KW_ISO_8859_1[]
1491 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1492 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
1493static const char KW_US_ASCII[]
1494 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495 ASCII_C, ASCII_I, ASCII_I, '\0'};
1496static const char KW_UTF_8[]
1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1498static const char KW_UTF_16[]
1499 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1500static const char KW_UTF_16BE[]
1501 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502 ASCII_6, ASCII_B, ASCII_E, '\0'};
1503static const char KW_UTF_16LE[]
1504 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505 ASCII_6, ASCII_L, ASCII_E, '\0'};
1506
1507static int FASTCALL
1508getEncodingIndex(const char *name) {
1509 static const char *const encodingNames[] = {
1510 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511 };
1512 int i;
1513 if (name == NULL)
1514 return NO_ENC;
1515 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1516 if (streqci(name, encodingNames[i]))
1517 return i;
1518 return UNKNOWN_ENC;
1519}
1520
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument is
   parenthesized so that the cast applies to the whole expression passed
   in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527
1528/* This is what detects the encoding. encodingTable maps from
1529 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1530 the external (protocol) specified encoding; state is
1531 XML_CONTENT_STATE if we're parsing an external text entity, and
1532 XML_PROLOG_STATE otherwise.
1533*/
1534
1535static int
1536initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1537 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1538 const ENCODING **encPtr;
1539
1540 if (ptr >= end)
1541 return XML_TOK_NONE;
1542 encPtr = enc->encPtr;
1543 if (ptr + 1 == end) {
1544 /* only a single byte available for auto-detection */
1545#ifndef XML_DTD /* FIXME */
1546 /* a well-formed document entity must have more than one byte */
1547 if (state != XML_CONTENT_STATE)
1548 return XML_TOK_PARTIAL;
1549#endif
1550 /* so we're parsing an external text entity... */
1551 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1552 switch (INIT_ENC_INDEX(enc)) {
1553 case UTF_16_ENC:
1554 case UTF_16LE_ENC:
1555 case UTF_16BE_ENC:
1556 return XML_TOK_PARTIAL;
1557 }
1558 switch ((unsigned char)*ptr) {
1559 case 0xFE:
1560 case 0xFF:
1561 case 0xEF: /* possibly first byte of UTF-8 BOM */
1562 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563 break;
1564 /* fall through */
1565 case 0x00:
1566 case 0x3C:
1567 return XML_TOK_PARTIAL;
1568 }
1569 } else {
1570 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1571 case 0xFEFF:
1572 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573 break;
1574 *nextTokPtr = ptr + 2;
1575 *encPtr = encodingTable[UTF_16BE_ENC];
1576 return XML_TOK_BOM;
1577 /* 00 3C is handled in the default case */
1578 case 0x3C00:
1579 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581 && state == XML_CONTENT_STATE)
1582 break;
1583 *encPtr = encodingTable[UTF_16LE_ENC];
1584 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585 case 0xFFFE:
1586 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587 break;
1588 *nextTokPtr = ptr + 2;
1589 *encPtr = encodingTable[UTF_16LE_ENC];
1590 return XML_TOK_BOM;
1591 case 0xEFBB:
1592 /* Maybe a UTF-8 BOM (EF BB BF) */
1593 /* If there's an explicitly specified (external) encoding
1594 of ISO-8859-1 or some flavour of UTF-16
1595 and this is an external text entity,
1596 don't look for the BOM,
1597 because it might be a legal data.
1598 */
1599 if (state == XML_CONTENT_STATE) {
1600 int e = INIT_ENC_INDEX(enc);
1601 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1602 || e == UTF_16_ENC)
1603 break;
1604 }
1605 if (ptr + 2 == end)
1606 return XML_TOK_PARTIAL;
1607 if ((unsigned char)ptr[2] == 0xBF) {
1608 *nextTokPtr = ptr + 3;
1609 *encPtr = encodingTable[UTF_8_ENC];
1610 return XML_TOK_BOM;
1611 }
1612 break;
1613 default:
1614 if (ptr[0] == '\0') {
1615 /* 0 isn't a legal data character. Furthermore a document
1616 entity can only start with ASCII characters. So the only
1617 way this can fail to be big-endian UTF-16 if it it's an
1618 external parsed general entity that's labelled as
1619 UTF-16LE.
1620 */
1621 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622 break;
1623 *encPtr = encodingTable[UTF_16BE_ENC];
1624 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625 } else if (ptr[1] == '\0') {
1626 /* We could recover here in the case:
1627 - parsing an external entity
1628 - second byte is 0
1629 - no externally specified encoding
1630 - no encoding declaration
1631 by assuming UTF-16LE. But we don't, because this would mean when
1632 presented just with a single byte, we couldn't reliably determine
1633 whether we needed further bytes.
1634 */
1635 if (state == XML_CONTENT_STATE)
1636 break;
1637 *encPtr = encodingTable[UTF_16LE_ENC];
1638 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639 }
1640 break;
1641 }
1642 }
1643 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645}
1646
1647#define NS(x) x
1648#define ns(x) x
1649#define XML_TOK_NS_C
1650#include "xmltok_ns.c"
1651#undef XML_TOK_NS_C
1652#undef NS
1653#undef ns
1654
1655#ifdef XML_NS
1656
1657# define NS(x) x##NS
1658# define ns(x) x##_ns
1659
1660# define XML_TOK_NS_C
1661# include "xmltok_ns.c"
1662# undef XML_TOK_NS_C
1663
1664# undef NS
1665# undef ns
1666
1667ENCODING *
1668XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1669 void *userData) {
1670 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1671 if (enc)
1672 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1673 return enc;
1674}
1675
1676#endif /* XML_NS */
1677