1/* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <[email protected]>
11 Copyright (c) 2002 Fred L. Drake, Jr. <[email protected]>
12 Copyright (c) 2002-2016 Karl Waclawek <[email protected]>
13 Copyright (c) 2016-2022 Sebastian Pipping <[email protected]>
14 Copyright (c) 2017 Rhodri James <[email protected]>
15 Copyright (c) 2018 Benjamin Peterson <[email protected]>
16 Copyright (c) 2018 Anton Maklakov <[email protected]>
17 Copyright (c) 2019 David Loffredo <[email protected]>
18 Copyright (c) 2020 Boris Kolpackov <[email protected]>
19 Licensed under the MIT license:
20
21 Permission is hereby granted, free of charge, to any person obtaining
22 a copy of this software and associated documentation files (the
23 "Software"), to deal in the Software without restriction, including
24 without limitation the rights to use, copy, modify, merge, publish,
25 distribute, sublicense, and/or sell copies of the Software, and to permit
26 persons to whom the Software is furnished to do so, subject to the
27 following conditions:
28
29 The above copyright notice and this permission notice shall be included
30 in all copies or substantial portions of the Software.
31
32 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
33 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
34 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
35 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
36 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
37 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
38 USE OR OTHER DEALINGS IN THE SOFTWARE.
39*/
40
41#ifdef XML_TOK_IMPL_C
42
/* Fallback used when the including file has not defined IS_INVALID_CHAR:
   the check always evaluates to 0, so no multi-byte sequence is rejected
   by it. */
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif
46
/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch for an
   n-byte character.  Uses `enc` and `end` from the expansion site (they
   are not macro parameters).
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR is true: stores ptr in *nextTokPtr and returns
     XML_TOK_INVALID;
   - otherwise: advances ptr by n and breaks out of the switch. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;
57
/* Expands to the switch arms shared by scanners that accept arbitrary
   character data: the 2-, 3- and 4-byte lead cases (validated and
   skipped by INVALID_LEAD_CASE) followed by the byte types BT_NONXML,
   BT_MALFORM and BT_TRAIL, which store ptr in *nextTokPtr and return
   XML_TOK_INVALID. */
#  define INVALID_CASES(ptr, nextTokPtr) \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
67
/* Expands to the `case BT_LEAD<n>:` arm for an n-byte character inside a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
   returns XML_TOK_INVALID (with *nextTokPtr = ptr) when the sequence is
   invalid or is not a name character, and otherwise advances ptr by n
   and breaks out of the switch. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;
78
/* Expands to every switch arm that consumes one name character:
   - BT_NONASCII is accepted only if IS_NAME_CHAR_MINBPC holds, otherwise
     XML_TOK_INVALID is returned with *nextTokPtr = ptr;
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS advance ptr by
     MINBPC(enc);
   - 2-, 3- and 4-byte lead bytes are handled by CHECK_NAME_CASE.
   Each accepting arm ends with `break`, leaving the enclosing switch. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
96
/* Expands to the `case BT_LEAD<n>:` arm for an n-byte character at the
   start of a name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes
   remain, returns XML_TOK_INVALID (with *nextTokPtr = ptr) when the
   sequence is invalid or is not a name-start character, and otherwise
   advances ptr by n and breaks out of the switch. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;
107
/* Expands to every switch arm that consumes one name-start character:
   - BT_NONASCII is accepted only if IS_NMSTRT_CHAR_MINBPC holds,
     otherwise XML_TOK_INVALID is returned with *nextTokPtr = ptr;
   - BT_NMSTRT and BT_HEX advance ptr by MINBPC(enc);
   - 2-, 3- and 4-byte lead bytes are handled by CHECK_NMSTRT_CASE.
   Each accepting arm ends with `break`, leaving the enclosing switch. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
122
/* PREFIX(ident) wraps the name of every function defined in this file.
   When the including file has not defined PREFIX, identifiers are used
   unchanged. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif
126
/* True when at least `count` minimum-size character units (MINBPC(enc)
   bytes each) are available in [ptr, end).  Every argument is
   parenthesized so that expression arguments such as `n + 1` keep their
   intended precedence. */
#  define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character unit is available in [ptr, end). */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless at least
   `count` character units are available in [ptr, end). */
#  define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, (count))) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
139
140/* ptr points to character following "<!-" */
141
142static int PTRCALL
143PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
144 const char **nextTokPtr) {
145 if (HAS_CHAR(enc, ptr, end)) {
146 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
147 *nextTokPtr = ptr;
148 return XML_TOK_INVALID;
149 }
150 ptr += MINBPC(enc);
151 while (HAS_CHAR(enc, ptr, end)) {
152 switch (BYTE_TYPE(enc, ptr)) {
153 INVALID_CASES(ptr, nextTokPtr)
154 case BT_MINUS:
155 ptr += MINBPC(enc);
156 REQUIRE_CHAR(enc, ptr, end);
157 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
158 ptr += MINBPC(enc);
159 REQUIRE_CHAR(enc, ptr, end);
160 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
161 *nextTokPtr = ptr;
162 return XML_TOK_INVALID;
163 }
164 *nextTokPtr = ptr + MINBPC(enc);
165 return XML_TOK_COMMENT;
166 }
167 break;
168 default:
169 ptr += MINBPC(enc);
170 break;
171 }
172 }
173 }
174 return XML_TOK_PARTIAL;
175}
176
177/* ptr points to character following "<!" */
178
179static int PTRCALL
180PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
181 const char **nextTokPtr) {
182 REQUIRE_CHAR(enc, ptr, end);
183 switch (BYTE_TYPE(enc, ptr)) {
184 case BT_MINUS:
185 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
186 case BT_LSQB:
187 *nextTokPtr = ptr + MINBPC(enc);
188 return XML_TOK_COND_SECT_OPEN;
189 case BT_NMSTRT:
190 case BT_HEX:
191 ptr += MINBPC(enc);
192 break;
193 default:
194 *nextTokPtr = ptr;
195 return XML_TOK_INVALID;
196 }
197 while (HAS_CHAR(enc, ptr, end)) {
198 switch (BYTE_TYPE(enc, ptr)) {
199 case BT_PERCNT:
200 REQUIRE_CHARS(enc, ptr, end, 2);
201 /* don't allow <!ENTITY% foo "whatever"> */
202 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
203 case BT_S:
204 case BT_CR:
205 case BT_LF:
206 case BT_PERCNT:
207 *nextTokPtr = ptr;
208 return XML_TOK_INVALID;
209 }
210 /* fall through */
211 case BT_S:
212 case BT_CR:
213 case BT_LF:
214 *nextTokPtr = ptr;
215 return XML_TOK_DECL_OPEN;
216 case BT_NMSTRT:
217 case BT_HEX:
218 ptr += MINBPC(enc);
219 break;
220 default:
221 *nextTokPtr = ptr;
222 return XML_TOK_INVALID;
223 }
224 }
225 return XML_TOK_PARTIAL;
226}
227
228static int PTRCALL
229PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
230 int *tokPtr) {
231 int upper = 0;
232 UNUSED_P(enc);
233 *tokPtr = XML_TOK_PI;
234 if (end - ptr != MINBPC(enc) * 3)
235 return 1;
236 switch (BYTE_TO_ASCII(enc, ptr)) {
237 case ASCII_x:
238 break;
239 case ASCII_X:
240 upper = 1;
241 break;
242 default:
243 return 1;
244 }
245 ptr += MINBPC(enc);
246 switch (BYTE_TO_ASCII(enc, ptr)) {
247 case ASCII_m:
248 break;
249 case ASCII_M:
250 upper = 1;
251 break;
252 default:
253 return 1;
254 }
255 ptr += MINBPC(enc);
256 switch (BYTE_TO_ASCII(enc, ptr)) {
257 case ASCII_l:
258 break;
259 case ASCII_L:
260 upper = 1;
261 break;
262 default:
263 return 1;
264 }
265 if (upper)
266 return 0;
267 *tokPtr = XML_TOK_XML_DECL;
268 return 1;
269}
270
271/* ptr points to character following "<?" */
272
273static int PTRCALL
274PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
275 const char **nextTokPtr) {
276 int tok;
277 const char *target = ptr;
278 REQUIRE_CHAR(enc, ptr, end);
279 switch (BYTE_TYPE(enc, ptr)) {
280 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
281 default:
282 *nextTokPtr = ptr;
283 return XML_TOK_INVALID;
284 }
285 while (HAS_CHAR(enc, ptr, end)) {
286 switch (BYTE_TYPE(enc, ptr)) {
287 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
288 case BT_S:
289 case BT_CR:
290 case BT_LF:
291 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
292 *nextTokPtr = ptr;
293 return XML_TOK_INVALID;
294 }
295 ptr += MINBPC(enc);
296 while (HAS_CHAR(enc, ptr, end)) {
297 switch (BYTE_TYPE(enc, ptr)) {
298 INVALID_CASES(ptr, nextTokPtr)
299 case BT_QUEST:
300 ptr += MINBPC(enc);
301 REQUIRE_CHAR(enc, ptr, end);
302 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
303 *nextTokPtr = ptr + MINBPC(enc);
304 return tok;
305 }
306 break;
307 default:
308 ptr += MINBPC(enc);
309 break;
310 }
311 }
312 return XML_TOK_PARTIAL;
313 case BT_QUEST:
314 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
315 *nextTokPtr = ptr;
316 return XML_TOK_INVALID;
317 }
318 ptr += MINBPC(enc);
319 REQUIRE_CHAR(enc, ptr, end);
320 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
321 *nextTokPtr = ptr + MINBPC(enc);
322 return tok;
323 }
324 /* fall through */
325 default:
326 *nextTokPtr = ptr;
327 return XML_TOK_INVALID;
328 }
329 }
330 return XML_TOK_PARTIAL;
331}
332
333static int PTRCALL
334PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
335 const char **nextTokPtr) {
336 static const char CDATA_LSQB[]
337 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
338 int i;
339 UNUSED_P(enc);
340 /* CDATA[ */
341 REQUIRE_CHARS(enc, ptr, end, 6);
342 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
343 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
344 *nextTokPtr = ptr;
345 return XML_TOK_INVALID;
346 }
347 }
348 *nextTokPtr = ptr;
349 return XML_TOK_CDATA_SECT_OPEN;
350}
351
/* Returns the next token inside a CDATA section.
   - XML_TOK_NONE when [ptr, end) is empty.
   - XML_TOK_PARTIAL when no whole character unit is available, or when
     the input ends while "]]>" or CR LF is still being recognised.
   - XML_TOK_CDATA_SECT_CLOSE for "]]>" (*nextTokPtr just past '>').
   - XML_TOK_DATA_NEWLINE for LF, CR, or CR LF.
   - XML_TOK_INVALID / XML_TOK_PARTIAL_CHAR from INVALID_CASES when the
     first character is malformed or truncated.
   - XML_TOK_DATA_CHARS for a run of ordinary characters; the run stops
     before a newline, a ']', or a malformed/truncated character. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop trailing bytes that do not make up a whole character unit. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so the second ']' is rescanned */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    /* CR LF counts as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run as far as possible. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    /* A truncated or invalid multi-byte character ends the run here; it
       is left for the next call to report. */
#  define LEAD_CASE(n) \
  case BT_LEAD##n: \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_DATA_CHARS; \
    } \
    ptr += n; \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
425
426/* ptr points to character following "</" */
427
428static int PTRCALL
429PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
430 const char **nextTokPtr) {
431 REQUIRE_CHAR(enc, ptr, end);
432 switch (BYTE_TYPE(enc, ptr)) {
433 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
434 default:
435 *nextTokPtr = ptr;
436 return XML_TOK_INVALID;
437 }
438 while (HAS_CHAR(enc, ptr, end)) {
439 switch (BYTE_TYPE(enc, ptr)) {
440 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
441 case BT_S:
442 case BT_CR:
443 case BT_LF:
444 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
445 switch (BYTE_TYPE(enc, ptr)) {
446 case BT_S:
447 case BT_CR:
448 case BT_LF:
449 break;
450 case BT_GT:
451 *nextTokPtr = ptr + MINBPC(enc);
452 return XML_TOK_END_TAG;
453 default:
454 *nextTokPtr = ptr;
455 return XML_TOK_INVALID;
456 }
457 }
458 return XML_TOK_PARTIAL;
459# ifdef XML_NS
460 case BT_COLON:
461 /* no need to check qname syntax here,
462 since end-tag must match exactly */
463 ptr += MINBPC(enc);
464 break;
465# endif
466 case BT_GT:
467 *nextTokPtr = ptr + MINBPC(enc);
468 return XML_TOK_END_TAG;
469 default:
470 *nextTokPtr = ptr;
471 return XML_TOK_INVALID;
472 }
473 }
474 return XML_TOK_PARTIAL;
475}
476
477/* ptr points to character following "&#X" */
478
479static int PTRCALL
480PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
481 const char **nextTokPtr) {
482 if (HAS_CHAR(enc, ptr, end)) {
483 switch (BYTE_TYPE(enc, ptr)) {
484 case BT_DIGIT:
485 case BT_HEX:
486 break;
487 default:
488 *nextTokPtr = ptr;
489 return XML_TOK_INVALID;
490 }
491 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
492 switch (BYTE_TYPE(enc, ptr)) {
493 case BT_DIGIT:
494 case BT_HEX:
495 break;
496 case BT_SEMI:
497 *nextTokPtr = ptr + MINBPC(enc);
498 return XML_TOK_CHAR_REF;
499 default:
500 *nextTokPtr = ptr;
501 return XML_TOK_INVALID;
502 }
503 }
504 }
505 return XML_TOK_PARTIAL;
506}
507
508/* ptr points to character following "&#" */
509
510static int PTRCALL
511PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
512 const char **nextTokPtr) {
513 if (HAS_CHAR(enc, ptr, end)) {
514 if (CHAR_MATCHES(enc, ptr, ASCII_x))
515 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
516 switch (BYTE_TYPE(enc, ptr)) {
517 case BT_DIGIT:
518 break;
519 default:
520 *nextTokPtr = ptr;
521 return XML_TOK_INVALID;
522 }
523 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
524 switch (BYTE_TYPE(enc, ptr)) {
525 case BT_DIGIT:
526 break;
527 case BT_SEMI:
528 *nextTokPtr = ptr + MINBPC(enc);
529 return XML_TOK_CHAR_REF;
530 default:
531 *nextTokPtr = ptr;
532 return XML_TOK_INVALID;
533 }
534 }
535 }
536 return XML_TOK_PARTIAL;
537}
538
539/* ptr points to character following "&" */
540
541static int PTRCALL
542PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
543 const char **nextTokPtr) {
544 REQUIRE_CHAR(enc, ptr, end);
545 switch (BYTE_TYPE(enc, ptr)) {
546 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
547 case BT_NUM:
548 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
549 default:
550 *nextTokPtr = ptr;
551 return XML_TOK_INVALID;
552 }
553 while (HAS_CHAR(enc, ptr, end)) {
554 switch (BYTE_TYPE(enc, ptr)) {
555 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
556 case BT_SEMI:
557 *nextTokPtr = ptr + MINBPC(enc);
558 return XML_TOK_ENTITY_REF;
559 default:
560 *nextTokPtr = ptr;
561 return XML_TOK_INVALID;
562 }
563 }
564 return XML_TOK_PARTIAL;
565}
566
/* ptr points to character following first character of attribute name */

/* Scans the attributes of a start tag through to the end of the tag.
   For each attribute it reads the rest of the name, optional whitespace,
   '=', optional whitespace, and a quoted value, then either another
   attribute name (after whitespace), '>' or "/>".
   Returns XML_TOK_START_TAG_WITH_ATTS or XML_TOK_EMPTY_ELEMENT_WITH_ATTS
   with *nextTokPtr just past '>', XML_TOK_INVALID with *nextTokPtr at
   the offending character, a non-positive token propagated from scanRef,
   or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr) {
#  ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
#  endif
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* whitespace after the name: only more whitespace or '=' may
         follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_EQUALS: {
      /* byte type of the opening quote (BT_QUOT or BT_APOS) */
      int open;
#  ifdef XML_NS
      hadColon = 0;
#  endif
      /* skip whitespace between '=' and the opening quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        open = BYTE_TYPE(enc, ptr);
        if (open == BT_QUOT || open == BT_APOS)
          break;
        switch (open) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      ptr += MINBPC(enc);
      /* in attribute value */
      for (;;) {
        int t;
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == open)
          break;
        switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_AMP: {
          /* scanRef stores its next-token position directly into ptr, so
             on success scanning resumes after the reference */
          int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
          if (tok <= 0) {
            if (tok == XML_TOK_INVALID)
              *nextTokPtr = ptr;
            return tok;
          }
          break;
        }
        case BT_LT:
          /* '<' is not allowed inside an attribute value */
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      /* step over the closing quote; whitespace, '/' or '>' must follow */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
        break;
      case BT_SOL:
        goto sol;
      case BT_GT:
        goto gt;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      /* ptr points to the whitespace character that followed the
         closing quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
          /* first character of the next attribute name: leave this loop
             and continue with the outer name-scanning loop */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_S:
        case BT_CR:
        case BT_LF:
          continue;
        case BT_GT:
        gt:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_START_TAG_WITH_ATTS;
        case BT_SOL:
        sol:
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        break;
      }
      break;
    }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
719
/* ptr points to character following "<" */

/* Scans markup that starts with '<' in content.
   "<!-" goes to scanComment, "<![" to scanCdataSection, "<?" to scanPi
   and "</" to scanEndTag.  Otherwise a start tag is scanned: the element
   name, then either '>', "/>", or whitespace followed by the first
   attribute (handled by scanAtts).
   Returns XML_TOK_START_TAG_NO_ATTS or XML_TOK_EMPTY_ELEMENT_NO_ATTS
   with *nextTokPtr just past '>', the result of the delegated scanner,
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */
static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
#  ifdef XML_NS
  /* set once the element name has contained a ':' */
  int hadColon;
#  endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#  ifdef XML_NS
  hadColon = 0;
#  endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* at most one ':' in the name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF: {
      /* whitespace after the element name: skip it and see what follows */
      ptr += MINBPC(enc);
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the first attribute; the macro
             consumes it and breaks to the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_GT:
          goto gt;
        case BT_SOL:
          goto sol;
        case BT_S:
        case BT_CR:
        case BT_LF:
          ptr += MINBPC(enc);
          continue;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
      }
      return XML_TOK_PARTIAL;
    }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be followed immediately by '>' */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
819
/* Returns the next token of element content.
   - XML_TOK_NONE when [ptr, end) is empty; XML_TOK_PARTIAL when no whole
     character unit is available.
   - '<' and '&' are delegated to scanLt and scanRef.
   - XML_TOK_DATA_NEWLINE for LF or CR (optionally followed by LF);
     XML_TOK_TRAILING_CR when CR is the last available character.
   - XML_TOK_TRAILING_RSQB when the input ends after "]" or "]]".
   - XML_TOK_INVALID for "]]>" (with *nextTokPtr at the '>') and for
     characters rejected by INVALID_CASES.
   - XML_TOK_DATA_CHARS for a run of ordinary characters, with
     *nextTokPtr at the first character that is not part of the run. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop trailing bytes that do not make up a whole character unit. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_CR;
    /* CR LF counts as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so the second ']' is rescanned */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is not allowed in content */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run as far as possible. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    /* A truncated or invalid multi-byte character ends the run here; it
       is left for the next call to report. */
#  define LEAD_CASE(n) \
  case BT_LEAD##n: \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_DATA_CHARS; \
    } \
    ptr += n; \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_RSQB:
      /* ']' stays in the run unless it starts "]]>".  When too few
         characters are available to decide, the run ends before it. */
      if (HAS_CHARS(enc, ptr, end, 2)) {
        if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
          ptr += MINBPC(enc);
          break;
        }
        if (HAS_CHARS(enc, ptr, end, 3)) {
          if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
            ptr += MINBPC(enc);
            break;
          }
          *nextTokPtr = ptr + 2 * MINBPC(enc);
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
917
918/* ptr points to character following "%" */
919
920static int PTRCALL
921PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
922 const char **nextTokPtr) {
923 REQUIRE_CHAR(enc, ptr, end);
924 switch (BYTE_TYPE(enc, ptr)) {
925 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
926 case BT_S:
927 case BT_LF:
928 case BT_CR:
929 case BT_PERCNT:
930 *nextTokPtr = ptr;
931 return XML_TOK_PERCENT;
932 default:
933 *nextTokPtr = ptr;
934 return XML_TOK_INVALID;
935 }
936 while (HAS_CHAR(enc, ptr, end)) {
937 switch (BYTE_TYPE(enc, ptr)) {
938 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
939 case BT_SEMI:
940 *nextTokPtr = ptr + MINBPC(enc);
941 return XML_TOK_PARAM_ENTITY_REF;
942 default:
943 *nextTokPtr = ptr;
944 return XML_TOK_INVALID;
945 }
946 }
947 return XML_TOK_PARTIAL;
948}
949
950static int PTRCALL
951PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
952 const char **nextTokPtr) {
953 REQUIRE_CHAR(enc, ptr, end);
954 switch (BYTE_TYPE(enc, ptr)) {
955 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
956 default:
957 *nextTokPtr = ptr;
958 return XML_TOK_INVALID;
959 }
960 while (HAS_CHAR(enc, ptr, end)) {
961 switch (BYTE_TYPE(enc, ptr)) {
962 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
963 case BT_CR:
964 case BT_LF:
965 case BT_S:
966 case BT_RPAR:
967 case BT_GT:
968 case BT_PERCNT:
969 case BT_VERBAR:
970 *nextTokPtr = ptr;
971 return XML_TOK_POUND_NAME;
972 default:
973 *nextTokPtr = ptr;
974 return XML_TOK_INVALID;
975 }
976 }
977 return -XML_TOK_POUND_NAME;
978}
979
980static int PTRCALL
981PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
982 const char **nextTokPtr) {
983 while (HAS_CHAR(enc, ptr, end)) {
984 int t = BYTE_TYPE(enc, ptr);
985 switch (t) {
986 INVALID_CASES(ptr, nextTokPtr)
987 case BT_QUOT:
988 case BT_APOS:
989 ptr += MINBPC(enc);
990 if (t != open)
991 break;
992 if (! HAS_CHAR(enc, ptr, end))
993 return -XML_TOK_LITERAL;
994 *nextTokPtr = ptr;
995 switch (BYTE_TYPE(enc, ptr)) {
996 case BT_S:
997 case BT_CR:
998 case BT_LF:
999 case BT_GT:
1000 case BT_PERCNT:
1001 case BT_LSQB:
1002 return XML_TOK_LITERAL;
1003 default:
1004 return XML_TOK_INVALID;
1005 }
1006 default:
1007 ptr += MINBPC(enc);
1008 break;
1009 }
1010 }
1011 return XML_TOK_PARTIAL;
1012}
1013
/* Returns the next token of the prolog (and of declarations in general).
   - XML_TOK_NONE when [ptr, end) is empty; XML_TOK_PARTIAL when no whole
     character unit is available.
   - Quoted literals, "<!", "<?", '%' and '#' are delegated to scanLit,
     scanDecl, scanPi, scanPercent and scanPoundName.
   - '<' followed by a name-start or lead byte yields
     XML_TOK_INSTANCE_START with *nextTokPtr back at the '<'.
   - Punctuation yields the matching single-character token; ')' may
     absorb a following '*', '?' or '+', and "]]>" yields
     XML_TOK_COND_SECT_CLOSE.
   - Names yield XML_TOK_NAME, XML_TOK_NMTOKEN or (with XML_NS)
     XML_TOK_PREFIXED_NAME, or one of the XML_TOK_NAME_* tokens when
     followed by '+', '*' or '?'.
   - A negated token code is returned when the input ends at a point
     where the token is otherwise complete (for example -XML_TOK_PROLOG_S
     for a lone trailing CR, or -tok when a name runs to the end of the
     input).
   - XML_TOK_INVALID is returned with *nextTokPtr at the offending
     character. */
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr) {
  int tok;
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Drop trailing bytes that do not make up a whole character unit. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT: {
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_EXCL:
      return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_QUEST:
      return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_NMSTRT:
    case BT_HEX:
    case BT_NONASCII:
    case BT_LEAD2:
    case BT_LEAD3:
    case BT_LEAD4:
      /* start of the document instance: hand the '<' back to the caller */
      *nextTokPtr = ptr - MINBPC(enc);
      return XML_TOK_INSTANCE_START;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S:
  case BT_LF:
    /* consume a run of whitespace */
    for (;;) {
      ptr += MINBPC(enc);
      if (! HAS_CHAR(enc, ptr, end))
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_BRACKET;
    /* "]]>" closes a conditional section; otherwise a single ']' */
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      REQUIRE_CHARS(enc, ptr, end, 2);
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2 * MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_PAREN;
    /* ')' may be followed by an occurrence indicator */
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR:
    case BT_LF:
    case BT_S:
    case BT_GT:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  /* Multi-byte first character: a name-start character begins a NAME,
     any other name character begins an NMTOKEN. */
#  define LEAD_CASE(n) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2)
    LEAD_CASE(3)
    LEAD_CASE(4)
#  undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#  ifdef XML_NS
  case BT_COLON:
#  endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Rest of a name or name token; tok records which kind it is so far. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT:
    case BT_RPAR:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_LSQB:
    case BT_PERCNT:
    case BT_S:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return tok;
#  ifdef XML_NS
    case BT_COLON:
      /* The first ':' turns a NAME into a PREFIXED_NAME provided a name
         character follows; a second ':' or a ':' followed by anything
         else downgrades the token to NMTOKEN. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        REQUIRE_CHAR(enc, ptr, end);
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#  endif
    /* An occurrence indicator is rejected after an NMTOKEN. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* input ended inside the name: report the token negated */
  return -tok;
}
1257
/* Returns the next token of an attribute value.
   - XML_TOK_NONE when [ptr, end) is empty.
   - At the very start of the range: '&' is delegated to scanRef, LF or
     CR (optionally followed by LF) yields XML_TOK_DATA_NEWLINE, a CR
     that is the last available character yields XML_TOK_TRAILING_CR,
     and a BT_S character yields XML_TOK_ATTRIBUTE_VALUE_S.
   - '<' yields XML_TOK_INVALID with *nextTokPtr at the '<'.
   - Otherwise XML_TOK_DATA_CHARS for the run of characters up to the
     next '&', newline or BT_S character, or to the end of the input. */
static int PTRCALL
PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
                          const char **nextTokPtr) {
  /* first character of the range; special characters only form their
     own token when they sit exactly here */
  const char *start;
  if (ptr >= end)
    return XML_TOK_NONE;
  else if (! HAS_CHAR(enc, ptr, end)) {
    /* This line cannot be executed.  The incoming data has already
     * been tokenized once, so incomplete characters like this have
     * already been eliminated from the input.  Retaining the paranoia
     * check is still valuable, however.
     */
    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
  }
  start = ptr;
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
#  define LEAD_CASE(n) \
  case BT_LEAD##n: \
    ptr += n; /* NOTE: The encoding has already been validated. */ \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_AMP:
      if (ptr == start)
        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_LT:
      /* this is for inside entity references */
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    case BT_LF:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_CR:
      if (ptr == start) {
        ptr += MINBPC(enc);
        if (! HAS_CHAR(enc, ptr, end))
          return XML_TOK_TRAILING_CR;
        /* CR LF counts as a single newline token */
        if (BYTE_TYPE(enc, ptr) == BT_LF)
          ptr += MINBPC(enc);
        *nextTokPtr = ptr;
        return XML_TOK_DATA_NEWLINE;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    case BT_S:
      if (ptr == start) {
        *nextTokPtr = ptr + MINBPC(enc);
        return XML_TOK_ATTRIBUTE_VALUE_S;
      }
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
1326
1327static int PTRCALL
1328PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1329 const char **nextTokPtr) {
1330 const char *start;
1331 if (ptr >= end)
1332 return XML_TOK_NONE;
1333 else if (! HAS_CHAR(enc, ptr, end)) {
1334 /* This line cannot be executed. The incoming data has already
1335 * been tokenized once, so incomplete characters like this have
1336 * already been eliminated from the input. Retaining the paranoia
1337 * check is still valuable, however.
1338 */
1339 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1340 }
1341 start = ptr;
1342 while (HAS_CHAR(enc, ptr, end)) {
1343 switch (BYTE_TYPE(enc, ptr)) {
1344# define LEAD_CASE(n) \
1345 case BT_LEAD##n: \
1346 ptr += n; /* NOTE: The encoding has already been validated. */ \
1347 break;
1348 LEAD_CASE(2)
1349 LEAD_CASE(3)
1350 LEAD_CASE(4)
1351# undef LEAD_CASE
1352 case BT_AMP:
1353 if (ptr == start)
1354 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1355 *nextTokPtr = ptr;
1356 return XML_TOK_DATA_CHARS;
1357 case BT_PERCNT:
1358 if (ptr == start) {
1359 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1360 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1361 }
1362 *nextTokPtr = ptr;
1363 return XML_TOK_DATA_CHARS;
1364 case BT_LF:
1365 if (ptr == start) {
1366 *nextTokPtr = ptr + MINBPC(enc);
1367 return XML_TOK_DATA_NEWLINE;
1368 }
1369 *nextTokPtr = ptr;
1370 return XML_TOK_DATA_CHARS;
1371 case BT_CR:
1372 if (ptr == start) {
1373 ptr += MINBPC(enc);
1374 if (! HAS_CHAR(enc, ptr, end))
1375 return XML_TOK_TRAILING_CR;
1376 if (BYTE_TYPE(enc, ptr) == BT_LF)
1377 ptr += MINBPC(enc);
1378 *nextTokPtr = ptr;
1379 return XML_TOK_DATA_NEWLINE;
1380 }
1381 *nextTokPtr = ptr;
1382 return XML_TOK_DATA_CHARS;
1383 default:
1384 ptr += MINBPC(enc);
1385 break;
1386 }
1387 }
1388 *nextTokPtr = ptr;
1389 return XML_TOK_DATA_CHARS;
1390}
1391
1392# ifdef XML_DTD
1393
1394static int PTRCALL
1395PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1396 const char **nextTokPtr) {
1397 int level = 0;
1398 if (MINBPC(enc) > 1) {
1399 size_t n = end - ptr;
1400 if (n & (MINBPC(enc) - 1)) {
1401 n &= ~(MINBPC(enc) - 1);
1402 end = ptr + n;
1403 }
1404 }
1405 while (HAS_CHAR(enc, ptr, end)) {
1406 switch (BYTE_TYPE(enc, ptr)) {
1407 INVALID_CASES(ptr, nextTokPtr)
1408 case BT_LT:
1409 ptr += MINBPC(enc);
1410 REQUIRE_CHAR(enc, ptr, end);
1411 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1412 ptr += MINBPC(enc);
1413 REQUIRE_CHAR(enc, ptr, end);
1414 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1415 ++level;
1416 ptr += MINBPC(enc);
1417 }
1418 }
1419 break;
1420 case BT_RSQB:
1421 ptr += MINBPC(enc);
1422 REQUIRE_CHAR(enc, ptr, end);
1423 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1424 ptr += MINBPC(enc);
1425 REQUIRE_CHAR(enc, ptr, end);
1426 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1427 ptr += MINBPC(enc);
1428 if (level == 0) {
1429 *nextTokPtr = ptr;
1430 return XML_TOK_IGNORE_SECT;
1431 }
1432 --level;
1433 }
1434 }
1435 break;
1436 default:
1437 ptr += MINBPC(enc);
1438 break;
1439 }
1440 }
1441 return XML_TOK_PARTIAL;
1442}
1443
1444# endif /* XML_DTD */
1445
1446static int PTRCALL
1447PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1448 const char **badPtr) {
1449 ptr += MINBPC(enc);
1450 end -= MINBPC(enc);
1451 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1452 switch (BYTE_TYPE(enc, ptr)) {
1453 case BT_DIGIT:
1454 case BT_HEX:
1455 case BT_MINUS:
1456 case BT_APOS:
1457 case BT_LPAR:
1458 case BT_RPAR:
1459 case BT_PLUS:
1460 case BT_COMMA:
1461 case BT_SOL:
1462 case BT_EQUALS:
1463 case BT_QUEST:
1464 case BT_CR:
1465 case BT_LF:
1466 case BT_SEMI:
1467 case BT_EXCL:
1468 case BT_AST:
1469 case BT_PERCNT:
1470 case BT_NUM:
1471# ifdef XML_NS
1472 case BT_COLON:
1473# endif
1474 break;
1475 case BT_S:
1476 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1477 *badPtr = ptr;
1478 return 0;
1479 }
1480 break;
1481 case BT_NAME:
1482 case BT_NMSTRT:
1483 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1484 break;
1485 /* fall through */
1486 default:
1487 switch (BYTE_TO_ASCII(enc, ptr)) {
1488 case 0x24: /* $ */
1489 case 0x40: /* @ */
1490 break;
1491 default:
1492 *badPtr = ptr;
1493 return 0;
1494 }
1495 break;
1496 }
1497 }
1498 return 1;
1499}
1500
1501/* This must only be called for a well-formed start-tag or empty
1502 element tag. Returns the number of attributes. Pointers to the
1503 first attsMax attributes are stored in atts.
1504*/
1505
1506static int PTRCALL
1507PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1508 ATTRIBUTE *atts) {
1509 enum { other, inName, inValue } state = inName;
1510 int nAtts = 0;
1511 int open = 0; /* defined when state == inValue;
1512 initialization just to shut up compilers */
1513
1514 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1515 switch (BYTE_TYPE(enc, ptr)) {
1516# define START_NAME \
1517 if (state == other) { \
1518 if (nAtts < attsMax) { \
1519 atts[nAtts].name = ptr; \
1520 atts[nAtts].normalized = 1; \
1521 } \
1522 state = inName; \
1523 }
1524# define LEAD_CASE(n) \
1525 case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
1526 START_NAME ptr += (n - MINBPC(enc)); \
1527 break;
1528 LEAD_CASE(2)
1529 LEAD_CASE(3)
1530 LEAD_CASE(4)
1531# undef LEAD_CASE
1532 case BT_NONASCII:
1533 case BT_NMSTRT:
1534 case BT_HEX:
1535 START_NAME
1536 break;
1537# undef START_NAME
1538 case BT_QUOT:
1539 if (state != inValue) {
1540 if (nAtts < attsMax)
1541 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1542 state = inValue;
1543 open = BT_QUOT;
1544 } else if (open == BT_QUOT) {
1545 state = other;
1546 if (nAtts < attsMax)
1547 atts[nAtts].valueEnd = ptr;
1548 nAtts++;
1549 }
1550 break;
1551 case BT_APOS:
1552 if (state != inValue) {
1553 if (nAtts < attsMax)
1554 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1555 state = inValue;
1556 open = BT_APOS;
1557 } else if (open == BT_APOS) {
1558 state = other;
1559 if (nAtts < attsMax)
1560 atts[nAtts].valueEnd = ptr;
1561 nAtts++;
1562 }
1563 break;
1564 case BT_AMP:
1565 if (nAtts < attsMax)
1566 atts[nAtts].normalized = 0;
1567 break;
1568 case BT_S:
1569 if (state == inName)
1570 state = other;
1571 else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1572 && (ptr == atts[nAtts].valuePtr
1573 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1574 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1575 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1576 atts[nAtts].normalized = 0;
1577 break;
1578 case BT_CR:
1579 case BT_LF:
1580 /* This case ensures that the first attribute name is counted
1581 Apart from that we could just change state on the quote. */
1582 if (state == inName)
1583 state = other;
1584 else if (state == inValue && nAtts < attsMax)
1585 atts[nAtts].normalized = 0;
1586 break;
1587 case BT_GT:
1588 case BT_SOL:
1589 if (state != inValue)
1590 return nAtts;
1591 break;
1592 default:
1593 break;
1594 }
1595 }
1596 /* not reached */
1597}
1598
1599static int PTRFASTCALL
1600PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1601 int result = 0;
1602 /* skip &# */
1603 UNUSED_P(enc);
1604 ptr += 2 * MINBPC(enc);
1605 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1606 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1607 ptr += MINBPC(enc)) {
1608 int c = BYTE_TO_ASCII(enc, ptr);
1609 switch (c) {
1610 case ASCII_0:
1611 case ASCII_1:
1612 case ASCII_2:
1613 case ASCII_3:
1614 case ASCII_4:
1615 case ASCII_5:
1616 case ASCII_6:
1617 case ASCII_7:
1618 case ASCII_8:
1619 case ASCII_9:
1620 result <<= 4;
1621 result |= (c - ASCII_0);
1622 break;
1623 case ASCII_A:
1624 case ASCII_B:
1625 case ASCII_C:
1626 case ASCII_D:
1627 case ASCII_E:
1628 case ASCII_F:
1629 result <<= 4;
1630 result += 10 + (c - ASCII_A);
1631 break;
1632 case ASCII_a:
1633 case ASCII_b:
1634 case ASCII_c:
1635 case ASCII_d:
1636 case ASCII_e:
1637 case ASCII_f:
1638 result <<= 4;
1639 result += 10 + (c - ASCII_a);
1640 break;
1641 }
1642 if (result >= 0x110000)
1643 return -1;
1644 }
1645 } else {
1646 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1647 int c = BYTE_TO_ASCII(enc, ptr);
1648 result *= 10;
1649 result += (c - ASCII_0);
1650 if (result >= 0x110000)
1651 return -1;
1652 }
1653 }
1654 return checkCharRefNumber(result);
1655}
1656
1657static int PTRCALL
1658PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1659 const char *end) {
1660 UNUSED_P(enc);
1661 switch ((end - ptr) / MINBPC(enc)) {
1662 case 2:
1663 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1664 switch (BYTE_TO_ASCII(enc, ptr)) {
1665 case ASCII_l:
1666 return ASCII_LT;
1667 case ASCII_g:
1668 return ASCII_GT;
1669 }
1670 }
1671 break;
1672 case 3:
1673 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1674 ptr += MINBPC(enc);
1675 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1676 ptr += MINBPC(enc);
1677 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1678 return ASCII_AMP;
1679 }
1680 }
1681 break;
1682 case 4:
1683 switch (BYTE_TO_ASCII(enc, ptr)) {
1684 case ASCII_q:
1685 ptr += MINBPC(enc);
1686 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1687 ptr += MINBPC(enc);
1688 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1689 ptr += MINBPC(enc);
1690 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1691 return ASCII_QUOT;
1692 }
1693 }
1694 break;
1695 case ASCII_a:
1696 ptr += MINBPC(enc);
1697 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1698 ptr += MINBPC(enc);
1699 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1700 ptr += MINBPC(enc);
1701 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1702 return ASCII_APOS;
1703 }
1704 }
1705 break;
1706 }
1707 }
1708 return 0;
1709}
1710
1711static int PTRCALL
1712PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1713 const char *end1, const char *ptr2) {
1714 UNUSED_P(enc);
1715 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1716 if (end1 - ptr1 < MINBPC(enc)) {
1717 /* This line cannot be executed. The incoming data has already
1718 * been tokenized once, so incomplete characters like this have
1719 * already been eliminated from the input. Retaining the
1720 * paranoia check is still valuable, however.
1721 */
1722 return 0; /* LCOV_EXCL_LINE */
1723 }
1724 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1725 return 0;
1726 }
1727 return ptr1 == end1;
1728}
1729
1730static int PTRFASTCALL
1731PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1732 const char *start = ptr;
1733 for (;;) {
1734 switch (BYTE_TYPE(enc, ptr)) {
1735# define LEAD_CASE(n) \
1736 case BT_LEAD##n: \
1737 ptr += n; /* NOTE: The encoding has already been validated. */ \
1738 break;
1739 LEAD_CASE(2)
1740 LEAD_CASE(3)
1741 LEAD_CASE(4)
1742# undef LEAD_CASE
1743 case BT_NONASCII:
1744 case BT_NMSTRT:
1745# ifdef XML_NS
1746 case BT_COLON:
1747# endif
1748 case BT_HEX:
1749 case BT_DIGIT:
1750 case BT_NAME:
1751 case BT_MINUS:
1752 ptr += MINBPC(enc);
1753 break;
1754 default:
1755 return (int)(ptr - start);
1756 }
1757 }
1758}
1759
1760static const char *PTRFASTCALL
1761PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1762 for (;;) {
1763 switch (BYTE_TYPE(enc, ptr)) {
1764 case BT_LF:
1765 case BT_CR:
1766 case BT_S:
1767 ptr += MINBPC(enc);
1768 break;
1769 default:
1770 return ptr;
1771 }
1772 }
1773}
1774
1775static void PTRCALL
1776PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1777 POSITION *pos) {
1778 while (HAS_CHAR(enc, ptr, end)) {
1779 switch (BYTE_TYPE(enc, ptr)) {
1780# define LEAD_CASE(n) \
1781 case BT_LEAD##n: \
1782 ptr += n; /* NOTE: The encoding has already been validated. */ \
1783 pos->columnNumber++; \
1784 break;
1785 LEAD_CASE(2)
1786 LEAD_CASE(3)
1787 LEAD_CASE(4)
1788# undef LEAD_CASE
1789 case BT_LF:
1790 pos->columnNumber = 0;
1791 pos->lineNumber++;
1792 ptr += MINBPC(enc);
1793 break;
1794 case BT_CR:
1795 pos->lineNumber++;
1796 ptr += MINBPC(enc);
1797 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1798 ptr += MINBPC(enc);
1799 pos->columnNumber = 0;
1800 break;
1801 default:
1802 ptr += MINBPC(enc);
1803 pos->columnNumber++;
1804 break;
1805 }
1806 }
1807}
1808
1809# undef DO_LEAD_CASE
1810# undef MULTIBYTE_CASES
1811# undef INVALID_CASES
1812# undef CHECK_NAME_CASE
1813# undef CHECK_NAME_CASES
1814# undef CHECK_NMSTRT_CASE
1815# undef CHECK_NMSTRT_CASES
1816
1817#endif /* XML_TOK_IMPL_C */
1818