1 | /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)! |
2 | __ __ _ |
3 | ___\ \/ /_ __ __ _| |_ |
4 | / _ \\ /| '_ \ / _` | __| |
5 | | __// \| |_) | (_| | |_ |
6 | \___/_/\_\ .__/ \__,_|\__| |
7 | |_| XML parser |
8 | |
9 | Copyright (c) 1997-2000 Thai Open Source Software Center Ltd |
10 | Copyright (c) 2000 Clark Cooper <[email protected]> |
11 | Copyright (c) 2002 Fred L. Drake, Jr. <[email protected]> |
12 | Copyright (c) 2002-2016 Karl Waclawek <[email protected]> |
13 | Copyright (c) 2016-2022 Sebastian Pipping <[email protected]> |
14 | Copyright (c) 2017 Rhodri James <[email protected]> |
15 | Copyright (c) 2018 Benjamin Peterson <[email protected]> |
16 | Copyright (c) 2018 Anton Maklakov <[email protected]> |
17 | Copyright (c) 2019 David Loffredo <[email protected]> |
18 | Copyright (c) 2020 Boris Kolpackov <[email protected]> |
19 | Licensed under the MIT license: |
20 | |
21 | Permission is hereby granted, free of charge, to any person obtaining |
22 | a copy of this software and associated documentation files (the |
23 | "Software"), to deal in the Software without restriction, including |
24 | without limitation the rights to use, copy, modify, merge, publish, |
25 | distribute, sublicense, and/or sell copies of the Software, and to permit |
26 | persons to whom the Software is furnished to do so, subject to the |
27 | following conditions: |
28 | |
29 | The above copyright notice and this permission notice shall be included |
30 | in all copies or substantial portions of the Software. |
31 | |
32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
33 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
34 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN |
35 | NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, |
36 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
37 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
38 | USE OR OTHER DEALINGS IN THE SOFTWARE. |
39 | */ |
40 | |
41 | #ifdef XML_TOK_IMPL_C |
42 | |
/* Fallback definition: when the including file has not supplied its own
   IS_INVALID_CHAR, no character is ever reported as invalid by this test
   (the macro is the constant 0). */
# ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
# define IS_INVALID_CHAR(enc, ptr, n) (0)
# endif
46 | |
/* Expands to the `case BT_LEAD<n>:` arm of a switch on the byte type.
   Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer than n
   bytes remain, reports XML_TOK_INVALID (with *nextTokPtr set to ptr) when
   IS_INVALID_CHAR rejects the sequence, and otherwise advances ptr by n and
   breaks out of the switch.
   Note: `enc` and `end` are not parameters; they are taken from the scope in
   which the macro is expanded. */
# define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to the switch arms shared by scanners that accept arbitrary
   character data: the 2-, 3- and 4-byte lead cases above, plus arms that
   report XML_TOK_INVALID at ptr for the BT_NONXML, BT_MALFORM and BT_TRAIL
   byte types. */
# define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
67 | |
/* Expands to the `case BT_LEAD<n>:` arm used while scanning a name.
   Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain; reports
   XML_TOK_INVALID at ptr when the sequence is invalid or is not a name
   character; otherwise advances ptr by n and breaks out of the switch. */
# define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to all switch arms that consume one name character.
   BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC and then shares the
   arm of the byte types that are always accepted here (BT_NMSTRT, BT_HEX,
   BT_DIGIT, BT_NAME, BT_MINUS), which advances ptr by MINBPC(enc).  The
   2-, 3- and 4-byte lead cases follow.  Every accepting arm ends in `break`,
   so the macro must be expanded directly inside a switch. */
# define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
96 | |
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain; reports
   XML_TOK_INVALID at ptr when the sequence is invalid or is not a name-start
   character; otherwise advances ptr by n and breaks out of the switch. */
# define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD##n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Expands to all switch arms that consume one name-start character.
   BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC and then shares the
   arm of BT_NMSTRT and BT_HEX, which advances ptr by MINBPC(enc).  The 2-, 3-
   and 4-byte lead cases follow.  Every accepting arm ends in `break`, so the
   macro must be expanded directly inside a switch. */
# define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
122 | |
/* PREFIX(ident) wraps the name of every function defined in this file.  If
   the including file did not define it, the names are used unchanged. */
# ifndef PREFIX
# define PREFIX(ident) ident
# endif
126 | |
/* True when at least `count` minimum-size units, i.e. count * MINBPC(enc)
   bytes, remain in [ptr, end).  Every argument is parenthesized so that an
   expression argument (for example `n + 1`) is multiplied as a whole instead
   of being split by operator precedence. */
# define HAS_CHARS(enc, ptr, end, count) \
  ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size unit remains in [ptr, end). */
# define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless at least
   `count` minimum-size units remain in [ptr, end).  Expands to a brace block,
   so it is written as a statement followed by `;`. */
# define REQUIRE_CHARS(enc, ptr, end, count) \
  { \
    if (! HAS_CHARS(enc, ptr, end, count)) { \
      return XML_TOK_PARTIAL; \
    } \
  }

/* Single-unit form of REQUIRE_CHARS. */
# define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
139 | |
140 | /* ptr points to character following "<!-" */ |
141 | |
142 | static int PTRCALL |
143 | PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end, |
144 | const char **nextTokPtr) { |
145 | if (HAS_CHAR(enc, ptr, end)) { |
146 | if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { |
147 | *nextTokPtr = ptr; |
148 | return XML_TOK_INVALID; |
149 | } |
150 | ptr += MINBPC(enc); |
151 | while (HAS_CHAR(enc, ptr, end)) { |
152 | switch (BYTE_TYPE(enc, ptr)) { |
153 | INVALID_CASES(ptr, nextTokPtr) |
154 | case BT_MINUS: |
155 | ptr += MINBPC(enc); |
156 | REQUIRE_CHAR(enc, ptr, end); |
157 | if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { |
158 | ptr += MINBPC(enc); |
159 | REQUIRE_CHAR(enc, ptr, end); |
160 | if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
161 | *nextTokPtr = ptr; |
162 | return XML_TOK_INVALID; |
163 | } |
164 | *nextTokPtr = ptr + MINBPC(enc); |
165 | return XML_TOK_COMMENT; |
166 | } |
167 | break; |
168 | default: |
169 | ptr += MINBPC(enc); |
170 | break; |
171 | } |
172 | } |
173 | } |
174 | return XML_TOK_PARTIAL; |
175 | } |
176 | |
177 | /* ptr points to character following "<!" */ |
178 | |
179 | static int PTRCALL |
180 | PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end, |
181 | const char **nextTokPtr) { |
182 | REQUIRE_CHAR(enc, ptr, end); |
183 | switch (BYTE_TYPE(enc, ptr)) { |
184 | case BT_MINUS: |
185 | return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
186 | case BT_LSQB: |
187 | *nextTokPtr = ptr + MINBPC(enc); |
188 | return XML_TOK_COND_SECT_OPEN; |
189 | case BT_NMSTRT: |
190 | case BT_HEX: |
191 | ptr += MINBPC(enc); |
192 | break; |
193 | default: |
194 | *nextTokPtr = ptr; |
195 | return XML_TOK_INVALID; |
196 | } |
197 | while (HAS_CHAR(enc, ptr, end)) { |
198 | switch (BYTE_TYPE(enc, ptr)) { |
199 | case BT_PERCNT: |
200 | REQUIRE_CHARS(enc, ptr, end, 2); |
201 | /* don't allow <!ENTITY% foo "whatever"> */ |
202 | switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { |
203 | case BT_S: |
204 | case BT_CR: |
205 | case BT_LF: |
206 | case BT_PERCNT: |
207 | *nextTokPtr = ptr; |
208 | return XML_TOK_INVALID; |
209 | } |
210 | /* fall through */ |
211 | case BT_S: |
212 | case BT_CR: |
213 | case BT_LF: |
214 | *nextTokPtr = ptr; |
215 | return XML_TOK_DECL_OPEN; |
216 | case BT_NMSTRT: |
217 | case BT_HEX: |
218 | ptr += MINBPC(enc); |
219 | break; |
220 | default: |
221 | *nextTokPtr = ptr; |
222 | return XML_TOK_INVALID; |
223 | } |
224 | } |
225 | return XML_TOK_PARTIAL; |
226 | } |
227 | |
228 | static int PTRCALL |
229 | PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, |
230 | int *tokPtr) { |
231 | int upper = 0; |
232 | UNUSED_P(enc); |
233 | *tokPtr = XML_TOK_PI; |
234 | if (end - ptr != MINBPC(enc) * 3) |
235 | return 1; |
236 | switch (BYTE_TO_ASCII(enc, ptr)) { |
237 | case ASCII_x: |
238 | break; |
239 | case ASCII_X: |
240 | upper = 1; |
241 | break; |
242 | default: |
243 | return 1; |
244 | } |
245 | ptr += MINBPC(enc); |
246 | switch (BYTE_TO_ASCII(enc, ptr)) { |
247 | case ASCII_m: |
248 | break; |
249 | case ASCII_M: |
250 | upper = 1; |
251 | break; |
252 | default: |
253 | return 1; |
254 | } |
255 | ptr += MINBPC(enc); |
256 | switch (BYTE_TO_ASCII(enc, ptr)) { |
257 | case ASCII_l: |
258 | break; |
259 | case ASCII_L: |
260 | upper = 1; |
261 | break; |
262 | default: |
263 | return 1; |
264 | } |
265 | if (upper) |
266 | return 0; |
267 | *tokPtr = XML_TOK_XML_DECL; |
268 | return 1; |
269 | } |
270 | |
271 | /* ptr points to character following "<?" */ |
272 | |
273 | static int PTRCALL |
274 | PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end, |
275 | const char **nextTokPtr) { |
276 | int tok; |
277 | const char *target = ptr; |
278 | REQUIRE_CHAR(enc, ptr, end); |
279 | switch (BYTE_TYPE(enc, ptr)) { |
280 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
281 | default: |
282 | *nextTokPtr = ptr; |
283 | return XML_TOK_INVALID; |
284 | } |
285 | while (HAS_CHAR(enc, ptr, end)) { |
286 | switch (BYTE_TYPE(enc, ptr)) { |
287 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
288 | case BT_S: |
289 | case BT_CR: |
290 | case BT_LF: |
291 | if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { |
292 | *nextTokPtr = ptr; |
293 | return XML_TOK_INVALID; |
294 | } |
295 | ptr += MINBPC(enc); |
296 | while (HAS_CHAR(enc, ptr, end)) { |
297 | switch (BYTE_TYPE(enc, ptr)) { |
298 | INVALID_CASES(ptr, nextTokPtr) |
299 | case BT_QUEST: |
300 | ptr += MINBPC(enc); |
301 | REQUIRE_CHAR(enc, ptr, end); |
302 | if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
303 | *nextTokPtr = ptr + MINBPC(enc); |
304 | return tok; |
305 | } |
306 | break; |
307 | default: |
308 | ptr += MINBPC(enc); |
309 | break; |
310 | } |
311 | } |
312 | return XML_TOK_PARTIAL; |
313 | case BT_QUEST: |
314 | if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { |
315 | *nextTokPtr = ptr; |
316 | return XML_TOK_INVALID; |
317 | } |
318 | ptr += MINBPC(enc); |
319 | REQUIRE_CHAR(enc, ptr, end); |
320 | if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
321 | *nextTokPtr = ptr + MINBPC(enc); |
322 | return tok; |
323 | } |
324 | /* fall through */ |
325 | default: |
326 | *nextTokPtr = ptr; |
327 | return XML_TOK_INVALID; |
328 | } |
329 | } |
330 | return XML_TOK_PARTIAL; |
331 | } |
332 | |
333 | static int PTRCALL |
334 | PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end, |
335 | const char **nextTokPtr) { |
336 | static const char CDATA_LSQB[] |
337 | = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB}; |
338 | int i; |
339 | UNUSED_P(enc); |
340 | /* CDATA[ */ |
341 | REQUIRE_CHARS(enc, ptr, end, 6); |
342 | for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { |
343 | if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { |
344 | *nextTokPtr = ptr; |
345 | return XML_TOK_INVALID; |
346 | } |
347 | } |
348 | *nextTokPtr = ptr; |
349 | return XML_TOK_CDATA_SECT_OPEN; |
350 | } |
351 | |
/* Returns the next token inside a CDATA section, scanning [ptr, end).

   - XML_TOK_NONE when the range is empty.
   - XML_TOK_CDATA_SECT_CLOSE for "]]>", with *nextTokPtr just past it.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF, with *nextTokPtr just past it.
   - XML_TOK_INVALID at an invalid first character (see INVALID_CASES).
   - XML_TOK_DATA_CHARS for a run of ordinary characters; the run stops
     before a CR, LF, ']' or an invalid/incomplete character.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed to
     classify the first character(s). */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Ignore trailing bytes that do not form a whole minimum-size unit.
       NOTE(review): the masking presumes MINBPC(enc) is a power of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break; /* single ']' is data; continue with the run below */
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back onto the second ']' so that the run
         below ends there and it is re-examined by the next call */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    /* CR LF is reported as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* At least one data character has been consumed; extend the run. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Within the run, an incomplete or invalid multi-byte sequence ends the run
   (returning the data seen so far) instead of being reported immediately. */
# define LEAD_CASE(n) \
  case BT_LEAD##n: \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_DATA_CHARS; \
    } \
    ptr += n; \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
# undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
425 | |
426 | /* ptr points to character following "</" */ |
427 | |
428 | static int PTRCALL |
429 | PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end, |
430 | const char **nextTokPtr) { |
431 | REQUIRE_CHAR(enc, ptr, end); |
432 | switch (BYTE_TYPE(enc, ptr)) { |
433 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
434 | default: |
435 | *nextTokPtr = ptr; |
436 | return XML_TOK_INVALID; |
437 | } |
438 | while (HAS_CHAR(enc, ptr, end)) { |
439 | switch (BYTE_TYPE(enc, ptr)) { |
440 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
441 | case BT_S: |
442 | case BT_CR: |
443 | case BT_LF: |
444 | for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { |
445 | switch (BYTE_TYPE(enc, ptr)) { |
446 | case BT_S: |
447 | case BT_CR: |
448 | case BT_LF: |
449 | break; |
450 | case BT_GT: |
451 | *nextTokPtr = ptr + MINBPC(enc); |
452 | return XML_TOK_END_TAG; |
453 | default: |
454 | *nextTokPtr = ptr; |
455 | return XML_TOK_INVALID; |
456 | } |
457 | } |
458 | return XML_TOK_PARTIAL; |
459 | # ifdef XML_NS |
460 | case BT_COLON: |
461 | /* no need to check qname syntax here, |
462 | since end-tag must match exactly */ |
463 | ptr += MINBPC(enc); |
464 | break; |
465 | # endif |
466 | case BT_GT: |
467 | *nextTokPtr = ptr + MINBPC(enc); |
468 | return XML_TOK_END_TAG; |
469 | default: |
470 | *nextTokPtr = ptr; |
471 | return XML_TOK_INVALID; |
472 | } |
473 | } |
474 | return XML_TOK_PARTIAL; |
475 | } |
476 | |
477 | /* ptr points to character following "&#X" */ |
478 | |
479 | static int PTRCALL |
480 | PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end, |
481 | const char **nextTokPtr) { |
482 | if (HAS_CHAR(enc, ptr, end)) { |
483 | switch (BYTE_TYPE(enc, ptr)) { |
484 | case BT_DIGIT: |
485 | case BT_HEX: |
486 | break; |
487 | default: |
488 | *nextTokPtr = ptr; |
489 | return XML_TOK_INVALID; |
490 | } |
491 | for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { |
492 | switch (BYTE_TYPE(enc, ptr)) { |
493 | case BT_DIGIT: |
494 | case BT_HEX: |
495 | break; |
496 | case BT_SEMI: |
497 | *nextTokPtr = ptr + MINBPC(enc); |
498 | return XML_TOK_CHAR_REF; |
499 | default: |
500 | *nextTokPtr = ptr; |
501 | return XML_TOK_INVALID; |
502 | } |
503 | } |
504 | } |
505 | return XML_TOK_PARTIAL; |
506 | } |
507 | |
508 | /* ptr points to character following "&#" */ |
509 | |
510 | static int PTRCALL |
511 | PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end, |
512 | const char **nextTokPtr) { |
513 | if (HAS_CHAR(enc, ptr, end)) { |
514 | if (CHAR_MATCHES(enc, ptr, ASCII_x)) |
515 | return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
516 | switch (BYTE_TYPE(enc, ptr)) { |
517 | case BT_DIGIT: |
518 | break; |
519 | default: |
520 | *nextTokPtr = ptr; |
521 | return XML_TOK_INVALID; |
522 | } |
523 | for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { |
524 | switch (BYTE_TYPE(enc, ptr)) { |
525 | case BT_DIGIT: |
526 | break; |
527 | case BT_SEMI: |
528 | *nextTokPtr = ptr + MINBPC(enc); |
529 | return XML_TOK_CHAR_REF; |
530 | default: |
531 | *nextTokPtr = ptr; |
532 | return XML_TOK_INVALID; |
533 | } |
534 | } |
535 | } |
536 | return XML_TOK_PARTIAL; |
537 | } |
538 | |
539 | /* ptr points to character following "&" */ |
540 | |
541 | static int PTRCALL |
542 | PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, |
543 | const char **nextTokPtr) { |
544 | REQUIRE_CHAR(enc, ptr, end); |
545 | switch (BYTE_TYPE(enc, ptr)) { |
546 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
547 | case BT_NUM: |
548 | return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
549 | default: |
550 | *nextTokPtr = ptr; |
551 | return XML_TOK_INVALID; |
552 | } |
553 | while (HAS_CHAR(enc, ptr, end)) { |
554 | switch (BYTE_TYPE(enc, ptr)) { |
555 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
556 | case BT_SEMI: |
557 | *nextTokPtr = ptr + MINBPC(enc); |
558 | return XML_TOK_ENTITY_REF; |
559 | default: |
560 | *nextTokPtr = ptr; |
561 | return XML_TOK_INVALID; |
562 | } |
563 | } |
564 | return XML_TOK_PARTIAL; |
565 | } |
566 | |
/* Scans the attributes of a start tag through to the end of the tag.
   ptr points to character following first character of attribute name.

   Returns XML_TOK_START_TAG_WITH_ATTS (tag closed by '>') or
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS (tag closed by "/>"), with *nextTokPtr just
   past the closing '>'.  Returns XML_TOK_INVALID with *nextTokPtr at the
   offending character, or XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the
   input ends before the tag is complete.  A non-positive result from scanRef
   on a reference inside an attribute value is passed through unchanged. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr) {
# ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
# endif
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      /* remaining characters of the attribute name */
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
# ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
# endif
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* whitespace after the name: only more whitespace or '=' may follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_EQUALS: {
      /* byte type of the opening quote (BT_QUOT or BT_APOS) */
      int open;
# ifdef XML_NS
      hadColon = 0;
# endif
      /* skip whitespace between '=' and the opening quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        open = BYTE_TYPE(enc, ptr);
        if (open == BT_QUOT || open == BT_APOS)
          break;
        switch (open) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      ptr += MINBPC(enc);
      /* in attribute value */
      for (;;) {
        int t;
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        /* a quote of the same kind as the opening one ends the value */
        if (t == open)
          break;
        switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_AMP: {
          /* scanRef stores its "next" position straight into ptr, so on
             success scanning resumes after the reference */
          int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
          if (tok <= 0) {
            if (tok == XML_TOK_INVALID)
              *nextTokPtr = ptr;
            return tok;
          }
          break;
        }
        case BT_LT:
          /* '<' is not allowed inside an attribute value */
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      /* the closing quote must be followed by whitespace, '/' or '>' */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
        break;
      case BT_SOL:
        goto sol;
      case BT_GT:
        goto gt;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      /* ptr points to closing quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
          /* first character of the next attribute name: consumed here, then
             the two breaks below return control to the outer while loop */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_S:
        case BT_CR:
        case BT_LF:
          continue;
        case BT_GT:
        gt:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_START_TAG_WITH_ATTS;
        case BT_SOL:
        sol:
          /* '/' must be immediately followed by '>' */
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        break;
      }
      break;
    }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
719 | |
/* Scans markup in content; ptr points to character following "<".

   Dispatches on the next character: "<!-" goes to scanComment, "<![" to
   scanCdataSection, "<?" to scanPi and "</" to scanEndTag.  A name-start
   character begins a start tag, which is scanned here up to '>' or "/>"
   (XML_TOK_START_TAG_NO_ATTS / XML_TOK_EMPTY_ELEMENT_NO_ATTS) or handed to
   scanAtts once the first character of an attribute name has been consumed.
   Returns XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when the input ends first. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
# ifdef XML_NS
  /* set once the element name has contained a ':' */
  int hadColon;
# endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    /* first character of an element name: continue below */
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
# ifdef XML_NS
  hadColon = 0;
# endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
# ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
# endif
    case BT_S:
    case BT_CR:
    case BT_LF: {
      /* whitespace after the element name: skip it, then expect '>', "/>"
         or the start of an attribute name */
      ptr += MINBPC(enc);
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character is consumed and breaks out of the switch,
             reaching the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_GT:
          goto gt;
        case BT_SOL:
          goto sol;
        case BT_S:
        case BT_CR:
        case BT_LF:
          ptr += MINBPC(enc);
          /* `continue` skips the scanAtts call and keeps skipping whitespace */
          continue;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
      }
      return XML_TOK_PARTIAL;
    }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>' */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
819 | |
/* Returns the next token in element content, scanning [ptr, end).

   - XML_TOK_NONE when the range is empty.
   - '<' and '&' are delegated to scanLt and scanRef.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF; XML_TOK_TRAILING_CR when a CR
     is the last available character (*nextTokPtr is not set in that case).
   - XML_TOK_TRAILING_RSQB when the input ends after "]" or "]]"
     (*nextTokPtr is not set in that case).
   - XML_TOK_INVALID for "]]>" (with *nextTokPtr at the '>') and for an
     invalid first character.
   - XML_TOK_DATA_CHARS for a run of ordinary characters; the run stops
     before '&', '<', CR, LF, an invalid/incomplete character, or a ']' whose
     following characters cannot yet rule out "]]>".
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Ignore trailing bytes that do not form a whole minimum-size unit.
       NOTE(review): the masking presumes MINBPC(enc) is a power of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_CR;
    /* CR LF is reported as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break; /* single ']' is data; continue with the run below */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back onto the second ']' so that the run
         below re-examines it */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is rejected in content */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* At least one data character has been consumed; extend the run. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Within the run, an incomplete or invalid multi-byte sequence ends the run
   (returning the data seen so far) instead of being reported immediately. */
# define LEAD_CASE(n) \
  case BT_LEAD##n: \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_DATA_CHARS; \
    } \
    ptr += n; \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
# undef LEAD_CASE
    case BT_RSQB:
      /* Look ahead for "]]>".  If the lookahead proves this ']' harmless it
         is consumed as data; if there is not enough input to decide, fall
         through and end the run before the ']'. */
      if (HAS_CHARS(enc, ptr, end, 2)) {
        if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
          ptr += MINBPC(enc);
          break;
        }
        if (HAS_CHARS(enc, ptr, end, 3)) {
          if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
            ptr += MINBPC(enc);
            break;
          }
          *nextTokPtr = ptr + 2 * MINBPC(enc);
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
917 | |
918 | /* ptr points to character following "%" */ |
919 | |
920 | static int PTRCALL |
921 | PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, |
922 | const char **nextTokPtr) { |
923 | REQUIRE_CHAR(enc, ptr, end); |
924 | switch (BYTE_TYPE(enc, ptr)) { |
925 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
926 | case BT_S: |
927 | case BT_LF: |
928 | case BT_CR: |
929 | case BT_PERCNT: |
930 | *nextTokPtr = ptr; |
931 | return XML_TOK_PERCENT; |
932 | default: |
933 | *nextTokPtr = ptr; |
934 | return XML_TOK_INVALID; |
935 | } |
936 | while (HAS_CHAR(enc, ptr, end)) { |
937 | switch (BYTE_TYPE(enc, ptr)) { |
938 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
939 | case BT_SEMI: |
940 | *nextTokPtr = ptr + MINBPC(enc); |
941 | return XML_TOK_PARAM_ENTITY_REF; |
942 | default: |
943 | *nextTokPtr = ptr; |
944 | return XML_TOK_INVALID; |
945 | } |
946 | } |
947 | return XML_TOK_PARTIAL; |
948 | } |
949 | |
950 | static int PTRCALL |
951 | PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, |
952 | const char **nextTokPtr) { |
953 | REQUIRE_CHAR(enc, ptr, end); |
954 | switch (BYTE_TYPE(enc, ptr)) { |
955 | CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) |
956 | default: |
957 | *nextTokPtr = ptr; |
958 | return XML_TOK_INVALID; |
959 | } |
960 | while (HAS_CHAR(enc, ptr, end)) { |
961 | switch (BYTE_TYPE(enc, ptr)) { |
962 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
963 | case BT_CR: |
964 | case BT_LF: |
965 | case BT_S: |
966 | case BT_RPAR: |
967 | case BT_GT: |
968 | case BT_PERCNT: |
969 | case BT_VERBAR: |
970 | *nextTokPtr = ptr; |
971 | return XML_TOK_POUND_NAME; |
972 | default: |
973 | *nextTokPtr = ptr; |
974 | return XML_TOK_INVALID; |
975 | } |
976 | } |
977 | return -XML_TOK_POUND_NAME; |
978 | } |
979 | |
980 | static int PTRCALL |
981 | PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end, |
982 | const char **nextTokPtr) { |
983 | while (HAS_CHAR(enc, ptr, end)) { |
984 | int t = BYTE_TYPE(enc, ptr); |
985 | switch (t) { |
986 | INVALID_CASES(ptr, nextTokPtr) |
987 | case BT_QUOT: |
988 | case BT_APOS: |
989 | ptr += MINBPC(enc); |
990 | if (t != open) |
991 | break; |
992 | if (! HAS_CHAR(enc, ptr, end)) |
993 | return -XML_TOK_LITERAL; |
994 | *nextTokPtr = ptr; |
995 | switch (BYTE_TYPE(enc, ptr)) { |
996 | case BT_S: |
997 | case BT_CR: |
998 | case BT_LF: |
999 | case BT_GT: |
1000 | case BT_PERCNT: |
1001 | case BT_LSQB: |
1002 | return XML_TOK_LITERAL; |
1003 | default: |
1004 | return XML_TOK_INVALID; |
1005 | } |
1006 | default: |
1007 | ptr += MINBPC(enc); |
1008 | break; |
1009 | } |
1010 | } |
1011 | return XML_TOK_PARTIAL; |
1012 | } |
1013 | |
1014 | static int PTRCALL |
1015 | PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, |
1016 | const char **nextTokPtr) { |
1017 | int tok; |
1018 | if (ptr >= end) |
1019 | return XML_TOK_NONE; |
1020 | if (MINBPC(enc) > 1) { |
1021 | size_t n = end - ptr; |
1022 | if (n & (MINBPC(enc) - 1)) { |
1023 | n &= ~(MINBPC(enc) - 1); |
1024 | if (n == 0) |
1025 | return XML_TOK_PARTIAL; |
1026 | end = ptr + n; |
1027 | } |
1028 | } |
1029 | switch (BYTE_TYPE(enc, ptr)) { |
1030 | case BT_QUOT: |
1031 | return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); |
1032 | case BT_APOS: |
1033 | return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); |
1034 | case BT_LT: { |
1035 | ptr += MINBPC(enc); |
1036 | REQUIRE_CHAR(enc, ptr, end); |
1037 | switch (BYTE_TYPE(enc, ptr)) { |
1038 | case BT_EXCL: |
1039 | return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1040 | case BT_QUEST: |
1041 | return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1042 | case BT_NMSTRT: |
1043 | case BT_HEX: |
1044 | case BT_NONASCII: |
1045 | case BT_LEAD2: |
1046 | case BT_LEAD3: |
1047 | case BT_LEAD4: |
1048 | *nextTokPtr = ptr - MINBPC(enc); |
1049 | return XML_TOK_INSTANCE_START; |
1050 | } |
1051 | *nextTokPtr = ptr; |
1052 | return XML_TOK_INVALID; |
1053 | } |
1054 | case BT_CR: |
1055 | if (ptr + MINBPC(enc) == end) { |
1056 | *nextTokPtr = end; |
1057 | /* indicate that this might be part of a CR/LF pair */ |
1058 | return -XML_TOK_PROLOG_S; |
1059 | } |
1060 | /* fall through */ |
1061 | case BT_S: |
1062 | case BT_LF: |
1063 | for (;;) { |
1064 | ptr += MINBPC(enc); |
1065 | if (! HAS_CHAR(enc, ptr, end)) |
1066 | break; |
1067 | switch (BYTE_TYPE(enc, ptr)) { |
1068 | case BT_S: |
1069 | case BT_LF: |
1070 | break; |
1071 | case BT_CR: |
1072 | /* don't split CR/LF pair */ |
1073 | if (ptr + MINBPC(enc) != end) |
1074 | break; |
1075 | /* fall through */ |
1076 | default: |
1077 | *nextTokPtr = ptr; |
1078 | return XML_TOK_PROLOG_S; |
1079 | } |
1080 | } |
1081 | *nextTokPtr = ptr; |
1082 | return XML_TOK_PROLOG_S; |
1083 | case BT_PERCNT: |
1084 | return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1085 | case BT_COMMA: |
1086 | *nextTokPtr = ptr + MINBPC(enc); |
1087 | return XML_TOK_COMMA; |
1088 | case BT_LSQB: |
1089 | *nextTokPtr = ptr + MINBPC(enc); |
1090 | return XML_TOK_OPEN_BRACKET; |
1091 | case BT_RSQB: |
1092 | ptr += MINBPC(enc); |
1093 | if (! HAS_CHAR(enc, ptr, end)) |
1094 | return -XML_TOK_CLOSE_BRACKET; |
1095 | if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { |
1096 | REQUIRE_CHARS(enc, ptr, end, 2); |
1097 | if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { |
1098 | *nextTokPtr = ptr + 2 * MINBPC(enc); |
1099 | return XML_TOK_COND_SECT_CLOSE; |
1100 | } |
1101 | } |
1102 | *nextTokPtr = ptr; |
1103 | return XML_TOK_CLOSE_BRACKET; |
1104 | case BT_LPAR: |
1105 | *nextTokPtr = ptr + MINBPC(enc); |
1106 | return XML_TOK_OPEN_PAREN; |
1107 | case BT_RPAR: |
1108 | ptr += MINBPC(enc); |
1109 | if (! HAS_CHAR(enc, ptr, end)) |
1110 | return -XML_TOK_CLOSE_PAREN; |
1111 | switch (BYTE_TYPE(enc, ptr)) { |
1112 | case BT_AST: |
1113 | *nextTokPtr = ptr + MINBPC(enc); |
1114 | return XML_TOK_CLOSE_PAREN_ASTERISK; |
1115 | case BT_QUEST: |
1116 | *nextTokPtr = ptr + MINBPC(enc); |
1117 | return XML_TOK_CLOSE_PAREN_QUESTION; |
1118 | case BT_PLUS: |
1119 | *nextTokPtr = ptr + MINBPC(enc); |
1120 | return XML_TOK_CLOSE_PAREN_PLUS; |
1121 | case BT_CR: |
1122 | case BT_LF: |
1123 | case BT_S: |
1124 | case BT_GT: |
1125 | case BT_COMMA: |
1126 | case BT_VERBAR: |
1127 | case BT_RPAR: |
1128 | *nextTokPtr = ptr; |
1129 | return XML_TOK_CLOSE_PAREN; |
1130 | } |
1131 | *nextTokPtr = ptr; |
1132 | return XML_TOK_INVALID; |
1133 | case BT_VERBAR: |
1134 | *nextTokPtr = ptr + MINBPC(enc); |
1135 | return XML_TOK_OR; |
1136 | case BT_GT: |
1137 | *nextTokPtr = ptr + MINBPC(enc); |
1138 | return XML_TOK_DECL_CLOSE; |
1139 | case BT_NUM: |
1140 | return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1141 | # define LEAD_CASE(n) \ |
1142 | case BT_LEAD##n: \ |
1143 | if (end - ptr < n) \ |
1144 | return XML_TOK_PARTIAL_CHAR; \ |
1145 | if (IS_INVALID_CHAR(enc, ptr, n)) { \ |
1146 | *nextTokPtr = ptr; \ |
1147 | return XML_TOK_INVALID; \ |
1148 | } \ |
1149 | if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ |
1150 | ptr += n; \ |
1151 | tok = XML_TOK_NAME; \ |
1152 | break; \ |
1153 | } \ |
1154 | if (IS_NAME_CHAR(enc, ptr, n)) { \ |
1155 | ptr += n; \ |
1156 | tok = XML_TOK_NMTOKEN; \ |
1157 | break; \ |
1158 | } \ |
1159 | *nextTokPtr = ptr; \ |
1160 | return XML_TOK_INVALID; |
1161 | LEAD_CASE(2) |
1162 | LEAD_CASE(3) |
1163 | LEAD_CASE(4) |
1164 | # undef LEAD_CASE |
1165 | case BT_NMSTRT: |
1166 | case BT_HEX: |
1167 | tok = XML_TOK_NAME; |
1168 | ptr += MINBPC(enc); |
1169 | break; |
1170 | case BT_DIGIT: |
1171 | case BT_NAME: |
1172 | case BT_MINUS: |
1173 | # ifdef XML_NS |
1174 | case BT_COLON: |
1175 | # endif |
1176 | tok = XML_TOK_NMTOKEN; |
1177 | ptr += MINBPC(enc); |
1178 | break; |
1179 | case BT_NONASCII: |
1180 | if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { |
1181 | ptr += MINBPC(enc); |
1182 | tok = XML_TOK_NAME; |
1183 | break; |
1184 | } |
1185 | if (IS_NAME_CHAR_MINBPC(enc, ptr)) { |
1186 | ptr += MINBPC(enc); |
1187 | tok = XML_TOK_NMTOKEN; |
1188 | break; |
1189 | } |
1190 | /* fall through */ |
1191 | default: |
1192 | *nextTokPtr = ptr; |
1193 | return XML_TOK_INVALID; |
1194 | } |
1195 | while (HAS_CHAR(enc, ptr, end)) { |
1196 | switch (BYTE_TYPE(enc, ptr)) { |
1197 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
1198 | case BT_GT: |
1199 | case BT_RPAR: |
1200 | case BT_COMMA: |
1201 | case BT_VERBAR: |
1202 | case BT_LSQB: |
1203 | case BT_PERCNT: |
1204 | case BT_S: |
1205 | case BT_CR: |
1206 | case BT_LF: |
1207 | *nextTokPtr = ptr; |
1208 | return tok; |
1209 | # ifdef XML_NS |
1210 | case BT_COLON: |
1211 | ptr += MINBPC(enc); |
1212 | switch (tok) { |
1213 | case XML_TOK_NAME: |
1214 | REQUIRE_CHAR(enc, ptr, end); |
1215 | tok = XML_TOK_PREFIXED_NAME; |
1216 | switch (BYTE_TYPE(enc, ptr)) { |
1217 | CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) |
1218 | default: |
1219 | tok = XML_TOK_NMTOKEN; |
1220 | break; |
1221 | } |
1222 | break; |
1223 | case XML_TOK_PREFIXED_NAME: |
1224 | tok = XML_TOK_NMTOKEN; |
1225 | break; |
1226 | } |
1227 | break; |
1228 | # endif |
1229 | case BT_PLUS: |
1230 | if (tok == XML_TOK_NMTOKEN) { |
1231 | *nextTokPtr = ptr; |
1232 | return XML_TOK_INVALID; |
1233 | } |
1234 | *nextTokPtr = ptr + MINBPC(enc); |
1235 | return XML_TOK_NAME_PLUS; |
1236 | case BT_AST: |
1237 | if (tok == XML_TOK_NMTOKEN) { |
1238 | *nextTokPtr = ptr; |
1239 | return XML_TOK_INVALID; |
1240 | } |
1241 | *nextTokPtr = ptr + MINBPC(enc); |
1242 | return XML_TOK_NAME_ASTERISK; |
1243 | case BT_QUEST: |
1244 | if (tok == XML_TOK_NMTOKEN) { |
1245 | *nextTokPtr = ptr; |
1246 | return XML_TOK_INVALID; |
1247 | } |
1248 | *nextTokPtr = ptr + MINBPC(enc); |
1249 | return XML_TOK_NAME_QUESTION; |
1250 | default: |
1251 | *nextTokPtr = ptr; |
1252 | return XML_TOK_INVALID; |
1253 | } |
1254 | } |
1255 | return -tok; |
1256 | } |
1257 | |
1258 | static int PTRCALL |
1259 | PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, |
1260 | const char **nextTokPtr) { |
1261 | const char *start; |
1262 | if (ptr >= end) |
1263 | return XML_TOK_NONE; |
1264 | else if (! HAS_CHAR(enc, ptr, end)) { |
1265 | /* This line cannot be executed. The incoming data has already |
1266 | * been tokenized once, so incomplete characters like this have |
1267 | * already been eliminated from the input. Retaining the paranoia |
1268 | * check is still valuable, however. |
1269 | */ |
1270 | return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */ |
1271 | } |
1272 | start = ptr; |
1273 | while (HAS_CHAR(enc, ptr, end)) { |
1274 | switch (BYTE_TYPE(enc, ptr)) { |
1275 | # define LEAD_CASE(n) \ |
1276 | case BT_LEAD##n: \ |
1277 | ptr += n; /* NOTE: The encoding has already been validated. */ \ |
1278 | break; |
1279 | LEAD_CASE(2) |
1280 | LEAD_CASE(3) |
1281 | LEAD_CASE(4) |
1282 | # undef LEAD_CASE |
1283 | case BT_AMP: |
1284 | if (ptr == start) |
1285 | return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1286 | *nextTokPtr = ptr; |
1287 | return XML_TOK_DATA_CHARS; |
1288 | case BT_LT: |
1289 | /* this is for inside entity references */ |
1290 | *nextTokPtr = ptr; |
1291 | return XML_TOK_INVALID; |
1292 | case BT_LF: |
1293 | if (ptr == start) { |
1294 | *nextTokPtr = ptr + MINBPC(enc); |
1295 | return XML_TOK_DATA_NEWLINE; |
1296 | } |
1297 | *nextTokPtr = ptr; |
1298 | return XML_TOK_DATA_CHARS; |
1299 | case BT_CR: |
1300 | if (ptr == start) { |
1301 | ptr += MINBPC(enc); |
1302 | if (! HAS_CHAR(enc, ptr, end)) |
1303 | return XML_TOK_TRAILING_CR; |
1304 | if (BYTE_TYPE(enc, ptr) == BT_LF) |
1305 | ptr += MINBPC(enc); |
1306 | *nextTokPtr = ptr; |
1307 | return XML_TOK_DATA_NEWLINE; |
1308 | } |
1309 | *nextTokPtr = ptr; |
1310 | return XML_TOK_DATA_CHARS; |
1311 | case BT_S: |
1312 | if (ptr == start) { |
1313 | *nextTokPtr = ptr + MINBPC(enc); |
1314 | return XML_TOK_ATTRIBUTE_VALUE_S; |
1315 | } |
1316 | *nextTokPtr = ptr; |
1317 | return XML_TOK_DATA_CHARS; |
1318 | default: |
1319 | ptr += MINBPC(enc); |
1320 | break; |
1321 | } |
1322 | } |
1323 | *nextTokPtr = ptr; |
1324 | return XML_TOK_DATA_CHARS; |
1325 | } |
1326 | |
1327 | static int PTRCALL |
1328 | PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, |
1329 | const char **nextTokPtr) { |
1330 | const char *start; |
1331 | if (ptr >= end) |
1332 | return XML_TOK_NONE; |
1333 | else if (! HAS_CHAR(enc, ptr, end)) { |
1334 | /* This line cannot be executed. The incoming data has already |
1335 | * been tokenized once, so incomplete characters like this have |
1336 | * already been eliminated from the input. Retaining the paranoia |
1337 | * check is still valuable, however. |
1338 | */ |
1339 | return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */ |
1340 | } |
1341 | start = ptr; |
1342 | while (HAS_CHAR(enc, ptr, end)) { |
1343 | switch (BYTE_TYPE(enc, ptr)) { |
1344 | # define LEAD_CASE(n) \ |
1345 | case BT_LEAD##n: \ |
1346 | ptr += n; /* NOTE: The encoding has already been validated. */ \ |
1347 | break; |
1348 | LEAD_CASE(2) |
1349 | LEAD_CASE(3) |
1350 | LEAD_CASE(4) |
1351 | # undef LEAD_CASE |
1352 | case BT_AMP: |
1353 | if (ptr == start) |
1354 | return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1355 | *nextTokPtr = ptr; |
1356 | return XML_TOK_DATA_CHARS; |
1357 | case BT_PERCNT: |
1358 | if (ptr == start) { |
1359 | int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); |
1360 | return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; |
1361 | } |
1362 | *nextTokPtr = ptr; |
1363 | return XML_TOK_DATA_CHARS; |
1364 | case BT_LF: |
1365 | if (ptr == start) { |
1366 | *nextTokPtr = ptr + MINBPC(enc); |
1367 | return XML_TOK_DATA_NEWLINE; |
1368 | } |
1369 | *nextTokPtr = ptr; |
1370 | return XML_TOK_DATA_CHARS; |
1371 | case BT_CR: |
1372 | if (ptr == start) { |
1373 | ptr += MINBPC(enc); |
1374 | if (! HAS_CHAR(enc, ptr, end)) |
1375 | return XML_TOK_TRAILING_CR; |
1376 | if (BYTE_TYPE(enc, ptr) == BT_LF) |
1377 | ptr += MINBPC(enc); |
1378 | *nextTokPtr = ptr; |
1379 | return XML_TOK_DATA_NEWLINE; |
1380 | } |
1381 | *nextTokPtr = ptr; |
1382 | return XML_TOK_DATA_CHARS; |
1383 | default: |
1384 | ptr += MINBPC(enc); |
1385 | break; |
1386 | } |
1387 | } |
1388 | *nextTokPtr = ptr; |
1389 | return XML_TOK_DATA_CHARS; |
1390 | } |
1391 | |
1392 | # ifdef XML_DTD |
1393 | |
1394 | static int PTRCALL |
1395 | PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end, |
1396 | const char **nextTokPtr) { |
1397 | int level = 0; |
1398 | if (MINBPC(enc) > 1) { |
1399 | size_t n = end - ptr; |
1400 | if (n & (MINBPC(enc) - 1)) { |
1401 | n &= ~(MINBPC(enc) - 1); |
1402 | end = ptr + n; |
1403 | } |
1404 | } |
1405 | while (HAS_CHAR(enc, ptr, end)) { |
1406 | switch (BYTE_TYPE(enc, ptr)) { |
1407 | INVALID_CASES(ptr, nextTokPtr) |
1408 | case BT_LT: |
1409 | ptr += MINBPC(enc); |
1410 | REQUIRE_CHAR(enc, ptr, end); |
1411 | if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { |
1412 | ptr += MINBPC(enc); |
1413 | REQUIRE_CHAR(enc, ptr, end); |
1414 | if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { |
1415 | ++level; |
1416 | ptr += MINBPC(enc); |
1417 | } |
1418 | } |
1419 | break; |
1420 | case BT_RSQB: |
1421 | ptr += MINBPC(enc); |
1422 | REQUIRE_CHAR(enc, ptr, end); |
1423 | if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { |
1424 | ptr += MINBPC(enc); |
1425 | REQUIRE_CHAR(enc, ptr, end); |
1426 | if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { |
1427 | ptr += MINBPC(enc); |
1428 | if (level == 0) { |
1429 | *nextTokPtr = ptr; |
1430 | return XML_TOK_IGNORE_SECT; |
1431 | } |
1432 | --level; |
1433 | } |
1434 | } |
1435 | break; |
1436 | default: |
1437 | ptr += MINBPC(enc); |
1438 | break; |
1439 | } |
1440 | } |
1441 | return XML_TOK_PARTIAL; |
1442 | } |
1443 | |
1444 | # endif /* XML_DTD */ |
1445 | |
1446 | static int PTRCALL |
1447 | PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, |
1448 | const char **badPtr) { |
1449 | ptr += MINBPC(enc); |
1450 | end -= MINBPC(enc); |
1451 | for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { |
1452 | switch (BYTE_TYPE(enc, ptr)) { |
1453 | case BT_DIGIT: |
1454 | case BT_HEX: |
1455 | case BT_MINUS: |
1456 | case BT_APOS: |
1457 | case BT_LPAR: |
1458 | case BT_RPAR: |
1459 | case BT_PLUS: |
1460 | case BT_COMMA: |
1461 | case BT_SOL: |
1462 | case BT_EQUALS: |
1463 | case BT_QUEST: |
1464 | case BT_CR: |
1465 | case BT_LF: |
1466 | case BT_SEMI: |
1467 | case BT_EXCL: |
1468 | case BT_AST: |
1469 | case BT_PERCNT: |
1470 | case BT_NUM: |
1471 | # ifdef XML_NS |
1472 | case BT_COLON: |
1473 | # endif |
1474 | break; |
1475 | case BT_S: |
1476 | if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { |
1477 | *badPtr = ptr; |
1478 | return 0; |
1479 | } |
1480 | break; |
1481 | case BT_NAME: |
1482 | case BT_NMSTRT: |
1483 | if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f)) |
1484 | break; |
1485 | /* fall through */ |
1486 | default: |
1487 | switch (BYTE_TO_ASCII(enc, ptr)) { |
1488 | case 0x24: /* $ */ |
1489 | case 0x40: /* @ */ |
1490 | break; |
1491 | default: |
1492 | *badPtr = ptr; |
1493 | return 0; |
1494 | } |
1495 | break; |
1496 | } |
1497 | } |
1498 | return 1; |
1499 | } |
1500 | |
1501 | /* This must only be called for a well-formed start-tag or empty |
1502 | element tag. Returns the number of attributes. Pointers to the |
1503 | first attsMax attributes are stored in atts. |
1504 | */ |
1505 | |
1506 | static int PTRCALL |
1507 | PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax, |
1508 | ATTRIBUTE *atts) { |
1509 | enum { other, inName, inValue } state = inName; |
1510 | int nAtts = 0; |
1511 | int open = 0; /* defined when state == inValue; |
1512 | initialization just to shut up compilers */ |
1513 | |
1514 | for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { |
1515 | switch (BYTE_TYPE(enc, ptr)) { |
1516 | # define START_NAME \ |
1517 | if (state == other) { \ |
1518 | if (nAtts < attsMax) { \ |
1519 | atts[nAtts].name = ptr; \ |
1520 | atts[nAtts].normalized = 1; \ |
1521 | } \ |
1522 | state = inName; \ |
1523 | } |
1524 | # define LEAD_CASE(n) \ |
1525 | case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \ |
1526 | START_NAME ptr += (n - MINBPC(enc)); \ |
1527 | break; |
1528 | LEAD_CASE(2) |
1529 | LEAD_CASE(3) |
1530 | LEAD_CASE(4) |
1531 | # undef LEAD_CASE |
1532 | case BT_NONASCII: |
1533 | case BT_NMSTRT: |
1534 | case BT_HEX: |
1535 | START_NAME |
1536 | break; |
1537 | # undef START_NAME |
1538 | case BT_QUOT: |
1539 | if (state != inValue) { |
1540 | if (nAtts < attsMax) |
1541 | atts[nAtts].valuePtr = ptr + MINBPC(enc); |
1542 | state = inValue; |
1543 | open = BT_QUOT; |
1544 | } else if (open == BT_QUOT) { |
1545 | state = other; |
1546 | if (nAtts < attsMax) |
1547 | atts[nAtts].valueEnd = ptr; |
1548 | nAtts++; |
1549 | } |
1550 | break; |
1551 | case BT_APOS: |
1552 | if (state != inValue) { |
1553 | if (nAtts < attsMax) |
1554 | atts[nAtts].valuePtr = ptr + MINBPC(enc); |
1555 | state = inValue; |
1556 | open = BT_APOS; |
1557 | } else if (open == BT_APOS) { |
1558 | state = other; |
1559 | if (nAtts < attsMax) |
1560 | atts[nAtts].valueEnd = ptr; |
1561 | nAtts++; |
1562 | } |
1563 | break; |
1564 | case BT_AMP: |
1565 | if (nAtts < attsMax) |
1566 | atts[nAtts].normalized = 0; |
1567 | break; |
1568 | case BT_S: |
1569 | if (state == inName) |
1570 | state = other; |
1571 | else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized |
1572 | && (ptr == atts[nAtts].valuePtr |
1573 | || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE |
1574 | || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE |
1575 | || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) |
1576 | atts[nAtts].normalized = 0; |
1577 | break; |
1578 | case BT_CR: |
1579 | case BT_LF: |
1580 | /* This case ensures that the first attribute name is counted |
1581 | Apart from that we could just change state on the quote. */ |
1582 | if (state == inName) |
1583 | state = other; |
1584 | else if (state == inValue && nAtts < attsMax) |
1585 | atts[nAtts].normalized = 0; |
1586 | break; |
1587 | case BT_GT: |
1588 | case BT_SOL: |
1589 | if (state != inValue) |
1590 | return nAtts; |
1591 | break; |
1592 | default: |
1593 | break; |
1594 | } |
1595 | } |
1596 | /* not reached */ |
1597 | } |
1598 | |
1599 | static int PTRFASTCALL |
1600 | PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) { |
1601 | int result = 0; |
1602 | /* skip &# */ |
1603 | UNUSED_P(enc); |
1604 | ptr += 2 * MINBPC(enc); |
1605 | if (CHAR_MATCHES(enc, ptr, ASCII_x)) { |
1606 | for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); |
1607 | ptr += MINBPC(enc)) { |
1608 | int c = BYTE_TO_ASCII(enc, ptr); |
1609 | switch (c) { |
1610 | case ASCII_0: |
1611 | case ASCII_1: |
1612 | case ASCII_2: |
1613 | case ASCII_3: |
1614 | case ASCII_4: |
1615 | case ASCII_5: |
1616 | case ASCII_6: |
1617 | case ASCII_7: |
1618 | case ASCII_8: |
1619 | case ASCII_9: |
1620 | result <<= 4; |
1621 | result |= (c - ASCII_0); |
1622 | break; |
1623 | case ASCII_A: |
1624 | case ASCII_B: |
1625 | case ASCII_C: |
1626 | case ASCII_D: |
1627 | case ASCII_E: |
1628 | case ASCII_F: |
1629 | result <<= 4; |
1630 | result += 10 + (c - ASCII_A); |
1631 | break; |
1632 | case ASCII_a: |
1633 | case ASCII_b: |
1634 | case ASCII_c: |
1635 | case ASCII_d: |
1636 | case ASCII_e: |
1637 | case ASCII_f: |
1638 | result <<= 4; |
1639 | result += 10 + (c - ASCII_a); |
1640 | break; |
1641 | } |
1642 | if (result >= 0x110000) |
1643 | return -1; |
1644 | } |
1645 | } else { |
1646 | for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { |
1647 | int c = BYTE_TO_ASCII(enc, ptr); |
1648 | result *= 10; |
1649 | result += (c - ASCII_0); |
1650 | if (result >= 0x110000) |
1651 | return -1; |
1652 | } |
1653 | } |
1654 | return checkCharRefNumber(result); |
1655 | } |
1656 | |
1657 | static int PTRCALL |
1658 | PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, |
1659 | const char *end) { |
1660 | UNUSED_P(enc); |
1661 | switch ((end - ptr) / MINBPC(enc)) { |
1662 | case 2: |
1663 | if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { |
1664 | switch (BYTE_TO_ASCII(enc, ptr)) { |
1665 | case ASCII_l: |
1666 | return ASCII_LT; |
1667 | case ASCII_g: |
1668 | return ASCII_GT; |
1669 | } |
1670 | } |
1671 | break; |
1672 | case 3: |
1673 | if (CHAR_MATCHES(enc, ptr, ASCII_a)) { |
1674 | ptr += MINBPC(enc); |
1675 | if (CHAR_MATCHES(enc, ptr, ASCII_m)) { |
1676 | ptr += MINBPC(enc); |
1677 | if (CHAR_MATCHES(enc, ptr, ASCII_p)) |
1678 | return ASCII_AMP; |
1679 | } |
1680 | } |
1681 | break; |
1682 | case 4: |
1683 | switch (BYTE_TO_ASCII(enc, ptr)) { |
1684 | case ASCII_q: |
1685 | ptr += MINBPC(enc); |
1686 | if (CHAR_MATCHES(enc, ptr, ASCII_u)) { |
1687 | ptr += MINBPC(enc); |
1688 | if (CHAR_MATCHES(enc, ptr, ASCII_o)) { |
1689 | ptr += MINBPC(enc); |
1690 | if (CHAR_MATCHES(enc, ptr, ASCII_t)) |
1691 | return ASCII_QUOT; |
1692 | } |
1693 | } |
1694 | break; |
1695 | case ASCII_a: |
1696 | ptr += MINBPC(enc); |
1697 | if (CHAR_MATCHES(enc, ptr, ASCII_p)) { |
1698 | ptr += MINBPC(enc); |
1699 | if (CHAR_MATCHES(enc, ptr, ASCII_o)) { |
1700 | ptr += MINBPC(enc); |
1701 | if (CHAR_MATCHES(enc, ptr, ASCII_s)) |
1702 | return ASCII_APOS; |
1703 | } |
1704 | } |
1705 | break; |
1706 | } |
1707 | } |
1708 | return 0; |
1709 | } |
1710 | |
1711 | static int PTRCALL |
1712 | PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, |
1713 | const char *end1, const char *ptr2) { |
1714 | UNUSED_P(enc); |
1715 | for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { |
1716 | if (end1 - ptr1 < MINBPC(enc)) { |
1717 | /* This line cannot be executed. The incoming data has already |
1718 | * been tokenized once, so incomplete characters like this have |
1719 | * already been eliminated from the input. Retaining the |
1720 | * paranoia check is still valuable, however. |
1721 | */ |
1722 | return 0; /* LCOV_EXCL_LINE */ |
1723 | } |
1724 | if (! CHAR_MATCHES(enc, ptr1, *ptr2)) |
1725 | return 0; |
1726 | } |
1727 | return ptr1 == end1; |
1728 | } |
1729 | |
1730 | static int PTRFASTCALL |
1731 | PREFIX(nameLength)(const ENCODING *enc, const char *ptr) { |
1732 | const char *start = ptr; |
1733 | for (;;) { |
1734 | switch (BYTE_TYPE(enc, ptr)) { |
1735 | # define LEAD_CASE(n) \ |
1736 | case BT_LEAD##n: \ |
1737 | ptr += n; /* NOTE: The encoding has already been validated. */ \ |
1738 | break; |
1739 | LEAD_CASE(2) |
1740 | LEAD_CASE(3) |
1741 | LEAD_CASE(4) |
1742 | # undef LEAD_CASE |
1743 | case BT_NONASCII: |
1744 | case BT_NMSTRT: |
1745 | # ifdef XML_NS |
1746 | case BT_COLON: |
1747 | # endif |
1748 | case BT_HEX: |
1749 | case BT_DIGIT: |
1750 | case BT_NAME: |
1751 | case BT_MINUS: |
1752 | ptr += MINBPC(enc); |
1753 | break; |
1754 | default: |
1755 | return (int)(ptr - start); |
1756 | } |
1757 | } |
1758 | } |
1759 | |
1760 | static const char *PTRFASTCALL |
1761 | PREFIX(skipS)(const ENCODING *enc, const char *ptr) { |
1762 | for (;;) { |
1763 | switch (BYTE_TYPE(enc, ptr)) { |
1764 | case BT_LF: |
1765 | case BT_CR: |
1766 | case BT_S: |
1767 | ptr += MINBPC(enc); |
1768 | break; |
1769 | default: |
1770 | return ptr; |
1771 | } |
1772 | } |
1773 | } |
1774 | |
1775 | static void PTRCALL |
1776 | PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end, |
1777 | POSITION *pos) { |
1778 | while (HAS_CHAR(enc, ptr, end)) { |
1779 | switch (BYTE_TYPE(enc, ptr)) { |
1780 | # define LEAD_CASE(n) \ |
1781 | case BT_LEAD##n: \ |
1782 | ptr += n; /* NOTE: The encoding has already been validated. */ \ |
1783 | pos->columnNumber++; \ |
1784 | break; |
1785 | LEAD_CASE(2) |
1786 | LEAD_CASE(3) |
1787 | LEAD_CASE(4) |
1788 | # undef LEAD_CASE |
1789 | case BT_LF: |
1790 | pos->columnNumber = 0; |
1791 | pos->lineNumber++; |
1792 | ptr += MINBPC(enc); |
1793 | break; |
1794 | case BT_CR: |
1795 | pos->lineNumber++; |
1796 | ptr += MINBPC(enc); |
1797 | if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF) |
1798 | ptr += MINBPC(enc); |
1799 | pos->columnNumber = 0; |
1800 | break; |
1801 | default: |
1802 | ptr += MINBPC(enc); |
1803 | pos->columnNumber++; |
1804 | break; |
1805 | } |
1806 | } |
1807 | } |
1808 | |
1809 | # undef DO_LEAD_CASE |
1810 | # undef MULTIBYTE_CASES |
1811 | # undef INVALID_CASES |
1812 | # undef CHECK_NAME_CASE |
1813 | # undef CHECK_NAME_CASES |
1814 | # undef CHECK_NMSTRT_CASE |
1815 | # undef CHECK_NMSTRT_CASES |
1816 | |
1817 | #endif /* XML_TOK_IMPL_C */ |
1818 | |