xmltok.c source code [python/Modules/expat/xmltok.c]

1	/*
2	__ __ _
3	___\ \/ /_ __ __ _\| \|_
4	/ _ \\ /\| '_ \ / _` \| __\|
5	\| __// \\| \|_) \| (_\| \| \|_
6	\___/_/\_\ .__/ \__,_\|\__\|
7	\|_\| XML parser
8
9	Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10	Copyright (c) 2000 Clark Cooper <[email protected]>
11	Copyright (c) 2001-2003 Fred L. Drake, Jr. <[email protected]>
12	Copyright (c) 2002 Greg Stein <[email protected]>
13	Copyright (c) 2002-2016 Karl Waclawek <[email protected]>
14	Copyright (c) 2005-2009 Steven Solie <[email protected]>
15	Copyright (c) 2016-2022 Sebastian Pipping <[email protected]>
16	Copyright (c) 2016 Pascal Cuoq <[email protected]>
17	Copyright (c) 2016 Don Lewis <[email protected]>
18	Copyright (c) 2017 Rhodri James <[email protected]>
19	Copyright (c) 2017 Alexander Bluhm <[email protected]>
20	Copyright (c) 2017 Benbuck Nason <[email protected]>
21	Copyright (c) 2017 José Gutiérrez de la Concha <[email protected]>
22	Copyright (c) 2019 David Loffredo <[email protected]>
23	Copyright (c) 2021 Dong-hee Na <[email protected]>
24	Licensed under the MIT license:
25
26	Permission is hereby granted, free of charge, to any person obtaining
27	a copy of this software and associated documentation files (the
28	"Software"), to deal in the Software without restriction, including
29	without limitation the rights to use, copy, modify, merge, publish,
30	distribute, sublicense, and/or sell copies of the Software, and to permit
31	persons to whom the Software is furnished to do so, subject to the
32	following conditions:
33
34	The above copyright notice and this permission notice shall be included
35	in all copies or substantial portions of the Software.
36
37	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
38	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
39	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40	NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41	DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
42	OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43	USE OR OTHER DEALINGS IN THE SOFTWARE.
44	*/
45
46	#include <expat_config.h>
47
48	#include <stddef.h>
49	#include <string.h> /* memcpy */
50	#include <stdbool.h>
51
52	#ifdef _WIN32
53	# include "winconfig.h"
54	#endif
55
56	#include "expat_external.h"
57	#include "internal.h"
58	#include "xmltok.h"
59	#include "nametab.h"
60
/* Optional extra entry of the scanner list in VTABLE1: the IGNORE-section
   tokenizer is only listed when XML_DTD is defined. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-qualified tokenizer functions.
   It is used as the leading part of each encoding's ENCODING initializer;
   the toUtf8/toUtf16 converters follow it (see VTABLE). */
#define VTABLE1 \
  {PREFIX(prologTok), PREFIX(contentTok), \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 plus the PREFIX()-qualified converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76
/* Test the naming bit for the 16-bit character whose high byte is `hi`
   and low byte is `lo`: pages[hi] selects a group of 8 bitmap words,
   the top 3 bits of lo pick the word and the bottom 5 bits pick the bit.
   Evaluates to non-zero when the bit is set. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
         << 3) \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
100
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned bytes positioned on the lead
   byte of a sequence of the stated length and evaluates to non-zero
   when the sequence is NOT acceptable.  The argument is evaluated
   several times, so it must be free of side effects.
*/

/* 2-byte form: lead byte below 0xC2 (overlong) or bad trail byte. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte form: rejects bad trail bytes, overlong E0 forms, the
   surrogate range (ED A0..BF) and EF BF BE / EF BF BF. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \
                                        : ((p)[2] & 0xC0) == 0xC0) \
   || ((*(p)) == 0xE0 \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte form: rejects bad trail bytes, overlong F0 forms and values
   above U+10FFFF (F4 90..). */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \
   || ((p)[2] & 0xC0) == 0xC0 \
   || ((*(p)) == 0xF0 \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F \
                                    : ((p)[1] & 0xC0) == 0xC0)))
130
131	static int PTRFASTCALL
132	isNever(const ENCODING enc, const* char *p) {
133	UNUSED_P(enc);
134	UNUSED_P(p);
135	return `0`;
136	}
137
138	static int PTRFASTCALL
139	utf8_isName2(const ENCODING enc, const* char *p) {
140	UNUSED_P(enc);
141	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142	}
143
144	static int PTRFASTCALL
145	utf8_isName3(const ENCODING enc, const* char *p) {
146	UNUSED_P(enc);
147	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148	}
149
150	#define utf8_isName4 isNever
151
152	static int PTRFASTCALL
153	utf8_isNmstrt2(const ENCODING enc, const* char *p) {
154	UNUSED_P(enc);
155	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156	}
157
158	static int PTRFASTCALL
159	utf8_isNmstrt3(const ENCODING enc, const* char *p) {
160	UNUSED_P(enc);
161	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162	}
163
164	#define utf8_isNmstrt4 isNever
165
166	static int PTRFASTCALL
167	utf8_isInvalid2(const ENCODING enc, const* char *p) {
168	UNUSED_P(enc);
169	return UTF8_INVALID2((const unsigned char *)p);
170	}
171
172	static int PTRFASTCALL
173	utf8_isInvalid3(const ENCODING enc, const* char *p) {
174	UNUSED_P(enc);
175	return UTF8_INVALID3((const unsigned char *)p);
176	}
177
178	static int PTRFASTCALL
179	utf8_isInvalid4(const ENCODING enc, const* char *p) {
180	UNUSED_P(enc);
181	return UTF8_INVALID4((const unsigned char *)p);
182	}
183
184	struct normal_encoding {
185	ENCODING enc;
186	unsigned char type[`256`];
187	#ifdef XML_MIN_SIZE
188	int(PTRFASTCALL byteType)(const* ENCODING , const* char *);
189	int(PTRFASTCALL isNameMin)(const* ENCODING , const* char *);
190	int(PTRFASTCALL isNmstrtMin)(const* ENCODING , const* char *);
191	int(PTRFASTCALL byteToAscii)(const* ENCODING , const* char *);
192	int(PTRCALL charMatches)(const* ENCODING , const* char , int*);
193	#endif /* XML_MIN_SIZE */
194	int(PTRFASTCALL isName2)(const* ENCODING , const* char *);
195	int(PTRFASTCALL isName3)(const* ENCODING , const* char *);
196	int(PTRFASTCALL isName4)(const* ENCODING , const* char *);
197	int(PTRFASTCALL isNmstrt2)(const* ENCODING , const* char *);
198	int(PTRFASTCALL isNmstrt3)(const* ENCODING , const* char *);
199	int(PTRFASTCALL isNmstrt4)(const* ENCODING , const* char *);
200	int(PTRFASTCALL isInvalid2)(const* ENCODING , const* char *);
201	int(PTRFASTCALL isInvalid3)(const* ENCODING , const* char *);
202	int(PTRFASTCALL isInvalid4)(const* ENCODING , const* char *);
203	};
204
/* View an ENCODING pointer as the enclosing (read-only) normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only members of struct
   normal_encoding; expands to nothing otherwise.  Note the trailing
   comma: it is meant to be followed directly by NORMAL_VTABLE or
   NULL_VTABLE. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E) \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte callbacks, using prefix E. */
#define NORMAL_VTABLE(E) \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all NULL, for encodings without such callbacks. */
#define NULL_VTABLE \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
226
227	static int FASTCALL checkCharRefNumber(int);
228
229	#include "xmltok_impl.h"
230	#include "ascii.h"
231
232	#ifdef XML_MIN_SIZE
233	# define sb_isNameMin isNever
234	# define sb_isNmstrtMin isNever
235	#endif
236
237	#ifdef XML_MIN_SIZE
238	# define MINBPC(enc) ((enc)->minBytesPerChar)
239	#else
240	/ minimum bytes per character /
241	# define MINBPC(enc) 1
242	#endif
243
244	#define SB_BYTE_TYPE(enc, p) \
245	(((struct normal_encoding )(enc))->type[(unsigned char)(p)])
246
247	#ifdef XML_MIN_SIZE
248	static int PTRFASTCALL
249	sb_byteType(const ENCODING enc, const* char *p) {
250	return SB_BYTE_TYPE(enc, p);
251	}
252	# define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
253	#else
254	# define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
255	#endif
256
257	#ifdef XML_MIN_SIZE
258	# define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
259	static int PTRFASTCALL
260	sb_byteToAscii(const ENCODING enc, const* char *p) {
261	UNUSED_P(enc);
262	return *p;
263	}
264	#else
265	# define BYTE_TO_ASCII(enc, p) (*(p))
266	#endif
267
/* Dispatch to the encoding's n-byte (n = 2, 3, 4) callbacks. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* With XML_MIN_SIZE the isInvalid callbacks may be NULL, so test the
   pointer before calling through it. */
#  define IS_INVALID_CHAR(enc, p, n) \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n) \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p) \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c) \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* Non-zero when the byte at p equals c.  enc is unused. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif
301
302	#define PREFIX(ident) normal_##ident
303	#define XML_TOK_IMPL_C
304	#include "xmltok_impl.c"
305	#undef XML_TOK_IMPL_C
306
307	#undef MINBPC
308	#undef BYTE_TYPE
309	#undef BYTE_TO_ASCII
310	#undef CHAR_MATCHES
311	#undef IS_NAME_CHAR
312	#undef IS_NAME_CHAR_MINBPC
313	#undef IS_NMSTRT_CHAR
314	#undef IS_NMSTRT_CHAR_MINBPC
315	#undef IS_INVALID_CHAR
316
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
323
/* Move *fromLimRef backwards, if necessary, so that [from, *fromLimRef)
   does not end in the middle of a multi-byte UTF-8 character.

   The buffer is scanned backwards from the limit.  `walked` counts the
   bytes passed since the last reset.  When a lead byte is found and
   enough bytes follow it (walked + 1 >= sequence length) the limit is
   restored to just after that complete character; otherwise the
   truncated character is cut off and scanning continues.  A 1-byte
   (0b0xxxxxxx) character ends the scan immediately.

   from        start of the buffer; the limit never moves before it.
   fromLimRef  in/out: one past the last byte to consider. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u)
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u)
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u)
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u)
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
  }
  *fromLimRef = fromLim;
}
362
363	static enum XML_Convert_Result PTRCALL
364	utf8_toUtf8(const ENCODING enc, const* char *fromP, const* char *fromLim,
365	char *toP, const* char *toLim) {
366	bool input_incomplete = false;
367	bool output_exhausted = false;
368
369	/ Avoid copying partial characters (due to limited space). /
370	const ptrdiff_t bytesAvailable = fromLim - *fromP;
371	const ptrdiff_t bytesStorable = toLim - *toP;
372	UNUSED_P(enc);
373	if (bytesAvailable > bytesStorable) {
374	fromLim = *fromP + bytesStorable;
375	output_exhausted = true;
376	}
377
378	/ Avoid copying partial characters (from incomplete input). /
379	{
380	const char *const fromLimBefore = fromLim;
381	_INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382	if (fromLim < fromLimBefore) {
383	input_incomplete = true;
384	}
385	}
386
387	{
388	const ptrdiff_t bytesToCopy = fromLim - *fromP;
389	memcpy(toP, fromP, bytesToCopy);
390	*fromP += bytesToCopy;
391	*toP += bytesToCopy;
392	}
393
394	if (output_exhausted) / needs to go first /
395	return XML_CONVERT_OUTPUT_EXHAUSTED;
396	else if (input_incomplete)
397	return XML_CONVERT_INPUT_INCOMPLETE;
398	else
399	return XML_CONVERT_COMPLETED;
400	}
401
402	static enum XML_Convert_Result PTRCALL
403	utf8_toUtf16(const ENCODING enc, const* char *fromP, const* char *fromLim,
404	unsigned short *toP, const* unsigned short *toLim) {
405	enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
406	unsigned short to = toP;
407	const char from = fromP;
408	while (from < fromLim && to < toLim) {
409	switch (((struct normal_encoding )enc)->type[(unsigned* char)*from]) {
410	case BT_LEAD2:
411	if (fromLim - from < `2`) {
412	res = XML_CONVERT_INPUT_INCOMPLETE;
413	goto after;
414	}
415	to++ = (unsigned* short)(((from[`0`] & `0x1f`) << `6`) \| (from[`1`] & `0x3f`));
416	from += `2`;
417	break;
418	case BT_LEAD3:
419	if (fromLim - from < `3`) {
420	res = XML_CONVERT_INPUT_INCOMPLETE;
421	goto after;
422	}
423	to++ = (unsigned* short)(((from[`0`] & `0xf`) << `12`) \| ((from[`1`] & `0x3f`) << `6`)
424	\| (from[`2`] & `0x3f`));
425	from += `3`;
426	break;
427	case BT_LEAD4: {
428	unsigned long n;
429	if (toLim - to < `2`) {
430	res = XML_CONVERT_OUTPUT_EXHAUSTED;
431	goto after;
432	}
433	if (fromLim - from < `4`) {
434	res = XML_CONVERT_INPUT_INCOMPLETE;
435	goto after;
436	}
437	n = ((from[`0`] & `0x7`) << `18`) \| ((from[`1`] & `0x3f`) << `12`)
438	\| ((from[`2`] & `0x3f`) << `6`) \| (from[`3`] & `0x3f`);
439	n -= `0x10000`;
440	to[`0`] = (unsigned short)((n >> `10`) \| `0xD800`);
441	to[`1`] = (unsigned short)((n & `0x3FF`) \| `0xDC00`);
442	to += `2`;
443	from += `4`;
444	} break;
445	default:
446	to++ = from++;
447	break;
448	}
449	}
450	if (from < fromLim)
451	res = XML_CONVERT_OUTPUT_EXHAUSTED;
452	after:
453	*fromP = from;
454	*toP = to;
455	return res;
456	}
457
458	#ifdef XML_NS
459	static const struct normal_encoding utf8_encoding_ns
460	= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, `1`, `1`, `0`},
461	{
462	# include "asciitab.h"
463	# include "utf8tab.h"
464	},
465	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
466	#endif
467
468	static const struct normal_encoding utf8_encoding
469	= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, `1`, `1`, `0`},
470	{
471	#define BT_COLON BT_NMSTRT
472	#include "asciitab.h"
473	#undef BT_COLON
474	#include "utf8tab.h"
475	},
476	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
477
478	#ifdef XML_NS
479
480	static const struct normal_encoding internal_utf8_encoding_ns
481	= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, `1`, `1`, `0`},
482	{
483	# include "iasciitab.h"
484	# include "utf8tab.h"
485	},
486	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
487
488	#endif
489
490	static const struct normal_encoding internal_utf8_encoding
491	= {{VTABLE1, utf8_toUtf8, utf8_toUtf16, `1`, `1`, `0`},
492	{
493	#define BT_COLON BT_NMSTRT
494	#include "iasciitab.h"
495	#undef BT_COLON
496	#include "utf8tab.h"
497	},
498	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499
500	static enum XML_Convert_Result PTRCALL
501	latin1_toUtf8(const ENCODING enc, const* char *fromP, const* char *fromLim,
502	char *toP, const* char *toLim) {
503	UNUSED_P(enc);
504	for (;;) {
505	unsigned char c;
506	if (*fromP == fromLim)
507	return XML_CONVERT_COMPLETED;
508	c = (unsigned char)**fromP;
509	if (c & `0x80`) {
510	if (toLim - *toP < `2`)
511	return XML_CONVERT_OUTPUT_EXHAUSTED;
512	(toP)++ = (char)((c >> `6`) \| UTF8_cval2);
513	(toP)++ = (char)((c & `0x3f`) \| `0x80`);
514	(*fromP)++;
515	} else {
516	if (*toP == toLim)
517	return XML_CONVERT_OUTPUT_EXHAUSTED;
518	(toP)++ = (fromP)++;
519	}
520	}
521	}
522
523	static enum XML_Convert_Result PTRCALL
524	latin1_toUtf16(const ENCODING enc, const* char *fromP, const* char *fromLim,
525	unsigned short *toP, const* unsigned short *toLim) {
526	UNUSED_P(enc);
527	while (fromP < fromLim && toP < toLim)
528	(toP)++ = (unsigned char)(fromP)++;
529
530	if ((toP == toLim) && (fromP < fromLim))
531	return XML_CONVERT_OUTPUT_EXHAUSTED;
532	else
533	return XML_CONVERT_COMPLETED;
534	}
535
536	#ifdef XML_NS
537
538	static const struct normal_encoding latin1_encoding_ns
539	= {{VTABLE1, latin1_toUtf8, latin1_toUtf16, `1`, `0`, `0`},
540	{
541	# include "asciitab.h"
542	# include "latin1tab.h"
543	},
544	STANDARD_VTABLE(sb_) NULL_VTABLE};
545
546	#endif
547
548	static const struct normal_encoding latin1_encoding
549	= {{VTABLE1, latin1_toUtf8, latin1_toUtf16, `1`, `0`, `0`},
550	{
551	#define BT_COLON BT_NMSTRT
552	#include "asciitab.h"
553	#undef BT_COLON
554	#include "latin1tab.h"
555	},
556	STANDARD_VTABLE(sb_) NULL_VTABLE};
557
558	static enum XML_Convert_Result PTRCALL
559	ascii_toUtf8(const ENCODING enc, const* char *fromP, const* char *fromLim,
560	char *toP, const* char *toLim) {
561	UNUSED_P(enc);
562	while (fromP < fromLim && toP < toLim)
563	(toP)++ = (fromP)++;
564
565	if ((toP == toLim) && (fromP < fromLim))
566	return XML_CONVERT_OUTPUT_EXHAUSTED;
567	else
568	return XML_CONVERT_COMPLETED;
569	}
570
571	#ifdef XML_NS
572
573	static const struct normal_encoding ascii_encoding_ns
574	= {{VTABLE1, ascii_toUtf8, latin1_toUtf16, `1`, `1`, `0`},
575	{
576	# include "asciitab.h"
577	/ BT_NONXML == 0 /
578	},
579	STANDARD_VTABLE(sb_) NULL_VTABLE};
580
581	#endif
582
583	static const struct normal_encoding ascii_encoding
584	= {{VTABLE1, ascii_toUtf8, latin1_toUtf16, `1`, `1`, `0`},
585	{
586	#define BT_COLON BT_NMSTRT
587	#include "asciitab.h"
588	#undef BT_COLON
589	/ BT_NONXML == 0 /
590	},
591	STANDARD_VTABLE(sb_) NULL_VTABLE};
592
593	static int PTRFASTCALL
594	unicode_byte_type(char hi, char lo) {
595	switch ((unsigned char)hi) {
596	/ 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) /
597	case `0xD8`:
598	case `0xD9`:
599	case `0xDA`:
600	case `0xDB`:
601	return BT_LEAD4;
602	/ 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) /
603	case `0xDC`:
604	case `0xDD`:
605	case `0xDE`:
606	case `0xDF`:
607	return BT_TRAIL;
608	case `0xFF`:
609	switch ((unsigned char)lo) {
610	case `0xFF`: / noncharacter-FFFF /
611	case `0xFE`: / noncharacter-FFFE /
612	return BT_NONXML;
613	}
614	break;
615	}
616	return BT_NONASCII;
617	}
618
/* Define E##toUtf8: convert UTF-16 (byte order given by the GET_LO /
   GET_HI macros in effect at expansion time) to UTF-8.

   The input limit is first rounded down to a whole number of 16-bit
   units.  Each unit is dispatched on its high byte: 0 with lo < 0x80
   gives 1 byte, high bytes 0..7 give 2 bytes, a high surrogate
   (0xD8..0xDB) is combined with the following unit into 4 bytes, and
   everything else gives 3 bytes.  Before writing, the remaining output
   space is checked; on shortage *fromP is set to the current unit and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  A high surrogate without
   a following unit returns XML_CONVERT_INPUT_INCOMPLETE. */
#define DEFINE_UTF16_TO_UTF8(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf8( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      char **toP, const char *toLim) { \
    const char *from = *fromP; \
    UNUSED_P(enc); \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \
    for (; from < fromLim; from += 2) { \
      int plane; \
      unsigned char lo2; \
      unsigned char lo = GET_LO(from); \
      unsigned char hi = GET_HI(from); \
      switch (hi) { \
      case 0: \
        if (lo < 0x80) { \
          if (*toP == toLim) { \
            *fromP = from; \
            return XML_CONVERT_OUTPUT_EXHAUSTED; \
          } \
          *(*toP)++ = lo; \
          break; \
        } \
        /* fall through */ \
      case 0x1: \
      case 0x2: \
      case 0x3: \
      case 0x4: \
      case 0x5: \
      case 0x6: \
      case 0x7: \
        if (toLim - *toP < 2) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      default: \
        if (toLim - *toP < 3) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      case 0xD8: \
      case 0xD9: \
      case 0xDA: \
      case 0xDB: \
        if (toLim - *toP < 4) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        if (fromLim - from < 4) { \
          *fromP = from; \
          return XML_CONVERT_INPUT_INCOMPLETE; \
        } \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
        from += 2; \
        lo2 = GET_LO(from); \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \
                     | (lo2 >> 6) | 0x80); \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
        break; \
      } \
    } \
    *fromP = from; \
    if (from < fromLim) \
      return XML_CONVERT_INPUT_INCOMPLETE; \
    else \
      return XML_CONVERT_COMPLETED; \
  }
695
/* Define E##toUtf16: copy UTF-16 input (byte order given by GET_LO /
   GET_HI at expansion time) into an array of unsigned short.

   The input limit is rounded down to whole 16-bit units.  If the
   output cannot hold all of the input and the last input unit is a
   high surrogate, that unit is excluded and the fallback result
   becomes XML_CONVERT_INPUT_INCOMPLETE, so a pair is never split.
   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output filled up with
   input remaining. */
#define DEFINE_UTF16_TO_UTF16(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf16( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      unsigned short **toP, const unsigned short *toLim) { \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
    UNUSED_P(enc); \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
      fromLim -= 2; \
      res = XML_CONVERT_INPUT_INCOMPLETE; \
    } \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    if ((*toP == toLim) && (*fromP < fromLim)) \
      return XML_CONVERT_OUTPUT_EXHAUSTED; \
    else \
      return res; \
  }
716
717	#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
718	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
719	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
720
721	DEFINE_UTF16_TO_UTF8(little2_)
722	DEFINE_UTF16_TO_UTF16(little2_)
723
724	#undef SET2
725	#undef GET_LO
726	#undef GET_HI
727
728	#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
729	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
730	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
731
732	DEFINE_UTF16_TO_UTF8(big2_)
733	DEFINE_UTF16_TO_UTF16(big2_)
734
735	#undef SET2
736	#undef GET_LO
737	#undef GET_HI
738
739	#define LITTLE2_BYTE_TYPE(enc, p) \
740	((p)[1] == 0 ? ((struct normal_encoding )(enc))->type[(unsigned char)(p)] \
741	: unicode_byte_type((p)[1], (p)[0]))
742	#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
743	#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == c)
744	#define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
745	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
746	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
747	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
748
749	#ifdef XML_MIN_SIZE
750
751	static int PTRFASTCALL
752	little2_byteType(const ENCODING enc, const* char *p) {
753	return LITTLE2_BYTE_TYPE(enc, p);
754	}
755
756	static int PTRFASTCALL
757	little2_byteToAscii(const ENCODING enc, const* char *p) {
758	UNUSED_P(enc);
759	return LITTLE2_BYTE_TO_ASCII(p);
760	}
761
762	static int PTRCALL
763	little2_charMatches(const ENCODING enc, const* char p, int* c) {
764	UNUSED_P(enc);
765	return LITTLE2_CHAR_MATCHES(p, c);
766	}
767
768	static int PTRFASTCALL
769	little2_isNameMin(const ENCODING enc, const* char *p) {
770	UNUSED_P(enc);
771	return LITTLE2_IS_NAME_CHAR_MINBPC(p);
772	}
773
774	static int PTRFASTCALL
775	little2_isNmstrtMin(const ENCODING enc, const* char *p) {
776	UNUSED_P(enc);
777	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
778	}
779
780	# undef VTABLE
781	# define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
782
783	#else /* not XML_MIN_SIZE */
784
785	# undef PREFIX
786	# define PREFIX(ident) little2_##ident
787	# define MINBPC(enc) 2
788	/ CHAR_MATCHES is guaranteed to have MINBPC bytes available. /
789	# define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
790	# define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
791	# define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
792	# define IS_NAME_CHAR(enc, p, n) 0
793	# define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
794	# define IS_NMSTRT_CHAR(enc, p, n) (0)
795	# define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
796
797	# define XML_TOK_IMPL_C
798	# include "xmltok_impl.c"
799	# undef XML_TOK_IMPL_C
800
801	# undef MINBPC
802	# undef BYTE_TYPE
803	# undef BYTE_TO_ASCII
804	# undef CHAR_MATCHES
805	# undef IS_NAME_CHAR
806	# undef IS_NAME_CHAR_MINBPC
807	# undef IS_NMSTRT_CHAR
808	# undef IS_NMSTRT_CHAR_MINBPC
809	# undef IS_INVALID_CHAR
810
811	#endif /* not XML_MIN_SIZE */
812
813	#ifdef XML_NS
814
815	static const struct normal_encoding little2_encoding_ns
816	= {{VTABLE, `2`, `0`,
817	# if BYTEORDER == 1234
818	`1`
819	# else
820	`0`
821	# endif
822	},
823	{
824	# include "asciitab.h"
825	# include "latin1tab.h"
826	},
827	STANDARD_VTABLE(little2_) NULL_VTABLE};
828
829	#endif
830
831	static const struct normal_encoding little2_encoding
832	= {{VTABLE, `2`, `0`,
833	#if BYTEORDER == 1234
834	`1`
835	#else
836	`0`
837	#endif
838	},
839	{
840	#define BT_COLON BT_NMSTRT
841	#include "asciitab.h"
842	#undef BT_COLON
843	#include "latin1tab.h"
844	},
845	STANDARD_VTABLE(little2_) NULL_VTABLE};
846
847	#if BYTEORDER != 4321
848
849	# ifdef XML_NS
850
851	static const struct normal_encoding internal_little2_encoding_ns
852	= {{VTABLE, `2`, `0`, `1`},
853	{
854	# include "iasciitab.h"
855	# include "latin1tab.h"
856	},
857	STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859	# endif
860
861	static const struct normal_encoding internal_little2_encoding
862	= {{VTABLE, `2`, `0`, `1`},
863	{
864	# define BT_COLON BT_NMSTRT
865	# include "iasciitab.h"
866	# undef BT_COLON
867	# include "latin1tab.h"
868	},
869	STANDARD_VTABLE(little2_) NULL_VTABLE};
870
871	#endif
872
873	#define BIG2_BYTE_TYPE(enc, p) \
874	((p)[0] == 0 \
875	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
876	: unicode_byte_type((p)[0], (p)[1]))
877	#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
878	#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == c)
879	#define BIG2_IS_NAME_CHAR_MINBPC(p) \
880	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
881	#define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
882	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
883
884	#ifdef XML_MIN_SIZE
885
886	static int PTRFASTCALL
887	big2_byteType(const ENCODING enc, const* char *p) {
888	return BIG2_BYTE_TYPE(enc, p);
889	}
890
891	static int PTRFASTCALL
892	big2_byteToAscii(const ENCODING enc, const* char *p) {
893	UNUSED_P(enc);
894	return BIG2_BYTE_TO_ASCII(p);
895	}
896
897	static int PTRCALL
898	big2_charMatches(const ENCODING enc, const* char p, int* c) {
899	UNUSED_P(enc);
900	return BIG2_CHAR_MATCHES(p, c);
901	}
902
903	static int PTRFASTCALL
904	big2_isNameMin(const ENCODING enc, const* char *p) {
905	UNUSED_P(enc);
906	return BIG2_IS_NAME_CHAR_MINBPC(p);
907	}
908
909	static int PTRFASTCALL
910	big2_isNmstrtMin(const ENCODING enc, const* char *p) {
911	UNUSED_P(enc);
912	return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
913	}
914
915	# undef VTABLE
916	# define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
917
918	#else /* not XML_MIN_SIZE */
919
920	# undef PREFIX
921	# define PREFIX(ident) big2_##ident
922	# define MINBPC(enc) 2
923	/ CHAR_MATCHES is guaranteed to have MINBPC bytes available. /
924	# define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
925	# define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
926	# define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
927	# define IS_NAME_CHAR(enc, p, n) 0
928	# define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
929	# define IS_NMSTRT_CHAR(enc, p, n) (0)
930	# define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
931
932	# define XML_TOK_IMPL_C
933	# include "xmltok_impl.c"
934	# undef XML_TOK_IMPL_C
935
936	# undef MINBPC
937	# undef BYTE_TYPE
938	# undef BYTE_TO_ASCII
939	# undef CHAR_MATCHES
940	# undef IS_NAME_CHAR
941	# undef IS_NAME_CHAR_MINBPC
942	# undef IS_NMSTRT_CHAR
943	# undef IS_NMSTRT_CHAR_MINBPC
944	# undef IS_INVALID_CHAR
945
946	#endif /* not XML_MIN_SIZE */
947
948	#ifdef XML_NS
949
950	static const struct normal_encoding big2_encoding_ns
951	= {{VTABLE, `2`, `0`,
952	# if BYTEORDER == 4321
953	`1`
954	# else
955	`0`
956	# endif
957	},
958	{
959	# include "asciitab.h"
960	# include "latin1tab.h"
961	},
962	STANDARD_VTABLE(big2_) NULL_VTABLE};
963
964	#endif
965
966	static const struct normal_encoding big2_encoding
967	= {{VTABLE, `2`, `0`,
968	#if BYTEORDER == 4321
969	`1`
970	#else
971	`0`
972	#endif
973	},
974	{
975	#define BT_COLON BT_NMSTRT
976	#include "asciitab.h"
977	#undef BT_COLON
978	#include "latin1tab.h"
979	},
980	STANDARD_VTABLE(big2_) NULL_VTABLE};
981
982	#if BYTEORDER != 1234
983
984	# ifdef XML_NS
985
986	static const struct normal_encoding internal_big2_encoding_ns
987	= {{VTABLE, `2`, `0`, `1`},
988	{
989	# include "iasciitab.h"
990	# include "latin1tab.h"
991	},
992	STANDARD_VTABLE(big2_) NULL_VTABLE};
993
994	# endif
995
996	static const struct normal_encoding internal_big2_encoding
997	= {{VTABLE, `2`, `0`, `1`},
998	{
999	# define BT_COLON BT_NMSTRT
1000	# include "iasciitab.h"
1001	# undef BT_COLON
1002	# include "latin1tab.h"
1003	},
1004	STANDARD_VTABLE(big2_) NULL_VTABLE};
1005
1006	#endif
1007
1008	#undef PREFIX
1009
1010	static int FASTCALL
1011	streqci(const char s1, const* char *s2) {
1012	for (;;) {
1013	char c1 = *s1++;
1014	char c2 = *s2++;
1015	if (ASCII_a <= c1 && c1 <= ASCII_z)
1016	c1 += ASCII_A - ASCII_a;
1017	if (ASCII_a <= c2 && c2 <= ASCII_z)
1018	/ The following line will never get executed. streqci() is*
1019	* only called from two places, both of which guarantee to put
1020	* upper-case strings into s2.
1021	*/
1022	c2 += ASCII_A - ASCII_a; / LCOV_EXCL_LINE /
1023	if (c1 != c2)
1024	return `0`;
1025	if (! c1)
1026	break;
1027	}
1028	return `1`;
1029	}
1030
1031	static void PTRCALL
1032	initUpdatePosition(const ENCODING enc, const* char ptr, const* char *end,
1033	POSITION *pos) {
1034	UNUSED_P(enc);
1035	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036	}
1037
1038	static int
1039	toAscii(const ENCODING enc, const* char ptr, const* char *end) {
1040	char buf[`1`];
1041	char *p = buf;
1042	XmlUtf8Convert(enc, &ptr, end, &p, p + `1`);
1043	if (p == buf)
1044	return -`1`;
1045	else
1046	return buf[`0`];
1047	}
1048
1049	static int FASTCALL
1050	isSpace(int c) {
1051	switch (c) {
1052	case `0x20`:
1053	case `0xD`:
1054	case `0xA`:
1055	case `0x9`:
1056	return `1`;
1057	}
1058	return `0`;
1059	}
1060
1061	/ Return 1 if there's just optional white space or there's an S*
1062	followed by name=val.
1063	*/
1064	static int
1065	parsePseudoAttribute(const ENCODING enc, const* char ptr, const* char *end,
1066	const char *namePtr, const* char **nameEndPtr,
1067	const char *valPtr, const* char **nextTokPtr) {
1068	int c;
1069	char open;
1070	if (ptr == end) {
1071	*namePtr = NULL;
1072	return `1`;
1073	}
1074	if (! isSpace(toAscii(enc, ptr, end))) {
1075	*nextTokPtr = ptr;
1076	return `0`;
1077	}
1078	do {
1079	ptr += enc->minBytesPerChar;
1080	} while (isSpace(toAscii(enc, ptr, end)));
1081	if (ptr == end) {
1082	*namePtr = NULL;
1083	return `1`;
1084	}
1085	*namePtr = ptr;
1086	for (;;) {
1087	c = toAscii(enc, ptr, end);
1088	if (c == -`1`) {
1089	*nextTokPtr = ptr;
1090	return `0`;
1091	}
1092	if (c == ASCII_EQUALS) {
1093	*nameEndPtr = ptr;
1094	break;
1095	}
1096	if (isSpace(c)) {
1097	*nameEndPtr = ptr;
1098	do {
1099	ptr += enc->minBytesPerChar;
1100	} while (isSpace(c = toAscii(enc, ptr, end)));
1101	if (c != ASCII_EQUALS) {
1102	*nextTokPtr = ptr;
1103	return `0`;
1104	}
1105	break;
1106	}
1107	ptr += enc->minBytesPerChar;
1108	}
1109	if (ptr == *namePtr) {
1110	*nextTokPtr = ptr;
1111	return `0`;
1112	}
1113	ptr += enc->minBytesPerChar;
1114	c = toAscii(enc, ptr, end);
1115	while (isSpace(c)) {
1116	ptr += enc->minBytesPerChar;
1117	c = toAscii(enc, ptr, end);
1118	}
1119	if (c != ASCII_QUOT && c != ASCII_APOS) {
1120	*nextTokPtr = ptr;
1121	return `0`;
1122	}
1123	open = (char)c;
1124	ptr += enc->minBytesPerChar;
1125	*valPtr = ptr;
1126	for (;; ptr += enc->minBytesPerChar) {
1127	c = toAscii(enc, ptr, end);
1128	if (c == open)
1129	break;
1130	if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1131	&& ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1132	&& c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1133	*nextTokPtr = ptr;
1134	return `0`;
1135	}
1136	}
1137	*nextTokPtr = ptr + enc->minBytesPerChar;
1138	return `1`;
1139	}
1140
1141	static const char KW_version[]
1142	= {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, `'\0'`};
1143
1144	static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145	ASCII_i, ASCII_n, ASCII_g, `'\0'`};
1146
1147	static const char KW_standalone[]
1148	= {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149	ASCII_l, ASCII_o, ASCII_n, ASCII_e, `'\0'`};
1150
1151	static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, `'\0'`};
1152
1153	static const char KW_no[] = {ASCII_n, ASCII_o, `'\0'`};
1154
1155	static int
1156	doParseXmlDecl(const ENCODING (encodingFinder)(const ENCODING , const* char *,
1157	const char *),
1158	int isGeneralTextEntity, const ENCODING enc, const* char *ptr,
1159	const char end, const* char *badPtr, const* char **versionPtr,
1160	const char *versionEndPtr, const* char **encodingName,
1161	const ENCODING *encoding, int* *standalone) {
1162	const char *val = NULL;
1163	const char *name = NULL;
1164	const char *nameEnd = NULL;
1165	ptr += `5` * enc->minBytesPerChar;
1166	end -= `2` * enc->minBytesPerChar;
1167	if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168	\|\| ! name) {
1169	*badPtr = ptr;
1170	return `0`;
1171	}
1172	if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173	if (! isGeneralTextEntity) {
1174	*badPtr = name;
1175	return `0`;
1176	}
1177	} else {
1178	if (versionPtr)
1179	*versionPtr = val;
1180	if (versionEndPtr)
1181	*versionEndPtr = ptr;
1182	if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183	*badPtr = ptr;
1184	return `0`;
1185	}
1186	if (! name) {
1187	if (isGeneralTextEntity) {
1188	/ a TextDecl must have an EncodingDecl /
1189	*badPtr = ptr;
1190	return `0`;
1191	}
1192	return `1`;
1193	}
1194	}
1195	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196	int c = toAscii(enc, val, end);
1197	if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198	*badPtr = val;
1199	return `0`;
1200	}
1201	if (encodingName)
1202	*encodingName = val;
1203	if (encoding)
1204	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205	if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206	*badPtr = ptr;
1207	return `0`;
1208	}
1209	if (! name)
1210	return `1`;
1211	}
1212	if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213	\|\| isGeneralTextEntity) {
1214	*badPtr = name;
1215	return `0`;
1216	}
1217	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218	if (standalone)
1219	*standalone = `1`;
1220	} else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221	if (standalone)
1222	*standalone = `0`;
1223	} else {
1224	*badPtr = val;
1225	return `0`;
1226	}
1227	while (isSpace(toAscii(enc, ptr, end)))
1228	ptr += enc->minBytesPerChar;
1229	if (ptr != end) {
1230	*badPtr = ptr;
1231	return `0`;
1232	}
1233	return `1`;
1234	}
1235
1236	static int FASTCALL
1237	checkCharRefNumber(int result) {
1238	switch (result >> `8`) {
1239	case `0xD8`:
1240	case `0xD9`:
1241	case `0xDA`:
1242	case `0xDB`:
1243	case `0xDC`:
1244	case `0xDD`:
1245	case `0xDE`:
1246	case `0xDF`:
1247	return -`1`;
1248	case `0`:
1249	if (latin1_encoding.type[result] == BT_NONXML)
1250	return -`1`;
1251	break;
1252	case `0xFF`:
1253	if (result == `0xFFFE` \|\| result == `0xFFFF`)
1254	return -`1`;
1255	break;
1256	}
1257	return result;
1258	}
1259
1260	int FASTCALL
1261	XmlUtf8Encode(int c, char *buf) {
1262	enum {
1263	/ minN is minimum legal resulting value for N byte sequence /
1264	min2 = `0x80`,
1265	min3 = `0x800`,
1266	min4 = `0x10000`
1267	};
1268
1269	if (c < `0`)
1270	return `0`; / LCOV_EXCL_LINE: this case is always eliminated beforehand /
1271	if (c < min2) {
1272	buf[`0`] = (char)(c \| UTF8_cval1);
1273	return `1`;
1274	}
1275	if (c < min3) {
1276	buf[`0`] = (char)((c >> `6`) \| UTF8_cval2);
1277	buf[`1`] = (char)((c & `0x3f`) \| `0x80`);
1278	return `2`;
1279	}
1280	if (c < min4) {
1281	buf[`0`] = (char)((c >> `12`) \| UTF8_cval3);
1282	buf[`1`] = (char)(((c >> `6`) & `0x3f`) \| `0x80`);
1283	buf[`2`] = (char)((c & `0x3f`) \| `0x80`);
1284	return `3`;
1285	}
1286	if (c < `0x110000`) {
1287	buf[`0`] = (char)((c >> `18`) \| UTF8_cval4);
1288	buf[`1`] = (char)(((c >> `12`) & `0x3f`) \| `0x80`);
1289	buf[`2`] = (char)(((c >> `6`) & `0x3f`) \| `0x80`);
1290	buf[`3`] = (char)((c & `0x3f`) \| `0x80`);
1291	return `4`;
1292	}
1293	return `0`; / LCOV_EXCL_LINE: this case too is eliminated before calling /
1294	}
1295
1296	int FASTCALL
1297	XmlUtf16Encode(int charNum, unsigned short *buf) {
1298	if (charNum < `0`)
1299	return `0`;
1300	if (charNum < `0x10000`) {
1301	buf[`0`] = (unsigned short)charNum;
1302	return `1`;
1303	}
1304	if (charNum < `0x110000`) {
1305	charNum -= `0x10000`;
1306	buf[`0`] = (unsigned short)((charNum >> `10`) + `0xD800`);
1307	buf[`1`] = (unsigned short)((charNum & `0x3FF`) + `0xDC00`);
1308	return `2`;
1309	}
1310	return `0`;
1311	}
1312
1313	struct unknown_encoding {
1314	struct normal_encoding normal;
1315	CONVERTER convert;
1316	void *userData;
1317	unsigned short utf16[`256`];
1318	char utf8[`256`][`4`];
1319	};
1320
1321	#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322
1323	int
1324	XmlSizeOfUnknownEncoding(void) {
1325	return sizeof(struct unknown_encoding);
1326	}
1327
1328	static int PTRFASTCALL
1329	unknown_isName(const ENCODING enc, const* char *p) {
1330	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331	int c = uenc->convert(uenc->userData, p);
1332	if (c & ~`0xFFFF`)
1333	return `0`;
1334	return UCS2_GET_NAMING(namePages, c >> `8`, c & `0xFF`);
1335	}
1336
1337	static int PTRFASTCALL
1338	unknown_isNmstrt(const ENCODING enc, const* char *p) {
1339	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340	int c = uenc->convert(uenc->userData, p);
1341	if (c & ~`0xFFFF`)
1342	return `0`;
1343	return UCS2_GET_NAMING(nmstrtPages, c >> `8`, c & `0xFF`);
1344	}
1345
1346	static int PTRFASTCALL
1347	unknown_isInvalid(const ENCODING enc, const* char *p) {
1348	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349	int c = uenc->convert(uenc->userData, p);
1350	return (c & ~`0xFFFF`) \|\| checkCharRefNumber(c) < `0`;
1351	}
1352
1353	static enum XML_Convert_Result PTRCALL
1354	unknown_toUtf8(const ENCODING enc, const* char *fromP, const* char *fromLim,
1355	char *toP, const* char *toLim) {
1356	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357	char buf[XML_UTF8_ENCODE_MAX];
1358	for (;;) {
1359	const char *utf8;
1360	int n;
1361	if (*fromP == fromLim)
1362	return XML_CONVERT_COMPLETED;
1363	utf8 = uenc->utf8[(unsigned char)**fromP];
1364	n = *utf8++;
1365	if (n == `0`) {
1366	int c = uenc->convert(uenc->userData, *fromP);
1367	n = XmlUtf8Encode(c, buf);
1368	if (n > toLim - *toP)
1369	return XML_CONVERT_OUTPUT_EXHAUSTED;
1370	utf8 = buf;
1371	fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned* char)**fromP]
1372	- (BT_LEAD2 - `2`));
1373	} else {
1374	if (n > toLim - *toP)
1375	return XML_CONVERT_OUTPUT_EXHAUSTED;
1376	(*fromP)++;
1377	}
1378	memcpy(*toP, utf8, n);
1379	*toP += n;
1380	}
1381	}
1382
1383	static enum XML_Convert_Result PTRCALL
1384	unknown_toUtf16(const ENCODING enc, const* char *fromP, const* char *fromLim,
1385	unsigned short *toP, const* unsigned short *toLim) {
1386	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387	while (fromP < fromLim && toP < toLim) {
1388	unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389	if (c == `0`) {
1390	c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391	fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned* char)**fromP]
1392	- (BT_LEAD2 - `2`));
1393	} else
1394	(*fromP)++;
1395	(toP)++ = c;
1396	}
1397
1398	if ((toP == toLim) && (fromP < fromLim))
1399	return XML_CONVERT_OUTPUT_EXHAUSTED;
1400	else
1401	return XML_CONVERT_COMPLETED;
1402	}
1403
1404	ENCODING *
1405	XmlInitUnknownEncoding(void mem, int* *table, CONVERTER convert,
1406	void *userData) {
1407	int i;
1408	struct unknown_encoding e = (struct* unknown_encoding *)mem;
1409	memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410	for (i = `0`; i < `128`; i++)
1411	if (latin1_encoding.type[i] != BT_OTHER
1412	&& latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413	return `0`;
1414	for (i = `0`; i < `256`; i++) {
1415	int c = table[i];
1416	if (c == -`1`) {
1417	e->normal.type[i] = BT_MALFORM;
1418	/ This shouldn't really get used. /
1419	e->utf16[i] = `0xFFFF`;
1420	e->utf8[i][`0`] = `1`;
1421	e->utf8[i][`1`] = `0`;
1422	} else if (c < `0`) {
1423	if (c < -`4`)
1424	return `0`;
1425	/ Multi-byte sequences need a converter function /
1426	if (! convert)
1427	return `0`;
1428	e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + `2`));
1429	e->utf8[i][`0`] = `0`;
1430	e->utf16[i] = `0`;
1431	} else if (c < `0x80`) {
1432	if (latin1_encoding.type[c] != BT_OTHER
1433	&& latin1_encoding.type[c] != BT_NONXML && c != i)
1434	return `0`;
1435	e->normal.type[i] = latin1_encoding.type[c];
1436	e->utf8[i][`0`] = `1`;
1437	e->utf8[i][`1`] = (char)c;
1438	e->utf16[i] = (unsigned short)(c == `0` ? `0xFFFF` : c);
1439	} else if (checkCharRefNumber(c) < `0`) {
1440	e->normal.type[i] = BT_NONXML;
1441	/ This shouldn't really get used. /
1442	e->utf16[i] = `0xFFFF`;
1443	e->utf8[i][`0`] = `1`;
1444	e->utf8[i][`1`] = `0`;
1445	} else {
1446	if (c > `0xFFFF`)
1447	return `0`;
1448	if (UCS2_GET_NAMING(nmstrtPages, c >> `8`, c & `0xff`))
1449	e->normal.type[i] = BT_NMSTRT;
1450	else if (UCS2_GET_NAMING(namePages, c >> `8`, c & `0xff`))
1451	e->normal.type[i] = BT_NAME;
1452	else
1453	e->normal.type[i] = BT_OTHER;
1454	e->utf8[i][`0`] = (char)XmlUtf8Encode(c, e->utf8[i] + `1`);
1455	e->utf16[i] = (unsigned short)c;
1456	}
1457	}
1458	e->userData = userData;
1459	e->convert = convert;
1460	if (convert) {
1461	e->normal.isName2 = unknown_isName;
1462	e->normal.isName3 = unknown_isName;
1463	e->normal.isName4 = unknown_isName;
1464	e->normal.isNmstrt2 = unknown_isNmstrt;
1465	e->normal.isNmstrt3 = unknown_isNmstrt;
1466	e->normal.isNmstrt4 = unknown_isNmstrt;
1467	e->normal.isInvalid2 = unknown_isInvalid;
1468	e->normal.isInvalid3 = unknown_isInvalid;
1469	e->normal.isInvalid4 = unknown_isInvalid;
1470	}
1471	e->normal.enc.utf8Convert = unknown_toUtf8;
1472	e->normal.enc.utf16Convert = unknown_toUtf16;
1473	return &(e->normal.enc);
1474	}
1475
/* If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1489
1490	static const char KW_ISO_8859_1[]
1491	= {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1492	ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, `'\0'`};
1493	static const char KW_US_ASCII[]
1494	= {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495	ASCII_C, ASCII_I, ASCII_I, `'\0'`};
1496	static const char KW_UTF_8[]
1497	= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, `'\0'`};
1498	static const char KW_UTF_16[]
1499	= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, `'\0'`};
1500	static const char KW_UTF_16BE[]
1501	= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502	ASCII_6, ASCII_B, ASCII_E, `'\0'`};
1503	static const char KW_UTF_16LE[]
1504	= {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505	ASCII_6, ASCII_L, ASCII_E, `'\0'`};
1506
1507	static int FASTCALL
1508	getEncodingIndex(const char *name) {
1509	static const char *const encodingNames[] = {
1510	KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511	};
1512	int i;
1513	if (name == NULL)
1514	return NO_ENC;
1515	for (i = `0`; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[`0`])); i++)
1516	if (streqci(name, encodingNames[i]))
1517	return i;
1518	return UNKNOWN_ENC;
1519	}
1520
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* i is parenthesized so that expression arguments are cast as a whole */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527
1528	/ This is what detects the encoding. encodingTable maps from*
1529	encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1530	the external (protocol) specified encoding; state is
1531	XML_CONTENT_STATE if we're parsing an external text entity, and
1532	XML_PROLOG_STATE otherwise.
1533	*/
1534
1535	static int
1536	initScan(const ENCODING *const encodingTable, const* INIT_ENCODING *enc,
1537	int state, const char ptr, const* char end, const* char **nextTokPtr) {
1538	const ENCODING **encPtr;
1539
1540	if (ptr >= end)
1541	return XML_TOK_NONE;
1542	encPtr = enc->encPtr;
1543	if (ptr + `1` == end) {
1544	/ only a single byte available for auto-detection /
1545	#ifndef XML_DTD /* FIXME */
1546	/ a well-formed document entity must have more than one byte /
1547	if (state != XML_CONTENT_STATE)
1548	return XML_TOK_PARTIAL;
1549	#endif
1550	/ so we're parsing an external text entity... /
1551	/ if UTF-16 was externally specified, then we need at least 2 bytes /
1552	switch (INIT_ENC_INDEX(enc)) {
1553	case UTF_16_ENC:
1554	case UTF_16LE_ENC:
1555	case UTF_16BE_ENC:
1556	return XML_TOK_PARTIAL;
1557	}
1558	switch ((unsigned char)*ptr) {
1559	case `0xFE`:
1560	case `0xFF`:
1561	case `0xEF`: / possibly first byte of UTF-8 BOM /
1562	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563	break;
1564	/ fall through /
1565	case `0x00`:
1566	case `0x3C`:
1567	return XML_TOK_PARTIAL;
1568	}
1569	} else {
1570	switch (((unsigned char)ptr[`0`] << `8`) \| (unsigned char)ptr[`1`]) {
1571	case `0xFEFF`:
1572	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573	break;
1574	*nextTokPtr = ptr + `2`;
1575	*encPtr = encodingTable[UTF_16BE_ENC];
1576	return XML_TOK_BOM;
1577	/ 00 3C is handled in the default case /
1578	case `0x3C00`:
1579	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581	&& state == XML_CONTENT_STATE)
1582	break;
1583	*encPtr = encodingTable[UTF_16LE_ENC];
1584	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585	case `0xFFFE`:
1586	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587	break;
1588	*nextTokPtr = ptr + `2`;
1589	*encPtr = encodingTable[UTF_16LE_ENC];
1590	return XML_TOK_BOM;
1591	case `0xEFBB`:
1592	/ Maybe a UTF-8 BOM (EF BB BF) /
1593	/ If there's an explicitly specified (external) encoding*
1594	of ISO-8859-1 or some flavour of UTF-16
1595	and this is an external text entity,
1596	don't look for the BOM,
1597	because it might be a legal data.
1598	*/
1599	if (state == XML_CONTENT_STATE) {
1600	int e = INIT_ENC_INDEX(enc);
1601	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC
1602	\|\| e == UTF_16_ENC)
1603	break;
1604	}
1605	if (ptr + `2` == end)
1606	return XML_TOK_PARTIAL;
1607	if ((unsigned char)ptr[`2`] == `0xBF`) {
1608	*nextTokPtr = ptr + `3`;
1609	*encPtr = encodingTable[UTF_8_ENC];
1610	return XML_TOK_BOM;
1611	}
1612	break;
1613	default:
1614	if (ptr[`0`] == `'\0'`) {
1615	/ 0 isn't a legal data character. Furthermore a document*
1616	entity can only start with ASCII characters. So the only
1617	way this can fail to be big-endian UTF-16 if it it's an
1618	external parsed general entity that's labelled as
1619	UTF-16LE.
1620	*/
1621	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622	break;
1623	*encPtr = encodingTable[UTF_16BE_ENC];
1624	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625	} else if (ptr[`1`] == `'\0'`) {
1626	/ We could recover here in the case:*
1627	- parsing an external entity
1628	- second byte is 0
1629	- no externally specified encoding
1630	- no encoding declaration
1631	by assuming UTF-16LE. But we don't, because this would mean when
1632	presented just with a single byte, we couldn't reliably determine
1633	whether we needed further bytes.
1634	*/
1635	if (state == XML_CONTENT_STATE)
1636	break;
1637	*encPtr = encodingTable[UTF_16LE_ENC];
1638	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639	}
1640	break;
1641	}
1642	}
1643	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645	}
1646
1647	#define NS(x) x
1648	#define ns(x) x
1649	#define XML_TOK_NS_C
1650	#include "xmltok_ns.c"
1651	#undef XML_TOK_NS_C
1652	#undef NS
1653	#undef ns
1654
#ifdef XML_NS

/* Instantiate the xmltok_ns.c template a second time, with identifiers
   suffixed by NS / _ns. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns

/* Same as XmlInitUnknownEncoding(), but on success additionally marks
   the colon as BT_COLON in the byte type table.
   Returns NULL if XmlInitUnknownEncoding() rejects the arguments.
*/
ENCODING *
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
                         void *userData) {
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  if (enc)
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  return enc;
}

#endif /* XML_NS */
1677

Browse the source code of python/Modules/expat/xmltok.c