/* codecs.h source code [python/Objects/stringlib/codecs.h] */

1	/ stringlib: codec implementations /
2
3	#if !STRINGLIB_IS_UNICODE
4	# error "codecs.h is specific to Unicode"
5	#endif
6
7	#include "pycore_bitutils.h" // _Py_bswap32()
8
9	/ Mask to quickly check whether a C 'size_t' contains a*
10	non-ASCII, UTF8-encoded char. /*
11	#if (SIZEOF_SIZE_T == 8)
12	# define ASCII_CHAR_MASK 0x8080808080808080ULL
13	#elif (SIZEOF_SIZE_T == 4)
14	# define ASCII_CHAR_MASK 0x80808080U
15	#else
16	# error C 'size_t' size should be either 4 or 8!
17	#endif
18
19	/ 10xxxxxx /
20	#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
21
22	Py_LOCAL_INLINE(Py_UCS4)
23	STRINGLIB(utf8_decode)(const char *inptr, const* char *end,
24	STRINGLIB_CHAR *dest,
25	Py_ssize_t *outpos)
26	{
27	Py_UCS4 ch;
28	const char s = inptr;
29	STRINGLIB_CHAR p = dest + outpos;
30
31	while (s < end) {
32	ch = (unsigned char)*s;
33
34	if (ch < `0x80`) {
35	/ Fast path for runs of ASCII characters. Given that common UTF-8*
36	input will consist of an overwhelming majority of ASCII
37	characters, we try to optimize for this case by checking
38	as many characters as a C 'size_t' can contain.
39	First, check if we can do an aligned read, as most CPUs have
40	a penalty for unaligned reads.
41	*/
42	if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
43	/ Help register allocation /
44	const char *_s = s;
45	STRINGLIB_CHAR *_p = p;
46	while (_s + SIZEOF_SIZE_T <= end) {
47	/ Read a whole size_t at a time (either 4 or 8 bytes),*
48	and do a fast unrolled copy if it only contains ASCII
49	characters. /*
50	size_t value = (const* size_t *) _s;
51	if (value & ASCII_CHAR_MASK)
52	break;
53	#if PY_LITTLE_ENDIAN
54	_p[`0`] = (STRINGLIB_CHAR)(value & `0xFFu`);
55	_p[`1`] = (STRINGLIB_CHAR)((value >> `8`) & `0xFFu`);
56	_p[`2`] = (STRINGLIB_CHAR)((value >> `16`) & `0xFFu`);
57	_p[`3`] = (STRINGLIB_CHAR)((value >> `24`) & `0xFFu`);
58	# if SIZEOF_SIZE_T == 8
59	_p[`4`] = (STRINGLIB_CHAR)((value >> `32`) & `0xFFu`);
60	_p[`5`] = (STRINGLIB_CHAR)((value >> `40`) & `0xFFu`);
61	_p[`6`] = (STRINGLIB_CHAR)((value >> `48`) & `0xFFu`);
62	_p[`7`] = (STRINGLIB_CHAR)((value >> `56`) & `0xFFu`);
63	# endif
64	#else
65	# if SIZEOF_SIZE_T == 8
66	_p[`0`] = (STRINGLIB_CHAR)((value >> `56`) & `0xFFu`);
67	_p[`1`] = (STRINGLIB_CHAR)((value >> `48`) & `0xFFu`);
68	_p[`2`] = (STRINGLIB_CHAR)((value >> `40`) & `0xFFu`);
69	_p[`3`] = (STRINGLIB_CHAR)((value >> `32`) & `0xFFu`);
70	_p[`4`] = (STRINGLIB_CHAR)((value >> `24`) & `0xFFu`);
71	_p[`5`] = (STRINGLIB_CHAR)((value >> `16`) & `0xFFu`);
72	_p[`6`] = (STRINGLIB_CHAR)((value >> `8`) & `0xFFu`);
73	_p[`7`] = (STRINGLIB_CHAR)(value & `0xFFu`);
74	# else
75	_p[`0`] = (STRINGLIB_CHAR)((value >> `24`) & `0xFFu`);
76	_p[`1`] = (STRINGLIB_CHAR)((value >> `16`) & `0xFFu`);
77	_p[`2`] = (STRINGLIB_CHAR)((value >> `8`) & `0xFFu`);
78	_p[`3`] = (STRINGLIB_CHAR)(value & `0xFFu`);
79	# endif
80	#endif
81	_s += SIZEOF_SIZE_T;
82	_p += SIZEOF_SIZE_T;
83	}
84	s = _s;
85	p = _p;
86	if (s == end)
87	break;
88	ch = (unsigned char)*s;
89	}
90	if (ch < `0x80`) {
91	s++;
92	*p++ = ch;
93	continue;
94	}
95	}
96
97	if (ch < `0xE0`) {
98	/ \xC2\x80-\xDF\xBF -- 0080-07FF /
99	Py_UCS4 ch2;
100	if (ch < `0xC2`) {
101	/ invalid sequence*
102	\x80-\xBF -- continuation byte
103	\xC0-\xC1 -- fake 0000-007F /*
104	goto InvalidStart;
105	}
106	if (end - s < `2`) {
107	/ unexpected end of data: the caller will decide whether*
108	it's an error or not /*
109	break;
110	}
111	ch2 = (unsigned char)s[`1`];
112	if (!IS_CONTINUATION_BYTE(ch2))
113	/ invalid continuation byte /
114	goto InvalidContinuation1;
115	ch = (ch << `6`) + ch2 -
116	((`0xC0` << `6`) + `0x80`);
117	assert ((ch > `0x007F`) && (ch <= `0x07FF`));
118	s += `2`;
119	if (STRINGLIB_MAX_CHAR <= `0x007F` \|\|
120	(STRINGLIB_MAX_CHAR < `0x07FF` && ch > STRINGLIB_MAX_CHAR))
121	/ Out-of-range /
122	goto Return;
123	*p++ = ch;
124	continue;
125	}
126
127	if (ch < `0xF0`) {
128	/ \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF /
129	Py_UCS4 ch2, ch3;
130	if (end - s < `3`) {
131	/ unexpected end of data: the caller will decide whether*
132	it's an error or not /*
133	if (end - s < `2`)
134	break;
135	ch2 = (unsigned char)s[`1`];
136	if (!IS_CONTINUATION_BYTE(ch2) \|\|
137	(ch2 < `0xA0` ? ch == `0xE0` : ch == `0xED`))
138	/ for clarification see comments below /
139	goto InvalidContinuation1;
140	break;
141	}
142	ch2 = (unsigned char)s[`1`];
143	ch3 = (unsigned char)s[`2`];
144	if (!IS_CONTINUATION_BYTE(ch2)) {
145	/ invalid continuation byte /
146	goto InvalidContinuation1;
147	}
148	if (ch == `0xE0`) {
149	if (ch2 < `0xA0`)
150	/ invalid sequence*
151	\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 /*
152	goto InvalidContinuation1;
153	} else if (ch == `0xED` && ch2 >= `0xA0`) {
154	/ Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF*
155	will result in surrogates in range D800-DFFF. Surrogates are
156	not valid UTF-8 so they are rejected.
157	See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
158	(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt /*
159	goto InvalidContinuation1;
160	}
161	if (!IS_CONTINUATION_BYTE(ch3)) {
162	/ invalid continuation byte /
163	goto InvalidContinuation2;
164	}
165	ch = (ch << `12`) + (ch2 << `6`) + ch3 -
166	((`0xE0` << `12`) + (`0x80` << `6`) + `0x80`);
167	assert ((ch > `0x07FF`) && (ch <= `0xFFFF`));
168	s += `3`;
169	if (STRINGLIB_MAX_CHAR <= `0x07FF` \|\|
170	(STRINGLIB_MAX_CHAR < `0xFFFF` && ch > STRINGLIB_MAX_CHAR))
171	/ Out-of-range /
172	goto Return;
173	*p++ = ch;
174	continue;
175	}
176
177	if (ch < `0xF5`) {
178	/ \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF /
179	Py_UCS4 ch2, ch3, ch4;
180	if (end - s < `4`) {
181	/ unexpected end of data: the caller will decide whether*
182	it's an error or not /*
183	if (end - s < `2`)
184	break;
185	ch2 = (unsigned char)s[`1`];
186	if (!IS_CONTINUATION_BYTE(ch2) \|\|
187	(ch2 < `0x90` ? ch == `0xF0` : ch == `0xF4`))
188	/ for clarification see comments below /
189	goto InvalidContinuation1;
190	if (end - s < `3`)
191	break;
192	ch3 = (unsigned char)s[`2`];
193	if (!IS_CONTINUATION_BYTE(ch3))
194	goto InvalidContinuation2;
195	break;
196	}
197	ch2 = (unsigned char)s[`1`];
198	ch3 = (unsigned char)s[`2`];
199	ch4 = (unsigned char)s[`3`];
200	if (!IS_CONTINUATION_BYTE(ch2)) {
201	/ invalid continuation byte /
202	goto InvalidContinuation1;
203	}
204	if (ch == `0xF0`) {
205	if (ch2 < `0x90`)
206	/ invalid sequence*
207	\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF /*
208	goto InvalidContinuation1;
209	} else if (ch == `0xF4` && ch2 >= `0x90`) {
210	/ invalid sequence*
211	\xF4\x90\x80\x80- -- 110000- overflow /*
212	goto InvalidContinuation1;
213	}
214	if (!IS_CONTINUATION_BYTE(ch3)) {
215	/ invalid continuation byte /
216	goto InvalidContinuation2;
217	}
218	if (!IS_CONTINUATION_BYTE(ch4)) {
219	/ invalid continuation byte /
220	goto InvalidContinuation3;
221	}
222	ch = (ch << `18`) + (ch2 << `12`) + (ch3 << `6`) + ch4 -
223	((`0xF0` << `18`) + (`0x80` << `12`) + (`0x80` << `6`) + `0x80`);
224	assert ((ch > `0xFFFF`) && (ch <= `0x10FFFF`));
225	s += `4`;
226	if (STRINGLIB_MAX_CHAR <= `0xFFFF` \|\|
227	(STRINGLIB_MAX_CHAR < `0x10FFFF` && ch > STRINGLIB_MAX_CHAR))
228	/ Out-of-range /
229	goto Return;
230	*p++ = ch;
231	continue;
232	}
233	goto InvalidStart;
234	}
235	ch = `0`;
236	Return:
237	*inptr = s;
238	*outpos = p - dest;
239	return ch;
240	InvalidStart:
241	ch = `1`;
242	goto Return;
243	InvalidContinuation1:
244	ch = `2`;
245	goto Return;
246	InvalidContinuation2:
247	ch = `3`;
248	goto Return;
249	InvalidContinuation3:
250	ch = `4`;
251	goto Return;
252	}
253
254	#undef ASCII_CHAR_MASK
255
256
257	/ UTF-8 encoder specialized for a Unicode kind to avoid the slow*
258	PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
259	UCS-1 strings don't need to handle surrogates for example. /*
260	Py_LOCAL_INLINE(char *)
261	STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
262	PyObject *unicode,
263	const STRINGLIB_CHAR *data,
264	Py_ssize_t size,
265	_Py_error_handler error_handler,
266	const char *errors)
267	{
268	Py_ssize_t i; / index into data of next input character /
269	char p; /* next free byte in output buffer /
270	#if STRINGLIB_SIZEOF_CHAR > 1
271	PyObject *error_handler_obj = NULL;
272	PyObject *exc = NULL;
273	PyObject *rep = NULL;
274	#endif
275	#if STRINGLIB_SIZEOF_CHAR == 1
276	const Py_ssize_t max_char_size = `2`;
277	#elif STRINGLIB_SIZEOF_CHAR == 2
278	const Py_ssize_t max_char_size = `3`;
279	#else /* STRINGLIB_SIZEOF_CHAR == 4 */
280	const Py_ssize_t max_char_size = `4`;
281	#endif
282
283	assert(size >= `0`);
284	if (size > PY_SSIZE_T_MAX / max_char_size) {
285	/ integer overflow /
286	PyErr_NoMemory();
287	return NULL;
288	}
289
290	_PyBytesWriter_Init(writer);
291	p = _PyBytesWriter_Alloc(writer, size * max_char_size);
292	if (p == NULL)
293	return NULL;
294
295	for (i = `0`; i < size;) {
296	Py_UCS4 ch = data[i++];
297
298	if (ch < `0x80`) {
299	/ Encode ASCII /
300	p++ = (char*) ch;
301
302	}
303	else
304	#if STRINGLIB_SIZEOF_CHAR > 1
305	if (ch < `0x0800`)
306	#endif
307	{
308	/ Encode Latin-1 /
309	p++ = (char*)(`0xc0` \| (ch >> `6`));
310	p++ = (char*)(`0x80` \| (ch & `0x3f`));
311	}
312	#if STRINGLIB_SIZEOF_CHAR > 1
313	else if (Py_UNICODE_IS_SURROGATE(ch)) {
314	Py_ssize_t startpos, endpos, newpos;
315	Py_ssize_t k;
316	if (error_handler == _Py_ERROR_UNKNOWN) {
317	error_handler = _Py_GetErrorHandler(errors);
318	}
319
320	startpos = i-`1`;
321	endpos = startpos+`1`;
322
323	while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
324	endpos++;
325
326	/ Only overallocate the buffer if it's not the last write /
327	writer->overallocate = (endpos < size);
328
329	switch (error_handler)
330	{
331	case _Py_ERROR_REPLACE:
332	memset(p, `'?'`, endpos - startpos);
333	p += (endpos - startpos);
334	/ fall through /
335	case _Py_ERROR_IGNORE:
336	i += (endpos - startpos - `1`);
337	break;
338
339	case _Py_ERROR_SURROGATEPASS:
340	for (k=startpos; k<endpos; k++) {
341	ch = data[k];
342	p++ = (char*)(`0xe0` \| (ch >> `12`));
343	p++ = (char*)(`0x80` \| ((ch >> `6`) & `0x3f`));
344	p++ = (char*)(`0x80` \| (ch & `0x3f`));
345	}
346	i += (endpos - startpos - `1`);
347	break;
348
349	case _Py_ERROR_BACKSLASHREPLACE:
350	/ subtract preallocated bytes /
351	writer->min_size -= max_char_size * (endpos - startpos);
352	p = backslashreplace(writer, p,
353	unicode, startpos, endpos);
354	if (p == NULL)
355	goto error;
356	i += (endpos - startpos - `1`);
357	break;
358
359	case _Py_ERROR_XMLCHARREFREPLACE:
360	/ subtract preallocated bytes /
361	writer->min_size -= max_char_size * (endpos - startpos);
362	p = xmlcharrefreplace(writer, p,
363	unicode, startpos, endpos);
364	if (p == NULL)
365	goto error;
366	i += (endpos - startpos - `1`);
367	break;
368
369	case _Py_ERROR_SURROGATEESCAPE:
370	for (k=startpos; k<endpos; k++) {
371	ch = data[k];
372	if (!(`0xDC80` <= ch && ch <= `0xDCFF`))
373	break;
374	p++ = (char*)(ch & `0xff`);
375	}
376	if (k >= endpos) {
377	i += (endpos - startpos - `1`);
378	break;
379	}
380	startpos = k;
381	assert(startpos < endpos);
382	/ fall through /
383	default:
384	rep = unicode_encode_call_errorhandler(
385	errors, &error_handler_obj, "utf-8", "surrogates not allowed",
386	unicode, &exc, startpos, endpos, &newpos);
387	if (!rep)
388	goto error;
389
390	if (newpos < startpos) {
391	writer->overallocate = `1`;
392	p = _PyBytesWriter_Prepare(writer, p,
393	max_char_size * (startpos - newpos));
394	if (p == NULL)
395	goto error;
396	}
397	else {
398	/ subtract preallocated bytes /
399	writer->min_size -= max_char_size * (newpos - startpos);
400	/ Only overallocate the buffer if it's not the last write /
401	writer->overallocate = (newpos < size);
402	}
403
404	if (PyBytes_Check(rep)) {
405	p = _PyBytesWriter_WriteBytes(writer, p,
406	PyBytes_AS_STRING(rep),
407	PyBytes_GET_SIZE(rep));
408	}
409	else {
410	/ rep is unicode /
411	if (PyUnicode_READY(rep) < `0`)
412	goto error;
413
414	if (!PyUnicode_IS_ASCII(rep)) {
415	raise_encode_exception(&exc, "utf-8", unicode,
416	startpos, endpos,
417	"surrogates not allowed");
418	goto error;
419	}
420
421	p = _PyBytesWriter_WriteBytes(writer, p,
422	PyUnicode_DATA(rep),
423	PyUnicode_GET_LENGTH(rep));
424	}
425
426	if (p == NULL)
427	goto error;
428	Py_CLEAR(rep);
429
430	i = newpos;
431	}
432
433	/ If overallocation was disabled, ensure that it was the last*
434	write. Otherwise, we missed an optimization /*
435	assert(writer->overallocate \|\| i == size);
436	}
437	else
438	#if STRINGLIB_SIZEOF_CHAR > 2
439	if (ch < `0x10000`)
440	#endif
441	{
442	p++ = (char*)(`0xe0` \| (ch >> `12`));
443	p++ = (char*)(`0x80` \| ((ch >> `6`) & `0x3f`));
444	p++ = (char*)(`0x80` \| (ch & `0x3f`));
445	}
446	#if STRINGLIB_SIZEOF_CHAR > 2
447	else / ch >= 0x10000 /
448	{
449	assert(ch <= MAX_UNICODE);
450	/ Encode UCS4 Unicode ordinals /
451	p++ = (char*)(`0xf0` \| (ch >> `18`));
452	p++ = (char*)(`0x80` \| ((ch >> `12`) & `0x3f`));
453	p++ = (char*)(`0x80` \| ((ch >> `6`) & `0x3f`));
454	p++ = (char*)(`0x80` \| (ch & `0x3f`));
455	}
456	#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
457	#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
458	}
459
460	#if STRINGLIB_SIZEOF_CHAR > 1
461	Py_XDECREF(error_handler_obj);
462	Py_XDECREF(exc);
463	#endif
464	return p;
465
466	#if STRINGLIB_SIZEOF_CHAR > 1
467	error:
468	Py_XDECREF(rep);
469	Py_XDECREF(error_handler_obj);
470	Py_XDECREF(exc);
471	return NULL;
472	#endif
473	}
474
475	/ The pattern for constructing UCS2-repeated masks. /
476	#if SIZEOF_LONG == 8
477	# define UCS2_REPEAT_MASK 0x0001000100010001ul
478	#elif SIZEOF_LONG == 4
479	# define UCS2_REPEAT_MASK 0x00010001ul
480	#else
481	# error C 'long' size should be either 4 or 8!
482	#endif
483
484	/ The mask for fast checking. /
485	#if STRINGLIB_SIZEOF_CHAR == 1
486	/ The mask for fast checking of whether a C 'long' contains a*
487	non-ASCII or non-Latin1 UTF16-encoded characters. /*
488	# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
489	#else
490	/ The mask for fast checking of whether a C 'long' may contain*
491	UTF16-encoded surrogate characters. This is an efficient heuristic,
492	assuming that non-surrogate characters with a code point >= 0x8000 are
493	rare in most input.
494	*/
495	# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
496	#endif
497	/ The mask for fast byte-swapping. /
498	#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
499	/ Swap bytes. /
500	#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) \| \
501	(((value) & STRIPPED_MASK) << 8))
502
503	Py_LOCAL_INLINE(Py_UCS4)
504	STRINGLIB(utf16_decode)(const unsigned char *inptr, const* unsigned char *e,
505	STRINGLIB_CHAR dest, Py_ssize_t outpos,
506	int native_ordering)
507	{
508	Py_UCS4 ch;
509	const unsigned char q = inptr;
510	STRINGLIB_CHAR p = dest + outpos;
511	/ Offsets from q for retrieving byte pairs in the right order. /
512	#if PY_LITTLE_ENDIAN
513	int ihi = !!native_ordering, ilo = !native_ordering;
514	#else
515	int ihi = !native_ordering, ilo = !!native_ordering;
516	#endif
517	--e;
518
519	while (q < e) {
520	Py_UCS4 ch2;
521	/ First check for possible aligned read of a C 'long'. Unaligned*
522	reads are more expensive, better to defer to another iteration. /*
523	if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
524	/ Fast path for runs of in-range non-surrogate chars. /
525	const unsigned char *_q = q;
526	while (_q + SIZEOF_LONG <= e) {
527	unsigned long block = * (const unsigned long *) _q;
528	if (native_ordering) {
529	/ Can use buffer directly /
530	if (block & FAST_CHAR_MASK)
531	break;
532	}
533	else {
534	/ Need to byte-swap /
535	if (block & SWAB(FAST_CHAR_MASK))
536	break;
537	#if STRINGLIB_SIZEOF_CHAR == 1
538	block >>= `8`;
539	#else
540	block = SWAB(block);
541	#endif
542	}
543	#if PY_LITTLE_ENDIAN
544	# if SIZEOF_LONG == 4
545	p[`0`] = (STRINGLIB_CHAR)(block & `0xFFFFu`);
546	p[`1`] = (STRINGLIB_CHAR)(block >> `16`);
547	# elif SIZEOF_LONG == 8
548	p[`0`] = (STRINGLIB_CHAR)(block & `0xFFFFu`);
549	p[`1`] = (STRINGLIB_CHAR)((block >> `16`) & `0xFFFFu`);
550	p[`2`] = (STRINGLIB_CHAR)((block >> `32`) & `0xFFFFu`);
551	p[`3`] = (STRINGLIB_CHAR)(block >> `48`);
552	# endif
553	#else
554	# if SIZEOF_LONG == 4
555	p[`0`] = (STRINGLIB_CHAR)(block >> `16`);
556	p[`1`] = (STRINGLIB_CHAR)(block & `0xFFFFu`);
557	# elif SIZEOF_LONG == 8
558	p[`0`] = (STRINGLIB_CHAR)(block >> `48`);
559	p[`1`] = (STRINGLIB_CHAR)((block >> `32`) & `0xFFFFu`);
560	p[`2`] = (STRINGLIB_CHAR)((block >> `16`) & `0xFFFFu`);
561	p[`3`] = (STRINGLIB_CHAR)(block & `0xFFFFu`);
562	# endif
563	#endif
564	_q += SIZEOF_LONG;
565	p += SIZEOF_LONG / `2`;
566	}
567	q = _q;
568	if (q >= e)
569	break;
570	}
571
572	ch = (q[ihi] << `8`) \| q[ilo];
573	q += `2`;
574	if (!Py_UNICODE_IS_SURROGATE(ch)) {
575	#if STRINGLIB_SIZEOF_CHAR < 2
576	if (ch > STRINGLIB_MAX_CHAR)
577	/ Out-of-range /
578	goto Return;
579	#endif
580	*p++ = (STRINGLIB_CHAR)ch;
581	continue;
582	}
583
584	/ UTF-16 code pair: /
585	if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
586	goto IllegalEncoding;
587	if (q >= e)
588	goto UnexpectedEnd;
589	ch2 = (q[ihi] << `8`) \| q[ilo];
590	q += `2`;
591	if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
592	goto IllegalSurrogate;
593	ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
594	#if STRINGLIB_SIZEOF_CHAR < 4
595	/ Out-of-range /
596	goto Return;
597	#else
598	*p++ = (STRINGLIB_CHAR)ch;
599	#endif
600	}
601	ch = `0`;
602	Return:
603	*inptr = q;
604	*outpos = p - dest;
605	return ch;
606	UnexpectedEnd:
607	ch = `1`;
608	goto Return;
609	IllegalEncoding:
610	ch = `2`;
611	goto Return;
612	IllegalSurrogate:
613	ch = `3`;
614	goto Return;
615	}
616	#undef UCS2_REPEAT_MASK
617	#undef FAST_CHAR_MASK
618	#undef STRIPPED_MASK
619	#undef SWAB
620
621
#if STRINGLIB_MAX_CHAR >= 0x80
/* Encode the 'len' characters at 'in' as UTF-16 code units written through
   *outptr. 'native_ordering' is non-zero to emit code units in the machine's
   byte order, zero to emit them byte-swapped.

   On return, *outptr points just past the last code unit written.

   Returns len on success. If a surrogate character (U+D800-U+DFFF) is found,
   encoding stops and the index of that character in 'in' is returned
   (UCS1 input cannot contain surrogates and always succeeds). */
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        unsigned short **outptr,
                        int native_ordering)
{
    unsigned short *out = *outptr;
    const STRINGLIB_CHAR *end = in + len;
#if STRINGLIB_SIZEOF_CHAR == 1
    if (native_ordering) {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
        while (in < end) {
            *out++ = *in++;
        }
    } else {
# define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
        while (in < end) {
            Py_UCS4 ch = *in++;
            *out++ = SWAB2((Py_UCS2)ch);
        }
#undef SWAB2
    }
    *outptr = out;
    return len;
#else
    if (native_ordering) {
#if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            /* check if any character is a surrogate character; the test may
               also fire when there is none, in which case the loop below
               simply handles the rest one character at a time */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
#endif
        while (in < end) {
            Py_UCS4 ch;
            ch = *in++;
            if (ch < 0xd800)
                *out++ = ch;
            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
#if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
                out[1] = Py_UNICODE_LOW_SURROGATE(ch);
                out += 2;
            }
#endif
            else
                *out++ = ch;
        }
    } else {
#define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
#if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            /* check if any character is a surrogate character; the test may
               also fire when there is none, in which case the loop below
               simply handles the rest one character at a time */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
#endif
        while (in < end) {
            Py_UCS4 ch = *in++;
            if (ch < 0xd800)
                *out++ = SWAB2((Py_UCS2)ch);
            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
#if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
                Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
                out[0] = SWAB2(ch1);
                out[1] = SWAB2(ch2);
                out += 2;
            }
#endif
            else
                *out++ = SWAB2((Py_UCS2)ch);
        }
#undef SWAB2
    }
    *outptr = out;
    return len;
  fail:
    *outptr = out;
    /* 'in' already points past the offending character, hence the +1. */
    return len - (end - in + 1);
#endif
}

/* Return 'ch' widened to 32 bits with its bytes in reversed order. */
static inline uint32_t
STRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
{
    uint32_t word = ch;
#if STRINGLIB_SIZEOF_CHAR == 1
    /* high bytes are zero */
    return (word << 24);
#elif STRINGLIB_SIZEOF_CHAR == 2
    /* high bytes are zero */
    return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
#else
    return _Py_bswap32(word);
#endif
}

/* Encode the 'len' characters at 'in' as UTF-32 code units written through
   *outptr. 'native_ordering' is non-zero to emit code units in the machine's
   byte order, zero to emit them byte-swapped.

   On return, *outptr points just past the last code unit written.

   Returns len on success. If a surrogate character (U+D800-U+DFFF) is found,
   encoding stops and the index of that character in 'in' is returned
   (UCS1 input cannot contain surrogates and always succeeds). */
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        uint32_t **outptr,
                        int native_ordering)
{
    uint32_t *out = *outptr;
    const STRINGLIB_CHAR *end = in + len;
    if (native_ordering) {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
#if STRINGLIB_SIZEOF_CHAR > 1
            /* check if any character is a surrogate character; the test may
               also fire when there is none, in which case the loop below
               simply handles the rest one character at a time */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
#endif
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
        while (in < end) {
            Py_UCS4 ch;
            ch = *in++;
#if STRINGLIB_SIZEOF_CHAR > 1
            if (Py_UNICODE_IS_SURROGATE(ch)) {
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
            }
#endif
            *out++ = ch;
        }
    } else {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
#if STRINGLIB_SIZEOF_CHAR > 1
            /* check if any character is a surrogate character; the test may
               also fire when there is none, in which case the loop below
               simply handles the rest one character at a time */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
#endif
            out[0] = STRINGLIB(SWAB4)(in[0]);
            out[1] = STRINGLIB(SWAB4)(in[1]);
            out[2] = STRINGLIB(SWAB4)(in[2]);
            out[3] = STRINGLIB(SWAB4)(in[3]);
            in += 4; out += 4;
        }
        while (in < end) {
            Py_UCS4 ch = *in++;
#if STRINGLIB_SIZEOF_CHAR > 1
            if (Py_UNICODE_IS_SURROGATE(ch)) {
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
            }
#endif
            *out++ = STRINGLIB(SWAB4)(ch);
        }
    }
    *outptr = out;
    return len;
#if STRINGLIB_SIZEOF_CHAR > 1
  fail:
    *outptr = out;
    /* 'in' already points past the offending character, hence the +1. */
    return len - (end - in + 1);
#endif
}

#endif
834

/* Browse the source code of python/Objects/stringlib/codecs.h */