unicodeobject.c source code [python/Objects/unicodeobject.c]

1	/*
2
3	Unicode implementation based on original code by Fredrik Lundh,
4	modified by Marc-Andre Lemburg <[email protected]>.
5
6	Major speed upgrades to the method implementations at the Reykjavik
7	NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9	Copyright (c) Corporation for National Research Initiatives.
10
11	--------------------------------------------------------------------
12	The original string type implementation is:
13
14	Copyright (c) 1999 by Secret Labs AB
15	Copyright (c) 1999 by Fredrik Lundh
16
17	By obtaining, using, and/or copying this software and/or its
18	associated documentation, you agree that you have read, understood,
19	and will comply with the following terms and conditions:
20
21	Permission to use, copy, modify, and distribute this software and its
22	associated documentation for any purpose and without fee is hereby
23	granted, provided that the above copyright notice appears in all
24	copies, and that both that copyright notice and this permission notice
25	appear in supporting documentation, and that the name of Secret Labs
26	AB or the author not be used in advertising or publicity pertaining to
27	distribution of the software without specific, written prior
28	permission.
29
30	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37	--------------------------------------------------------------------
38
39	*/
40
41	#define PY_SSIZE_T_CLEAN
42	#include "Python.h"
43	#include "pycore_abstract.h" // _PyIndex_Check()
44	#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
45	#include "pycore_bytes_methods.h" // _Py_bytes_lower()
46	#include "pycore_format.h" // F_LJUST
47	#include "pycore_initconfig.h" // _PyStatus_OK()
48	#include "pycore_interp.h" // PyInterpreterState.fs_codec
49	#include "pycore_object.h" // _PyObject_GC_TRACK()
50	#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51	#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52	#include "pycore_pystate.h" // _PyInterpreterState_GET()
53	#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54	#include "stringlib/eq.h" // unicode_eq()
55
56	#ifdef MS_WINDOWS
57	#include <windows.h>
58	#endif
59
60	#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61	#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62	#endif
63
/* Uncomment to display statistics on interned strings at exit
   in _PyUnicode_ClearInterned(). */
/* #define INTERNED_STATS 1 */
67
68
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73
/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
87
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
95
96
97	#ifdef __cplusplus
98	extern "C" {
99	#endif
100
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside assert(): a full consistency check (without the
   O(n) content scan) in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
110
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for compact ASCII objects this is the address
   immediately after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: the PyASCIIObject length for compact ASCII
   objects, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr field of a PyASCIIObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
129
/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* wstr length: the PyASCIIObject length for compact ASCII objects,
   otherwise the wstr_length field.  The argument is parenthesized so that
   arbitrary expressions can be passed safely. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked lvalue accessors for individual object fields. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same as the field accessors above, but assert _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
152
/* Local replacement for PyUnicode_READY(): evaluates to 0 when the object
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer aliases the object's character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the object's character data.
   Uses the macro parameter throughout; the previous text referred to a
   variable named "unicode", which only worked when the caller's argument
   happened to have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
167
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
181
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four elements per iteration; the
   remaining (at most three) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
205
/* NOTE(review): the percentages below suggest the extra space is computed
   as size / OVERALLOCATE_FACTOR; the users of this macro are outside this
   part of the file -- confirm there. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
213
214	/ bpo-40521: Interned strings are shared by all interpreters. /
215	#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
216	# define INTERNED_STRINGS
217	#endif
218
219	/ This dictionary holds all interned unicode strings. Note that references*
220	to strings in this dictionary are not* counted in the string's ob_refcnt.*
221	When the interned string reaches a refcnt of 0 the string deallocation
222	function will delete the reference from this dictionary.
223
224	Another way to look at this is that to say that the actual reference
225	count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
226	*/
227	#ifdef INTERNED_STRINGS
228	static PyObject *interned = NULL;
229	#endif
230
231	static struct _Py_unicode_state*
232	get_unicode_state(void)
233	{
234	PyInterpreterState *interp = _PyInterpreterState_GET();
235	return &interp->unicode;
236	}
237
238
239	// Return a borrowed reference to the empty string singleton.
240	static inline PyObject* unicode_get_empty(void)
241	{
242	struct _Py_unicode_state *state = get_unicode_state();
243	// unicode_get_empty() must not be called before _PyUnicode_Init()
244	// or after _PyUnicode_Fini()
245	assert(state->empty_string != NULL);
246	return state->empty_string;
247	}
248
249
250	// Return a strong reference to the empty string singleton.
251	static inline PyObject* unicode_new_empty(void)
252	{
253	PyObject *empty = unicode_get_empty();
254	Py_INCREF(empty);
255	return empty;
256	}
257
/* Return a strong reference to the empty string singleton from the
   enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
262
263	static inline void
264	unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
265	Py_ssize_t start, Py_ssize_t length)
266	{
267	assert(`0` <= start);
268	assert(kind != PyUnicode_WCHAR_KIND);
269	switch (kind) {
270	case PyUnicode_1BYTE_KIND: {
271	assert(value <= `0xff`);
272	Py_UCS1 ch = (unsigned char)value;
273	Py_UCS1 to = (Py_UCS1 )data + start;
274	memset(to, ch, length);
275	break;
276	}
277	case PyUnicode_2BYTE_KIND: {
278	assert(value <= `0xffff`);
279	Py_UCS2 ch = (Py_UCS2)value;
280	Py_UCS2 to = (Py_UCS2 )data + start;
281	const Py_UCS2 *end = to + length;
282	for (; to < end; ++to) *to = ch;
283	break;
284	}
285	case PyUnicode_4BYTE_KIND: {
286	assert(value <= MAX_UNICODE);
287	Py_UCS4 ch = value;
288	Py_UCS4 * to = (Py_UCS4 *)data + start;
289	const Py_UCS4 *end = to + length;
290	for (; to < end; ++to) *to = ch;
291	break;
292	}
293	default: Py_UNREACHABLE();
294	}
295	}
296
297
298	/ Forward declaration /
299	static inline int
300	_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
301	static inline void
302	_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter writer, PyObject buffer);
303	static PyObject *
304	unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
305	const char *errors);
306	static PyObject *
307	unicode_decode_utf8(const char *s, Py_ssize_t size,
308	_Py_error_handler error_handler, const char *errors,
309	Py_ssize_t *consumed);
310
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by a 7-bit code (0..127): the entry is 1 for the
   characters named in the comments below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
341
342	/ forward /
343	static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
344	static PyObject* get_latin1_char(unsigned char ch);
345	static int unicode_modifiable(PyObject *unicode);
346
347
348	static PyObject *
349	_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
350	static PyObject *
351	_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
352	static PyObject *
353	_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
354
355	static PyObject *
356	unicode_encode_call_errorhandler(const char *errors,
357	PyObject *errorHandler,const* char encoding, const* char *reason,
358	PyObject unicode, PyObject *exceptionObject,
359	Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
360
361	static void
362	raise_encode_exception(PyObject **exceptionObject,
363	const char *encoding,
364	PyObject *unicode,
365	Py_ssize_t startpos, Py_ssize_t endpos,
366	const char *reason);
367
/* Same for linebreaks.
   Lookup table indexed by a 7-bit code (0..127): the entry is 1 for the
   characters named in the comments below and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
395
396	static int convert_uc(PyObject obj, void* *addr);
397
398	#include "clinic/unicodeobject.c.h"
399
400	_Py_error_handler
401	_Py_GetErrorHandler(const char *errors)
402	{
403	if (errors == NULL \|\| strcmp(errors, "strict") == `0`) {
404	return _Py_ERROR_STRICT;
405	}
406	if (strcmp(errors, "surrogateescape") == `0`) {
407	return _Py_ERROR_SURROGATEESCAPE;
408	}
409	if (strcmp(errors, "replace") == `0`) {
410	return _Py_ERROR_REPLACE;
411	}
412	if (strcmp(errors, "ignore") == `0`) {
413	return _Py_ERROR_IGNORE;
414	}
415	if (strcmp(errors, "backslashreplace") == `0`) {
416	return _Py_ERROR_BACKSLASHREPLACE;
417	}
418	if (strcmp(errors, "surrogatepass") == `0`) {
419	return _Py_ERROR_SURROGATEPASS;
420	}
421	if (strcmp(errors, "xmlcharrefreplace") == `0`) {
422	return _Py_ERROR_XMLCHARREFREPLACE;
423	}
424	return _Py_ERROR_OTHER;
425	}
426
427
428	static _Py_error_handler
429	get_error_handler_wide(const wchar_t *errors)
430	{
431	if (errors == NULL \|\| wcscmp(errors, L"strict") == `0`) {
432	return _Py_ERROR_STRICT;
433	}
434	if (wcscmp(errors, L"surrogateescape") == `0`) {
435	return _Py_ERROR_SURROGATEESCAPE;
436	}
437	if (wcscmp(errors, L"replace") == `0`) {
438	return _Py_ERROR_REPLACE;
439	}
440	if (wcscmp(errors, L"ignore") == `0`) {
441	return _Py_ERROR_IGNORE;
442	}
443	if (wcscmp(errors, L"backslashreplace") == `0`) {
444	return _Py_ERROR_BACKSLASHREPLACE;
445	}
446	if (wcscmp(errors, L"surrogatepass") == `0`) {
447	return _Py_ERROR_SURROGATEPASS;
448	}
449	if (wcscmp(errors, L"xmlcharrefreplace") == `0`) {
450	return _Py_ERROR_XMLCHARREFREPLACE;
451	}
452	return _Py_ERROR_OTHER;
453	}
454
455
456	static inline int
457	unicode_check_encoding_errors(const char encoding, const* char *errors)
458	{
459	if (encoding == NULL && errors == NULL) {
460	return `0`;
461	}
462
463	PyInterpreterState *interp = _PyInterpreterState_GET();
464	#ifndef Py_DEBUG
465	/ In release mode, only check in development mode (-X dev) /
466	if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
467	return `0`;
468	}
469	#else
470	/ Always check in debug mode /
471	#endif
472
473	/ Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the*
474	codec registry is ready: before_PyUnicode_InitEncodings() is called. /*
475	if (!interp->unicode.fs_codec.encoding) {
476	return `0`;
477	}
478
479	/ Disable checks during Python finalization. For example, it allows to*
480	call _PyObject_Dump() during finalization for debugging purpose. /*
481	if (interp->finalizing) {
482	return `0`;
483	}
484
485	if (encoding != NULL) {
486	PyObject *handler = _PyCodec_Lookup(encoding);
487	if (handler == NULL) {
488	return -`1`;
489	}
490	Py_DECREF(handler);
491	}
492
493	if (errors != NULL) {
494	PyObject *handler = PyCodec_LookupError(errors);
495	if (handler == NULL) {
496	return -`1`;
497	}
498	Py_DECREF(handler);
499	}
500	return `0`;
501	}
502
503
504	int
505	_PyUnicode_CheckConsistency(PyObject op, int* check_content)
506	{
507	#define CHECK(expr) \
508	do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
509
510	PyASCIIObject *ascii;
511	unsigned int kind;
512
513	assert(op != NULL);
514	CHECK(PyUnicode_Check(op));
515
516	ascii = (PyASCIIObject *)op;
517	kind = ascii->state.kind;
518
519	if (ascii->state.ascii == `1` && ascii->state.compact == `1`) {
520	CHECK(kind == PyUnicode_1BYTE_KIND);
521	CHECK(ascii->state.ready == `1`);
522	}
523	else {
524	PyCompactUnicodeObject compact = (PyCompactUnicodeObject )op;
525	void *data;
526
527	if (ascii->state.compact == `1`) {
528	data = compact + `1`;
529	CHECK(kind == PyUnicode_1BYTE_KIND
530	\|\| kind == PyUnicode_2BYTE_KIND
531	\|\| kind == PyUnicode_4BYTE_KIND);
532	CHECK(ascii->state.ascii == `0`);
533	CHECK(ascii->state.ready == `1`);
534	CHECK(compact->utf8 != data);
535	}
536	else {
537	PyUnicodeObject unicode = (PyUnicodeObject )op;
538
539	data = unicode->data.any;
540	if (kind == PyUnicode_WCHAR_KIND) {
541	CHECK(ascii->length == `0`);
542	CHECK(ascii->hash == -`1`);
543	CHECK(ascii->state.compact == `0`);
544	CHECK(ascii->state.ascii == `0`);
545	CHECK(ascii->state.ready == `0`);
546	CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
547	CHECK(ascii->wstr != NULL);
548	CHECK(data == NULL);
549	CHECK(compact->utf8 == NULL);
550	}
551	else {
552	CHECK(kind == PyUnicode_1BYTE_KIND
553	\|\| kind == PyUnicode_2BYTE_KIND
554	\|\| kind == PyUnicode_4BYTE_KIND);
555	CHECK(ascii->state.compact == `0`);
556	CHECK(ascii->state.ready == `1`);
557	CHECK(data != NULL);
558	if (ascii->state.ascii) {
559	CHECK(compact->utf8 == data);
560	CHECK(compact->utf8_length == ascii->length);
561	}
562	else
563	CHECK(compact->utf8 != data);
564	}
565	}
566	if (kind != PyUnicode_WCHAR_KIND) {
567	if (
568	#if SIZEOF_WCHAR_T == 2
569	kind == PyUnicode_2BYTE_KIND
570	#else
571	kind == PyUnicode_4BYTE_KIND
572	#endif
573	)
574	{
575	CHECK(ascii->wstr == data);
576	CHECK(compact->wstr_length == ascii->length);
577	} else
578	CHECK(ascii->wstr != data);
579	}
580
581	if (compact->utf8 == NULL)
582	CHECK(compact->utf8_length == `0`);
583	if (ascii->wstr == NULL)
584	CHECK(compact->wstr_length == `0`);
585	}
586
587	/ check that the best kind is used: O(n) operation /
588	if (check_content && kind != PyUnicode_WCHAR_KIND) {
589	Py_ssize_t i;
590	Py_UCS4 maxchar = `0`;
591	const void *data;
592	Py_UCS4 ch;
593
594	data = PyUnicode_DATA(ascii);
595	for (i=`0`; i < ascii->length; i++)
596	{
597	ch = PyUnicode_READ(kind, data, i);
598	if (ch > maxchar)
599	maxchar = ch;
600	}
601	if (kind == PyUnicode_1BYTE_KIND) {
602	if (ascii->state.ascii == `0`) {
603	CHECK(maxchar >= `128`);
604	CHECK(maxchar <= `255`);
605	}
606	else
607	CHECK(maxchar < `128`);
608	}
609	else if (kind == PyUnicode_2BYTE_KIND) {
610	CHECK(maxchar >= `0x100`);
611	CHECK(maxchar <= `0xFFFF`);
612	}
613	else {
614	CHECK(maxchar >= `0x10000`);
615	CHECK(maxchar <= MAX_UNICODE);
616	}
617	CHECK(PyUnicode_READ(kind, data, ascii->length) == `0`);
618	}
619	return `1`;
620
621	#undef CHECK
622	}
623
624
625	static PyObject*
626	unicode_result_wchar(PyObject *unicode)
627	{
628	#ifndef Py_DEBUG
629	Py_ssize_t len;
630
631	len = _PyUnicode_WSTR_LENGTH(unicode);
632	if (len == `0`) {
633	Py_DECREF(unicode);
634	_Py_RETURN_UNICODE_EMPTY();
635	}
636
637	if (len == `1`) {
638	wchar_t ch = _PyUnicode_WSTR(unicode)[`0`];
639	if ((Py_UCS4)ch < `256`) {
640	Py_DECREF(unicode);
641	return get_latin1_char((unsigned char)ch);
642	}
643	}
644
645	if (_PyUnicode_Ready(unicode) < `0`) {
646	Py_DECREF(unicode);
647	return NULL;
648	}
649	#else
650	assert(Py_REFCNT(unicode) == `1`);
651
652	/ don't make the result ready in debug mode to ensure that the caller*
653	makes the string ready before using it /*
654	assert(_PyUnicode_CheckConsistency(unicode, `1`));
655	#endif
656	return unicode;
657	}
658
659	static PyObject*
660	unicode_result_ready(PyObject *unicode)
661	{
662	Py_ssize_t length;
663
664	length = PyUnicode_GET_LENGTH(unicode);
665	if (length == `0`) {
666	PyObject *empty = unicode_get_empty();
667	if (unicode != empty) {
668	Py_DECREF(unicode);
669	Py_INCREF(empty);
670	}
671	return empty;
672	}
673
674	if (length == `1`) {
675	int kind = PyUnicode_KIND(unicode);
676	if (kind == PyUnicode_1BYTE_KIND) {
677	const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
678	Py_UCS1 ch = data[`0`];
679	struct _Py_unicode_state *state = get_unicode_state();
680	PyObject *latin1_char = state->latin1[ch];
681	if (latin1_char != NULL) {
682	if (unicode != latin1_char) {
683	Py_INCREF(latin1_char);
684	Py_DECREF(unicode);
685	}
686	return latin1_char;
687	}
688	else {
689	assert(_PyUnicode_CheckConsistency(unicode, `1`));
690	Py_INCREF(unicode);
691	state->latin1[ch] = unicode;
692	return unicode;
693	}
694	}
695	else {
696	assert(PyUnicode_READ_CHAR(unicode, `0`) >= `256`);
697	}
698	}
699
700	assert(_PyUnicode_CheckConsistency(unicode, `1`));
701	return unicode;
702	}
703
704	static PyObject*
705	unicode_result(PyObject *unicode)
706	{
707	assert(_PyUnicode_CHECK(unicode));
708	if (PyUnicode_IS_READY(unicode))
709	return unicode_result_ready(unicode);
710	else
711	return unicode_result_wchar(unicode);
712	}
713
714	static PyObject*
715	unicode_result_unchanged(PyObject *unicode)
716	{
717	if (PyUnicode_CheckExact(unicode)) {
718	if (PyUnicode_READY(unicode) == -`1`)
719	return NULL;
720	Py_INCREF(unicode);
721	return unicode;
722	}
723	else
724	/ Subtype -- return genuine unicode string with the same value. /
725	return _PyUnicode_Copy(unicode);
726	}
727
728	/ Implementation of the "backslashreplace" error handler for 8-bit encodings:*
729	ASCII, Latin1, UTF-8, etc. /*
730	static char*
731	backslashreplace(_PyBytesWriter writer, char* *str,
732	PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
733	{
734	Py_ssize_t size, i;
735	Py_UCS4 ch;
736	enum PyUnicode_Kind kind;
737	const void *data;
738
739	assert(PyUnicode_IS_READY(unicode));
740	kind = PyUnicode_KIND(unicode);
741	data = PyUnicode_DATA(unicode);
742
743	size = `0`;
744	/ determine replacement size /
745	for (i = collstart; i < collend; ++i) {
746	Py_ssize_t incr;
747
748	ch = PyUnicode_READ(kind, data, i);
749	if (ch < `0x100`)
750	incr = `2`+`2`;
751	else if (ch < `0x10000`)
752	incr = `2`+`4`;
753	else {
754	assert(ch <= MAX_UNICODE);
755	incr = `2`+`8`;
756	}
757	if (size > PY_SSIZE_T_MAX - incr) {
758	PyErr_SetString(PyExc_OverflowError,
759	"encoded result is too long for a Python string");
760	return NULL;
761	}
762	size += incr;
763	}
764
765	str = _PyBytesWriter_Prepare(writer, str, size);
766	if (str == NULL)
767	return NULL;
768
769	/ generate replacement /
770	for (i = collstart; i < collend; ++i) {
771	ch = PyUnicode_READ(kind, data, i);
772	*str++ = `'\\'`;
773	if (ch >= `0x00010000`) {
774	*str++ = `'U'`;
775	*str++ = Py_hexdigits[(ch>>`28`)&`0xf`];
776	*str++ = Py_hexdigits[(ch>>`24`)&`0xf`];
777	*str++ = Py_hexdigits[(ch>>`20`)&`0xf`];
778	*str++ = Py_hexdigits[(ch>>`16`)&`0xf`];
779	*str++ = Py_hexdigits[(ch>>`12`)&`0xf`];
780	*str++ = Py_hexdigits[(ch>>`8`)&`0xf`];
781	}
782	else if (ch >= `0x100`) {
783	*str++ = `'u'`;
784	*str++ = Py_hexdigits[(ch>>`12`)&`0xf`];
785	*str++ = Py_hexdigits[(ch>>`8`)&`0xf`];
786	}
787	else
788	*str++ = `'x'`;
789	*str++ = Py_hexdigits[(ch>>`4`)&`0xf`];
790	*str++ = Py_hexdigits[ch&`0xf`];
791	}
792	return str;
793	}
794
795	/ Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:*
796	ASCII, Latin1, UTF-8, etc. /*
797	static char*
798	xmlcharrefreplace(_PyBytesWriter writer, char* *str,
799	PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
800	{
801	Py_ssize_t size, i;
802	Py_UCS4 ch;
803	enum PyUnicode_Kind kind;
804	const void *data;
805
806	assert(PyUnicode_IS_READY(unicode));
807	kind = PyUnicode_KIND(unicode);
808	data = PyUnicode_DATA(unicode);
809
810	size = `0`;
811	/ determine replacement size /
812	for (i = collstart; i < collend; ++i) {
813	Py_ssize_t incr;
814
815	ch = PyUnicode_READ(kind, data, i);
816	if (ch < `10`)
817	incr = `2`+`1`+`1`;
818	else if (ch < `100`)
819	incr = `2`+`2`+`1`;
820	else if (ch < `1000`)
821	incr = `2`+`3`+`1`;
822	else if (ch < `10000`)
823	incr = `2`+`4`+`1`;
824	else if (ch < `100000`)
825	incr = `2`+`5`+`1`;
826	else if (ch < `1000000`)
827	incr = `2`+`6`+`1`;
828	else {
829	assert(ch <= MAX_UNICODE);
830	incr = `2`+`7`+`1`;
831	}
832	if (size > PY_SSIZE_T_MAX - incr) {
833	PyErr_SetString(PyExc_OverflowError,
834	"encoded result is too long for a Python string");
835	return NULL;
836	}
837	size += incr;
838	}
839
840	str = _PyBytesWriter_Prepare(writer, str, size);
841	if (str == NULL)
842	return NULL;
843
844	/ generate replacement /
845	for (i = collstart; i < collend; ++i) {
846	size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
847	if (size < `0`) {
848	return NULL;
849	}
850	str += size;
851	}
852	return str;
853	}
854
855	/ --- Bloom Filters ----------------------------------------------------- /
856
857	/ stuff to implement simple "bloom filters" for Unicode characters.*
858	to keep things simple, we use a single bitmask, using the least 5
859	bits from each unicode characters as the bit index. /*
860
861	/ the linebreak mask is set up by _PyUnicode_Init() below /
862
863	#if LONG_BIT >= 128
864	#define BLOOM_WIDTH 128
865	#elif LONG_BIT >= 64
866	#define BLOOM_WIDTH 64
867	#elif LONG_BIT >= 32
868	#define BLOOM_WIDTH 32
869	#else
870	#error "LONG_BIT is smaller than 32"
871	#endif
872
873	#define BLOOM_MASK unsigned long
874
875	static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)`0`;
876
877	#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
878
879	#define BLOOM_LINEBREAK(ch) \
880	((ch) < 128U ? ascii_linebreak[(ch)] : \
881	(BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
882
883	static inline BLOOM_MASK
884	make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
885	{
886	#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
887	do { \
888	TYPE data = (TYPE )PTR; \
889	TYPE *end = data + LEN; \
890	Py_UCS4 ch; \
891	for (; data != end; data++) { \
892	ch = *data; \
893	MASK \|= (1UL << (ch & (BLOOM_WIDTH - 1))); \
894	} \
895	break; \
896	} while (0)
897
898	/ calculate simple bloom-style bitmask for a given unicode string /
899
900	BLOOM_MASK mask;
901
902	mask = `0`;
903	switch (kind) {
904	case PyUnicode_1BYTE_KIND:
905	BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
906	break;
907	case PyUnicode_2BYTE_KIND:
908	BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
909	break;
910	case PyUnicode_4BYTE_KIND:
911	BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
912	break;
913	default:
914	Py_UNREACHABLE();
915	}
916	return mask;
917
918	#undef BLOOM_UPDATE
919	}
920
921	static int
922	ensure_unicode(PyObject *obj)
923	{
924	if (!PyUnicode_Check(obj)) {
925	PyErr_Format(PyExc_TypeError,
926	"must be str, not %.100s",
927	Py_TYPE(obj)->tp_name);
928	return -`1`;
929	}
930	return PyUnicode_READY(obj);
931	}
932
/* Compilation of templated routines */
934
935	#define STRINGLIB_GET_EMPTY() unicode_get_empty()
936
937	#include "stringlib/asciilib.h"
938	#include "stringlib/fastsearch.h"
939	#include "stringlib/partition.h"
940	#include "stringlib/split.h"
941	#include "stringlib/count.h"
942	#include "stringlib/find.h"
943	#include "stringlib/find_max_char.h"
944	#include "stringlib/undef.h"
945
946	#include "stringlib/ucs1lib.h"
947	#include "stringlib/fastsearch.h"
948	#include "stringlib/partition.h"
949	#include "stringlib/split.h"
950	#include "stringlib/count.h"
951	#include "stringlib/find.h"
952	#include "stringlib/replace.h"
953	#include "stringlib/find_max_char.h"
954	#include "stringlib/undef.h"
955
956	#include "stringlib/ucs2lib.h"
957	#include "stringlib/fastsearch.h"
958	#include "stringlib/partition.h"
959	#include "stringlib/split.h"
960	#include "stringlib/count.h"
961	#include "stringlib/find.h"
962	#include "stringlib/replace.h"
963	#include "stringlib/find_max_char.h"
964	#include "stringlib/undef.h"
965
966	#include "stringlib/ucs4lib.h"
967	#include "stringlib/fastsearch.h"
968	#include "stringlib/partition.h"
969	#include "stringlib/split.h"
970	#include "stringlib/count.h"
971	#include "stringlib/find.h"
972	#include "stringlib/replace.h"
973	#include "stringlib/find_max_char.h"
974	#include "stringlib/undef.h"
975
976	_Py_COMP_DIAG_PUSH
977	_Py_COMP_DIAG_IGNORE_DEPR_DECLS
978	#include "stringlib/unicodedefs.h"
979	#include "stringlib/fastsearch.h"
980	#include "stringlib/count.h"
981	#include "stringlib/find.h"
982	#include "stringlib/undef.h"
983	_Py_COMP_DIAG_POP
984
985	#undef STRINGLIB_GET_EMPTY
986
/* --- Unicode Object ----------------------------------------------------- */
988
989	static inline Py_ssize_t
990	findchar(const void s, int* kind,
991	Py_ssize_t size, Py_UCS4 ch,
992	int direction)
993	{
994	switch (kind) {
995	case PyUnicode_1BYTE_KIND:
996	if ((Py_UCS1) ch != ch)
997	return -`1`;
998	if (direction > `0`)
999	return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000	else
1001	return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002	case PyUnicode_2BYTE_KIND:
1003	if ((Py_UCS2) ch != ch)
1004	return -`1`;
1005	if (direction > `0`)
1006	return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007	else
1008	return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009	case PyUnicode_4BYTE_KIND:
1010	if (direction > `0`)
1011	return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012	else
1013	return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014	default:
1015	Py_UNREACHABLE();
1016	}
1017	}
1018
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (with 0xff bytes); nothing is done if the string did not
   grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    if (length <= old_length)
        return;
    /* kind is used as the character size in bytes for the byte offsets */
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
}
#endif
1037
1038	static PyObject*
1039	resize_compact(PyObject *unicode, Py_ssize_t length)
1040	{
1041	Py_ssize_t char_size;
1042	Py_ssize_t struct_size;
1043	Py_ssize_t new_size;
1044	int share_wstr;
1045	PyObject *new_unicode;
1046	#ifdef Py_DEBUG
1047	Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1048	#endif
1049
1050	assert(unicode_modifiable(unicode));
1051	assert(PyUnicode_IS_READY(unicode));
1052	assert(PyUnicode_IS_COMPACT(unicode));
1053
1054	char_size = PyUnicode_KIND(unicode);
1055	if (PyUnicode_IS_ASCII(unicode))
1056	struct_size = sizeof(PyASCIIObject);
1057	else
1058	struct_size = sizeof(PyCompactUnicodeObject);
1059	share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1060
1061	if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - `1`)) {
1062	PyErr_NoMemory();
1063	return NULL;
1064	}
1065	new_size = (struct_size + (length + `1`) * char_size);
1066
1067	if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1068	PyObject_Free(_PyUnicode_UTF8(unicode));
1069	_PyUnicode_UTF8(unicode) = NULL;
1070	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1071	}
1072	#ifdef Py_REF_DEBUG
1073	_Py_RefTotal--;
1074	#endif
1075	#ifdef Py_TRACE_REFS
1076	_Py_ForgetReference(unicode);
1077	#endif
1078
1079	new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1080	if (new_unicode == NULL) {
1081	_Py_NewReference(unicode);
1082	PyErr_NoMemory();
1083	return NULL;
1084	}
1085	unicode = new_unicode;
1086	_Py_NewReference(unicode);
1087
1088	_PyUnicode_LENGTH(unicode) = length;
1089	if (share_wstr) {
1090	_PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1091	if (!PyUnicode_IS_ASCII(unicode))
1092	_PyUnicode_WSTR_LENGTH(unicode) = length;
1093	}
1094	else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1095	PyObject_Free(_PyUnicode_WSTR(unicode));
1096	_PyUnicode_WSTR(unicode) = NULL;
1097	if (!PyUnicode_IS_ASCII(unicode))
1098	_PyUnicode_WSTR_LENGTH(unicode) = `0`;
1099	}
1100	#ifdef Py_DEBUG
1101	unicode_fill_invalid(unicode, old_length);
1102	#endif
1103	PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1104	length, `0`);
1105	assert(_PyUnicode_CheckConsistency(unicode, `0`));
1106	return unicode;
1107	}
1108
1109	static int
1110	resize_inplace(PyObject *unicode, Py_ssize_t length)
1111	{
1112	wchar_t *wstr;
1113	Py_ssize_t new_size;
1114	assert(!PyUnicode_IS_COMPACT(unicode));
1115	assert(Py_REFCNT(unicode) == `1`);
1116
1117	if (PyUnicode_IS_READY(unicode)) {
1118	Py_ssize_t char_size;
1119	int share_wstr, share_utf8;
1120	void *data;
1121	#ifdef Py_DEBUG
1122	Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1123	#endif
1124
1125	data = _PyUnicode_DATA_ANY(unicode);
1126	char_size = PyUnicode_KIND(unicode);
1127	share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1128	share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1129
1130	if (length > (PY_SSIZE_T_MAX / char_size - `1`)) {
1131	PyErr_NoMemory();
1132	return -`1`;
1133	}
1134	new_size = (length + `1`) * char_size;
1135
1136	if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1137	{
1138	PyObject_Free(_PyUnicode_UTF8(unicode));
1139	_PyUnicode_UTF8(unicode) = NULL;
1140	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1141	}
1142
1143	data = (PyObject *)PyObject_Realloc(data, new_size);
1144	if (data == NULL) {
1145	PyErr_NoMemory();
1146	return -`1`;
1147	}
1148	_PyUnicode_DATA_ANY(unicode) = data;
1149	if (share_wstr) {
1150	_PyUnicode_WSTR(unicode) = data;
1151	_PyUnicode_WSTR_LENGTH(unicode) = length;
1152	}
1153	if (share_utf8) {
1154	_PyUnicode_UTF8(unicode) = data;
1155	_PyUnicode_UTF8_LENGTH(unicode) = length;
1156	}
1157	_PyUnicode_LENGTH(unicode) = length;
1158	PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, `0`);
1159	#ifdef Py_DEBUG
1160	unicode_fill_invalid(unicode, old_length);
1161	#endif
1162	if (share_wstr \|\| _PyUnicode_WSTR(unicode) == NULL) {
1163	assert(_PyUnicode_CheckConsistency(unicode, `0`));
1164	return `0`;
1165	}
1166	}
1167	assert(_PyUnicode_WSTR(unicode) != NULL);
1168
1169	/ check for integer overflow /
1170	if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - `1`) {
1171	PyErr_NoMemory();
1172	return -`1`;
1173	}
1174	new_size = sizeof(wchar_t) * (length + `1`);
1175	wstr = _PyUnicode_WSTR(unicode);
1176	wstr = PyObject_Realloc(wstr, new_size);
1177	if (!wstr) {
1178	PyErr_NoMemory();
1179	return -`1`;
1180	}
1181	_PyUnicode_WSTR(unicode) = wstr;
1182	_PyUnicode_WSTR(unicode)[length] = `0`;
1183	_PyUnicode_WSTR_LENGTH(unicode) = length;
1184	assert(_PyUnicode_CheckConsistency(unicode, `0`));
1185	return `0`;
1186	}
1187
1188	static PyObject*
1189	resize_copy(PyObject *unicode, Py_ssize_t length)
1190	{
1191	Py_ssize_t copy_length;
1192	if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193	PyObject *copy;
1194
1195	assert(PyUnicode_IS_READY(unicode));
1196
1197	copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198	if (copy == NULL)
1199	return NULL;
1200
1201	copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202	_PyUnicode_FastCopyCharacters(copy, `0`, unicode, `0`, copy_length);
1203	return copy;
1204	}
1205	else {
1206	PyObject *w;
1207
1208	w = (PyObject*)_PyUnicode_New(length);
1209	if (w == NULL)
1210	return NULL;
1211	copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212	copy_length = Py_MIN(copy_length, length);
1213	memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214	copy_length * sizeof(wchar_t));
1215	return w;
1216	}
1217	}
1218
1219	/ We allocate one more byte to make sure the string is*
1220	Ux0000 terminated; some code (e.g. new_identifier)
1221	relies on that.
1222
1223	XXX This allocator could further be enhanced by assuring that the
1224	free list never reduces its size below 1.
1225
1226	*/
1227
1228	static PyUnicodeObject *
1229	_PyUnicode_New(Py_ssize_t length)
1230	{
1231	PyUnicodeObject *unicode;
1232	size_t new_size;
1233
1234	/ Optimization for empty strings /
1235	if (length == `0`) {
1236	return (PyUnicodeObject *)unicode_new_empty();
1237	}
1238
1239	/ Ensure we won't overflow the size. /
1240	if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - `1`)) {
1241	return (PyUnicodeObject *)PyErr_NoMemory();
1242	}
1243	if (length < `0`) {
1244	PyErr_SetString(PyExc_SystemError,
1245	"Negative size passed to _PyUnicode_New");
1246	return NULL;
1247	}
1248
1249	unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1250	if (unicode == NULL)
1251	return NULL;
1252	new_size = sizeof(Py_UNICODE) * ((size_t)length + `1`);
1253
1254	_PyUnicode_WSTR_LENGTH(unicode) = length;
1255	_PyUnicode_HASH(unicode) = -`1`;
1256	_PyUnicode_STATE(unicode).interned = `0`;
1257	_PyUnicode_STATE(unicode).kind = `0`;
1258	_PyUnicode_STATE(unicode).compact = `0`;
1259	_PyUnicode_STATE(unicode).ready = `0`;
1260	_PyUnicode_STATE(unicode).ascii = `0`;
1261	_PyUnicode_DATA_ANY(unicode) = NULL;
1262	_PyUnicode_LENGTH(unicode) = `0`;
1263	_PyUnicode_UTF8(unicode) = NULL;
1264	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1265
1266	_PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1267	if (!_PyUnicode_WSTR(unicode)) {
1268	Py_DECREF(unicode);
1269	PyErr_NoMemory();
1270	return NULL;
1271	}
1272
1273	/ Initialize the first element to guard against cases where*
1274	* the caller fails before initializing str -- unicode_resize()
1275	* reads str[0], and the Keep-Alive optimization can keep memory
1276	* allocated for str alive across a call to unicode_dealloc(unicode).
1277	* We don't want unicode_resize to read uninitialized memory in
1278	* that case.
1279	*/
1280	_PyUnicode_WSTR(unicode)[`0`] = `0`;
1281	_PyUnicode_WSTR(unicode)[length] = `0`;
1282
1283	assert(_PyUnicode_CheckConsistency((PyObject *)unicode, `0`));
1284	return unicode;
1285	}
1286
1287	static const char*
1288	unicode_kind_name(PyObject *unicode)
1289	{
1290	/ don't check consistency: unicode_kind_name() is called from*
1291	_PyUnicode_Dump() /*
1292	if (!PyUnicode_IS_COMPACT(unicode))
1293	{
1294	if (!PyUnicode_IS_READY(unicode))
1295	return "wstr";
1296	switch (PyUnicode_KIND(unicode))
1297	{
1298	case PyUnicode_1BYTE_KIND:
1299	if (PyUnicode_IS_ASCII(unicode))
1300	return "legacy ascii";
1301	else
1302	return "legacy latin1";
1303	case PyUnicode_2BYTE_KIND:
1304	return "legacy UCS2";
1305	case PyUnicode_4BYTE_KIND:
1306	return "legacy UCS4";
1307	default:
1308	return "<legacy invalid kind>";
1309	}
1310	}
1311	assert(PyUnicode_IS_READY(unicode));
1312	switch (PyUnicode_KIND(unicode)) {
1313	case PyUnicode_1BYTE_KIND:
1314	if (PyUnicode_IS_ASCII(unicode))
1315	return "ascii";
1316	else
1317	return "latin1";
1318	case PyUnicode_2BYTE_KIND:
1319	return "UCS2";
1320	case PyUnicode_4BYTE_KIND:
1321	return "UCS4";
1322	default:
1323	return "<invalid compact kind>";
1324	}
1325	}
1326
1327	#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */
1329	const char _PyUnicode_utf8(void* *unicode_raw){
1330	PyObject *unicode = _PyObject_CAST(unicode_raw);
1331	return PyUnicode_UTF8(unicode);
1332	}
1333
1334	const void _PyUnicode_compact_data(void* *unicode_raw) {
1335	PyObject *unicode = _PyObject_CAST(unicode_raw);
1336	return _PyUnicode_COMPACT_DATA(unicode);
1337	}
1338	const void _PyUnicode_data(void* *unicode_raw) {
1339	PyObject *unicode = _PyObject_CAST(unicode_raw);
1340	printf("obj %p\n", (void*)unicode);
1341	printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1342	printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1343	printf("ascii op %p\n", ((void)((PyASCIIObject)(unicode) + `1`)));
1344	printf("compact op %p\n", ((void)((PyCompactUnicodeObject)(unicode) + `1`)));
1345	printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1346	return PyUnicode_DATA(unicode);
1347	}
1348
1349	void
1350	_PyUnicode_Dump(PyObject *op)
1351	{
1352	PyASCIIObject ascii = (PyASCIIObject )op;
1353	PyCompactUnicodeObject compact = (PyCompactUnicodeObject )op;
1354	PyUnicodeObject unicode = (PyUnicodeObject )op;
1355	const void *data;
1356
1357	if (ascii->state.compact)
1358	{
1359	if (ascii->state.ascii)
1360	data = (ascii + `1`);
1361	else
1362	data = (compact + `1`);
1363	}
1364	else
1365	data = unicode->data.any;
1366	printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1367
1368	if (ascii->wstr == data)
1369	printf("shared ");
1370	printf("wstr=%p", (void *)ascii->wstr);
1371
1372	if (!(ascii->state.ascii == `1` && ascii->state.compact == `1`)) {
1373	printf(" (%zu), ", compact->wstr_length);
1374	if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1375	printf("shared ");
1376	}
1377	printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1378	}
1379	printf(", data=%p\n", data);
1380	}
1381	#endif
1382
1383	static int
1384	unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1385	{
1386	// Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1387	// optimized to always use state->empty_string without having to check if
1388	// it is NULL or not.
1389	PyObject *empty = PyUnicode_New(`1`, `0`);
1390	if (empty == NULL) {
1391	return -`1`;
1392	}
1393	PyUnicode_1BYTE_DATA(empty)[`0`] = `0`;
1394	_PyUnicode_LENGTH(empty) = `0`;
1395	assert(_PyUnicode_CheckConsistency(empty, `1`));
1396
1397	assert(state->empty_string == NULL);
1398	state->empty_string = empty;
1399	return `0`;
1400	}
1401
1402
1403	PyObject *
1404	PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1405	{
1406	/ Optimization for empty strings /
1407	if (size == `0`) {
1408	return unicode_new_empty();
1409	}
1410
1411	PyObject *obj;
1412	PyCompactUnicodeObject *unicode;
1413	void *data;
1414	enum PyUnicode_Kind kind;
1415	int is_sharing, is_ascii;
1416	Py_ssize_t char_size;
1417	Py_ssize_t struct_size;
1418
1419	is_ascii = `0`;
1420	is_sharing = `0`;
1421	struct_size = sizeof(PyCompactUnicodeObject);
1422	if (maxchar < `128`) {
1423	kind = PyUnicode_1BYTE_KIND;
1424	char_size = `1`;
1425	is_ascii = `1`;
1426	struct_size = sizeof(PyASCIIObject);
1427	}
1428	else if (maxchar < `256`) {
1429	kind = PyUnicode_1BYTE_KIND;
1430	char_size = `1`;
1431	}
1432	else if (maxchar < `65536`) {
1433	kind = PyUnicode_2BYTE_KIND;
1434	char_size = `2`;
1435	if (sizeof(wchar_t) == `2`)
1436	is_sharing = `1`;
1437	}
1438	else {
1439	if (maxchar > MAX_UNICODE) {
1440	PyErr_SetString(PyExc_SystemError,
1441	"invalid maximum character passed to PyUnicode_New");
1442	return NULL;
1443	}
1444	kind = PyUnicode_4BYTE_KIND;
1445	char_size = `4`;
1446	if (sizeof(wchar_t) == `4`)
1447	is_sharing = `1`;
1448	}
1449
1450	/ Ensure we won't overflow the size. /
1451	if (size < `0`) {
1452	PyErr_SetString(PyExc_SystemError,
1453	"Negative size passed to PyUnicode_New");
1454	return NULL;
1455	}
1456	if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - `1`))
1457	return PyErr_NoMemory();
1458
1459	/ Duplicated allocation code from _PyObject_New() instead of a call to*
1460	* PyObject_New() so we are able to allocate space for the object and
1461	* it's data buffer.
1462	*/
1463	obj = (PyObject ) PyObject_Malloc(struct_size + (size + `1`) char_size);
1464	if (obj == NULL) {
1465	return PyErr_NoMemory();
1466	}
1467	_PyObject_Init(obj, &PyUnicode_Type);
1468
1469	unicode = (PyCompactUnicodeObject *)obj;
1470	if (is_ascii)
1471	data = ((PyASCIIObject*)obj) + `1`;
1472	else
1473	data = unicode + `1`;
1474	_PyUnicode_LENGTH(unicode) = size;
1475	_PyUnicode_HASH(unicode) = -`1`;
1476	_PyUnicode_STATE(unicode).interned = `0`;
1477	_PyUnicode_STATE(unicode).kind = kind;
1478	_PyUnicode_STATE(unicode).compact = `1`;
1479	_PyUnicode_STATE(unicode).ready = `1`;
1480	_PyUnicode_STATE(unicode).ascii = is_ascii;
1481	if (is_ascii) {
1482	((char*)data)[size] = `0`;
1483	_PyUnicode_WSTR(unicode) = NULL;
1484	}
1485	else if (kind == PyUnicode_1BYTE_KIND) {
1486	((char*)data)[size] = `0`;
1487	_PyUnicode_WSTR(unicode) = NULL;
1488	_PyUnicode_WSTR_LENGTH(unicode) = `0`;
1489	unicode->utf8 = NULL;
1490	unicode->utf8_length = `0`;
1491	}
1492	else {
1493	unicode->utf8 = NULL;
1494	unicode->utf8_length = `0`;
1495	if (kind == PyUnicode_2BYTE_KIND)
1496	((Py_UCS2*)data)[size] = `0`;
1497	else / kind == PyUnicode_4BYTE_KIND /
1498	((Py_UCS4*)data)[size] = `0`;
1499	if (is_sharing) {
1500	_PyUnicode_WSTR_LENGTH(unicode) = size;
1501	_PyUnicode_WSTR(unicode) = (wchar_t *)data;
1502	}
1503	else {
1504	_PyUnicode_WSTR_LENGTH(unicode) = `0`;
1505	_PyUnicode_WSTR(unicode) = NULL;
1506	}
1507	}
1508	#ifdef Py_DEBUG
1509	unicode_fill_invalid((PyObject*)unicode, `0`);
1510	#endif
1511	assert(_PyUnicode_CheckConsistency((PyObject*)unicode, `0`));
1512	return obj;
1513	}
1514
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   [begin, end) is the wchar_t input; the output is written to the 4-byte
   data of `unicode`, whose length must already equal the number of decoded
   code points. A high surrogate that is not followed by a low surrogate is
   copied through unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    for (iter = begin; iter < end; ) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
            && (iter+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
        {
            /* Valid pair: two input units become one code point. */
            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
            iter += 2;
        }
        else {
            *ucs4_out++ = *iter;
            iter++;
        }
    }
    /* Every output slot must have been written exactly once. */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1554
1555	static int
1556	unicode_check_modifiable(PyObject *unicode)
1557	{
1558	if (!unicode_modifiable(unicode)) {
1559	PyErr_SetString(PyExc_SystemError,
1560	"Cannot modify a string currently used");
1561	return -`1`;
1562	}
1563	return `0`;
1564	}
1565
1566	static int
1567	_copy_characters(PyObject *to, Py_ssize_t to_start,
1568	PyObject *from, Py_ssize_t from_start,
1569	Py_ssize_t how_many, int check_maxchar)
1570	{
1571	unsigned int from_kind, to_kind;
1572	const void *from_data;
1573	void *to_data;
1574
1575	assert(`0` <= how_many);
1576	assert(`0` <= from_start);
1577	assert(`0` <= to_start);
1578	assert(PyUnicode_Check(from));
1579	assert(PyUnicode_IS_READY(from));
1580	assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1581
1582	assert(PyUnicode_Check(to));
1583	assert(PyUnicode_IS_READY(to));
1584	assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1585
1586	if (how_many == `0`)
1587	return `0`;
1588
1589	from_kind = PyUnicode_KIND(from);
1590	from_data = PyUnicode_DATA(from);
1591	to_kind = PyUnicode_KIND(to);
1592	to_data = PyUnicode_DATA(to);
1593
1594	#ifdef Py_DEBUG
1595	if (!check_maxchar
1596	&& PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1597	{
1598	Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1599	Py_UCS4 ch;
1600	Py_ssize_t i;
1601	for (i=`0`; i < how_many; i++) {
1602	ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1603	assert(ch <= to_maxchar);
1604	}
1605	}
1606	#endif
1607
1608	if (from_kind == to_kind) {
1609	if (check_maxchar
1610	&& !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1611	{
1612	/ Writing Latin-1 characters into an ASCII string requires to*
1613	check that all written characters are pure ASCII /*
1614	Py_UCS4 max_char;
1615	max_char = ucs1lib_find_max_char(from_data,
1616	(const Py_UCS1*)from_data + how_many);
1617	if (max_char >= `128`)
1618	return -`1`;
1619	}
1620	memcpy((char)to_data + to_kind to_start,
1621	(const char)from_data + from_kind from_start,
1622	to_kind * how_many);
1623	}
1624	else if (from_kind == PyUnicode_1BYTE_KIND
1625	&& to_kind == PyUnicode_2BYTE_KIND)
1626	{
1627	_PyUnicode_CONVERT_BYTES(
1628	Py_UCS1, Py_UCS2,
1629	PyUnicode_1BYTE_DATA(from) + from_start,
1630	PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1631	PyUnicode_2BYTE_DATA(to) + to_start
1632	);
1633	}
1634	else if (from_kind == PyUnicode_1BYTE_KIND
1635	&& to_kind == PyUnicode_4BYTE_KIND)
1636	{
1637	_PyUnicode_CONVERT_BYTES(
1638	Py_UCS1, Py_UCS4,
1639	PyUnicode_1BYTE_DATA(from) + from_start,
1640	PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1641	PyUnicode_4BYTE_DATA(to) + to_start
1642	);
1643	}
1644	else if (from_kind == PyUnicode_2BYTE_KIND
1645	&& to_kind == PyUnicode_4BYTE_KIND)
1646	{
1647	_PyUnicode_CONVERT_BYTES(
1648	Py_UCS2, Py_UCS4,
1649	PyUnicode_2BYTE_DATA(from) + from_start,
1650	PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1651	PyUnicode_4BYTE_DATA(to) + to_start
1652	);
1653	}
1654	else {
1655	assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1656
1657	if (!check_maxchar) {
1658	if (from_kind == PyUnicode_2BYTE_KIND
1659	&& to_kind == PyUnicode_1BYTE_KIND)
1660	{
1661	_PyUnicode_CONVERT_BYTES(
1662	Py_UCS2, Py_UCS1,
1663	PyUnicode_2BYTE_DATA(from) + from_start,
1664	PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1665	PyUnicode_1BYTE_DATA(to) + to_start
1666	);
1667	}
1668	else if (from_kind == PyUnicode_4BYTE_KIND
1669	&& to_kind == PyUnicode_1BYTE_KIND)
1670	{
1671	_PyUnicode_CONVERT_BYTES(
1672	Py_UCS4, Py_UCS1,
1673	PyUnicode_4BYTE_DATA(from) + from_start,
1674	PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1675	PyUnicode_1BYTE_DATA(to) + to_start
1676	);
1677	}
1678	else if (from_kind == PyUnicode_4BYTE_KIND
1679	&& to_kind == PyUnicode_2BYTE_KIND)
1680	{
1681	_PyUnicode_CONVERT_BYTES(
1682	Py_UCS4, Py_UCS2,
1683	PyUnicode_4BYTE_DATA(from) + from_start,
1684	PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1685	PyUnicode_2BYTE_DATA(to) + to_start
1686	);
1687	}
1688	else {
1689	Py_UNREACHABLE();
1690	}
1691	}
1692	else {
1693	const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1694	Py_UCS4 ch;
1695	Py_ssize_t i;
1696
1697	for (i=`0`; i < how_many; i++) {
1698	ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1699	if (ch > to_maxchar)
1700	return -`1`;
1701	PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1702	}
1703	}
1704	}
1705	return `0`;
1706	}
1707
1708	void
1709	_PyUnicode_FastCopyCharacters(
1710	PyObject *to, Py_ssize_t to_start,
1711	PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1712	{
1713	(void)_copy_characters(to, to_start, from, from_start, how_many, `0`);
1714	}
1715
1716	Py_ssize_t
1717	PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1718	PyObject *from, Py_ssize_t from_start,
1719	Py_ssize_t how_many)
1720	{
1721	int err;
1722
1723	if (!PyUnicode_Check(from) \|\| !PyUnicode_Check(to)) {
1724	PyErr_BadInternalCall();
1725	return -`1`;
1726	}
1727
1728	if (PyUnicode_READY(from) == -`1`)
1729	return -`1`;
1730	if (PyUnicode_READY(to) == -`1`)
1731	return -`1`;
1732
1733	if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1734	PyErr_SetString(PyExc_IndexError, "string index out of range");
1735	return -`1`;
1736	}
1737	if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1738	PyErr_SetString(PyExc_IndexError, "string index out of range");
1739	return -`1`;
1740	}
1741	if (how_many < `0`) {
1742	PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1743	return -`1`;
1744	}
1745	how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1746	if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1747	PyErr_Format(PyExc_SystemError,
1748	"Cannot write %zi characters at %zi "
1749	"in a string of %zi characters",
1750	how_many, to_start, PyUnicode_GET_LENGTH(to));
1751	return -`1`;
1752	}
1753
1754	if (how_many == `0`)
1755	return `0`;
1756
1757	if (unicode_check_modifiable(to))
1758	return -`1`;
1759
1760	err = _copy_characters(to, to_start, from, from_start, how_many, `1`);
1761	if (err) {
1762	PyErr_Format(PyExc_SystemError,
1763	"Cannot copy %s characters "
1764	"into a string of %s characters",
1765	unicode_kind_name(from),
1766	unicode_kind_name(to));
1767	return -`1`;
1768	}
1769	return how_many;
1770	}
1771
1772	/ Find the maximum code point and count the number of surrogate pairs so a*
1773	correct string length can be computed before converting a string to UCS4.
1774	This function counts single surrogates as a character and not as a pair.
1775
1776	Return 0 on success, or -1 on error. /*
1777	static int
1778	find_maxchar_surrogates(const wchar_t begin, const* wchar_t *end,
1779	Py_UCS4 maxchar, Py_ssize_t num_surrogates)
1780	{
1781	const wchar_t *iter;
1782	Py_UCS4 ch;
1783
1784	assert(num_surrogates != NULL && maxchar != NULL);
1785	*num_surrogates = `0`;
1786	*maxchar = `0`;
1787
1788	for (iter = begin; iter < end; ) {
1789	#if SIZEOF_WCHAR_T == 2
1790	if (Py_UNICODE_IS_HIGH_SURROGATE(iter[`0`])
1791	&& (iter+`1`) < end
1792	&& Py_UNICODE_IS_LOW_SURROGATE(iter[`1`]))
1793	{
1794	ch = Py_UNICODE_JOIN_SURROGATES(iter[`0`], iter[`1`]);
1795	++(*num_surrogates);
1796	iter += `2`;
1797	}
1798	else
1799	#endif
1800	{
1801	ch = *iter;
1802	iter++;
1803	}
1804	if (ch > *maxchar) {
1805	*maxchar = ch;
1806	if (*maxchar > MAX_UNICODE) {
1807	PyErr_Format(PyExc_ValueError,
1808	"character U+%x is not in range [U+0000; U+%x]",
1809	ch, MAX_UNICODE);
1810	return -`1`;
1811	}
1812	}
1813	}
1814	return `0`;
1815	}
1816
1817	int
1818	_PyUnicode_Ready(PyObject *unicode)
1819	{
1820	wchar_t *end;
1821	Py_UCS4 maxchar = `0`;
1822	Py_ssize_t num_surrogates;
1823	#if SIZEOF_WCHAR_T == 2
1824	Py_ssize_t length_wo_surrogates;
1825	#endif
1826
1827	/ _PyUnicode_Ready() is only intended for old-style API usage where*
1828	strings were created using _PyObject_New() and where no canonical
1829	representation (the str field) has been set yet aka strings
1830	which are not yet ready. /*
1831	assert(_PyUnicode_CHECK(unicode));
1832	assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1833	assert(_PyUnicode_WSTR(unicode) != NULL);
1834	assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1835	assert(_PyUnicode_UTF8(unicode) == NULL);
1836	/ Actually, it should neither be interned nor be anything else: /
1837	assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1838
1839	end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1840	if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1841	&maxchar, &num_surrogates) == -`1`)
1842	return -`1`;
1843
1844	if (maxchar < `256`) {
1845	_PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + `1`);
1846	if (!_PyUnicode_DATA_ANY(unicode)) {
1847	PyErr_NoMemory();
1848	return -`1`;
1849	}
1850	_PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1851	_PyUnicode_WSTR(unicode), end,
1852	PyUnicode_1BYTE_DATA(unicode));
1853	PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = `'\0'`;
1854	_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1855	_PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1856	if (maxchar < `128`) {
1857	_PyUnicode_STATE(unicode).ascii = `1`;
1858	_PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1859	_PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1860	}
1861	else {
1862	_PyUnicode_STATE(unicode).ascii = `0`;
1863	_PyUnicode_UTF8(unicode) = NULL;
1864	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1865	}
1866	PyObject_Free(_PyUnicode_WSTR(unicode));
1867	_PyUnicode_WSTR(unicode) = NULL;
1868	_PyUnicode_WSTR_LENGTH(unicode) = `0`;
1869	}
1870	/ In this case we might have to convert down from 4-byte native*
1871	wchar_t to 2-byte unicode. /*
1872	else if (maxchar < `65536`) {
1873	assert(num_surrogates == `0` &&
1874	"FindMaxCharAndNumSurrogatePairs() messed up");
1875
1876	#if SIZEOF_WCHAR_T == 2
1877	/ We can share representations and are done. /
1878	_PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1879	PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = `'\0'`;
1880	_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881	_PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1882	_PyUnicode_UTF8(unicode) = NULL;
1883	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1884	#else
1885	/ sizeof(wchar_t) == 4 /
1886	_PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1887	`2` * (_PyUnicode_WSTR_LENGTH(unicode) + `1`));
1888	if (!_PyUnicode_DATA_ANY(unicode)) {
1889	PyErr_NoMemory();
1890	return -`1`;
1891	}
1892	_PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1893	_PyUnicode_WSTR(unicode), end,
1894	PyUnicode_2BYTE_DATA(unicode));
1895	PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = `'\0'`;
1896	_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897	_PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1898	_PyUnicode_UTF8(unicode) = NULL;
1899	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1900	PyObject_Free(_PyUnicode_WSTR(unicode));
1901	_PyUnicode_WSTR(unicode) = NULL;
1902	_PyUnicode_WSTR_LENGTH(unicode) = `0`;
1903	#endif
1904	}
1905	/ maxchar exceeds 16 bit, wee need 4 bytes for unicode characters /
1906	else {
1907	#if SIZEOF_WCHAR_T == 2
1908	/ in case the native representation is 2-bytes, we need to allocate a*
1909	new normalized 4-byte version. /*
1910	length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1911	if (length_wo_surrogates > PY_SSIZE_T_MAX / `4` - `1`) {
1912	PyErr_NoMemory();
1913	return -`1`;
1914	}
1915	_PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(`4` * (length_wo_surrogates + `1`));
1916	if (!_PyUnicode_DATA_ANY(unicode)) {
1917	PyErr_NoMemory();
1918	return -`1`;
1919	}
1920	_PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1921	_PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922	_PyUnicode_UTF8(unicode) = NULL;
1923	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1924	/ unicode_convert_wchar_to_ucs4() requires a ready string /
1925	_PyUnicode_STATE(unicode).ready = `1`;
1926	unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1927	PyObject_Free(_PyUnicode_WSTR(unicode));
1928	_PyUnicode_WSTR(unicode) = NULL;
1929	_PyUnicode_WSTR_LENGTH(unicode) = `0`;
1930	#else
1931	assert(num_surrogates == `0`);
1932
1933	_PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1934	_PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1935	_PyUnicode_UTF8(unicode) = NULL;
1936	_PyUnicode_UTF8_LENGTH(unicode) = `0`;
1937	_PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1938	#endif
1939	PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = `'\0'`;
1940	}
1941	_PyUnicode_STATE(unicode).ready = `1`;
1942	assert(_PyUnicode_CheckConsistency(unicode, `1`));
1943	return `0`;
1944	}
1945
1946	static void
1947	unicode_dealloc(PyObject *unicode)
1948	{
1949	switch (PyUnicode_CHECK_INTERNED(unicode)) {
1950	case SSTATE_NOT_INTERNED:
1951	break;
1952
1953	case SSTATE_INTERNED_MORTAL:
1954	{
1955	#ifdef INTERNED_STRINGS
1956	/ Revive the dead object temporarily. PyDict_DelItem() removes two*
1957	references (key and value) which were ignored by
1958	PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1959	to prevent calling unicode_dealloc() again. Adjust refcnt after
1960	PyDict_DelItem(). /*
1961	assert(Py_REFCNT(unicode) == `0`);
1962	Py_SET_REFCNT(unicode, `3`);
1963	if (PyDict_DelItem(interned, unicode) != `0`) {
1964	_PyErr_WriteUnraisableMsg("deletion of interned string failed",
1965	NULL);
1966	}
1967	assert(Py_REFCNT(unicode) == `1`);
1968	Py_SET_REFCNT(unicode, `0`);
1969	#endif
1970	break;
1971	}
1972
1973	case SSTATE_INTERNED_IMMORTAL:
1974	_PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1975	break;
1976
1977	default:
1978	Py_UNREACHABLE();
1979	}
1980
1981	if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1982	PyObject_Free(_PyUnicode_WSTR(unicode));
1983	}
1984	if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1985	PyObject_Free(_PyUnicode_UTF8(unicode));
1986	}
1987	if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1988	PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1989	}
1990
1991	Py_TYPE(unicode)->tp_free(unicode);
1992	}
1993
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is one of the cached singleton
   strings (the empty string, or a cached one-character Latin-1 string),
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    if (unicode == state->empty_string) {
        return 1;
    }
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    /* Only a ready string of length 1 can be a Latin-1 singleton. */
    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
        if (ch < 256 && state->latin1[ch] == unicode) {
            return 1;
        }
    }
    return 0;
}
#endif
2013
2014	static int
2015	unicode_modifiable(PyObject *unicode)
2016	{
2017	assert(_PyUnicode_CHECK(unicode));
2018	if (Py_REFCNT(unicode) != `1`)
2019	return `0`;
2020	if (_PyUnicode_HASH(unicode) != -`1`)
2021	return `0`;
2022	if (PyUnicode_CHECK_INTERNED(unicode))
2023	return `0`;
2024	if (!PyUnicode_CheckExact(unicode))
2025	return `0`;
2026	#ifdef Py_DEBUG
2027	/ singleton refcount is greater than 1 /
2028	assert(!unicode_is_singleton(unicode));
2029	#endif
2030	return `1`;
2031	}
2032
2033	static int
2034	unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2035	{
2036	PyObject *unicode;
2037	Py_ssize_t old_length;
2038
2039	assert(p_unicode != NULL);
2040	unicode = *p_unicode;
2041
2042	assert(unicode != NULL);
2043	assert(PyUnicode_Check(unicode));
2044	assert(`0` <= length);
2045
2046	if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2047	old_length = PyUnicode_WSTR_LENGTH(unicode);
2048	else
2049	old_length = PyUnicode_GET_LENGTH(unicode);
2050	if (old_length == length)
2051	return `0`;
2052
2053	if (length == `0`) {
2054	PyObject *empty = unicode_new_empty();
2055	Py_SETREF(*p_unicode, empty);
2056	return `0`;
2057	}
2058
2059	if (!unicode_modifiable(unicode)) {
2060	PyObject *copy = resize_copy(unicode, length);
2061	if (copy == NULL)
2062	return -`1`;
2063	Py_SETREF(*p_unicode, copy);
2064	return `0`;
2065	}
2066
2067	if (PyUnicode_IS_COMPACT(unicode)) {
2068	PyObject *new_unicode = resize_compact(unicode, length);
2069	if (new_unicode == NULL)
2070	return -`1`;
2071	*p_unicode = new_unicode;
2072	return `0`;
2073	}
2074	return resize_inplace(unicode, length);
2075	}
2076
2077	int
2078	PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2079	{
2080	PyObject *unicode;
2081	if (p_unicode == NULL) {
2082	PyErr_BadInternalCall();
2083	return -`1`;
2084	}
2085	unicode = *p_unicode;
2086	if (unicode == NULL \|\| !PyUnicode_Check(unicode) \|\| length < `0`)
2087	{
2088	PyErr_BadInternalCall();
2089	return -`1`;
2090	}
2091	return unicode_resize(p_unicode, length);
2092	}
2093
2094	/ Copy an ASCII or latin1 char* string into a Python Unicode string.*
2095
2096	WARNING: The function doesn't copy the terminating null character and
2097	doesn't check the maximum character (may write a latin1 character in an
2098	ASCII string). /*
2099	static void
2100	unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2101	const char *str, Py_ssize_t len)
2102	{
2103	enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2104	const void *data = PyUnicode_DATA(unicode);
2105	const char *end = str + len;
2106
2107	assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2108	switch (kind) {
2109	case PyUnicode_1BYTE_KIND: {
2110	#ifdef Py_DEBUG
2111	if (PyUnicode_IS_ASCII(unicode)) {
2112	Py_UCS4 maxchar = ucs1lib_find_max_char(
2113	(const Py_UCS1*)str,
2114	(const Py_UCS1*)str + len);
2115	assert(maxchar < `128`);
2116	}
2117	#endif
2118	memcpy((char *) data + index, str, len);
2119	break;
2120	}
2121	case PyUnicode_2BYTE_KIND: {
2122	Py_UCS2 start = (Py_UCS2 )data + index;
2123	Py_UCS2 *ucs2 = start;
2124
2125	for (; str < end; ++ucs2, ++str)
2126	ucs2 = (Py_UCS2)str;
2127
2128	assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2129	break;
2130	}
2131	case PyUnicode_4BYTE_KIND: {
2132	Py_UCS4 start = (Py_UCS4 )data + index;
2133	Py_UCS4 *ucs4 = start;
2134
2135	for (; str < end; ++ucs4, ++str)
2136	ucs4 = (Py_UCS4)str;
2137
2138	assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2139	break;
2140	}
2141	default:
2142	Py_UNREACHABLE();
2143	}
2144	}
2145
2146	static PyObject*
2147	get_latin1_char(Py_UCS1 ch)
2148	{
2149	struct _Py_unicode_state *state = get_unicode_state();
2150
2151	PyObject *unicode = state->latin1[ch];
2152	if (unicode) {
2153	Py_INCREF(unicode);
2154	return unicode;
2155	}
2156
2157	unicode = PyUnicode_New(`1`, ch);
2158	if (!unicode) {
2159	return NULL;
2160	}
2161
2162	PyUnicode_1BYTE_DATA(unicode)[`0`] = ch;
2163	assert(_PyUnicode_CheckConsistency(unicode, `1`));
2164
2165	Py_INCREF(unicode);
2166	state->latin1[ch] = unicode;
2167	return unicode;
2168	}
2169
2170	static PyObject*
2171	unicode_char(Py_UCS4 ch)
2172	{
2173	PyObject *unicode;
2174
2175	assert(ch <= MAX_UNICODE);
2176
2177	if (ch < `256`) {
2178	return get_latin1_char(ch);
2179	}
2180
2181	unicode = PyUnicode_New(`1`, ch);
2182	if (unicode == NULL)
2183	return NULL;
2184
2185	assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2186	if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2187	PyUnicode_2BYTE_DATA(unicode)[`0`] = (Py_UCS2)ch;
2188	} else {
2189	assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2190	PyUnicode_4BYTE_DATA(unicode)[`0`] = ch;
2191	}
2192	assert(_PyUnicode_CheckConsistency(unicode, `1`));
2193	return unicode;
2194	}
2195
2196	PyObject *
2197	PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2198	{
2199	if (u == NULL) {
2200	if (size > `0`) {
2201	if (PyErr_WarnEx(PyExc_DeprecationWarning,
2202	"PyUnicode_FromUnicode(NULL, size) is deprecated; "
2203	"use PyUnicode_New() instead", `1`) < `0`) {
2204	return NULL;
2205	}
2206	}
2207	return (PyObject*)_PyUnicode_New(size);
2208	}
2209
2210	if (size < `0`) {
2211	PyErr_BadInternalCall();
2212	return NULL;
2213	}
2214
2215	return PyUnicode_FromWideChar(u, size);
2216	}
2217
2218	PyObject *
2219	PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2220	{
2221	PyObject *unicode;
2222	Py_UCS4 maxchar = `0`;
2223	Py_ssize_t num_surrogates;
2224
2225	if (u == NULL && size != `0`) {
2226	PyErr_BadInternalCall();
2227	return NULL;
2228	}
2229
2230	if (size == -`1`) {
2231	size = wcslen(u);
2232	}
2233
2234	/ If the Unicode data is known at construction time, we can apply*
2235	some optimizations which share commonly used objects. /*
2236
2237	/ Optimization for empty strings /
2238	if (size == `0`)
2239	_Py_RETURN_UNICODE_EMPTY();
2240
2241	#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2242	/ Oracle Solaris uses non-Unicode internal wchar_t form for*
2243	non-Unicode locales and hence needs conversion to UCS-4 first. /*
2244	if (_Py_LocaleUsesNonUnicodeWchar()) {
2245	wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2246	if (!converted) {
2247	return NULL;
2248	}
2249	PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2250	PyMem_Free(converted);
2251	return unicode;
2252	}
2253	#endif
2254
2255	/ Single character Unicode objects in the Latin-1 range are*
2256	shared when using this constructor /*
2257	if (size == `1` && (Py_UCS4)*u < `256`)
2258	return get_latin1_char((unsigned char)*u);
2259
2260	/ If not empty and not single character, copy the Unicode data*
2261	into the new object /*
2262	if (find_maxchar_surrogates(u, u + size,
2263	&maxchar, &num_surrogates) == -`1`)
2264	return NULL;
2265
2266	unicode = PyUnicode_New(size - num_surrogates, maxchar);
2267	if (!unicode)
2268	return NULL;
2269
2270	switch (PyUnicode_KIND(unicode)) {
2271	case PyUnicode_1BYTE_KIND:
2272	_PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2273	u, u + size, PyUnicode_1BYTE_DATA(unicode));
2274	break;
2275	case PyUnicode_2BYTE_KIND:
2276	#if Py_UNICODE_SIZE == 2
2277	memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * `2`);
2278	#else
2279	_PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2280	u, u + size, PyUnicode_2BYTE_DATA(unicode));
2281	#endif
2282	break;
2283	case PyUnicode_4BYTE_KIND:
2284	#if SIZEOF_WCHAR_T == 2
2285	/ This is the only case which has to process surrogates, thus*
2286	a simple copy loop is not enough and we need a function. /*
2287	unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2288	#else
2289	assert(num_surrogates == `0`);
2290	memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * `4`);
2291	#endif
2292	break;
2293	default:
2294	Py_UNREACHABLE();
2295	}
2296
2297	return unicode_result(unicode);
2298	}
2299
2300	PyObject *
2301	PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2302	{
2303	if (size < `0`) {
2304	PyErr_SetString(PyExc_SystemError,
2305	"Negative size passed to PyUnicode_FromStringAndSize");
2306	return NULL;
2307	}
2308	if (u != NULL) {
2309	return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2310	}
2311	else {
2312	if (size > `0`) {
2313	if (PyErr_WarnEx(PyExc_DeprecationWarning,
2314	"PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2315	"use PyUnicode_New() instead", `1`) < `0`) {
2316	return NULL;
2317	}
2318	}
2319	return (PyObject *)_PyUnicode_New(size);
2320	}
2321	}
2322
2323	PyObject *
2324	PyUnicode_FromString(const char *u)
2325	{
2326	size_t size = strlen(u);
2327	if (size > PY_SSIZE_T_MAX) {
2328	PyErr_SetString(PyExc_OverflowError, "input too long");
2329	return NULL;
2330	}
2331	return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2332	}
2333
2334
2335	PyObject *
2336	_PyUnicode_FromId(_Py_Identifier *id)
2337	{
2338	PyInterpreterState *interp = _PyInterpreterState_GET();
2339	struct _Py_unicode_ids *ids = &interp->unicode.ids;
2340
2341	Py_ssize_t index = _Py_atomic_size_get(&id->index);
2342	if (index < `0`) {
2343	struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2344
2345	PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2346	// Check again to detect concurrent access. Another thread can have
2347	// initialized the index while this thread waited for the lock.
2348	index = _Py_atomic_size_get(&id->index);
2349	if (index < `0`) {
2350	assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2351	index = rt_ids->next_index;
2352	rt_ids->next_index++;
2353	_Py_atomic_size_set(&id->index, index);
2354	}
2355	PyThread_release_lock(rt_ids->lock);
2356	}
2357	assert(index >= `0`);
2358
2359	PyObject *obj;
2360	if (index < ids->size) {
2361	obj = ids->array[index];
2362	if (obj) {
2363	// Return a borrowed reference
2364	return obj;
2365	}
2366	}
2367
2368	obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2369	NULL, NULL);
2370	if (!obj) {
2371	return NULL;
2372	}
2373	PyUnicode_InternInPlace(&obj);
2374
2375	if (index >= ids->size) {
2376	// Overallocate to reduce the number of realloc
2377	Py_ssize_t new_size = Py_MAX(index * `2`, `16`);
2378	Py_ssize_t item_size = sizeof(ids->array[`0`]);
2379	PyObject *new_array = PyMem_Realloc(ids->array, new_size item_size);
2380	if (new_array == NULL) {
2381	PyErr_NoMemory();
2382	return NULL;
2383	}
2384	memset(&new_array[ids->size], `0`, (new_size - ids->size) * item_size);
2385	ids->array = new_array;
2386	ids->size = new_size;
2387	}
2388
2389	// The array stores a strong reference
2390	ids->array[index] = obj;
2391
2392	// Return a borrowed reference
2393	return obj;
2394	}
2395
2396
2397	static void
2398	unicode_clear_identifiers(struct _Py_unicode_state *state)
2399	{
2400	struct _Py_unicode_ids *ids = &state->ids;
2401	for (Py_ssize_t i=`0`; i < ids->size; i++) {
2402	Py_XDECREF(ids->array[i]);
2403	}
2404	ids->size = `0`;
2405	PyMem_Free(ids->array);
2406	ids->array = NULL;
2407	// Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2408	// after Py_Finalize().
2409	}
2410
2411
2412	/ Internal function, doesn't check maximum character /
2413
2414	PyObject*
2415	_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2416	{
2417	const unsigned char s = (const* unsigned char *)buffer;
2418	PyObject *unicode;
2419	if (size == `1`) {
2420	#ifdef Py_DEBUG
2421	assert((unsigned char)s[`0`] < `128`);
2422	#endif
2423	return get_latin1_char(s[`0`]);
2424	}
2425	unicode = PyUnicode_New(size, `127`);
2426	if (!unicode)
2427	return NULL;
2428	memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2429	assert(_PyUnicode_CheckConsistency(unicode, `1`));
2430	return unicode;
2431	}
2432
2433	static Py_UCS4
2434	kind_maxchar_limit(unsigned int kind)
2435	{
2436	switch (kind) {
2437	case PyUnicode_1BYTE_KIND:
2438	return `0x80`;
2439	case PyUnicode_2BYTE_KIND:
2440	return `0x100`;
2441	case PyUnicode_4BYTE_KIND:
2442	return `0x10000`;
2443	default:
2444	Py_UNREACHABLE();
2445	}
2446	}
2447
2448	static PyObject*
2449	_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2450	{
2451	PyObject *res;
2452	unsigned char max_char;
2453
2454	if (size == `0`) {
2455	_Py_RETURN_UNICODE_EMPTY();
2456	}
2457	assert(size > `0`);
2458	if (size == `1`) {
2459	return get_latin1_char(u[`0`]);
2460	}
2461
2462	max_char = ucs1lib_find_max_char(u, u + size);
2463	res = PyUnicode_New(size, max_char);
2464	if (!res)
2465	return NULL;
2466	memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2467	assert(_PyUnicode_CheckConsistency(res, `1`));
2468	return res;
2469	}
2470
2471	static PyObject*
2472	_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2473	{
2474	PyObject *res;
2475	Py_UCS2 max_char;
2476
2477	if (size == `0`)
2478	_Py_RETURN_UNICODE_EMPTY();
2479	assert(size > `0`);
2480	if (size == `1`)
2481	return unicode_char(u[`0`]);
2482
2483	max_char = ucs2lib_find_max_char(u, u + size);
2484	res = PyUnicode_New(size, max_char);
2485	if (!res)
2486	return NULL;
2487	if (max_char >= `256`)
2488	memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2489	else {
2490	_PyUnicode_CONVERT_BYTES(
2491	Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2492	}
2493	assert(_PyUnicode_CheckConsistency(res, `1`));
2494	return res;
2495	}
2496
2497	static PyObject*
2498	_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2499	{
2500	PyObject *res;
2501	Py_UCS4 max_char;
2502
2503	if (size == `0`)
2504	_Py_RETURN_UNICODE_EMPTY();
2505	assert(size > `0`);
2506	if (size == `1`)
2507	return unicode_char(u[`0`]);
2508
2509	max_char = ucs4lib_find_max_char(u, u + size);
2510	res = PyUnicode_New(size, max_char);
2511	if (!res)
2512	return NULL;
2513	if (max_char < `256`)
2514	_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2515	PyUnicode_1BYTE_DATA(res));
2516	else if (max_char < `0x10000`)
2517	_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2518	PyUnicode_2BYTE_DATA(res));
2519	else
2520	memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2521	assert(_PyUnicode_CheckConsistency(res, `1`));
2522	return res;
2523	}
2524
2525	PyObject*
2526	PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2527	{
2528	if (size < `0`) {
2529	PyErr_SetString(PyExc_ValueError, "size must be positive");
2530	return NULL;
2531	}
2532	switch (kind) {
2533	case PyUnicode_1BYTE_KIND:
2534	return _PyUnicode_FromUCS1(buffer, size);
2535	case PyUnicode_2BYTE_KIND:
2536	return _PyUnicode_FromUCS2(buffer, size);
2537	case PyUnicode_4BYTE_KIND:
2538	return _PyUnicode_FromUCS4(buffer, size);
2539	default:
2540	PyErr_SetString(PyExc_SystemError, "invalid kind");
2541	return NULL;
2542	}
2543	}
2544
2545	Py_UCS4
2546	_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2547	{
2548	enum PyUnicode_Kind kind;
2549	const void startptr, endptr;
2550
2551	assert(PyUnicode_IS_READY(unicode));
2552	assert(`0` <= start);
2553	assert(end <= PyUnicode_GET_LENGTH(unicode));
2554	assert(start <= end);
2555
2556	if (start == `0` && end == PyUnicode_GET_LENGTH(unicode))
2557	return PyUnicode_MAX_CHAR_VALUE(unicode);
2558
2559	if (start == end)
2560	return `127`;
2561
2562	if (PyUnicode_IS_ASCII(unicode))
2563	return `127`;
2564
2565	kind = PyUnicode_KIND(unicode);
2566	startptr = PyUnicode_DATA(unicode);
2567	endptr = (char )startptr + end kind;
2568	startptr = (char )startptr + start kind;
2569	switch(kind) {
2570	case PyUnicode_1BYTE_KIND:
2571	return ucs1lib_find_max_char(startptr, endptr);
2572	case PyUnicode_2BYTE_KIND:
2573	return ucs2lib_find_max_char(startptr, endptr);
2574	case PyUnicode_4BYTE_KIND:
2575	return ucs4lib_find_max_char(startptr, endptr);
2576	default:
2577	Py_UNREACHABLE();
2578	}
2579	}
2580
2581	/ Ensure that a string uses the most efficient storage, if it is not the*
2582	case: create a new string with of the right kind. Write NULL into p_unicode*
2583	on error. /*
2584	static void
2585	unicode_adjust_maxchar(PyObject **p_unicode)
2586	{
2587	PyObject unicode, copy;
2588	Py_UCS4 max_char;
2589	Py_ssize_t len;
2590	unsigned int kind;
2591
2592	assert(p_unicode != NULL);
2593	unicode = *p_unicode;
2594	assert(PyUnicode_IS_READY(unicode));
2595	if (PyUnicode_IS_ASCII(unicode))
2596	return;
2597
2598	len = PyUnicode_GET_LENGTH(unicode);
2599	kind = PyUnicode_KIND(unicode);
2600	if (kind == PyUnicode_1BYTE_KIND) {
2601	const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2602	max_char = ucs1lib_find_max_char(u, u + len);
2603	if (max_char >= `128`)
2604	return;
2605	}
2606	else if (kind == PyUnicode_2BYTE_KIND) {
2607	const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2608	max_char = ucs2lib_find_max_char(u, u + len);
2609	if (max_char >= `256`)
2610	return;
2611	}
2612	else if (kind == PyUnicode_4BYTE_KIND) {
2613	const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2614	max_char = ucs4lib_find_max_char(u, u + len);
2615	if (max_char >= `0x10000`)
2616	return;
2617	}
2618	else
2619	Py_UNREACHABLE();
2620
2621	copy = PyUnicode_New(len, max_char);
2622	if (copy != NULL)
2623	_PyUnicode_FastCopyCharacters(copy, `0`, unicode, `0`, len);
2624	Py_DECREF(unicode);
2625	*p_unicode = copy;
2626	}
2627
2628	PyObject*
2629	_PyUnicode_Copy(PyObject *unicode)
2630	{
2631	Py_ssize_t length;
2632	PyObject *copy;
2633
2634	if (!PyUnicode_Check(unicode)) {
2635	PyErr_BadInternalCall();
2636	return NULL;
2637	}
2638	if (PyUnicode_READY(unicode) == -`1`)
2639	return NULL;
2640
2641	length = PyUnicode_GET_LENGTH(unicode);
2642	copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2643	if (!copy)
2644	return NULL;
2645	assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2646
2647	memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2648	length * PyUnicode_KIND(unicode));
2649	assert(_PyUnicode_CheckConsistency(copy, `1`));
2650	return copy;
2651	}
2652
2653
2654	/ Widen Unicode objects to larger buffers. Don't write terminating null*
2655	character. Return NULL on error. /*
2656
2657	static void*
2658	unicode_askind(unsigned int skind, void const data, Py_ssize_t len, unsigned* int kind)
2659	{
2660	void *result;
2661
2662	assert(skind < kind);
2663	switch (kind) {
2664	case PyUnicode_2BYTE_KIND:
2665	result = PyMem_New(Py_UCS2, len);
2666	if (!result)
2667	return PyErr_NoMemory();
2668	assert(skind == PyUnicode_1BYTE_KIND);
2669	_PyUnicode_CONVERT_BYTES(
2670	Py_UCS1, Py_UCS2,
2671	(const Py_UCS1 *)data,
2672	((const Py_UCS1 *)data) + len,
2673	result);
2674	return result;
2675	case PyUnicode_4BYTE_KIND:
2676	result = PyMem_New(Py_UCS4, len);
2677	if (!result)
2678	return PyErr_NoMemory();
2679	if (skind == PyUnicode_2BYTE_KIND) {
2680	_PyUnicode_CONVERT_BYTES(
2681	Py_UCS2, Py_UCS4,
2682	(const Py_UCS2 *)data,
2683	((const Py_UCS2 *)data) + len,
2684	result);
2685	}
2686	else {
2687	assert(skind == PyUnicode_1BYTE_KIND);
2688	_PyUnicode_CONVERT_BYTES(
2689	Py_UCS1, Py_UCS4,
2690	(const Py_UCS1 *)data,
2691	((const Py_UCS1 *)data) + len,
2692	result);
2693	}
2694	return result;
2695	default:
2696	Py_UNREACHABLE();
2697	return NULL;
2698	}
2699	}
2700
2701	static Py_UCS4*
2702	as_ucs4(PyObject string, Py_UCS4 target, Py_ssize_t targetsize,
2703	int copy_null)
2704	{
2705	int kind;
2706	const void *data;
2707	Py_ssize_t len, targetlen;
2708	if (PyUnicode_READY(string) == -`1`)
2709	return NULL;
2710	kind = PyUnicode_KIND(string);
2711	data = PyUnicode_DATA(string);
2712	len = PyUnicode_GET_LENGTH(string);
2713	targetlen = len;
2714	if (copy_null)
2715	targetlen++;
2716	if (!target) {
2717	target = PyMem_New(Py_UCS4, targetlen);
2718	if (!target) {
2719	PyErr_NoMemory();
2720	return NULL;
2721	}
2722	}
2723	else {
2724	if (targetsize < targetlen) {
2725	PyErr_Format(PyExc_SystemError,
2726	"string is longer than the buffer");
2727	if (copy_null && `0` < targetsize)
2728	target[`0`] = `0`;
2729	return NULL;
2730	}
2731	}
2732	if (kind == PyUnicode_1BYTE_KIND) {
2733	const Py_UCS1 start = (const* Py_UCS1 *) data;
2734	_PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2735	}
2736	else if (kind == PyUnicode_2BYTE_KIND) {
2737	const Py_UCS2 start = (const* Py_UCS2 *) data;
2738	_PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2739	}
2740	else if (kind == PyUnicode_4BYTE_KIND) {
2741	memcpy(target, data, len * sizeof(Py_UCS4));
2742	}
2743	else {
2744	Py_UNREACHABLE();
2745	}
2746	if (copy_null)
2747	target[len] = `0`;
2748	return target;
2749	}
2750
2751	Py_UCS4*
2752	PyUnicode_AsUCS4(PyObject string, Py_UCS4 target, Py_ssize_t targetsize,
2753	int copy_null)
2754	{
2755	if (target == NULL \|\| targetsize < `0`) {
2756	PyErr_BadInternalCall();
2757	return NULL;
2758	}
2759	return as_ucs4(string, target, targetsize, copy_null);
2760	}
2761
2762	Py_UCS4*
2763	PyUnicode_AsUCS4Copy(PyObject *string)
2764	{
2765	return as_ucs4(string, NULL, `0`, `1`);
2766	}
2767
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2771	#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2772
2773	static int
2774	unicode_fromformat_write_str(_PyUnicodeWriter writer, PyObject str,
2775	Py_ssize_t width, Py_ssize_t precision)
2776	{
2777	Py_ssize_t length, fill, arglen;
2778	Py_UCS4 maxchar;
2779
2780	if (PyUnicode_READY(str) == -`1`)
2781	return -`1`;
2782
2783	length = PyUnicode_GET_LENGTH(str);
2784	if ((precision == -`1` \|\| precision >= length)
2785	&& width <= length)
2786	return _PyUnicodeWriter_WriteStr(writer, str);
2787
2788	if (precision != -`1`)
2789	length = Py_MIN(precision, length);
2790
2791	arglen = Py_MAX(length, width);
2792	if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2793	maxchar = _PyUnicode_FindMaxChar(str, `0`, length);
2794	else
2795	maxchar = writer->maxchar;
2796
2797	if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -`1`)
2798	return -`1`;
2799
2800	if (width > length) {
2801	fill = width - length;
2802	if (PyUnicode_Fill(writer->buffer, writer->pos, fill, `' '`) == -`1`)
2803	return -`1`;
2804	writer->pos += fill;
2805	}
2806
2807	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2808	str, `0`, length);
2809	writer->pos += length;
2810	return `0`;
2811	}
2812
2813	static int
2814	unicode_fromformat_write_cstr(_PyUnicodeWriter writer, const* char *str,
2815	Py_ssize_t width, Py_ssize_t precision)
2816	{
2817	/ UTF-8 /
2818	Py_ssize_t length;
2819	PyObject *unicode;
2820	int res;
2821
2822	if (precision == -`1`) {
2823	length = strlen(str);
2824	}
2825	else {
2826	length = `0`;
2827	while (length < precision && str[length]) {
2828	length++;
2829	}
2830	}
2831	unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2832	if (unicode == NULL)
2833	return -`1`;
2834
2835	res = unicode_fromformat_write_str(writer, unicode, width, -`1`);
2836	Py_DECREF(unicode);
2837	return res;
2838	}
2839
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   writer: receives the formatted output.
   f:      points at the '%' that starts the directive.
   vargs:  pointer to the argument list; the arguments required by the
           conversion are consumed with va_arg().

   Return a pointer to the first format character after the directive, or
   NULL when an error was raised here or a called function reported
   failure.  For an unrecognized conversion the rest of the format string
   (from the '%' on) is copied to the output and the returned pointer is at
   the terminating null character. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts: the "unknown code" fallback at
       the end copies the format string from here. */
    p = f;
    f++;
    /* A leading '0' selects '0' instead of ' ' as the width fill character
       of the integer conversions. */
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* The flags are only recognized when directly followed by 'd', 'u' or
       'i'; in any other position 'l' and 'z' reach the switch below and
       end up in its default branch. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format
       string: switch off the writer's overallocation. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single code point passed as int; width and precision are not
           used by this conversion. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the number with the C library first; the argument type
           read from vargs depends on the conversion and its size flag. */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            /* 'x' always reads an int; no size flag applies to it. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* From here on precision is the number of characters written for
           the number itself, including any '0' fill added below. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Left-pad up to the field width with '0' or ' '. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Then fill with '0' up to the requested precision. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the text (and its null terminator) right by two to
               make room for the "0x" prefix. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object and a C string
           that is used only when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Any object, converted with PyObject_Str(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Any object, converted with PyObject_Repr(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Any object, converted with PyObject_ASCII(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3137
3138	PyObject *
3139	PyUnicode_FromFormatV(const char *format, va_list vargs)
3140	{
3141	va_list vargs2;
3142	const char *f;
3143	_PyUnicodeWriter writer;
3144
3145	_PyUnicodeWriter_Init(&writer);
3146	writer.min_length = strlen(format) + `100`;
3147	writer.overallocate = `1`;
3148
3149	// Copy varags to be able to pass a reference to a subfunction.
3150	va_copy(vargs2, vargs);
3151
3152	for (f = format; *f; ) {
3153	if (*f == `'%'`) {
3154	f = unicode_fromformat_arg(&writer, f, &vargs2);
3155	if (f == NULL)
3156	goto fail;
3157	}
3158	else {
3159	const char *p;
3160	Py_ssize_t len;
3161
3162	p = f;
3163	do
3164	{
3165	if ((unsigned char)*p > `127`) {
3166	PyErr_Format(PyExc_ValueError,
3167	"PyUnicode_FromFormatV() expects an ASCII-encoded format "
3168	"string, got a non-ASCII byte: 0x%02x",
3169	(unsigned char)*p);
3170	goto fail;
3171	}
3172	p++;
3173	}
3174	while (p != `'\0'` && p != `'%'`);
3175	len = p - f;
3176
3177	if (*p == `'\0'`)
3178	writer.overallocate = `0`;
3179
3180	if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < `0`)
3181	goto fail;
3182
3183	f = p;
3184	}
3185	}
3186	va_end(vargs2);
3187	return _PyUnicodeWriter_Finish(&writer);
3188
3189	fail:
3190	va_end(vargs2);
3191	_PyUnicodeWriter_Dealloc(&writer);
3192	return NULL;
3193	}
3194
3195	PyObject *
3196	PyUnicode_FromFormat(const char *format, ...)
3197	{
3198	PyObject* ret;
3199	va_list vargs;
3200
3201	#ifdef HAVE_STDARG_PROTOTYPES
3202	va_start(vargs, format);
3203	#else
3204	va_start(vargs);
3205	#endif
3206	ret = PyUnicode_FromFormatV(format, vargs);
3207	va_end(vargs);
3208	return ret;
3209	}
3210
3211	static Py_ssize_t
3212	unicode_get_widechar_size(PyObject *unicode)
3213	{
3214	Py_ssize_t res;
3215
3216	assert(unicode != NULL);
3217	assert(_PyUnicode_CHECK(unicode));
3218
3219	#if USE_UNICODE_WCHAR_CACHE
3220	if (_PyUnicode_WSTR(unicode) != NULL) {
3221	return PyUnicode_WSTR_LENGTH(unicode);
3222	}
3223	#endif /* USE_UNICODE_WCHAR_CACHE */
3224	assert(PyUnicode_IS_READY(unicode));
3225
3226	res = _PyUnicode_LENGTH(unicode);
3227	#if SIZEOF_WCHAR_T == 2
3228	if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3229	const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3230	const Py_UCS4 *end = s + res;
3231	for (; s < end; ++s) {
3232	if (*s > `0xFFFF`) {
3233	++res;
3234	}
3235	}
3236	}
3237	#endif
3238	return res;
3239	}
3240
3241	static void
3242	unicode_copy_as_widechar(PyObject unicode, wchar_t w, Py_ssize_t size)
3243	{
3244	assert(unicode != NULL);
3245	assert(_PyUnicode_CHECK(unicode));
3246
3247	#if USE_UNICODE_WCHAR_CACHE
3248	const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3249	if (wstr != NULL) {
3250	memcpy(w, wstr, size * sizeof(wchar_t));
3251	return;
3252	}
3253	#else /* USE_UNICODE_WCHAR_CACHE */
3254	if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3255	memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3256	return;
3257	}
3258	#endif /* USE_UNICODE_WCHAR_CACHE */
3259	assert(PyUnicode_IS_READY(unicode));
3260
3261	if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3262	const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3263	for (; size--; ++s, ++w) {
3264	w = s;
3265	}
3266	}
3267	else {
3268	#if SIZEOF_WCHAR_T == 4
3269	assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3270	const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3271	for (; size--; ++s, ++w) {
3272	w = s;
3273	}
3274	#else
3275	assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3276	const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3277	for (; size--; ++s, ++w) {
3278	Py_UCS4 ch = *s;
3279	if (ch > `0xFFFF`) {
3280	assert(ch <= MAX_UNICODE);
3281	/ encode surrogate pair in this case /
3282	*w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3283	if (!size--)
3284	break;
3285	*w = Py_UNICODE_LOW_SURROGATE(ch);
3286	}
3287	else {
3288	*w = ch;
3289	}
3290	}
3291	#endif
3292	}
3293	}
3294
3295	#ifdef HAVE_WCHAR_H
3296
3297	/ Convert a Unicode object to a wide character string.*
3298
3299	- If w is NULL: return the number of wide characters (including the null
3300	character) required to convert the unicode object. Ignore size argument.
3301
3302	- Otherwise: return the number of wide characters (excluding the null
3303	character) written into w. Write at most size wide characters (including
3304	the null character). /*
3305	Py_ssize_t
3306	PyUnicode_AsWideChar(PyObject *unicode,
3307	wchar_t *w,
3308	Py_ssize_t size)
3309	{
3310	Py_ssize_t res;
3311
3312	if (unicode == NULL) {
3313	PyErr_BadInternalCall();
3314	return -`1`;
3315	}
3316	if (!PyUnicode_Check(unicode)) {
3317	PyErr_BadArgument();
3318	return -`1`;
3319	}
3320
3321	res = unicode_get_widechar_size(unicode);
3322	if (w == NULL) {
3323	return res + `1`;
3324	}
3325
3326	if (size > res) {
3327	size = res + `1`;
3328	}
3329	else {
3330	res = size;
3331	}
3332	unicode_copy_as_widechar(unicode, w, size);
3333
3334	#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3335	/ Oracle Solaris uses non-Unicode internal wchar_t form for*
3336	non-Unicode locales and hence needs conversion first. /*
3337	if (_Py_LocaleUsesNonUnicodeWchar()) {
3338	if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < `0`) {
3339	return -`1`;
3340	}
3341	}
3342	#endif
3343
3344	return res;
3345	}
3346
3347	wchar_t*
3348	PyUnicode_AsWideCharString(PyObject *unicode,
3349	Py_ssize_t *size)
3350	{
3351	wchar_t *buffer;
3352	Py_ssize_t buflen;
3353
3354	if (unicode == NULL) {
3355	PyErr_BadInternalCall();
3356	return NULL;
3357	}
3358	if (!PyUnicode_Check(unicode)) {
3359	PyErr_BadArgument();
3360	return NULL;
3361	}
3362
3363	buflen = unicode_get_widechar_size(unicode);
3364	buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + `1`));
3365	if (buffer == NULL) {
3366	PyErr_NoMemory();
3367	return NULL;
3368	}
3369	unicode_copy_as_widechar(unicode, buffer, buflen + `1`);
3370
3371	#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3372	/ Oracle Solaris uses non-Unicode internal wchar_t form for*
3373	non-Unicode locales and hence needs conversion first. /*
3374	if (_Py_LocaleUsesNonUnicodeWchar()) {
3375	if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + `1`)) < `0`) {
3376	return NULL;
3377	}
3378	}
3379	#endif
3380
3381	if (size != NULL) {
3382	*size = buflen;
3383	}
3384	else if (wcslen(buffer) != (size_t)buflen) {
3385	PyMem_Free(buffer);
3386	PyErr_SetString(PyExc_ValueError,
3387	"embedded null character");
3388	return NULL;
3389	}
3390	return buffer;
3391	}
3392
3393	#endif /* HAVE_WCHAR_H */
3394
3395	int
3396	_PyUnicode_WideCharString_Converter(PyObject obj, void* *ptr)
3397	{
3398	wchar_t p = (wchar_t )ptr;
3399	if (obj == NULL) {
3400	#if !USE_UNICODE_WCHAR_CACHE
3401	PyMem_Free(*p);
3402	#endif /* USE_UNICODE_WCHAR_CACHE */
3403	*p = NULL;
3404	return `1`;
3405	}
3406	if (PyUnicode_Check(obj)) {
3407	#if USE_UNICODE_WCHAR_CACHE
3408	p = (wchar_t )_PyUnicode_AsUnicode(obj);
3409	if (*p == NULL) {
3410	return `0`;
3411	}
3412	return `1`;
3413	#else /* USE_UNICODE_WCHAR_CACHE */
3414	*p = PyUnicode_AsWideCharString(obj, NULL);
3415	if (*p == NULL) {
3416	return `0`;
3417	}
3418	return Py_CLEANUP_SUPPORTED;
3419	#endif /* USE_UNICODE_WCHAR_CACHE */
3420	}
3421	PyErr_Format(PyExc_TypeError,
3422	"argument must be str, not %.50s",
3423	Py_TYPE(obj)->tp_name);
3424	return `0`;
3425	}
3426
3427	int
3428	_PyUnicode_WideCharString_Opt_Converter(PyObject obj, void* *ptr)
3429	{
3430	wchar_t p = (wchar_t )ptr;
3431	if (obj == NULL) {
3432	#if !USE_UNICODE_WCHAR_CACHE
3433	PyMem_Free(*p);
3434	#endif /* USE_UNICODE_WCHAR_CACHE */
3435	*p = NULL;
3436	return `1`;
3437	}
3438	if (obj == Py_None) {
3439	*p = NULL;
3440	return `1`;
3441	}
3442	if (PyUnicode_Check(obj)) {
3443	#if USE_UNICODE_WCHAR_CACHE
3444	p = (wchar_t )_PyUnicode_AsUnicode(obj);
3445	if (*p == NULL) {
3446	return `0`;
3447	}
3448	return `1`;
3449	#else /* USE_UNICODE_WCHAR_CACHE */
3450	*p = PyUnicode_AsWideCharString(obj, NULL);
3451	if (*p == NULL) {
3452	return `0`;
3453	}
3454	return Py_CLEANUP_SUPPORTED;
3455	#endif /* USE_UNICODE_WCHAR_CACHE */
3456	}
3457	PyErr_Format(PyExc_TypeError,
3458	"argument must be str or None, not %.50s",
3459	Py_TYPE(obj)->tp_name);
3460	return `0`;
3461	}
3462
3463	PyObject *
3464	PyUnicode_FromOrdinal(int ordinal)
3465	{
3466	if (ordinal < `0` \|\| ordinal > MAX_UNICODE) {
3467	PyErr_SetString(PyExc_ValueError,
3468	"chr() arg not in range(0x110000)");
3469	return NULL;
3470	}
3471
3472	return unicode_char((Py_UCS4)ordinal);
3473	}
3474
3475	PyObject *
3476	PyUnicode_FromObject(PyObject *obj)
3477	{
3478	/ XXX Perhaps we should make this API an alias of*
3479	PyObject_Str() instead ?! /*
3480	if (PyUnicode_CheckExact(obj)) {
3481	if (PyUnicode_READY(obj) == -`1`)
3482	return NULL;
3483	Py_INCREF(obj);
3484	return obj;
3485	}
3486	if (PyUnicode_Check(obj)) {
3487	/ For a Unicode subtype that's not a Unicode object,*
3488	return a true Unicode object with the same data. /*
3489	return _PyUnicode_Copy(obj);
3490	}
3491	PyErr_Format(PyExc_TypeError,
3492	"Can't convert '%.100s' object to str implicitly",
3493	Py_TYPE(obj)->tp_name);
3494	return NULL;
3495	}
3496
3497	PyObject *
3498	PyUnicode_FromEncodedObject(PyObject *obj,
3499	const char *encoding,
3500	const char *errors)
3501	{
3502	Py_buffer buffer;
3503	PyObject *v;
3504
3505	if (obj == NULL) {
3506	PyErr_BadInternalCall();
3507	return NULL;
3508	}
3509
3510	/ Decoding bytes objects is the most common case and should be fast /
3511	if (PyBytes_Check(obj)) {
3512	if (PyBytes_GET_SIZE(obj) == `0`) {
3513	if (unicode_check_encoding_errors(encoding, errors) < `0`) {
3514	return NULL;
3515	}
3516	_Py_RETURN_UNICODE_EMPTY();
3517	}
3518	return PyUnicode_Decode(
3519	PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3520	encoding, errors);
3521	}
3522
3523	if (PyUnicode_Check(obj)) {
3524	PyErr_SetString(PyExc_TypeError,
3525	"decoding str is not supported");
3526	return NULL;
3527	}
3528
3529	/ Retrieve a bytes buffer view through the PEP 3118 buffer interface /
3530	if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < `0`) {
3531	PyErr_Format(PyExc_TypeError,
3532	"decoding to str: need a bytes-like object, %.80s found",
3533	Py_TYPE(obj)->tp_name);
3534	return NULL;
3535	}
3536
3537	if (buffer.len == `0`) {
3538	PyBuffer_Release(&buffer);
3539	if (unicode_check_encoding_errors(encoding, errors) < `0`) {
3540	return NULL;
3541	}
3542	_Py_RETURN_UNICODE_EMPTY();
3543	}
3544
3545	v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3546	PyBuffer_Release(&buffer);
3547	return v;
3548	}
3549
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase. Return 1 on success, or 0 on error (encoding is
   longer than lower_len-1).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters collapses into a single '_', which is only emitted
   between two copied characters (never at the start or at the end).
   On success the result written to lower is null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit the pending separator, unless nothing was written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3598
3599	PyObject *
3600	PyUnicode_Decode(const char *s,
3601	Py_ssize_t size,
3602	const char *encoding,
3603	const char *errors)
3604	{
3605	PyObject buffer = NULL, unicode;
3606	Py_buffer info;
3607	char buflower[`11`]; / strlen("iso-8859-1\0") == 11, longest shortcut /
3608
3609	if (unicode_check_encoding_errors(encoding, errors) < `0`) {
3610	return NULL;
3611	}
3612
3613	if (size == `0`) {
3614	_Py_RETURN_UNICODE_EMPTY();
3615	}
3616
3617	if (encoding == NULL) {
3618	return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3619	}
3620
3621	/ Shortcuts for common default encodings /
3622	if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3623	char *lower = buflower;
3624
3625	/ Fast paths /
3626	if (lower[`0`] == `'u'` && lower[`1`] == `'t'` && lower[`2`] == `'f'`) {
3627	lower += `3`;
3628	if (*lower == `'_'`) {
3629	/ Match "utf8" and "utf_8" /
3630	lower++;
3631	}
3632
3633	if (lower[`0`] == `'8'` && lower[`1`] == `0`) {
3634	return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3635	}
3636	else if (lower[`0`] == `'1'` && lower[`1`] == `'6'` && lower[`2`] == `0`) {
3637	return PyUnicode_DecodeUTF16(s, size, errors, `0`);
3638	}
3639	else if (lower[`0`] == `'3'` && lower[`1`] == `'2'` && lower[`2`] == `0`) {
3640	return PyUnicode_DecodeUTF32(s, size, errors, `0`);
3641	}
3642	}
3643	else {
3644	if (strcmp(lower, "ascii") == `0`
3645	\|\| strcmp(lower, "us_ascii") == `0`) {
3646	return PyUnicode_DecodeASCII(s, size, errors);
3647	}
3648	#ifdef MS_WINDOWS
3649	else if (strcmp(lower, "mbcs") == `0`) {
3650	return PyUnicode_DecodeMBCS(s, size, errors);
3651	}
3652	#endif
3653	else if (strcmp(lower, "latin1") == `0`
3654	\|\| strcmp(lower, "latin_1") == `0`
3655	\|\| strcmp(lower, "iso_8859_1") == `0`
3656	\|\| strcmp(lower, "iso8859_1") == `0`) {
3657	return PyUnicode_DecodeLatin1(s, size, errors);
3658	}
3659	}
3660	}
3661
3662	/ Decode via the codec registry /
3663	buffer = NULL;
3664	if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, `1`, PyBUF_FULL_RO) < `0`)
3665	goto onError;
3666	buffer = PyMemoryView_FromBuffer(&info);
3667	if (buffer == NULL)
3668	goto onError;
3669	unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3670	if (unicode == NULL)
3671	goto onError;
3672	if (!PyUnicode_Check(unicode)) {
3673	PyErr_Format(PyExc_TypeError,
3674	"'%.400s' decoder returned '%.400s' instead of 'str'; "
3675	"use codecs.decode() to decode to arbitrary types",
3676	encoding,
3677	Py_TYPE(unicode)->tp_name);
3678	Py_DECREF(unicode);
3679	goto onError;
3680	}
3681	Py_DECREF(buffer);
3682	return unicode_result(unicode);
3683
3684	onError:
3685	Py_XDECREF(buffer);
3686	return NULL;
3687	}
3688
3689	PyObject *
3690	PyUnicode_AsDecodedObject(PyObject *unicode,
3691	const char *encoding,
3692	const char *errors)
3693	{
3694	if (!PyUnicode_Check(unicode)) {
3695	PyErr_BadArgument();
3696	return NULL;
3697	}
3698
3699	if (PyErr_WarnEx(PyExc_DeprecationWarning,
3700	"PyUnicode_AsDecodedObject() is deprecated; "
3701	"use PyCodec_Decode() to decode from str", `1`) < `0`)
3702	return NULL;
3703
3704	if (encoding == NULL)
3705	encoding = PyUnicode_GetDefaultEncoding();
3706
3707	/ Decode via the codec registry /
3708	return PyCodec_Decode(unicode, encoding, errors);
3709	}
3710
3711	PyObject *
3712	PyUnicode_AsDecodedUnicode(PyObject *unicode,
3713	const char *encoding,
3714	const char *errors)
3715	{
3716	PyObject *v;
3717
3718	if (!PyUnicode_Check(unicode)) {
3719	PyErr_BadArgument();
3720	goto onError;
3721	}
3722
3723	if (PyErr_WarnEx(PyExc_DeprecationWarning,
3724	"PyUnicode_AsDecodedUnicode() is deprecated; "
3725	"use PyCodec_Decode() to decode from str to str", `1`) < `0`)
3726	return NULL;
3727
3728	if (encoding == NULL)
3729	encoding = PyUnicode_GetDefaultEncoding();
3730
3731	/ Decode via the codec registry /
3732	v = PyCodec_Decode(unicode, encoding, errors);
3733	if (v == NULL)
3734	goto onError;
3735	if (!PyUnicode_Check(v)) {
3736	PyErr_Format(PyExc_TypeError,
3737	"'%.400s' decoder returned '%.400s' instead of 'str'; "
3738	"use codecs.decode() to decode to arbitrary types",
3739	encoding,
3740	Py_TYPE(unicode)->tp_name);
3741	Py_DECREF(v);
3742	goto onError;
3743	}
3744	return unicode_result(v);
3745
3746	onError:
3747	return NULL;
3748	}
3749
3750	PyObject *
3751	PyUnicode_Encode(const Py_UNICODE *s,
3752	Py_ssize_t size,
3753	const char *encoding,
3754	const char *errors)
3755	{
3756	PyObject v, unicode;
3757
3758	unicode = PyUnicode_FromWideChar(s, size);
3759	if (unicode == NULL)
3760	return NULL;
3761	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3762	Py_DECREF(unicode);
3763	return v;
3764	}
3765
3766	PyObject *
3767	PyUnicode_AsEncodedObject(PyObject *unicode,
3768	const char *encoding,
3769	const char *errors)
3770	{
3771	PyObject *v;
3772
3773	if (!PyUnicode_Check(unicode)) {
3774	PyErr_BadArgument();
3775	goto onError;
3776	}
3777
3778	if (PyErr_WarnEx(PyExc_DeprecationWarning,
3779	"PyUnicode_AsEncodedObject() is deprecated; "
3780	"use PyUnicode_AsEncodedString() to encode from str to bytes "
3781	"or PyCodec_Encode() for generic encoding", `1`) < `0`)
3782	return NULL;
3783
3784	if (encoding == NULL)
3785	encoding = PyUnicode_GetDefaultEncoding();
3786
3787	/ Encode via the codec registry /
3788	v = PyCodec_Encode(unicode, encoding, errors);
3789	if (v == NULL)
3790	goto onError;
3791	return v;
3792
3793	onError:
3794	return NULL;
3795	}
3796
3797
3798	static PyObject *
3799	unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3800	int current_locale)
3801	{
3802	Py_ssize_t wlen;
3803	wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3804	if (wstr == NULL) {
3805	return NULL;
3806	}
3807
3808	if ((size_t)wlen != wcslen(wstr)) {
3809	PyErr_SetString(PyExc_ValueError, "embedded null character");
3810	PyMem_Free(wstr);
3811	return NULL;
3812	}
3813
3814	char *str;
3815	size_t error_pos;
3816	const char *reason;
3817	int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3818	current_locale, error_handler);
3819	PyMem_Free(wstr);
3820
3821	if (res != `0`) {
3822	if (res == -`2`) {
3823	PyObject *exc;
3824	exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3825	"locale", unicode,
3826	(Py_ssize_t)error_pos,
3827	(Py_ssize_t)(error_pos+`1`),
3828	reason);
3829	if (exc != NULL) {
3830	PyCodec_StrictErrors(exc);
3831	Py_DECREF(exc);
3832	}
3833	}
3834	else if (res == -`3`) {
3835	PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3836	}
3837	else {
3838	PyErr_NoMemory();
3839	}
3840	return NULL;
3841	}
3842
3843	PyObject *bytes = PyBytes_FromString(str);
3844	PyMem_RawFree(str);
3845	return bytes;
3846	}
3847
3848	PyObject *
3849	PyUnicode_EncodeLocale(PyObject unicode, const* char *errors)
3850	{
3851	_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3852	return unicode_encode_locale(unicode, error_handler, `1`);
3853	}
3854
3855	PyObject *
3856	PyUnicode_EncodeFSDefault(PyObject *unicode)
3857	{
3858	PyInterpreterState *interp = _PyInterpreterState_GET();
3859	struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3860	if (fs_codec->utf8) {
3861	return unicode_encode_utf8(unicode,
3862	fs_codec->error_handler,
3863	fs_codec->errors);
3864	}
3865	#ifndef _Py_FORCE_UTF8_FS_ENCODING
3866	else if (fs_codec->encoding) {
3867	return PyUnicode_AsEncodedString(unicode,
3868	fs_codec->encoding,
3869	fs_codec->errors);
3870	}
3871	#endif
3872	else {
3873	/ Before _PyUnicode_InitEncodings() is called, the Python codec*
3874	machinery is not ready and so cannot be used:
3875	use wcstombs() in this case. /*
3876	const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3877	const wchar_t *filesystem_errors = config->filesystem_errors;
3878	assert(filesystem_errors != NULL);
3879	_Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3880	assert(errors != _Py_ERROR_UNKNOWN);
3881	#ifdef _Py_FORCE_UTF8_FS_ENCODING
3882	return unicode_encode_utf8(unicode, errors, NULL);
3883	#else
3884	return unicode_encode_locale(unicode, errors, `0`);
3885	#endif
3886	}
3887	}
3888
3889	PyObject *
3890	PyUnicode_AsEncodedString(PyObject *unicode,
3891	const char *encoding,
3892	const char *errors)
3893	{
3894	PyObject *v;
3895	char buflower[`11`]; / strlen("iso_8859_1\0") == 11, longest shortcut /
3896
3897	if (!PyUnicode_Check(unicode)) {
3898	PyErr_BadArgument();
3899	return NULL;
3900	}
3901
3902	if (unicode_check_encoding_errors(encoding, errors) < `0`) {
3903	return NULL;
3904	}
3905
3906	if (encoding == NULL) {
3907	return _PyUnicode_AsUTF8String(unicode, errors);
3908	}
3909
3910	/ Shortcuts for common default encodings /
3911	if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3912	char *lower = buflower;
3913
3914	/ Fast paths /
3915	if (lower[`0`] == `'u'` && lower[`1`] == `'t'` && lower[`2`] == `'f'`) {
3916	lower += `3`;
3917	if (*lower == `'_'`) {
3918	/ Match "utf8" and "utf_8" /
3919	lower++;
3920	}
3921
3922	if (lower[`0`] == `'8'` && lower[`1`] == `0`) {
3923	return _PyUnicode_AsUTF8String(unicode, errors);
3924	}
3925	else if (lower[`0`] == `'1'` && lower[`1`] == `'6'` && lower[`2`] == `0`) {
3926	return _PyUnicode_EncodeUTF16(unicode, errors, `0`);
3927	}
3928	else if (lower[`0`] == `'3'` && lower[`1`] == `'2'` && lower[`2`] == `0`) {
3929	return _PyUnicode_EncodeUTF32(unicode, errors, `0`);
3930	}
3931	}
3932	else {
3933	if (strcmp(lower, "ascii") == `0`
3934	\|\| strcmp(lower, "us_ascii") == `0`) {
3935	return _PyUnicode_AsASCIIString(unicode, errors);
3936	}
3937	#ifdef MS_WINDOWS
3938	else if (strcmp(lower, "mbcs") == `0`) {
3939	return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3940	}
3941	#endif
3942	else if (strcmp(lower, "latin1") == `0` \|\|
3943	strcmp(lower, "latin_1") == `0` \|\|
3944	strcmp(lower, "iso_8859_1") == `0` \|\|
3945	strcmp(lower, "iso8859_1") == `0`) {
3946	return _PyUnicode_AsLatin1String(unicode, errors);
3947	}
3948	}
3949	}
3950
3951	/ Encode via the codec registry /
3952	v = _PyCodec_EncodeText(unicode, encoding, errors);
3953	if (v == NULL)
3954	return NULL;
3955
3956	/ The normal path /
3957	if (PyBytes_Check(v))
3958	return v;
3959
3960	/ If the codec returns a buffer, raise a warning and convert to bytes /
3961	if (PyByteArray_Check(v)) {
3962	int error;
3963	PyObject *b;
3964
3965	error = PyErr_WarnFormat(PyExc_RuntimeWarning, `1`,
3966	"encoder %s returned bytearray instead of bytes; "
3967	"use codecs.encode() to encode to arbitrary types",
3968	encoding);
3969	if (error) {
3970	Py_DECREF(v);
3971	return NULL;
3972	}
3973
3974	b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3975	PyByteArray_GET_SIZE(v));
3976	Py_DECREF(v);
3977	return b;
3978	}
3979
3980	PyErr_Format(PyExc_TypeError,
3981	"'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3982	"use codecs.encode() to encode to arbitrary types",
3983	encoding,
3984	Py_TYPE(v)->tp_name);
3985	Py_DECREF(v);
3986	return NULL;
3987	}
3988
3989	PyObject *
3990	PyUnicode_AsEncodedUnicode(PyObject *unicode,
3991	const char *encoding,
3992	const char *errors)
3993	{
3994	PyObject *v;
3995
3996	if (!PyUnicode_Check(unicode)) {
3997	PyErr_BadArgument();
3998	goto onError;
3999	}
4000
4001	if (PyErr_WarnEx(PyExc_DeprecationWarning,
4002	"PyUnicode_AsEncodedUnicode() is deprecated; "
4003	"use PyCodec_Encode() to encode from str to str", `1`) < `0`)
4004	return NULL;
4005
4006	if (encoding == NULL)
4007	encoding = PyUnicode_GetDefaultEncoding();
4008
4009	/ Encode via the codec registry /
4010	v = PyCodec_Encode(unicode, encoding, errors);
4011	if (v == NULL)
4012	goto onError;
4013	if (!PyUnicode_Check(v)) {
4014	PyErr_Format(PyExc_TypeError,
4015	"'%.400s' encoder returned '%.400s' instead of 'str'; "
4016	"use codecs.encode() to encode to arbitrary types",
4017	encoding,
4018	Py_TYPE(v)->tp_name);
4019	Py_DECREF(v);
4020	goto onError;
4021	}
4022	return v;
4023
4024	onError:
4025	return NULL;
4026	}
4027
4028	static PyObject*
4029	unicode_decode_locale(const char *str, Py_ssize_t len,
4030	_Py_error_handler errors, int current_locale)
4031	{
4032	if (str[len] != `'\0'` \|\| (size_t)len != strlen(str)) {
4033	PyErr_SetString(PyExc_ValueError, "embedded null byte");
4034	return NULL;
4035	}
4036
4037	wchar_t *wstr;
4038	size_t wlen;
4039	const char *reason;
4040	int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4041	current_locale, errors);
4042	if (res != `0`) {
4043	if (res == -`2`) {
4044	PyObject *exc;
4045	exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4046	"locale", str, len,
4047	(Py_ssize_t)wlen,
4048	(Py_ssize_t)(wlen + `1`),
4049	reason);
4050	if (exc != NULL) {
4051	PyCodec_StrictErrors(exc);
4052	Py_DECREF(exc);
4053	}
4054	}
4055	else if (res == -`3`) {
4056	PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4057	}
4058	else {
4059	PyErr_NoMemory();
4060	}
4061	return NULL;
4062	}
4063
4064	PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4065	PyMem_RawFree(wstr);
4066	return unicode;
4067	}
4068
4069	PyObject*
4070	PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4071	const char *errors)
4072	{
4073	_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4074	return unicode_decode_locale(str, len, error_handler, `1`);
4075	}
4076
4077	PyObject*
4078	PyUnicode_DecodeLocale(const char str, const* char *errors)
4079	{
4080	Py_ssize_t size = (Py_ssize_t)strlen(str);
4081	_Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4082	return unicode_decode_locale(str, size, error_handler, `1`);
4083	}
4084
4085
4086	PyObject*
4087	PyUnicode_DecodeFSDefault(const char *s) {
4088	Py_ssize_t size = (Py_ssize_t)strlen(s);
4089	return PyUnicode_DecodeFSDefaultAndSize(s, size);
4090	}
4091
4092	PyObject*
4093	PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4094	{
4095	PyInterpreterState *interp = _PyInterpreterState_GET();
4096	struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4097	if (fs_codec->utf8) {
4098	return unicode_decode_utf8(s, size,
4099	fs_codec->error_handler,
4100	fs_codec->errors,
4101	NULL);
4102	}
4103	#ifndef _Py_FORCE_UTF8_FS_ENCODING
4104	else if (fs_codec->encoding) {
4105	return PyUnicode_Decode(s, size,
4106	fs_codec->encoding,
4107	fs_codec->errors);
4108	}
4109	#endif
4110	else {
4111	/ Before _PyUnicode_InitEncodings() is called, the Python codec*
4112	machinery is not ready and so cannot be used:
4113	use mbstowcs() in this case. /*
4114	const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4115	const wchar_t *filesystem_errors = config->filesystem_errors;
4116	assert(filesystem_errors != NULL);
4117	_Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4118	assert(errors != _Py_ERROR_UNKNOWN);
4119	#ifdef _Py_FORCE_UTF8_FS_ENCODING
4120	return unicode_decode_utf8(s, size, errors, NULL, NULL);
4121	#else
4122	return unicode_decode_locale(s, size, errors, `0`);
4123	#endif
4124	}
4125	}
4126
4127
4128	int
4129	PyUnicode_FSConverter(PyObject* arg, void* addr)
4130	{
4131	PyObject *path = NULL;
4132	PyObject *output = NULL;
4133	Py_ssize_t size;
4134	const char *data;
4135	if (arg == NULL) {
4136	Py_DECREF((PyObject*)addr);
4137	(PyObject*)addr = NULL;
4138	return `1`;
4139	}
4140	path = PyOS_FSPath(arg);
4141	if (path == NULL) {
4142	return `0`;
4143	}
4144	if (PyBytes_Check(path)) {
4145	output = path;
4146	}
4147	else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4148	output = PyUnicode_EncodeFSDefault(path);
4149	Py_DECREF(path);
4150	if (!output) {
4151	return `0`;
4152	}
4153	assert(PyBytes_Check(output));
4154	}
4155
4156	size = PyBytes_GET_SIZE(output);
4157	data = PyBytes_AS_STRING(output);
4158	if ((size_t)size != strlen(data)) {
4159	PyErr_SetString(PyExc_ValueError, "embedded null byte");
4160	Py_DECREF(output);
4161	return `0`;
4162	}
4163	(PyObject*)addr = output;
4164	return Py_CLEANUP_SUPPORTED;
4165	}
4166
4167
4168	int
4169	PyUnicode_FSDecoder(PyObject* arg, void* addr)
4170	{
4171	int is_buffer = `0`;
4172	PyObject *path = NULL;
4173	PyObject *output = NULL;
4174	if (arg == NULL) {
4175	Py_DECREF((PyObject*)addr);
4176	(PyObject*)addr = NULL;
4177	return `1`;
4178	}
4179
4180	is_buffer = PyObject_CheckBuffer(arg);
4181	if (!is_buffer) {
4182	path = PyOS_FSPath(arg);
4183	if (path == NULL) {
4184	return `0`;
4185	}
4186	}
4187	else {
4188	path = arg;
4189	Py_INCREF(arg);
4190	}
4191
4192	if (PyUnicode_Check(path)) {
4193	output = path;
4194	}
4195	else if (PyBytes_Check(path) \|\| is_buffer) {
4196	PyObject *path_bytes = NULL;
4197
4198	if (!PyBytes_Check(path) &&
4199	PyErr_WarnFormat(PyExc_DeprecationWarning, `1`,
4200	"path should be string, bytes, or os.PathLike, not %.200s",
4201	Py_TYPE(arg)->tp_name)) {
4202	Py_DECREF(path);
4203	return `0`;
4204	}
4205	path_bytes = PyBytes_FromObject(path);
4206	Py_DECREF(path);
4207	if (!path_bytes) {
4208	return `0`;
4209	}
4210	output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4211	PyBytes_GET_SIZE(path_bytes));
4212	Py_DECREF(path_bytes);
4213	if (!output) {
4214	return `0`;
4215	}
4216	}
4217	else {
4218	PyErr_Format(PyExc_TypeError,
4219	"path should be string, bytes, or os.PathLike, not %.200s",
4220	Py_TYPE(arg)->tp_name);
4221	Py_DECREF(path);
4222	return `0`;
4223	}
4224	if (PyUnicode_READY(output) == -`1`) {
4225	Py_DECREF(output);
4226	return `0`;
4227	}
4228	if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4229	PyUnicode_GET_LENGTH(output), `0`, `1`) >= `0`) {
4230	PyErr_SetString(PyExc_ValueError, "embedded null character");
4231	Py_DECREF(output);
4232	return `0`;
4233	}
4234	(PyObject*)addr = output;
4235	return Py_CLEANUP_SUPPORTED;
4236	}
4237
4238
4239	static int unicode_fill_utf8(PyObject *unicode);
4240
4241	const char *
4242	PyUnicode_AsUTF8AndSize(PyObject unicode, Py_ssize_t psize)
4243	{
4244	if (!PyUnicode_Check(unicode)) {
4245	PyErr_BadArgument();
4246	return NULL;
4247	}
4248	if (PyUnicode_READY(unicode) == -`1`)
4249	return NULL;
4250
4251	if (PyUnicode_UTF8(unicode) == NULL) {
4252	if (unicode_fill_utf8(unicode) == -`1`) {
4253	return NULL;
4254	}
4255	}
4256
4257	if (psize)
4258	*psize = PyUnicode_UTF8_LENGTH(unicode);
4259	return PyUnicode_UTF8(unicode);
4260	}
4261
4262	const char *
4263	PyUnicode_AsUTF8(PyObject *unicode)
4264	{
4265	return PyUnicode_AsUTF8AndSize(unicode, NULL);
4266	}
4267
4268	Py_UNICODE *
4269	PyUnicode_AsUnicodeAndSize(PyObject unicode, Py_ssize_t size)
4270	{
4271	if (!PyUnicode_Check(unicode)) {
4272	PyErr_BadArgument();
4273	return NULL;
4274	}
4275	Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4276	if (w == NULL) {
4277	/ Non-ASCII compact unicode object /
4278	assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4279	assert(PyUnicode_IS_READY(unicode));
4280
4281	Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4282	if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - `1`) {
4283	PyErr_NoMemory();
4284	return NULL;
4285	}
4286	w = (wchar_t ) PyObject_Malloc(sizeof(wchar_t) (wlen + `1`));
4287	if (w == NULL) {
4288	PyErr_NoMemory();
4289	return NULL;
4290	}
4291	unicode_copy_as_widechar(unicode, w, wlen + `1`);
4292	_PyUnicode_WSTR(unicode) = w;
4293	if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4294	_PyUnicode_WSTR_LENGTH(unicode) = wlen;
4295	}
4296	}
4297	if (size != NULL)
4298	*size = PyUnicode_WSTR_LENGTH(unicode);
4299	return w;
4300	}
4301
/* Deprecated APIs */
4303
4304	_Py_COMP_DIAG_PUSH
4305	_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4306
4307	Py_UNICODE *
4308	PyUnicode_AsUnicode(PyObject *unicode)
4309	{
4310	return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4311	}
4312
4313	const Py_UNICODE *
4314	_PyUnicode_AsUnicode(PyObject *unicode)
4315	{
4316	Py_ssize_t size;
4317	const Py_UNICODE *wstr;
4318
4319	wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4320	if (wstr && wcslen(wstr) != (size_t)size) {
4321	PyErr_SetString(PyExc_ValueError, "embedded null character");
4322	return NULL;
4323	}
4324	return wstr;
4325	}
4326
4327
4328	Py_ssize_t
4329	PyUnicode_GetSize(PyObject *unicode)
4330	{
4331	if (!PyUnicode_Check(unicode)) {
4332	PyErr_BadArgument();
4333	goto onError;
4334	}
4335	if (_PyUnicode_WSTR(unicode) == NULL) {
4336	if (PyUnicode_AsUnicode(unicode) == NULL)
4337	goto onError;
4338	}
4339	return PyUnicode_WSTR_LENGTH(unicode);
4340
4341	onError:
4342	return -`1`;
4343	}
4344
4345	_Py_COMP_DIAG_POP
4346
4347	Py_ssize_t
4348	PyUnicode_GetLength(PyObject *unicode)
4349	{
4350	if (!PyUnicode_Check(unicode)) {
4351	PyErr_BadArgument();
4352	return -`1`;
4353	}
4354	if (PyUnicode_READY(unicode) == -`1`)
4355	return -`1`;
4356	return PyUnicode_GET_LENGTH(unicode);
4357	}
4358
4359	Py_UCS4
4360	PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4361	{
4362	const void *data;
4363	int kind;
4364
4365	if (!PyUnicode_Check(unicode)) {
4366	PyErr_BadArgument();
4367	return (Py_UCS4)-`1`;
4368	}
4369	if (PyUnicode_READY(unicode) == -`1`) {
4370	return (Py_UCS4)-`1`;
4371	}
4372	if (index < `0` \|\| index >= PyUnicode_GET_LENGTH(unicode)) {
4373	PyErr_SetString(PyExc_IndexError, "string index out of range");
4374	return (Py_UCS4)-`1`;
4375	}
4376	data = PyUnicode_DATA(unicode);
4377	kind = PyUnicode_KIND(unicode);
4378	return PyUnicode_READ(kind, data, index);
4379	}
4380
4381	int
4382	PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4383	{
4384	if (!PyUnicode_Check(unicode) \|\| !PyUnicode_IS_COMPACT(unicode)) {
4385	PyErr_BadArgument();
4386	return -`1`;
4387	}
4388	assert(PyUnicode_IS_READY(unicode));
4389	if (index < `0` \|\| index >= PyUnicode_GET_LENGTH(unicode)) {
4390	PyErr_SetString(PyExc_IndexError, "string index out of range");
4391	return -`1`;
4392	}
4393	if (unicode_check_modifiable(unicode))
4394	return -`1`;
4395	if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4396	PyErr_SetString(PyExc_ValueError, "character out of range");
4397	return -`1`;
4398	}
4399	PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4400	index, ch);
4401	return `0`;
4402	}
4403
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4409
/* create or adjust a UnicodeDecodeError */
4411	static void
4412	make_decode_exception(PyObject **exceptionObject,
4413	const char *encoding,
4414	const char *input, Py_ssize_t length,
4415	Py_ssize_t startpos, Py_ssize_t endpos,
4416	const char *reason)
4417	{
4418	if (*exceptionObject == NULL) {
4419	*exceptionObject = PyUnicodeDecodeError_Create(
4420	encoding, input, length, startpos, endpos, reason);
4421	}
4422	else {
4423	if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4424	goto onError;
4425	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4426	goto onError;
4427	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4428	goto onError;
4429	}
4430	return;
4431
4432	onError:
4433	Py_CLEAR(*exceptionObject);
4434	}
4435
4436	#ifdef MS_WINDOWS
4437	static int
4438	widechar_resize(wchar_t *buf, Py_ssize_t size, Py_ssize_t newsize)
4439	{
4440	if (newsize > *size) {
4441	wchar_t newbuf = buf;
4442	if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4443	PyErr_NoMemory();
4444	return -`1`;
4445	}
4446	*buf = newbuf;
4447	}
4448	*size = newsize;
4449	return `0`;
4450	}
4451
4452	/ error handling callback helper:*
4453	build arguments, call the callback and check the arguments,
4454	if no exception occurred, copy the replacement to the output
4455	and adjust various state variables.
4456	return 0 on success, -1 on error
4457	*/
4458
4459	static int
4460	unicode_decode_call_errorhandler_wchar(
4461	const char errors, PyObject *errorHandler,
4462	const char encoding, const* char *reason,
4463	const char *input, const* char *inend, Py_ssize_t startinpos,
4464	Py_ssize_t endinpos, PyObject exceptionObject, const* char **inptr,
4465	wchar_t *buf, Py_ssize_t bufsize, Py_ssize_t *outpos)
4466	{
4467	static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4468
4469	PyObject *restuple = NULL;
4470	PyObject *repunicode = NULL;
4471	Py_ssize_t outsize;
4472	Py_ssize_t insize;
4473	Py_ssize_t requiredsize;
4474	Py_ssize_t newpos;
4475	PyObject *inputobj = NULL;
4476	Py_ssize_t repwlen;
4477
4478	if (*errorHandler == NULL) {
4479	*errorHandler = PyCodec_LookupError(errors);
4480	if (*errorHandler == NULL)
4481	goto onError;
4482	}
4483
4484	make_decode_exception(exceptionObject,
4485	encoding,
4486	input, inend - *input,
4487	startinpos, endinpos,
4488	reason);
4489	if (*exceptionObject == NULL)
4490	goto onError;
4491
4492	restuple = PyObject_CallOneArg(errorHandler, exceptionObject);
4493	if (restuple == NULL)
4494	goto onError;
4495	if (!PyTuple_Check(restuple)) {
4496	PyErr_SetString(PyExc_TypeError, &argparse[`3`]);
4497	goto onError;
4498	}
4499	if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4500	goto onError;
4501
4502	/ Copy back the bytes variables, which might have been modified by the*
4503	callback /*
4504	inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4505	if (!inputobj)
4506	goto onError;
4507	*input = PyBytes_AS_STRING(inputobj);
4508	insize = PyBytes_GET_SIZE(inputobj);
4509	inend = input + insize;
4510	/ we can DECREF safely, as the exception has another reference,*
4511	so the object won't go away. /*
4512	Py_DECREF(inputobj);
4513
4514	if (newpos<`0`)
4515	newpos = insize+newpos;
4516	if (newpos<`0` \|\| newpos>insize) {
4517	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4518	goto onError;
4519	}
4520
4521	#if USE_UNICODE_WCHAR_CACHE
4522	_Py_COMP_DIAG_PUSH
4523	_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4524	repwlen = PyUnicode_GetSize(repunicode);
4525	if (repwlen < `0`)
4526	goto onError;
4527	_Py_COMP_DIAG_POP
4528	#else /* USE_UNICODE_WCHAR_CACHE */
4529	repwlen = PyUnicode_AsWideChar(repunicode, NULL, `0`);
4530	if (repwlen < `0`)
4531	goto onError;
4532	repwlen--;
4533	#endif /* USE_UNICODE_WCHAR_CACHE */
4534	/ need more space? (at least enough for what we*
4535	have+the replacement+the rest of the string (starting
4536	at the new input position), so we won't have to check space
4537	when there are no errors in the rest of the string) /*
4538	requiredsize = *outpos;
4539	if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4540	goto overflow;
4541	requiredsize += repwlen;
4542	if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4543	goto overflow;
4544	requiredsize += insize - newpos;
4545	outsize = *bufsize;
4546	if (requiredsize > outsize) {
4547	if (outsize <= PY_SSIZE_T_MAX/`2` && requiredsize < `2`*outsize)
4548	requiredsize = `2`*outsize;
4549	if (widechar_resize(buf, bufsize, requiredsize) < `0`) {
4550	goto onError;
4551	}
4552	}
4553	PyUnicode_AsWideChar(repunicode, buf + outpos, repwlen);
4554	*outpos += repwlen;
4555	*endinpos = newpos;
4556	inptr = input + newpos;
4557
4558	/ we made it! /
4559	Py_DECREF(restuple);
4560	return `0`;
4561
4562	overflow:
4563	PyErr_SetString(PyExc_OverflowError,
4564	"decoded result is too long for a Python string");
4565
4566	onError:
4567	Py_XDECREF(restuple);
4568	return -`1`;
4569	}
4570	#endif /* MS_WINDOWS */
4571
4572	static int
4573	unicode_decode_call_errorhandler_writer(
4574	const char errors, PyObject *errorHandler,
4575	const char encoding, const* char *reason,
4576	const char *input, const* char *inend, Py_ssize_t startinpos,
4577	Py_ssize_t endinpos, PyObject exceptionObject, const* char **inptr,
4578	_PyUnicodeWriter writer /* PyObject *output, Py_ssize_t outpos /)
4579	{
4580	static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4581
4582	PyObject *restuple = NULL;
4583	PyObject *repunicode = NULL;
4584	Py_ssize_t insize;
4585	Py_ssize_t newpos;
4586	Py_ssize_t replen;
4587	Py_ssize_t remain;
4588	PyObject *inputobj = NULL;
4589	int need_to_grow = `0`;
4590	const char *new_inptr;
4591
4592	if (*errorHandler == NULL) {
4593	*errorHandler = PyCodec_LookupError(errors);
4594	if (*errorHandler == NULL)
4595	goto onError;
4596	}
4597
4598	make_decode_exception(exceptionObject,
4599	encoding,
4600	input, inend - *input,
4601	startinpos, endinpos,
4602	reason);
4603	if (*exceptionObject == NULL)
4604	goto onError;
4605
4606	restuple = PyObject_CallOneArg(errorHandler, exceptionObject);
4607	if (restuple == NULL)
4608	goto onError;
4609	if (!PyTuple_Check(restuple)) {
4610	PyErr_SetString(PyExc_TypeError, &argparse[`3`]);
4611	goto onError;
4612	}
4613	if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4614	goto onError;
4615
4616	/ Copy back the bytes variables, which might have been modified by the*
4617	callback /*
4618	inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4619	if (!inputobj)
4620	goto onError;
4621	remain = inend - input - *endinpos;
4622	*input = PyBytes_AS_STRING(inputobj);
4623	insize = PyBytes_GET_SIZE(inputobj);
4624	inend = input + insize;
4625	/ we can DECREF safely, as the exception has another reference,*
4626	so the object won't go away. /*
4627	Py_DECREF(inputobj);
4628
4629	if (newpos<`0`)
4630	newpos = insize+newpos;
4631	if (newpos<`0` \|\| newpos>insize) {
4632	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4633	goto onError;
4634	}
4635
4636	replen = PyUnicode_GET_LENGTH(repunicode);
4637	if (replen > `1`) {
4638	writer->min_length += replen - `1`;
4639	need_to_grow = `1`;
4640	}
4641	new_inptr = *input + newpos;
4642	if (*inend - new_inptr > remain) {
4643	/ We don't know the decoding algorithm here so we make the worst*
4644	assumption that one byte decodes to one unicode character.
4645	If unfortunately one byte could decode to more unicode characters,
4646	the decoder may write out-of-bound then. Is it possible for the
4647	algorithms using this function? /*
4648	writer->min_length += *inend - new_inptr - remain;
4649	need_to_grow = `1`;
4650	}
4651	if (need_to_grow) {
4652	writer->overallocate = `1`;
4653	if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4654	PyUnicode_MAX_CHAR_VALUE(repunicode)) == -`1`)
4655	goto onError;
4656	}
4657	if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -`1`)
4658	goto onError;
4659
4660	*endinpos = newpos;
4661	*inptr = new_inptr;
4662
4663	/ we made it! /
4664	Py_DECREF(restuple);
4665	return `0`;
4666
4667	onError:
4668	Py_XDECREF(restuple);
4669	return -`1`;
4670	}
4671
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4706
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table indexed by ASCII code point (0..127). */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed
 * with values in 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4752
4753	PyObject *
4754	PyUnicode_DecodeUTF7(const char *s,
4755	Py_ssize_t size,
4756	const char *errors)
4757	{
4758	return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4759	}
4760
4761	/ The decoder. The only state we preserve is our read position,*
4762	* i.e. how many characters we have consumed. So if we end in the
4763	* middle of a shift sequence we have to back off the read position
4764	* and the output to the beginning of the sequence, otherwise we lose
4765	* all the shift state (seen bits, number of bits seen, high
4766	* surrogate). */
4767
4768	PyObject *
4769	PyUnicode_DecodeUTF7Stateful(const char *s,
4770	Py_ssize_t size,
4771	const char *errors,
4772	Py_ssize_t *consumed)
4773	{
4774	const char *starts = s;
4775	Py_ssize_t startinpos;
4776	Py_ssize_t endinpos;
4777	const char *e;
4778	_PyUnicodeWriter writer;
4779	const char *errmsg = "";
4780	int inShift = `0`;
4781	Py_ssize_t shiftOutStart;
4782	unsigned int base64bits = `0`;
4783	unsigned long base64buffer = `0`;
4784	Py_UCS4 surrogate = `0`;
4785	PyObject *errorHandler = NULL;
4786	PyObject *exc = NULL;
4787
4788	if (size == `0`) {
4789	if (consumed)
4790	*consumed = `0`;
4791	_Py_RETURN_UNICODE_EMPTY();
4792	}
4793
4794	/ Start off assuming it's all ASCII. Widen later as necessary. /
4795	_PyUnicodeWriter_Init(&writer);
4796	writer.min_length = size;
4797
4798	shiftOutStart = `0`;
4799	e = s + size;
4800
4801	while (s < e) {
4802	Py_UCS4 ch;
4803	restart:
4804	ch = (unsigned char) *s;
4805
4806	if (inShift) { / in a base-64 section /
4807	if (IS_BASE64(ch)) { / consume a base-64 character /
4808	base64buffer = (base64buffer << `6`) \| FROM_BASE64(ch);
4809	base64bits += `6`;
4810	s++;
4811	if (base64bits >= `16`) {
4812	/ we have enough bits for a UTF-16 value /
4813	Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-`16`));
4814	base64bits -= `16`;
4815	base64buffer &= (`1` << base64bits) - `1`; / clear high bits /
4816	assert(outCh <= `0xffff`);
4817	if (surrogate) {
4818	/ expecting a second surrogate /
4819	if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4820	Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4821	if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < `0`)
4822	goto onError;
4823	surrogate = `0`;
4824	continue;
4825	}
4826	else {
4827	if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < `0`)
4828	goto onError;
4829	surrogate = `0`;
4830	}
4831	}
4832	if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4833	/ first surrogate /
4834	surrogate = outCh;
4835	}
4836	else {
4837	if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < `0`)
4838	goto onError;
4839	}
4840	}
4841	}
4842	else { / now leaving a base-64 section /
4843	inShift = `0`;
4844	if (base64bits > `0`) { / left-over bits /
4845	if (base64bits >= `6`) {
4846	/ We've seen at least one base-64 character /
4847	s++;
4848	errmsg = "partial character in shift sequence";
4849	goto utf7Error;
4850	}
4851	else {
4852	/ Some bits remain; they should be zero /
4853	if (base64buffer != `0`) {
4854	s++;
4855	errmsg = "non-zero padding bits in shift sequence";
4856	goto utf7Error;
4857	}
4858	}
4859	}
4860	if (surrogate && DECODE_DIRECT(ch)) {
4861	if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < `0`)
4862	goto onError;
4863	}
4864	surrogate = `0`;
4865	if (ch == `'-'`) {
4866	/ '-' is absorbed; other terminating*
4867	characters are preserved /*
4868	s++;
4869	}
4870	}
4871	}
4872	else if ( ch == `'+'` ) {
4873	startinpos = s-starts;
4874	s++; / consume '+' /
4875	if (s < e && s == `'-'`) { /* '+-' encodes '+' /
4876	s++;
4877	if (_PyUnicodeWriter_WriteCharInline(&writer, `'+'`) < `0`)
4878	goto onError;
4879	}
4880	else if (s < e && !IS_BASE64(*s)) {
4881	s++;
4882	errmsg = "ill-formed sequence";
4883	goto utf7Error;
4884	}
4885	else { / begin base64-encoded section /
4886	inShift = `1`;
4887	surrogate = `0`;
4888	shiftOutStart = writer.pos;
4889	base64bits = `0`;
4890	base64buffer = `0`;
4891	}
4892	}
4893	else if (DECODE_DIRECT(ch)) { / character decodes as itself /
4894	s++;
4895	if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < `0`)
4896	goto onError;
4897	}
4898	else {
4899	startinpos = s-starts;
4900	s++;
4901	errmsg = "unexpected special character";
4902	goto utf7Error;
4903	}
4904	continue;
4905	utf7Error:
4906	endinpos = s-starts;
4907	if (unicode_decode_call_errorhandler_writer(
4908	errors, &errorHandler,
4909	"utf7", errmsg,
4910	&starts, &e, &startinpos, &endinpos, &exc, &s,
4911	&writer))
4912	goto onError;
4913	}
4914
4915	/ end of string /
4916
4917	if (inShift && !consumed) { / in shift sequence, no more to follow /
4918	/ if we're in an inconsistent state, that's an error /
4919	inShift = `0`;
4920	if (surrogate \|\|
4921	(base64bits >= `6`) \|\|
4922	(base64bits > `0` && base64buffer != `0`)) {
4923	endinpos = size;
4924	if (unicode_decode_call_errorhandler_writer(
4925	errors, &errorHandler,
4926	"utf7", "unterminated shift sequence",
4927	&starts, &e, &startinpos, &endinpos, &exc, &s,
4928	&writer))
4929	goto onError;
4930	if (s < e)
4931	goto restart;
4932	}
4933	}
4934
4935	/ return state /
4936	if (consumed) {
4937	if (inShift) {
4938	*consumed = startinpos;
4939	if (writer.pos != shiftOutStart && writer.maxchar > `127`) {
4940	PyObject *result = PyUnicode_FromKindAndData(
4941	writer.kind, writer.data, shiftOutStart);
4942	Py_XDECREF(errorHandler);
4943	Py_XDECREF(exc);
4944	_PyUnicodeWriter_Dealloc(&writer);
4945	return result;
4946	}
4947	writer.pos = shiftOutStart; / back off output /
4948	}
4949	else {
4950	*consumed = s-starts;
4951	}
4952	}
4953
4954	Py_XDECREF(errorHandler);
4955	Py_XDECREF(exc);
4956	return _PyUnicodeWriter_Finish(&writer);
4957
4958	onError:
4959	Py_XDECREF(errorHandler);
4960	Py_XDECREF(exc);
4961	_PyUnicodeWriter_Dealloc(&writer);
4962	return NULL;
4963	}
4964
4965
4966	PyObject *
4967	_PyUnicode_EncodeUTF7(PyObject *str,
4968	int base64SetO,
4969	int base64WhiteSpace,
4970	const char *errors)
4971	{
4972	int kind;
4973	const void *data;
4974	Py_ssize_t len;
4975	PyObject *v;
4976	int inShift = `0`;
4977	Py_ssize_t i;
4978	unsigned int base64bits = `0`;
4979	unsigned long base64buffer = `0`;
4980	char * out;
4981	const char * start;
4982
4983	if (PyUnicode_READY(str) == -`1`)
4984	return NULL;
4985	kind = PyUnicode_KIND(str);
4986	data = PyUnicode_DATA(str);
4987	len = PyUnicode_GET_LENGTH(str);
4988
4989	if (len == `0`)
4990	return PyBytes_FromStringAndSize(NULL, `0`);
4991
4992	/ It might be possible to tighten this worst case /
4993	if (len > PY_SSIZE_T_MAX / `8`)
4994	return PyErr_NoMemory();
4995	v = PyBytes_FromStringAndSize(NULL, len * `8`);
4996	if (v == NULL)
4997	return NULL;
4998
4999	start = out = PyBytes_AS_STRING(v);
5000	for (i = `0`; i < len; ++i) {
5001	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5002
5003	if (inShift) {
5004	if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5005	/ shifting out /
5006	if (base64bits) { / output remaining bits /
5007	*out++ = TO_BASE64(base64buffer << (`6`-base64bits));
5008	base64buffer = `0`;
5009	base64bits = `0`;
5010	}
5011	inShift = `0`;
5012	/ Characters not in the BASE64 set implicitly unshift the sequence*
5013	so no '-' is required, except if the character is itself a '-' /*
5014	if (IS_BASE64(ch) \|\| ch == `'-'`) {
5015	*out++ = `'-'`;
5016	}
5017	out++ = (char*) ch;
5018	}
5019	else {
5020	goto encode_char;
5021	}
5022	}
5023	else { / not in a shift sequence /
5024	if (ch == `'+'`) {
5025	*out++ = `'+'`;
5026	*out++ = `'-'`;
5027	}
5028	else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5029	out++ = (char*) ch;
5030	}
5031	else {
5032	*out++ = `'+'`;
5033	inShift = `1`;
5034	goto encode_char;
5035	}
5036	}
5037	continue;
5038	encode_char:
5039	if (ch >= `0x10000`) {
5040	assert(ch <= MAX_UNICODE);
5041
5042	/ code first surrogate /
5043	base64bits += `16`;
5044	base64buffer = (base64buffer << `16`) \| Py_UNICODE_HIGH_SURROGATE(ch);
5045	while (base64bits >= `6`) {
5046	*out++ = TO_BASE64(base64buffer >> (base64bits-`6`));
5047	base64bits -= `6`;
5048	}
5049	/ prepare second surrogate /
5050	ch = Py_UNICODE_LOW_SURROGATE(ch);
5051	}
5052	base64bits += `16`;
5053	base64buffer = (base64buffer << `16`) \| ch;
5054	while (base64bits >= `6`) {
5055	*out++ = TO_BASE64(base64buffer >> (base64bits-`6`));
5056	base64bits -= `6`;
5057	}
5058	}
5059	if (base64bits)
5060	*out++= TO_BASE64(base64buffer << (`6`-base64bits) );
5061	if (inShift)
5062	*out++ = `'-'`;
5063	if (_PyBytes_Resize(&v, out - start) < `0`)
5064	return NULL;
5065	return v;
5066	}
5067	PyObject *
5068	PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5069	Py_ssize_t size,
5070	int base64SetO,
5071	int base64WhiteSpace,
5072	const char *errors)
5073	{
5074	PyObject *result;
5075	PyObject *tmp = PyUnicode_FromWideChar(s, size);
5076	if (tmp == NULL)
5077	return NULL;
5078	result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
5079	base64WhiteSpace, errors);
5080	Py_DECREF(tmp);
5081	return result;
5082	}
5083
5084	#undef IS_BASE64
5085	#undef FROM_BASE64
5086	#undef TO_BASE64
5087	#undef DECODE_DIRECT
5088	#undef ENCODE_DIRECT
5089
5090	/ --- UTF-8 Codec -------------------------------------------------------- /
5091
5092	PyObject *
5093	PyUnicode_DecodeUTF8(const char *s,
5094	Py_ssize_t size,
5095	const char *errors)
5096	{
5097	return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5098	}
5099
5100	#include "stringlib/asciilib.h"
5101	#include "stringlib/codecs.h"
5102	#include "stringlib/undef.h"
5103
5104	#include "stringlib/ucs1lib.h"
5105	#include "stringlib/codecs.h"
5106	#include "stringlib/undef.h"
5107
5108	#include "stringlib/ucs2lib.h"
5109	#include "stringlib/codecs.h"
5110	#include "stringlib/undef.h"
5111
5112	#include "stringlib/ucs4lib.h"
5113	#include "stringlib/codecs.h"
5114	#include "stringlib/undef.h"
5115
5116	/ Mask to quickly check whether a C 'size_t' contains a*
5117	non-ASCII, UTF8-encoded char. /*
5118	#if (SIZEOF_SIZE_T == 8)
5119	# define ASCII_CHAR_MASK 0x8080808080808080ULL
5120	#elif (SIZEOF_SIZE_T == 4)
5121	# define ASCII_CHAR_MASK 0x80808080U
5122	#else
5123	# error C 'size_t' size should be either 4 or 8!
5124	#endif
5125
5126	static Py_ssize_t
5127	ascii_decode(const char start, const* char end, Py_UCS1 dest)
5128	{
5129	const char *p = start;
5130
5131	#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5132	assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5133	if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5134	/ Fast path, see in STRINGLIB(utf8_decode) for*
5135	an explanation. /*
5136	/ Help allocation /
5137	const char *_p = p;
5138	Py_UCS1 * q = dest;
5139	while (_p + SIZEOF_SIZE_T <= end) {
5140	size_t value = (const* size_t *) _p;
5141	if (value & ASCII_CHAR_MASK)
5142	break;
5143	((size_t )q) = value;
5144	_p += SIZEOF_SIZE_T;
5145	q += SIZEOF_SIZE_T;
5146	}
5147	p = _p;
5148	while (p < end) {
5149	if ((unsigned char)*p & `0x80`)
5150	break;
5151	q++ = p++;
5152	}
5153	return p - start;
5154	}
5155	#endif
5156	while (p < end) {
5157	/ Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h*
5158	for an explanation. /*
5159	if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5160	/ Help allocation /
5161	const char *_p = p;
5162	while (_p + SIZEOF_SIZE_T <= end) {
5163	size_t value = (const* size_t *) _p;
5164	if (value & ASCII_CHAR_MASK)
5165	break;
5166	_p += SIZEOF_SIZE_T;
5167	}
5168	p = _p;
5169	if (_p == end)
5170	break;
5171	}
5172	if ((unsigned char)*p & `0x80`)
5173	break;
5174	++p;
5175	}
5176	memcpy(dest, start, p - start);
5177	return p - start;
5178	}
5179
5180	static PyObject *
5181	unicode_decode_utf8(const char *s, Py_ssize_t size,
5182	_Py_error_handler error_handler, const char *errors,
5183	Py_ssize_t *consumed)
5184	{
5185	if (size == `0`) {
5186	if (consumed)
5187	*consumed = `0`;
5188	_Py_RETURN_UNICODE_EMPTY();
5189	}
5190
5191	/ ASCII is equivalent to the first 128 ordinals in Unicode. /
5192	if (size == `1` && (unsigned char)s[`0`] < `128`) {
5193	if (consumed) {
5194	*consumed = `1`;
5195	}
5196	return get_latin1_char((unsigned char)s[`0`]);
5197	}
5198
5199	const char *starts = s;
5200	const char *end = s + size;
5201
5202	// fast path: try ASCII string.
5203	PyObject *u = PyUnicode_New(size, `127`);
5204	if (u == NULL) {
5205	return NULL;
5206	}
5207	s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5208	if (s == end) {
5209	return u;
5210	}
5211
5212	// Use _PyUnicodeWriter after fast path is failed.
5213	_PyUnicodeWriter writer;
5214	_PyUnicodeWriter_InitWithBuffer(&writer, u);
5215	writer.pos = s - starts;
5216
5217	Py_ssize_t startinpos, endinpos;
5218	const char *errmsg = "";
5219	PyObject *error_handler_obj = NULL;
5220	PyObject *exc = NULL;
5221
5222	while (s < end) {
5223	Py_UCS4 ch;
5224	int kind = writer.kind;
5225
5226	if (kind == PyUnicode_1BYTE_KIND) {
5227	if (PyUnicode_IS_ASCII(writer.buffer))
5228	ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5229	else
5230	ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5231	} else if (kind == PyUnicode_2BYTE_KIND) {
5232	ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5233	} else {
5234	assert(kind == PyUnicode_4BYTE_KIND);
5235	ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5236	}
5237
5238	switch (ch) {
5239	case `0`:
5240	if (s == end \|\| consumed)
5241	goto End;
5242	errmsg = "unexpected end of data";
5243	startinpos = s - starts;
5244	endinpos = end - starts;
5245	break;
5246	case `1`:
5247	errmsg = "invalid start byte";
5248	startinpos = s - starts;
5249	endinpos = startinpos + `1`;
5250	break;
5251	case `2`:
5252	if (consumed && (unsigned char)s[`0`] == `0xED` && end - s == `2`
5253	&& (unsigned char)s[`1`] >= `0xA0` && (unsigned char)s[`1`] <= `0xBF`)
5254	{
5255	/ Truncated surrogate code in range D800-DFFF /
5256	goto End;
5257	}
5258	/ fall through /
5259	case `3`:
5260	case `4`:
5261	errmsg = "invalid continuation byte";
5262	startinpos = s - starts;
5263	endinpos = startinpos + ch - `1`;
5264	break;
5265	default:
5266	if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < `0`)
5267	goto onError;
5268	continue;
5269	}
5270
5271	if (error_handler == _Py_ERROR_UNKNOWN)
5272	error_handler = _Py_GetErrorHandler(errors);
5273
5274	switch (error_handler) {
5275	case _Py_ERROR_IGNORE:
5276	s += (endinpos - startinpos);
5277	break;
5278
5279	case _Py_ERROR_REPLACE:
5280	if (_PyUnicodeWriter_WriteCharInline(&writer, `0xfffd`) < `0`)
5281	goto onError;
5282	s += (endinpos - startinpos);
5283	break;
5284
5285	case _Py_ERROR_SURROGATEESCAPE:
5286	{
5287	Py_ssize_t i;
5288
5289	if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < `0`)
5290	goto onError;
5291	for (i=startinpos; i<endinpos; i++) {
5292	ch = (Py_UCS4)(unsigned char)(starts[i]);
5293	PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5294	ch + `0xdc00`);
5295	writer.pos++;
5296	}
5297	s += (endinpos - startinpos);
5298	break;
5299	}
5300
5301	default:
5302	if (unicode_decode_call_errorhandler_writer(
5303	errors, &error_handler_obj,
5304	"utf-8", errmsg,
5305	&starts, &end, &startinpos, &endinpos, &exc, &s,
5306	&writer))
5307	goto onError;
5308	}
5309	}
5310
5311	End:
5312	if (consumed)
5313	*consumed = s - starts;
5314
5315	Py_XDECREF(error_handler_obj);
5316	Py_XDECREF(exc);
5317	return _PyUnicodeWriter_Finish(&writer);
5318
5319	onError:
5320	Py_XDECREF(error_handler_obj);
5321	Py_XDECREF(exc);
5322	_PyUnicodeWriter_Dealloc(&writer);
5323	return NULL;
5324	}
5325
5326
5327	PyObject *
5328	PyUnicode_DecodeUTF8Stateful(const char *s,
5329	Py_ssize_t size,
5330	const char *errors,
5331	Py_ssize_t *consumed)
5332	{
5333	return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5334	}
5335
5336
5337	/ UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is*
5338	non-zero, use strict error handler otherwise.
5339
5340	On success, write a pointer to a newly allocated wide character string into
5341	*wstr (use PyMem_RawFree() to free the memory) and write the output length
5342	(in number of wchar_t units) into wlen (if wlen is set).*
5343
5344	On memory allocation failure, return -1.
5345
5346	On decoding error (if surrogateescape is zero), return -2. If wlen is
5347	non-NULL, write the start of the illegal byte sequence into wlen. If reason*
5348	is not NULL, write the decoding error message into reason. /
5349	int
5350	_Py_DecodeUTF8Ex(const char s, Py_ssize_t size, wchar_t wstr, size_t wlen,
5351	const char **reason, _Py_error_handler errors)
5352	{
5353	const char *orig_s = s;
5354	const char *e;
5355	wchar_t *unicode;
5356	Py_ssize_t outpos;
5357
5358	int surrogateescape = `0`;
5359	int surrogatepass = `0`;
5360	switch (errors)
5361	{
5362	case _Py_ERROR_STRICT:
5363	break;
5364	case _Py_ERROR_SURROGATEESCAPE:
5365	surrogateescape = `1`;
5366	break;
5367	case _Py_ERROR_SURROGATEPASS:
5368	surrogatepass = `1`;
5369	break;
5370	default:
5371	return -`3`;
5372	}
5373
5374	/ Note: size will always be longer than the resulting Unicode*
5375	character count /*
5376	if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - `1` < size) {
5377	return -`1`;
5378	}
5379
5380	unicode = PyMem_RawMalloc((size + `1`) * sizeof(wchar_t));
5381	if (!unicode) {
5382	return -`1`;
5383	}
5384
5385	/ Unpack UTF-8 encoded data /
5386	e = s + size;
5387	outpos = `0`;
5388	while (s < e) {
5389	Py_UCS4 ch;
5390	#if SIZEOF_WCHAR_T == 4
5391	ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5392	#else
5393	ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5394	#endif
5395	if (ch > `0xFF`) {
5396	#if SIZEOF_WCHAR_T == 4
5397	Py_UNREACHABLE();
5398	#else
5399	assert(ch > `0xFFFF` && ch <= MAX_UNICODE);
5400	/ write a surrogate pair /
5401	unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5402	unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5403	#endif
5404	}
5405	else {
5406	if (!ch && s == e) {
5407	break;
5408	}
5409
5410	if (surrogateescape) {
5411	unicode[outpos++] = `0xDC00` + (unsigned char)*s++;
5412	}
5413	else {
5414	/ Is it a valid three-byte code? /
5415	if (surrogatepass
5416	&& (e - s) >= `3`
5417	&& (s[`0`] & `0xf0`) == `0xe0`
5418	&& (s[`1`] & `0xc0`) == `0x80`
5419	&& (s[`2`] & `0xc0`) == `0x80`)
5420	{
5421	ch = ((s[`0`] & `0x0f`) << `12`) + ((s[`1`] & `0x3f`) << `6`) + (s[`2`] & `0x3f`);
5422	s += `3`;
5423	unicode[outpos++] = ch;
5424	}
5425	else {
5426	PyMem_RawFree(unicode );
5427	if (reason != NULL) {
5428	switch (ch) {
5429	case `0`:
5430	*reason = "unexpected end of data";
5431	break;
5432	case `1`:
5433	*reason = "invalid start byte";
5434	break;
5435	/ 2, 3, 4 /
5436	default:
5437	*reason = "invalid continuation byte";
5438	break;
5439	}
5440	}
5441	if (wlen != NULL) {
5442	*wlen = s - orig_s;
5443	}
5444	return -`2`;
5445	}
5446	}
5447	}
5448	}
5449	unicode[outpos] = L`'\0'`;
5450	if (wlen) {
5451	*wlen = outpos;
5452	}
5453	*wstr = unicode;
5454	return `0`;
5455	}
5456
5457
5458	wchar_t*
5459	_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5460	size_t *wlen)
5461	{
5462	wchar_t *wstr;
5463	int res = _Py_DecodeUTF8Ex(arg, arglen,
5464	&wstr, wlen,
5465	NULL, _Py_ERROR_SURROGATEESCAPE);
5466	if (res != `0`) {
5467	/ _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE /
5468	assert(res != -`3`);
5469	if (wlen) {
5470	*wlen = (size_t)res;
5471	}
5472	return NULL;
5473	}
5474	return wstr;
5475	}
5476
5477
5478	/ UTF-8 encoder using the surrogateescape error handler .*
5479
5480	On success, return 0 and write the newly allocated character string (use
5481	PyMem_Free() to free the memory) into str.*
5482
5483	On encoding failure, return -2 and write the position of the invalid
5484	surrogate character into error_pos (if error_pos is set) and the decoding*
5485	error message into reason (if reason is set).*
5486
5487	On memory allocation failure, return -1. /*
5488	int
5489	_Py_EncodeUTF8Ex(const wchar_t text, char* *str, size_t error_pos,
5490	const char *reason, int* raw_malloc, _Py_error_handler errors)
5491	{
5492	const Py_ssize_t max_char_size = `4`;
5493	Py_ssize_t len = wcslen(text);
5494
5495	assert(len >= `0`);
5496
5497	int surrogateescape = `0`;
5498	int surrogatepass = `0`;
5499	switch (errors)
5500	{
5501	case _Py_ERROR_STRICT:
5502	break;
5503	case _Py_ERROR_SURROGATEESCAPE:
5504	surrogateescape = `1`;
5505	break;
5506	case _Py_ERROR_SURROGATEPASS:
5507	surrogatepass = `1`;
5508	break;
5509	default:
5510	return -`3`;
5511	}
5512
5513	if (len > PY_SSIZE_T_MAX / max_char_size - `1`) {
5514	return -`1`;
5515	}
5516	char *bytes;
5517	if (raw_malloc) {
5518	bytes = PyMem_RawMalloc((len + `1`) * max_char_size);
5519	}
5520	else {
5521	bytes = PyMem_Malloc((len + `1`) * max_char_size);
5522	}
5523	if (bytes == NULL) {
5524	return -`1`;
5525	}
5526
5527	char *p = bytes;
5528	Py_ssize_t i;
5529	for (i = `0`; i < len; ) {
5530	Py_ssize_t ch_pos = i;
5531	Py_UCS4 ch = text[i];
5532	i++;
5533	#if Py_UNICODE_SIZE == 2
5534	if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5535	&& i < len
5536	&& Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5537	{
5538	ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5539	i++;
5540	}
5541	#endif
5542
5543	if (ch < `0x80`) {
5544	/ Encode ASCII /
5545	p++ = (char*) ch;
5546
5547	}
5548	else if (ch < `0x0800`) {
5549	/ Encode Latin-1 /
5550	p++ = (char*)(`0xc0` \| (ch >> `6`));
5551	p++ = (char*)(`0x80` \| (ch & `0x3f`));
5552	}
5553	else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5554	/ surrogateescape error handler /
5555	if (!surrogateescape \|\| !(`0xDC80` <= ch && ch <= `0xDCFF`)) {
5556	if (error_pos != NULL) {
5557	*error_pos = (size_t)ch_pos;
5558	}
5559	if (reason != NULL) {
5560	*reason = "encoding error";
5561	}
5562	if (raw_malloc) {
5563	PyMem_RawFree(bytes);
5564	}
5565	else {
5566	PyMem_Free(bytes);
5567	}
5568	return -`2`;
5569	}
5570	p++ = (char*)(ch & `0xff`);
5571	}
5572	else if (ch < `0x10000`) {
5573	p++ = (char*)(`0xe0` \| (ch >> `12`));
5574	p++ = (char*)(`0x80` \| ((ch >> `6`) & `0x3f`));
5575	p++ = (char*)(`0x80` \| (ch & `0x3f`));
5576	}
5577	else { / ch >= 0x10000 /
5578	assert(ch <= MAX_UNICODE);
5579	/ Encode UCS4 Unicode ordinals /
5580	p++ = (char*)(`0xf0` \| (ch >> `18`));
5581	p++ = (char*)(`0x80` \| ((ch >> `12`) & `0x3f`));
5582	p++ = (char*)(`0x80` \| ((ch >> `6`) & `0x3f`));
5583	p++ = (char*)(`0x80` \| (ch & `0x3f`));
5584	}
5585	}
5586	*p++ = `'\0'`;
5587
5588	size_t final_size = (p - bytes);
5589	char *bytes2;
5590	if (raw_malloc) {
5591	bytes2 = PyMem_RawRealloc(bytes, final_size);
5592	}
5593	else {
5594	bytes2 = PyMem_Realloc(bytes, final_size);
5595	}
5596	if (bytes2 == NULL) {
5597	if (error_pos != NULL) {
5598	*error_pos = (size_t)-`1`;
5599	}
5600	if (raw_malloc) {
5601	PyMem_RawFree(bytes);
5602	}
5603	else {
5604	PyMem_Free(bytes);
5605	}
5606	return -`1`;
5607	}
5608	*str = bytes2;
5609	return `0`;
5610	}
5611
5612
5613	/ Primary internal function which creates utf8 encoded bytes objects.*
5614
5615	Allocation strategy: if the string is short, convert into a stack buffer
5616	and allocate exactly as much space needed at the end. Else allocate the
5617	maximum possible needed (4 result bytes per Unicode character), and return
5618	the excess memory at the end.
5619	*/
5620	static PyObject *
5621	unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5622	const char *errors)
5623	{
5624	if (!PyUnicode_Check(unicode)) {
5625	PyErr_BadArgument();
5626	return NULL;
5627	}
5628
5629	if (PyUnicode_READY(unicode) == -`1`)
5630	return NULL;
5631
5632	if (PyUnicode_UTF8(unicode))
5633	return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5634	PyUnicode_UTF8_LENGTH(unicode));
5635
5636	enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5637	const void *data = PyUnicode_DATA(unicode);
5638	Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5639
5640	_PyBytesWriter writer;
5641	char *end;
5642
5643	switch (kind) {
5644	default:
5645	Py_UNREACHABLE();
5646	case PyUnicode_1BYTE_KIND:
5647	/ the string cannot be ASCII, or PyUnicode_UTF8() would be set /
5648	assert(!PyUnicode_IS_ASCII(unicode));
5649	end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5650	break;
5651	case PyUnicode_2BYTE_KIND:
5652	end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5653	break;
5654	case PyUnicode_4BYTE_KIND:
5655	end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5656	break;
5657	}
5658
5659	if (end == NULL) {
5660	_PyBytesWriter_Dealloc(&writer);
5661	return NULL;
5662	}
5663	return _PyBytesWriter_Finish(&writer, end);
5664	}
5665
5666	static int
5667	unicode_fill_utf8(PyObject *unicode)
5668	{
5669	/ the string cannot be ASCII, or PyUnicode_UTF8() would be set /
5670	assert(!PyUnicode_IS_ASCII(unicode));
5671
5672	enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5673	const void *data = PyUnicode_DATA(unicode);
5674	Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5675
5676	_PyBytesWriter writer;
5677	char *end;
5678
5679	switch (kind) {
5680	default:
5681	Py_UNREACHABLE();
5682	case PyUnicode_1BYTE_KIND:
5683	end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5684	_Py_ERROR_STRICT, NULL);
5685	break;
5686	case PyUnicode_2BYTE_KIND:
5687	end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5688	_Py_ERROR_STRICT, NULL);
5689	break;
5690	case PyUnicode_4BYTE_KIND:
5691	end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5692	_Py_ERROR_STRICT, NULL);
5693	break;
5694	}
5695	if (end == NULL) {
5696	_PyBytesWriter_Dealloc(&writer);
5697	return -`1`;
5698	}
5699
5700	const char *start = writer.use_small_buffer ? writer.small_buffer :
5701	PyBytes_AS_STRING(writer.buffer);
5702	Py_ssize_t len = end - start;
5703
5704	char *cache = PyObject_Malloc(len + `1`);
5705	if (cache == NULL) {
5706	_PyBytesWriter_Dealloc(&writer);
5707	PyErr_NoMemory();
5708	return -`1`;
5709	}
5710	_PyUnicode_UTF8(unicode) = cache;
5711	_PyUnicode_UTF8_LENGTH(unicode) = len;
5712	memcpy(cache, start, len);
5713	cache[len] = `'\0'`;
5714	_PyBytesWriter_Dealloc(&writer);
5715	return `0`;
5716	}
5717
5718	PyObject *
5719	_PyUnicode_AsUTF8String(PyObject unicode, const* char *errors)
5720	{
5721	return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5722	}
5723
5724
5725	PyObject *
5726	PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5727	Py_ssize_t size,
5728	const char *errors)
5729	{
5730	PyObject v, unicode;
5731
5732	unicode = PyUnicode_FromWideChar(s, size);
5733	if (unicode == NULL)
5734	return NULL;
5735	v = _PyUnicode_AsUTF8String(unicode, errors);
5736	Py_DECREF(unicode);
5737	return v;
5738	}
5739
5740	PyObject *
5741	PyUnicode_AsUTF8String(PyObject *unicode)
5742	{
5743	return _PyUnicode_AsUTF8String(unicode, NULL);
5744	}
5745
/* --- UTF-32 Codec ------------------------------------------------------- */
5747
5748	PyObject *
5749	PyUnicode_DecodeUTF32(const char *s,
5750	Py_ssize_t size,
5751	const char *errors,
5752	int *byteorder)
5753	{
5754	return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5755	}
5756
5757	PyObject *
5758	PyUnicode_DecodeUTF32Stateful(const char *s,
5759	Py_ssize_t size,
5760	const char *errors,
5761	int *byteorder,
5762	Py_ssize_t *consumed)
5763	{
5764	const char *starts = s;
5765	Py_ssize_t startinpos;
5766	Py_ssize_t endinpos;
5767	_PyUnicodeWriter writer;
5768	const unsigned char q, e;
5769	int le, bo = `0`; / assume native ordering by default /
5770	const char *encoding;
5771	const char *errmsg = "";
5772	PyObject *errorHandler = NULL;
5773	PyObject *exc = NULL;
5774
5775	q = (const unsigned char *)s;
5776	e = q + size;
5777
5778	if (byteorder)
5779	bo = *byteorder;
5780
5781	/ Check for BOM marks (U+FEFF) in the input and adjust current*
5782	byte order setting accordingly. In native mode, the leading BOM
5783	mark is skipped, in all other modes, it is copied to the output
5784	stream as-is (giving a ZWNBSP character). /*
5785	if (bo == `0` && size >= `4`) {
5786	Py_UCS4 bom = ((unsigned int)q[`3`] << `24`) \| (q[`2`] << `16`) \| (q[`1`] << `8`) \| q[`0`];
5787	if (bom == `0x0000FEFF`) {
5788	bo = -`1`;
5789	q += `4`;
5790	}
5791	else if (bom == `0xFFFE0000`) {
5792	bo = `1`;
5793	q += `4`;
5794	}
5795	if (byteorder)
5796	*byteorder = bo;
5797	}
5798
5799	if (q == e) {
5800	if (consumed)
5801	*consumed = size;
5802	_Py_RETURN_UNICODE_EMPTY();
5803	}
5804
5805	#ifdef WORDS_BIGENDIAN
5806	le = bo < `0`;
5807	#else
5808	le = bo <= `0`;
5809	#endif
5810	encoding = le ? "utf-32-le" : "utf-32-be";
5811
5812	_PyUnicodeWriter_Init(&writer);
5813	writer.min_length = (e - q + `3`) / `4`;
5814	if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, `127`) == -`1`)
5815	goto onError;
5816
5817	while (`1`) {
5818	Py_UCS4 ch = `0`;
5819	Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5820
5821	if (e - q >= `4`) {
5822	enum PyUnicode_Kind kind = writer.kind;
5823	void *data = writer.data;
5824	const unsigned char *last = e - `4`;
5825	Py_ssize_t pos = writer.pos;
5826	if (le) {
5827	do {
5828	ch = ((unsigned int)q[`3`] << `24`) \| (q[`2`] << `16`) \| (q[`1`] << `8`) \| q[`0`];
5829	if (ch > maxch)
5830	break;
5831	if (kind != PyUnicode_1BYTE_KIND &&
5832	Py_UNICODE_IS_SURROGATE(ch))
5833	break;
5834	PyUnicode_WRITE(kind, data, pos++, ch);
5835	q += `4`;
5836	} while (q <= last);
5837	}
5838	else {
5839	do {
5840	ch = ((unsigned int)q[`0`] << `24`) \| (q[`1`] << `16`) \| (q[`2`] << `8`) \| q[`3`];
5841	if (ch > maxch)
5842	break;
5843	if (kind != PyUnicode_1BYTE_KIND &&
5844	Py_UNICODE_IS_SURROGATE(ch))
5845	break;
5846	PyUnicode_WRITE(kind, data, pos++, ch);
5847	q += `4`;
5848	} while (q <= last);
5849	}
5850	writer.pos = pos;
5851	}
5852
5853	if (Py_UNICODE_IS_SURROGATE(ch)) {
5854	errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5855	startinpos = ((const char *)q) - starts;
5856	endinpos = startinpos + `4`;
5857	}
5858	else if (ch <= maxch) {
5859	if (q == e \|\| consumed)
5860	break;
5861	/ remaining bytes at the end? (size should be divisible by 4) /
5862	errmsg = "truncated data";
5863	startinpos = ((const char *)q) - starts;
5864	endinpos = ((const char *)e) - starts;
5865	}
5866	else {
5867	if (ch < `0x110000`) {
5868	if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < `0`)
5869	goto onError;
5870	q += `4`;
5871	continue;
5872	}
5873	errmsg = "code point not in range(0x110000)";
5874	startinpos = ((const char *)q) - starts;
5875	endinpos = startinpos + `4`;
5876	}
5877
5878	/ The remaining input chars are ignored if the callback*
5879	chooses to skip the input /*
5880	if (unicode_decode_call_errorhandler_writer(
5881	errors, &errorHandler,
5882	encoding, errmsg,
5883	&starts, (const char *)&e, &startinpos, &endinpos, &exc, (const* char **)&q,
5884	&writer))
5885	goto onError;
5886	}
5887
5888	if (consumed)
5889	consumed = (const* char *)q-starts;
5890
5891	Py_XDECREF(errorHandler);
5892	Py_XDECREF(exc);
5893	return _PyUnicodeWriter_Finish(&writer);
5894
5895	onError:
5896	_PyUnicodeWriter_Dealloc(&writer);
5897	Py_XDECREF(errorHandler);
5898	Py_XDECREF(exc);
5899	return NULL;
5900	}
5901
5902	PyObject *
5903	_PyUnicode_EncodeUTF32(PyObject *str,
5904	const char *errors,
5905	int byteorder)
5906	{
5907	enum PyUnicode_Kind kind;
5908	const void *data;
5909	Py_ssize_t len;
5910	PyObject *v;
5911	uint32_t *out;
5912	#if PY_LITTLE_ENDIAN
5913	int native_ordering = byteorder <= `0`;
5914	#else
5915	int native_ordering = byteorder >= `0`;
5916	#endif
5917	const char *encoding;
5918	Py_ssize_t nsize, pos;
5919	PyObject *errorHandler = NULL;
5920	PyObject *exc = NULL;
5921	PyObject *rep = NULL;
5922
5923	if (!PyUnicode_Check(str)) {
5924	PyErr_BadArgument();
5925	return NULL;
5926	}
5927	if (PyUnicode_READY(str) == -`1`)
5928	return NULL;
5929	kind = PyUnicode_KIND(str);
5930	data = PyUnicode_DATA(str);
5931	len = PyUnicode_GET_LENGTH(str);
5932
5933	if (len > PY_SSIZE_T_MAX / `4` - (byteorder == `0`))
5934	return PyErr_NoMemory();
5935	nsize = len + (byteorder == `0`);
5936	v = PyBytes_FromStringAndSize(NULL, nsize * `4`);
5937	if (v == NULL)
5938	return NULL;
5939
5940	/ output buffer is 4-bytes aligned /
5941	assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), `4`));
5942	out = (uint32_t *)PyBytes_AS_STRING(v);
5943	if (byteorder == `0`)
5944	*out++ = `0xFEFF`;
5945	if (len == `0`)
5946	goto done;
5947
5948	if (byteorder == -`1`)
5949	encoding = "utf-32-le";
5950	else if (byteorder == `1`)
5951	encoding = "utf-32-be";
5952	else
5953	encoding = "utf-32";
5954
5955	if (kind == PyUnicode_1BYTE_KIND) {
5956	ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5957	goto done;
5958	}
5959
5960	pos = `0`;
5961	while (pos < len) {
5962	Py_ssize_t newpos, repsize, moreunits;
5963
5964	if (kind == PyUnicode_2BYTE_KIND) {
5965	pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5966	&out, native_ordering);
5967	}
5968	else {
5969	assert(kind == PyUnicode_4BYTE_KIND);
5970	pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5971	&out, native_ordering);
5972	}
5973	if (pos == len)
5974	break;
5975
5976	rep = unicode_encode_call_errorhandler(
5977	errors, &errorHandler,
5978	encoding, "surrogates not allowed",
5979	str, &exc, pos, pos + `1`, &newpos);
5980	if (!rep)
5981	goto error;
5982
5983	if (PyBytes_Check(rep)) {
5984	repsize = PyBytes_GET_SIZE(rep);
5985	if (repsize & `3`) {
5986	raise_encode_exception(&exc, encoding,
5987	str, pos, pos + `1`,
5988	"surrogates not allowed");
5989	goto error;
5990	}
5991	moreunits = repsize / `4`;
5992	}
5993	else {
5994	assert(PyUnicode_Check(rep));
5995	if (PyUnicode_READY(rep) < `0`)
5996	goto error;
5997	moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5998	if (!PyUnicode_IS_ASCII(rep)) {
5999	raise_encode_exception(&exc, encoding,
6000	str, pos, pos + `1`,
6001	"surrogates not allowed");
6002	goto error;
6003	}
6004	}
6005	moreunits += pos - newpos;
6006	pos = newpos;
6007
6008	/ four bytes are reserved for each surrogate /
6009	if (moreunits > `0`) {
6010	Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
6011	if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / `4`) {
6012	/ integer overflow /
6013	PyErr_NoMemory();
6014	goto error;
6015	}
6016	if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + `4` * moreunits) < `0`)
6017	goto error;
6018	out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
6019	}
6020
6021	if (PyBytes_Check(rep)) {
6022	memcpy(out, PyBytes_AS_STRING(rep), repsize);
6023	out += repsize / `4`;
6024	} else / rep is unicode / {
6025	assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6026	ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6027	&out, native_ordering);
6028	}
6029
6030	Py_CLEAR(rep);
6031	}
6032
6033	/ Cut back to size actually needed. This is necessary for, for example,*
6034	encoding of a string containing isolated surrogates and the 'ignore'
6035	handler is used. /*
6036	nsize = (unsigned char) out - (unsigned* char*) PyBytes_AS_STRING(v);
6037	if (nsize != PyBytes_GET_SIZE(v))
6038	_PyBytes_Resize(&v, nsize);
6039	Py_XDECREF(errorHandler);
6040	Py_XDECREF(exc);
6041	done:
6042	return v;
6043	error:
6044	Py_XDECREF(rep);
6045	Py_XDECREF(errorHandler);
6046	Py_XDECREF(exc);
6047	Py_XDECREF(v);
6048	return NULL;
6049	}
6050
6051	PyObject *
6052	PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6053	Py_ssize_t size,
6054	const char *errors,
6055	int byteorder)
6056	{
6057	PyObject *result;
6058	PyObject *tmp = PyUnicode_FromWideChar(s, size);
6059	if (tmp == NULL)
6060	return NULL;
6061	result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6062	Py_DECREF(tmp);
6063	return result;
6064	}
6065
6066	PyObject *
6067	PyUnicode_AsUTF32String(PyObject *unicode)
6068	{
6069	return _PyUnicode_EncodeUTF32(unicode, NULL, `0`);
6070	}
6071
/* --- UTF-16 Codec ------------------------------------------------------- */
6073
6074	PyObject *
6075	PyUnicode_DecodeUTF16(const char *s,
6076	Py_ssize_t size,
6077	const char *errors,
6078	int *byteorder)
6079	{
6080	return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6081	}
6082
6083	PyObject *
6084	PyUnicode_DecodeUTF16Stateful(const char *s,
6085	Py_ssize_t size,
6086	const char *errors,
6087	int *byteorder,
6088	Py_ssize_t *consumed)
6089	{
6090	const char *starts = s;
6091	Py_ssize_t startinpos;
6092	Py_ssize_t endinpos;
6093	_PyUnicodeWriter writer;
6094	const unsigned char q, e;
6095	int bo = `0`; / assume native ordering by default /
6096	int native_ordering;
6097	const char *errmsg = "";
6098	PyObject *errorHandler = NULL;
6099	PyObject *exc = NULL;
6100	const char *encoding;
6101
6102	q = (const unsigned char *)s;
6103	e = q + size;
6104
6105	if (byteorder)
6106	bo = *byteorder;
6107
6108	/ Check for BOM marks (U+FEFF) in the input and adjust current*
6109	byte order setting accordingly. In native mode, the leading BOM
6110	mark is skipped, in all other modes, it is copied to the output
6111	stream as-is (giving a ZWNBSP character). /*
6112	if (bo == `0` && size >= `2`) {
6113	const Py_UCS4 bom = (q[`1`] << `8`) \| q[`0`];
6114	if (bom == `0xFEFF`) {
6115	q += `2`;
6116	bo = -`1`;
6117	}
6118	else if (bom == `0xFFFE`) {
6119	q += `2`;
6120	bo = `1`;
6121	}
6122	if (byteorder)
6123	*byteorder = bo;
6124	}
6125
6126	if (q == e) {
6127	if (consumed)
6128	*consumed = size;
6129	_Py_RETURN_UNICODE_EMPTY();
6130	}
6131
6132	#if PY_LITTLE_ENDIAN
6133	native_ordering = bo <= `0`;
6134	encoding = bo <= `0` ? "utf-16-le" : "utf-16-be";
6135	#else
6136	native_ordering = bo >= `0`;
6137	encoding = bo >= `0` ? "utf-16-be" : "utf-16-le";
6138	#endif
6139
6140	/ Note: size will always be longer than the resulting Unicode*
6141	character count normally. Error handler will take care of
6142	resizing when needed. /*
6143	_PyUnicodeWriter_Init(&writer);
6144	writer.min_length = (e - q + `1`) / `2`;
6145	if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, `127`) == -`1`)
6146	goto onError;
6147
6148	while (`1`) {
6149	Py_UCS4 ch = `0`;
6150	if (e - q >= `2`) {
6151	int kind = writer.kind;
6152	if (kind == PyUnicode_1BYTE_KIND) {
6153	if (PyUnicode_IS_ASCII(writer.buffer))
6154	ch = asciilib_utf16_decode(&q, e,
6155	(Py_UCS1*)writer.data, &writer.pos,
6156	native_ordering);
6157	else
6158	ch = ucs1lib_utf16_decode(&q, e,
6159	(Py_UCS1*)writer.data, &writer.pos,
6160	native_ordering);
6161	} else if (kind == PyUnicode_2BYTE_KIND) {
6162	ch = ucs2lib_utf16_decode(&q, e,
6163	(Py_UCS2*)writer.data, &writer.pos,
6164	native_ordering);
6165	} else {
6166	assert(kind == PyUnicode_4BYTE_KIND);
6167	ch = ucs4lib_utf16_decode(&q, e,
6168	(Py_UCS4*)writer.data, &writer.pos,
6169	native_ordering);
6170	}
6171	}
6172
6173	switch (ch)
6174	{
6175	case `0`:
6176	/ remaining byte at the end? (size should be even) /
6177	if (q == e \|\| consumed)
6178	goto End;
6179	errmsg = "truncated data";
6180	startinpos = ((const char *)q) - starts;
6181	endinpos = ((const char *)e) - starts;
6182	break;
6183	/ The remaining input chars are ignored if the callback*
6184	chooses to skip the input /*
6185	case `1`:
6186	q -= `2`;
6187	if (consumed)
6188	goto End;
6189	errmsg = "unexpected end of data";
6190	startinpos = ((const char *)q) - starts;
6191	endinpos = ((const char *)e) - starts;
6192	break;
6193	case `2`:
6194	errmsg = "illegal encoding";
6195	startinpos = ((const char *)q) - `2` - starts;
6196	endinpos = startinpos + `2`;
6197	break;
6198	case `3`:
6199	errmsg = "illegal UTF-16 surrogate";
6200	startinpos = ((const char *)q) - `4` - starts;
6201	endinpos = startinpos + `2`;
6202	break;
6203	default:
6204	if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < `0`)
6205	goto onError;
6206	continue;
6207	}
6208
6209	if (unicode_decode_call_errorhandler_writer(
6210	errors,
6211	&errorHandler,
6212	encoding, errmsg,
6213	&starts,
6214	(const char **)&e,
6215	&startinpos,
6216	&endinpos,
6217	&exc,
6218	(const char **)&q,
6219	&writer))
6220	goto onError;
6221	}
6222
6223	End:
6224	if (consumed)
6225	consumed = (const* char *)q-starts;
6226
6227	Py_XDECREF(errorHandler);
6228	Py_XDECREF(exc);
6229	return _PyUnicodeWriter_Finish(&writer);
6230
6231	onError:
6232	_PyUnicodeWriter_Dealloc(&writer);
6233	Py_XDECREF(errorHandler);
6234	Py_XDECREF(exc);
6235	return NULL;
6236	}
6237
6238	PyObject *
6239	_PyUnicode_EncodeUTF16(PyObject *str,
6240	const char *errors,
6241	int byteorder)
6242	{
6243	enum PyUnicode_Kind kind;
6244	const void *data;
6245	Py_ssize_t len;
6246	PyObject *v;
6247	unsigned short *out;
6248	Py_ssize_t pairs;
6249	#if PY_BIG_ENDIAN
6250	int native_ordering = byteorder >= `0`;
6251	#else
6252	int native_ordering = byteorder <= `0`;
6253	#endif
6254	const char *encoding;
6255	Py_ssize_t nsize, pos;
6256	PyObject *errorHandler = NULL;
6257	PyObject *exc = NULL;
6258	PyObject *rep = NULL;
6259
6260	if (!PyUnicode_Check(str)) {
6261	PyErr_BadArgument();
6262	return NULL;
6263	}
6264	if (PyUnicode_READY(str) == -`1`)
6265	return NULL;
6266	kind = PyUnicode_KIND(str);
6267	data = PyUnicode_DATA(str);
6268	len = PyUnicode_GET_LENGTH(str);
6269
6270	pairs = `0`;
6271	if (kind == PyUnicode_4BYTE_KIND) {
6272	const Py_UCS4 in = (const* Py_UCS4 *)data;
6273	const Py_UCS4 *end = in + len;
6274	while (in < end) {
6275	if (*in++ >= `0x10000`) {
6276	pairs++;
6277	}
6278	}
6279	}
6280	if (len > PY_SSIZE_T_MAX / `2` - pairs - (byteorder == `0`)) {
6281	return PyErr_NoMemory();
6282	}
6283	nsize = len + pairs + (byteorder == `0`);
6284	v = PyBytes_FromStringAndSize(NULL, nsize * `2`);
6285	if (v == NULL) {
6286	return NULL;
6287	}
6288
6289	/ output buffer is 2-bytes aligned /
6290	assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), `2`));
6291	out = (unsigned short *)PyBytes_AS_STRING(v);
6292	if (byteorder == `0`) {
6293	*out++ = `0xFEFF`;
6294	}
6295	if (len == `0`) {
6296	goto done;
6297	}
6298
6299	if (kind == PyUnicode_1BYTE_KIND) {
6300	ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6301	goto done;
6302	}
6303
6304	if (byteorder < `0`) {
6305	encoding = "utf-16-le";
6306	}
6307	else if (byteorder > `0`) {
6308	encoding = "utf-16-be";
6309	}
6310	else {
6311	encoding = "utf-16";
6312	}
6313
6314	pos = `0`;
6315	while (pos < len) {
6316	Py_ssize_t newpos, repsize, moreunits;
6317
6318	if (kind == PyUnicode_2BYTE_KIND) {
6319	pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6320	&out, native_ordering);
6321	}
6322	else {
6323	assert(kind == PyUnicode_4BYTE_KIND);
6324	pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6325	&out, native_ordering);
6326	}
6327	if (pos == len)
6328	break;
6329
6330	rep = unicode_encode_call_errorhandler(
6331	errors, &errorHandler,
6332	encoding, "surrogates not allowed",
6333	str, &exc, pos, pos + `1`, &newpos);
6334	if (!rep)
6335	goto error;
6336
6337	if (PyBytes_Check(rep)) {
6338	repsize = PyBytes_GET_SIZE(rep);
6339	if (repsize & `1`) {
6340	raise_encode_exception(&exc, encoding,
6341	str, pos, pos + `1`,
6342	"surrogates not allowed");
6343	goto error;
6344	}
6345	moreunits = repsize / `2`;
6346	}
6347	else {
6348	assert(PyUnicode_Check(rep));
6349	if (PyUnicode_READY(rep) < `0`)
6350	goto error;
6351	moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6352	if (!PyUnicode_IS_ASCII(rep)) {
6353	raise_encode_exception(&exc, encoding,
6354	str, pos, pos + `1`,
6355	"surrogates not allowed");
6356	goto error;
6357	}
6358	}
6359	moreunits += pos - newpos;
6360	pos = newpos;
6361
6362	/ two bytes are reserved for each surrogate /
6363	if (moreunits > `0`) {
6364	Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6365	if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / `2`) {
6366	/ integer overflow /
6367	PyErr_NoMemory();
6368	goto error;
6369	}
6370	if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + `2` * moreunits) < `0`)
6371	goto error;
6372	out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6373	}
6374
6375	if (PyBytes_Check(rep)) {
6376	memcpy(out, PyBytes_AS_STRING(rep), repsize);
6377	out += repsize / `2`;
6378	} else / rep is unicode / {
6379	assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6380	ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6381	&out, native_ordering);
6382	}
6383
6384	Py_CLEAR(rep);
6385	}
6386
6387	/ Cut back to size actually needed. This is necessary for, for example,*
6388	encoding of a string containing isolated surrogates and the 'ignore' handler
6389	is used. /*
6390	nsize = (unsigned char) out - (unsigned* char*) PyBytes_AS_STRING(v);
6391	if (nsize != PyBytes_GET_SIZE(v))
6392	_PyBytes_Resize(&v, nsize);
6393	Py_XDECREF(errorHandler);
6394	Py_XDECREF(exc);
6395	done:
6396	return v;
6397	error:
6398	Py_XDECREF(rep);
6399	Py_XDECREF(errorHandler);
6400	Py_XDECREF(exc);
6401	Py_XDECREF(v);
6402	return NULL;
6403	#undef STORECHAR
6404	}
6405
6406	PyObject *
6407	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6408	Py_ssize_t size,
6409	const char *errors,
6410	int byteorder)
6411	{
6412	PyObject *result;
6413	PyObject *tmp = PyUnicode_FromWideChar(s, size);
6414	if (tmp == NULL)
6415	return NULL;
6416	result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6417	Py_DECREF(tmp);
6418	return result;
6419	}
6420
6421	PyObject *
6422	PyUnicode_AsUTF16String(PyObject *unicode)
6423	{
6424	return _PyUnicode_EncodeUTF16(unicode, NULL, `0`);
6425	}
6426
/* --- Unicode Escape Codec ----------------------------------------------- */
6428
/* Cached pointer to the unicodedata name-lookup C API.  Starts out NULL;
   _PyUnicode_DecodeUnicodeEscapeInternal() fills it in through
   PyCapsule_Import() the first time it meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6430
6431	PyObject *
6432	_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6433	Py_ssize_t size,
6434	const char *errors,
6435	Py_ssize_t *consumed,
6436	const char **first_invalid_escape)
6437	{
6438	const char *starts = s;
6439	_PyUnicodeWriter writer;
6440	const char *end;
6441	PyObject *errorHandler = NULL;
6442	PyObject *exc = NULL;
6443
6444	// so we can remember if we've seen an invalid escape char or not
6445	*first_invalid_escape = NULL;
6446
6447	if (size == `0`) {
6448	if (consumed) {
6449	*consumed = `0`;
6450	}
6451	_Py_RETURN_UNICODE_EMPTY();
6452	}
6453	/ Escaped strings will always be longer than the resulting*
6454	Unicode string, so we start with size here and then reduce the
6455	length after conversion to the true value.
6456	(but if the error callback returns a long replacement string
6457	we'll have to allocate more space) /*
6458	_PyUnicodeWriter_Init(&writer);
6459	writer.min_length = size;
6460	if (_PyUnicodeWriter_Prepare(&writer, size, `127`) < `0`) {
6461	goto onError;
6462	}
6463
6464	end = s + size;
6465	while (s < end) {
6466	unsigned char c = (unsigned char) *s++;
6467	Py_UCS4 ch;
6468	int count;
6469	const char *message;
6470
6471	#define WRITE_ASCII_CHAR(ch) \
6472	do { \
6473	assert(ch <= 127); \
6474	assert(writer.pos < writer.size); \
6475	PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6476	} while(0)
6477
6478	#define WRITE_CHAR(ch) \
6479	do { \
6480	if (ch <= writer.maxchar) { \
6481	assert(writer.pos < writer.size); \
6482	PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6483	} \
6484	else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6485	goto onError; \
6486	} \
6487	} while(0)
6488
6489	/ Non-escape characters are interpreted as Unicode ordinals /
6490	if (c != `'\\'`) {
6491	WRITE_CHAR(c);
6492	continue;
6493	}
6494
6495	Py_ssize_t startinpos = s - starts - `1`;
6496	/ \ - Escapes /
6497	if (s >= end) {
6498	message = "\\ at end of string";
6499	goto incomplete;
6500	}
6501	c = (unsigned char) *s++;
6502
6503	assert(writer.pos < writer.size);
6504	switch (c) {
6505
6506	/ \x escapes /
6507	case `'\n'`: continue;
6508	case `'\\'`: WRITE_ASCII_CHAR(`'\\'`); continue;
6509	case `'\''`: WRITE_ASCII_CHAR(`'\''`); continue;
6510	case `'\"'`: WRITE_ASCII_CHAR(`'\"'`); continue;
6511	case `'b'`: WRITE_ASCII_CHAR(`'\b'`); continue;
6512	/ FF /
6513	case `'f'`: WRITE_ASCII_CHAR(`'\014'`); continue;
6514	case `'t'`: WRITE_ASCII_CHAR(`'\t'`); continue;
6515	case `'n'`: WRITE_ASCII_CHAR(`'\n'`); continue;
6516	case `'r'`: WRITE_ASCII_CHAR(`'\r'`); continue;
6517	/ VT /
6518	case `'v'`: WRITE_ASCII_CHAR(`'\013'`); continue;
6519	/ BEL, not classic C /
6520	case `'a'`: WRITE_ASCII_CHAR(`'\007'`); continue;
6521
6522	/ \OOO (octal) escapes /
6523	case `'0'`: case `'1'`: case `'2'`: case `'3'`:
6524	case `'4'`: case `'5'`: case `'6'`: case `'7'`:
6525	ch = c - `'0'`;
6526	if (s < end && `'0'` <= s && s <= `'7'`) {
6527	ch = (ch<<`3`) + *s++ - `'0'`;
6528	if (s < end && `'0'` <= s && s <= `'7'`) {
6529	ch = (ch<<`3`) + *s++ - `'0'`;
6530	}
6531	}
6532	WRITE_CHAR(ch);
6533	continue;
6534
6535	/ hex escapes /
6536	/ \xXX /
6537	case `'x'`:
6538	count = `2`;
6539	message = "truncated \\xXX escape";
6540	goto hexescape;
6541
6542	/ \uXXXX /
6543	case `'u'`:
6544	count = `4`;
6545	message = "truncated \\uXXXX escape";
6546	goto hexescape;
6547
6548	/ \UXXXXXXXX /
6549	case `'U'`:
6550	count = `8`;
6551	message = "truncated \\UXXXXXXXX escape";
6552	hexescape:
6553	for (ch = `0`; count; ++s, --count) {
6554	if (s >= end) {
6555	goto incomplete;
6556	}
6557	c = (unsigned char)*s;
6558	ch <<= `4`;
6559	if (c >= `'0'` && c <= `'9'`) {
6560	ch += c - `'0'`;
6561	}
6562	else if (c >= `'a'` && c <= `'f'`) {
6563	ch += c - (`'a'` - `10`);
6564	}
6565	else if (c >= `'A'` && c <= `'F'`) {
6566	ch += c - (`'A'` - `10`);
6567	}
6568	else {
6569	goto error;
6570	}
6571	}
6572
6573	/ when we get here, ch is a 32-bit unicode character /
6574	if (ch > MAX_UNICODE) {
6575	message = "illegal Unicode character";
6576	goto error;
6577	}
6578
6579	WRITE_CHAR(ch);
6580	continue;
6581
6582	/ \N{name} /
6583	case `'N'`:
6584	if (ucnhash_capi == NULL) {
6585	/ load the unicode data module /
6586	ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6587	PyUnicodeData_CAPSULE_NAME, `1`);
6588	if (ucnhash_capi == NULL) {
6589	PyErr_SetString(
6590	PyExc_UnicodeError,
6591	"\\N escapes not supported (can't load unicodedata module)"
6592	);
6593	goto onError;
6594	}
6595	}
6596
6597	message = "malformed \\N character escape";
6598	if (s >= end) {
6599	goto incomplete;
6600	}
6601	if (*s == `'{'`) {
6602	const char *start = ++s;
6603	size_t namelen;
6604	/ look for the closing brace /
6605	while (s < end && *s != `'}'`)
6606	s++;
6607	if (s >= end) {
6608	goto incomplete;
6609	}
6610	namelen = s - start;
6611	if (namelen) {
6612	/ found a name. look it up in the unicode database /
6613	s++;
6614	ch = `0xffffffff`; / in case 'getcode' messes up /
6615	if (namelen <= INT_MAX &&
6616	ucnhash_capi->getcode(start, (int)namelen,
6617	&ch, `0`)) {
6618	assert(ch <= MAX_UNICODE);
6619	WRITE_CHAR(ch);
6620	continue;
6621	}
6622	message = "unknown Unicode character name";
6623	}
6624	}
6625	goto error;
6626
6627	default:
6628	if (*first_invalid_escape == NULL) {
6629	first_invalid_escape = s-`1`; /* Back up one char, since we've*
6630	already incremented s. /*
6631	}
6632	WRITE_ASCII_CHAR(`'\\'`);
6633	WRITE_CHAR(c);
6634	continue;
6635	}
6636
6637	incomplete:
6638	if (consumed) {
6639	*consumed = startinpos;
6640	break;
6641	}
6642	error:;
6643	Py_ssize_t endinpos = s-starts;
6644	writer.min_length = end - s + writer.pos;
6645	if (unicode_decode_call_errorhandler_writer(
6646	errors, &errorHandler,
6647	"unicodeescape", message,
6648	&starts, &end, &startinpos, &endinpos, &exc, &s,
6649	&writer)) {
6650	goto onError;
6651	}
6652	assert(end - s <= writer.size - writer.pos);
6653
6654	#undef WRITE_ASCII_CHAR
6655	#undef WRITE_CHAR
6656	}
6657
6658	Py_XDECREF(errorHandler);
6659	Py_XDECREF(exc);
6660	return _PyUnicodeWriter_Finish(&writer);
6661
6662	onError:
6663	_PyUnicodeWriter_Dealloc(&writer);
6664	Py_XDECREF(errorHandler);
6665	Py_XDECREF(exc);
6666	return NULL;
6667	}
6668
6669	PyObject *
6670	_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6671	Py_ssize_t size,
6672	const char *errors,
6673	Py_ssize_t *consumed)
6674	{
6675	const char *first_invalid_escape;
6676	PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6677	consumed,
6678	&first_invalid_escape);
6679	if (result == NULL)
6680	return NULL;
6681	if (first_invalid_escape != NULL) {
6682	if (PyErr_WarnFormat(PyExc_DeprecationWarning, `1`,
6683	"invalid escape sequence '\\%c'",
6684	(unsigned char)*first_invalid_escape) < `0`) {
6685	Py_DECREF(result);
6686	return NULL;
6687	}
6688	}
6689	return result;
6690	}
6691
6692	PyObject *
6693	PyUnicode_DecodeUnicodeEscape(const char *s,
6694	Py_ssize_t size,
6695	const char *errors)
6696	{
6697	return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6698	}
6699
6700	/ Return a Unicode-Escape string version of the Unicode object. /
6701
6702	PyObject *
6703	PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6704	{
6705	Py_ssize_t i, len;
6706	PyObject *repr;
6707	char *p;
6708	enum PyUnicode_Kind kind;
6709	const void *data;
6710	Py_ssize_t expandsize;
6711
6712	/ Initial allocation is based on the longest-possible character*
6713	escape.
6714
6715	For UCS1 strings it's '\xxx', 4 bytes per source character.
6716	For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6717	For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6718	*/
6719
6720	if (!PyUnicode_Check(unicode)) {
6721	PyErr_BadArgument();
6722	return NULL;
6723	}
6724	if (PyUnicode_READY(unicode) == -`1`) {
6725	return NULL;
6726	}
6727
6728	len = PyUnicode_GET_LENGTH(unicode);
6729	if (len == `0`) {
6730	return PyBytes_FromStringAndSize(NULL, `0`);
6731	}
6732
6733	kind = PyUnicode_KIND(unicode);
6734	data = PyUnicode_DATA(unicode);
6735	/ 4 byte characters can take up 10 bytes, 2 byte characters can take up 6*
6736	bytes, and 1 byte characters 4. /*
6737	expandsize = kind * `2` + `2`;
6738	if (len > PY_SSIZE_T_MAX / expandsize) {
6739	return PyErr_NoMemory();
6740	}
6741	repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6742	if (repr == NULL) {
6743	return NULL;
6744	}
6745
6746	p = PyBytes_AS_STRING(repr);
6747	for (i = `0`; i < len; i++) {
6748	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6749
6750	/ U+0000-U+00ff range /
6751	if (ch < `0x100`) {
6752	if (ch >= `' '` && ch < `127`) {
6753	if (ch != `'\\'`) {
6754	/ Copy printable US ASCII as-is /
6755	p++ = (char*) ch;
6756	}
6757	/ Escape backslashes /
6758	else {
6759	*p++ = `'\\'`;
6760	*p++ = `'\\'`;
6761	}
6762	}
6763
6764	/ Map special whitespace to '\t', \n', '\r' /
6765	else if (ch == `'\t'`) {
6766	*p++ = `'\\'`;
6767	*p++ = `'t'`;
6768	}
6769	else if (ch == `'\n'`) {
6770	*p++ = `'\\'`;
6771	*p++ = `'n'`;
6772	}
6773	else if (ch == `'\r'`) {
6774	*p++ = `'\\'`;
6775	*p++ = `'r'`;
6776	}
6777
6778	/ Map non-printable US ASCII and 8-bit characters to '\xHH' /
6779	else {
6780	*p++ = `'\\'`;
6781	*p++ = `'x'`;
6782	*p++ = Py_hexdigits[(ch >> `4`) & `0x000F`];
6783	*p++ = Py_hexdigits[ch & `0x000F`];
6784	}
6785	}
6786	/ U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' /
6787	else if (ch < `0x10000`) {
6788	*p++ = `'\\'`;
6789	*p++ = `'u'`;
6790	*p++ = Py_hexdigits[(ch >> `12`) & `0x000F`];
6791	*p++ = Py_hexdigits[(ch >> `8`) & `0x000F`];
6792	*p++ = Py_hexdigits[(ch >> `4`) & `0x000F`];
6793	*p++ = Py_hexdigits[ch & `0x000F`];
6794	}
6795	/ U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' /
6796	else {
6797
6798	/ Make sure that the first two digits are zero /
6799	assert(ch <= MAX_UNICODE && MAX_UNICODE <= `0x10ffff`);
6800	*p++ = `'\\'`;
6801	*p++ = `'U'`;
6802	*p++ = `'0'`;
6803	*p++ = `'0'`;
6804	*p++ = Py_hexdigits[(ch >> `20`) & `0x0000000F`];
6805	*p++ = Py_hexdigits[(ch >> `16`) & `0x0000000F`];
6806	*p++ = Py_hexdigits[(ch >> `12`) & `0x0000000F`];
6807	*p++ = Py_hexdigits[(ch >> `8`) & `0x0000000F`];
6808	*p++ = Py_hexdigits[(ch >> `4`) & `0x0000000F`];
6809	*p++ = Py_hexdigits[ch & `0x0000000F`];
6810	}
6811	}
6812
6813	assert(p - PyBytes_AS_STRING(repr) > `0`);
6814	if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < `0`) {
6815	return NULL;
6816	}
6817	return repr;
6818	}
6819
6820	PyObject *
6821	PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6822	Py_ssize_t size)
6823	{
6824	PyObject *result;
6825	PyObject *tmp = PyUnicode_FromWideChar(s, size);
6826	if (tmp == NULL) {
6827	return NULL;
6828	}
6829
6830	result = PyUnicode_AsUnicodeEscapeString(tmp);
6831	Py_DECREF(tmp);
6832	return result;
6833	}
6834
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6836
6837	PyObject *
6838	_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6839	Py_ssize_t size,
6840	const char *errors,
6841	Py_ssize_t *consumed)
6842	{
6843	const char *starts = s;
6844	_PyUnicodeWriter writer;
6845	const char *end;
6846	PyObject *errorHandler = NULL;
6847	PyObject *exc = NULL;
6848
6849	if (size == `0`) {
6850	if (consumed) {
6851	*consumed = `0`;
6852	}
6853	_Py_RETURN_UNICODE_EMPTY();
6854	}
6855
6856	/ Escaped strings will always be longer than the resulting*
6857	Unicode string, so we start with size here and then reduce the
6858	length after conversion to the true value. (But decoding error
6859	handler might have to resize the string) /*
6860	_PyUnicodeWriter_Init(&writer);
6861	writer.min_length = size;
6862	if (_PyUnicodeWriter_Prepare(&writer, size, `127`) < `0`) {
6863	goto onError;
6864	}
6865
6866	end = s + size;
6867	while (s < end) {
6868	unsigned char c = (unsigned char) *s++;
6869	Py_UCS4 ch;
6870	int count;
6871	const char *message;
6872
6873	#define WRITE_CHAR(ch) \
6874	do { \
6875	if (ch <= writer.maxchar) { \
6876	assert(writer.pos < writer.size); \
6877	PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6878	} \
6879	else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6880	goto onError; \
6881	} \
6882	} while(0)
6883
6884	/ Non-escape characters are interpreted as Unicode ordinals /
6885	if (c != `'\\'` \|\| (s >= end && !consumed)) {
6886	WRITE_CHAR(c);
6887	continue;
6888	}
6889
6890	Py_ssize_t startinpos = s - starts - `1`;
6891	/ \ - Escapes /
6892	if (s >= end) {
6893	assert(consumed);
6894	// Set message to silent compiler warning.
6895	// Actually it is never used.
6896	message = "\\ at end of string";
6897	goto incomplete;
6898	}
6899
6900	c = (unsigned char) *s++;
6901	if (c == `'u'`) {
6902	count = `4`;
6903	message = "truncated \\uXXXX escape";
6904	}
6905	else if (c == `'U'`) {
6906	count = `8`;
6907	message = "truncated \\UXXXXXXXX escape";
6908	}
6909	else {
6910	assert(writer.pos < writer.size);
6911	PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, `'\\'`);
6912	WRITE_CHAR(c);
6913	continue;
6914	}
6915
6916	/ \uHHHH with 4 hex digits, \U00HHHHHH with 8 /
6917	for (ch = `0`; count; ++s, --count) {
6918	if (s >= end) {
6919	goto incomplete;
6920	}
6921	c = (unsigned char)*s;
6922	ch <<= `4`;
6923	if (c >= `'0'` && c <= `'9'`) {
6924	ch += c - `'0'`;
6925	}
6926	else if (c >= `'a'` && c <= `'f'`) {
6927	ch += c - (`'a'` - `10`);
6928	}
6929	else if (c >= `'A'` && c <= `'F'`) {
6930	ch += c - (`'A'` - `10`);
6931	}
6932	else {
6933	goto error;
6934	}
6935	}
6936	if (ch > MAX_UNICODE) {
6937	message = "\\Uxxxxxxxx out of range";
6938	goto error;
6939	}
6940	WRITE_CHAR(ch);
6941	continue;
6942
6943	incomplete:
6944	if (consumed) {
6945	*consumed = startinpos;
6946	break;
6947	}
6948	error:;
6949	Py_ssize_t endinpos = s-starts;
6950	writer.min_length = end - s + writer.pos;
6951	if (unicode_decode_call_errorhandler_writer(
6952	errors, &errorHandler,
6953	"rawunicodeescape", message,
6954	&starts, &end, &startinpos, &endinpos, &exc, &s,
6955	&writer)) {
6956	goto onError;
6957	}
6958	assert(end - s <= writer.size - writer.pos);
6959
6960	#undef WRITE_CHAR
6961	}
6962	Py_XDECREF(errorHandler);
6963	Py_XDECREF(exc);
6964	return _PyUnicodeWriter_Finish(&writer);
6965
6966	onError:
6967	_PyUnicodeWriter_Dealloc(&writer);
6968	Py_XDECREF(errorHandler);
6969	Py_XDECREF(exc);
6970	return NULL;
6971	}
6972
6973	PyObject *
6974	PyUnicode_DecodeRawUnicodeEscape(const char *s,
6975	Py_ssize_t size,
6976	const char *errors)
6977	{
6978	return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6979	}
6980
6981
6982	PyObject *
6983	PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6984	{
6985	PyObject *repr;
6986	char *p;
6987	Py_ssize_t expandsize, pos;
6988	int kind;
6989	const void *data;
6990	Py_ssize_t len;
6991
6992	if (!PyUnicode_Check(unicode)) {
6993	PyErr_BadArgument();
6994	return NULL;
6995	}
6996	if (PyUnicode_READY(unicode) == -`1`) {
6997	return NULL;
6998	}
6999	kind = PyUnicode_KIND(unicode);
7000	data = PyUnicode_DATA(unicode);
7001	len = PyUnicode_GET_LENGTH(unicode);
7002	if (kind == PyUnicode_1BYTE_KIND) {
7003	return PyBytes_FromStringAndSize(data, len);
7004	}
7005
7006	/ 4 byte characters can take up 10 bytes, 2 byte characters can take up 6*
7007	bytes, and 1 byte characters 4. /*
7008	expandsize = kind * `2` + `2`;
7009
7010	if (len > PY_SSIZE_T_MAX / expandsize) {
7011	return PyErr_NoMemory();
7012	}
7013	repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
7014	if (repr == NULL) {
7015	return NULL;
7016	}
7017	if (len == `0`) {
7018	return repr;
7019	}
7020
7021	p = PyBytes_AS_STRING(repr);
7022	for (pos = `0`; pos < len; pos++) {
7023	Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7024
7025	/ U+0000-U+00ff range: Copy 8-bit characters as-is /
7026	if (ch < `0x100`) {
7027	p++ = (char*) ch;
7028	}
7029	/ U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' /
7030	else if (ch < `0x10000`) {
7031	*p++ = `'\\'`;
7032	*p++ = `'u'`;
7033	*p++ = Py_hexdigits[(ch >> `12`) & `0xf`];
7034	*p++ = Py_hexdigits[(ch >> `8`) & `0xf`];
7035	*p++ = Py_hexdigits[(ch >> `4`) & `0xf`];
7036	*p++ = Py_hexdigits[ch & `15`];
7037	}
7038	/ U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' /
7039	else {
7040	assert(ch <= MAX_UNICODE && MAX_UNICODE <= `0x10ffff`);
7041	*p++ = `'\\'`;
7042	*p++ = `'U'`;
7043	*p++ = `'0'`;
7044	*p++ = `'0'`;
7045	*p++ = Py_hexdigits[(ch >> `20`) & `0xf`];
7046	*p++ = Py_hexdigits[(ch >> `16`) & `0xf`];
7047	*p++ = Py_hexdigits[(ch >> `12`) & `0xf`];
7048	*p++ = Py_hexdigits[(ch >> `8`) & `0xf`];
7049	*p++ = Py_hexdigits[(ch >> `4`) & `0xf`];
7050	*p++ = Py_hexdigits[ch & `15`];
7051	}
7052	}
7053
7054	assert(p > PyBytes_AS_STRING(repr));
7055	if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < `0`) {
7056	return NULL;
7057	}
7058	return repr;
7059	}
7060
7061	PyObject *
7062	PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7063	Py_ssize_t size)
7064	{
7065	PyObject *result;
7066	PyObject *tmp = PyUnicode_FromWideChar(s, size);
7067	if (tmp == NULL)
7068	return NULL;
7069	result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7070	Py_DECREF(tmp);
7071	return result;
7072	}
7073
/* --- Latin-1 Codec ------------------------------------------------------ */
7075
7076	PyObject *
7077	PyUnicode_DecodeLatin1(const char *s,
7078	Py_ssize_t size,
7079	const char *errors)
7080	{
7081	/ Latin-1 is equivalent to the first 256 ordinals in Unicode. /
7082	return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7083	}
7084
7085	/ create or adjust a UnicodeEncodeError /
7086	static void
7087	make_encode_exception(PyObject **exceptionObject,
7088	const char *encoding,
7089	PyObject *unicode,
7090	Py_ssize_t startpos, Py_ssize_t endpos,
7091	const char *reason)
7092	{
7093	if (*exceptionObject == NULL) {
7094	*exceptionObject = PyObject_CallFunction(
7095	PyExc_UnicodeEncodeError, "sOnns",
7096	encoding, unicode, startpos, endpos, reason);
7097	}
7098	else {
7099	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7100	goto onError;
7101	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7102	goto onError;
7103	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7104	goto onError;
7105	return;
7106	onError:
7107	Py_CLEAR(*exceptionObject);
7108	}
7109	}
7110
7111	/ raises a UnicodeEncodeError /
7112	static void
7113	raise_encode_exception(PyObject **exceptionObject,
7114	const char *encoding,
7115	PyObject *unicode,
7116	Py_ssize_t startpos, Py_ssize_t endpos,
7117	const char *reason)
7118	{
7119	make_encode_exception(exceptionObject,
7120	encoding, unicode, startpos, endpos, reason);
7121	if (*exceptionObject != NULL)
7122	PyCodec_StrictErrors(*exceptionObject);
7123	}
7124
7125	/ error handling callback helper:*
7126	build arguments, call the callback and check the arguments,
7127	put the result into newpos and return the replacement string, which
7128	has to be freed by the caller /*
7129	static PyObject *
7130	unicode_encode_call_errorhandler(const char *errors,
7131	PyObject **errorHandler,
7132	const char encoding, const* char *reason,
7133	PyObject unicode, PyObject *exceptionObject,
7134	Py_ssize_t startpos, Py_ssize_t endpos,
7135	Py_ssize_t *newpos)
7136	{
7137	static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7138	Py_ssize_t len;
7139	PyObject *restuple;
7140	PyObject *resunicode;
7141
7142	if (*errorHandler == NULL) {
7143	*errorHandler = PyCodec_LookupError(errors);
7144	if (*errorHandler == NULL)
7145	return NULL;
7146	}
7147
7148	if (PyUnicode_READY(unicode) == -`1`)
7149	return NULL;
7150	len = PyUnicode_GET_LENGTH(unicode);
7151
7152	make_encode_exception(exceptionObject,
7153	encoding, unicode, startpos, endpos, reason);
7154	if (*exceptionObject == NULL)
7155	return NULL;
7156
7157	restuple = PyObject_CallOneArg(errorHandler, exceptionObject);
7158	if (restuple == NULL)
7159	return NULL;
7160	if (!PyTuple_Check(restuple)) {
7161	PyErr_SetString(PyExc_TypeError, &argparse[`3`]);
7162	Py_DECREF(restuple);
7163	return NULL;
7164	}
7165	if (!PyArg_ParseTuple(restuple, argparse,
7166	&resunicode, newpos)) {
7167	Py_DECREF(restuple);
7168	return NULL;
7169	}
7170	if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7171	PyErr_SetString(PyExc_TypeError, &argparse[`3`]);
7172	Py_DECREF(restuple);
7173	return NULL;
7174	}
7175	if (*newpos<`0`)
7176	newpos = len + newpos;
7177	if (newpos<`0` \|\| newpos>len) {
7178	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7179	Py_DECREF(restuple);
7180	return NULL;
7181	}
7182	Py_INCREF(resunicode);
7183	Py_DECREF(restuple);
7184	return resunicode;
7185	}
7186
7187	static PyObject *
7188	unicode_encode_ucs1(PyObject *unicode,
7189	const char *errors,
7190	const Py_UCS4 limit)
7191	{
7192	/ input state /
7193	Py_ssize_t pos=`0`, size;
7194	int kind;
7195	const void *data;
7196	/ pointer into the output /
7197	char *str;
7198	const char *encoding = (limit == `256`) ? "latin-1" : "ascii";
7199	const char *reason = (limit == `256`) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7200	PyObject *error_handler_obj = NULL;
7201	PyObject *exc = NULL;
7202	_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7203	PyObject *rep = NULL;
7204	/ output object /
7205	_PyBytesWriter writer;
7206
7207	if (PyUnicode_READY(unicode) == -`1`)
7208	return NULL;
7209	size = PyUnicode_GET_LENGTH(unicode);
7210	kind = PyUnicode_KIND(unicode);
7211	data = PyUnicode_DATA(unicode);
7212	/ allocate enough for a simple encoding without*
7213	replacements, if we need more, we'll resize /*
7214	if (size == `0`)
7215	return PyBytes_FromStringAndSize(NULL, `0`);
7216
7217	_PyBytesWriter_Init(&writer);
7218	str = _PyBytesWriter_Alloc(&writer, size);
7219	if (str == NULL)
7220	return NULL;
7221
7222	while (pos < size) {
7223	Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7224
7225	/ can we encode this? /
7226	if (ch < limit) {
7227	/ no overflow check, because we know that the space is enough /
7228	str++ = (char*)ch;
7229	++pos;
7230	}
7231	else {
7232	Py_ssize_t newpos, i;
7233	/ startpos for collecting unencodable chars /
7234	Py_ssize_t collstart = pos;
7235	Py_ssize_t collend = collstart + `1`;
7236	/ find all unecodable characters /
7237
7238	while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7239	++collend;
7240
7241	/ Only overallocate the buffer if it's not the last write /
7242	writer.overallocate = (collend < size);
7243
7244	/ cache callback name lookup (if not done yet, i.e. it's the first error) /
7245	if (error_handler == _Py_ERROR_UNKNOWN)
7246	error_handler = _Py_GetErrorHandler(errors);
7247
7248	switch (error_handler) {
7249	case _Py_ERROR_STRICT:
7250	raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7251	goto onError;
7252
7253	case _Py_ERROR_REPLACE:
7254	memset(str, `'?'`, collend - collstart);
7255	str += (collend - collstart);
7256	/ fall through /
7257	case _Py_ERROR_IGNORE:
7258	pos = collend;
7259	break;
7260
7261	case _Py_ERROR_BACKSLASHREPLACE:
7262	/ subtract preallocated bytes /
7263	writer.min_size -= (collend - collstart);
7264	str = backslashreplace(&writer, str,
7265	unicode, collstart, collend);
7266	if (str == NULL)
7267	goto onError;
7268	pos = collend;
7269	break;
7270
7271	case _Py_ERROR_XMLCHARREFREPLACE:
7272	/ subtract preallocated bytes /
7273	writer.min_size -= (collend - collstart);
7274	str = xmlcharrefreplace(&writer, str,
7275	unicode, collstart, collend);
7276	if (str == NULL)
7277	goto onError;
7278	pos = collend;
7279	break;
7280
7281	case _Py_ERROR_SURROGATEESCAPE:
7282	for (i = collstart; i < collend; ++i) {
7283	ch = PyUnicode_READ(kind, data, i);
7284	if (ch < `0xdc80` \|\| `0xdcff` < ch) {
7285	/ Not a UTF-8b surrogate /
7286	break;
7287	}
7288	str++ = (char*)(ch - `0xdc00`);
7289	++pos;
7290	}
7291	if (i >= collend)
7292	break;
7293	collstart = pos;
7294	assert(collstart != collend);
7295	/ fall through /
7296
7297	default:
7298	rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7299	encoding, reason, unicode, &exc,
7300	collstart, collend, &newpos);
7301	if (rep == NULL)
7302	goto onError;
7303
7304	if (newpos < collstart) {
7305	writer.overallocate = `1`;
7306	str = _PyBytesWriter_Prepare(&writer, str,
7307	collstart - newpos);
7308	if (str == NULL)
7309	goto onError;
7310	}
7311	else {
7312	/ subtract preallocated bytes /
7313	writer.min_size -= newpos - collstart;
7314	/ Only overallocate the buffer if it's not the last write /
7315	writer.overallocate = (newpos < size);
7316	}
7317
7318	if (PyBytes_Check(rep)) {
7319	/ Directly copy bytes result to output. /
7320	str = _PyBytesWriter_WriteBytes(&writer, str,
7321	PyBytes_AS_STRING(rep),
7322	PyBytes_GET_SIZE(rep));
7323	}
7324	else {
7325	assert(PyUnicode_Check(rep));
7326
7327	if (PyUnicode_READY(rep) < `0`)
7328	goto onError;
7329
7330	if (limit == `256` ?
7331	PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7332	!PyUnicode_IS_ASCII(rep))
7333	{
7334	/ Not all characters are smaller than limit /
7335	raise_encode_exception(&exc, encoding, unicode,
7336	collstart, collend, reason);
7337	goto onError;
7338	}
7339	assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7340	str = _PyBytesWriter_WriteBytes(&writer, str,
7341	PyUnicode_DATA(rep),
7342	PyUnicode_GET_LENGTH(rep));
7343	}
7344	if (str == NULL)
7345	goto onError;
7346
7347	pos = newpos;
7348	Py_CLEAR(rep);
7349	}
7350
7351	/ If overallocation was disabled, ensure that it was the last*
7352	write. Otherwise, we missed an optimization /*
7353	assert(writer.overallocate \|\| pos == size);
7354	}
7355	}
7356
7357	Py_XDECREF(error_handler_obj);
7358	Py_XDECREF(exc);
7359	return _PyBytesWriter_Finish(&writer, str);
7360
7361	onError:
7362	Py_XDECREF(rep);
7363	_PyBytesWriter_Dealloc(&writer);
7364	Py_XDECREF(error_handler_obj);
7365	Py_XDECREF(exc);
7366	return NULL;
7367	}
7368
7369	/ Deprecated /
7370	PyObject *
7371	PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7372	Py_ssize_t size,
7373	const char *errors)
7374	{
7375	PyObject *result;
7376	PyObject *unicode = PyUnicode_FromWideChar(p, size);
7377	if (unicode == NULL)
7378	return NULL;
7379	result = unicode_encode_ucs1(unicode, errors, `256`);
7380	Py_DECREF(unicode);
7381	return result;
7382	}
7383
7384	PyObject *
7385	_PyUnicode_AsLatin1String(PyObject unicode, const* char *errors)
7386	{
7387	if (!PyUnicode_Check(unicode)) {
7388	PyErr_BadArgument();
7389	return NULL;
7390	}
7391	if (PyUnicode_READY(unicode) == -`1`)
7392	return NULL;
7393	/ Fast path: if it is a one-byte string, construct*
7394	bytes object directly. /*
7395	if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7396	return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7397	PyUnicode_GET_LENGTH(unicode));
7398	/ Non-Latin-1 characters present. Defer to above function to*
7399	raise the exception. /*
7400	return unicode_encode_ucs1(unicode, errors, `256`);
7401	}
7402
7403	PyObject*
7404	PyUnicode_AsLatin1String(PyObject *unicode)
7405	{
7406	return _PyUnicode_AsLatin1String(unicode, NULL);
7407	}
7408
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7410
7411	PyObject *
7412	PyUnicode_DecodeASCII(const char *s,
7413	Py_ssize_t size,
7414	const char *errors)
7415	{
7416	const char *starts = s;
7417	const char *e = s + size;
7418	PyObject *error_handler_obj = NULL;
7419	PyObject *exc = NULL;
7420	_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7421
7422	if (size == `0`)
7423	_Py_RETURN_UNICODE_EMPTY();
7424
7425	/ ASCII is equivalent to the first 128 ordinals in Unicode. /
7426	if (size == `1` && (unsigned char)s[`0`] < `128`) {
7427	return get_latin1_char((unsigned char)s[`0`]);
7428	}
7429
7430	// Shortcut for simple case
7431	PyObject *u = PyUnicode_New(size, `127`);
7432	if (u == NULL) {
7433	return NULL;
7434	}
7435	Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7436	if (outpos == size) {
7437	return u;
7438	}
7439
7440	_PyUnicodeWriter writer;
7441	_PyUnicodeWriter_InitWithBuffer(&writer, u);
7442	writer.pos = outpos;
7443
7444	s += outpos;
7445	int kind = writer.kind;
7446	void *data = writer.data;
7447	Py_ssize_t startinpos, endinpos;
7448
7449	while (s < e) {
7450	unsigned char c = (unsigned char)*s;
7451	if (c < `128`) {
7452	PyUnicode_WRITE(kind, data, writer.pos, c);
7453	writer.pos++;
7454	++s;
7455	continue;
7456	}
7457
7458	/ byte outsize range 0x00..0x7f: call the error handler /
7459
7460	if (error_handler == _Py_ERROR_UNKNOWN)
7461	error_handler = _Py_GetErrorHandler(errors);
7462
7463	switch (error_handler)
7464	{
7465	case _Py_ERROR_REPLACE:
7466	case _Py_ERROR_SURROGATEESCAPE:
7467	/ Fast-path: the error handler only writes one character,*
7468	but we may switch to UCS2 at the first write /*
7469	if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < `0`)
7470	goto onError;
7471	kind = writer.kind;
7472	data = writer.data;
7473
7474	if (error_handler == _Py_ERROR_REPLACE)
7475	PyUnicode_WRITE(kind, data, writer.pos, `0xfffd`);
7476	else
7477	PyUnicode_WRITE(kind, data, writer.pos, c + `0xdc00`);
7478	writer.pos++;
7479	++s;
7480	break;
7481
7482	case _Py_ERROR_IGNORE:
7483	++s;
7484	break;
7485
7486	default:
7487	startinpos = s-starts;
7488	endinpos = startinpos + `1`;
7489	if (unicode_decode_call_errorhandler_writer(
7490	errors, &error_handler_obj,
7491	"ascii", "ordinal not in range(128)",
7492	&starts, &e, &startinpos, &endinpos, &exc, &s,
7493	&writer))
7494	goto onError;
7495	kind = writer.kind;
7496	data = writer.data;
7497	}
7498	}
7499	Py_XDECREF(error_handler_obj);
7500	Py_XDECREF(exc);
7501	return _PyUnicodeWriter_Finish(&writer);
7502
7503	onError:
7504	_PyUnicodeWriter_Dealloc(&writer);
7505	Py_XDECREF(error_handler_obj);
7506	Py_XDECREF(exc);
7507	return NULL;
7508	}
7509
7510	/ Deprecated /
7511	PyObject *
7512	PyUnicode_EncodeASCII(const Py_UNICODE *p,
7513	Py_ssize_t size,
7514	const char *errors)
7515	{
7516	PyObject *result;
7517	PyObject *unicode = PyUnicode_FromWideChar(p, size);
7518	if (unicode == NULL)
7519	return NULL;
7520	result = unicode_encode_ucs1(unicode, errors, `128`);
7521	Py_DECREF(unicode);
7522	return result;
7523	}
7524
7525	PyObject *
7526	_PyUnicode_AsASCIIString(PyObject unicode, const* char *errors)
7527	{
7528	if (!PyUnicode_Check(unicode)) {
7529	PyErr_BadArgument();
7530	return NULL;
7531	}
7532	if (PyUnicode_READY(unicode) == -`1`)
7533	return NULL;
7534	/ Fast path: if it is an ASCII-only string, construct bytes object*
7535	directly. Else defer to above function to raise the exception. /*
7536	if (PyUnicode_IS_ASCII(unicode))
7537	return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7538	PyUnicode_GET_LENGTH(unicode));
7539	return unicode_encode_ucs1(unicode, errors, `128`);
7540	}
7541
7542	PyObject *
7543	PyUnicode_AsASCIIString(PyObject *unicode)
7544	{
7545	return _PyUnicode_AsASCIIString(unicode, NULL);
7546	}
7547
7548	#ifdef MS_WINDOWS
7549
/* --- MBCS codecs for Windows -------------------------------------------- */
7551
7552	#if SIZEOF_INT < SIZEOF_SIZE_T
7553	#define NEED_RETRY
7554	#endif
7555
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases also and avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7560	#define DECODING_CHUNK_SIZE (INT_MAX/4)
7561
7562	#ifndef WC_ERR_INVALID_CHARS
7563	# define WC_ERR_INVALID_CHARS 0x0080
7564	#endif
7565
7566	static const char*
7567	code_page_name(UINT code_page, PyObject **obj)
7568	{
7569	*obj = NULL;
7570	if (code_page == CP_ACP)
7571	return "mbcs";
7572	if (code_page == CP_UTF7)
7573	return "CP_UTF7";
7574	if (code_page == CP_UTF8)
7575	return "CP_UTF8";
7576
7577	*obj = PyBytes_FromFormat("cp%u", code_page);
7578	if (*obj == NULL)
7579	return NULL;
7580	return PyBytes_AS_STRING(*obj);
7581	}
7582
7583	static DWORD
7584	decode_code_page_flags(UINT code_page)
7585	{
7586	if (code_page == CP_UTF7) {
7587	/ The CP_UTF7 decoder only supports flags=0 /
7588	return `0`;
7589	}
7590	else
7591	return MB_ERR_INVALID_CHARS;
7592	}
7593
7594	/*
7595	* Decode a byte string from a Windows code page into unicode object in strict
7596	* mode.
7597	*
7598	* Returns consumed size if succeed, returns -2 on decode error, or raise an
7599	* OSError and returns -1 on other error.
7600	*/
7601	static int
7602	decode_code_page_strict(UINT code_page,
7603	wchar_t **buf,
7604	Py_ssize_t *bufsize,
7605	const char *in,
7606	int insize)
7607	{
7608	DWORD flags = MB_ERR_INVALID_CHARS;
7609	wchar_t *out;
7610	DWORD outsize;
7611
7612	/ First get the size of the result /
7613	assert(insize > `0`);
7614	while ((outsize = MultiByteToWideChar(code_page, flags,
7615	in, insize, NULL, `0`)) <= `0`)
7616	{
7617	if (!flags \|\| GetLastError() != ERROR_INVALID_FLAGS) {
7618	goto error;
7619	}
7620	/ For some code pages (e.g. UTF-7) flags must be set to 0. /
7621	flags = `0`;
7622	}
7623
7624	/ Extend a wchar_t* buffer /
7625	Py_ssize_t n = bufsize; /* Get the current length /
7626	if (widechar_resize(buf, bufsize, n + outsize) < `0`) {
7627	return -`1`;
7628	}
7629	out = *buf + n;
7630
7631	/ Do the conversion /
7632	outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7633	if (outsize <= `0`)
7634	goto error;
7635	return insize;
7636
7637	error:
7638	if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7639	return -`2`;
7640	PyErr_SetFromWindowsErr(`0`);
7641	return -`1`;
7642	}
7643
7644	/*
7645	* Decode a byte string from a code page into unicode object with an error
7646	* handler.
7647	*
7648	* Returns consumed size if succeed, or raise an OSError or
7649	* UnicodeDecodeError exception and returns -1 on error.
7650	*/
7651	static int
7652	decode_code_page_errors(UINT code_page,
7653	wchar_t **buf,
7654	Py_ssize_t *bufsize,
7655	const char in, const* int size,
7656	const char errors, int* final)
7657	{
7658	const char *startin = in;
7659	const char *endin = in + size;
7660	DWORD flags = MB_ERR_INVALID_CHARS;
7661	/ Ideally, we should get reason from FormatMessage. This is the Windows*
7662	2000 English version of the message. /*
7663	const char *reason = "No mapping for the Unicode character exists "
7664	"in the target code page.";
7665	/ each step cannot decode more than 1 character, but a character can be*
7666	represented as a surrogate pair /*
7667	wchar_t buffer[`2`], *out;
7668	int insize;
7669	Py_ssize_t outsize;
7670	PyObject *errorHandler = NULL;
7671	PyObject *exc = NULL;
7672	PyObject *encoding_obj = NULL;
7673	const char *encoding;
7674	DWORD err;
7675	int ret = -`1`;
7676
7677	assert(size > `0`);
7678
7679	encoding = code_page_name(code_page, &encoding_obj);
7680	if (encoding == NULL)
7681	return -`1`;
7682
7683	if ((errors == NULL \|\| strcmp(errors, "strict") == `0`) && final) {
7684	/ The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a*
7685	UnicodeDecodeError. /*
7686	make_decode_exception(&exc, encoding, in, size, `0`, `0`, reason);
7687	if (exc != NULL) {
7688	PyCodec_StrictErrors(exc);
7689	Py_CLEAR(exc);
7690	}
7691	goto error;
7692	}
7693
7694	/ Extend a wchar_t* buffer /
7695	Py_ssize_t n = bufsize; /* Get the current length /
7696	if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7697	PyErr_NoMemory();
7698	goto error;
7699	}
7700	if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < `0`) {
7701	goto error;
7702	}
7703	out = *buf + n;
7704
7705	/ Decode the byte string character per character /
7706	while (in < endin)
7707	{
7708	/ Decode a character /
7709	insize = `1`;
7710	do
7711	{
7712	outsize = MultiByteToWideChar(code_page, flags,
7713	in, insize,
7714	buffer, Py_ARRAY_LENGTH(buffer));
7715	if (outsize > `0`)
7716	break;
7717	err = GetLastError();
7718	if (err == ERROR_INVALID_FLAGS && flags) {
7719	/ For some code pages (e.g. UTF-7) flags must be set to 0. /
7720	flags = `0`;
7721	continue;
7722	}
7723	if (err != ERROR_NO_UNICODE_TRANSLATION
7724	&& err != ERROR_INSUFFICIENT_BUFFER)
7725	{
7726	PyErr_SetFromWindowsErr(`0`);
7727	goto error;
7728	}
7729	insize++;
7730	}
7731	/ 4=maximum length of a UTF-8 sequence /
7732	while (insize <= `4` && (in + insize) <= endin);
7733
7734	if (outsize <= `0`) {
7735	Py_ssize_t startinpos, endinpos, outpos;
7736
7737	/ last character in partial decode? /
7738	if (in + insize >= endin && !final)
7739	break;
7740
7741	startinpos = in - startin;
7742	endinpos = startinpos + `1`;
7743	outpos = out - *buf;
7744	if (unicode_decode_call_errorhandler_wchar(
7745	errors, &errorHandler,
7746	encoding, reason,
7747	&startin, &endin, &startinpos, &endinpos, &exc, &in,
7748	buf, bufsize, &outpos))
7749	{
7750	goto error;
7751	}
7752	out = *buf + outpos;
7753	}
7754	else {
7755	in += insize;
7756	memcpy(out, buffer, outsize * sizeof(wchar_t));
7757	out += outsize;
7758	}
7759	}
7760
7761	/ Shrink the buffer /
7762	assert(out - buf <= bufsize);
7763	bufsize = out - buf;
7764	/ (in - startin) <= size and size is an int /
7765	ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7766
7767	error:
7768	Py_XDECREF(encoding_obj);
7769	Py_XDECREF(errorHandler);
7770	Py_XDECREF(exc);
7771	return ret;
7772	}
7773
7774	static PyObject *
7775	decode_code_page_stateful(int code_page,
7776	const char *s, Py_ssize_t size,
7777	const char errors, Py_ssize_t consumed)
7778	{
7779	wchar_t *buf = NULL;
7780	Py_ssize_t bufsize = `0`;
7781	int chunk_size, final, converted, done;
7782
7783	if (code_page < `0`) {
7784	PyErr_SetString(PyExc_ValueError, "invalid code page number");
7785	return NULL;
7786	}
7787	if (size < `0`) {
7788	PyErr_BadInternalCall();
7789	return NULL;
7790	}
7791
7792	if (consumed)
7793	*consumed = `0`;
7794
7795	do
7796	{
7797	#ifdef NEED_RETRY
7798	if (size > DECODING_CHUNK_SIZE) {
7799	chunk_size = DECODING_CHUNK_SIZE;
7800	final = `0`;
7801	done = `0`;
7802	}
7803	else
7804	#endif
7805	{
7806	chunk_size = (int)size;
7807	final = (consumed == NULL);
7808	done = `1`;
7809	}
7810
7811	if (chunk_size == `0` && done) {
7812	if (buf != NULL)
7813	break;
7814	_Py_RETURN_UNICODE_EMPTY();
7815	}
7816
7817	converted = decode_code_page_strict(code_page, &buf, &bufsize,
7818	s, chunk_size);
7819	if (converted == -`2`)
7820	converted = decode_code_page_errors(code_page, &buf, &bufsize,
7821	s, chunk_size,
7822	errors, final);
7823	assert(converted != `0` \|\| done);
7824
7825	if (converted < `0`) {
7826	PyMem_Free(buf);
7827	return NULL;
7828	}
7829
7830	if (consumed)
7831	*consumed += converted;
7832
7833	s += converted;
7834	size -= converted;
7835	} while (!done);
7836
7837	PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7838	PyMem_Free(buf);
7839	return v;
7840	}
7841
7842	PyObject *
7843	PyUnicode_DecodeCodePageStateful(int code_page,
7844	const char *s,
7845	Py_ssize_t size,
7846	const char *errors,
7847	Py_ssize_t *consumed)
7848	{
7849	return decode_code_page_stateful(code_page, s, size, errors, consumed);
7850	}
7851
7852	PyObject *
7853	PyUnicode_DecodeMBCSStateful(const char *s,
7854	Py_ssize_t size,
7855	const char *errors,
7856	Py_ssize_t *consumed)
7857	{
7858	return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7859	}
7860
7861	PyObject *
7862	PyUnicode_DecodeMBCS(const char *s,
7863	Py_ssize_t size,
7864	const char *errors)
7865	{
7866	return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7867	}
7868
7869	static DWORD
7870	encode_code_page_flags(UINT code_page, const char *errors)
7871	{
7872	if (code_page == CP_UTF8) {
7873	return WC_ERR_INVALID_CHARS;
7874	}
7875	else if (code_page == CP_UTF7) {
7876	/ CP_UTF7 only supports flags=0 /
7877	return `0`;
7878	}
7879	else {
7880	if (errors != NULL && strcmp(errors, "replace") == `0`)
7881	return `0`;
7882	else
7883	return WC_NO_BEST_FIT_CHARS;
7884	}
7885	}
7886
7887	/*
7888	* Encode a Unicode string to a Windows code page into a byte string in strict
7889	* mode.
7890	*
7891	* Returns consumed characters if succeed, returns -2 on encode error, or raise
7892	* an OSError and returns -1 on other error.
7893	*/
7894	static int
7895	encode_code_page_strict(UINT code_page, PyObject **outbytes,
7896	PyObject unicode, Py_ssize_t offset, int* len,
7897	const char* errors)
7898	{
7899	BOOL usedDefaultChar = FALSE;
7900	BOOL *pusedDefaultChar = &usedDefaultChar;
7901	int outsize;
7902	wchar_t *p;
7903	Py_ssize_t size;
7904	const DWORD flags = encode_code_page_flags(code_page, NULL);
7905	char *out;
7906	/ Create a substring so that we can get the UTF-16 representation*
7907	of just the slice under consideration. /*
7908	PyObject *substring;
7909	int ret = -`1`;
7910
7911	assert(len > `0`);
7912
7913	if (code_page != CP_UTF8 && code_page != CP_UTF7)
7914	pusedDefaultChar = &usedDefaultChar;
7915	else
7916	pusedDefaultChar = NULL;
7917
7918	substring = PyUnicode_Substring(unicode, offset, offset+len);
7919	if (substring == NULL)
7920	return -`1`;
7921	#if USE_UNICODE_WCHAR_CACHE
7922	_Py_COMP_DIAG_PUSH
7923	_Py_COMP_DIAG_IGNORE_DEPR_DECLS
7924	p = PyUnicode_AsUnicodeAndSize(substring, &size);
7925	if (p == NULL) {
7926	Py_DECREF(substring);
7927	return -`1`;
7928	}
7929	_Py_COMP_DIAG_POP
7930	#else /* USE_UNICODE_WCHAR_CACHE */
7931	p = PyUnicode_AsWideCharString(substring, &size);
7932	Py_CLEAR(substring);
7933	if (p == NULL) {
7934	return -`1`;
7935	}
7936	#endif /* USE_UNICODE_WCHAR_CACHE */
7937	assert(size <= INT_MAX);
7938
7939	/ First get the size of the result /
7940	outsize = WideCharToMultiByte(code_page, flags,
7941	p, (int)size,
7942	NULL, `0`,
7943	NULL, pusedDefaultChar);
7944	if (outsize <= `0`)
7945	goto error;
7946	/ If we used a default char, then we failed! /
7947	if (pusedDefaultChar && *pusedDefaultChar) {
7948	ret = -`2`;
7949	goto done;
7950	}
7951
7952	if (*outbytes == NULL) {
7953	/ Create string object /
7954	*outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7955	if (*outbytes == NULL) {
7956	goto done;
7957	}
7958	out = PyBytes_AS_STRING(*outbytes);
7959	}
7960	else {
7961	/ Extend string object /
7962	const Py_ssize_t n = PyBytes_Size(*outbytes);
7963	if (outsize > PY_SSIZE_T_MAX - n) {
7964	PyErr_NoMemory();
7965	goto done;
7966	}
7967	if (_PyBytes_Resize(outbytes, n + outsize) < `0`) {
7968	goto done;
7969	}
7970	out = PyBytes_AS_STRING(*outbytes) + n;
7971	}
7972
7973	/ Do the conversion /
7974	outsize = WideCharToMultiByte(code_page, flags,
7975	p, (int)size,
7976	out, outsize,
7977	NULL, pusedDefaultChar);
7978	if (outsize <= `0`)
7979	goto error;
7980	if (pusedDefaultChar && *pusedDefaultChar) {
7981	ret = -`2`;
7982	goto done;
7983	}
7984	ret = `0`;
7985
7986	done:
7987	#if USE_UNICODE_WCHAR_CACHE
7988	Py_DECREF(substring);
7989	#else /* USE_UNICODE_WCHAR_CACHE */
7990	PyMem_Free(p);
7991	#endif /* USE_UNICODE_WCHAR_CACHE */
7992	return ret;
7993
7994	error:
7995	if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7996	ret = -`2`;
7997	goto done;
7998	}
7999	PyErr_SetFromWindowsErr(`0`);
8000	goto done;
8001	}
8002
8003	/*
8004	* Encode a Unicode string to a Windows code page into a byte string using an
8005	* error handler.
8006	*
8007	* Returns consumed characters if succeed, or raise an OSError and returns
8008	* -1 on other error.
8009	*/
8010	static int
8011	encode_code_page_errors(UINT code_page, PyObject **outbytes,
8012	PyObject *unicode, Py_ssize_t unicode_offset,
8013	Py_ssize_t insize, const char* errors)
8014	{
8015	const DWORD flags = encode_code_page_flags(code_page, errors);
8016	Py_ssize_t pos = unicode_offset;
8017	Py_ssize_t endin = unicode_offset + insize;
8018	/ Ideally, we should get reason from FormatMessage. This is the Windows*
8019	2000 English version of the message. /*
8020	const char *reason = "invalid character";
8021	/ 4=maximum length of a UTF-8 sequence /
8022	char buffer[`4`];
8023	BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8024	Py_ssize_t outsize;
8025	char *out;
8026	PyObject *errorHandler = NULL;
8027	PyObject *exc = NULL;
8028	PyObject *encoding_obj = NULL;
8029	const char *encoding;
8030	Py_ssize_t newpos, newoutsize;
8031	PyObject *rep;
8032	int ret = -`1`;
8033
8034	assert(insize > `0`);
8035
8036	encoding = code_page_name(code_page, &encoding_obj);
8037	if (encoding == NULL)
8038	return -`1`;
8039
8040	if (errors == NULL \|\| strcmp(errors, "strict") == `0`) {
8041	/ The last error was ERROR_NO_UNICODE_TRANSLATION,*
8042	then we raise a UnicodeEncodeError. /*
8043	make_encode_exception(&exc, encoding, unicode, `0`, `0`, reason);
8044	if (exc != NULL) {
8045	PyCodec_StrictErrors(exc);
8046	Py_DECREF(exc);
8047	}
8048	Py_XDECREF(encoding_obj);
8049	return -`1`;
8050	}
8051
8052	if (code_page != CP_UTF8 && code_page != CP_UTF7)
8053	pusedDefaultChar = &usedDefaultChar;
8054	else
8055	pusedDefaultChar = NULL;
8056
8057	if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8058	PyErr_NoMemory();
8059	goto error;
8060	}
8061	outsize = insize * Py_ARRAY_LENGTH(buffer);
8062
8063	if (*outbytes == NULL) {
8064	/ Create string object /
8065	*outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8066	if (*outbytes == NULL)
8067	goto error;
8068	out = PyBytes_AS_STRING(*outbytes);
8069	}
8070	else {
8071	/ Extend string object /
8072	Py_ssize_t n = PyBytes_Size(*outbytes);
8073	if (n > PY_SSIZE_T_MAX - outsize) {
8074	PyErr_NoMemory();
8075	goto error;
8076	}
8077	if (_PyBytes_Resize(outbytes, n + outsize) < `0`)
8078	goto error;
8079	out = PyBytes_AS_STRING(*outbytes) + n;
8080	}
8081
8082	/ Encode the string character per character /
8083	while (pos < endin)
8084	{
8085	Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8086	wchar_t chars[`2`];
8087	int charsize;
8088	if (ch < `0x10000`) {
8089	chars[`0`] = (wchar_t)ch;
8090	charsize = `1`;
8091	}
8092	else {
8093	chars[`0`] = Py_UNICODE_HIGH_SURROGATE(ch);
8094	chars[`1`] = Py_UNICODE_LOW_SURROGATE(ch);
8095	charsize = `2`;
8096	}
8097
8098	outsize = WideCharToMultiByte(code_page, flags,
8099	chars, charsize,
8100	buffer, Py_ARRAY_LENGTH(buffer),
8101	NULL, pusedDefaultChar);
8102	if (outsize > `0`) {
8103	if (pusedDefaultChar == NULL \|\| !(*pusedDefaultChar))
8104	{
8105	pos++;
8106	memcpy(out, buffer, outsize);
8107	out += outsize;
8108	continue;
8109	}
8110	}
8111	else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8112	PyErr_SetFromWindowsErr(`0`);
8113	goto error;
8114	}
8115
8116	rep = unicode_encode_call_errorhandler(
8117	errors, &errorHandler, encoding, reason,
8118	unicode, &exc,
8119	pos, pos + `1`, &newpos);
8120	if (rep == NULL)
8121	goto error;
8122
8123	Py_ssize_t morebytes = pos - newpos;
8124	if (PyBytes_Check(rep)) {
8125	outsize = PyBytes_GET_SIZE(rep);
8126	morebytes += outsize;
8127	if (morebytes > `0`) {
8128	Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8129	newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8130	if (_PyBytes_Resize(outbytes, newoutsize) < `0`) {
8131	Py_DECREF(rep);
8132	goto error;
8133	}
8134	out = PyBytes_AS_STRING(*outbytes) + offset;
8135	}
8136	memcpy(out, PyBytes_AS_STRING(rep), outsize);
8137	out += outsize;
8138	}
8139	else {
8140	Py_ssize_t i;
8141	enum PyUnicode_Kind kind;
8142	const void *data;
8143
8144	if (PyUnicode_READY(rep) == -`1`) {
8145	Py_DECREF(rep);
8146	goto error;
8147	}
8148
8149	outsize = PyUnicode_GET_LENGTH(rep);
8150	morebytes += outsize;
8151	if (morebytes > `0`) {
8152	Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8153	newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8154	if (_PyBytes_Resize(outbytes, newoutsize) < `0`) {
8155	Py_DECREF(rep);
8156	goto error;
8157	}
8158	out = PyBytes_AS_STRING(*outbytes) + offset;
8159	}
8160	kind = PyUnicode_KIND(rep);
8161	data = PyUnicode_DATA(rep);
8162	for (i=`0`; i < outsize; i++) {
8163	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8164	if (ch > `127`) {
8165	raise_encode_exception(&exc,
8166	encoding, unicode,
8167	pos, pos + `1`,
8168	"unable to encode error handler result to ASCII");
8169	Py_DECREF(rep);
8170	goto error;
8171	}
8172	out = (unsigned* char)ch;
8173	out++;
8174	}
8175	}
8176	pos = newpos;
8177	Py_DECREF(rep);
8178	}
8179	/ write a NUL byte /
8180	*out = `0`;
8181	outsize = out - PyBytes_AS_STRING(*outbytes);
8182	assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8183	if (_PyBytes_Resize(outbytes, outsize) < `0`)
8184	goto error;
8185	ret = `0`;
8186
8187	error:
8188	Py_XDECREF(encoding_obj);
8189	Py_XDECREF(errorHandler);
8190	Py_XDECREF(exc);
8191	return ret;
8192	}
8193
8194	static PyObject *
8195	encode_code_page(int code_page,
8196	PyObject *unicode,
8197	const char *errors)
8198	{
8199	Py_ssize_t len;
8200	PyObject *outbytes = NULL;
8201	Py_ssize_t offset;
8202	int chunk_len, ret, done;
8203
8204	if (!PyUnicode_Check(unicode)) {
8205	PyErr_BadArgument();
8206	return NULL;
8207	}
8208
8209	if (PyUnicode_READY(unicode) == -`1`)
8210	return NULL;
8211	len = PyUnicode_GET_LENGTH(unicode);
8212
8213	if (code_page < `0`) {
8214	PyErr_SetString(PyExc_ValueError, "invalid code page number");
8215	return NULL;
8216	}
8217
8218	if (len == `0`)
8219	return PyBytes_FromStringAndSize(NULL, `0`);
8220
8221	offset = `0`;
8222	do
8223	{
8224	#ifdef NEED_RETRY
8225	if (len > DECODING_CHUNK_SIZE) {
8226	chunk_len = DECODING_CHUNK_SIZE;
8227	done = `0`;
8228	}
8229	else
8230	#endif
8231	{
8232	chunk_len = (int)len;
8233	done = `1`;
8234	}
8235
8236	ret = encode_code_page_strict(code_page, &outbytes,
8237	unicode, offset, chunk_len,
8238	errors);
8239	if (ret == -`2`)
8240	ret = encode_code_page_errors(code_page, &outbytes,
8241	unicode, offset,
8242	chunk_len, errors);
8243	if (ret < `0`) {
8244	Py_XDECREF(outbytes);
8245	return NULL;
8246	}
8247
8248	offset += chunk_len;
8249	len -= chunk_len;
8250	} while (!done);
8251
8252	return outbytes;
8253	}
8254
8255	PyObject *
8256	PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8257	Py_ssize_t size,
8258	const char *errors)
8259	{
8260	PyObject unicode, res;
8261	unicode = PyUnicode_FromWideChar(p, size);
8262	if (unicode == NULL)
8263	return NULL;
8264	res = encode_code_page(CP_ACP, unicode, errors);
8265	Py_DECREF(unicode);
8266	return res;
8267	}
8268
8269	PyObject *
8270	PyUnicode_EncodeCodePage(int code_page,
8271	PyObject *unicode,
8272	const char *errors)
8273	{
8274	return encode_code_page(code_page, unicode, errors);
8275	}
8276
8277	PyObject *
8278	PyUnicode_AsMBCSString(PyObject *unicode)
8279	{
8280	return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8281	}
8282
8283	#undef NEED_RETRY
8284
8285	#endif /* MS_WINDOWS */
8286
/* --- Character Mapping Codec -------------------------------------------- */
8288
8289	static int
8290	charmap_decode_string(const char *s,
8291	Py_ssize_t size,
8292	PyObject *mapping,
8293	const char *errors,
8294	_PyUnicodeWriter *writer)
8295	{
8296	const char *starts = s;
8297	const char *e;
8298	Py_ssize_t startinpos, endinpos;
8299	PyObject errorHandler = NULL, exc = NULL;
8300	Py_ssize_t maplen;
8301	enum PyUnicode_Kind mapkind;
8302	const void *mapdata;
8303	Py_UCS4 x;
8304	unsigned char ch;
8305
8306	if (PyUnicode_READY(mapping) == -`1`)
8307	return -`1`;
8308
8309	maplen = PyUnicode_GET_LENGTH(mapping);
8310	mapdata = PyUnicode_DATA(mapping);
8311	mapkind = PyUnicode_KIND(mapping);
8312
8313	e = s + size;
8314
8315	if (mapkind == PyUnicode_1BYTE_KIND && maplen >= `256`) {
8316	/ fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1*
8317	* is disabled in encoding aliases, latin1 is preferred because
8318	* its implementation is faster. */
8319	const Py_UCS1 mapdata_ucs1 = (const* Py_UCS1 *)mapdata;
8320	Py_UCS1 outdata = (Py_UCS1 )writer->data;
8321	Py_UCS4 maxchar = writer->maxchar;
8322
8323	assert (writer->kind == PyUnicode_1BYTE_KIND);
8324	while (s < e) {
8325	ch = *s;
8326	x = mapdata_ucs1[ch];
8327	if (x > maxchar) {
8328	if (_PyUnicodeWriter_Prepare(writer, `1`, `0xff`) == -`1`)
8329	goto onError;
8330	maxchar = writer->maxchar;
8331	outdata = (Py_UCS1 *)writer->data;
8332	}
8333	outdata[writer->pos] = x;
8334	writer->pos++;
8335	++s;
8336	}
8337	return `0`;
8338	}
8339
8340	while (s < e) {
8341	if (mapkind == PyUnicode_2BYTE_KIND && maplen >= `256`) {
8342	enum PyUnicode_Kind outkind = writer->kind;
8343	const Py_UCS2 mapdata_ucs2 = (const* Py_UCS2 *)mapdata;
8344	if (outkind == PyUnicode_1BYTE_KIND) {
8345	Py_UCS1 outdata = (Py_UCS1 )writer->data;
8346	Py_UCS4 maxchar = writer->maxchar;
8347	while (s < e) {
8348	ch = *s;
8349	x = mapdata_ucs2[ch];
8350	if (x > maxchar)
8351	goto Error;
8352	outdata[writer->pos] = x;
8353	writer->pos++;
8354	++s;
8355	}
8356	break;
8357	}
8358	else if (outkind == PyUnicode_2BYTE_KIND) {
8359	Py_UCS2 outdata = (Py_UCS2 )writer->data;
8360	while (s < e) {
8361	ch = *s;
8362	x = mapdata_ucs2[ch];
8363	if (x == `0xFFFE`)
8364	goto Error;
8365	outdata[writer->pos] = x;
8366	writer->pos++;
8367	++s;
8368	}
8369	break;
8370	}
8371	}
8372	ch = *s;
8373
8374	if (ch < maplen)
8375	x = PyUnicode_READ(mapkind, mapdata, ch);
8376	else
8377	x = `0xfffe`; / invalid value /
8378	Error:
8379	if (x == `0xfffe`)
8380	{
8381	/ undefined mapping /
8382	startinpos = s-starts;
8383	endinpos = startinpos+`1`;
8384	if (unicode_decode_call_errorhandler_writer(
8385	errors, &errorHandler,
8386	"charmap", "character maps to <undefined>",
8387	&starts, &e, &startinpos, &endinpos, &exc, &s,
8388	writer)) {
8389	goto onError;
8390	}
8391	continue;
8392	}
8393
8394	if (_PyUnicodeWriter_WriteCharInline(writer, x) < `0`)
8395	goto onError;
8396	++s;
8397	}
8398	Py_XDECREF(errorHandler);
8399	Py_XDECREF(exc);
8400	return `0`;
8401
8402	onError:
8403	Py_XDECREF(errorHandler);
8404	Py_XDECREF(exc);
8405	return -`1`;
8406	}
8407
8408	static int
8409	charmap_decode_mapping(const char *s,
8410	Py_ssize_t size,
8411	PyObject *mapping,
8412	const char *errors,
8413	_PyUnicodeWriter *writer)
8414	{
8415	const char *starts = s;
8416	const char *e;
8417	Py_ssize_t startinpos, endinpos;
8418	PyObject errorHandler = NULL, exc = NULL;
8419	unsigned char ch;
8420	PyObject key, item = NULL;
8421
8422	e = s + size;
8423
8424	while (s < e) {
8425	ch = *s;
8426
8427	/ Get mapping (char ordinal -> integer, Unicode char or None) /
8428	key = PyLong_FromLong((long)ch);
8429	if (key == NULL)
8430	goto onError;
8431
8432	item = PyObject_GetItem(mapping, key);
8433	Py_DECREF(key);
8434	if (item == NULL) {
8435	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8436	/ No mapping found means: mapping is undefined. /
8437	PyErr_Clear();
8438	goto Undefined;
8439	} else
8440	goto onError;
8441	}
8442
8443	/ Apply mapping /
8444	if (item == Py_None)
8445	goto Undefined;
8446	if (PyLong_Check(item)) {
8447	long value = PyLong_AS_LONG(item);
8448	if (value == `0xFFFE`)
8449	goto Undefined;
8450	if (value < `0` \|\| value > MAX_UNICODE) {
8451	PyErr_Format(PyExc_TypeError,
8452	"character mapping must be in range(0x%x)",
8453	(unsigned long)MAX_UNICODE + `1`);
8454	goto onError;
8455	}
8456
8457	if (_PyUnicodeWriter_WriteCharInline(writer, value) < `0`)
8458	goto onError;
8459	}
8460	else if (PyUnicode_Check(item)) {
8461	if (PyUnicode_READY(item) == -`1`)
8462	goto onError;
8463	if (PyUnicode_GET_LENGTH(item) == `1`) {
8464	Py_UCS4 value = PyUnicode_READ_CHAR(item, `0`);
8465	if (value == `0xFFFE`)
8466	goto Undefined;
8467	if (_PyUnicodeWriter_WriteCharInline(writer, value) < `0`)
8468	goto onError;
8469	}
8470	else {
8471	writer->overallocate = `1`;
8472	if (_PyUnicodeWriter_WriteStr(writer, item) == -`1`)
8473	goto onError;
8474	}
8475	}
8476	else {
8477	/ wrong return value /
8478	PyErr_SetString(PyExc_TypeError,
8479	"character mapping must return integer, None or str");
8480	goto onError;
8481	}
8482	Py_CLEAR(item);
8483	++s;
8484	continue;
8485
8486	Undefined:
8487	/ undefined mapping /
8488	Py_CLEAR(item);
8489	startinpos = s-starts;
8490	endinpos = startinpos+`1`;
8491	if (unicode_decode_call_errorhandler_writer(
8492	errors, &errorHandler,
8493	"charmap", "character maps to <undefined>",
8494	&starts, &e, &startinpos, &endinpos, &exc, &s,
8495	writer)) {
8496	goto onError;
8497	}
8498	}
8499	Py_XDECREF(errorHandler);
8500	Py_XDECREF(exc);
8501	return `0`;
8502
8503	onError:
8504	Py_XDECREF(item);
8505	Py_XDECREF(errorHandler);
8506	Py_XDECREF(exc);
8507	return -`1`;
8508	}
8509
8510	PyObject *
8511	PyUnicode_DecodeCharmap(const char *s,
8512	Py_ssize_t size,
8513	PyObject *mapping,
8514	const char *errors)
8515	{
8516	_PyUnicodeWriter writer;
8517
8518	/ Default to Latin-1 /
8519	if (mapping == NULL)
8520	return PyUnicode_DecodeLatin1(s, size, errors);
8521
8522	if (size == `0`)
8523	_Py_RETURN_UNICODE_EMPTY();
8524	_PyUnicodeWriter_Init(&writer);
8525	writer.min_length = size;
8526	if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, `127`) == -`1`)
8527	goto onError;
8528
8529	if (PyUnicode_CheckExact(mapping)) {
8530	if (charmap_decode_string(s, size, mapping, errors, &writer) < `0`)
8531	goto onError;
8532	}
8533	else {
8534	if (charmap_decode_mapping(s, size, mapping, errors, &writer) < `0`)
8535	goto onError;
8536	}
8537	return _PyUnicodeWriter_Finish(&writer);
8538
8539	onError:
8540	_PyUnicodeWriter_Dealloc(&writer);
8541	return NULL;
8542	}
8543
/* Charmap encoding: the lookup table */
8545
8546	struct encoding_map {
8547	PyObject_HEAD
8548	unsigned char level1[`32`];
8549	int count2, count3;
8550	unsigned char level23[`1`];
8551	};
8552
8553	static PyObject*
8554	encoding_map_size(PyObject obj, PyObject args)
8555	{
8556	struct encoding_map map = (struct* encoding_map*)obj;
8557	return PyLong_FromLong(sizeof(map) - `1` + `16`map->count2 +
8558	`128`*map->count3);
8559	}
8560
8561	static PyMethodDef encoding_map_methods[] = {
8562	{"size", encoding_map_size, METH_NOARGS,
8563	PyDoc_STR("Return the size (in bytes) of this object") },
8564	{ `0` }
8565	};
8566
8567	static PyTypeObject EncodingMapType = {
8568	PyVarObject_HEAD_INIT(NULL, `0`)
8569	"EncodingMap", /tp_name/
8570	sizeof(struct encoding_map), /tp_basicsize/
8571	`0`, /tp_itemsize/
8572	/ methods /
8573	`0`, /tp_dealloc/
8574	`0`, /tp_vectorcall_offset/
8575	`0`, /tp_getattr/
8576	`0`, /tp_setattr/
8577	`0`, /tp_as_async/
8578	`0`, /tp_repr/
8579	`0`, /tp_as_number/
8580	`0`, /tp_as_sequence/
8581	`0`, /tp_as_mapping/
8582	`0`, /tp_hash/
8583	`0`, /tp_call/
8584	`0`, /tp_str/
8585	`0`, /tp_getattro/
8586	`0`, /tp_setattro/
8587	`0`, /tp_as_buffer/
8588	Py_TPFLAGS_DEFAULT, /tp_flags/
8589	`0`, /tp_doc/
8590	`0`, /tp_traverse/
8591	`0`, /tp_clear/
8592	`0`, /tp_richcompare/
8593	`0`, /tp_weaklistoffset/
8594	`0`, /tp_iter/
8595	`0`, /tp_iternext/
8596	encoding_map_methods, /tp_methods/
8597	`0`, /tp_members/
8598	`0`, /tp_getset/
8599	`0`, /tp_base/
8600	`0`, /tp_dict/
8601	`0`, /tp_descr_get/
8602	`0`, /tp_descr_set/
8603	`0`, /tp_dictoffset/
8604	`0`, /tp_init/
8605	`0`, /tp_alloc/
8606	`0`, /tp_new/
8607	`0`, /tp_free/
8608	`0`, /tp_is_gc/
8609	};
8610
8611	PyObject*
8612	PyUnicode_BuildEncodingMap(PyObject* string)
8613	{
8614	PyObject *result;
8615	struct encoding_map *mresult;
8616	int i;
8617	int need_dict = `0`;
8618	unsigned char level1[`32`];
8619	unsigned char level2[`512`];
8620	unsigned char mlevel1, mlevel2, *mlevel3;
8621	int count2 = `0`, count3 = `0`;
8622	int kind;
8623	const void *data;
8624	Py_ssize_t length;
8625	Py_UCS4 ch;
8626
8627	if (!PyUnicode_Check(string) \|\| !PyUnicode_GET_LENGTH(string)) {
8628	PyErr_BadArgument();
8629	return NULL;
8630	}
8631	kind = PyUnicode_KIND(string);
8632	data = PyUnicode_DATA(string);
8633	length = PyUnicode_GET_LENGTH(string);
8634	length = Py_MIN(length, `256`);
8635	memset(level1, `0xFF`, sizeof level1);
8636	memset(level2, `0xFF`, sizeof level2);
8637
8638	/ If there isn't a one-to-one mapping of NULL to \0,*
8639	or if there are non-BMP characters, we need to use
8640	a mapping dictionary. /*
8641	if (PyUnicode_READ(kind, data, `0`) != `0`)
8642	need_dict = `1`;
8643	for (i = `1`; i < length; i++) {
8644	int l1, l2;
8645	ch = PyUnicode_READ(kind, data, i);
8646	if (ch == `0` \|\| ch > `0xFFFF`) {
8647	need_dict = `1`;
8648	break;
8649	}
8650	if (ch == `0xFFFE`)
8651	/ unmapped character /
8652	continue;
8653	l1 = ch >> `11`;
8654	l2 = ch >> `7`;
8655	if (level1[l1] == `0xFF`)
8656	level1[l1] = count2++;
8657	if (level2[l2] == `0xFF`)
8658	level2[l2] = count3++;
8659	}
8660
8661	if (count2 >= `0xFF` \|\| count3 >= `0xFF`)
8662	need_dict = `1`;
8663
8664	if (need_dict) {
8665	PyObject *result = PyDict_New();
8666	PyObject key, value;
8667	if (!result)
8668	return NULL;
8669	for (i = `0`; i < length; i++) {
8670	key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8671	value = PyLong_FromLong(i);
8672	if (!key \|\| !value)
8673	goto failed1;
8674	if (PyDict_SetItem(result, key, value) == -`1`)
8675	goto failed1;
8676	Py_DECREF(key);
8677	Py_DECREF(value);
8678	}
8679	return result;
8680	failed1:
8681	Py_XDECREF(key);
8682	Py_XDECREF(value);
8683	Py_DECREF(result);
8684	return NULL;
8685	}
8686
8687	/ Create a three-level trie /
8688	result = PyObject_Malloc(sizeof(struct encoding_map) +
8689	`16`count2 + `128`count3 - `1`);
8690	if (!result) {
8691	return PyErr_NoMemory();
8692	}
8693
8694	_PyObject_Init(result, &EncodingMapType);
8695	mresult = (struct encoding_map*)result;
8696	mresult->count2 = count2;
8697	mresult->count3 = count3;
8698	mlevel1 = mresult->level1;
8699	mlevel2 = mresult->level23;
8700	mlevel3 = mresult->level23 + `16`*count2;
8701	memcpy(mlevel1, level1, `32`);
8702	memset(mlevel2, `0xFF`, `16`*count2);
8703	memset(mlevel3, `0`, `128`*count3);
8704	count3 = `0`;
8705	for (i = `1`; i < length; i++) {
8706	int o1, o2, o3, i2, i3;
8707	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8708	if (ch == `0xFFFE`)
8709	/ unmapped character /
8710	continue;
8711	o1 = ch>>`11`;
8712	o2 = (ch>>`7`) & `0xF`;
8713	i2 = `16`*mlevel1[o1] + o2;
8714	if (mlevel2[i2] == `0xFF`)
8715	mlevel2[i2] = count3++;
8716	o3 = ch & `0x7F`;
8717	i3 = `128`*mlevel2[i2] + o3;
8718	mlevel3[i3] = i;
8719	}
8720	return result;
8721	}
8722
8723	static int
8724	encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8725	{
8726	struct encoding_map map = (struct* encoding_map*)mapping;
8727	int l1 = c>>`11`;
8728	int l2 = (c>>`7`) & `0xF`;
8729	int l3 = c & `0x7F`;
8730	int i;
8731
8732	if (c > `0xFFFF`)
8733	return -`1`;
8734	if (c == `0`)
8735	return `0`;
8736	/ level 1/
8737	i = map->level1[l1];
8738	if (i == `0xFF`) {
8739	return -`1`;
8740	}
8741	/ level 2/
8742	i = map->level23[`16`*i+l2];
8743	if (i == `0xFF`) {
8744	return -`1`;
8745	}
8746	/ level 3 /
8747	i = map->level23[`16`map->count2 + `128`i + l3];
8748	if (i == `0`) {
8749	return -`1`;
8750	}
8751	return i;
8752	}
8753
8754	/ Lookup the character ch in the mapping. If the character*
8755	can't be found, Py_None is returned (or NULL, if another
8756	error occurred). /*
8757	static PyObject *
8758	charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8759	{
8760	PyObject w = PyLong_FromLong((long*)c);
8761	PyObject *x;
8762
8763	if (w == NULL)
8764	return NULL;
8765	x = PyObject_GetItem(mapping, w);
8766	Py_DECREF(w);
8767	if (x == NULL) {
8768	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8769	/ No mapping found means: mapping is undefined. /
8770	PyErr_Clear();
8771	Py_RETURN_NONE;
8772	} else
8773	return NULL;
8774	}
8775	else if (x == Py_None)
8776	return x;
8777	else if (PyLong_Check(x)) {
8778	long value = PyLong_AS_LONG(x);
8779	if (value < `0` \|\| value > `255`) {
8780	PyErr_SetString(PyExc_TypeError,
8781	"character mapping must be in range(256)");
8782	Py_DECREF(x);
8783	return NULL;
8784	}
8785	return x;
8786	}
8787	else if (PyBytes_Check(x))
8788	return x;
8789	else {
8790	/ wrong return value /
8791	PyErr_Format(PyExc_TypeError,
8792	"character mapping must return integer, bytes or None, not %.400s",
8793	Py_TYPE(x)->tp_name);
8794	Py_DECREF(x);
8795	return NULL;
8796	}
8797	}
8798
8799	static int
8800	charmapencode_resize(PyObject *outobj, Py_ssize_t outpos, Py_ssize_t requiredsize)
8801	{
8802	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8803	/ exponentially overallocate to minimize reallocations /
8804	if (requiredsize < `2`*outsize)
8805	requiredsize = `2`*outsize;
8806	if (_PyBytes_Resize(outobj, requiredsize))
8807	return -`1`;
8808	return `0`;
8809	}
8810
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8814	/ lookup the character, put the result in the output string and adjust*
8815	various state variables. Resize the output bytes object if not enough
8816	space is available. Return a new reference to the object that
8817	was put in the output buffer, or Py_None, if the mapping was undefined
8818	(in which case no character was written) or NULL, if a
8819	reallocation error occurred. The caller must decref the result /*
8820	static charmapencode_result
8821	charmapencode_output(Py_UCS4 c, PyObject *mapping,
8822	PyObject *outobj, Py_ssize_t outpos)
8823	{
8824	PyObject *rep;
8825	char *outstart;
8826	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8827
8828	if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8829	int res = encoding_map_lookup(c, mapping);
8830	Py_ssize_t requiredsize = *outpos+`1`;
8831	if (res == -`1`)
8832	return enc_FAILED;
8833	if (outsize<requiredsize)
8834	if (charmapencode_resize(outobj, outpos, requiredsize))
8835	return enc_EXCEPTION;
8836	outstart = PyBytes_AS_STRING(*outobj);
8837	outstart[(outpos)++] = (char*)res;
8838	return enc_SUCCESS;
8839	}
8840
8841	rep = charmapencode_lookup(c, mapping);
8842	if (rep==NULL)
8843	return enc_EXCEPTION;
8844	else if (rep==Py_None) {
8845	Py_DECREF(rep);
8846	return enc_FAILED;
8847	} else {
8848	if (PyLong_Check(rep)) {
8849	Py_ssize_t requiredsize = *outpos+`1`;
8850	if (outsize<requiredsize)
8851	if (charmapencode_resize(outobj, outpos, requiredsize)) {
8852	Py_DECREF(rep);
8853	return enc_EXCEPTION;
8854	}
8855	outstart = PyBytes_AS_STRING(*outobj);
8856	outstart[(outpos)++] = (char*)PyLong_AS_LONG(rep);
8857	}
8858	else {
8859	const char *repchars = PyBytes_AS_STRING(rep);
8860	Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8861	Py_ssize_t requiredsize = *outpos+repsize;
8862	if (outsize<requiredsize)
8863	if (charmapencode_resize(outobj, outpos, requiredsize)) {
8864	Py_DECREF(rep);
8865	return enc_EXCEPTION;
8866	}
8867	outstart = PyBytes_AS_STRING(*outobj);
8868	memcpy(outstart + *outpos, repchars, repsize);
8869	*outpos += repsize;
8870	}
8871	}
8872	Py_DECREF(rep);
8873	return enc_SUCCESS;
8874	}
8875
8876	/ handle an error in PyUnicode_EncodeCharmap*
8877	Return 0 on success, -1 on error /*
8878	static int
8879	charmap_encoding_error(
8880	PyObject unicode, Py_ssize_t inpos, PyObject *mapping,
8881	PyObject **exceptionObject,
8882	_Py_error_handler error_handler, PyObject error_handler_obj, const* char *errors,
8883	PyObject *res, Py_ssize_t respos)
8884	{
8885	PyObject repunicode = NULL; /* initialize to prevent gcc warning /
8886	Py_ssize_t size, repsize;
8887	Py_ssize_t newpos;
8888	enum PyUnicode_Kind kind;
8889	const void *data;
8890	Py_ssize_t index;
8891	/ startpos for collecting unencodable chars /
8892	Py_ssize_t collstartpos = *inpos;
8893	Py_ssize_t collendpos = *inpos+`1`;
8894	Py_ssize_t collpos;
8895	const char *encoding = "charmap";
8896	const char *reason = "character maps to <undefined>";
8897	charmapencode_result x;
8898	Py_UCS4 ch;
8899	int val;
8900
8901	if (PyUnicode_READY(unicode) == -`1`)
8902	return -`1`;
8903	size = PyUnicode_GET_LENGTH(unicode);
8904	/ find all unencodable characters /
8905	while (collendpos < size) {
8906	PyObject *rep;
8907	if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8908	ch = PyUnicode_READ_CHAR(unicode, collendpos);
8909	val = encoding_map_lookup(ch, mapping);
8910	if (val != -`1`)
8911	break;
8912	++collendpos;
8913	continue;
8914	}
8915
8916	ch = PyUnicode_READ_CHAR(unicode, collendpos);
8917	rep = charmapencode_lookup(ch, mapping);
8918	if (rep==NULL)
8919	return -`1`;
8920	else if (rep!=Py_None) {
8921	Py_DECREF(rep);
8922	break;
8923	}
8924	Py_DECREF(rep);
8925	++collendpos;
8926	}
8927	/ cache callback name lookup*
8928	* (if not done yet, i.e. it's the first error) */
8929	if (*error_handler == _Py_ERROR_UNKNOWN)
8930	*error_handler = _Py_GetErrorHandler(errors);
8931
8932	switch (*error_handler) {
8933	case _Py_ERROR_STRICT:
8934	raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8935	return -`1`;
8936
8937	case _Py_ERROR_REPLACE:
8938	for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8939	x = charmapencode_output(`'?'`, mapping, res, respos);
8940	if (x==enc_EXCEPTION) {
8941	return -`1`;
8942	}
8943	else if (x==enc_FAILED) {
8944	raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8945	return -`1`;
8946	}
8947	}
8948	/ fall through /
8949	case _Py_ERROR_IGNORE:
8950	*inpos = collendpos;
8951	break;
8952
8953	case _Py_ERROR_XMLCHARREFREPLACE:
8954	/ generate replacement (temporarily (mis)uses p) /
8955	for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8956	char buffer[`2`+`29`+`1`+`1`];
8957	char *cp;
8958	sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8959	for (cp = buffer; *cp; ++cp) {
8960	x = charmapencode_output(*cp, mapping, res, respos);
8961	if (x==enc_EXCEPTION)
8962	return -`1`;
8963	else if (x==enc_FAILED) {
8964	raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8965	return -`1`;
8966	}
8967	}
8968	}
8969	*inpos = collendpos;
8970	break;
8971
8972	default:
8973	repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8974	encoding, reason, unicode, exceptionObject,
8975	collstartpos, collendpos, &newpos);
8976	if (repunicode == NULL)
8977	return -`1`;
8978	if (PyBytes_Check(repunicode)) {
8979	/ Directly copy bytes result to output. /
8980	Py_ssize_t outsize = PyBytes_Size(*res);
8981	Py_ssize_t requiredsize;
8982	repsize = PyBytes_Size(repunicode);
8983	requiredsize = *respos + repsize;
8984	if (requiredsize > outsize)
8985	/ Make room for all additional bytes. /
8986	if (charmapencode_resize(res, respos, requiredsize)) {
8987	Py_DECREF(repunicode);
8988	return -`1`;
8989	}
8990	memcpy(PyBytes_AsString(res) + respos,
8991	PyBytes_AsString(repunicode), repsize);
8992	*respos += repsize;
8993	*inpos = newpos;
8994	Py_DECREF(repunicode);
8995	break;
8996	}
8997	/ generate replacement /
8998	if (PyUnicode_READY(repunicode) == -`1`) {
8999	Py_DECREF(repunicode);
9000	return -`1`;
9001	}
9002	repsize = PyUnicode_GET_LENGTH(repunicode);
9003	data = PyUnicode_DATA(repunicode);
9004	kind = PyUnicode_KIND(repunicode);
9005	for (index = `0`; index < repsize; index++) {
9006	Py_UCS4 repch = PyUnicode_READ(kind, data, index);
9007	x = charmapencode_output(repch, mapping, res, respos);
9008	if (x==enc_EXCEPTION) {
9009	Py_DECREF(repunicode);
9010	return -`1`;
9011	}
9012	else if (x==enc_FAILED) {
9013	Py_DECREF(repunicode);
9014	raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9015	return -`1`;
9016	}
9017	}
9018	*inpos = newpos;
9019	Py_DECREF(repunicode);
9020	}
9021	return `0`;
9022	}
9023
9024	PyObject *
9025	_PyUnicode_EncodeCharmap(PyObject *unicode,
9026	PyObject *mapping,
9027	const char *errors)
9028	{
9029	/ output object /
9030	PyObject *res = NULL;
9031	/ current input position /
9032	Py_ssize_t inpos = `0`;
9033	Py_ssize_t size;
9034	/ current output position /
9035	Py_ssize_t respos = `0`;
9036	PyObject *error_handler_obj = NULL;
9037	PyObject *exc = NULL;
9038	_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
9039	const void *data;
9040	int kind;
9041
9042	if (PyUnicode_READY(unicode) == -`1`)
9043	return NULL;
9044	size = PyUnicode_GET_LENGTH(unicode);
9045	data = PyUnicode_DATA(unicode);
9046	kind = PyUnicode_KIND(unicode);
9047
9048	/ Default to Latin-1 /
9049	if (mapping == NULL)
9050	return unicode_encode_ucs1(unicode, errors, `256`);
9051
9052	/ allocate enough for a simple encoding without*
9053	replacements, if we need more, we'll resize /*
9054	res = PyBytes_FromStringAndSize(NULL, size);
9055	if (res == NULL)
9056	goto onError;
9057	if (size == `0`)
9058	return res;
9059
9060	while (inpos<size) {
9061	Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9062	/ try to encode it /
9063	charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
9064	if (x==enc_EXCEPTION) / error /
9065	goto onError;
9066	if (x==enc_FAILED) { / unencodable character /
9067	if (charmap_encoding_error(unicode, &inpos, mapping,
9068	&exc,
9069	&error_handler, &error_handler_obj, errors,
9070	&res, &respos)) {
9071	goto onError;
9072	}
9073	}
9074	else
9075	/ done with this character => adjust input position /
9076	++inpos;
9077	}
9078
9079	/ Resize if we allocated to much /
9080	if (respos<PyBytes_GET_SIZE(res))
9081	if (_PyBytes_Resize(&res, respos) < `0`)
9082	goto onError;
9083
9084	Py_XDECREF(exc);
9085	Py_XDECREF(error_handler_obj);
9086	return res;
9087
9088	onError:
9089	Py_XDECREF(res);
9090	Py_XDECREF(exc);
9091	Py_XDECREF(error_handler_obj);
9092	return NULL;
9093	}
9094
9095	/ Deprecated /
9096	PyObject *
9097	PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9098	Py_ssize_t size,
9099	PyObject *mapping,
9100	const char *errors)
9101	{
9102	PyObject *result;
9103	PyObject *unicode = PyUnicode_FromWideChar(p, size);
9104	if (unicode == NULL)
9105	return NULL;
9106	result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9107	Py_DECREF(unicode);
9108	return result;
9109	}
9110
9111	PyObject *
9112	PyUnicode_AsCharmapString(PyObject *unicode,
9113	PyObject *mapping)
9114	{
9115	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
9116	PyErr_BadArgument();
9117	return NULL;
9118	}
9119	return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9120	}
9121
9122	/ create or adjust a UnicodeTranslateError /
9123	static void
9124	make_translate_exception(PyObject **exceptionObject,
9125	PyObject *unicode,
9126	Py_ssize_t startpos, Py_ssize_t endpos,
9127	const char *reason)
9128	{
9129	if (*exceptionObject == NULL) {
9130	*exceptionObject = _PyUnicodeTranslateError_Create(
9131	unicode, startpos, endpos, reason);
9132	}
9133	else {
9134	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9135	goto onError;
9136	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9137	goto onError;
9138	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9139	goto onError;
9140	return;
9141	onError:
9142	Py_CLEAR(*exceptionObject);
9143	}
9144	}
9145
9146	/ error handling callback helper:*
9147	build arguments, call the callback and check the arguments,
9148	put the result into newpos and return the replacement string, which
9149	has to be freed by the caller /*
9150	static PyObject *
9151	unicode_translate_call_errorhandler(const char *errors,
9152	PyObject **errorHandler,
9153	const char *reason,
9154	PyObject unicode, PyObject *exceptionObject,
9155	Py_ssize_t startpos, Py_ssize_t endpos,
9156	Py_ssize_t *newpos)
9157	{
9158	static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9159
9160	Py_ssize_t i_newpos;
9161	PyObject *restuple;
9162	PyObject *resunicode;
9163
9164	if (*errorHandler == NULL) {
9165	*errorHandler = PyCodec_LookupError(errors);
9166	if (*errorHandler == NULL)
9167	return NULL;
9168	}
9169
9170	make_translate_exception(exceptionObject,
9171	unicode, startpos, endpos, reason);
9172	if (*exceptionObject == NULL)
9173	return NULL;
9174
9175	restuple = PyObject_CallOneArg(errorHandler, exceptionObject);
9176	if (restuple == NULL)
9177	return NULL;
9178	if (!PyTuple_Check(restuple)) {
9179	PyErr_SetString(PyExc_TypeError, &argparse[`3`]);
9180	Py_DECREF(restuple);
9181	return NULL;
9182	}
9183	if (!PyArg_ParseTuple(restuple, argparse,
9184	&resunicode, &i_newpos)) {
9185	Py_DECREF(restuple);
9186	return NULL;
9187	}
9188	if (i_newpos<`0`)
9189	*newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9190	else
9191	*newpos = i_newpos;
9192	if (newpos<`0` \|\| newpos>PyUnicode_GET_LENGTH(unicode)) {
9193	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9194	Py_DECREF(restuple);
9195	return NULL;
9196	}
9197	Py_INCREF(resunicode);
9198	Py_DECREF(restuple);
9199	return resunicode;
9200	}
9201
9202	/ Lookup the character ch in the mapping and put the result in result,*
9203	which must be decrefed by the caller.
9204	Return 0 on success, -1 on error /*
9205	static int
9206	charmaptranslate_lookup(Py_UCS4 c, PyObject mapping, PyObject *result)
9207	{
9208	PyObject w = PyLong_FromLong((long*)c);
9209	PyObject *x;
9210
9211	if (w == NULL)
9212	return -`1`;
9213	x = PyObject_GetItem(mapping, w);
9214	Py_DECREF(w);
9215	if (x == NULL) {
9216	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9217	/ No mapping found means: use 1:1 mapping. /
9218	PyErr_Clear();
9219	*result = NULL;
9220	return `0`;
9221	} else
9222	return -`1`;
9223	}
9224	else if (x == Py_None) {
9225	*result = x;
9226	return `0`;
9227	}
9228	else if (PyLong_Check(x)) {
9229	long value = PyLong_AS_LONG(x);
9230	if (value < `0` \|\| value > MAX_UNICODE) {
9231	PyErr_Format(PyExc_ValueError,
9232	"character mapping must be in range(0x%x)",
9233	MAX_UNICODE+`1`);
9234	Py_DECREF(x);
9235	return -`1`;
9236	}
9237	*result = x;
9238	return `0`;
9239	}
9240	else if (PyUnicode_Check(x)) {
9241	*result = x;
9242	return `0`;
9243	}
9244	else {
9245	/ wrong return value /
9246	PyErr_SetString(PyExc_TypeError,
9247	"character mapping must return integer, None or str");
9248	Py_DECREF(x);
9249	return -`1`;
9250	}
9251	}
9252
9253	/ lookup the character, write the result into the writer.*
9254	Return 1 if the result was written into the writer, return 0 if the mapping
9255	was undefined, raise an exception return -1 on error. /*
9256	static int
9257	charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9258	_PyUnicodeWriter *writer)
9259	{
9260	PyObject *item;
9261
9262	if (charmaptranslate_lookup(ch, mapping, &item))
9263	return -`1`;
9264
9265	if (item == NULL) {
9266	/ not found => default to 1:1 mapping /
9267	if (_PyUnicodeWriter_WriteCharInline(writer, ch) < `0`) {
9268	return -`1`;
9269	}
9270	return `1`;
9271	}
9272
9273	if (item == Py_None) {
9274	Py_DECREF(item);
9275	return `0`;
9276	}
9277
9278	if (PyLong_Check(item)) {
9279	long ch = (Py_UCS4)PyLong_AS_LONG(item);
9280	/ PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already*
9281	used it /*
9282	if (_PyUnicodeWriter_WriteCharInline(writer, ch) < `0`) {
9283	Py_DECREF(item);
9284	return -`1`;
9285	}
9286	Py_DECREF(item);
9287	return `1`;
9288	}
9289
9290	if (!PyUnicode_Check(item)) {
9291	Py_DECREF(item);
9292	return -`1`;
9293	}
9294
9295	if (_PyUnicodeWriter_WriteStr(writer, item) < `0`) {
9296	Py_DECREF(item);
9297	return -`1`;
9298	}
9299
9300	Py_DECREF(item);
9301	return `1`;
9302	}
9303
9304	static int
9305	unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9306	Py_UCS1 *translate)
9307	{
9308	PyObject *item = NULL;
9309	int ret = `0`;
9310
9311	if (charmaptranslate_lookup(ch, mapping, &item)) {
9312	return -`1`;
9313	}
9314
9315	if (item == Py_None) {
9316	/ deletion /
9317	translate[ch] = `0xfe`;
9318	}
9319	else if (item == NULL) {
9320	/ not found => default to 1:1 mapping /
9321	translate[ch] = ch;
9322	return `1`;
9323	}
9324	else if (PyLong_Check(item)) {
9325	long replace = PyLong_AS_LONG(item);
9326	/ PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already*
9327	used it /*
9328	if (`127` < replace) {
9329	/ invalid character or character outside ASCII:*
9330	skip the fast translate /*
9331	goto exit;
9332	}
9333	translate[ch] = (Py_UCS1)replace;
9334	}
9335	else if (PyUnicode_Check(item)) {
9336	Py_UCS4 replace;
9337
9338	if (PyUnicode_READY(item) == -`1`) {
9339	Py_DECREF(item);
9340	return -`1`;
9341	}
9342	if (PyUnicode_GET_LENGTH(item) != `1`)
9343	goto exit;
9344
9345	replace = PyUnicode_READ_CHAR(item, `0`);
9346	if (replace > `127`)
9347	goto exit;
9348	translate[ch] = (Py_UCS1)replace;
9349	}
9350	else {
9351	/ not None, NULL, long or unicode /
9352	goto exit;
9353	}
9354	ret = `1`;
9355
9356	exit:
9357	Py_DECREF(item);
9358	return ret;
9359	}
9360
9361	/ Fast path for ascii => ascii translation. Return 1 if the whole string*
9362	was translated into writer, return 0 if the input string was partially
9363	translated into writer, raise an exception and return -1 on error. /*
9364	static int
9365	unicode_fast_translate(PyObject input, PyObject mapping,
9366	_PyUnicodeWriter writer, int* ignore,
9367	Py_ssize_t *input_pos)
9368	{
9369	Py_UCS1 ascii_table[`128`], ch, ch2;
9370	Py_ssize_t len;
9371	const Py_UCS1 in, end;
9372	Py_UCS1 *out;
9373	int res = `0`;
9374
9375	len = PyUnicode_GET_LENGTH(input);
9376
9377	memset(ascii_table, `0xff`, `128`);
9378
9379	in = PyUnicode_1BYTE_DATA(input);
9380	end = in + len;
9381
9382	assert(PyUnicode_IS_ASCII(writer->buffer));
9383	assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9384	out = PyUnicode_1BYTE_DATA(writer->buffer);
9385
9386	for (; in < end; in++) {
9387	ch = *in;
9388	ch2 = ascii_table[ch];
9389	if (ch2 == `0xff`) {
9390	int translate = unicode_fast_translate_lookup(mapping, ch,
9391	ascii_table);
9392	if (translate < `0`)
9393	return -`1`;
9394	if (translate == `0`)
9395	goto exit;
9396	ch2 = ascii_table[ch];
9397	}
9398	if (ch2 == `0xfe`) {
9399	if (ignore)
9400	continue;
9401	goto exit;
9402	}
9403	assert(ch2 < `128`);
9404	*out = ch2;
9405	out++;
9406	}
9407	res = `1`;
9408
9409	exit:
9410	writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9411	*input_pos = in - PyUnicode_1BYTE_DATA(input);
9412	return res;
9413	}
9414
9415	static PyObject *
9416	_PyUnicode_TranslateCharmap(PyObject *input,
9417	PyObject *mapping,
9418	const char *errors)
9419	{
9420	/ input object /
9421	const void *data;
9422	Py_ssize_t size, i;
9423	int kind;
9424	/ output buffer /
9425	_PyUnicodeWriter writer;
9426	/ error handler /
9427	const char *reason = "character maps to <undefined>";
9428	PyObject *errorHandler = NULL;
9429	PyObject *exc = NULL;
9430	int ignore;
9431	int res;
9432
9433	if (mapping == NULL) {
9434	PyErr_BadArgument();
9435	return NULL;
9436	}
9437
9438	if (PyUnicode_READY(input) == -`1`)
9439	return NULL;
9440	data = PyUnicode_DATA(input);
9441	kind = PyUnicode_KIND(input);
9442	size = PyUnicode_GET_LENGTH(input);
9443
9444	if (size == `0`)
9445	return PyUnicode_FromObject(input);
9446
9447	/ allocate enough for a simple 1:1 translation without*
9448	replacements, if we need more, we'll resize /*
9449	_PyUnicodeWriter_Init(&writer);
9450	if (_PyUnicodeWriter_Prepare(&writer, size, `127`) == -`1`)
9451	goto onError;
9452
9453	ignore = (errors != NULL && strcmp(errors, "ignore") == `0`);
9454
9455	if (PyUnicode_READY(input) == -`1`)
9456	return NULL;
9457	if (PyUnicode_IS_ASCII(input)) {
9458	res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9459	if (res < `0`) {
9460	_PyUnicodeWriter_Dealloc(&writer);
9461	return NULL;
9462	}
9463	if (res == `1`)
9464	return _PyUnicodeWriter_Finish(&writer);
9465	}
9466	else {
9467	i = `0`;
9468	}
9469
9470	while (i<size) {
9471	/ try to encode it /
9472	int translate;
9473	PyObject repunicode = NULL; /* initialize to prevent gcc warning /
9474	Py_ssize_t newpos;
9475	/ startpos for collecting untranslatable chars /
9476	Py_ssize_t collstart;
9477	Py_ssize_t collend;
9478	Py_UCS4 ch;
9479
9480	ch = PyUnicode_READ(kind, data, i);
9481	translate = charmaptranslate_output(ch, mapping, &writer);
9482	if (translate < `0`)
9483	goto onError;
9484
9485	if (translate != `0`) {
9486	/ it worked => adjust input pointer /
9487	++i;
9488	continue;
9489	}
9490
9491	/ untranslatable character /
9492	collstart = i;
9493	collend = i+`1`;
9494
9495	/ find all untranslatable characters /
9496	while (collend < size) {
9497	PyObject *x;
9498	ch = PyUnicode_READ(kind, data, collend);
9499	if (charmaptranslate_lookup(ch, mapping, &x))
9500	goto onError;
9501	Py_XDECREF(x);
9502	if (x != Py_None)
9503	break;
9504	++collend;
9505	}
9506
9507	if (ignore) {
9508	i = collend;
9509	}
9510	else {
9511	repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9512	reason, input, &exc,
9513	collstart, collend, &newpos);
9514	if (repunicode == NULL)
9515	goto onError;
9516	if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < `0`) {
9517	Py_DECREF(repunicode);
9518	goto onError;
9519	}
9520	Py_DECREF(repunicode);
9521	i = newpos;
9522	}
9523	}
9524	Py_XDECREF(exc);
9525	Py_XDECREF(errorHandler);
9526	return _PyUnicodeWriter_Finish(&writer);
9527
9528	onError:
9529	_PyUnicodeWriter_Dealloc(&writer);
9530	Py_XDECREF(exc);
9531	Py_XDECREF(errorHandler);
9532	return NULL;
9533	}
9534
9535	/ Deprecated. Use PyUnicode_Translate instead. /
9536	PyObject *
9537	PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9538	Py_ssize_t size,
9539	PyObject *mapping,
9540	const char *errors)
9541	{
9542	PyObject *result;
9543	PyObject *unicode = PyUnicode_FromWideChar(p, size);
9544	if (!unicode)
9545	return NULL;
9546	result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9547	Py_DECREF(unicode);
9548	return result;
9549	}
9550
9551	PyObject *
9552	PyUnicode_Translate(PyObject *str,
9553	PyObject *mapping,
9554	const char *errors)
9555	{
9556	if (ensure_unicode(str) < `0`)
9557	return NULL;
9558	return _PyUnicode_TranslateCharmap(str, mapping, errors);
9559	}
9560
9561	PyObject *
9562	_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9563	{
9564	if (!PyUnicode_Check(unicode)) {
9565	PyErr_BadInternalCall();
9566	return NULL;
9567	}
9568	if (PyUnicode_READY(unicode) == -`1`)
9569	return NULL;
9570	if (PyUnicode_IS_ASCII(unicode)) {
9571	/ If the string is already ASCII, just return the same string /
9572	Py_INCREF(unicode);
9573	return unicode;
9574	}
9575
9576	Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9577	PyObject *result = PyUnicode_New(len, `127`);
9578	if (result == NULL) {
9579	return NULL;
9580	}
9581
9582	Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9583	int kind = PyUnicode_KIND(unicode);
9584	const void *data = PyUnicode_DATA(unicode);
9585	Py_ssize_t i;
9586	for (i = `0`; i < len; ++i) {
9587	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9588	if (ch < `127`) {
9589	out[i] = ch;
9590	}
9591	else if (Py_UNICODE_ISSPACE(ch)) {
9592	out[i] = `' '`;
9593	}
9594	else {
9595	int decimal = Py_UNICODE_TODECIMAL(ch);
9596	if (decimal < `0`) {
9597	out[i] = `'?'`;
9598	out[i+`1`] = `'\0'`;
9599	_PyUnicode_LENGTH(result) = i + `1`;
9600	break;
9601	}
9602	out[i] = `'0'` + decimal;
9603	}
9604	}
9605
9606	assert(_PyUnicode_CheckConsistency(result, `1`));
9607	return result;
9608	}
9609
9610	PyObject *
9611	PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9612	Py_ssize_t length)
9613	{
9614	PyObject *decimal;
9615	Py_ssize_t i;
9616	Py_UCS4 maxchar;
9617	enum PyUnicode_Kind kind;
9618	const void *data;
9619
9620	maxchar = `127`;
9621	for (i = `0`; i < length; i++) {
9622	Py_UCS4 ch = s[i];
9623	if (ch > `127`) {
9624	int decimal = Py_UNICODE_TODECIMAL(ch);
9625	if (decimal >= `0`)
9626	ch = `'0'` + decimal;
9627	maxchar = Py_MAX(maxchar, ch);
9628	}
9629	}
9630
9631	/ Copy to a new string /
9632	decimal = PyUnicode_New(length, maxchar);
9633	if (decimal == NULL)
9634	return decimal;
9635	kind = PyUnicode_KIND(decimal);
9636	data = PyUnicode_DATA(decimal);
9637	/ Iterate over code points /
9638	for (i = `0`; i < length; i++) {
9639	Py_UCS4 ch = s[i];
9640	if (ch > `127`) {
9641	int decimal = Py_UNICODE_TODECIMAL(ch);
9642	if (decimal >= `0`)
9643	ch = `'0'` + decimal;
9644	}
9645	PyUnicode_WRITE(kind, data, i, ch);
9646	}
9647	return unicode_result(decimal);
9648	}
/* --- Decimal Encoder ---------------------------------------------------- */
9650
9651	int
9652	PyUnicode_EncodeDecimal(Py_UNICODE *s,
9653	Py_ssize_t length,
9654	char *output,
9655	const char *errors)
9656	{
9657	PyObject *unicode;
9658	Py_ssize_t i;
9659	enum PyUnicode_Kind kind;
9660	const void *data;
9661
9662	if (output == NULL) {
9663	PyErr_BadArgument();
9664	return -`1`;
9665	}
9666
9667	unicode = PyUnicode_FromWideChar(s, length);
9668	if (unicode == NULL)
9669	return -`1`;
9670
9671	kind = PyUnicode_KIND(unicode);
9672	data = PyUnicode_DATA(unicode);
9673
9674	for (i=`0`; i < length; ) {
9675	PyObject *exc;
9676	Py_UCS4 ch;
9677	int decimal;
9678	Py_ssize_t startpos;
9679
9680	ch = PyUnicode_READ(kind, data, i);
9681
9682	if (Py_UNICODE_ISSPACE(ch)) {
9683	*output++ = `' '`;
9684	i++;
9685	continue;
9686	}
9687	decimal = Py_UNICODE_TODECIMAL(ch);
9688	if (decimal >= `0`) {
9689	*output++ = `'0'` + decimal;
9690	i++;
9691	continue;
9692	}
9693	if (`0` < ch && ch < `256`) {
9694	output++ = (char*)ch;
9695	i++;
9696	continue;
9697	}
9698
9699	startpos = i;
9700	exc = NULL;
9701	raise_encode_exception(&exc, "decimal", unicode,
9702	startpos, startpos+`1`,
9703	"invalid decimal Unicode string");
9704	Py_XDECREF(exc);
9705	Py_DECREF(unicode);
9706	return -`1`;
9707	}
9708	/ 0-terminate the output string /
9709	*output++ = `'\0'`;
9710	Py_DECREF(unicode);
9711	return `0`;
9712	}
9713
/* --- Helpers ------------------------------------------------------------ */
9715
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` may be any
   expression (it is evaluated more than once).

   - end > len is clamped to len; a negative end has len added to it and
     is then clamped to 0.
   - a negative start has len added to it and is then clamped to 0.
   - start is not clamped against len from above.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (e.g. inside an unbraced if/else); use it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9730
9731	static Py_ssize_t
9732	any_find_slice(PyObject* s1, PyObject* s2,
9733	Py_ssize_t start,
9734	Py_ssize_t end,
9735	int direction)
9736	{
9737	int kind1, kind2;
9738	const void buf1, buf2;
9739	Py_ssize_t len1, len2, result;
9740
9741	kind1 = PyUnicode_KIND(s1);
9742	kind2 = PyUnicode_KIND(s2);
9743	if (kind1 < kind2)
9744	return -`1`;
9745
9746	len1 = PyUnicode_GET_LENGTH(s1);
9747	len2 = PyUnicode_GET_LENGTH(s2);
9748	ADJUST_INDICES(start, end, len1);
9749	if (end - start < len2)
9750	return -`1`;
9751
9752	buf1 = PyUnicode_DATA(s1);
9753	buf2 = PyUnicode_DATA(s2);
9754	if (len2 == `1`) {
9755	Py_UCS4 ch = PyUnicode_READ(kind2, buf2, `0`);
9756	result = findchar((const char )buf1 + kind1start,
9757	kind1, end - start, ch, direction);
9758	if (result == -`1`)
9759	return -`1`;
9760	else
9761	return start + result;
9762	}
9763
9764	if (kind2 != kind1) {
9765	buf2 = unicode_askind(kind2, buf2, len2, kind1);
9766	if (!buf2)
9767	return -`2`;
9768	}
9769
9770	if (direction > `0`) {
9771	switch (kind1) {
9772	case PyUnicode_1BYTE_KIND:
9773	if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9774	result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9775	else
9776	result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9777	break;
9778	case PyUnicode_2BYTE_KIND:
9779	result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9780	break;
9781	case PyUnicode_4BYTE_KIND:
9782	result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9783	break;
9784	default:
9785	Py_UNREACHABLE();
9786	}
9787	}
9788	else {
9789	switch (kind1) {
9790	case PyUnicode_1BYTE_KIND:
9791	if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9792	result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9793	else
9794	result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9795	break;
9796	case PyUnicode_2BYTE_KIND:
9797	result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9798	break;
9799	case PyUnicode_4BYTE_KIND:
9800	result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9801	break;
9802	default:
9803	Py_UNREACHABLE();
9804	}
9805	}
9806
9807	assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9808	if (kind2 != kind1)
9809	PyMem_Free((void *)buf2);
9810
9811	return result;
9812	}
9813
/* _PyUnicode_InsertThousandsGrouping() helper functions */
9815	#include "stringlib/localeutil.h"
9816
9817	/**
9818	* InsertThousandsGrouping:
9819	* @writer: Unicode writer.
9820	* @n_buffer: Number of characters in @buffer.
9821	* @digits: Digits we're reading from. If count is non-NULL, this is unused.
9822	* @d_pos: Start of digits string.
9823	* @n_digits: The number of digits in the string, in which we want
9824	* to put the grouping chars.
9825	* @min_width: The minimum width of the digits in the output string.
9826	* Output will be zero-padded on the left to fill.
9827	* @grouping: see definition in localeconv().
9828	* @thousands_sep: see definition in localeconv().
9829	*
9830	* There are 2 modes: counting and filling. If @writer is NULL,
9831	* we are in counting mode, else filling mode.
9832	* If counting, the required buffer size is returned.
9833	* If filling, we know the buffer will be large enough, so we don't
9834	* need to pass in the buffer size.
9835	* Inserts thousand grouping characters (as defined by grouping and
9836	* thousands_sep) into @writer.
9837	*
9838	* Return value: -1 on error, number of characters otherwise.
9839	**/
9840	Py_ssize_t
9841	_PyUnicode_InsertThousandsGrouping(
9842	_PyUnicodeWriter *writer,
9843	Py_ssize_t n_buffer,
9844	PyObject *digits,
9845	Py_ssize_t d_pos,
9846	Py_ssize_t n_digits,
9847	Py_ssize_t min_width,
9848	const char *grouping,
9849	PyObject *thousands_sep,
9850	Py_UCS4 *maxchar)
9851	{
9852	min_width = Py_MAX(`0`, min_width);
9853	if (writer) {
9854	assert(digits != NULL);
9855	assert(maxchar == NULL);
9856	}
9857	else {
9858	assert(digits == NULL);
9859	assert(maxchar != NULL);
9860	}
9861	assert(`0` <= d_pos);
9862	assert(`0` <= n_digits);
9863	assert(grouping != NULL);
9864
9865	if (digits != NULL) {
9866	if (PyUnicode_READY(digits) == -`1`) {
9867	return -`1`;
9868	}
9869	}
9870	if (PyUnicode_READY(thousands_sep) == -`1`) {
9871	return -`1`;
9872	}
9873
9874	Py_ssize_t count = `0`;
9875	Py_ssize_t n_zeros;
9876	int loop_broken = `0`;
9877	int use_separator = `0`; / First time through, don't append the*
9878	separator. They only go between
9879	groups. /*
9880	Py_ssize_t buffer_pos;
9881	Py_ssize_t digits_pos;
9882	Py_ssize_t len;
9883	Py_ssize_t n_chars;
9884	Py_ssize_t remaining = n_digits; / Number of chars remaining to*
9885	be looked at /*
9886	/ A generator that returns all of the grouping widths, until it*
9887	returns 0. /*
9888	GroupGenerator groupgen;
9889	GroupGenerator_init(&groupgen, grouping);
9890	const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9891
9892	/ if digits are not grouped, thousands separator*
9893	should be an empty string /*
9894	assert(!(grouping[`0`] == CHAR_MAX && thousands_sep_len != `0`));
9895
9896	digits_pos = d_pos + n_digits;
9897	if (writer) {
9898	buffer_pos = writer->pos + n_buffer;
9899	assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9900	assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9901	}
9902	else {
9903	buffer_pos = n_buffer;
9904	}
9905
9906	if (!writer) {
9907	*maxchar = `127`;
9908	}
9909
9910	while ((len = GroupGenerator_next(&groupgen)) > `0`) {
9911	len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), `1`));
9912	n_zeros = Py_MAX(`0`, len - remaining);
9913	n_chars = Py_MAX(`0`, Py_MIN(remaining, len));
9914
9915	/ Use n_zero zero's and n_chars chars /
9916
9917	/ Count only, don't do anything. /
9918	count += (use_separator ? thousands_sep_len : `0`) + n_zeros + n_chars;
9919
9920	/ Copy into the writer. /
9921	InsertThousandsGrouping_fill(writer, &buffer_pos,
9922	digits, &digits_pos,
9923	n_chars, n_zeros,
9924	use_separator ? thousands_sep : NULL,
9925	thousands_sep_len, maxchar);
9926
9927	/ Use a separator next time. /
9928	use_separator = `1`;
9929
9930	remaining -= n_chars;
9931	min_width -= len;
9932
9933	if (remaining <= `0` && min_width <= `0`) {
9934	loop_broken = `1`;
9935	break;
9936	}
9937	min_width -= thousands_sep_len;
9938	}
9939	if (!loop_broken) {
9940	/ We left the loop without using a break statement. /
9941
9942	len = Py_MAX(Py_MAX(remaining, min_width), `1`);
9943	n_zeros = Py_MAX(`0`, len - remaining);
9944	n_chars = Py_MAX(`0`, Py_MIN(remaining, len));
9945
9946	/ Use n_zero zero's and n_chars chars /
9947	count += (use_separator ? thousands_sep_len : `0`) + n_zeros + n_chars;
9948
9949	/ Copy into the writer. /
9950	InsertThousandsGrouping_fill(writer, &buffer_pos,
9951	digits, &digits_pos,
9952	n_chars, n_zeros,
9953	use_separator ? thousands_sep : NULL,
9954	thousands_sep_len, maxchar);
9955	}
9956	return count;
9957	}
9958
9959
9960	Py_ssize_t
9961	PyUnicode_Count(PyObject *str,
9962	PyObject *substr,
9963	Py_ssize_t start,
9964	Py_ssize_t end)
9965	{
9966	Py_ssize_t result;
9967	int kind1, kind2;
9968	const void buf1 = NULL, buf2 = NULL;
9969	Py_ssize_t len1, len2;
9970
9971	if (ensure_unicode(str) < `0` \|\| ensure_unicode(substr) < `0`)
9972	return -`1`;
9973
9974	kind1 = PyUnicode_KIND(str);
9975	kind2 = PyUnicode_KIND(substr);
9976	if (kind1 < kind2)
9977	return `0`;
9978
9979	len1 = PyUnicode_GET_LENGTH(str);
9980	len2 = PyUnicode_GET_LENGTH(substr);
9981	ADJUST_INDICES(start, end, len1);
9982	if (end - start < len2)
9983	return `0`;
9984
9985	buf1 = PyUnicode_DATA(str);
9986	buf2 = PyUnicode_DATA(substr);
9987	if (kind2 != kind1) {
9988	buf2 = unicode_askind(kind2, buf2, len2, kind1);
9989	if (!buf2)
9990	goto onError;
9991	}
9992
9993	switch (kind1) {
9994	case PyUnicode_1BYTE_KIND:
9995	if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9996	result = asciilib_count(
9997	((const Py_UCS1*)buf1) + start, end - start,
9998	buf2, len2, PY_SSIZE_T_MAX
9999	);
10000	else
10001	result = ucs1lib_count(
10002	((const Py_UCS1*)buf1) + start, end - start,
10003	buf2, len2, PY_SSIZE_T_MAX
10004	);
10005	break;
10006	case PyUnicode_2BYTE_KIND:
10007	result = ucs2lib_count(
10008	((const Py_UCS2*)buf1) + start, end - start,
10009	buf2, len2, PY_SSIZE_T_MAX
10010	);
10011	break;
10012	case PyUnicode_4BYTE_KIND:
10013	result = ucs4lib_count(
10014	((const Py_UCS4*)buf1) + start, end - start,
10015	buf2, len2, PY_SSIZE_T_MAX
10016	);
10017	break;
10018	default:
10019	Py_UNREACHABLE();
10020	}
10021
10022	assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10023	if (kind2 != kind1)
10024	PyMem_Free((void *)buf2);
10025
10026	return result;
10027	onError:
10028	assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10029	if (kind2 != kind1)
10030	PyMem_Free((void *)buf2);
10031	return -`1`;
10032	}
10033
10034	Py_ssize_t
10035	PyUnicode_Find(PyObject *str,
10036	PyObject *substr,
10037	Py_ssize_t start,
10038	Py_ssize_t end,
10039	int direction)
10040	{
10041	if (ensure_unicode(str) < `0` \|\| ensure_unicode(substr) < `0`)
10042	return -`2`;
10043
10044	return any_find_slice(str, substr, start, end, direction);
10045	}
10046
10047	Py_ssize_t
10048	PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
10049	Py_ssize_t start, Py_ssize_t end,
10050	int direction)
10051	{
10052	int kind;
10053	Py_ssize_t len, result;
10054	if (PyUnicode_READY(str) == -`1`)
10055	return -`2`;
10056	len = PyUnicode_GET_LENGTH(str);
10057	ADJUST_INDICES(start, end, len);
10058	if (end - start < `1`)
10059	return -`1`;
10060	kind = PyUnicode_KIND(str);
10061	result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10062	kind, end-start, ch, direction);
10063	if (result == -`1`)
10064	return -`1`;
10065	else
10066	return start + result;
10067	}
10068
10069	static int
10070	tailmatch(PyObject *self,
10071	PyObject *substring,
10072	Py_ssize_t start,
10073	Py_ssize_t end,
10074	int direction)
10075	{
10076	int kind_self;
10077	int kind_sub;
10078	const void *data_self;
10079	const void *data_sub;
10080	Py_ssize_t offset;
10081	Py_ssize_t i;
10082	Py_ssize_t end_sub;
10083
10084	if (PyUnicode_READY(self) == -`1` \|\|
10085	PyUnicode_READY(substring) == -`1`)
10086	return -`1`;
10087
10088	ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10089	end -= PyUnicode_GET_LENGTH(substring);
10090	if (end < start)
10091	return `0`;
10092
10093	if (PyUnicode_GET_LENGTH(substring) == `0`)
10094	return `1`;
10095
10096	kind_self = PyUnicode_KIND(self);
10097	data_self = PyUnicode_DATA(self);
10098	kind_sub = PyUnicode_KIND(substring);
10099	data_sub = PyUnicode_DATA(substring);
10100	end_sub = PyUnicode_GET_LENGTH(substring) - `1`;
10101
10102	if (direction > `0`)
10103	offset = end;
10104	else
10105	offset = start;
10106
10107	if (PyUnicode_READ(kind_self, data_self, offset) ==
10108	PyUnicode_READ(kind_sub, data_sub, `0`) &&
10109	PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10110	PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10111	/ If both are of the same kind, memcmp is sufficient /
10112	if (kind_self == kind_sub) {
10113	return ! memcmp((char *)data_self +
10114	(offset * PyUnicode_KIND(substring)),
10115	data_sub,
10116	PyUnicode_GET_LENGTH(substring) *
10117	PyUnicode_KIND(substring));
10118	}
10119	/ otherwise we have to compare each character by first accessing it /
10120	else {
10121	/ We do not need to compare 0 and len(substring)-1 because*
10122	the if statement above ensured already that they are equal
10123	when we end up here. /*
10124	for (i = `1`; i < end_sub; ++i) {
10125	if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10126	PyUnicode_READ(kind_sub, data_sub, i))
10127	return `0`;
10128	}
10129	return `1`;
10130	}
10131	}
10132
10133	return `0`;
10134	}
10135
10136	Py_ssize_t
10137	PyUnicode_Tailmatch(PyObject *str,
10138	PyObject *substr,
10139	Py_ssize_t start,
10140	Py_ssize_t end,
10141	int direction)
10142	{
10143	if (ensure_unicode(str) < `0` \|\| ensure_unicode(substr) < `0`)
10144	return -`1`;
10145
10146	return tailmatch(str, substr, start, end, direction);
10147	}
10148
10149	static PyObject *
10150	ascii_upper_or_lower(PyObject self, int* lower)
10151	{
10152	Py_ssize_t len = PyUnicode_GET_LENGTH(self);
10153	const char *data = PyUnicode_DATA(self);
10154	char *resdata;
10155	PyObject *res;
10156
10157	res = PyUnicode_New(len, `127`);
10158	if (res == NULL)
10159	return NULL;
10160	resdata = PyUnicode_DATA(res);
10161	if (lower)
10162	_Py_bytes_lower(resdata, data, len);
10163	else
10164	_Py_bytes_upper(resdata, data, len);
10165	return res;
10166	}
10167
10168	static Py_UCS4
10169	handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10170	{
10171	Py_ssize_t j;
10172	int final_sigma;
10173	Py_UCS4 c = `0`; / initialize to prevent gcc warning /
10174	/ U+03A3 is in the Final_Sigma context when, it is found like this:*
10175
10176	\p{cased}\p{case-ignorable}U+03A3!(\p{case-ignorable}\p{cased})
10177
10178	where ! is a negation and \p{xxx} is a character with property xxx.
10179	*/
10180	for (j = i - `1`; j >= `0`; j--) {
10181	c = PyUnicode_READ(kind, data, j);
10182	if (!_PyUnicode_IsCaseIgnorable(c))
10183	break;
10184	}
10185	final_sigma = j >= `0` && _PyUnicode_IsCased(c);
10186	if (final_sigma) {
10187	for (j = i + `1`; j < length; j++) {
10188	c = PyUnicode_READ(kind, data, j);
10189	if (!_PyUnicode_IsCaseIgnorable(c))
10190	break;
10191	}
10192	final_sigma = j == length \|\| !_PyUnicode_IsCased(c);
10193	}
10194	return (final_sigma) ? `0x3C2` : `0x3C3`;
10195	}
10196
10197	static int
10198	lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10199	Py_UCS4 c, Py_UCS4 *mapped)
10200	{
10201	/ Obscure special case. /
10202	if (c == `0x3A3`) {
10203	mapped[`0`] = handle_capital_sigma(kind, data, length, i);
10204	return `1`;
10205	}
10206	return _PyUnicode_ToLowerFull(c, mapped);
10207	}
10208
10209	static Py_ssize_t
10210	do_capitalize(int kind, const void data, Py_ssize_t length, Py_UCS4 res, Py_UCS4 *maxchar)
10211	{
10212	Py_ssize_t i, k = `0`;
10213	int n_res, j;
10214	Py_UCS4 c, mapped[`3`];
10215
10216	c = PyUnicode_READ(kind, data, `0`);
10217	n_res = _PyUnicode_ToTitleFull(c, mapped);
10218	for (j = `0`; j < n_res; j++) {
10219	maxchar = Py_MAX(maxchar, mapped[j]);
10220	res[k++] = mapped[j];
10221	}
10222	for (i = `1`; i < length; i++) {
10223	c = PyUnicode_READ(kind, data, i);
10224	n_res = lower_ucs4(kind, data, length, i, c, mapped);
10225	for (j = `0`; j < n_res; j++) {
10226	maxchar = Py_MAX(maxchar, mapped[j]);
10227	res[k++] = mapped[j];
10228	}
10229	}
10230	return k;
10231	}
10232
10233	static Py_ssize_t
10234	do_swapcase(int kind, const void data, Py_ssize_t length, Py_UCS4 res, Py_UCS4 *maxchar) {
10235	Py_ssize_t i, k = `0`;
10236
10237	for (i = `0`; i < length; i++) {
10238	Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[`3`];
10239	int n_res, j;
10240	if (Py_UNICODE_ISUPPER(c)) {
10241	n_res = lower_ucs4(kind, data, length, i, c, mapped);
10242	}
10243	else if (Py_UNICODE_ISLOWER(c)) {
10244	n_res = _PyUnicode_ToUpperFull(c, mapped);
10245	}
10246	else {
10247	n_res = `1`;
10248	mapped[`0`] = c;
10249	}
10250	for (j = `0`; j < n_res; j++) {
10251	maxchar = Py_MAX(maxchar, mapped[j]);
10252	res[k++] = mapped[j];
10253	}
10254	}
10255	return k;
10256	}
10257
10258	static Py_ssize_t
10259	do_upper_or_lower(int kind, const void data, Py_ssize_t length, Py_UCS4 res,
10260	Py_UCS4 maxchar, int* lower)
10261	{
10262	Py_ssize_t i, k = `0`;
10263
10264	for (i = `0`; i < length; i++) {
10265	Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[`3`];
10266	int n_res, j;
10267	if (lower)
10268	n_res = lower_ucs4(kind, data, length, i, c, mapped);
10269	else
10270	n_res = _PyUnicode_ToUpperFull(c, mapped);
10271	for (j = `0`; j < n_res; j++) {
10272	maxchar = Py_MAX(maxchar, mapped[j]);
10273	res[k++] = mapped[j];
10274	}
10275	}
10276	return k;
10277	}
10278
10279	static Py_ssize_t
10280	do_upper(int kind, const void data, Py_ssize_t length, Py_UCS4 res, Py_UCS4 *maxchar)
10281	{
10282	return do_upper_or_lower(kind, data, length, res, maxchar, `0`);
10283	}
10284
10285	static Py_ssize_t
10286	do_lower(int kind, const void data, Py_ssize_t length, Py_UCS4 res, Py_UCS4 *maxchar)
10287	{
10288	return do_upper_or_lower(kind, data, length, res, maxchar, `1`);
10289	}
10290
10291	static Py_ssize_t
10292	do_casefold(int kind, const void data, Py_ssize_t length, Py_UCS4 res, Py_UCS4 *maxchar)
10293	{
10294	Py_ssize_t i, k = `0`;
10295
10296	for (i = `0`; i < length; i++) {
10297	Py_UCS4 c = PyUnicode_READ(kind, data, i);
10298	Py_UCS4 mapped[`3`];
10299	int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10300	for (j = `0`; j < n_res; j++) {
10301	maxchar = Py_MAX(maxchar, mapped[j]);
10302	res[k++] = mapped[j];
10303	}
10304	}
10305	return k;
10306	}
10307
10308	static Py_ssize_t
10309	do_title(int kind, const void data, Py_ssize_t length, Py_UCS4 res, Py_UCS4 *maxchar)
10310	{
10311	Py_ssize_t i, k = `0`;
10312	int previous_is_cased;
10313
10314	previous_is_cased = `0`;
10315	for (i = `0`; i < length; i++) {
10316	const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10317	Py_UCS4 mapped[`3`];
10318	int n_res, j;
10319
10320	if (previous_is_cased)
10321	n_res = lower_ucs4(kind, data, length, i, c, mapped);
10322	else
10323	n_res = _PyUnicode_ToTitleFull(c, mapped);
10324
10325	for (j = `0`; j < n_res; j++) {
10326	maxchar = Py_MAX(maxchar, mapped[j]);
10327	res[k++] = mapped[j];
10328	}
10329
10330	previous_is_cased = _PyUnicode_IsCased(c);
10331	}
10332	return k;
10333	}
10334
10335	static PyObject *
10336	case_operation(PyObject *self,
10337	Py_ssize_t (perform)(int, const* void , Py_ssize_t, Py_UCS4 , Py_UCS4 *))
10338	{
10339	PyObject *res = NULL;
10340	Py_ssize_t length, newlength = `0`;
10341	int kind, outkind;
10342	const void *data;
10343	void *outdata;
10344	Py_UCS4 maxchar = `0`, tmp, tmpend;
10345
10346	assert(PyUnicode_IS_READY(self));
10347
10348	kind = PyUnicode_KIND(self);
10349	data = PyUnicode_DATA(self);
10350	length = PyUnicode_GET_LENGTH(self);
10351	if ((size_t) length > PY_SSIZE_T_MAX / (`3` * sizeof(Py_UCS4))) {
10352	PyErr_SetString(PyExc_OverflowError, "string is too long");
10353	return NULL;
10354	}
10355	tmp = PyMem_Malloc(sizeof(Py_UCS4) * `3` * length);
10356	if (tmp == NULL)
10357	return PyErr_NoMemory();
10358	newlength = perform(kind, data, length, tmp, &maxchar);
10359	res = PyUnicode_New(newlength, maxchar);
10360	if (res == NULL)
10361	goto leave;
10362	tmpend = tmp + newlength;
10363	outdata = PyUnicode_DATA(res);
10364	outkind = PyUnicode_KIND(res);
10365	switch (outkind) {
10366	case PyUnicode_1BYTE_KIND:
10367	_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10368	break;
10369	case PyUnicode_2BYTE_KIND:
10370	_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10371	break;
10372	case PyUnicode_4BYTE_KIND:
10373	memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10374	break;
10375	default:
10376	Py_UNREACHABLE();
10377	}
10378	leave:
10379	PyMem_Free(tmp);
10380	return res;
10381	}
10382
10383	PyObject *
10384	PyUnicode_Join(PyObject separator, PyObject seq)
10385	{
10386	PyObject *res;
10387	PyObject *fseq;
10388	Py_ssize_t seqlen;
10389	PyObject **items;
10390
10391	fseq = PySequence_Fast(seq, "can only join an iterable");
10392	if (fseq == NULL) {
10393	return NULL;
10394	}
10395
10396	/ NOTE: the following code can't call back into Python code,*
10397	* so we are sure that fseq won't be mutated.
10398	*/
10399
10400	items = PySequence_Fast_ITEMS(fseq);
10401	seqlen = PySequence_Fast_GET_SIZE(fseq);
10402	res = _PyUnicode_JoinArray(separator, items, seqlen);
10403	Py_DECREF(fseq);
10404	return res;
10405	}
10406
10407	PyObject *
10408	_PyUnicode_JoinArray(PyObject separator, PyObject const *items, Py_ssize_t seqlen)
10409	{
10410	PyObject res = NULL; /* the result /
10411	PyObject *sep = NULL;
10412	Py_ssize_t seplen;
10413	PyObject *item;
10414	Py_ssize_t sz, i, res_offset;
10415	Py_UCS4 maxchar;
10416	Py_UCS4 item_maxchar;
10417	int use_memcpy;
10418	unsigned char res_data = NULL, sep_data = NULL;
10419	PyObject *last_obj;
10420	unsigned int kind = `0`;
10421
10422	/ If empty sequence, return u"". /
10423	if (seqlen == `0`) {
10424	_Py_RETURN_UNICODE_EMPTY();
10425	}
10426
10427	/ If singleton sequence with an exact Unicode, return that. /
10428	last_obj = NULL;
10429	if (seqlen == `1`) {
10430	if (PyUnicode_CheckExact(items[`0`])) {
10431	res = items[`0`];
10432	Py_INCREF(res);
10433	return res;
10434	}
10435	seplen = `0`;
10436	maxchar = `0`;
10437	}
10438	else {
10439	/ Set up sep and seplen /
10440	if (separator == NULL) {
10441	/ fall back to a blank space separator /
10442	sep = PyUnicode_FromOrdinal(`' '`);
10443	if (!sep)
10444	goto onError;
10445	seplen = `1`;
10446	maxchar = `32`;
10447	}
10448	else {
10449	if (!PyUnicode_Check(separator)) {
10450	PyErr_Format(PyExc_TypeError,
10451	"separator: expected str instance,"
10452	" %.80s found",
10453	Py_TYPE(separator)->tp_name);
10454	goto onError;
10455	}
10456	if (PyUnicode_READY(separator))
10457	goto onError;
10458	sep = separator;
10459	seplen = PyUnicode_GET_LENGTH(separator);
10460	maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10461	/ inc refcount to keep this code path symmetric with the*
10462	above case of a blank separator /*
10463	Py_INCREF(sep);
10464	}
10465	last_obj = sep;
10466	}
10467
10468	/ There are at least two things to join, or else we have a subclass*
10469	* of str in the sequence.
10470	* Do a pre-pass to figure out the total amount of space we'll
10471	* need (sz), and see whether all argument are strings.
10472	*/
10473	sz = `0`;
10474	#ifdef Py_DEBUG
10475	use_memcpy = `0`;
10476	#else
10477	use_memcpy = `1`;
10478	#endif
10479	for (i = `0`; i < seqlen; i++) {
10480	size_t add_sz;
10481	item = items[i];
10482	if (!PyUnicode_Check(item)) {
10483	PyErr_Format(PyExc_TypeError,
10484	"sequence item %zd: expected str instance,"
10485	" %.80s found",
10486	i, Py_TYPE(item)->tp_name);
10487	goto onError;
10488	}
10489	if (PyUnicode_READY(item) == -`1`)
10490	goto onError;
10491	add_sz = PyUnicode_GET_LENGTH(item);
10492	item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10493	maxchar = Py_MAX(maxchar, item_maxchar);
10494	if (i != `0`) {
10495	add_sz += seplen;
10496	}
10497	if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10498	PyErr_SetString(PyExc_OverflowError,
10499	"join() result is too long for a Python string");
10500	goto onError;
10501	}
10502	sz += add_sz;
10503	if (use_memcpy && last_obj != NULL) {
10504	if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10505	use_memcpy = `0`;
10506	}
10507	last_obj = item;
10508	}
10509
10510	res = PyUnicode_New(sz, maxchar);
10511	if (res == NULL)
10512	goto onError;
10513
10514	/ Catenate everything. /
10515	#ifdef Py_DEBUG
10516	use_memcpy = `0`;
10517	#else
10518	if (use_memcpy) {
10519	res_data = PyUnicode_1BYTE_DATA(res);
10520	kind = PyUnicode_KIND(res);
10521	if (seplen != `0`)
10522	sep_data = PyUnicode_1BYTE_DATA(sep);
10523	}
10524	#endif
10525	if (use_memcpy) {
10526	for (i = `0`; i < seqlen; ++i) {
10527	Py_ssize_t itemlen;
10528	item = items[i];
10529
10530	/ Copy item, and maybe the separator. /
10531	if (i && seplen != `0`) {
10532	memcpy(res_data,
10533	sep_data,
10534	kind * seplen);
10535	res_data += kind * seplen;
10536	}
10537
10538	itemlen = PyUnicode_GET_LENGTH(item);
10539	if (itemlen != `0`) {
10540	memcpy(res_data,
10541	PyUnicode_DATA(item),
10542	kind * itemlen);
10543	res_data += kind * itemlen;
10544	}
10545	}
10546	assert(res_data == PyUnicode_1BYTE_DATA(res)
10547	+ kind * PyUnicode_GET_LENGTH(res));
10548	}
10549	else {
10550	for (i = `0`, res_offset = `0`; i < seqlen; ++i) {
10551	Py_ssize_t itemlen;
10552	item = items[i];
10553
10554	/ Copy item, and maybe the separator. /
10555	if (i && seplen != `0`) {
10556	_PyUnicode_FastCopyCharacters(res, res_offset, sep, `0`, seplen);
10557	res_offset += seplen;
10558	}
10559
10560	itemlen = PyUnicode_GET_LENGTH(item);
10561	if (itemlen != `0`) {
10562	_PyUnicode_FastCopyCharacters(res, res_offset, item, `0`, itemlen);
10563	res_offset += itemlen;
10564	}
10565	}
10566	assert(res_offset == PyUnicode_GET_LENGTH(res));
10567	}
10568
10569	Py_XDECREF(sep);
10570	assert(_PyUnicode_CheckConsistency(res, `1`));
10571	return res;
10572
10573	onError:
10574	Py_XDECREF(sep);
10575	Py_XDECREF(res);
10576	return NULL;
10577	}
10578
10579	void
10580	_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10581	Py_UCS4 fill_char)
10582	{
10583	const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10584	void *data = PyUnicode_DATA(unicode);
10585	assert(PyUnicode_IS_READY(unicode));
10586	assert(unicode_modifiable(unicode));
10587	assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10588	assert(start >= `0`);
10589	assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10590	unicode_fill(kind, data, fill_char, start, length);
10591	}
10592
10593	Py_ssize_t
10594	PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10595	Py_UCS4 fill_char)
10596	{
10597	Py_ssize_t maxlen;
10598
10599	if (!PyUnicode_Check(unicode)) {
10600	PyErr_BadInternalCall();
10601	return -`1`;
10602	}
10603	if (PyUnicode_READY(unicode) == -`1`)
10604	return -`1`;
10605	if (unicode_check_modifiable(unicode))
10606	return -`1`;
10607
10608	if (start < `0`) {
10609	PyErr_SetString(PyExc_IndexError, "string index out of range");
10610	return -`1`;
10611	}
10612	if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10613	PyErr_SetString(PyExc_ValueError,
10614	"fill character is bigger than "
10615	"the string maximum character");
10616	return -`1`;
10617	}
10618
10619	maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10620	length = Py_MIN(maxlen, length);
10621	if (length <= `0`)
10622	return `0`;
10623
10624	_PyUnicode_FastFill(unicode, start, length, fill_char);
10625	return length;
10626	}
10627
10628	static PyObject *
10629	pad(PyObject *self,
10630	Py_ssize_t left,
10631	Py_ssize_t right,
10632	Py_UCS4 fill)
10633	{
10634	PyObject *u;
10635	Py_UCS4 maxchar;
10636	int kind;
10637	void *data;
10638
10639	if (left < `0`)
10640	left = `0`;
10641	if (right < `0`)
10642	right = `0`;
10643
10644	if (left == `0` && right == `0`)
10645	return unicode_result_unchanged(self);
10646
10647	if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) \|\|
10648	right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10649	PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10650	return NULL;
10651	}
10652	maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10653	maxchar = Py_MAX(maxchar, fill);
10654	u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10655	if (!u)
10656	return NULL;
10657
10658	kind = PyUnicode_KIND(u);
10659	data = PyUnicode_DATA(u);
10660	if (left)
10661	unicode_fill(kind, data, fill, `0`, left);
10662	if (right)
10663	unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10664	_PyUnicode_FastCopyCharacters(u, left, self, `0`, _PyUnicode_LENGTH(self));
10665	assert(_PyUnicode_CheckConsistency(u, `1`));
10666	return u;
10667	}
10668
10669	PyObject *
10670	PyUnicode_Splitlines(PyObject string, int* keepends)
10671	{
10672	PyObject *list;
10673
10674	if (ensure_unicode(string) < `0`)
10675	return NULL;
10676
10677	switch (PyUnicode_KIND(string)) {
10678	case PyUnicode_1BYTE_KIND:
10679	if (PyUnicode_IS_ASCII(string))
10680	list = asciilib_splitlines(
10681	string, PyUnicode_1BYTE_DATA(string),
10682	PyUnicode_GET_LENGTH(string), keepends);
10683	else
10684	list = ucs1lib_splitlines(
10685	string, PyUnicode_1BYTE_DATA(string),
10686	PyUnicode_GET_LENGTH(string), keepends);
10687	break;
10688	case PyUnicode_2BYTE_KIND:
10689	list = ucs2lib_splitlines(
10690	string, PyUnicode_2BYTE_DATA(string),
10691	PyUnicode_GET_LENGTH(string), keepends);
10692	break;
10693	case PyUnicode_4BYTE_KIND:
10694	list = ucs4lib_splitlines(
10695	string, PyUnicode_4BYTE_DATA(string),
10696	PyUnicode_GET_LENGTH(string), keepends);
10697	break;
10698	default:
10699	Py_UNREACHABLE();
10700	}
10701	return list;
10702	}
10703
10704	static PyObject *
10705	split(PyObject *self,
10706	PyObject *substring,
10707	Py_ssize_t maxcount)
10708	{
10709	int kind1, kind2;
10710	const void buf1, buf2;
10711	Py_ssize_t len1, len2;
10712	PyObject* out;
10713
10714	if (maxcount < `0`)
10715	maxcount = PY_SSIZE_T_MAX;
10716
10717	if (PyUnicode_READY(self) == -`1`)
10718	return NULL;
10719
10720	if (substring == NULL)
10721	switch (PyUnicode_KIND(self)) {
10722	case PyUnicode_1BYTE_KIND:
10723	if (PyUnicode_IS_ASCII(self))
10724	return asciilib_split_whitespace(
10725	self, PyUnicode_1BYTE_DATA(self),
10726	PyUnicode_GET_LENGTH(self), maxcount
10727	);
10728	else
10729	return ucs1lib_split_whitespace(
10730	self, PyUnicode_1BYTE_DATA(self),
10731	PyUnicode_GET_LENGTH(self), maxcount
10732	);
10733	case PyUnicode_2BYTE_KIND:
10734	return ucs2lib_split_whitespace(
10735	self, PyUnicode_2BYTE_DATA(self),
10736	PyUnicode_GET_LENGTH(self), maxcount
10737	);
10738	case PyUnicode_4BYTE_KIND:
10739	return ucs4lib_split_whitespace(
10740	self, PyUnicode_4BYTE_DATA(self),
10741	PyUnicode_GET_LENGTH(self), maxcount
10742	);
10743	default:
10744	Py_UNREACHABLE();
10745	}
10746
10747	if (PyUnicode_READY(substring) == -`1`)
10748	return NULL;
10749
10750	kind1 = PyUnicode_KIND(self);
10751	kind2 = PyUnicode_KIND(substring);
10752	len1 = PyUnicode_GET_LENGTH(self);
10753	len2 = PyUnicode_GET_LENGTH(substring);
10754	if (kind1 < kind2 \|\| len1 < len2) {
10755	out = PyList_New(`1`);
10756	if (out == NULL)
10757	return NULL;
10758	Py_INCREF(self);
10759	PyList_SET_ITEM(out, `0`, self);
10760	return out;
10761	}
10762	buf1 = PyUnicode_DATA(self);
10763	buf2 = PyUnicode_DATA(substring);
10764	if (kind2 != kind1) {
10765	buf2 = unicode_askind(kind2, buf2, len2, kind1);
10766	if (!buf2)
10767	return NULL;
10768	}
10769
10770	switch (kind1) {
10771	case PyUnicode_1BYTE_KIND:
10772	if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10773	out = asciilib_split(
10774	self, buf1, len1, buf2, len2, maxcount);
10775	else
10776	out = ucs1lib_split(
10777	self, buf1, len1, buf2, len2, maxcount);
10778	break;
10779	case PyUnicode_2BYTE_KIND:
10780	out = ucs2lib_split(
10781	self, buf1, len1, buf2, len2, maxcount);
10782	break;
10783	case PyUnicode_4BYTE_KIND:
10784	out = ucs4lib_split(
10785	self, buf1, len1, buf2, len2, maxcount);
10786	break;
10787	default:
10788	out = NULL;
10789	}
10790	assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10791	if (kind2 != kind1)
10792	PyMem_Free((void *)buf2);
10793	return out;
10794	}
10795
10796	static PyObject *
10797	rsplit(PyObject *self,
10798	PyObject *substring,
10799	Py_ssize_t maxcount)
10800	{
10801	int kind1, kind2;
10802	const void buf1, buf2;
10803	Py_ssize_t len1, len2;
10804	PyObject* out;
10805
10806	if (maxcount < `0`)
10807	maxcount = PY_SSIZE_T_MAX;
10808
10809	if (PyUnicode_READY(self) == -`1`)
10810	return NULL;
10811
10812	if (substring == NULL)
10813	switch (PyUnicode_KIND(self)) {
10814	case PyUnicode_1BYTE_KIND:
10815	if (PyUnicode_IS_ASCII(self))
10816	return asciilib_rsplit_whitespace(
10817	self, PyUnicode_1BYTE_DATA(self),
10818	PyUnicode_GET_LENGTH(self), maxcount
10819	);
10820	else
10821	return ucs1lib_rsplit_whitespace(
10822	self, PyUnicode_1BYTE_DATA(self),
10823	PyUnicode_GET_LENGTH(self), maxcount
10824	);
10825	case PyUnicode_2BYTE_KIND:
10826	return ucs2lib_rsplit_whitespace(
10827	self, PyUnicode_2BYTE_DATA(self),
10828	PyUnicode_GET_LENGTH(self), maxcount
10829	);
10830	case PyUnicode_4BYTE_KIND:
10831	return ucs4lib_rsplit_whitespace(
10832	self, PyUnicode_4BYTE_DATA(self),
10833	PyUnicode_GET_LENGTH(self), maxcount
10834	);
10835	default:
10836	Py_UNREACHABLE();
10837	}
10838
10839	if (PyUnicode_READY(substring) == -`1`)
10840	return NULL;
10841
10842	kind1 = PyUnicode_KIND(self);
10843	kind2 = PyUnicode_KIND(substring);
10844	len1 = PyUnicode_GET_LENGTH(self);
10845	len2 = PyUnicode_GET_LENGTH(substring);
10846	if (kind1 < kind2 \|\| len1 < len2) {
10847	out = PyList_New(`1`);
10848	if (out == NULL)
10849	return NULL;
10850	Py_INCREF(self);
10851	PyList_SET_ITEM(out, `0`, self);
10852	return out;
10853	}
10854	buf1 = PyUnicode_DATA(self);
10855	buf2 = PyUnicode_DATA(substring);
10856	if (kind2 != kind1) {
10857	buf2 = unicode_askind(kind2, buf2, len2, kind1);
10858	if (!buf2)
10859	return NULL;
10860	}
10861
10862	switch (kind1) {
10863	case PyUnicode_1BYTE_KIND:
10864	if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10865	out = asciilib_rsplit(
10866	self, buf1, len1, buf2, len2, maxcount);
10867	else
10868	out = ucs1lib_rsplit(
10869	self, buf1, len1, buf2, len2, maxcount);
10870	break;
10871	case PyUnicode_2BYTE_KIND:
10872	out = ucs2lib_rsplit(
10873	self, buf1, len1, buf2, len2, maxcount);
10874	break;
10875	case PyUnicode_4BYTE_KIND:
10876	out = ucs4lib_rsplit(
10877	self, buf1, len1, buf2, len2, maxcount);
10878	break;
10879	default:
10880	out = NULL;
10881	}
10882	assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10883	if (kind2 != kind1)
10884	PyMem_Free((void *)buf2);
10885	return out;
10886	}
10887
10888	static Py_ssize_t
10889	anylib_find(int kind, PyObject str1, const* void *buf1, Py_ssize_t len1,
10890	PyObject str2, const* void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10891	{
10892	switch (kind) {
10893	case PyUnicode_1BYTE_KIND:
10894	if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10895	return asciilib_find(buf1, len1, buf2, len2, offset);
10896	else
10897	return ucs1lib_find(buf1, len1, buf2, len2, offset);
10898	case PyUnicode_2BYTE_KIND:
10899	return ucs2lib_find(buf1, len1, buf2, len2, offset);
10900	case PyUnicode_4BYTE_KIND:
10901	return ucs4lib_find(buf1, len1, buf2, len2, offset);
10902	}
10903	Py_UNREACHABLE();
10904	}
10905
10906	static Py_ssize_t
10907	anylib_count(int kind, PyObject sstr, const* void* sbuf, Py_ssize_t slen,
10908	PyObject str1, const* void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10909	{
10910	switch (kind) {
10911	case PyUnicode_1BYTE_KIND:
10912	if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10913	return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10914	else
10915	return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10916	case PyUnicode_2BYTE_KIND:
10917	return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10918	case PyUnicode_4BYTE_KIND:
10919	return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10920	}
10921	Py_UNREACHABLE();
10922	}
10923
10924	static void
10925	replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10926	Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10927	{
10928	int kind = PyUnicode_KIND(u);
10929	void *data = PyUnicode_DATA(u);
10930	Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10931	if (kind == PyUnicode_1BYTE_KIND) {
10932	ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10933	(Py_UCS1 *)data + len,
10934	u1, u2, maxcount);
10935	}
10936	else if (kind == PyUnicode_2BYTE_KIND) {
10937	ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10938	(Py_UCS2 *)data + len,
10939	u1, u2, maxcount);
10940	}
10941	else {
10942	assert(kind == PyUnicode_4BYTE_KIND);
10943	ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10944	(Py_UCS4 *)data + len,
10945	u1, u2, maxcount);
10946	}
10947	}
10948
10949	static PyObject *
10950	replace(PyObject self, PyObject str1,
10951	PyObject *str2, Py_ssize_t maxcount)
10952	{
10953	PyObject *u;
10954	const char *sbuf = PyUnicode_DATA(self);
10955	const void *buf1 = PyUnicode_DATA(str1);
10956	const void *buf2 = PyUnicode_DATA(str2);
10957	int srelease = `0`, release1 = `0`, release2 = `0`;
10958	int skind = PyUnicode_KIND(self);
10959	int kind1 = PyUnicode_KIND(str1);
10960	int kind2 = PyUnicode_KIND(str2);
10961	Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10962	Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10963	Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10964	int mayshrink;
10965	Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10966
10967	if (slen < len1)
10968	goto nothing;
10969
10970	if (maxcount < `0`)
10971	maxcount = PY_SSIZE_T_MAX;
10972	else if (maxcount == `0`)
10973	goto nothing;
10974
10975	if (str1 == str2)
10976	goto nothing;
10977
10978	maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10979	maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10980	if (maxchar < maxchar_str1)
10981	/ substring too wide to be present /
10982	goto nothing;
10983	maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10984	/ Replacing str1 with str2 may cause a maxchar reduction in the*
10985	result string. /*
10986	mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10987	maxchar = Py_MAX(maxchar, maxchar_str2);
10988
10989	if (len1 == len2) {
10990	/ same length /
10991	if (len1 == `0`)
10992	goto nothing;
10993	if (len1 == `1`) {
10994	/ replace characters /
10995	Py_UCS4 u1, u2;
10996	Py_ssize_t pos;
10997
10998	u1 = PyUnicode_READ(kind1, buf1, `0`);
10999	pos = findchar(sbuf, skind, slen, u1, `1`);
11000	if (pos < `0`)
11001	goto nothing;
11002	u2 = PyUnicode_READ(kind2, buf2, `0`);
11003	u = PyUnicode_New(slen, maxchar);
11004	if (!u)
11005	goto error;
11006
11007	_PyUnicode_FastCopyCharacters(u, `0`, self, `0`, slen);
11008	replace_1char_inplace(u, pos, u1, u2, maxcount);
11009	}
11010	else {
11011	int rkind = skind;
11012	char *res;
11013	Py_ssize_t i;
11014
11015	if (kind1 < rkind) {
11016	/ widen substring /
11017	buf1 = unicode_askind(kind1, buf1, len1, rkind);
11018	if (!buf1) goto error;
11019	release1 = `1`;
11020	}
11021	i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, `0`);
11022	if (i < `0`)
11023	goto nothing;
11024	if (rkind > kind2) {
11025	/ widen replacement /
11026	buf2 = unicode_askind(kind2, buf2, len2, rkind);
11027	if (!buf2) goto error;
11028	release2 = `1`;
11029	}
11030	else if (rkind < kind2) {
11031	/ widen self and buf1 /
11032	rkind = kind2;
11033	if (release1) {
11034	assert(buf1 != PyUnicode_DATA(str1));
11035	PyMem_Free((void *)buf1);
11036	buf1 = PyUnicode_DATA(str1);
11037	release1 = `0`;
11038	}
11039	sbuf = unicode_askind(skind, sbuf, slen, rkind);
11040	if (!sbuf) goto error;
11041	srelease = `1`;
11042	buf1 = unicode_askind(kind1, buf1, len1, rkind);
11043	if (!buf1) goto error;
11044	release1 = `1`;
11045	}
11046	u = PyUnicode_New(slen, maxchar);
11047	if (!u)
11048	goto error;
11049	assert(PyUnicode_KIND(u) == rkind);
11050	res = PyUnicode_DATA(u);
11051
11052	memcpy(res, sbuf, rkind * slen);
11053	/ change everything in-place, starting with this one /
11054	memcpy(res + rkind * i,
11055	buf2,
11056	rkind * len2);
11057	i += len1;
11058
11059	while ( --maxcount > `0`) {
11060	i = anylib_find(rkind, self,
11061	sbuf+rkind*i, slen-i,
11062	str1, buf1, len1, i);
11063	if (i == -`1`)
11064	break;
11065	memcpy(res + rkind * i,
11066	buf2,
11067	rkind * len2);
11068	i += len1;
11069	}
11070	}
11071	}
11072	else {
11073	Py_ssize_t n, i, j, ires;
11074	Py_ssize_t new_size;
11075	int rkind = skind;
11076	char *res;
11077
11078	if (kind1 < rkind) {
11079	/ widen substring /
11080	buf1 = unicode_askind(kind1, buf1, len1, rkind);
11081	if (!buf1) goto error;
11082	release1 = `1`;
11083	}
11084	n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
11085	if (n == `0`)
11086	goto nothing;
11087	if (kind2 < rkind) {
11088	/ widen replacement /
11089	buf2 = unicode_askind(kind2, buf2, len2, rkind);
11090	if (!buf2) goto error;
11091	release2 = `1`;
11092	}
11093	else if (kind2 > rkind) {
11094	/ widen self and buf1 /
11095	rkind = kind2;
11096	sbuf = unicode_askind(skind, sbuf, slen, rkind);
11097	if (!sbuf) goto error;
11098	srelease = `1`;
11099	if (release1) {
11100	assert(buf1 != PyUnicode_DATA(str1));
11101	PyMem_Free((void *)buf1);
11102	buf1 = PyUnicode_DATA(str1);
11103	release1 = `0`;
11104	}
11105	buf1 = unicode_askind(kind1, buf1, len1, rkind);
11106	if (!buf1) goto error;
11107	release1 = `1`;
11108	}
11109	/ new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -*
11110	PyUnicode_GET_LENGTH(str1)); /*
11111	if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
11112	PyErr_SetString(PyExc_OverflowError,
11113	"replace string is too long");
11114	goto error;
11115	}
11116	new_size = slen + n * (len2 - len1);
11117	if (new_size == `0`) {
11118	u = unicode_new_empty();
11119	goto done;
11120	}
11121	if (new_size > (PY_SSIZE_T_MAX / rkind)) {
11122	PyErr_SetString(PyExc_OverflowError,
11123	"replace string is too long");
11124	goto error;
11125	}
11126	u = PyUnicode_New(new_size, maxchar);
11127	if (!u)
11128	goto error;
11129	assert(PyUnicode_KIND(u) == rkind);
11130	res = PyUnicode_DATA(u);
11131	ires = i = `0`;
11132	if (len1 > `0`) {
11133	while (n-- > `0`) {
11134	/ look for next match /
11135	j = anylib_find(rkind, self,
11136	sbuf + rkind * i, slen-i,
11137	str1, buf1, len1, i);
11138	if (j == -`1`)
11139	break;
11140	else if (j > i) {
11141	/ copy unchanged part [i:j] /
11142	memcpy(res + rkind * ires,
11143	sbuf + rkind * i,
11144	rkind * (j-i));
11145	ires += j - i;
11146	}
11147	/ copy substitution string /
11148	if (len2 > `0`) {
11149	memcpy(res + rkind * ires,
11150	buf2,
11151	rkind * len2);
11152	ires += len2;
11153	}
11154	i = j + len1;
11155	}
11156	if (i < slen)
11157	/ copy tail [i:] /
11158	memcpy(res + rkind * ires,
11159	sbuf + rkind * i,
11160	rkind * (slen-i));
11161	}
11162	else {
11163	/ interleave /
11164	while (n > `0`) {
11165	memcpy(res + rkind * ires,
11166	buf2,
11167	rkind * len2);
11168	ires += len2;
11169	if (--n <= `0`)
11170	break;
11171	memcpy(res + rkind * ires,
11172	sbuf + rkind * i,
11173	rkind);
11174	ires++;
11175	i++;
11176	}
11177	memcpy(res + rkind * ires,
11178	sbuf + rkind * i,
11179	rkind * (slen-i));
11180	}
11181	}
11182
11183	if (mayshrink) {
11184	unicode_adjust_maxchar(&u);
11185	if (u == NULL)
11186	goto error;
11187	}
11188
11189	done:
11190	assert(srelease == (sbuf != PyUnicode_DATA(self)));
11191	assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11192	assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11193	if (srelease)
11194	PyMem_Free((void *)sbuf);
11195	if (release1)
11196	PyMem_Free((void *)buf1);
11197	if (release2)
11198	PyMem_Free((void *)buf2);
11199	assert(_PyUnicode_CheckConsistency(u, `1`));
11200	return u;
11201
11202	nothing:
11203	/ nothing to replace; return original string (when possible) /
11204	assert(srelease == (sbuf != PyUnicode_DATA(self)));
11205	assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11206	assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11207	if (srelease)
11208	PyMem_Free((void *)sbuf);
11209	if (release1)
11210	PyMem_Free((void *)buf1);
11211	if (release2)
11212	PyMem_Free((void *)buf2);
11213	return unicode_result_unchanged(self);
11214
11215	error:
11216	assert(srelease == (sbuf != PyUnicode_DATA(self)));
11217	assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11218	assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11219	if (srelease)
11220	PyMem_Free((void *)sbuf);
11221	if (release1)
11222	PyMem_Free((void *)buf1);
11223	if (release2)
11224	PyMem_Free((void *)buf2);
11225	return NULL;
11226	}
11227
/* --- Unicode Object Methods --------------------------------------------- */
11229
11230	/[clinic input]*
11231	str.title as unicode_title
11232
11233	Return a version of the string where each word is titlecased.
11234
11235	More specifically, words start with uppercased characters and all remaining
11236	cased characters have lower case.
11237	[clinic start generated code]/*
11238
11239	static PyObject *
11240	unicode_title_impl(PyObject *self)
11241	/[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]/
11242	{
11243	if (PyUnicode_READY(self) == -`1`)
11244	return NULL;
11245	return case_operation(self, do_title);
11246	}
11247
11248	/[clinic input]*
11249	str.capitalize as unicode_capitalize
11250
11251	Return a capitalized version of the string.
11252
11253	More specifically, make the first character have upper case and the rest lower
11254	case.
11255	[clinic start generated code]/*
11256
11257	static PyObject *
11258	unicode_capitalize_impl(PyObject *self)
11259	/[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]/
11260	{
11261	if (PyUnicode_READY(self) == -`1`)
11262	return NULL;
11263	if (PyUnicode_GET_LENGTH(self) == `0`)
11264	return unicode_result_unchanged(self);
11265	return case_operation(self, do_capitalize);
11266	}
11267
11268	/[clinic input]*
11269	str.casefold as unicode_casefold
11270
11271	Return a version of the string suitable for caseless comparisons.
11272	[clinic start generated code]/*
11273
11274	static PyObject *
11275	unicode_casefold_impl(PyObject *self)
11276	/[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]/
11277	{
11278	if (PyUnicode_READY(self) == -`1`)
11279	return NULL;
11280	if (PyUnicode_IS_ASCII(self))
11281	return ascii_upper_or_lower(self, `1`);
11282	return case_operation(self, do_casefold);
11283	}
11284
11285
11286	/ Argument converter. Accepts a single Unicode character. /
11287
11288	static int
11289	convert_uc(PyObject obj, void* *addr)
11290	{
11291	Py_UCS4 fillcharloc = (Py_UCS4 )addr;
11292
11293	if (!PyUnicode_Check(obj)) {
11294	PyErr_Format(PyExc_TypeError,
11295	"The fill character must be a unicode character, "
11296	"not %.100s", Py_TYPE(obj)->tp_name);
11297	return `0`;
11298	}
11299	if (PyUnicode_READY(obj) < `0`)
11300	return `0`;
11301	if (PyUnicode_GET_LENGTH(obj) != `1`) {
11302	PyErr_SetString(PyExc_TypeError,
11303	"The fill character must be exactly one character long");
11304	return `0`;
11305	}
11306	*fillcharloc = PyUnicode_READ_CHAR(obj, `0`);
11307	return `1`;
11308	}
11309
11310	/[clinic input]*
11311	str.center as unicode_center
11312
11313	width: Py_ssize_t
11314	fillchar: Py_UCS4 = ' '
11315	/
11316
11317	Return a centered string of length width.
11318
11319	Padding is done using the specified fill character (default is a space).
11320	[clinic start generated code]/*
11321
11322	static PyObject *
11323	unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11324	/[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]/
11325	{
11326	Py_ssize_t marg, left;
11327
11328	if (PyUnicode_READY(self) == -`1`)
11329	return NULL;
11330
11331	if (PyUnicode_GET_LENGTH(self) >= width)
11332	return unicode_result_unchanged(self);
11333
11334	marg = width - PyUnicode_GET_LENGTH(self);
11335	left = marg / `2` + (marg & width & `1`);
11336
11337	return pad(self, left, marg - left, fillchar);
11338	}
11339
11340	/ This function assumes that str1 and str2 are readied by the caller. /
11341
11342	static int
11343	unicode_compare(PyObject str1, PyObject str2)
11344	{
11345	#define COMPARE(TYPE1, TYPE2) \
11346	do { \
11347	TYPE1* p1 = (TYPE1 *)data1; \
11348	TYPE2* p2 = (TYPE2 *)data2; \
11349	TYPE1* end = p1 + len; \
11350	Py_UCS4 c1, c2; \
11351	for (; p1 != end; p1++, p2++) { \
11352	c1 = *p1; \
11353	c2 = *p2; \
11354	if (c1 != c2) \
11355	return (c1 < c2) ? -1 : 1; \
11356	} \
11357	} \
11358	while (0)
11359
11360	int kind1, kind2;
11361	const void data1, data2;
11362	Py_ssize_t len1, len2, len;
11363
11364	kind1 = PyUnicode_KIND(str1);
11365	kind2 = PyUnicode_KIND(str2);
11366	data1 = PyUnicode_DATA(str1);
11367	data2 = PyUnicode_DATA(str2);
11368	len1 = PyUnicode_GET_LENGTH(str1);
11369	len2 = PyUnicode_GET_LENGTH(str2);
11370	len = Py_MIN(len1, len2);
11371
11372	switch(kind1) {
11373	case PyUnicode_1BYTE_KIND:
11374	{
11375	switch(kind2) {
11376	case PyUnicode_1BYTE_KIND:
11377	{
11378	int cmp = memcmp(data1, data2, len);
11379	/ normalize result of memcmp() into the range [-1; 1] /
11380	if (cmp < `0`)
11381	return -`1`;
11382	if (cmp > `0`)
11383	return `1`;
11384	break;
11385	}
11386	case PyUnicode_2BYTE_KIND:
11387	COMPARE(Py_UCS1, Py_UCS2);
11388	break;
11389	case PyUnicode_4BYTE_KIND:
11390	COMPARE(Py_UCS1, Py_UCS4);
11391	break;
11392	default:
11393	Py_UNREACHABLE();
11394	}
11395	break;
11396	}
11397	case PyUnicode_2BYTE_KIND:
11398	{
11399	switch(kind2) {
11400	case PyUnicode_1BYTE_KIND:
11401	COMPARE(Py_UCS2, Py_UCS1);
11402	break;
11403	case PyUnicode_2BYTE_KIND:
11404	{
11405	COMPARE(Py_UCS2, Py_UCS2);
11406	break;
11407	}
11408	case PyUnicode_4BYTE_KIND:
11409	COMPARE(Py_UCS2, Py_UCS4);
11410	break;
11411	default:
11412	Py_UNREACHABLE();
11413	}
11414	break;
11415	}
11416	case PyUnicode_4BYTE_KIND:
11417	{
11418	switch(kind2) {
11419	case PyUnicode_1BYTE_KIND:
11420	COMPARE(Py_UCS4, Py_UCS1);
11421	break;
11422	case PyUnicode_2BYTE_KIND:
11423	COMPARE(Py_UCS4, Py_UCS2);
11424	break;
11425	case PyUnicode_4BYTE_KIND:
11426	{
11427	#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11428	int cmp = wmemcmp((wchar_t )data1, (wchar_t )data2, len);
11429	/ normalize result of wmemcmp() into the range [-1; 1] /
11430	if (cmp < `0`)
11431	return -`1`;
11432	if (cmp > `0`)
11433	return `1`;
11434	#else
11435	COMPARE(Py_UCS4, Py_UCS4);
11436	#endif
11437	break;
11438	}
11439	default:
11440	Py_UNREACHABLE();
11441	}
11442	break;
11443	}
11444	default:
11445	Py_UNREACHABLE();
11446	}
11447
11448	if (len1 == len2)
11449	return `0`;
11450	if (len1 < len2)
11451	return -`1`;
11452	else
11453	return `1`;
11454
11455	#undef COMPARE
11456	}
11457
11458	static int
11459	unicode_compare_eq(PyObject str1, PyObject str2)
11460	{
11461	int kind;
11462	const void data1, data2;
11463	Py_ssize_t len;
11464	int cmp;
11465
11466	len = PyUnicode_GET_LENGTH(str1);
11467	if (PyUnicode_GET_LENGTH(str2) != len)
11468	return `0`;
11469	kind = PyUnicode_KIND(str1);
11470	if (PyUnicode_KIND(str2) != kind)
11471	return `0`;
11472	data1 = PyUnicode_DATA(str1);
11473	data2 = PyUnicode_DATA(str2);
11474
11475	cmp = memcmp(data1, data2, len * kind);
11476	return (cmp == `0`);
11477	}
11478
11479
11480	int
11481	PyUnicode_Compare(PyObject left, PyObject right)
11482	{
11483	if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11484	if (PyUnicode_READY(left) == -`1` \|\|
11485	PyUnicode_READY(right) == -`1`)
11486	return -`1`;
11487
11488	/ a string is equal to itself /
11489	if (left == right)
11490	return `0`;
11491
11492	return unicode_compare(left, right);
11493	}
11494	PyErr_Format(PyExc_TypeError,
11495	"Can't compare %.100s and %.100s",
11496	Py_TYPE(left)->tp_name,
11497	Py_TYPE(right)->tp_name);
11498	return -`1`;
11499	}
11500
11501	int
11502	PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11503	{
11504	Py_ssize_t i;
11505	int kind;
11506	Py_UCS4 chr;
11507	const unsigned char ustr = (const* unsigned char *)str;
11508
11509	assert(_PyUnicode_CHECK(uni));
11510	if (!PyUnicode_IS_READY(uni)) {
11511	const wchar_t *ws = _PyUnicode_WSTR(uni);
11512	/ Compare Unicode string and source character set string /
11513	for (i = `0`; (chr = ws[i]) && ustr[i]; i++) {
11514	if (chr != ustr[i])
11515	return (chr < ustr[i]) ? -`1` : `1`;
11516	}
11517	/ This check keeps Python strings that end in '\0' from comparing equal*
11518	to C strings identical up to that point. /*
11519	if (_PyUnicode_WSTR_LENGTH(uni) != i \|\| chr)
11520	return `1`; / uni is longer /
11521	if (ustr[i])
11522	return -`1`; / str is longer /
11523	return `0`;
11524	}
11525	kind = PyUnicode_KIND(uni);
11526	if (kind == PyUnicode_1BYTE_KIND) {
11527	const void *data = PyUnicode_1BYTE_DATA(uni);
11528	size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11529	size_t len, len2 = strlen(str);
11530	int cmp;
11531
11532	len = Py_MIN(len1, len2);
11533	cmp = memcmp(data, str, len);
11534	if (cmp != `0`) {
11535	if (cmp < `0`)
11536	return -`1`;
11537	else
11538	return `1`;
11539	}
11540	if (len1 > len2)
11541	return `1`; / uni is longer /
11542	if (len1 < len2)
11543	return -`1`; / str is longer /
11544	return `0`;
11545	}
11546	else {
11547	const void *data = PyUnicode_DATA(uni);
11548	/ Compare Unicode string and source character set string /
11549	for (i = `0`; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11550	if (chr != (unsigned char)str[i])
11551	return (chr < (unsigned char)(str[i])) ? -`1` : `1`;
11552	/ This check keeps Python strings that end in '\0' from comparing equal*
11553	to C strings identical up to that point. /*
11554	if (PyUnicode_GET_LENGTH(uni) != i \|\| chr)
11555	return `1`; / uni is longer /
11556	if (str[i])
11557	return -`1`; / str is longer /
11558	return `0`;
11559	}
11560	}
11561
11562	static int
11563	non_ready_unicode_equal_to_ascii_string(PyObject unicode, const* char *str)
11564	{
11565	size_t i, len;
11566	const wchar_t *p;
11567	len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11568	if (strlen(str) != len)
11569	return `0`;
11570	p = _PyUnicode_WSTR(unicode);
11571	assert(p);
11572	for (i = `0`; i < len; i++) {
11573	unsigned char c = (unsigned char)str[i];
11574	if (c >= `128` \|\| p[i] != (wchar_t)c)
11575	return `0`;
11576	}
11577	return `1`;
11578	}
11579
11580	int
11581	_PyUnicode_EqualToASCIIString(PyObject unicode, const* char *str)
11582	{
11583	size_t len;
11584	assert(_PyUnicode_CHECK(unicode));
11585	assert(str);
11586	#ifndef NDEBUG
11587	for (const char p = str; p; p++) {
11588	assert((unsigned char)*p < `128`);
11589	}
11590	#endif
11591	if (PyUnicode_READY(unicode) == -`1`) {
11592	/ Memory error or bad data /
11593	PyErr_Clear();
11594	return non_ready_unicode_equal_to_ascii_string(unicode, str);
11595	}
11596	if (!PyUnicode_IS_ASCII(unicode))
11597	return `0`;
11598	len = (size_t)PyUnicode_GET_LENGTH(unicode);
11599	return strlen(str) == len &&
11600	memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == `0`;
11601	}
11602
11603	int
11604	_PyUnicode_EqualToASCIIId(PyObject left, _Py_Identifier right)
11605	{
11606	PyObject *right_uni;
11607
11608	assert(_PyUnicode_CHECK(left));
11609	assert(right->string);
11610	#ifndef NDEBUG
11611	for (const char p = right->string; p; p++) {
11612	assert((unsigned char)*p < `128`);
11613	}
11614	#endif
11615
11616	if (PyUnicode_READY(left) == -`1`) {
11617	/ memory error or bad data /
11618	PyErr_Clear();
11619	return non_ready_unicode_equal_to_ascii_string(left, right->string);
11620	}
11621
11622	if (!PyUnicode_IS_ASCII(left))
11623	return `0`;
11624
11625	right_uni = _PyUnicode_FromId(right); / borrowed /
11626	if (right_uni == NULL) {
11627	/ memory error or bad data /
11628	PyErr_Clear();
11629	return _PyUnicode_EqualToASCIIString(left, right->string);
11630	}
11631
11632	if (left == right_uni)
11633	return `1`;
11634
11635	if (PyUnicode_CHECK_INTERNED(left))
11636	return `0`;
11637
11638	#ifdef INTERNED_STRINGS
11639	assert(_PyUnicode_HASH(right_uni) != -`1`);
11640	Py_hash_t hash = _PyUnicode_HASH(left);
11641	if (hash != -`1` && hash != _PyUnicode_HASH(right_uni)) {
11642	return `0`;
11643	}
11644	#endif
11645
11646	return unicode_compare_eq(left, right_uni);
11647	}
11648
11649	PyObject *
11650	PyUnicode_RichCompare(PyObject left, PyObject right, int op)
11651	{
11652	int result;
11653
11654	if (!PyUnicode_Check(left) \|\| !PyUnicode_Check(right))
11655	Py_RETURN_NOTIMPLEMENTED;
11656
11657	if (PyUnicode_READY(left) == -`1` \|\|
11658	PyUnicode_READY(right) == -`1`)
11659	return NULL;
11660
11661	if (left == right) {
11662	switch (op) {
11663	case Py_EQ:
11664	case Py_LE:
11665	case Py_GE:
11666	/ a string is equal to itself /
11667	Py_RETURN_TRUE;
11668	case Py_NE:
11669	case Py_LT:
11670	case Py_GT:
11671	Py_RETURN_FALSE;
11672	default:
11673	PyErr_BadArgument();
11674	return NULL;
11675	}
11676	}
11677	else if (op == Py_EQ \|\| op == Py_NE) {
11678	result = unicode_compare_eq(left, right);
11679	result ^= (op == Py_NE);
11680	return PyBool_FromLong(result);
11681	}
11682	else {
11683	result = unicode_compare(left, right);
11684	Py_RETURN_RICHCOMPARE(result, `0`, op);
11685	}
11686	}
11687
11688	int
11689	_PyUnicode_EQ(PyObject aa, PyObject bb)
11690	{
11691	return unicode_eq(aa, bb);
11692	}
11693
11694	int
11695	PyUnicode_Contains(PyObject str, PyObject substr)
11696	{
11697	int kind1, kind2;
11698	const void buf1, buf2;
11699	Py_ssize_t len1, len2;
11700	int result;
11701
11702	if (!PyUnicode_Check(substr)) {
11703	PyErr_Format(PyExc_TypeError,
11704	"'in <string>' requires string as left operand, not %.100s",
11705	Py_TYPE(substr)->tp_name);
11706	return -`1`;
11707	}
11708	if (PyUnicode_READY(substr) == -`1`)
11709	return -`1`;
11710	if (ensure_unicode(str) < `0`)
11711	return -`1`;
11712
11713	kind1 = PyUnicode_KIND(str);
11714	kind2 = PyUnicode_KIND(substr);
11715	if (kind1 < kind2)
11716	return `0`;
11717	len1 = PyUnicode_GET_LENGTH(str);
11718	len2 = PyUnicode_GET_LENGTH(substr);
11719	if (len1 < len2)
11720	return `0`;
11721	buf1 = PyUnicode_DATA(str);
11722	buf2 = PyUnicode_DATA(substr);
11723	if (len2 == `1`) {
11724	Py_UCS4 ch = PyUnicode_READ(kind2, buf2, `0`);
11725	result = findchar((const char *)buf1, kind1, len1, ch, `1`) != -`1`;
11726	return result;
11727	}
11728	if (kind2 != kind1) {
11729	buf2 = unicode_askind(kind2, buf2, len2, kind1);
11730	if (!buf2)
11731	return -`1`;
11732	}
11733
11734	switch (kind1) {
11735	case PyUnicode_1BYTE_KIND:
11736	result = ucs1lib_find(buf1, len1, buf2, len2, `0`) != -`1`;
11737	break;
11738	case PyUnicode_2BYTE_KIND:
11739	result = ucs2lib_find(buf1, len1, buf2, len2, `0`) != -`1`;
11740	break;
11741	case PyUnicode_4BYTE_KIND:
11742	result = ucs4lib_find(buf1, len1, buf2, len2, `0`) != -`1`;
11743	break;
11744	default:
11745	Py_UNREACHABLE();
11746	}
11747
11748	assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11749	if (kind2 != kind1)
11750	PyMem_Free((void *)buf2);
11751
11752	return result;
11753	}
11754
11755	/ Concat to string or Unicode object giving a new Unicode object. /
11756
11757	PyObject *
11758	PyUnicode_Concat(PyObject left, PyObject right)
11759	{
11760	PyObject *result;
11761	Py_UCS4 maxchar, maxchar2;
11762	Py_ssize_t left_len, right_len, new_len;
11763
11764	if (ensure_unicode(left) < `0`)
11765	return NULL;
11766
11767	if (!PyUnicode_Check(right)) {
11768	PyErr_Format(PyExc_TypeError,
11769	"can only concatenate str (not \"%.200s\") to str",
11770	Py_TYPE(right)->tp_name);
11771	return NULL;
11772	}
11773	if (PyUnicode_READY(right) < `0`)
11774	return NULL;
11775
11776	/ Shortcuts /
11777	PyObject empty = unicode_get_empty(); // Borrowed reference*
11778	if (left == empty) {
11779	return PyUnicode_FromObject(right);
11780	}
11781	if (right == empty) {
11782	return PyUnicode_FromObject(left);
11783	}
11784
11785	left_len = PyUnicode_GET_LENGTH(left);
11786	right_len = PyUnicode_GET_LENGTH(right);
11787	if (left_len > PY_SSIZE_T_MAX - right_len) {
11788	PyErr_SetString(PyExc_OverflowError,
11789	"strings are too large to concat");
11790	return NULL;
11791	}
11792	new_len = left_len + right_len;
11793
11794	maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11795	maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11796	maxchar = Py_MAX(maxchar, maxchar2);
11797
11798	/ Concat the two Unicode strings /
11799	result = PyUnicode_New(new_len, maxchar);
11800	if (result == NULL)
11801	return NULL;
11802	_PyUnicode_FastCopyCharacters(result, `0`, left, `0`, left_len);
11803	_PyUnicode_FastCopyCharacters(result, left_len, right, `0`, right_len);
11804	assert(_PyUnicode_CheckConsistency(result, `1`));
11805	return result;
11806	}
11807
11808	void
11809	PyUnicode_Append(PyObject *p_left, PyObject right)
11810	{
11811	PyObject left, res;
11812	Py_UCS4 maxchar, maxchar2;
11813	Py_ssize_t left_len, right_len, new_len;
11814
11815	if (p_left == NULL) {
11816	if (!PyErr_Occurred())
11817	PyErr_BadInternalCall();
11818	return;
11819	}
11820	left = *p_left;
11821	if (right == NULL \|\| left == NULL
11822	\|\| !PyUnicode_Check(left) \|\| !PyUnicode_Check(right)) {
11823	if (!PyErr_Occurred())
11824	PyErr_BadInternalCall();
11825	goto error;
11826	}
11827
11828	if (PyUnicode_READY(left) == -`1`)
11829	goto error;
11830	if (PyUnicode_READY(right) == -`1`)
11831	goto error;
11832
11833	/ Shortcuts /
11834	PyObject empty = unicode_get_empty(); // Borrowed reference*
11835	if (left == empty) {
11836	Py_DECREF(left);
11837	Py_INCREF(right);
11838	*p_left = right;
11839	return;
11840	}
11841	if (right == empty) {
11842	return;
11843	}
11844
11845	left_len = PyUnicode_GET_LENGTH(left);
11846	right_len = PyUnicode_GET_LENGTH(right);
11847	if (left_len > PY_SSIZE_T_MAX - right_len) {
11848	PyErr_SetString(PyExc_OverflowError,
11849	"strings are too large to concat");
11850	goto error;
11851	}
11852	new_len = left_len + right_len;
11853
11854	if (unicode_modifiable(left)
11855	&& PyUnicode_CheckExact(right)
11856	&& PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11857	/ Don't resize for ascii += latin1. Convert ascii to latin1 requires*
11858	to change the structure size, but characters are stored just after
11859	the structure, and so it requires to move all characters which is
11860	not so different than duplicating the string. /*
11861	&& !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11862	{
11863	/ append inplace /
11864	if (unicode_resize(p_left, new_len) != `0`)
11865	goto error;
11866
11867	/ copy 'right' into the newly allocated area of 'left' /
11868	_PyUnicode_FastCopyCharacters(*p_left, left_len, right, `0`, right_len);
11869	}
11870	else {
11871	maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11872	maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11873	maxchar = Py_MAX(maxchar, maxchar2);
11874
11875	/ Concat the two Unicode strings /
11876	res = PyUnicode_New(new_len, maxchar);
11877	if (res == NULL)
11878	goto error;
11879	_PyUnicode_FastCopyCharacters(res, `0`, left, `0`, left_len);
11880	_PyUnicode_FastCopyCharacters(res, left_len, right, `0`, right_len);
11881	Py_DECREF(left);
11882	*p_left = res;
11883	}
11884	assert(_PyUnicode_CheckConsistency(*p_left, `1`));
11885	return;
11886
11887	error:
11888	Py_CLEAR(*p_left);
11889	}
11890
11891	void
11892	PyUnicode_AppendAndDel(PyObject *pleft, PyObject right)
11893	{
11894	PyUnicode_Append(pleft, right);
11895	Py_XDECREF(right);
11896	}
11897
11898	/*
11899	Wraps stringlib_parse_args_finds() and additionally ensures that the
11900	first argument is a unicode object.
11901	*/
11902
11903	static inline int
11904	parse_args_finds_unicode(const char * function_name, PyObject *args,
11905	PyObject **substring,
11906	Py_ssize_t start, Py_ssize_t end)
11907	{
11908	if(stringlib_parse_args_finds(function_name, args, substring,
11909	start, end)) {
11910	if (ensure_unicode(*substring) < `0`)
11911	return `0`;
11912	return `1`;
11913	}
11914	return `0`;
11915	}
11916
11917	PyDoc_STRVAR(count__doc__,
11918	"S.count(sub[, start[, end]]) -> int\n\
11919	\n\
11920	Return the number of non-overlapping occurrences of substring sub in\n\
11921	string S[start:end]. Optional arguments start and end are\n\
11922	interpreted as in slice notation.");
11923
11924	static PyObject *
11925	unicode_count(PyObject self, PyObject args)
11926	{
11927	PyObject substring = NULL; /* initialize to fix a compiler warning /
11928	Py_ssize_t start = `0`;
11929	Py_ssize_t end = PY_SSIZE_T_MAX;
11930	PyObject *result;
11931	int kind1, kind2;
11932	const void buf1, buf2;
11933	Py_ssize_t len1, len2, iresult;
11934
11935	if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11936	return NULL;
11937
11938	kind1 = PyUnicode_KIND(self);
11939	kind2 = PyUnicode_KIND(substring);
11940	if (kind1 < kind2)
11941	return PyLong_FromLong(`0`);
11942
11943	len1 = PyUnicode_GET_LENGTH(self);
11944	len2 = PyUnicode_GET_LENGTH(substring);
11945	ADJUST_INDICES(start, end, len1);
11946	if (end - start < len2)
11947	return PyLong_FromLong(`0`);
11948
11949	buf1 = PyUnicode_DATA(self);
11950	buf2 = PyUnicode_DATA(substring);
11951	if (kind2 != kind1) {
11952	buf2 = unicode_askind(kind2, buf2, len2, kind1);
11953	if (!buf2)
11954	return NULL;
11955	}
11956	switch (kind1) {
11957	case PyUnicode_1BYTE_KIND:
11958	iresult = ucs1lib_count(
11959	((const Py_UCS1*)buf1) + start, end - start,
11960	buf2, len2, PY_SSIZE_T_MAX
11961	);
11962	break;
11963	case PyUnicode_2BYTE_KIND:
11964	iresult = ucs2lib_count(
11965	((const Py_UCS2*)buf1) + start, end - start,
11966	buf2, len2, PY_SSIZE_T_MAX
11967	);
11968	break;
11969	case PyUnicode_4BYTE_KIND:
11970	iresult = ucs4lib_count(
11971	((const Py_UCS4*)buf1) + start, end - start,
11972	buf2, len2, PY_SSIZE_T_MAX
11973	);
11974	break;
11975	default:
11976	Py_UNREACHABLE();
11977	}
11978
11979	result = PyLong_FromSsize_t(iresult);
11980
11981	assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11982	if (kind2 != kind1)
11983	PyMem_Free((void *)buf2);
11984
11985	return result;
11986	}
11987
11988	/[clinic input]*
11989	str.encode as unicode_encode
11990
11991	encoding: str(c_default="NULL") = 'utf-8'
11992	The encoding in which to encode the string.
11993	errors: str(c_default="NULL") = 'strict'
11994	The error handling scheme to use for encoding errors.
11995	The default is 'strict' meaning that encoding errors raise a
11996	UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11997	'xmlcharrefreplace' as well as any other name registered with
11998	codecs.register_error that can handle UnicodeEncodeErrors.
11999
12000	Encode the string using the codec registered for encoding.
12001	[clinic start generated code]/*
12002
12003	static PyObject *
12004	unicode_encode_impl(PyObject self, const* char encoding, const* char *errors)
12005	/[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]/
12006	{
12007	return PyUnicode_AsEncodedString(self, encoding, errors);
12008	}
12009
12010	/[clinic input]*
12011	str.expandtabs as unicode_expandtabs
12012
12013	tabsize: int = 8
12014
12015	Return a copy where all tab characters are expanded using spaces.
12016
12017	If tabsize is not given, a tab size of 8 characters is assumed.
12018	[clinic start generated code]/*
12019
12020	static PyObject *
12021	unicode_expandtabs_impl(PyObject self, int* tabsize)
12022	/[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]/
12023	{
12024	Py_ssize_t i, j, line_pos, src_len, incr;
12025	Py_UCS4 ch;
12026	PyObject *u;
12027	const void *src_data;
12028	void *dest_data;
12029	int kind;
12030	int found;
12031
12032	if (PyUnicode_READY(self) == -`1`)
12033	return NULL;
12034
12035	/ First pass: determine size of output string /
12036	src_len = PyUnicode_GET_LENGTH(self);
12037	i = j = line_pos = `0`;
12038	kind = PyUnicode_KIND(self);
12039	src_data = PyUnicode_DATA(self);
12040	found = `0`;
12041	for (; i < src_len; i++) {
12042	ch = PyUnicode_READ(kind, src_data, i);
12043	if (ch == `'\t'`) {
12044	found = `1`;
12045	if (tabsize > `0`) {
12046	incr = tabsize - (line_pos % tabsize); / cannot overflow /
12047	if (j > PY_SSIZE_T_MAX - incr)
12048	goto overflow;
12049	line_pos += incr;
12050	j += incr;
12051	}
12052	}
12053	else {
12054	if (j > PY_SSIZE_T_MAX - `1`)
12055	goto overflow;
12056	line_pos++;
12057	j++;
12058	if (ch == `'\n'` \|\| ch == `'\r'`)
12059	line_pos = `0`;
12060	}
12061	}
12062	if (!found)
12063	return unicode_result_unchanged(self);
12064
12065	/ Second pass: create output string and fill it /
12066	u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
12067	if (!u)
12068	return NULL;
12069	dest_data = PyUnicode_DATA(u);
12070
12071	i = j = line_pos = `0`;
12072
12073	for (; i < src_len; i++) {
12074	ch = PyUnicode_READ(kind, src_data, i);
12075	if (ch == `'\t'`) {
12076	if (tabsize > `0`) {
12077	incr = tabsize - (line_pos % tabsize);
12078	line_pos += incr;
12079	unicode_fill(kind, dest_data, `' '`, j, incr);
12080	j += incr;
12081	}
12082	}
12083	else {
12084	line_pos++;
12085	PyUnicode_WRITE(kind, dest_data, j, ch);
12086	j++;
12087	if (ch == `'\n'` \|\| ch == `'\r'`)
12088	line_pos = `0`;
12089	}
12090	}
12091	assert (j == PyUnicode_GET_LENGTH(u));
12092	return unicode_result(u);
12093
12094	overflow:
12095	PyErr_SetString(PyExc_OverflowError, "new string is too long");
12096	return NULL;
12097	}
12098
12099	PyDoc_STRVAR(find__doc__,
12100	"S.find(sub[, start[, end]]) -> int\n\
12101	\n\
12102	Return the lowest index in S where substring sub is found,\n\
12103	such that sub is contained within S[start:end]. Optional\n\
12104	arguments start and end are interpreted as in slice notation.\n\
12105	\n\
12106	Return -1 on failure.");
12107
12108	static PyObject *
12109	unicode_find(PyObject self, PyObject args)
12110	{
12111	/ initialize variables to prevent gcc warning /
12112	PyObject *substring = NULL;
12113	Py_ssize_t start = `0`;
12114	Py_ssize_t end = `0`;
12115	Py_ssize_t result;
12116
12117	if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
12118	return NULL;
12119
12120	if (PyUnicode_READY(self) == -`1`)
12121	return NULL;
12122
12123	result = any_find_slice(self, substring, start, end, `1`);
12124
12125	if (result == -`2`)
12126	return NULL;
12127
12128	return PyLong_FromSsize_t(result);
12129	}
12130
12131	static PyObject *
12132	unicode_getitem(PyObject *self, Py_ssize_t index)
12133	{
12134	const void *data;
12135	enum PyUnicode_Kind kind;
12136	Py_UCS4 ch;
12137
12138	if (!PyUnicode_Check(self)) {
12139	PyErr_BadArgument();
12140	return NULL;
12141	}
12142	if (PyUnicode_READY(self) == -`1`) {
12143	return NULL;
12144	}
12145	if (index < `0` \|\| index >= PyUnicode_GET_LENGTH(self)) {
12146	PyErr_SetString(PyExc_IndexError, "string index out of range");
12147	return NULL;
12148	}
12149	kind = PyUnicode_KIND(self);
12150	data = PyUnicode_DATA(self);
12151	ch = PyUnicode_READ(kind, data, index);
12152	return unicode_char(ch);
12153	}
12154
12155	/ Believe it or not, this produces the same value for ASCII strings*
12156	as bytes_hash(). /*
12157	static Py_hash_t
12158	unicode_hash(PyObject *self)
12159	{
12160	Py_uhash_t x; / Unsigned for defined overflow behavior. /
12161
12162	#ifdef Py_DEBUG
12163	assert(_Py_HashSecret_Initialized);
12164	#endif
12165	if (_PyUnicode_HASH(self) != -`1`)
12166	return _PyUnicode_HASH(self);
12167	if (PyUnicode_READY(self) == -`1`)
12168	return -`1`;
12169
12170	x = _Py_HashBytes(PyUnicode_DATA(self),
12171	PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
12172	_PyUnicode_HASH(self) = x;
12173	return x;
12174	}
12175
12176	PyDoc_STRVAR(index__doc__,
12177	"S.index(sub[, start[, end]]) -> int\n\
12178	\n\
12179	Return the lowest index in S where substring sub is found,\n\
12180	such that sub is contained within S[start:end]. Optional\n\
12181	arguments start and end are interpreted as in slice notation.\n\
12182	\n\
12183	Raises ValueError when the substring is not found.");
12184
12185	static PyObject *
12186	unicode_index(PyObject self, PyObject args)
12187	{
12188	/ initialize variables to prevent gcc warning /
12189	Py_ssize_t result;
12190	PyObject *substring = NULL;
12191	Py_ssize_t start = `0`;
12192	Py_ssize_t end = `0`;
12193
12194	if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
12195	return NULL;
12196
12197	if (PyUnicode_READY(self) == -`1`)
12198	return NULL;
12199
12200	result = any_find_slice(self, substring, start, end, `1`);
12201
12202	if (result == -`2`)
12203	return NULL;
12204
12205	if (result < `0`) {
12206	PyErr_SetString(PyExc_ValueError, "substring not found");
12207	return NULL;
12208	}
12209
12210	return PyLong_FromSsize_t(result);
12211	}
12212
12213	/[clinic input]*
12214	str.isascii as unicode_isascii
12215
12216	Return True if all characters in the string are ASCII, False otherwise.
12217
12218	ASCII characters have code points in the range U+0000-U+007F.
12219	Empty string is ASCII too.
12220	[clinic start generated code]/*
12221
12222	static PyObject *
12223	unicode_isascii_impl(PyObject *self)
12224	/[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]/
12225	{
12226	if (PyUnicode_READY(self) == -`1`) {
12227	return NULL;
12228	}
12229	return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12230	}
12231
12232	/[clinic input]*
12233	str.islower as unicode_islower
12234
12235	Return True if the string is a lowercase string, False otherwise.
12236
12237	A string is lowercase if all cased characters in the string are lowercase and
12238	there is at least one cased character in the string.
12239	[clinic start generated code]/*
12240
12241	static PyObject *
12242	unicode_islower_impl(PyObject *self)
12243	/[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]/
12244	{
12245	Py_ssize_t i, length;
12246	int kind;
12247	const void *data;
12248	int cased;
12249
12250	if (PyUnicode_READY(self) == -`1`)
12251	return NULL;
12252	length = PyUnicode_GET_LENGTH(self);
12253	kind = PyUnicode_KIND(self);
12254	data = PyUnicode_DATA(self);
12255
12256	/ Shortcut for single character strings /
12257	if (length == `1`)
12258	return PyBool_FromLong(
12259	Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, `0`)));
12260
12261	/ Special case for empty strings /
12262	if (length == `0`)
12263	Py_RETURN_FALSE;
12264
12265	cased = `0`;
12266	for (i = `0`; i < length; i++) {
12267	const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12268
12269	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
12270	Py_RETURN_FALSE;
12271	else if (!cased && Py_UNICODE_ISLOWER(ch))
12272	cased = `1`;
12273	}
12274	return PyBool_FromLong(cased);
12275	}
12276
12277	/[clinic input]*
12278	str.isupper as unicode_isupper
12279
12280	Return True if the string is an uppercase string, False otherwise.
12281
12282	A string is uppercase if all cased characters in the string are uppercase and
12283	there is at least one cased character in the string.
12284	[clinic start generated code]/*
12285
12286	static PyObject *
12287	unicode_isupper_impl(PyObject *self)
12288	/[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]/
12289	{
12290	Py_ssize_t i, length;
12291	int kind;
12292	const void *data;
12293	int cased;
12294
12295	if (PyUnicode_READY(self) == -`1`)
12296	return NULL;
12297	length = PyUnicode_GET_LENGTH(self);
12298	kind = PyUnicode_KIND(self);
12299	data = PyUnicode_DATA(self);
12300
12301	/ Shortcut for single character strings /
12302	if (length == `1`)
12303	return PyBool_FromLong(
12304	Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, `0`)) != `0`);
12305
12306	/ Special case for empty strings /
12307	if (length == `0`)
12308	Py_RETURN_FALSE;
12309
12310	cased = `0`;
12311	for (i = `0`; i < length; i++) {
12312	const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12313
12314	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
12315	Py_RETURN_FALSE;
12316	else if (!cased && Py_UNICODE_ISUPPER(ch))
12317	cased = `1`;
12318	}
12319	return PyBool_FromLong(cased);
12320	}
12321
12322	/[clinic input]*
12323	str.istitle as unicode_istitle
12324
12325	Return True if the string is a title-cased string, False otherwise.
12326
12327	In a title-cased string, upper- and title-case characters may only
12328	follow uncased characters and lowercase characters only cased ones.
12329	[clinic start generated code]/*
12330
12331	static PyObject *
12332	unicode_istitle_impl(PyObject *self)
12333	/[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]/
12334	{
12335	Py_ssize_t i, length;
12336	int kind;
12337	const void *data;
12338	int cased, previous_is_cased;
12339
12340	if (PyUnicode_READY(self) == -`1`)
12341	return NULL;
12342	length = PyUnicode_GET_LENGTH(self);
12343	kind = PyUnicode_KIND(self);
12344	data = PyUnicode_DATA(self);
12345
12346	/ Shortcut for single character strings /
12347	if (length == `1`) {
12348	Py_UCS4 ch = PyUnicode_READ(kind, data, `0`);
12349	return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != `0`) \|\|
12350	(Py_UNICODE_ISUPPER(ch) != `0`));
12351	}
12352
12353	/ Special case for empty strings /
12354	if (length == `0`)
12355	Py_RETURN_FALSE;
12356
12357	cased = `0`;
12358	previous_is_cased = `0`;
12359	for (i = `0`; i < length; i++) {
12360	const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12361
12362	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
12363	if (previous_is_cased)
12364	Py_RETURN_FALSE;
12365	previous_is_cased = `1`;
12366	cased = `1`;
12367	}
12368	else if (Py_UNICODE_ISLOWER(ch)) {
12369	if (!previous_is_cased)
12370	Py_RETURN_FALSE;
12371	previous_is_cased = `1`;
12372	cased = `1`;
12373	}
12374	else
12375	previous_is_cased = `0`;
12376	}
12377	return PyBool_FromLong(cased);
12378	}
12379
12380	/[clinic input]*
12381	str.isspace as unicode_isspace
12382
12383	Return True if the string is a whitespace string, False otherwise.
12384
12385	A string is whitespace if all characters in the string are whitespace and there
12386	is at least one character in the string.
12387	[clinic start generated code]/*
12388
12389	static PyObject *
12390	unicode_isspace_impl(PyObject *self)
12391	/[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]/
12392	{
12393	Py_ssize_t i, length;
12394	int kind;
12395	const void *data;
12396
12397	if (PyUnicode_READY(self) == -`1`)
12398	return NULL;
12399	length = PyUnicode_GET_LENGTH(self);
12400	kind = PyUnicode_KIND(self);
12401	data = PyUnicode_DATA(self);
12402
12403	/ Shortcut for single character strings /
12404	if (length == `1`)
12405	return PyBool_FromLong(
12406	Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, `0`)));
12407
12408	/ Special case for empty strings /
12409	if (length == `0`)
12410	Py_RETURN_FALSE;
12411
12412	for (i = `0`; i < length; i++) {
12413	const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12414	if (!Py_UNICODE_ISSPACE(ch))
12415	Py_RETURN_FALSE;
12416	}
12417	Py_RETURN_TRUE;
12418	}
12419
12420	/[clinic input]*
12421	str.isalpha as unicode_isalpha
12422
12423	Return True if the string is an alphabetic string, False otherwise.
12424
12425	A string is alphabetic if all characters in the string are alphabetic and there
12426	is at least one character in the string.
12427	[clinic start generated code]/*
12428
12429	static PyObject *
12430	unicode_isalpha_impl(PyObject *self)
12431	/[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]/
12432	{
12433	Py_ssize_t i, length;
12434	int kind;
12435	const void *data;
12436
12437	if (PyUnicode_READY(self) == -`1`)
12438	return NULL;
12439	length = PyUnicode_GET_LENGTH(self);
12440	kind = PyUnicode_KIND(self);
12441	data = PyUnicode_DATA(self);
12442
12443	/ Shortcut for single character strings /
12444	if (length == `1`)
12445	return PyBool_FromLong(
12446	Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, `0`)));
12447
12448	/ Special case for empty strings /
12449	if (length == `0`)
12450	Py_RETURN_FALSE;
12451
12452	for (i = `0`; i < length; i++) {
12453	if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12454	Py_RETURN_FALSE;
12455	}
12456	Py_RETURN_TRUE;
12457	}
12458
12459	/[clinic input]*
12460	str.isalnum as unicode_isalnum
12461
12462	Return True if the string is an alpha-numeric string, False otherwise.
12463
12464	A string is alpha-numeric if all characters in the string are alpha-numeric and
12465	there is at least one character in the string.
12466	[clinic start generated code]/*
12467
12468	static PyObject *
12469	unicode_isalnum_impl(PyObject *self)
12470	/[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]/
12471	{
12472	int kind;
12473	const void *data;
12474	Py_ssize_t len, i;
12475
12476	if (PyUnicode_READY(self) == -`1`)
12477	return NULL;
12478
12479	kind = PyUnicode_KIND(self);
12480	data = PyUnicode_DATA(self);
12481	len = PyUnicode_GET_LENGTH(self);
12482
12483	/ Shortcut for single character strings /
12484	if (len == `1`) {
12485	const Py_UCS4 ch = PyUnicode_READ(kind, data, `0`);
12486	return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12487	}
12488
12489	/ Special case for empty strings /
12490	if (len == `0`)
12491	Py_RETURN_FALSE;
12492
12493	for (i = `0`; i < len; i++) {
12494	const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12495	if (!Py_UNICODE_ISALNUM(ch))
12496	Py_RETURN_FALSE;
12497	}
12498	Py_RETURN_TRUE;
12499	}
12500
12501	/[clinic input]*
12502	str.isdecimal as unicode_isdecimal
12503
12504	Return True if the string is a decimal string, False otherwise.
12505
12506	A string is a decimal string if all characters in the string are decimal and
12507	there is at least one character in the string.
12508	[clinic start generated code]/*
12509
12510	static PyObject *
12511	unicode_isdecimal_impl(PyObject *self)
12512	/[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]/
12513	{
12514	Py_ssize_t i, length;
12515	int kind;
12516	const void *data;
12517
12518	if (PyUnicode_READY(self) == -`1`)
12519	return NULL;
12520	length = PyUnicode_GET_LENGTH(self);
12521	kind = PyUnicode_KIND(self);
12522	data = PyUnicode_DATA(self);
12523
12524	/ Shortcut for single character strings /
12525	if (length == `1`)
12526	return PyBool_FromLong(
12527	Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, `0`)));
12528
12529	/ Special case for empty strings /
12530	if (length == `0`)
12531	Py_RETURN_FALSE;
12532
12533	for (i = `0`; i < length; i++) {
12534	if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12535	Py_RETURN_FALSE;
12536	}
12537	Py_RETURN_TRUE;
12538	}
12539
12540	/[clinic input]*
12541	str.isdigit as unicode_isdigit
12542
12543	Return True if the string is a digit string, False otherwise.
12544
12545	A string is a digit string if all characters in the string are digits and there
12546	is at least one character in the string.
12547	[clinic start generated code]/*
12548
12549	static PyObject *
12550	unicode_isdigit_impl(PyObject *self)
12551	/[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]/
12552	{
12553	Py_ssize_t i, length;
12554	int kind;
12555	const void *data;
12556
12557	if (PyUnicode_READY(self) == -`1`)
12558	return NULL;
12559	length = PyUnicode_GET_LENGTH(self);
12560	kind = PyUnicode_KIND(self);
12561	data = PyUnicode_DATA(self);
12562
12563	/ Shortcut for single character strings /
12564	if (length == `1`) {
12565	const Py_UCS4 ch = PyUnicode_READ(kind, data, `0`);
12566	return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12567	}
12568
12569	/ Special case for empty strings /
12570	if (length == `0`)
12571	Py_RETURN_FALSE;
12572
12573	for (i = `0`; i < length; i++) {
12574	if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12575	Py_RETURN_FALSE;
12576	}
12577	Py_RETURN_TRUE;
12578	}
12579
12580	/[clinic input]*
12581	str.isnumeric as unicode_isnumeric
12582
12583	Return True if the string is a numeric string, False otherwise.
12584
12585	A string is numeric if all characters in the string are numeric and there is at
12586	least one character in the string.
12587	[clinic start generated code]/*
12588
12589	static PyObject *
12590	unicode_isnumeric_impl(PyObject *self)
12591	/[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]/
12592	{
12593	Py_ssize_t i, length;
12594	int kind;
12595	const void *data;
12596
12597	if (PyUnicode_READY(self) == -`1`)
12598	return NULL;
12599	length = PyUnicode_GET_LENGTH(self);
12600	kind = PyUnicode_KIND(self);
12601	data = PyUnicode_DATA(self);
12602
12603	/ Shortcut for single character strings /
12604	if (length == `1`)
12605	return PyBool_FromLong(
12606	Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, `0`)));
12607
12608	/ Special case for empty strings /
12609	if (length == `0`)
12610	Py_RETURN_FALSE;
12611
12612	for (i = `0`; i < length; i++) {
12613	if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12614	Py_RETURN_FALSE;
12615	}
12616	Py_RETURN_TRUE;
12617	}
12618
12619	Py_ssize_t
12620	_PyUnicode_ScanIdentifier(PyObject *self)
12621	{
12622	Py_ssize_t i;
12623	if (PyUnicode_READY(self) == -`1`)
12624	return -`1`;
12625
12626	Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12627	if (len == `0`) {
12628	/ an empty string is not a valid identifier /
12629	return `0`;
12630	}
12631
12632	int kind = PyUnicode_KIND(self);
12633	const void *data = PyUnicode_DATA(self);
12634	Py_UCS4 ch = PyUnicode_READ(kind, data, `0`);
12635	/ PEP 3131 says that the first character must be in*
12636	XID_Start and subsequent characters in XID_Continue,
12637	and for the ASCII range, the 2.x rules apply (i.e
12638	start with letters and underscore, continue with
12639	letters, digits, underscore). However, given the current
12640	definition of XID_Start and XID_Continue, it is sufficient
12641	to check just for these, except that _ must be allowed
12642	as starting an identifier. /*
12643	if (!_PyUnicode_IsXidStart(ch) && ch != `0x5F` / LOW LINE /) {
12644	return `0`;
12645	}
12646
12647	for (i = `1`; i < len; i++) {
12648	ch = PyUnicode_READ(kind, data, i);
12649	if (!_PyUnicode_IsXidContinue(ch)) {
12650	return i;
12651	}
12652	}
12653	return i;
12654	}
12655
12656	int
12657	PyUnicode_IsIdentifier(PyObject *self)
12658	{
12659	if (PyUnicode_IS_READY(self)) {
12660	Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12661	Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12662	/ an empty string is not a valid identifier /
12663	return len && i == len;
12664	}
12665	else {
12666	_Py_COMP_DIAG_PUSH
12667	_Py_COMP_DIAG_IGNORE_DEPR_DECLS
12668	Py_ssize_t i = `0`, len = PyUnicode_GET_SIZE(self);
12669	if (len == `0`) {
12670	/ an empty string is not a valid identifier /
12671	return `0`;
12672	}
12673
12674	const wchar_t *wstr = _PyUnicode_WSTR(self);
12675	Py_UCS4 ch = wstr[i++];
12676	#if SIZEOF_WCHAR_T == 2
12677	if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12678	&& i < len
12679	&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12680	{
12681	ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12682	i++;
12683	}
12684	#endif
12685	if (!_PyUnicode_IsXidStart(ch) && ch != `0x5F` / LOW LINE /) {
12686	return `0`;
12687	}
12688
12689	while (i < len) {
12690	ch = wstr[i++];
12691	#if SIZEOF_WCHAR_T == 2
12692	if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12693	&& i < len
12694	&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12695	{
12696	ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12697	i++;
12698	}
12699	#endif
12700	if (!_PyUnicode_IsXidContinue(ch)) {
12701	return `0`;
12702	}
12703	}
12704	return `1`;
12705	_Py_COMP_DIAG_POP
12706	}
12707	}
12708
12709	/[clinic input]*
12710	str.isidentifier as unicode_isidentifier
12711
12712	Return True if the string is a valid Python identifier, False otherwise.
12713
12714	Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12715	such as "def" or "class".
12716	[clinic start generated code]/*
12717
12718	static PyObject *
12719	unicode_isidentifier_impl(PyObject *self)
12720	/[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]/
12721	{
12722	return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12723	}
12724
12725	/[clinic input]*
12726	str.isprintable as unicode_isprintable
12727
12728	Return True if the string is printable, False otherwise.
12729
12730	A string is printable if all of its characters are considered printable in
12731	repr() or if it is empty.
12732	[clinic start generated code]/*
12733
12734	static PyObject *
12735	unicode_isprintable_impl(PyObject *self)
12736	/[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]/
12737	{
12738	Py_ssize_t i, length;
12739	int kind;
12740	const void *data;
12741
12742	if (PyUnicode_READY(self) == -`1`)
12743	return NULL;
12744	length = PyUnicode_GET_LENGTH(self);
12745	kind = PyUnicode_KIND(self);
12746	data = PyUnicode_DATA(self);
12747
12748	/ Shortcut for single character strings /
12749	if (length == `1`)
12750	return PyBool_FromLong(
12751	Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, `0`)));
12752
12753	for (i = `0`; i < length; i++) {
12754	if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12755	Py_RETURN_FALSE;
12756	}
12757	}
12758	Py_RETURN_TRUE;
12759	}
12760
12761	/[clinic input]*
12762	str.join as unicode_join
12763
12764	iterable: object
12765	/
12766
12767	Concatenate any number of strings.
12768
12769	The string whose method is called is inserted in between each given string.
12770	The result is returned as a new string.
12771
12772	Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12773	[clinic start generated code]/*
12774
12775	static PyObject *
12776	unicode_join(PyObject self, PyObject iterable)
12777	/[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]/
12778	{
12779	return PyUnicode_Join(self, iterable);
12780	}
12781
12782	static Py_ssize_t
12783	unicode_length(PyObject *self)
12784	{
12785	if (PyUnicode_READY(self) == -`1`)
12786	return -`1`;
12787	return PyUnicode_GET_LENGTH(self);
12788	}
12789
12790	/[clinic input]*
12791	str.ljust as unicode_ljust
12792
12793	width: Py_ssize_t
12794	fillchar: Py_UCS4 = ' '
12795	/
12796
12797	Return a left-justified string of length width.
12798
12799	Padding is done using the specified fill character (default is a space).
12800	[clinic start generated code]/*
12801
12802	static PyObject *
12803	unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12804	/[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]/
12805	{
12806	if (PyUnicode_READY(self) == -`1`)
12807	return NULL;
12808
12809	if (PyUnicode_GET_LENGTH(self) >= width)
12810	return unicode_result_unchanged(self);
12811
12812	return pad(self, `0`, width - PyUnicode_GET_LENGTH(self), fillchar);
12813	}
12814
12815	/[clinic input]*
12816	str.lower as unicode_lower
12817
12818	Return a copy of the string converted to lowercase.
12819	[clinic start generated code]/*
12820
12821	static PyObject *
12822	unicode_lower_impl(PyObject *self)
12823	/[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]/
12824	{
12825	if (PyUnicode_READY(self) == -`1`)
12826	return NULL;
12827	if (PyUnicode_IS_ASCII(self))
12828	return ascii_upper_or_lower(self, `1`);
12829	return case_operation(self, do_lower);
12830	}
12831
/* Strip modes accepted by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip mode above; used in error messages.
   Both the strings and the pointer table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode (LEFTSTRIP/RIGHTSTRIP/BOTHSTRIP) to its method name. */
#define STRIPNAME(i) (stripfuncnames[i])
12840
12841	/ externally visible for str.strip(unicode) /
12842	PyObject *
12843	_PyUnicode_XStrip(PyObject self, int* striptype, PyObject *sepobj)
12844	{
12845	const void *data;
12846	int kind;
12847	Py_ssize_t i, j, len;
12848	BLOOM_MASK sepmask;
12849	Py_ssize_t seplen;
12850
12851	if (PyUnicode_READY(self) == -`1` \|\| PyUnicode_READY(sepobj) == -`1`)
12852	return NULL;
12853
12854	kind = PyUnicode_KIND(self);
12855	data = PyUnicode_DATA(self);
12856	len = PyUnicode_GET_LENGTH(self);
12857	seplen = PyUnicode_GET_LENGTH(sepobj);
12858	sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12859	PyUnicode_DATA(sepobj),
12860	seplen);
12861
12862	i = `0`;
12863	if (striptype != RIGHTSTRIP) {
12864	while (i < len) {
12865	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12866	if (!BLOOM(sepmask, ch))
12867	break;
12868	if (PyUnicode_FindChar(sepobj, ch, `0`, seplen, `1`) < `0`)
12869	break;
12870	i++;
12871	}
12872	}
12873
12874	j = len;
12875	if (striptype != LEFTSTRIP) {
12876	j--;
12877	while (j >= i) {
12878	Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12879	if (!BLOOM(sepmask, ch))
12880	break;
12881	if (PyUnicode_FindChar(sepobj, ch, `0`, seplen, `1`) < `0`)
12882	break;
12883	j--;
12884	}
12885
12886	j++;
12887	}
12888
12889	return PyUnicode_Substring(self, i, j);
12890	}
12891
12892	PyObject*
12893	PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12894	{
12895	const unsigned char *data;
12896	int kind;
12897	Py_ssize_t length;
12898
12899	if (PyUnicode_READY(self) == -`1`)
12900	return NULL;
12901
12902	length = PyUnicode_GET_LENGTH(self);
12903	end = Py_MIN(end, length);
12904
12905	if (start == `0` && end == length)
12906	return unicode_result_unchanged(self);
12907
12908	if (start < `0` \|\| end < `0`) {
12909	PyErr_SetString(PyExc_IndexError, "string index out of range");
12910	return NULL;
12911	}
12912	if (start >= length \|\| end < start)
12913	_Py_RETURN_UNICODE_EMPTY();
12914
12915	length = end - start;
12916	if (PyUnicode_IS_ASCII(self)) {
12917	data = PyUnicode_1BYTE_DATA(self);
12918	return _PyUnicode_FromASCII((const char*)(data + start), length);
12919	}
12920	else {
12921	kind = PyUnicode_KIND(self);
12922	data = PyUnicode_1BYTE_DATA(self);
12923	return PyUnicode_FromKindAndData(kind,
12924	data + kind * start,
12925	length);
12926	}
12927	}
12928
12929	static PyObject *
12930	do_strip(PyObject self, int* striptype)
12931	{
12932	Py_ssize_t len, i, j;
12933
12934	if (PyUnicode_READY(self) == -`1`)
12935	return NULL;
12936
12937	len = PyUnicode_GET_LENGTH(self);
12938
12939	if (PyUnicode_IS_ASCII(self)) {
12940	const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12941
12942	i = `0`;
12943	if (striptype != RIGHTSTRIP) {
12944	while (i < len) {
12945	Py_UCS1 ch = data[i];
12946	if (!_Py_ascii_whitespace[ch])
12947	break;
12948	i++;
12949	}
12950	}
12951
12952	j = len;
12953	if (striptype != LEFTSTRIP) {
12954	j--;
12955	while (j >= i) {
12956	Py_UCS1 ch = data[j];
12957	if (!_Py_ascii_whitespace[ch])
12958	break;
12959	j--;
12960	}
12961	j++;
12962	}
12963	}
12964	else {
12965	int kind = PyUnicode_KIND(self);
12966	const void *data = PyUnicode_DATA(self);
12967
12968	i = `0`;
12969	if (striptype != RIGHTSTRIP) {
12970	while (i < len) {
12971	Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12972	if (!Py_UNICODE_ISSPACE(ch))
12973	break;
12974	i++;
12975	}
12976	}
12977
12978	j = len;
12979	if (striptype != LEFTSTRIP) {
12980	j--;
12981	while (j >= i) {
12982	Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12983	if (!Py_UNICODE_ISSPACE(ch))
12984	break;
12985	j--;
12986	}
12987	j++;
12988	}
12989	}
12990
12991	return PyUnicode_Substring(self, i, j);
12992	}
12993
12994
12995	static PyObject *
12996	do_argstrip(PyObject self, int* striptype, PyObject *sep)
12997	{
12998	if (sep != Py_None) {
12999	if (PyUnicode_Check(sep))
13000	return _PyUnicode_XStrip(self, striptype, sep);
13001	else {
13002	PyErr_Format(PyExc_TypeError,
13003	"%s arg must be None or str",
13004	STRIPNAME(striptype));
13005	return NULL;
13006	}
13007	}
13008
13009	return do_strip(self, striptype);
13010	}
13011
13012
13013	/[clinic input]*
13014	str.strip as unicode_strip
13015
13016	chars: object = None
13017	/
13018
13019	Return a copy of the string with leading and trailing whitespace removed.
13020
13021	If chars is given and not None, remove characters in chars instead.
13022	[clinic start generated code]/*
13023
13024	static PyObject *
13025	unicode_strip_impl(PyObject self, PyObject chars)
13026	/[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]/
13027	{
13028	return do_argstrip(self, BOTHSTRIP, chars);
13029	}
13030
13031
13032	/[clinic input]*
13033	str.lstrip as unicode_lstrip
13034
13035	chars: object = None
13036	/
13037
13038	Return a copy of the string with leading whitespace removed.
13039
13040	If chars is given and not None, remove characters in chars instead.
13041	[clinic start generated code]/*
13042
13043	static PyObject *
13044	unicode_lstrip_impl(PyObject self, PyObject chars)
13045	/[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]/
13046	{
13047	return do_argstrip(self, LEFTSTRIP, chars);
13048	}
13049
13050
13051	/[clinic input]*
13052	str.rstrip as unicode_rstrip
13053
13054	chars: object = None
13055	/
13056
13057	Return a copy of the string with trailing whitespace removed.
13058
13059	If chars is given and not None, remove characters in chars instead.
13060	[clinic start generated code]/*
13061
13062	static PyObject *
13063	unicode_rstrip_impl(PyObject self, PyObject chars)
13064	/[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]/
13065	{
13066	return do_argstrip(self, RIGHTSTRIP, chars);
13067	}
13068
13069
13070	static PyObject*
13071	unicode_repeat(PyObject *str, Py_ssize_t len)
13072	{
13073	PyObject *u;
13074	Py_ssize_t nchars, n;
13075
13076	if (len < `1`)
13077	_Py_RETURN_UNICODE_EMPTY();
13078
13079	/ no repeat, return original string /
13080	if (len == `1`)
13081	return unicode_result_unchanged(str);
13082
13083	if (PyUnicode_READY(str) == -`1`)
13084	return NULL;
13085
13086	if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
13087	PyErr_SetString(PyExc_OverflowError,
13088	"repeated string is too long");
13089	return NULL;
13090	}
13091	nchars = len * PyUnicode_GET_LENGTH(str);
13092
13093	u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
13094	if (!u)
13095	return NULL;
13096	assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
13097
13098	if (PyUnicode_GET_LENGTH(str) == `1`) {
13099	int kind = PyUnicode_KIND(str);
13100	Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), `0`);
13101	if (kind == PyUnicode_1BYTE_KIND) {
13102	void *to = PyUnicode_DATA(u);
13103	memset(to, (unsigned char)fill_char, len);
13104	}
13105	else if (kind == PyUnicode_2BYTE_KIND) {
13106	Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
13107	for (n = `0`; n < len; ++n)
13108	ucs2[n] = fill_char;
13109	} else {
13110	Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13111	assert(kind == PyUnicode_4BYTE_KIND);
13112	for (n = `0`; n < len; ++n)
13113	ucs4[n] = fill_char;
13114	}
13115	}
13116	else {
13117	/ number of characters copied this far /
13118	Py_ssize_t done = PyUnicode_GET_LENGTH(str);
13119	Py_ssize_t char_size = PyUnicode_KIND(str);
13120	char to = (char* *) PyUnicode_DATA(u);
13121	memcpy(to, PyUnicode_DATA(str),
13122	PyUnicode_GET_LENGTH(str) * char_size);
13123	while (done < nchars) {
13124	n = (done <= nchars-done) ? done : nchars-done;
13125	memcpy(to + (done * char_size), to, n * char_size);
13126	done += n;
13127	}
13128	}
13129
13130	assert(_PyUnicode_CheckConsistency(u, `1`));
13131	return u;
13132	}
13133
13134	PyObject *
13135	PyUnicode_Replace(PyObject *str,
13136	PyObject *substr,
13137	PyObject *replstr,
13138	Py_ssize_t maxcount)
13139	{
13140	if (ensure_unicode(str) < `0` \|\| ensure_unicode(substr) < `0` \|\|
13141	ensure_unicode(replstr) < `0`)
13142	return NULL;
13143	return replace(str, substr, replstr, maxcount);
13144	}
13145
13146	/[clinic input]*
13147	str.replace as unicode_replace
13148
13149	old: unicode
13150	new: unicode
13151	count: Py_ssize_t = -1
13152	Maximum number of occurrences to replace.
13153	-1 (the default value) means replace all occurrences.
13154	/
13155
13156	Return a copy with all occurrences of substring old replaced by new.
13157
13158	If the optional argument count is given, only the first count occurrences are
13159	replaced.
13160	[clinic start generated code]/*
13161
13162	static PyObject *
13163	unicode_replace_impl(PyObject self, PyObject old, PyObject *new,
13164	Py_ssize_t count)
13165	/[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]/
13166	{
13167	if (PyUnicode_READY(self) == -`1`)
13168	return NULL;
13169	return replace(self, old, new, count);
13170	}
13171
13172	/[clinic input]*
13173	str.removeprefix as unicode_removeprefix
13174
13175	prefix: unicode
13176	/
13177
13178	Return a str with the given prefix string removed if present.
13179
13180	If the string starts with the prefix string, return string[len(prefix):].
13181	Otherwise, return a copy of the original string.
13182	[clinic start generated code]/*
13183
13184	static PyObject *
13185	unicode_removeprefix_impl(PyObject self, PyObject prefix)
13186	/[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]/
13187	{
13188	int match = tailmatch(self, prefix, `0`, PY_SSIZE_T_MAX, -`1`);
13189	if (match == -`1`) {
13190	return NULL;
13191	}
13192	if (match) {
13193	return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13194	PyUnicode_GET_LENGTH(self));
13195	}
13196	return unicode_result_unchanged(self);
13197	}
13198
13199	/[clinic input]*
13200	str.removesuffix as unicode_removesuffix
13201
13202	suffix: unicode
13203	/
13204
13205	Return a str with the given suffix string removed if present.
13206
13207	If the string ends with the suffix string and that suffix is not empty,
13208	return string[:-len(suffix)]. Otherwise, return a copy of the original
13209	string.
13210	[clinic start generated code]/*
13211
13212	static PyObject *
13213	unicode_removesuffix_impl(PyObject self, PyObject suffix)
13214	/[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]/
13215	{
13216	int match = tailmatch(self, suffix, `0`, PY_SSIZE_T_MAX, +`1`);
13217	if (match == -`1`) {
13218	return NULL;
13219	}
13220	if (match) {
13221	return PyUnicode_Substring(self, `0`, PyUnicode_GET_LENGTH(self)
13222	- PyUnicode_GET_LENGTH(suffix));
13223	}
13224	return unicode_result_unchanged(self);
13225	}
13226
13227	static PyObject *
13228	unicode_repr(PyObject *unicode)
13229	{
13230	PyObject *repr;
13231	Py_ssize_t isize;
13232	Py_ssize_t osize, squote, dquote, i, o;
13233	Py_UCS4 max, quote;
13234	int ikind, okind, unchanged;
13235	const void *idata;
13236	void *odata;
13237
13238	if (PyUnicode_READY(unicode) == -`1`)
13239	return NULL;
13240
13241	isize = PyUnicode_GET_LENGTH(unicode);
13242	idata = PyUnicode_DATA(unicode);
13243
13244	/ Compute length of output, quote characters, and*
13245	maximum character /*
13246	osize = `0`;
13247	max = `127`;
13248	squote = dquote = `0`;
13249	ikind = PyUnicode_KIND(unicode);
13250	for (i = `0`; i < isize; i++) {
13251	Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13252	Py_ssize_t incr = `1`;
13253	switch (ch) {
13254	case `'\''`: squote++; break;
13255	case `'"'`: dquote++; break;
13256	case `'\\'`: case `'\t'`: case `'\r'`: case `'\n'`:
13257	incr = `2`;
13258	break;
13259	default:
13260	/ Fast-path ASCII /
13261	if (ch < `' '` \|\| ch == `0x7f`)
13262	incr = `4`; / \xHH /
13263	else if (ch < `0x7f`)
13264	;
13265	else if (Py_UNICODE_ISPRINTABLE(ch))
13266	max = ch > max ? ch : max;
13267	else if (ch < `0x100`)
13268	incr = `4`; / \xHH /
13269	else if (ch < `0x10000`)
13270	incr = `6`; / \uHHHH /
13271	else
13272	incr = `10`; / \uHHHHHHHH /
13273	}
13274	if (osize > PY_SSIZE_T_MAX - incr) {
13275	PyErr_SetString(PyExc_OverflowError,
13276	"string is too long to generate repr");
13277	return NULL;
13278	}
13279	osize += incr;
13280	}
13281
13282	quote = `'\''`;
13283	unchanged = (osize == isize);
13284	if (squote) {
13285	unchanged = `0`;
13286	if (dquote)
13287	/ Both squote and dquote present. Use squote,*
13288	and escape them /*
13289	osize += squote;
13290	else
13291	quote = `'"'`;
13292	}
13293	osize += `2`; / quotes /
13294
13295	repr = PyUnicode_New(osize, max);
13296	if (repr == NULL)
13297	return NULL;
13298	okind = PyUnicode_KIND(repr);
13299	odata = PyUnicode_DATA(repr);
13300
13301	PyUnicode_WRITE(okind, odata, `0`, quote);
13302	PyUnicode_WRITE(okind, odata, osize-`1`, quote);
13303	if (unchanged) {
13304	_PyUnicode_FastCopyCharacters(repr, `1`,
13305	unicode, `0`,
13306	isize);
13307	}
13308	else {
13309	for (i = `0`, o = `1`; i < isize; i++) {
13310	Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13311
13312	/ Escape quotes and backslashes /
13313	if ((ch == quote) \|\| (ch == `'\\'`)) {
13314	PyUnicode_WRITE(okind, odata, o++, `'\\'`);
13315	PyUnicode_WRITE(okind, odata, o++, ch);
13316	continue;
13317	}
13318
13319	/ Map special whitespace to '\t', \n', '\r' /
13320	if (ch == `'\t'`) {
13321	PyUnicode_WRITE(okind, odata, o++, `'\\'`);
13322	PyUnicode_WRITE(okind, odata, o++, `'t'`);
13323	}
13324	else if (ch == `'\n'`) {
13325	PyUnicode_WRITE(okind, odata, o++, `'\\'`);
13326	PyUnicode_WRITE(okind, odata, o++, `'n'`);
13327	}
13328	else if (ch == `'\r'`) {
13329	PyUnicode_WRITE(okind, odata, o++, `'\\'`);
13330	PyUnicode_WRITE(okind, odata, o++, `'r'`);
13331	}
13332
13333	/ Map non-printable US ASCII to '\xhh' /
13334	else if (ch < `' '` \|\| ch == `0x7F`) {
13335	PyUnicode_WRITE(okind, odata, o++, `'\\'`);
13336	PyUnicode_WRITE(okind, odata, o++, `'x'`);
13337	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `4`) & `0x000F`]);
13338	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & `0x000F`]);
13339	}
13340
13341	/ Copy ASCII characters as-is /
13342	else if (ch < `0x7F`) {
13343	PyUnicode_WRITE(okind, odata, o++, ch);
13344	}
13345
13346	/ Non-ASCII characters /
13347	else {
13348	/ Map Unicode whitespace and control characters*
13349	(categories Z and C* except ASCII space)*
13350	*/
13351	if (!Py_UNICODE_ISPRINTABLE(ch)) {
13352	PyUnicode_WRITE(okind, odata, o++, `'\\'`);
13353	/ Map 8-bit characters to '\xhh' /
13354	if (ch <= `0xff`) {
13355	PyUnicode_WRITE(okind, odata, o++, `'x'`);
13356	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `4`) & `0x000F`]);
13357	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & `0x000F`]);
13358	}
13359	/ Map 16-bit characters to '\uxxxx' /
13360	else if (ch <= `0xffff`) {
13361	PyUnicode_WRITE(okind, odata, o++, `'u'`);
13362	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `12`) & `0xF`]);
13363	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `8`) & `0xF`]);
13364	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `4`) & `0xF`]);
13365	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & `0xF`]);
13366	}
13367	/ Map 21-bit characters to '\U00xxxxxx' /
13368	else {
13369	PyUnicode_WRITE(okind, odata, o++, `'U'`);
13370	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `28`) & `0xF`]);
13371	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `24`) & `0xF`]);
13372	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `20`) & `0xF`]);
13373	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `16`) & `0xF`]);
13374	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `12`) & `0xF`]);
13375	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `8`) & `0xF`]);
13376	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> `4`) & `0xF`]);
13377	PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & `0xF`]);
13378	}
13379	}
13380	/ Copy characters as-is /
13381	else {
13382	PyUnicode_WRITE(okind, odata, o++, ch);
13383	}
13384	}
13385	}
13386	}
13387	/ Closing quote already added at the beginning /
13388	assert(_PyUnicode_CheckConsistency(repr, `1`));
13389	return repr;
13390	}
13391
13392	PyDoc_STRVAR(rfind__doc__,
13393	"S.rfind(sub[, start[, end]]) -> int\n\
13394	\n\
13395	Return the highest index in S where substring sub is found,\n\
13396	such that sub is contained within S[start:end]. Optional\n\
13397	arguments start and end are interpreted as in slice notation.\n\
13398	\n\
13399	Return -1 on failure.");
13400
13401	static PyObject *
13402	unicode_rfind(PyObject self, PyObject args)
13403	{
13404	/ initialize variables to prevent gcc warning /
13405	PyObject *substring = NULL;
13406	Py_ssize_t start = `0`;
13407	Py_ssize_t end = `0`;
13408	Py_ssize_t result;
13409
13410	if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13411	return NULL;
13412
13413	if (PyUnicode_READY(self) == -`1`)
13414	return NULL;
13415
13416	result = any_find_slice(self, substring, start, end, -`1`);
13417
13418	if (result == -`2`)
13419	return NULL;
13420
13421	return PyLong_FromSsize_t(result);
13422	}
13423
13424	PyDoc_STRVAR(rindex__doc__,
13425	"S.rindex(sub[, start[, end]]) -> int\n\
13426	\n\
13427	Return the highest index in S where substring sub is found,\n\
13428	such that sub is contained within S[start:end]. Optional\n\
13429	arguments start and end are interpreted as in slice notation.\n\
13430	\n\
13431	Raises ValueError when the substring is not found.");
13432
13433	static PyObject *
13434	unicode_rindex(PyObject self, PyObject args)
13435	{
13436	/ initialize variables to prevent gcc warning /
13437	PyObject *substring = NULL;
13438	Py_ssize_t start = `0`;
13439	Py_ssize_t end = `0`;
13440	Py_ssize_t result;
13441
13442	if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13443	return NULL;
13444
13445	if (PyUnicode_READY(self) == -`1`)
13446	return NULL;
13447
13448	result = any_find_slice(self, substring, start, end, -`1`);
13449
13450	if (result == -`2`)
13451	return NULL;
13452
13453	if (result < `0`) {
13454	PyErr_SetString(PyExc_ValueError, "substring not found");
13455	return NULL;
13456	}
13457
13458	return PyLong_FromSsize_t(result);
13459	}
13460
13461	/[clinic input]*
13462	str.rjust as unicode_rjust
13463
13464	width: Py_ssize_t
13465	fillchar: Py_UCS4 = ' '
13466	/
13467
13468	Return a right-justified string of length width.
13469
13470	Padding is done using the specified fill character (default is a space).
13471	[clinic start generated code]/*
13472
13473	static PyObject *
13474	unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13475	/[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]/
13476	{
13477	if (PyUnicode_READY(self) == -`1`)
13478	return NULL;
13479
13480	if (PyUnicode_GET_LENGTH(self) >= width)
13481	return unicode_result_unchanged(self);
13482
13483	return pad(self, width - PyUnicode_GET_LENGTH(self), `0`, fillchar);
13484	}
13485
13486	PyObject *
13487	PyUnicode_Split(PyObject s, PyObject sep, Py_ssize_t maxsplit)
13488	{
13489	if (ensure_unicode(s) < `0` \|\| (sep != NULL && ensure_unicode(sep) < `0`))
13490	return NULL;
13491
13492	return split(s, sep, maxsplit);
13493	}
13494
13495	/[clinic input]*
13496	str.split as unicode_split
13497
13498	sep: object = None
13499	The separator used to split the string.
13500
13501	When set to None (the default value), will split on any whitespace
13502	character (including \\n \\r \\t \\f and spaces) and will discard
13503	empty strings from the result.
13504	maxsplit: Py_ssize_t = -1
13505	Maximum number of splits (starting from the left).
13506	-1 (the default value) means no limit.
13507
13508	Return a list of the substrings in the string, using sep as the separator string.
13509
13510	Note, str.split() is mainly useful for data that has been intentionally
13511	delimited. With natural text that includes punctuation, consider using
13512	the regular expression module.
13513
13514	[clinic start generated code]/*
13515
13516	static PyObject *
13517	unicode_split_impl(PyObject self, PyObject sep, Py_ssize_t maxsplit)
13518	/[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]/
13519	{
13520	if (sep == Py_None)
13521	return split(self, NULL, maxsplit);
13522	if (PyUnicode_Check(sep))
13523	return split(self, sep, maxsplit);
13524
13525	PyErr_Format(PyExc_TypeError,
13526	"must be str or None, not %.100s",
13527	Py_TYPE(sep)->tp_name);
13528	return NULL;
13529	}
13530
13531	PyObject *
13532	PyUnicode_Partition(PyObject str_obj, PyObject sep_obj)
13533	{
13534	PyObject* out;
13535	int kind1, kind2;
13536	const void buf1, buf2;
13537	Py_ssize_t len1, len2;
13538
13539	if (ensure_unicode(str_obj) < `0` \|\| ensure_unicode(sep_obj) < `0`)
13540	return NULL;
13541
13542	kind1 = PyUnicode_KIND(str_obj);
13543	kind2 = PyUnicode_KIND(sep_obj);
13544	len1 = PyUnicode_GET_LENGTH(str_obj);
13545	len2 = PyUnicode_GET_LENGTH(sep_obj);
13546	if (kind1 < kind2 \|\| len1 < len2) {
13547	PyObject empty = unicode_get_empty(); // Borrowed reference*
13548	return PyTuple_Pack(`3`, str_obj, empty, empty);
13549	}
13550	buf1 = PyUnicode_DATA(str_obj);
13551	buf2 = PyUnicode_DATA(sep_obj);
13552	if (kind2 != kind1) {
13553	buf2 = unicode_askind(kind2, buf2, len2, kind1);
13554	if (!buf2)
13555	return NULL;
13556	}
13557
13558	switch (kind1) {
13559	case PyUnicode_1BYTE_KIND:
13560	if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13561	out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13562	else
13563	out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13564	break;
13565	case PyUnicode_2BYTE_KIND:
13566	out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13567	break;
13568	case PyUnicode_4BYTE_KIND:
13569	out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13570	break;
13571	default:
13572	Py_UNREACHABLE();
13573	}
13574
13575	assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13576	if (kind2 != kind1)
13577	PyMem_Free((void *)buf2);
13578
13579	return out;
13580	}
13581
13582
13583	PyObject *
13584	PyUnicode_RPartition(PyObject str_obj, PyObject sep_obj)
13585	{
13586	PyObject* out;
13587	int kind1, kind2;
13588	const void buf1, buf2;
13589	Py_ssize_t len1, len2;
13590
13591	if (ensure_unicode(str_obj) < `0` \|\| ensure_unicode(sep_obj) < `0`)
13592	return NULL;
13593
13594	kind1 = PyUnicode_KIND(str_obj);
13595	kind2 = PyUnicode_KIND(sep_obj);
13596	len1 = PyUnicode_GET_LENGTH(str_obj);
13597	len2 = PyUnicode_GET_LENGTH(sep_obj);
13598	if (kind1 < kind2 \|\| len1 < len2) {
13599	PyObject empty = unicode_get_empty(); // Borrowed reference*
13600	return PyTuple_Pack(`3`, empty, empty, str_obj);
13601	}
13602	buf1 = PyUnicode_DATA(str_obj);
13603	buf2 = PyUnicode_DATA(sep_obj);
13604	if (kind2 != kind1) {
13605	buf2 = unicode_askind(kind2, buf2, len2, kind1);
13606	if (!buf2)
13607	return NULL;
13608	}
13609
13610	switch (kind1) {
13611	case PyUnicode_1BYTE_KIND:
13612	if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13613	out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13614	else
13615	out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13616	break;
13617	case PyUnicode_2BYTE_KIND:
13618	out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13619	break;
13620	case PyUnicode_4BYTE_KIND:
13621	out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13622	break;
13623	default:
13624	Py_UNREACHABLE();
13625	}
13626
13627	assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13628	if (kind2 != kind1)
13629	PyMem_Free((void *)buf2);
13630
13631	return out;
13632	}
13633
13634	/[clinic input]*
13635	str.partition as unicode_partition
13636
13637	sep: object
13638	/
13639
13640	Partition the string into three parts using the given separator.
13641
13642	This will search for the separator in the string. If the separator is found,
13643	returns a 3-tuple containing the part before the separator, the separator
13644	itself, and the part after it.
13645
13646	If the separator is not found, returns a 3-tuple containing the original string
13647	and two empty strings.
13648	[clinic start generated code]/*
13649
13650	static PyObject *
13651	unicode_partition(PyObject self, PyObject sep)
13652	/[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]/
13653	{
13654	return PyUnicode_Partition(self, sep);
13655	}
13656
13657	/[clinic input]*
13658	str.rpartition as unicode_rpartition = str.partition
13659
13660	Partition the string into three parts using the given separator.
13661
13662	This will search for the separator in the string, starting at the end. If
13663	the separator is found, returns a 3-tuple containing the part before the
13664	separator, the separator itself, and the part after it.
13665
13666	If the separator is not found, returns a 3-tuple containing two empty strings
13667	and the original string.
13668	[clinic start generated code]/*
13669
13670	static PyObject *
13671	unicode_rpartition(PyObject self, PyObject sep)
13672	/[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]/
13673	{
13674	return PyUnicode_RPartition(self, sep);
13675	}
13676
13677	PyObject *
13678	PyUnicode_RSplit(PyObject s, PyObject sep, Py_ssize_t maxsplit)
13679	{
13680	if (ensure_unicode(s) < `0` \|\| (sep != NULL && ensure_unicode(sep) < `0`))
13681	return NULL;
13682
13683	return rsplit(s, sep, maxsplit);
13684	}
13685
13686	/[clinic input]*
13687	str.rsplit as unicode_rsplit = str.split
13688
13689	Return a list of the substrings in the string, using sep as the separator string.
13690
13691	Splitting starts at the end of the string and works to the front.
13692	[clinic start generated code]/*
13693
13694	static PyObject *
13695	unicode_rsplit_impl(PyObject self, PyObject sep, Py_ssize_t maxsplit)
13696	/[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]/
13697	{
13698	if (sep == Py_None)
13699	return rsplit(self, NULL, maxsplit);
13700	if (PyUnicode_Check(sep))
13701	return rsplit(self, sep, maxsplit);
13702
13703	PyErr_Format(PyExc_TypeError,
13704	"must be str or None, not %.100s",
13705	Py_TYPE(sep)->tp_name);
13706	return NULL;
13707	}
13708
13709	/[clinic input]*
13710	str.splitlines as unicode_splitlines
13711
13712	keepends: bool(accept={int}) = False
13713
13714	Return a list of the lines in the string, breaking at line boundaries.
13715
13716	Line breaks are not included in the resulting list unless keepends is given and
13717	true.
13718	[clinic start generated code]/*
13719
13720	static PyObject *
13721	unicode_splitlines_impl(PyObject self, int* keepends)
13722	/[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]/
13723	{
13724	return PyUnicode_Splitlines(self, keepends);
13725	}
13726
13727	static
13728	PyObject unicode_str(PyObject self)
13729	{
13730	return unicode_result_unchanged(self);
13731	}
13732
13733	/[clinic input]*
13734	str.swapcase as unicode_swapcase
13735
13736	Convert uppercase characters to lowercase and lowercase characters to uppercase.
13737	[clinic start generated code]/*
13738
13739	static PyObject *
13740	unicode_swapcase_impl(PyObject *self)
13741	/[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]/
13742	{
13743	if (PyUnicode_READY(self) == -`1`)
13744	return NULL;
13745	return case_operation(self, do_swapcase);
13746	}
13747
13748	/[clinic input]*
13749
13750	@staticmethod
13751	str.maketrans as unicode_maketrans
13752
13753	x: object
13754
13755	y: unicode=NULL
13756
13757	z: unicode=NULL
13758
13759	/
13760
13761	Return a translation table usable for str.translate().
13762
13763	If there is only one argument, it must be a dictionary mapping Unicode
13764	ordinals (integers) or characters to Unicode ordinals, strings or None.
13765	Character keys will be then converted to ordinals.
13766	If there are two arguments, they must be strings of equal length, and
13767	in the resulting dictionary, each character in x will be mapped to the
13768	character at the same position in y. If there is a third argument, it
13769	must be a string, whose characters will be mapped to None in the result.
13770	[clinic start generated code]/*
13771
13772	static PyObject *
13773	unicode_maketrans_impl(PyObject x, PyObject y, PyObject *z)
13774	/[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]/
13775	{
13776	PyObject new = NULL, key, *value;
13777	Py_ssize_t i = `0`;
13778	int res;
13779
13780	new = PyDict_New();
13781	if (!new)
13782	return NULL;
13783	if (y != NULL) {
13784	int x_kind, y_kind, z_kind;
13785	const void x_data, y_data, *z_data;
13786
13787	/ x must be a string too, of equal length /
13788	if (!PyUnicode_Check(x)) {
13789	PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13790	"be a string if there is a second argument");
13791	goto err;
13792	}
13793	if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13794	PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13795	"arguments must have equal length");
13796	goto err;
13797	}
13798	/ create entries for translating chars in x to those in y /
13799	x_kind = PyUnicode_KIND(x);
13800	y_kind = PyUnicode_KIND(y);
13801	x_data = PyUnicode_DATA(x);
13802	y_data = PyUnicode_DATA(y);
13803	for (i = `0`; i < PyUnicode_GET_LENGTH(x); i++) {
13804	key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13805	if (!key)
13806	goto err;
13807	value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13808	if (!value) {
13809	Py_DECREF(key);
13810	goto err;
13811	}
13812	res = PyDict_SetItem(new, key, value);
13813	Py_DECREF(key);
13814	Py_DECREF(value);
13815	if (res < `0`)
13816	goto err;
13817	}
13818	/ create entries for deleting chars in z /
13819	if (z != NULL) {
13820	z_kind = PyUnicode_KIND(z);
13821	z_data = PyUnicode_DATA(z);
13822	for (i = `0`; i < PyUnicode_GET_LENGTH(z); i++) {
13823	key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13824	if (!key)
13825	goto err;
13826	res = PyDict_SetItem(new, key, Py_None);
13827	Py_DECREF(key);
13828	if (res < `0`)
13829	goto err;
13830	}
13831	}
13832	} else {
13833	int kind;
13834	const void *data;
13835
13836	/ x must be a dict /
13837	if (!PyDict_CheckExact(x)) {
13838	PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13839	"to maketrans it must be a dict");
13840	goto err;
13841	}
13842	/ copy entries into the new dict, converting string keys to int keys /
13843	while (PyDict_Next(x, &i, &key, &value)) {
13844	if (PyUnicode_Check(key)) {
13845	/ convert string keys to integer keys /
13846	PyObject *newkey;
13847	if (PyUnicode_GET_LENGTH(key) != `1`) {
13848	PyErr_SetString(PyExc_ValueError, "string keys in translate "
13849	"table must be of length 1");
13850	goto err;
13851	}
13852	kind = PyUnicode_KIND(key);
13853	data = PyUnicode_DATA(key);
13854	newkey = PyLong_FromLong(PyUnicode_READ(kind, data, `0`));
13855	if (!newkey)
13856	goto err;
13857	res = PyDict_SetItem(new, newkey, value);
13858	Py_DECREF(newkey);
13859	if (res < `0`)
13860	goto err;
13861	} else if (PyLong_Check(key)) {
13862	/ just keep integer keys /
13863	if (PyDict_SetItem(new, key, value) < `0`)
13864	goto err;
13865	} else {
13866	PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13867	"be strings or integers");
13868	goto err;
13869	}
13870	}
13871	}
13872	return new;
13873	err:
13874	Py_DECREF(new);
13875	return NULL;
13876	}
13877
13878	/[clinic input]*
13879	str.translate as unicode_translate
13880
13881	table: object
13882	Translation table, which must be a mapping of Unicode ordinals to
13883	Unicode ordinals, strings, or None.
13884	/
13885
13886	Replace each character in the string using the given translation table.
13887
13888	The table must implement lookup/indexing via __getitem__, for instance a
13889	dictionary or list. If this operation raises LookupError, the character is
13890	left untouched. Characters mapped to None are deleted.
13891	[clinic start generated code]/*
13892
13893	static PyObject *
13894	unicode_translate(PyObject self, PyObject table)
13895	/[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]/
13896	{
13897	return _PyUnicode_TranslateCharmap(self, table, "ignore");
13898	}
13899
13900	/[clinic input]*
13901	str.upper as unicode_upper
13902
13903	Return a copy of the string converted to uppercase.
13904	[clinic start generated code]/*
13905
13906	static PyObject *
13907	unicode_upper_impl(PyObject *self)
13908	/[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]/
13909	{
13910	if (PyUnicode_READY(self) == -`1`)
13911	return NULL;
13912	if (PyUnicode_IS_ASCII(self))
13913	return ascii_upper_or_lower(self, `0`);
13914	return case_operation(self, do_upper);
13915	}
13916
13917	/[clinic input]*
13918	str.zfill as unicode_zfill
13919
13920	width: Py_ssize_t
13921	/
13922
13923	Pad a numeric string with zeros on the left, to fill a field of the given width.
13924
13925	The string is never truncated.
13926	[clinic start generated code]/*
13927
13928	static PyObject *
13929	unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13930	/[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]/
13931	{
13932	Py_ssize_t fill;
13933	PyObject *u;
13934	int kind;
13935	const void *data;
13936	Py_UCS4 chr;
13937
13938	if (PyUnicode_READY(self) == -`1`)
13939	return NULL;
13940
13941	if (PyUnicode_GET_LENGTH(self) >= width)
13942	return unicode_result_unchanged(self);
13943
13944	fill = width - PyUnicode_GET_LENGTH(self);
13945
13946	u = pad(self, fill, `0`, `'0'`);
13947
13948	if (u == NULL)
13949	return NULL;
13950
13951	kind = PyUnicode_KIND(u);
13952	data = PyUnicode_DATA(u);
13953	chr = PyUnicode_READ(kind, data, fill);
13954
13955	if (chr == `'+'` \|\| chr == `'-'`) {
13956	/ move sign to beginning of string /
13957	PyUnicode_WRITE(kind, data, `0`, chr);
13958	PyUnicode_WRITE(kind, data, fill, `'0'`);
13959	}
13960
13961	assert(_PyUnicode_CheckConsistency(u, `1`));
13962	return u;
13963	}
13964
#if 0
/* Compiled out by the surrounding #if 0.  When enabled it is a thin wrapper
   around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13972
13973	PyDoc_STRVAR(startswith__doc__,
13974	"S.startswith(prefix[, start[, end]]) -> bool\n\
13975	\n\
13976	Return True if S starts with the specified prefix, False otherwise.\n\
13977	With optional start, test S beginning at that position.\n\
13978	With optional end, stop comparing S at that position.\n\
13979	prefix can also be a tuple of strings to try.");
13980
13981	static PyObject *
13982	unicode_startswith(PyObject *self,
13983	PyObject *args)
13984	{
13985	PyObject *subobj;
13986	PyObject *substring;
13987	Py_ssize_t start = `0`;
13988	Py_ssize_t end = PY_SSIZE_T_MAX;
13989	int result;
13990
13991	if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13992	return NULL;
13993	if (PyTuple_Check(subobj)) {
13994	Py_ssize_t i;
13995	for (i = `0`; i < PyTuple_GET_SIZE(subobj); i++) {
13996	substring = PyTuple_GET_ITEM(subobj, i);
13997	if (!PyUnicode_Check(substring)) {
13998	PyErr_Format(PyExc_TypeError,
13999	"tuple for startswith must only contain str, "
14000	"not %.100s",
14001	Py_TYPE(substring)->tp_name);
14002	return NULL;
14003	}
14004	result = tailmatch(self, substring, start, end, -`1`);
14005	if (result == -`1`)
14006	return NULL;
14007	if (result) {
14008	Py_RETURN_TRUE;
14009	}
14010	}
14011	/ nothing matched /
14012	Py_RETURN_FALSE;
14013	}
14014	if (!PyUnicode_Check(subobj)) {
14015	PyErr_Format(PyExc_TypeError,
14016	"startswith first arg must be str or "
14017	"a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14018	return NULL;
14019	}
14020	result = tailmatch(self, subobj, start, end, -`1`);
14021	if (result == -`1`)
14022	return NULL;
14023	return PyBool_FromLong(result);
14024	}
14025
14026
14027	PyDoc_STRVAR(endswith__doc__,
14028	"S.endswith(suffix[, start[, end]]) -> bool\n\
14029	\n\
14030	Return True if S ends with the specified suffix, False otherwise.\n\
14031	With optional start, test S beginning at that position.\n\
14032	With optional end, stop comparing S at that position.\n\
14033	suffix can also be a tuple of strings to try.");
14034
14035	static PyObject *
14036	unicode_endswith(PyObject *self,
14037	PyObject *args)
14038	{
14039	PyObject *subobj;
14040	PyObject *substring;
14041	Py_ssize_t start = `0`;
14042	Py_ssize_t end = PY_SSIZE_T_MAX;
14043	int result;
14044
14045	if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
14046	return NULL;
14047	if (PyTuple_Check(subobj)) {
14048	Py_ssize_t i;
14049	for (i = `0`; i < PyTuple_GET_SIZE(subobj); i++) {
14050	substring = PyTuple_GET_ITEM(subobj, i);
14051	if (!PyUnicode_Check(substring)) {
14052	PyErr_Format(PyExc_TypeError,
14053	"tuple for endswith must only contain str, "
14054	"not %.100s",
14055	Py_TYPE(substring)->tp_name);
14056	return NULL;
14057	}
14058	result = tailmatch(self, substring, start, end, +`1`);
14059	if (result == -`1`)
14060	return NULL;
14061	if (result) {
14062	Py_RETURN_TRUE;
14063	}
14064	}
14065	Py_RETURN_FALSE;
14066	}
14067	if (!PyUnicode_Check(subobj)) {
14068	PyErr_Format(PyExc_TypeError,
14069	"endswith first arg must be str or "
14070	"a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14071	return NULL;
14072	}
14073	result = tailmatch(self, subobj, start, end, +`1`);
14074	if (result == -`1`)
14075	return NULL;
14076	return PyBool_FromLong(result);
14077	}
14078
14079	static inline void
14080	_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
14081	{
14082	writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14083	writer->data = PyUnicode_DATA(writer->buffer);
14084
14085	if (!writer->readonly) {
14086	writer->kind = PyUnicode_KIND(writer->buffer);
14087	writer->size = PyUnicode_GET_LENGTH(writer->buffer);
14088	}
14089	else {
14090	/ use a value smaller than PyUnicode_1BYTE_KIND() so*
14091	_PyUnicodeWriter_PrepareKind() will copy the buffer. /*
14092	writer->kind = PyUnicode_WCHAR_KIND;
14093	assert(writer->kind <= PyUnicode_1BYTE_KIND);
14094
14095	/ Copy-on-write mode: set buffer size to 0 so*
14096	* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14097	* next write. */
14098	writer->size = `0`;
14099	}
14100	}
14101
14102	void
14103	_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
14104	{
14105	memset(writer, `0`, sizeof(*writer));
14106
14107	/ ASCII is the bare minimum /
14108	writer->min_char = `127`;
14109
14110	/ use a value smaller than PyUnicode_1BYTE_KIND() so*
14111	_PyUnicodeWriter_PrepareKind() will copy the buffer. /*
14112	writer->kind = PyUnicode_WCHAR_KIND;
14113	assert(writer->kind <= PyUnicode_1BYTE_KIND);
14114	}
14115
14116	// Initialize _PyUnicodeWriter with initial buffer
14117	static inline void
14118	_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter writer, PyObject buffer)
14119	{
14120	memset(writer, `0`, sizeof(*writer));
14121	writer->buffer = buffer;
14122	_PyUnicodeWriter_Update(writer);
14123	writer->min_length = writer->size;
14124	}
14125
14126	int
14127	_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14128	Py_ssize_t length, Py_UCS4 maxchar)
14129	{
14130	Py_ssize_t newlen;
14131	PyObject *newbuffer;
14132
14133	assert(maxchar <= MAX_UNICODE);
14134
14135	/ ensure that the _PyUnicodeWriter_Prepare macro was used /
14136	assert((maxchar > writer->maxchar && length >= `0`)
14137	\|\| length > `0`);
14138
14139	if (length > PY_SSIZE_T_MAX - writer->pos) {
14140	PyErr_NoMemory();
14141	return -`1`;
14142	}
14143	newlen = writer->pos + length;
14144
14145	maxchar = Py_MAX(maxchar, writer->min_char);
14146
14147	if (writer->buffer == NULL) {
14148	assert(!writer->readonly);
14149	if (writer->overallocate
14150	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14151	/ overallocate to limit the number of realloc() /
14152	newlen += newlen / OVERALLOCATE_FACTOR;
14153	}
14154	if (newlen < writer->min_length)
14155	newlen = writer->min_length;
14156
14157	writer->buffer = PyUnicode_New(newlen, maxchar);
14158	if (writer->buffer == NULL)
14159	return -`1`;
14160	}
14161	else if (newlen > writer->size) {
14162	if (writer->overallocate
14163	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14164	/ overallocate to limit the number of realloc() /
14165	newlen += newlen / OVERALLOCATE_FACTOR;
14166	}
14167	if (newlen < writer->min_length)
14168	newlen = writer->min_length;
14169
14170	if (maxchar > writer->maxchar \|\| writer->readonly) {
14171	/ resize + widen /
14172	maxchar = Py_MAX(maxchar, writer->maxchar);
14173	newbuffer = PyUnicode_New(newlen, maxchar);
14174	if (newbuffer == NULL)
14175	return -`1`;
14176	_PyUnicode_FastCopyCharacters(newbuffer, `0`,
14177	writer->buffer, `0`, writer->pos);
14178	Py_DECREF(writer->buffer);
14179	writer->readonly = `0`;
14180	}
14181	else {
14182	newbuffer = resize_compact(writer->buffer, newlen);
14183	if (newbuffer == NULL)
14184	return -`1`;
14185	}
14186	writer->buffer = newbuffer;
14187	}
14188	else if (maxchar > writer->maxchar) {
14189	assert(!writer->readonly);
14190	newbuffer = PyUnicode_New(writer->size, maxchar);
14191	if (newbuffer == NULL)
14192	return -`1`;
14193	_PyUnicode_FastCopyCharacters(newbuffer, `0`,
14194	writer->buffer, `0`, writer->pos);
14195	Py_SETREF(writer->buffer, newbuffer);
14196	}
14197	_PyUnicodeWriter_Update(writer);
14198	return `0`;
14199
14200	#undef OVERALLOCATE_FACTOR
14201	}
14202
14203	int
14204	_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14205	enum PyUnicode_Kind kind)
14206	{
14207	Py_UCS4 maxchar;
14208
14209	/ ensure that the _PyUnicodeWriter_PrepareKind macro was used /
14210	assert(writer->kind < kind);
14211
14212	switch (kind)
14213	{
14214	case PyUnicode_1BYTE_KIND: maxchar = `0xff`; break;
14215	case PyUnicode_2BYTE_KIND: maxchar = `0xffff`; break;
14216	case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
14217	default:
14218	Py_UNREACHABLE();
14219	}
14220
14221	return _PyUnicodeWriter_PrepareInternal(writer, `0`, maxchar);
14222	}
14223
14224	static inline int
14225	_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
14226	{
14227	assert(ch <= MAX_UNICODE);
14228	if (_PyUnicodeWriter_Prepare(writer, `1`, ch) < `0`)
14229	return -`1`;
14230	PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14231	writer->pos++;
14232	return `0`;
14233	}
14234
14235	int
14236	_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14237	{
14238	return _PyUnicodeWriter_WriteCharInline(writer, ch);
14239	}
14240
14241	int
14242	_PyUnicodeWriter_WriteStr(_PyUnicodeWriter writer, PyObject str)
14243	{
14244	Py_UCS4 maxchar;
14245	Py_ssize_t len;
14246
14247	if (PyUnicode_READY(str) == -`1`)
14248	return -`1`;
14249	len = PyUnicode_GET_LENGTH(str);
14250	if (len == `0`)
14251	return `0`;
14252	maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14253	if (maxchar > writer->maxchar \|\| len > writer->size - writer->pos) {
14254	if (writer->buffer == NULL && !writer->overallocate) {
14255	assert(_PyUnicode_CheckConsistency(str, `1`));
14256	writer->readonly = `1`;
14257	Py_INCREF(str);
14258	writer->buffer = str;
14259	_PyUnicodeWriter_Update(writer);
14260	writer->pos += len;
14261	return `0`;
14262	}
14263	if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -`1`)
14264	return -`1`;
14265	}
14266	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14267	str, `0`, len);
14268	writer->pos += len;
14269	return `0`;
14270	}
14271
14272	int
14273	_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter writer, PyObject str,
14274	Py_ssize_t start, Py_ssize_t end)
14275	{
14276	Py_UCS4 maxchar;
14277	Py_ssize_t len;
14278
14279	if (PyUnicode_READY(str) == -`1`)
14280	return -`1`;
14281
14282	assert(`0` <= start);
14283	assert(end <= PyUnicode_GET_LENGTH(str));
14284	assert(start <= end);
14285
14286	if (end == `0`)
14287	return `0`;
14288
14289	if (start == `0` && end == PyUnicode_GET_LENGTH(str))
14290	return _PyUnicodeWriter_WriteStr(writer, str);
14291
14292	if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14293	maxchar = _PyUnicode_FindMaxChar(str, start, end);
14294	else
14295	maxchar = writer->maxchar;
14296	len = end - start;
14297
14298	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < `0`)
14299	return -`1`;
14300
14301	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14302	str, start, len);
14303	writer->pos += len;
14304	return `0`;
14305	}
14306
14307	int
14308	_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14309	const char *ascii, Py_ssize_t len)
14310	{
14311	if (len == -`1`)
14312	len = strlen(ascii);
14313
14314	assert(ucs1lib_find_max_char((const Py_UCS1)ascii, (const* Py_UCS1*)ascii + len) < `128`);
14315
14316	if (writer->buffer == NULL && !writer->overallocate) {
14317	PyObject *str;
14318
14319	str = _PyUnicode_FromASCII(ascii, len);
14320	if (str == NULL)
14321	return -`1`;
14322
14323	writer->readonly = `1`;
14324	writer->buffer = str;
14325	_PyUnicodeWriter_Update(writer);
14326	writer->pos += len;
14327	return `0`;
14328	}
14329
14330	if (_PyUnicodeWriter_Prepare(writer, len, `127`) == -`1`)
14331	return -`1`;
14332
14333	switch (writer->kind)
14334	{
14335	case PyUnicode_1BYTE_KIND:
14336	{
14337	const Py_UCS1 str = (const* Py_UCS1 *)ascii;
14338	Py_UCS1 *data = writer->data;
14339
14340	memcpy(data + writer->pos, str, len);
14341	break;
14342	}
14343	case PyUnicode_2BYTE_KIND:
14344	{
14345	_PyUnicode_CONVERT_BYTES(
14346	Py_UCS1, Py_UCS2,
14347	ascii, ascii + len,
14348	(Py_UCS2 *)writer->data + writer->pos);
14349	break;
14350	}
14351	case PyUnicode_4BYTE_KIND:
14352	{
14353	_PyUnicode_CONVERT_BYTES(
14354	Py_UCS1, Py_UCS4,
14355	ascii, ascii + len,
14356	(Py_UCS4 *)writer->data + writer->pos);
14357	break;
14358	}
14359	default:
14360	Py_UNREACHABLE();
14361	}
14362
14363	writer->pos += len;
14364	return `0`;
14365	}
14366
14367	int
14368	_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14369	const char *str, Py_ssize_t len)
14370	{
14371	Py_UCS4 maxchar;
14372
14373	maxchar = ucs1lib_find_max_char((const Py_UCS1)str, (const* Py_UCS1*)str + len);
14374	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -`1`)
14375	return -`1`;
14376	unicode_write_cstr(writer->buffer, writer->pos, str, len);
14377	writer->pos += len;
14378	return `0`;
14379	}
14380
14381	PyObject *
14382	_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14383	{
14384	PyObject *str;
14385
14386	if (writer->pos == `0`) {
14387	Py_CLEAR(writer->buffer);
14388	_Py_RETURN_UNICODE_EMPTY();
14389	}
14390
14391	str = writer->buffer;
14392	writer->buffer = NULL;
14393
14394	if (writer->readonly) {
14395	assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14396	return str;
14397	}
14398
14399	if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14400	PyObject *str2;
14401	str2 = resize_compact(str, writer->pos);
14402	if (str2 == NULL) {
14403	Py_DECREF(str);
14404	return NULL;
14405	}
14406	str = str2;
14407	}
14408
14409	assert(_PyUnicode_CheckConsistency(str, `1`));
14410	return unicode_result_ready(str);
14411	}
14412
14413	void
14414	_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14415	{
14416	Py_CLEAR(writer->buffer);
14417	}
14418
14419	#include "stringlib/unicode_format.h"
14420
14421	PyDoc_STRVAR(format__doc__,
14422	"S.format(args, *kwargs) -> str\n\
14423	\n\
14424	Return a formatted version of S, using substitutions from args and kwargs.\n\
14425	The substitutions are identified by braces ('{' and '}').");
14426
14427	PyDoc_STRVAR(format_map__doc__,
14428	"S.format_map(mapping) -> str\n\
14429	\n\
14430	Return a formatted version of S, using substitutions from mapping.\n\
14431	The substitutions are identified by braces ('{' and '}').");
14432
14433	/[clinic input]*
14434	str.__format__ as unicode___format__
14435
14436	format_spec: unicode
14437	/
14438
14439	Return a formatted version of the string as described by format_spec.
14440	[clinic start generated code]/*
14441
14442	static PyObject *
14443	unicode___format___impl(PyObject self, PyObject format_spec)
14444	/[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]/
14445	{
14446	_PyUnicodeWriter writer;
14447	int ret;
14448
14449	if (PyUnicode_READY(self) == -`1`)
14450	return NULL;
14451	_PyUnicodeWriter_Init(&writer);
14452	ret = _PyUnicode_FormatAdvancedWriter(&writer,
14453	self, format_spec, `0`,
14454	PyUnicode_GET_LENGTH(format_spec));
14455	if (ret == -`1`) {
14456	_PyUnicodeWriter_Dealloc(&writer);
14457	return NULL;
14458	}
14459	return _PyUnicodeWriter_Finish(&writer);
14460	}
14461
14462	/[clinic input]*
14463	str.__sizeof__ as unicode_sizeof
14464
14465	Return the size of the string in memory, in bytes.
14466	[clinic start generated code]/*
14467
14468	static PyObject *
14469	unicode_sizeof_impl(PyObject *self)
14470	/[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]/
14471	{
14472	Py_ssize_t size;
14473
14474	/ If it's a compact object, account for base structure +*
14475	character data. /*
14476	if (PyUnicode_IS_COMPACT_ASCII(self))
14477	size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + `1`;
14478	else if (PyUnicode_IS_COMPACT(self))
14479	size = sizeof(PyCompactUnicodeObject) +
14480	(PyUnicode_GET_LENGTH(self) + `1`) * PyUnicode_KIND(self);
14481	else {
14482	/ If it is a two-block object, account for base object, and*
14483	for character block if present. /*
14484	size = sizeof(PyUnicodeObject);
14485	if (_PyUnicode_DATA_ANY(self))
14486	size += (PyUnicode_GET_LENGTH(self) + `1`) *
14487	PyUnicode_KIND(self);
14488	}
14489	/ If the wstr pointer is present, account for it unless it is shared*
14490	with the data pointer. Check if the data is not shared. /*
14491	if (_PyUnicode_HAS_WSTR_MEMORY(self))
14492	size += (PyUnicode_WSTR_LENGTH(self) + `1`) * sizeof(wchar_t);
14493	if (_PyUnicode_HAS_UTF8_MEMORY(self))
14494	size += PyUnicode_UTF8_LENGTH(self) + `1`;
14495
14496	return PyLong_FromSsize_t(size);
14497	}
14498
14499	static PyObject *
14500	unicode_getnewargs(PyObject v, PyObject Py_UNUSED(ignored))
14501	{
14502	PyObject *copy = _PyUnicode_Copy(v);
14503	if (!copy)
14504	return NULL;
14505	return Py_BuildValue("(N)", copy);
14506	}
14507
14508	static PyMethodDef unicode_methods[] = {
14509	UNICODE_ENCODE_METHODDEF
14510	UNICODE_REPLACE_METHODDEF
14511	UNICODE_SPLIT_METHODDEF
14512	UNICODE_RSPLIT_METHODDEF
14513	UNICODE_JOIN_METHODDEF
14514	UNICODE_CAPITALIZE_METHODDEF
14515	UNICODE_CASEFOLD_METHODDEF
14516	UNICODE_TITLE_METHODDEF
14517	UNICODE_CENTER_METHODDEF
14518	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14519	UNICODE_EXPANDTABS_METHODDEF
14520	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14521	UNICODE_PARTITION_METHODDEF
14522	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14523	UNICODE_LJUST_METHODDEF
14524	UNICODE_LOWER_METHODDEF
14525	UNICODE_LSTRIP_METHODDEF
14526	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14527	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14528	UNICODE_RJUST_METHODDEF
14529	UNICODE_RSTRIP_METHODDEF
14530	UNICODE_RPARTITION_METHODDEF
14531	UNICODE_SPLITLINES_METHODDEF
14532	UNICODE_STRIP_METHODDEF
14533	UNICODE_SWAPCASE_METHODDEF
14534	UNICODE_TRANSLATE_METHODDEF
14535	UNICODE_UPPER_METHODDEF
14536	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14537	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14538	UNICODE_REMOVEPREFIX_METHODDEF
14539	UNICODE_REMOVESUFFIX_METHODDEF
14540	UNICODE_ISASCII_METHODDEF
14541	UNICODE_ISLOWER_METHODDEF
14542	UNICODE_ISUPPER_METHODDEF
14543	UNICODE_ISTITLE_METHODDEF
14544	UNICODE_ISSPACE_METHODDEF
14545	UNICODE_ISDECIMAL_METHODDEF
14546	UNICODE_ISDIGIT_METHODDEF
14547	UNICODE_ISNUMERIC_METHODDEF
14548	UNICODE_ISALPHA_METHODDEF
14549	UNICODE_ISALNUM_METHODDEF
14550	UNICODE_ISIDENTIFIER_METHODDEF
14551	UNICODE_ISPRINTABLE_METHODDEF
14552	UNICODE_ZFILL_METHODDEF
14553	{"format", (PyCFunction)(void()(void*)) do_string_format, METH_VARARGS \| METH_KEYWORDS, format__doc__},
14554	{"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14555	UNICODE___FORMAT___METHODDEF
14556	UNICODE_MAKETRANS_METHODDEF
14557	UNICODE_SIZEOF_METHODDEF
14558	#if 0
14559	/ These methods are just used for debugging the implementation. /
14560	{"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
14561	#endif
14562
14563	{"__getnewargs__", unicode_getnewargs, METH_NOARGS},
14564	{NULL, NULL}
14565	};
14566
14567	static PyObject *
14568	unicode_mod(PyObject v, PyObject w)
14569	{
14570	if (!PyUnicode_Check(v))
14571	Py_RETURN_NOTIMPLEMENTED;
14572	return PyUnicode_Format(v, w);
14573	}
14574
14575	static PyNumberMethods unicode_as_number = {
14576	`0`, /nb_add/
14577	`0`, /nb_subtract/
14578	`0`, /nb_multiply/
14579	unicode_mod, /nb_remainder/
14580	};
14581
14582	static PySequenceMethods unicode_as_sequence = {
14583	(lenfunc) unicode_length, / sq_length /
14584	PyUnicode_Concat, / sq_concat /
14585	(ssizeargfunc) unicode_repeat, / sq_repeat /
14586	(ssizeargfunc) unicode_getitem, / sq_item /
14587	`0`, / sq_slice /
14588	`0`, / sq_ass_item /
14589	`0`, / sq_ass_slice /
14590	PyUnicode_Contains, / sq_contains /
14591	};
14592
14593	static PyObject*
14594	unicode_subscript(PyObject* self, PyObject* item)
14595	{
14596	if (PyUnicode_READY(self) == -`1`)
14597	return NULL;
14598
14599	if (_PyIndex_Check(item)) {
14600	Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14601	if (i == -`1` && PyErr_Occurred())
14602	return NULL;
14603	if (i < `0`)
14604	i += PyUnicode_GET_LENGTH(self);
14605	return unicode_getitem(self, i);
14606	} else if (PySlice_Check(item)) {
14607	Py_ssize_t start, stop, step, slicelength, i;
14608	size_t cur;
14609	PyObject *result;
14610	const void *src_data;
14611	void *dest_data;
14612	int src_kind, dest_kind;
14613	Py_UCS4 ch, max_char, kind_limit;
14614
14615	if (PySlice_Unpack(item, &start, &stop, &step) < `0`) {
14616	return NULL;
14617	}
14618	slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14619	&start, &stop, step);
14620
14621	if (slicelength <= `0`) {
14622	_Py_RETURN_UNICODE_EMPTY();
14623	} else if (start == `0` && step == `1` &&
14624	slicelength == PyUnicode_GET_LENGTH(self)) {
14625	return unicode_result_unchanged(self);
14626	} else if (step == `1`) {
14627	return PyUnicode_Substring(self,
14628	start, start + slicelength);
14629	}
14630	/ General case /
14631	src_kind = PyUnicode_KIND(self);
14632	src_data = PyUnicode_DATA(self);
14633	if (!PyUnicode_IS_ASCII(self)) {
14634	kind_limit = kind_maxchar_limit(src_kind);
14635	max_char = `0`;
14636	for (cur = start, i = `0`; i < slicelength; cur += step, i++) {
14637	ch = PyUnicode_READ(src_kind, src_data, cur);
14638	if (ch > max_char) {
14639	max_char = ch;
14640	if (max_char >= kind_limit)
14641	break;
14642	}
14643	}
14644	}
14645	else
14646	max_char = `127`;
14647	result = PyUnicode_New(slicelength, max_char);
14648	if (result == NULL)
14649	return NULL;
14650	dest_kind = PyUnicode_KIND(result);
14651	dest_data = PyUnicode_DATA(result);
14652
14653	for (cur = start, i = `0`; i < slicelength; cur += step, i++) {
14654	Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14655	PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14656	}
14657	assert(_PyUnicode_CheckConsistency(result, `1`));
14658	return result;
14659	} else {
14660	PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14661	return NULL;
14662	}
14663	}
14664
14665	static PyMappingMethods unicode_as_mapping = {
14666	(lenfunc)unicode_length, / mp_length /
14667	(binaryfunc)unicode_subscript, / mp_subscript /
14668	(objobjargproc)`0`, / mp_ass_subscript /
14669	};
14670
14671
/* Helpers for PyUnicode_Format() */
14673
14674	struct unicode_formatter_t {
14675	PyObject *args;
14676	int args_owned;
14677	Py_ssize_t arglen, argidx;
14678	PyObject *dict;
14679
14680	enum PyUnicode_Kind fmtkind;
14681	Py_ssize_t fmtcnt, fmtpos;
14682	const void *fmtdata;
14683	PyObject *fmtstr;
14684
14685	_PyUnicodeWriter writer;
14686	};
14687
14688	struct unicode_format_arg_t {
14689	Py_UCS4 ch;
14690	int flags;
14691	Py_ssize_t width;
14692	int prec;
14693	int sign;
14694	};
14695
14696	static PyObject *
14697	unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14698	{
14699	Py_ssize_t argidx = ctx->argidx;
14700
14701	if (argidx < ctx->arglen) {
14702	ctx->argidx++;
14703	if (ctx->arglen < `0`)
14704	return ctx->args;
14705	else
14706	return PyTuple_GetItem(ctx->args, argidx);
14707	}
14708	PyErr_SetString(PyExc_TypeError,
14709	"not enough arguments for format string");
14710	return NULL;
14711	}
14712
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14714
14715	/ Format a float into the writer if the writer is not NULL, or into p_output
14716	otherwise.
14717
14718	Return 0 on success, raise an exception and return -1 on error. /*
14719	static int
14720	formatfloat(PyObject v, struct* unicode_format_arg_t *arg,
14721	PyObject **p_output,
14722	_PyUnicodeWriter *writer)
14723	{
14724	char *p;
14725	double x;
14726	Py_ssize_t len;
14727	int prec;
14728	int dtoa_flags;
14729
14730	x = PyFloat_AsDouble(v);
14731	if (x == -`1.0` && PyErr_Occurred())
14732	return -`1`;
14733
14734	prec = arg->prec;
14735	if (prec < `0`)
14736	prec = `6`;
14737
14738	if (arg->flags & F_ALT)
14739	dtoa_flags = Py_DTSF_ALT;
14740	else
14741	dtoa_flags = `0`;
14742	p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14743	if (p == NULL)
14744	return -`1`;
14745	len = strlen(p);
14746	if (writer) {
14747	if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < `0`) {
14748	PyMem_Free(p);
14749	return -`1`;
14750	}
14751	}
14752	else
14753	*p_output = _PyUnicode_FromASCII(p, len);
14754	PyMem_Free(p);
14755	return `0`;
14756	}
14757
14758	/ formatlong() emulates the format codes d, u, o, x and X, and*
14759	* the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14760	* Python's regular ints.
14761	* Return value: a new PyUnicodeObject*, or NULL if error.
14762	* The output string is of the form
14763	* "-"? ("0x" \| "0X")? digit+
14764	* "0x"/"0X" are present only for x and X conversions, with F_ALT
14765	* set in flags. The case of hex digits will be correct,
14766	* There will be at least prec digits, zero-filled on the left if
14767	* necessary to get that many.
14768	* val object to be converted
14769	* flags bitmask of format flags; only F_ALT is looked at
14770	* prec minimum number of digits; 0-fill on left if needed
14771	* type a character in [duoxX]; u acts the same as d
14772	*
14773	* CAUTION: o, x and X conversions on regular ints can never
14774	* produce a '-' sign, but can for Python's unbounded ints.
14775	*/
14776	PyObject *
14777	_PyUnicode_FormatLong(PyObject val, int* alt, int prec, int type)
14778	{
14779	PyObject *result = NULL;
14780	char *buf;
14781	Py_ssize_t i;
14782	int sign; / 1 if '-', else 0 /
14783	int len; / number of characters /
14784	Py_ssize_t llen;
14785	int numdigits; / len == numnondigits + numdigits /
14786	int numnondigits = `0`;
14787
14788	/ Avoid exceeding SSIZE_T_MAX /
14789	if (prec > INT_MAX-`3`) {
14790	PyErr_SetString(PyExc_OverflowError,
14791	"precision too large");
14792	return NULL;
14793	}
14794
14795	assert(PyLong_Check(val));
14796
14797	switch (type) {
14798	default:
14799	Py_UNREACHABLE();
14800	case `'d'`:
14801	case `'i'`:
14802	case `'u'`:
14803	/ int and int subclasses should print numerically when a numeric /
14804	/ format code is used (see issue18780) /
14805	result = PyNumber_ToBase(val, `10`);
14806	break;
14807	case `'o'`:
14808	numnondigits = `2`;
14809	result = PyNumber_ToBase(val, `8`);
14810	break;
14811	case `'x'`:
14812	case `'X'`:
14813	numnondigits = `2`;
14814	result = PyNumber_ToBase(val, `16`);
14815	break;
14816	}
14817	if (!result)
14818	return NULL;
14819
14820	assert(unicode_modifiable(result));
14821	assert(PyUnicode_IS_READY(result));
14822	assert(PyUnicode_IS_ASCII(result));
14823
14824	/ To modify the string in-place, there can only be one reference. /
14825	if (Py_REFCNT(result) != `1`) {
14826	Py_DECREF(result);
14827	PyErr_BadInternalCall();
14828	return NULL;
14829	}
14830	buf = PyUnicode_DATA(result);
14831	llen = PyUnicode_GET_LENGTH(result);
14832	if (llen > INT_MAX) {
14833	Py_DECREF(result);
14834	PyErr_SetString(PyExc_ValueError,
14835	"string too large in _PyUnicode_FormatLong");
14836	return NULL;
14837	}
14838	len = (int)llen;
14839	sign = buf[`0`] == `'-'`;
14840	numnondigits += sign;
14841	numdigits = len - numnondigits;
14842	assert(numdigits > `0`);
14843
14844	/ Get rid of base marker unless F_ALT /
14845	if (((alt) == `0` &&
14846	(type == `'o'` \|\| type == `'x'` \|\| type == `'X'`))) {
14847	assert(buf[sign] == `'0'`);
14848	assert(buf[sign+`1`] == `'x'` \|\| buf[sign+`1`] == `'X'` \|\|
14849	buf[sign+`1`] == `'o'`);
14850	numnondigits -= `2`;
14851	buf += `2`;
14852	len -= `2`;
14853	if (sign)
14854	buf[`0`] = `'-'`;
14855	assert(len == numnondigits + numdigits);
14856	assert(numdigits > `0`);
14857	}
14858
14859	/ Fill with leading zeroes to meet minimum width. /
14860	if (prec > numdigits) {
14861	PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14862	numnondigits + prec);
14863	char *b1;
14864	if (!r1) {
14865	Py_DECREF(result);
14866	return NULL;
14867	}
14868	b1 = PyBytes_AS_STRING(r1);
14869	for (i = `0`; i < numnondigits; ++i)
14870	b1++ = buf++;
14871	for (i = `0`; i < prec - numdigits; i++)
14872	*b1++ = `'0'`;
14873	for (i = `0`; i < numdigits; i++)
14874	b1++ = buf++;
14875	*b1 = `'\0'`;
14876	Py_DECREF(result);
14877	result = r1;
14878	buf = PyBytes_AS_STRING(result);
14879	len = numnondigits + prec;
14880	}
14881
14882	/ Fix up case for hex conversions. /
14883	if (type == `'X'`) {
14884	/ Need to convert all lower case letters to upper case.*
14885	and need to convert 0x to 0X (and -0x to -0X). /*
14886	for (i = `0`; i < len; i++)
14887	if (buf[i] >= `'a'` && buf[i] <= `'x'`)
14888	buf[i] -= `'a'`-`'A'`;
14889	}
14890	if (!PyUnicode_Check(result)
14891	\|\| buf != PyUnicode_DATA(result)) {
14892	PyObject *unicode;
14893	unicode = _PyUnicode_FromASCII(buf, len);
14894	Py_DECREF(result);
14895	result = unicode;
14896	}
14897	else if (len != PyUnicode_GET_LENGTH(result)) {
14898	if (PyUnicode_Resize(&result, len) < `0`)
14899	Py_CLEAR(result);
14900	}
14901	return result;
14902	}
14903
14904	/ Format an integer or a float as an integer.*
14905	* Return 1 if the number has been formatted into the writer,
14906	* 0 if the number has been formatted into *p_output
14907	* -1 and raise an exception on error */
14908	static int
14909	mainformatlong(PyObject *v,
14910	struct unicode_format_arg_t *arg,
14911	PyObject **p_output,
14912	_PyUnicodeWriter *writer)
14913	{
14914	PyObject iobj, res;
14915	char type = (char)arg->ch;
14916
14917	if (!PyNumber_Check(v))
14918	goto wrongtype;
14919
14920	/ make sure number is a type of integer for o, x, and X /
14921	if (!PyLong_Check(v)) {
14922	if (type == `'o'` \|\| type == `'x'` \|\| type == `'X'`) {
14923	iobj = _PyNumber_Index(v);
14924	}
14925	else {
14926	iobj = PyNumber_Long(v);
14927	}
14928	if (iobj == NULL ) {
14929	if (PyErr_ExceptionMatches(PyExc_TypeError))
14930	goto wrongtype;
14931	return -`1`;
14932	}
14933	assert(PyLong_Check(iobj));
14934	}
14935	else {
14936	iobj = v;
14937	Py_INCREF(iobj);
14938	}
14939
14940	if (PyLong_CheckExact(v)
14941	&& arg->width == -`1` && arg->prec == -`1`
14942	&& !(arg->flags & (F_SIGN \| F_BLANK))
14943	&& type != `'X'`)
14944	{
14945	/ Fast path /
14946	int alternate = arg->flags & F_ALT;
14947	int base;
14948
14949	switch(type)
14950	{
14951	default:
14952	Py_UNREACHABLE();
14953	case `'d'`:
14954	case `'i'`:
14955	case `'u'`:
14956	base = `10`;
14957	break;
14958	case `'o'`:
14959	base = `8`;
14960	break;
14961	case `'x'`:
14962	case `'X'`:
14963	base = `16`;
14964	break;
14965	}
14966
14967	if (_PyLong_FormatWriter(writer, v, base, alternate) == -`1`) {
14968	Py_DECREF(iobj);
14969	return -`1`;
14970	}
14971	Py_DECREF(iobj);
14972	return `1`;
14973	}
14974
14975	res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14976	Py_DECREF(iobj);
14977	if (res == NULL)
14978	return -`1`;
14979	*p_output = res;
14980	return `0`;
14981
14982	wrongtype:
14983	switch(type)
14984	{
14985	case `'o'`:
14986	case `'x'`:
14987	case `'X'`:
14988	PyErr_Format(PyExc_TypeError,
14989	"%%%c format: an integer is required, "
14990	"not %.200s",
14991	type, Py_TYPE(v)->tp_name);
14992	break;
14993	default:
14994	PyErr_Format(PyExc_TypeError,
14995	"%%%c format: a real number is required, "
14996	"not %.200s",
14997	type, Py_TYPE(v)->tp_name);
14998	break;
14999	}
15000	return -`1`;
15001	}
15002
15003	static Py_UCS4
15004	formatchar(PyObject *v)
15005	{
15006	/ presume that the buffer is at least 3 characters long /
15007	if (PyUnicode_Check(v)) {
15008	if (PyUnicode_GET_LENGTH(v) == `1`) {
15009	return PyUnicode_READ_CHAR(v, `0`);
15010	}
15011	goto onError;
15012	}
15013	else {
15014	int overflow;
15015	long x = PyLong_AsLongAndOverflow(v, &overflow);
15016	if (x == -`1` && PyErr_Occurred()) {
15017	if (PyErr_ExceptionMatches(PyExc_TypeError)) {
15018	goto onError;
15019	}
15020	return (Py_UCS4) -`1`;
15021	}
15022
15023	if (x < `0` \|\| x > MAX_UNICODE) {
15024	/ this includes an overflow in converting to C long /
15025	PyErr_SetString(PyExc_OverflowError,
15026	"%c arg not in range(0x110000)");
15027	return (Py_UCS4) -`1`;
15028	}
15029
15030	return (Py_UCS4) x;
15031	}
15032
15033	onError:
15034	PyErr_SetString(PyExc_TypeError,
15035	"%c requires int or char");
15036	return (Py_UCS4) -`1`;
15037	}
15038
15039	/ Parse options of an argument: flags, width, precision.*
15040	Handle also "%(name)" syntax.
15041
15042	Return 0 if the argument has been formatted into arg->str.
15043	Return 1 if the argument has been written into ctx->writer,
15044	Raise an exception and return -1 on error. /*
15045	static int
15046	unicode_format_arg_parse(struct unicode_formatter_t *ctx,
15047	struct unicode_format_arg_t *arg)
15048	{
15049	#define FORMAT_READ(ctx) \
15050	PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
15051
15052	PyObject *v;
15053
15054	if (arg->ch == `'('`) {
15055	/ Get argument value from a dictionary. Example: "%(name)s". /
15056	Py_ssize_t keystart;
15057	Py_ssize_t keylen;
15058	PyObject *key;
15059	int pcount = `1`;
15060
15061	if (ctx->dict == NULL) {
15062	PyErr_SetString(PyExc_TypeError,
15063	"format requires a mapping");
15064	return -`1`;
15065	}
15066	++ctx->fmtpos;
15067	--ctx->fmtcnt;
15068	keystart = ctx->fmtpos;
15069	/ Skip over balanced parentheses /
15070	while (pcount > `0` && --ctx->fmtcnt >= `0`) {
15071	arg->ch = FORMAT_READ(ctx);
15072	if (arg->ch == `')'`)
15073	--pcount;
15074	else if (arg->ch == `'('`)
15075	++pcount;
15076	ctx->fmtpos++;
15077	}
15078	keylen = ctx->fmtpos - keystart - `1`;
15079	if (ctx->fmtcnt < `0` \|\| pcount > `0`) {
15080	PyErr_SetString(PyExc_ValueError,
15081	"incomplete format key");
15082	return -`1`;
15083	}
15084	key = PyUnicode_Substring(ctx->fmtstr,
15085	keystart, keystart + keylen);
15086	if (key == NULL)
15087	return -`1`;
15088	if (ctx->args_owned) {
15089	ctx->args_owned = `0`;
15090	Py_DECREF(ctx->args);
15091	}
15092	ctx->args = PyObject_GetItem(ctx->dict, key);
15093	Py_DECREF(key);
15094	if (ctx->args == NULL)
15095	return -`1`;
15096	ctx->args_owned = `1`;
15097	ctx->arglen = -`1`;
15098	ctx->argidx = -`2`;
15099	}
15100
15101	/ Parse flags. Example: "%+i" => flags=F_SIGN. /
15102	while (--ctx->fmtcnt >= `0`) {
15103	arg->ch = FORMAT_READ(ctx);
15104	ctx->fmtpos++;
15105	switch (arg->ch) {
15106	case `'-'`: arg->flags \|= F_LJUST; continue;
15107	case `'+'`: arg->flags \|= F_SIGN; continue;
15108	case `' '`: arg->flags \|= F_BLANK; continue;
15109	case `'#'`: arg->flags \|= F_ALT; continue;
15110	case `'0'`: arg->flags \|= F_ZERO; continue;
15111	}
15112	break;
15113	}
15114
15115	/ Parse width. Example: "%10s" => width=10 /
15116	if (arg->ch == `'*'`) {
15117	v = unicode_format_getnextarg(ctx);
15118	if (v == NULL)
15119	return -`1`;
15120	if (!PyLong_Check(v)) {
15121	PyErr_SetString(PyExc_TypeError,
15122	"* wants int");
15123	return -`1`;
15124	}
15125	arg->width = PyLong_AsSsize_t(v);
15126	if (arg->width == -`1` && PyErr_Occurred())
15127	return -`1`;
15128	if (arg->width < `0`) {
15129	arg->flags \|= F_LJUST;
15130	arg->width = -arg->width;
15131	}
15132	if (--ctx->fmtcnt >= `0`) {
15133	arg->ch = FORMAT_READ(ctx);
15134	ctx->fmtpos++;
15135	}
15136	}
15137	else if (arg->ch >= `'0'` && arg->ch <= `'9'`) {
15138	arg->width = arg->ch - `'0'`;
15139	while (--ctx->fmtcnt >= `0`) {
15140	arg->ch = FORMAT_READ(ctx);
15141	ctx->fmtpos++;
15142	if (arg->ch < `'0'` \|\| arg->ch > `'9'`)
15143	break;
15144	/ Since arg->ch is unsigned, the RHS would end up as unsigned,*
15145	mixing signed and unsigned comparison. Since arg->ch is between
15146	'0' and '9', casting to int is safe. /*
15147	if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - `'0'`)) / `10`) {
15148	PyErr_SetString(PyExc_ValueError,
15149	"width too big");
15150	return -`1`;
15151	}
15152	arg->width = arg->width*`10` + (arg->ch - `'0'`);
15153	}
15154	}
15155
15156	/ Parse precision. Example: "%.3f" => prec=3 /
15157	if (arg->ch == `'.'`) {
15158	arg->prec = `0`;
15159	if (--ctx->fmtcnt >= `0`) {
15160	arg->ch = FORMAT_READ(ctx);
15161	ctx->fmtpos++;
15162	}
15163	if (arg->ch == `'*'`) {
15164	v = unicode_format_getnextarg(ctx);
15165	if (v == NULL)
15166	return -`1`;
15167	if (!PyLong_Check(v)) {
15168	PyErr_SetString(PyExc_TypeError,
15169	"* wants int");
15170	return -`1`;
15171	}
15172	arg->prec = _PyLong_AsInt(v);
15173	if (arg->prec == -`1` && PyErr_Occurred())
15174	return -`1`;
15175	if (arg->prec < `0`)
15176	arg->prec = `0`;
15177	if (--ctx->fmtcnt >= `0`) {
15178	arg->ch = FORMAT_READ(ctx);
15179	ctx->fmtpos++;
15180	}
15181	}
15182	else if (arg->ch >= `'0'` && arg->ch <= `'9'`) {
15183	arg->prec = arg->ch - `'0'`;
15184	while (--ctx->fmtcnt >= `0`) {
15185	arg->ch = FORMAT_READ(ctx);
15186	ctx->fmtpos++;
15187	if (arg->ch < `'0'` \|\| arg->ch > `'9'`)
15188	break;
15189	if (arg->prec > (INT_MAX - ((int)arg->ch - `'0'`)) / `10`) {
15190	PyErr_SetString(PyExc_ValueError,
15191	"precision too big");
15192	return -`1`;
15193	}
15194	arg->prec = arg->prec*`10` + (arg->ch - `'0'`);
15195	}
15196	}
15197	}
15198
15199	/ Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") /
15200	if (ctx->fmtcnt >= `0`) {
15201	if (arg->ch == `'h'` \|\| arg->ch == `'l'` \|\| arg->ch == `'L'`) {
15202	if (--ctx->fmtcnt >= `0`) {
15203	arg->ch = FORMAT_READ(ctx);
15204	ctx->fmtpos++;
15205	}
15206	}
15207	}
15208	if (ctx->fmtcnt < `0`) {
15209	PyErr_SetString(PyExc_ValueError,
15210	"incomplete format");
15211	return -`1`;
15212	}
15213	return `0`;
15214
15215	#undef FORMAT_READ
15216	}
15217
15218	/ Format one argument. Supported conversion specifiers:*
15219
15220	- "s", "r", "a": any type
15221	- "i", "d", "u": int or float
15222	- "o", "x", "X": int
15223	- "e", "E", "f", "F", "g", "G": float
15224	- "c": int or str (1 character)
15225
15226	When possible, the output is written directly into the Unicode writer
15227	(ctx->writer). A string is created when padding is required.
15228
15229	Return 0 if the argument has been formatted into p_str,*
15230	1 if the argument has been written into ctx->writer,
15231	-1 on error. /*
15232	static int
15233	unicode_format_arg_format(struct unicode_formatter_t *ctx,
15234	struct unicode_format_arg_t *arg,
15235	PyObject **p_str)
15236	{
15237	PyObject *v;
15238	_PyUnicodeWriter *writer = &ctx->writer;
15239
15240	if (ctx->fmtcnt == `0`)
15241	ctx->writer.overallocate = `0`;
15242
15243	v = unicode_format_getnextarg(ctx);
15244	if (v == NULL)
15245	return -`1`;
15246
15247
15248	switch (arg->ch) {
15249	case `'s'`:
15250	case `'r'`:
15251	case `'a'`:
15252	if (PyLong_CheckExact(v) && arg->width == -`1` && arg->prec == -`1`) {
15253	/ Fast path /
15254	if (_PyLong_FormatWriter(writer, v, `10`, arg->flags & F_ALT) == -`1`)
15255	return -`1`;
15256	return `1`;
15257	}
15258
15259	if (PyUnicode_CheckExact(v) && arg->ch == `'s'`) {
15260	*p_str = v;
15261	Py_INCREF(*p_str);
15262	}
15263	else {
15264	if (arg->ch == `'s'`)
15265	*p_str = PyObject_Str(v);
15266	else if (arg->ch == `'r'`)
15267	*p_str = PyObject_Repr(v);
15268	else
15269	*p_str = PyObject_ASCII(v);
15270	}
15271	break;
15272
15273	case `'i'`:
15274	case `'d'`:
15275	case `'u'`:
15276	case `'o'`:
15277	case `'x'`:
15278	case `'X'`:
15279	{
15280	int ret = mainformatlong(v, arg, p_str, writer);
15281	if (ret != `0`)
15282	return ret;
15283	arg->sign = `1`;
15284	break;
15285	}
15286
15287	case `'e'`:
15288	case `'E'`:
15289	case `'f'`:
15290	case `'F'`:
15291	case `'g'`:
15292	case `'G'`:
15293	if (arg->width == -`1` && arg->prec == -`1`
15294	&& !(arg->flags & (F_SIGN \| F_BLANK)))
15295	{
15296	/ Fast path /
15297	if (formatfloat(v, arg, NULL, writer) == -`1`)
15298	return -`1`;
15299	return `1`;
15300	}
15301
15302	arg->sign = `1`;
15303	if (formatfloat(v, arg, p_str, NULL) == -`1`)
15304	return -`1`;
15305	break;
15306
15307	case `'c'`:
15308	{
15309	Py_UCS4 ch = formatchar(v);
15310	if (ch == (Py_UCS4) -`1`)
15311	return -`1`;
15312	if (arg->width == -`1` && arg->prec == -`1`) {
15313	/ Fast path /
15314	if (_PyUnicodeWriter_WriteCharInline(writer, ch) < `0`)
15315	return -`1`;
15316	return `1`;
15317	}
15318	*p_str = PyUnicode_FromOrdinal(ch);
15319	break;
15320	}
15321
15322	default:
15323	PyErr_Format(PyExc_ValueError,
15324	"unsupported format character '%c' (0x%x) "
15325	"at index %zd",
15326	(`31`<=arg->ch && arg->ch<=`126`) ? (char)arg->ch : `'?'`,
15327	(int)arg->ch,
15328	ctx->fmtpos - `1`);
15329	return -`1`;
15330	}
15331	if (*p_str == NULL)
15332	return -`1`;
15333	assert (PyUnicode_Check(*p_str));
15334	return `0`;
15335	}
15336
15337	static int
15338	unicode_format_arg_output(struct unicode_formatter_t *ctx,
15339	struct unicode_format_arg_t *arg,
15340	PyObject *str)
15341	{
15342	Py_ssize_t len;
15343	enum PyUnicode_Kind kind;
15344	const void *pbuf;
15345	Py_ssize_t pindex;
15346	Py_UCS4 signchar;
15347	Py_ssize_t buflen;
15348	Py_UCS4 maxchar;
15349	Py_ssize_t sublen;
15350	_PyUnicodeWriter *writer = &ctx->writer;
15351	Py_UCS4 fill;
15352
15353	fill = `' '`;
15354	if (arg->sign && arg->flags & F_ZERO)
15355	fill = `'0'`;
15356
15357	if (PyUnicode_READY(str) == -`1`)
15358	return -`1`;
15359
15360	len = PyUnicode_GET_LENGTH(str);
15361	if ((arg->width == -`1` \|\| arg->width <= len)
15362	&& (arg->prec == -`1` \|\| arg->prec >= len)
15363	&& !(arg->flags & (F_SIGN \| F_BLANK)))
15364	{
15365	/ Fast path /
15366	if (_PyUnicodeWriter_WriteStr(writer, str) == -`1`)
15367	return -`1`;
15368	return `0`;
15369	}
15370
15371	/ Truncate the string for "s", "r" and "a" formats*
15372	if the precision is set /*
15373	if (arg->ch == `'s'` \|\| arg->ch == `'r'` \|\| arg->ch == `'a'`) {
15374	if (arg->prec >= `0` && len > arg->prec)
15375	len = arg->prec;
15376	}
15377
15378	/ Adjust sign and width /
15379	kind = PyUnicode_KIND(str);
15380	pbuf = PyUnicode_DATA(str);
15381	pindex = `0`;
15382	signchar = `'\0'`;
15383	if (arg->sign) {
15384	Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15385	if (ch == `'-'` \|\| ch == `'+'`) {
15386	signchar = ch;
15387	len--;
15388	pindex++;
15389	}
15390	else if (arg->flags & F_SIGN)
15391	signchar = `'+'`;
15392	else if (arg->flags & F_BLANK)
15393	signchar = `' '`;
15394	else
15395	arg->sign = `0`;
15396	}
15397	if (arg->width < len)
15398	arg->width = len;
15399
15400	/ Prepare the writer /
15401	maxchar = writer->maxchar;
15402	if (!(arg->flags & F_LJUST)) {
15403	if (arg->sign) {
15404	if ((arg->width-`1`) > len)
15405	maxchar = Py_MAX(maxchar, fill);
15406	}
15407	else {
15408	if (arg->width > len)
15409	maxchar = Py_MAX(maxchar, fill);
15410	}
15411	}
15412	if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15413	Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, `0`, pindex+len);
15414	maxchar = Py_MAX(maxchar, strmaxchar);
15415	}
15416
15417	buflen = arg->width;
15418	if (arg->sign && len == arg->width)
15419	buflen++;
15420	if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -`1`)
15421	return -`1`;
15422
15423	/ Write the sign if needed /
15424	if (arg->sign) {
15425	if (fill != `' '`) {
15426	PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15427	writer->pos += `1`;
15428	}
15429	if (arg->width > len)
15430	arg->width--;
15431	}
15432
15433	/ Write the numeric prefix for "x", "X" and "o" formats*
15434	if the alternate form is used.
15435	For example, write "0x" for the "%#x" format. /*
15436	if ((arg->flags & F_ALT) && (arg->ch == `'x'` \|\| arg->ch == `'X'` \|\| arg->ch == `'o'`)) {
15437	assert(PyUnicode_READ(kind, pbuf, pindex) == `'0'`);
15438	assert(PyUnicode_READ(kind, pbuf, pindex + `1`) == arg->ch);
15439	if (fill != `' '`) {
15440	PyUnicode_WRITE(writer->kind, writer->data, writer->pos, `'0'`);
15441	PyUnicode_WRITE(writer->kind, writer->data, writer->pos+`1`, arg->ch);
15442	writer->pos += `2`;
15443	pindex += `2`;
15444	}
15445	arg->width -= `2`;
15446	if (arg->width < `0`)
15447	arg->width = `0`;
15448	len -= `2`;
15449	}
15450
15451	/ Pad left with the fill character if needed /
15452	if (arg->width > len && !(arg->flags & F_LJUST)) {
15453	sublen = arg->width - len;
15454	unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15455	writer->pos += sublen;
15456	arg->width = len;
15457	}
15458
15459	/ If padding with spaces: write sign if needed and/or numeric prefix if*
15460	the alternate form is used /*
15461	if (fill == `' '`) {
15462	if (arg->sign) {
15463	PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15464	writer->pos += `1`;
15465	}
15466	if ((arg->flags & F_ALT) && (arg->ch == `'x'` \|\| arg->ch == `'X'` \|\| arg->ch == `'o'`)) {
15467	assert(PyUnicode_READ(kind, pbuf, pindex) == `'0'`);
15468	assert(PyUnicode_READ(kind, pbuf, pindex+`1`) == arg->ch);
15469	PyUnicode_WRITE(writer->kind, writer->data, writer->pos, `'0'`);
15470	PyUnicode_WRITE(writer->kind, writer->data, writer->pos+`1`, arg->ch);
15471	writer->pos += `2`;
15472	pindex += `2`;
15473	}
15474	}
15475
15476	/ Write characters /
15477	if (len) {
15478	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15479	str, pindex, len);
15480	writer->pos += len;
15481	}
15482
15483	/ Pad right with the fill character if needed /
15484	if (arg->width > len) {
15485	sublen = arg->width - len;
15486	unicode_fill(writer->kind, writer->data, `' '`, writer->pos, sublen);
15487	writer->pos += sublen;
15488	}
15489	return `0`;
15490	}
15491
15492	/ Helper of PyUnicode_Format(): format one arg.*
15493	Return 0 on success, raise an exception and return -1 on error. /*
15494	static int
15495	unicode_format_arg(struct unicode_formatter_t *ctx)
15496	{
15497	struct unicode_format_arg_t arg;
15498	PyObject *str;
15499	int ret;
15500
15501	arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15502	if (arg.ch == `'%'`) {
15503	ctx->fmtpos++;
15504	ctx->fmtcnt--;
15505	if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, `'%'`) < `0`)
15506	return -`1`;
15507	return `0`;
15508	}
15509	arg.flags = `0`;
15510	arg.width = -`1`;
15511	arg.prec = -`1`;
15512	arg.sign = `0`;
15513	str = NULL;
15514
15515	ret = unicode_format_arg_parse(ctx, &arg);
15516	if (ret == -`1`)
15517	return -`1`;
15518
15519	ret = unicode_format_arg_format(ctx, &arg, &str);
15520	if (ret == -`1`)
15521	return -`1`;
15522
15523	if (ret != `1`) {
15524	ret = unicode_format_arg_output(ctx, &arg, str);
15525	Py_DECREF(str);
15526	if (ret == -`1`)
15527	return -`1`;
15528	}
15529
15530	if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15531	PyErr_SetString(PyExc_TypeError,
15532	"not all arguments converted during string formatting");
15533	return -`1`;
15534	}
15535	return `0`;
15536	}
15537
15538	PyObject *
15539	PyUnicode_Format(PyObject format, PyObject args)
15540	{
15541	struct unicode_formatter_t ctx;
15542
15543	if (format == NULL \|\| args == NULL) {
15544	PyErr_BadInternalCall();
15545	return NULL;
15546	}
15547
15548	if (ensure_unicode(format) < `0`)
15549	return NULL;
15550
15551	ctx.fmtstr = format;
15552	ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15553	ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15554	ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15555	ctx.fmtpos = `0`;
15556
15557	_PyUnicodeWriter_Init(&ctx.writer);
15558	ctx.writer.min_length = ctx.fmtcnt + `100`;
15559	ctx.writer.overallocate = `1`;
15560
15561	if (PyTuple_Check(args)) {
15562	ctx.arglen = PyTuple_Size(args);
15563	ctx.argidx = `0`;
15564	}
15565	else {
15566	ctx.arglen = -`1`;
15567	ctx.argidx = -`2`;
15568	}
15569	ctx.args_owned = `0`;
15570	if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15571	ctx.dict = args;
15572	else
15573	ctx.dict = NULL;
15574	ctx.args = args;
15575
15576	while (--ctx.fmtcnt >= `0`) {
15577	if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != `'%'`) {
15578	Py_ssize_t nonfmtpos;
15579
15580	nonfmtpos = ctx.fmtpos++;
15581	while (ctx.fmtcnt >= `0` &&
15582	PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != `'%'`) {
15583	ctx.fmtpos++;
15584	ctx.fmtcnt--;
15585	}
15586	if (ctx.fmtcnt < `0`) {
15587	ctx.fmtpos--;
15588	ctx.writer.overallocate = `0`;
15589	}
15590
15591	if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15592	nonfmtpos, ctx.fmtpos) < `0`)
15593	goto onError;
15594	}
15595	else {
15596	ctx.fmtpos++;
15597	if (unicode_format_arg(&ctx) == -`1`)
15598	goto onError;
15599	}
15600	}
15601
15602	if (ctx.argidx < ctx.arglen && !ctx.dict) {
15603	PyErr_SetString(PyExc_TypeError,
15604	"not all arguments converted during string formatting");
15605	goto onError;
15606	}
15607
15608	if (ctx.args_owned) {
15609	Py_DECREF(ctx.args);
15610	}
15611	return _PyUnicodeWriter_Finish(&ctx.writer);
15612
15613	onError:
15614	_PyUnicodeWriter_Dealloc(&ctx.writer);
15615	if (ctx.args_owned) {
15616	Py_DECREF(ctx.args);
15617	}
15618	return NULL;
15619	}
15620
15621	static PyObject *
15622	unicode_subtype_new(PyTypeObject type, PyObject unicode);
15623
15624	/[clinic input]*
15625	@classmethod
15626	str.__new__ as unicode_new
15627
15628	object as x: object = NULL
15629	encoding: str = NULL
15630	errors: str = NULL
15631
15632	[clinic start generated code]/*
15633
15634	static PyObject *
15635	unicode_new_impl(PyTypeObject type, PyObject x, const char *encoding,
15636	const char *errors)
15637	/[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]/
15638	{
15639	PyObject *unicode;
15640	if (x == NULL) {
15641	unicode = unicode_new_empty();
15642	}
15643	else if (encoding == NULL && errors == NULL) {
15644	unicode = PyObject_Str(x);
15645	}
15646	else {
15647	unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15648	}
15649
15650	if (unicode != NULL && type != &PyUnicode_Type) {
15651	Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15652	}
15653	return unicode;
15654	}
15655
15656	static PyObject *
15657	unicode_subtype_new(PyTypeObject type, PyObject unicode)
15658	{
15659	PyObject *self;
15660	Py_ssize_t length, char_size;
15661	int share_wstr, share_utf8;
15662	unsigned int kind;
15663	void *data;
15664
15665	assert(PyType_IsSubtype(type, &PyUnicode_Type));
15666	assert(_PyUnicode_CHECK(unicode));
15667	if (PyUnicode_READY(unicode) == -`1`) {
15668	return NULL;
15669	}
15670
15671	self = type->tp_alloc(type, `0`);
15672	if (self == NULL) {
15673	return NULL;
15674	}
15675	kind = PyUnicode_KIND(unicode);
15676	length = PyUnicode_GET_LENGTH(unicode);
15677
15678	_PyUnicode_LENGTH(self) = length;
15679	#ifdef Py_DEBUG
15680	_PyUnicode_HASH(self) = -`1`;
15681	#else
15682	_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15683	#endif
15684	_PyUnicode_STATE(self).interned = `0`;
15685	_PyUnicode_STATE(self).kind = kind;
15686	_PyUnicode_STATE(self).compact = `0`;
15687	_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15688	_PyUnicode_STATE(self).ready = `1`;
15689	_PyUnicode_WSTR(self) = NULL;
15690	_PyUnicode_UTF8_LENGTH(self) = `0`;
15691	_PyUnicode_UTF8(self) = NULL;
15692	_PyUnicode_WSTR_LENGTH(self) = `0`;
15693	_PyUnicode_DATA_ANY(self) = NULL;
15694
15695	share_utf8 = `0`;
15696	share_wstr = `0`;
15697	if (kind == PyUnicode_1BYTE_KIND) {
15698	char_size = `1`;
15699	if (PyUnicode_MAX_CHAR_VALUE(unicode) < `128`)
15700	share_utf8 = `1`;
15701	}
15702	else if (kind == PyUnicode_2BYTE_KIND) {
15703	char_size = `2`;
15704	if (sizeof(wchar_t) == `2`)
15705	share_wstr = `1`;
15706	}
15707	else {
15708	assert(kind == PyUnicode_4BYTE_KIND);
15709	char_size = `4`;
15710	if (sizeof(wchar_t) == `4`)
15711	share_wstr = `1`;
15712	}
15713
15714	/ Ensure we won't overflow the length. /
15715	if (length > (PY_SSIZE_T_MAX / char_size - `1`)) {
15716	PyErr_NoMemory();
15717	goto onError;
15718	}
15719	data = PyObject_Malloc((length + `1`) * char_size);
15720	if (data == NULL) {
15721	PyErr_NoMemory();
15722	goto onError;
15723	}
15724
15725	_PyUnicode_DATA_ANY(self) = data;
15726	if (share_utf8) {
15727	_PyUnicode_UTF8_LENGTH(self) = length;
15728	_PyUnicode_UTF8(self) = data;
15729	}
15730	if (share_wstr) {
15731	_PyUnicode_WSTR_LENGTH(self) = length;
15732	_PyUnicode_WSTR(self) = (wchar_t *)data;
15733	}
15734
15735	memcpy(data, PyUnicode_DATA(unicode),
15736	kind * (length + `1`));
15737	assert(_PyUnicode_CheckConsistency(self, `1`));
15738	#ifdef Py_DEBUG
15739	_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15740	#endif
15741	return self;
15742
15743	onError:
15744	Py_DECREF(self);
15745	return NULL;
15746	}
15747
15748	PyDoc_STRVAR(unicode_doc,
15749	"str(object='') -> str\n\
15750	str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15751	\n\
15752	Create a new string object from the given object. If encoding or\n\
15753	errors is specified, then the object must expose a data buffer\n\
15754	that will be decoded using the given encoding and error handler.\n\
15755	Otherwise, returns the result of object.__str__() (if defined)\n\
15756	or repr(object).\n\
15757	encoding defaults to sys.getdefaultencoding().\n\
15758	errors defaults to 'strict'.");
15759
15760	static PyObject unicode_iter(PyObject seq);
15761
15762	PyTypeObject PyUnicode_Type = {
15763	PyVarObject_HEAD_INIT(&PyType_Type, `0`)
15764	"str", / tp_name /
15765	sizeof(PyUnicodeObject), / tp_basicsize /
15766	`0`, / tp_itemsize /
15767	/ Slots /
15768	(destructor)unicode_dealloc, / tp_dealloc /
15769	`0`, / tp_vectorcall_offset /
15770	`0`, / tp_getattr /
15771	`0`, / tp_setattr /
15772	`0`, / tp_as_async /
15773	unicode_repr, / tp_repr /
15774	&unicode_as_number, / tp_as_number /
15775	&unicode_as_sequence, / tp_as_sequence /
15776	&unicode_as_mapping, / tp_as_mapping /
15777	(hashfunc) unicode_hash, / tp_hash/
15778	`0`, / tp_call/
15779	(reprfunc) unicode_str, / tp_str /
15780	PyObject_GenericGetAttr, / tp_getattro /
15781	`0`, / tp_setattro /
15782	`0`, / tp_as_buffer /
15783	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE \|
15784	Py_TPFLAGS_UNICODE_SUBCLASS \|
15785	_Py_TPFLAGS_MATCH_SELF, / tp_flags /
15786	unicode_doc, / tp_doc /
15787	`0`, / tp_traverse /
15788	`0`, / tp_clear /
15789	PyUnicode_RichCompare, / tp_richcompare /
15790	`0`, / tp_weaklistoffset /
15791	unicode_iter, / tp_iter /
15792	`0`, / tp_iternext /
15793	unicode_methods, / tp_methods /
15794	`0`, / tp_members /
15795	`0`, / tp_getset /
15796	&PyBaseObject_Type, / tp_base /
15797	`0`, / tp_dict /
15798	`0`, / tp_descr_get /
15799	`0`, / tp_descr_set /
15800	`0`, / tp_dictoffset /
15801	`0`, / tp_init /
15802	`0`, / tp_alloc /
15803	unicode_new, / tp_new /
15804	PyObject_Del, / tp_free /
15805	};
15806
15807	/ Initialize the Unicode implementation /
15808
15809	PyStatus
15810	_PyUnicode_Init(PyInterpreterState *interp)
15811	{
15812	struct _Py_unicode_state *state = &interp->unicode;
15813	if (unicode_create_empty_string_singleton(state) < `0`) {
15814	return _PyStatus_NO_MEMORY();
15815	}
15816
15817	if (_Py_IsMainInterpreter(interp)) {
15818	/ initialize the linebreak bloom filter /
15819	const Py_UCS2 linebreak[] = {
15820	`0x000A`, / LINE FEED /
15821	`0x000D`, / CARRIAGE RETURN /
15822	`0x001C`, / FILE SEPARATOR /
15823	`0x001D`, / GROUP SEPARATOR /
15824	`0x001E`, / RECORD SEPARATOR /
15825	`0x0085`, / NEXT LINE /
15826	`0x2028`, / LINE SEPARATOR /
15827	`0x2029`, / PARAGRAPH SEPARATOR /
15828	};
15829	bloom_linebreak = make_bloom_mask(
15830	PyUnicode_2BYTE_KIND, linebreak,
15831	Py_ARRAY_LENGTH(linebreak));
15832	}
15833
15834	return _PyStatus_OK();
15835	}
15836
15837
15838	PyStatus
15839	_PyUnicode_InitTypes(void)
15840	{
15841	if (PyType_Ready(&PyUnicode_Type) < `0`) {
15842	return _PyStatus_ERR("Can't initialize unicode type");
15843	}
15844	if (PyType_Ready(&EncodingMapType) < `0`) {
15845	return _PyStatus_ERR("Can't initialize encoding map type");
15846	}
15847	if (PyType_Ready(&PyFieldNameIter_Type) < `0`) {
15848	return _PyStatus_ERR("Can't initialize field name iterator type");
15849	}
15850	if (PyType_Ready(&PyFormatterIter_Type) < `0`) {
15851	return _PyStatus_ERR("Can't initialize formatter iter type");
15852	}
15853	return _PyStatus_OK();
15854	}
15855
15856
15857	void
15858	PyUnicode_InternInPlace(PyObject **p)
15859	{
15860	PyObject s = p;
15861	#ifdef Py_DEBUG
15862	assert(s != NULL);
15863	assert(_PyUnicode_CHECK(s));
15864	#else
15865	if (s == NULL \|\| !PyUnicode_Check(s)) {
15866	return;
15867	}
15868	#endif
15869
15870	/ If it's a subclass, we don't really know what putting*
15871	it in the interned dict might do. /*
15872	if (!PyUnicode_CheckExact(s)) {
15873	return;
15874	}
15875
15876	if (PyUnicode_CHECK_INTERNED(s)) {
15877	return;
15878	}
15879
15880	#ifdef INTERNED_STRINGS
15881	if (PyUnicode_READY(s) == -`1`) {
15882	PyErr_Clear();
15883	return;
15884	}
15885
15886	if (interned == NULL) {
15887	interned = PyDict_New();
15888	if (interned == NULL) {
15889	PyErr_Clear(); / Don't leave an exception /
15890	return;
15891	}
15892	}
15893
15894	PyObject *t = PyDict_SetDefault(interned, s, s);
15895	if (t == NULL) {
15896	PyErr_Clear();
15897	return;
15898	}
15899
15900	if (t != s) {
15901	Py_INCREF(t);
15902	Py_SETREF(*p, t);
15903	return;
15904	}
15905
15906	/ The two references in interned dict (key and value) are not counted by*
15907	refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15908	this. /*
15909	Py_SET_REFCNT(s, Py_REFCNT(s) - `2`);
15910	_PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15911	#else
15912	// PyDict expects that interned strings have their hash
15913	// (PyASCIIObject.hash) already computed.
15914	(void)unicode_hash(s);
15915	#endif
15916	}
15917
15918	void
15919	PyUnicode_InternImmortal(PyObject **p)
15920	{
15921	if (PyErr_WarnEx(PyExc_DeprecationWarning,
15922	"PyUnicode_InternImmortal() is deprecated; "
15923	"use PyUnicode_InternInPlace() instead", `1`) < `0`)
15924	{
15925	// The function has no return value, the exception cannot
15926	// be reported to the caller, so just log it.
15927	PyErr_WriteUnraisable(NULL);
15928	}
15929
15930	PyUnicode_InternInPlace(p);
15931	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15932	_PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15933	Py_INCREF(*p);
15934	}
15935	}
15936
15937	PyObject *
15938	PyUnicode_InternFromString(const char *cp)
15939	{
15940	PyObject *s = PyUnicode_FromString(cp);
15941	if (s == NULL)
15942	return NULL;
15943	PyUnicode_InternInPlace(&s);
15944	return s;
15945	}
15946
15947
15948	void
15949	_PyUnicode_ClearInterned(PyInterpreterState *interp)
15950	{
15951	if (!_Py_IsMainInterpreter(interp)) {
15952	// interned dict is shared by all interpreters
15953	return;
15954	}
15955
15956	if (interned == NULL) {
15957	return;
15958	}
15959	assert(PyDict_CheckExact(interned));
15960
15961	/ Interned unicode strings are not forcibly deallocated; rather, we give*
15962	them their stolen references back, and then clear and DECREF the
15963	interned dict. /*
15964
15965	#ifdef INTERNED_STATS
15966	fprintf(stderr, "releasing %zd interned strings\n",
15967	PyDict_GET_SIZE(interned));
15968
15969	Py_ssize_t immortal_size = `0`, mortal_size = `0`;
15970	#endif
15971	Py_ssize_t pos = `0`;
15972	PyObject s, ignored_value;
15973	while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15974	assert(PyUnicode_IS_READY(s));
15975
15976	switch (PyUnicode_CHECK_INTERNED(s)) {
15977	case SSTATE_INTERNED_IMMORTAL:
15978	Py_SET_REFCNT(s, Py_REFCNT(s) + `1`);
15979	#ifdef INTERNED_STATS
15980	immortal_size += PyUnicode_GET_LENGTH(s);
15981	#endif
15982	break;
15983	case SSTATE_INTERNED_MORTAL:
15984	// Restore the two references (key and value) ignored
15985	// by PyUnicode_InternInPlace().
15986	Py_SET_REFCNT(s, Py_REFCNT(s) + `2`);
15987	#ifdef INTERNED_STATS
15988	mortal_size += PyUnicode_GET_LENGTH(s);
15989	#endif
15990	break;
15991	case SSTATE_NOT_INTERNED:
15992	/ fall through /
15993	default:
15994	Py_UNREACHABLE();
15995	}
15996	_PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15997	}
15998	#ifdef INTERNED_STATS
15999	fprintf(stderr,
16000	"total size of all interned strings: %zd/%zd mortal/immortal\n",
16001	mortal_size, immortal_size);
16002	#endif
16003
16004	PyDict_Clear(interned);
16005	Py_CLEAR(interned);
16006	}
16007
16008
16009	/****************** Unicode Iterator ***********************/
16010
16011	typedef struct {
16012	PyObject_HEAD
16013	Py_ssize_t it_index;
16014	PyObject it_seq; /* Set to NULL when iterator is exhausted /
16015	} unicodeiterobject;
16016
16017	static void
16018	unicodeiter_dealloc(unicodeiterobject *it)
16019	{
16020	_PyObject_GC_UNTRACK(it);
16021	Py_XDECREF(it->it_seq);
16022	PyObject_GC_Del(it);
16023	}
16024
16025	static int
16026	unicodeiter_traverse(unicodeiterobject it, visitproc visit, void* *arg)
16027	{
16028	Py_VISIT(it->it_seq);
16029	return `0`;
16030	}
16031
16032	static PyObject *
16033	unicodeiter_next(unicodeiterobject *it)
16034	{
16035	PyObject seq, item;
16036
16037	assert(it != NULL);
16038	seq = it->it_seq;
16039	if (seq == NULL)
16040	return NULL;
16041	assert(_PyUnicode_CHECK(seq));
16042
16043	if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
16044	int kind = PyUnicode_KIND(seq);
16045	const void *data = PyUnicode_DATA(seq);
16046	Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
16047	item = PyUnicode_FromOrdinal(chr);
16048	if (item != NULL)
16049	++it->it_index;
16050	return item;
16051	}
16052
16053	it->it_seq = NULL;
16054	Py_DECREF(seq);
16055	return NULL;
16056	}
16057
16058	static PyObject *
16059	unicodeiter_len(unicodeiterobject it, PyObject Py_UNUSED(ignored))
16060	{
16061	Py_ssize_t len = `0`;
16062	if (it->it_seq)
16063	len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
16064	return PyLong_FromSsize_t(len);
16065	}
16066
16067	PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16068
16069	static PyObject *
16070	unicodeiter_reduce(unicodeiterobject it, PyObject Py_UNUSED(ignored))
16071	{
16072	_Py_IDENTIFIER(iter);
16073	if (it->it_seq != NULL) {
16074	return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
16075	it->it_seq, it->it_index);
16076	} else {
16077	PyObject u = (PyObject )_PyUnicode_New(`0`);
16078	if (u == NULL)
16079	return NULL;
16080	return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
16081	}
16082	}
16083
16084	PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16085
16086	static PyObject *
16087	unicodeiter_setstate(unicodeiterobject it, PyObject state)
16088	{
16089	Py_ssize_t index = PyLong_AsSsize_t(state);
16090	if (index == -`1` && PyErr_Occurred())
16091	return NULL;
16092	if (it->it_seq != NULL) {
16093	if (index < `0`)
16094	index = `0`;
16095	else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16096	index = PyUnicode_GET_LENGTH(it->it_seq); / iterator truncated /
16097	it->it_index = index;
16098	}
16099	Py_RETURN_NONE;
16100	}
16101
16102	PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16103
16104	static PyMethodDef unicodeiter_methods[] = {
16105	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
16106	length_hint_doc},
16107	{"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16108	reduce_doc},
16109	{"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
16110	setstate_doc},
16111	{NULL, NULL} / sentinel /
16112	};
16113
16114	PyTypeObject PyUnicodeIter_Type = {
16115	PyVarObject_HEAD_INIT(&PyType_Type, `0`)
16116	"str_iterator", / tp_name /
16117	sizeof(unicodeiterobject), / tp_basicsize /
16118	`0`, / tp_itemsize /
16119	/ methods /
16120	(destructor)unicodeiter_dealloc, / tp_dealloc /
16121	`0`, / tp_vectorcall_offset /
16122	`0`, / tp_getattr /
16123	`0`, / tp_setattr /
16124	`0`, / tp_as_async /
16125	`0`, / tp_repr /
16126	`0`, / tp_as_number /
16127	`0`, / tp_as_sequence /
16128	`0`, / tp_as_mapping /
16129	`0`, / tp_hash /
16130	`0`, / tp_call /
16131	`0`, / tp_str /
16132	PyObject_GenericGetAttr, / tp_getattro /
16133	`0`, / tp_setattro /
16134	`0`, / tp_as_buffer /
16135	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_HAVE_GC,/ tp_flags /
16136	`0`, / tp_doc /
16137	(traverseproc)unicodeiter_traverse, / tp_traverse /
16138	`0`, / tp_clear /
16139	`0`, / tp_richcompare /
16140	`0`, / tp_weaklistoffset /
16141	PyObject_SelfIter, / tp_iter /
16142	(iternextfunc)unicodeiter_next, / tp_iternext /
16143	unicodeiter_methods, / tp_methods /
16144	`0`,
16145	};
16146
16147	static PyObject *
16148	unicode_iter(PyObject *seq)
16149	{
16150	unicodeiterobject *it;
16151
16152	if (!PyUnicode_Check(seq)) {
16153	PyErr_BadInternalCall();
16154	return NULL;
16155	}
16156	if (PyUnicode_READY(seq) == -`1`)
16157	return NULL;
16158	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16159	if (it == NULL)
16160	return NULL;
16161	it->it_index = `0`;
16162	Py_INCREF(seq);
16163	it->it_seq = seq;
16164	_PyObject_GC_TRACK(it);
16165	return (PyObject *)it;
16166	}
16167
16168	static int
16169	encode_wstr_utf8(wchar_t wstr, char* *str, const* char *name)
16170	{
16171	int res;
16172	res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, `1`, _Py_ERROR_STRICT);
16173	if (res == -`2`) {
16174	PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16175	return -`1`;
16176	}
16177	if (res < `0`) {
16178	PyErr_NoMemory();
16179	return -`1`;
16180	}
16181	return `0`;
16182	}
16183
16184
16185	static int
16186	config_get_codec_name(wchar_t **config_encoding)
16187	{
16188	char *encoding;
16189	if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < `0`) {
16190	return -`1`;
16191	}
16192
16193	PyObject *name_obj = NULL;
16194	PyObject *codec = _PyCodec_Lookup(encoding);
16195	PyMem_RawFree(encoding);
16196
16197	if (!codec)
16198	goto error;
16199
16200	name_obj = PyObject_GetAttrString(codec, "name");
16201	Py_CLEAR(codec);
16202	if (!name_obj) {
16203	goto error;
16204	}
16205
16206	wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16207	Py_DECREF(name_obj);
16208	if (wname == NULL) {
16209	goto error;
16210	}
16211
16212	wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16213	if (raw_wname == NULL) {
16214	PyMem_Free(wname);
16215	PyErr_NoMemory();
16216	goto error;
16217	}
16218
16219	PyMem_RawFree(*config_encoding);
16220	*config_encoding = raw_wname;
16221
16222	PyMem_Free(wname);
16223	return `0`;
16224
16225	error:
16226	Py_XDECREF(codec);
16227	Py_XDECREF(name_obj);
16228	return -`1`;
16229	}
16230
16231
16232	static PyStatus
16233	init_stdio_encoding(PyInterpreterState *interp)
16234	{
16235	/ Update the stdio encoding to the normalized Python codec name. /
16236	PyConfig config = (PyConfig)_PyInterpreterState_GetConfig(interp);
16237	if (config_get_codec_name(&config->stdio_encoding) < `0`) {
16238	return _PyStatus_ERR("failed to get the Python codec name "
16239	"of the stdio encoding");
16240	}
16241	return _PyStatus_OK();
16242	}
16243
16244
16245	static int
16246	init_fs_codec(PyInterpreterState *interp)
16247	{
16248	const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16249
16250	_Py_error_handler error_handler;
16251	error_handler = get_error_handler_wide(config->filesystem_errors);
16252	if (error_handler == _Py_ERROR_UNKNOWN) {
16253	PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16254	return -`1`;
16255	}
16256
16257	char encoding, errors;
16258	if (encode_wstr_utf8(config->filesystem_encoding,
16259	&encoding,
16260	"filesystem_encoding") < `0`) {
16261	return -`1`;
16262	}
16263
16264	if (encode_wstr_utf8(config->filesystem_errors,
16265	&errors,
16266	"filesystem_errors") < `0`) {
16267	PyMem_RawFree(encoding);
16268	return -`1`;
16269	}
16270
16271	struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16272	PyMem_RawFree(fs_codec->encoding);
16273	fs_codec->encoding = encoding;
16274	/ encoding has been normalized by init_fs_encoding() /
16275	fs_codec->utf8 = (strcmp(encoding, "utf-8") == `0`);
16276	PyMem_RawFree(fs_codec->errors);
16277	fs_codec->errors = errors;
16278	fs_codec->error_handler = error_handler;
16279
16280	#ifdef _Py_FORCE_UTF8_FS_ENCODING
16281	assert(fs_codec->utf8 == `1`);
16282	#endif
16283
16284	/ At this point, PyUnicode_EncodeFSDefault() and*
16285	PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16286	the C implementation of the filesystem encoding. /*
16287
16288	/ Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors*
16289	global configuration variables. /*
16290	if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16291	fs_codec->errors) < `0`) {
16292	PyErr_NoMemory();
16293	return -`1`;
16294	}
16295	return `0`;
16296	}
16297
16298
16299	static PyStatus
16300	init_fs_encoding(PyThreadState *tstate)
16301	{
16302	PyInterpreterState *interp = tstate->interp;
16303
16304	/ Update the filesystem encoding to the normalized Python codec name.*
16305	For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16306	(Python codec name). /*
16307	PyConfig config = (PyConfig)_PyInterpreterState_GetConfig(interp);
16308	if (config_get_codec_name(&config->filesystem_encoding) < `0`) {
16309	_Py_DumpPathConfig(tstate);
16310	return _PyStatus_ERR("failed to get the Python codec "
16311	"of the filesystem encoding");
16312	}
16313
16314	if (init_fs_codec(interp) < `0`) {
16315	return _PyStatus_ERR("cannot initialize filesystem codec");
16316	}
16317	return _PyStatus_OK();
16318	}
16319
16320
16321	PyStatus
16322	_PyUnicode_InitEncodings(PyThreadState *tstate)
16323	{
16324	PyStatus status = init_fs_encoding(tstate);
16325	if (_PyStatus_EXCEPTION(status)) {
16326	return status;
16327	}
16328
16329	return init_stdio_encoding(tstate->interp);
16330	}
16331
16332
16333	static void
16334	_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16335	{
16336	PyMem_RawFree(fs_codec->encoding);
16337	fs_codec->encoding = NULL;
16338	fs_codec->utf8 = `0`;
16339	PyMem_RawFree(fs_codec->errors);
16340	fs_codec->errors = NULL;
16341	fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16342	}
16343
16344
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler and re-install the filesystem codec.

   Both new strings are allocated before the configuration is touched, so
   an allocation failure leaves the configuration unchanged.
   Returns 0 on success, -1 with an exception set on failure. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
    if (encoding == NULL || errors == NULL) {
        PyMem_RawFree(encoding);
        PyMem_RawFree(errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = errors;

    return init_fs_codec(interp);
}
#endif
16370
16371
16372	void
16373	_PyUnicode_Fini(PyInterpreterState *interp)
16374	{
16375	struct _Py_unicode_state *state = &interp->unicode;
16376
16377	if (_Py_IsMainInterpreter(interp)) {
16378	// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16379	assert(interned == NULL);
16380	// bpo-47182: force a unicodedata CAPI capsule re-import on
16381	// subsequent initialization of main interpreter.
16382	ucnhash_capi = NULL;
16383	}
16384
16385	_PyUnicode_FiniEncodings(&state->fs_codec);
16386
16387	unicode_clear_identifiers(state);
16388
16389	for (Py_ssize_t i = `0`; i < `256`; i++) {
16390	Py_CLEAR(state->latin1[i]);
16391	}
16392	Py_CLEAR(state->empty_string);
16393	}
16394
16395
16396	/ A _string module, to export formatter_parser and formatter_field_name_split*
16397	to the string.Formatter class implemented in Python. /*
16398
16399	static PyMethodDef _string_methods[] = {
16400	{"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16401	METH_O, PyDoc_STR("split the argument as a field name")},
16402	{"formatter_parser", (PyCFunction) formatter_parser,
16403	METH_O, PyDoc_STR("parse the argument as a format string")},
16404	{NULL, NULL}
16405	};
16406
16407	static struct PyModuleDef _string_module = {
16408	PyModuleDef_HEAD_INIT,
16409	.m_name = "_string",
16410	.m_doc = PyDoc_STR("string helper module"),
16411	.m_size = `0`,
16412	.m_methods = _string_methods,
16413	};
16414
16415	PyMODINIT_FUNC
16416	PyInit__string(void)
16417	{
16418	return PyModuleDef_Init(&_string_module);
16419	}
16420
16421
16422	#ifdef __cplusplus
16423	}
16424	#endif
16425

Browse the source code of python/Objects/unicodeobject.c