1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <[email protected]>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "pycore_abstract.h" // _PyIndex_Check()
44#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
45#include "pycore_bytes_methods.h" // _Py_bytes_lower()
46#include "pycore_format.h" // F_LJUST
47#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
55
56#ifdef MS_WINDOWS
57#include <windows.h>
58#endif
59
60#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62#endif
63
64/* Uncomment to display statistics on interned strings at exit
65 in _PyUnicode_ClearInterned(). */
66/* #define INTERNED_STATS 1 */
67
68
69/*[clinic input]
70class str "PyObject *" "&PyUnicode_Type"
71[clinic start generated code]*/
72/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73
74/*[python input]
75class Py_UCS4_converter(CConverter):
76 type = 'Py_UCS4'
77 converter = 'convert_uc'
78
79 def converter_init(self):
80 if self.default is not unspecified:
81 self.c_default = ascii(self.default)
82 if len(self.c_default) > 4 or self.c_default[0] != "'":
83 self.c_default = hex(ord(self.default))
84
85[python start generated code]*/
86/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
87
88/* --- Globals ------------------------------------------------------------
89
90NOTE: In the interpreter's initialization phase, some globals are currently
91 initialized dynamically as needed. In the process Unicode objects may
92 be created before the Unicode type is ready.
93
94*/
95
96
97#ifdef __cplusplus
98extern "C" {
99#endif
100
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros below: debug builds call
   _PyUnicode_CheckConsistency() with check_content=0, other builds only
   perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
110
/* Unchecked access to the utf8 field of the compact object header.
   Expands to an lvalue. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 data of a ready string: for compact ASCII
   strings this is the memory located right after the PyASCIIObject header,
   otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field.  Expands to an lvalue. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the length field for compact
   ASCII strings, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr field.  Expands to an lvalue. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
129
/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wstr representation: the length field for compact ASCII
   strings, otherwise the wstr_length field.  The argument is parenthesized
   before being cast so that any pointer expression (for example `p + 1`)
   is converted as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors.  Each one expands to an lvalue on the
   corresponding structure member and performs no validation. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same as the state.kind / length fields, but asserts _PyUnicode_CHECK(op)
   first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
152
/* Local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   evaluates to 0 when the string is already ready and otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
159
/* True if the utf8 pointer aliases the character data.  Must not be used on
   compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the character data.  The macro only
   refers to its own parameter, so it works whatever the caller's variable is
   called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
167
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
181
/* Copy a run of characters while converting each one from `from_type` to
   `to_type` with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         delimit the source characters ("from_type *").
   to:                 destination buffer ("to_type *"); it must have room
                       for (end - begin) items.

   Each argument is evaluated exactly once.  The bulk of the input is
   processed four items per iteration, the remaining 0-3 items one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_dst = (to_type *)(to);                \
        const from_type *_src = (const from_type *)(begin);\
        const from_type *_stop = (const from_type *)(end);\
        Py_ssize_t _count = (_stop) - (_src);           \
        const from_type *_block_stop =                  \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);      \
        for (; _src < (_block_stop); _src += 4, _dst += 4) { \
            _dst[0] = (to_type) _src[0];                \
            _dst[1] = (to_type) _src[1];                \
            _dst[2] = (to_type) _src[2];                \
            _dst[3] = (to_type) _src[3];                \
        }                                               \
        for (; _src < (_stop); ++_src, ++_dst) {        \
            *_dst = (to_type) *_src;                    \
        }                                               \
    } while (0)
205
/* NOTE(review): the percentages below match 1/OVERALLOCATE_FACTOR, i.e. the
   factor appears to be used as a divisor of the requested size -- confirm
   against the code that uses it. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% gives the best results */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% gives the best results */
#  define OVERALLOCATE_FACTOR 4
#endif
213
/* bpo-40521: Interned strings are shared by all interpreters. */
#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
#  define INTERNED_STRINGS
#endif

/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
#ifdef INTERNED_STRINGS
static PyObject *interned = NULL;
#endif
230
231static struct _Py_unicode_state*
232get_unicode_state(void)
233{
234 PyInterpreterState *interp = _PyInterpreterState_GET();
235 return &interp->unicode;
236}
237
238
239// Return a borrowed reference to the empty string singleton.
240static inline PyObject* unicode_get_empty(void)
241{
242 struct _Py_unicode_state *state = get_unicode_state();
243 // unicode_get_empty() must not be called before _PyUnicode_Init()
244 // or after _PyUnicode_Fini()
245 assert(state->empty_string != NULL);
246 return state->empty_string;
247}
248
249
250// Return a strong reference to the empty string singleton.
251static inline PyObject* unicode_new_empty(void)
252{
253 PyObject *empty = unicode_get_empty();
254 Py_INCREF(empty);
255 return empty;
256}
257
/* Return a new strong reference to the empty string singleton from the
   enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
262
263static inline void
264unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
265 Py_ssize_t start, Py_ssize_t length)
266{
267 assert(0 <= start);
268 assert(kind != PyUnicode_WCHAR_KIND);
269 switch (kind) {
270 case PyUnicode_1BYTE_KIND: {
271 assert(value <= 0xff);
272 Py_UCS1 ch = (unsigned char)value;
273 Py_UCS1 *to = (Py_UCS1 *)data + start;
274 memset(to, ch, length);
275 break;
276 }
277 case PyUnicode_2BYTE_KIND: {
278 assert(value <= 0xffff);
279 Py_UCS2 ch = (Py_UCS2)value;
280 Py_UCS2 *to = (Py_UCS2 *)data + start;
281 const Py_UCS2 *end = to + length;
282 for (; to < end; ++to) *to = ch;
283 break;
284 }
285 case PyUnicode_4BYTE_KIND: {
286 assert(value <= MAX_UNICODE);
287 Py_UCS4 ch = value;
288 Py_UCS4 * to = (Py_UCS4 *)data + start;
289 const Py_UCS4 *end = to + length;
290 for (; to < end; ++to) *to = ch;
291 break;
292 }
293 default: Py_UNREACHABLE();
294 }
295}
296
297
/* Forward declarations: unicode writer helpers and the UTF-8
   encoder/decoder, declared here so that they can be referenced before
   their definitions. */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
                    const char *errors);
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed);
310
/* Fast detection of the most frequent whitespace characters: a lookup table
   indexed by a 7-bit character code; the entry is 1 for whitespace and 0
   (implicitly) for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
341
/* forward */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of UCS1, UCS2 or UCS4 characters and its
   length in characters. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error reporting helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
367
/* Same for linebreaks: a lookup table indexed by a 7-bit character code;
   the entry is 1 for line break characters and 0 (implicitly) otherwise. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
395
/* Forward declaration: referenced by the Py_UCS4 Argument Clinic converter
   declared near the top of this file (converter = 'convert_uc'). */
static int convert_uc(PyObject *obj, void *addr);
397
398#include "clinic/unicodeobject.c.h"
399
400_Py_error_handler
401_Py_GetErrorHandler(const char *errors)
402{
403 if (errors == NULL || strcmp(errors, "strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (strcmp(errors, "surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (strcmp(errors, "replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (strcmp(errors, "ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (strcmp(errors, "backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (strcmp(errors, "surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (strcmp(errors, "xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425}
426
427
428static _Py_error_handler
429get_error_handler_wide(const wchar_t *errors)
430{
431 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
432 return _Py_ERROR_STRICT;
433 }
434 if (wcscmp(errors, L"surrogateescape") == 0) {
435 return _Py_ERROR_SURROGATEESCAPE;
436 }
437 if (wcscmp(errors, L"replace") == 0) {
438 return _Py_ERROR_REPLACE;
439 }
440 if (wcscmp(errors, L"ignore") == 0) {
441 return _Py_ERROR_IGNORE;
442 }
443 if (wcscmp(errors, L"backslashreplace") == 0) {
444 return _Py_ERROR_BACKSLASHREPLACE;
445 }
446 if (wcscmp(errors, L"surrogatepass") == 0) {
447 return _Py_ERROR_SURROGATEPASS;
448 }
449 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
450 return _Py_ERROR_XMLCHARREFREPLACE;
451 }
452 return _Py_ERROR_OTHER;
453}
454
455
456static inline int
457unicode_check_encoding_errors(const char *encoding, const char *errors)
458{
459 if (encoding == NULL && errors == NULL) {
460 return 0;
461 }
462
463 PyInterpreterState *interp = _PyInterpreterState_GET();
464#ifndef Py_DEBUG
465 /* In release mode, only check in development mode (-X dev) */
466 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
467 return 0;
468 }
469#else
470 /* Always check in debug mode */
471#endif
472
473 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
474 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
475 if (!interp->unicode.fs_codec.encoding) {
476 return 0;
477 }
478
479 /* Disable checks during Python finalization. For example, it allows to
480 call _PyObject_Dump() during finalization for debugging purpose. */
481 if (interp->finalizing) {
482 return 0;
483 }
484
485 if (encoding != NULL) {
486 PyObject *handler = _PyCodec_Lookup(encoding);
487 if (handler == NULL) {
488 return -1;
489 }
490 Py_DECREF(handler);
491 }
492
493 if (errors != NULL) {
494 PyObject *handler = PyCodec_LookupError(errors);
495 if (handler == NULL) {
496 return -1;
497 }
498 Py_DECREF(handler);
499 }
500 return 0;
501}
502
503
/* Verify the internal invariants of a Unicode object.

   op:            object to check; must not be NULL (asserted).
   check_content: when non-zero, additionally scan every character to verify
                  that the narrowest suitable kind is used and that the data
                  is followed by a null character.  This is an O(n)
                  operation.

   Every violated invariant is reported through
   _PyObject_ASSERT_FAILED_MSG() together with the text of the failed
   expression.  Returns 1 when the end of the function is reached, which
   allows usage as assert(_PyUnicode_CheckConsistency(op, 1)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII string. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact non-ASCII string, the characters are stored right
           after the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            /* Case 3: non-compact string which is not ready yet: only the
               wstr representation exists. */
            if (kind == PyUnicode_WCHAR_KIND) {
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            /* Case 4: non-compact ready string. */
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                /* An ASCII string shares its data with the UTF-8
                   representation, other strings must not. */
                if (ascii->state.ascii) {
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* wstr shares the character data exactly when the kind has the same
           width as wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* A missing representation must have a length of zero. */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* Compute the largest code point of the string. */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall into the range which requires
           exactly this kind. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* The character following the last one must be null. */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
623
624
625static PyObject*
626unicode_result_wchar(PyObject *unicode)
627{
628#ifndef Py_DEBUG
629 Py_ssize_t len;
630
631 len = _PyUnicode_WSTR_LENGTH(unicode);
632 if (len == 0) {
633 Py_DECREF(unicode);
634 _Py_RETURN_UNICODE_EMPTY();
635 }
636
637 if (len == 1) {
638 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
639 if ((Py_UCS4)ch < 256) {
640 Py_DECREF(unicode);
641 return get_latin1_char((unsigned char)ch);
642 }
643 }
644
645 if (_PyUnicode_Ready(unicode) < 0) {
646 Py_DECREF(unicode);
647 return NULL;
648 }
649#else
650 assert(Py_REFCNT(unicode) == 1);
651
652 /* don't make the result ready in debug mode to ensure that the caller
653 makes the string ready before using it */
654 assert(_PyUnicode_CheckConsistency(unicode, 1));
655#endif
656 return unicode;
657}
658
659static PyObject*
660unicode_result_ready(PyObject *unicode)
661{
662 Py_ssize_t length;
663
664 length = PyUnicode_GET_LENGTH(unicode);
665 if (length == 0) {
666 PyObject *empty = unicode_get_empty();
667 if (unicode != empty) {
668 Py_DECREF(unicode);
669 Py_INCREF(empty);
670 }
671 return empty;
672 }
673
674 if (length == 1) {
675 int kind = PyUnicode_KIND(unicode);
676 if (kind == PyUnicode_1BYTE_KIND) {
677 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
678 Py_UCS1 ch = data[0];
679 struct _Py_unicode_state *state = get_unicode_state();
680 PyObject *latin1_char = state->latin1[ch];
681 if (latin1_char != NULL) {
682 if (unicode != latin1_char) {
683 Py_INCREF(latin1_char);
684 Py_DECREF(unicode);
685 }
686 return latin1_char;
687 }
688 else {
689 assert(_PyUnicode_CheckConsistency(unicode, 1));
690 Py_INCREF(unicode);
691 state->latin1[ch] = unicode;
692 return unicode;
693 }
694 }
695 else {
696 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
697 }
698 }
699
700 assert(_PyUnicode_CheckConsistency(unicode, 1));
701 return unicode;
702}
703
704static PyObject*
705unicode_result(PyObject *unicode)
706{
707 assert(_PyUnicode_CHECK(unicode));
708 if (PyUnicode_IS_READY(unicode))
709 return unicode_result_ready(unicode);
710 else
711 return unicode_result_wchar(unicode);
712}
713
714static PyObject*
715unicode_result_unchanged(PyObject *unicode)
716{
717 if (PyUnicode_CheckExact(unicode)) {
718 if (PyUnicode_READY(unicode) == -1)
719 return NULL;
720 Py_INCREF(unicode);
721 return unicode;
722 }
723 else
724 /* Subtype -- return genuine unicode string with the same value. */
725 return _PyUnicode_Copy(unicode);
726}
727
728/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
729 ASCII, Latin1, UTF-8, etc. */
730static char*
731backslashreplace(_PyBytesWriter *writer, char *str,
732 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
733{
734 Py_ssize_t size, i;
735 Py_UCS4 ch;
736 enum PyUnicode_Kind kind;
737 const void *data;
738
739 assert(PyUnicode_IS_READY(unicode));
740 kind = PyUnicode_KIND(unicode);
741 data = PyUnicode_DATA(unicode);
742
743 size = 0;
744 /* determine replacement size */
745 for (i = collstart; i < collend; ++i) {
746 Py_ssize_t incr;
747
748 ch = PyUnicode_READ(kind, data, i);
749 if (ch < 0x100)
750 incr = 2+2;
751 else if (ch < 0x10000)
752 incr = 2+4;
753 else {
754 assert(ch <= MAX_UNICODE);
755 incr = 2+8;
756 }
757 if (size > PY_SSIZE_T_MAX - incr) {
758 PyErr_SetString(PyExc_OverflowError,
759 "encoded result is too long for a Python string");
760 return NULL;
761 }
762 size += incr;
763 }
764
765 str = _PyBytesWriter_Prepare(writer, str, size);
766 if (str == NULL)
767 return NULL;
768
769 /* generate replacement */
770 for (i = collstart; i < collend; ++i) {
771 ch = PyUnicode_READ(kind, data, i);
772 *str++ = '\\';
773 if (ch >= 0x00010000) {
774 *str++ = 'U';
775 *str++ = Py_hexdigits[(ch>>28)&0xf];
776 *str++ = Py_hexdigits[(ch>>24)&0xf];
777 *str++ = Py_hexdigits[(ch>>20)&0xf];
778 *str++ = Py_hexdigits[(ch>>16)&0xf];
779 *str++ = Py_hexdigits[(ch>>12)&0xf];
780 *str++ = Py_hexdigits[(ch>>8)&0xf];
781 }
782 else if (ch >= 0x100) {
783 *str++ = 'u';
784 *str++ = Py_hexdigits[(ch>>12)&0xf];
785 *str++ = Py_hexdigits[(ch>>8)&0xf];
786 }
787 else
788 *str++ = 'x';
789 *str++ = Py_hexdigits[(ch>>4)&0xf];
790 *str++ = Py_hexdigits[ch&0xf];
791 }
792 return str;
793}
794
795/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
796 ASCII, Latin1, UTF-8, etc. */
797static char*
798xmlcharrefreplace(_PyBytesWriter *writer, char *str,
799 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
800{
801 Py_ssize_t size, i;
802 Py_UCS4 ch;
803 enum PyUnicode_Kind kind;
804 const void *data;
805
806 assert(PyUnicode_IS_READY(unicode));
807 kind = PyUnicode_KIND(unicode);
808 data = PyUnicode_DATA(unicode);
809
810 size = 0;
811 /* determine replacement size */
812 for (i = collstart; i < collend; ++i) {
813 Py_ssize_t incr;
814
815 ch = PyUnicode_READ(kind, data, i);
816 if (ch < 10)
817 incr = 2+1+1;
818 else if (ch < 100)
819 incr = 2+2+1;
820 else if (ch < 1000)
821 incr = 2+3+1;
822 else if (ch < 10000)
823 incr = 2+4+1;
824 else if (ch < 100000)
825 incr = 2+5+1;
826 else if (ch < 1000000)
827 incr = 2+6+1;
828 else {
829 assert(ch <= MAX_UNICODE);
830 incr = 2+7+1;
831 }
832 if (size > PY_SSIZE_T_MAX - incr) {
833 PyErr_SetString(PyExc_OverflowError,
834 "encoded result is too long for a Python string");
835 return NULL;
836 }
837 size += incr;
838 }
839
840 str = _PyBytesWriter_Prepare(writer, str, size);
841 if (str == NULL)
842 return NULL;
843
844 /* generate replacement */
845 for (i = collstart; i < collend; ++i) {
846 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
847 if (size < 0) {
848 return NULL;
849 }
850 str += size;
851 }
852 return str;
853}
854
855/* --- Bloom Filters ----------------------------------------------------- */
856
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
860
861/* the linebreak mask is set up by _PyUnicode_Init() below */
862
/* Number of bits used by a bloom mask: the largest of 128, 64 and 32 which
   does not exceed LONG_BIT.  Smaller values of LONG_BIT are rejected at
   compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
872
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters.  It starts with every bit set, so that
   BLOOM() reports a possible match for any character until another value is
   stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`,
   i.e. if `ch` may be one of the characters the mask was built from.  Both
   arguments are parenthesized so that any expression can be passed. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if `ch` is a line break character: characters below 128 are
   looked up in ascii_linebreak[], the other ones are tested with
   Py_UNICODE_ISLINEBREAK() only if the bloom mask does not rule them out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
882
883static inline BLOOM_MASK
884make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
885{
886#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
887 do { \
888 TYPE *data = (TYPE *)PTR; \
889 TYPE *end = data + LEN; \
890 Py_UCS4 ch; \
891 for (; data != end; data++) { \
892 ch = *data; \
893 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
894 } \
895 break; \
896 } while (0)
897
898 /* calculate simple bloom-style bitmask for a given unicode string */
899
900 BLOOM_MASK mask;
901
902 mask = 0;
903 switch (kind) {
904 case PyUnicode_1BYTE_KIND:
905 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
906 break;
907 case PyUnicode_2BYTE_KIND:
908 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
909 break;
910 case PyUnicode_4BYTE_KIND:
911 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
912 break;
913 default:
914 Py_UNREACHABLE();
915 }
916 return mask;
917
918#undef BLOOM_UPDATE
919}
920
921static int
922ensure_unicode(PyObject *obj)
923{
924 if (!PyUnicode_Check(obj)) {
925 PyErr_Format(PyExc_TypeError,
926 "must be str, not %.100s",
927 Py_TYPE(obj)->tp_name);
928 return -1;
929 }
930 return PyUnicode_READY(obj);
931}
932
933/* Compilation of templated routines */
934
935#define STRINGLIB_GET_EMPTY() unicode_get_empty()
936
937#include "stringlib/asciilib.h"
938#include "stringlib/fastsearch.h"
939#include "stringlib/partition.h"
940#include "stringlib/split.h"
941#include "stringlib/count.h"
942#include "stringlib/find.h"
943#include "stringlib/find_max_char.h"
944#include "stringlib/undef.h"
945
946#include "stringlib/ucs1lib.h"
947#include "stringlib/fastsearch.h"
948#include "stringlib/partition.h"
949#include "stringlib/split.h"
950#include "stringlib/count.h"
951#include "stringlib/find.h"
952#include "stringlib/replace.h"
953#include "stringlib/find_max_char.h"
954#include "stringlib/undef.h"
955
956#include "stringlib/ucs2lib.h"
957#include "stringlib/fastsearch.h"
958#include "stringlib/partition.h"
959#include "stringlib/split.h"
960#include "stringlib/count.h"
961#include "stringlib/find.h"
962#include "stringlib/replace.h"
963#include "stringlib/find_max_char.h"
964#include "stringlib/undef.h"
965
966#include "stringlib/ucs4lib.h"
967#include "stringlib/fastsearch.h"
968#include "stringlib/partition.h"
969#include "stringlib/split.h"
970#include "stringlib/count.h"
971#include "stringlib/find.h"
972#include "stringlib/replace.h"
973#include "stringlib/find_max_char.h"
974#include "stringlib/undef.h"
975
976_Py_COMP_DIAG_PUSH
977_Py_COMP_DIAG_IGNORE_DEPR_DECLS
978#include "stringlib/unicodedefs.h"
979#include "stringlib/fastsearch.h"
980#include "stringlib/count.h"
981#include "stringlib/find.h"
982#include "stringlib/undef.h"
983_Py_COMP_DIAG_POP
984
985#undef STRINGLIB_GET_EMPTY
986
987/* --- Unicode Object ----------------------------------------------------- */
988
989static inline Py_ssize_t
990findchar(const void *s, int kind,
991 Py_ssize_t size, Py_UCS4 ch,
992 int direction)
993{
994 switch (kind) {
995 case PyUnicode_1BYTE_KIND:
996 if ((Py_UCS1) ch != ch)
997 return -1;
998 if (direction > 0)
999 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000 else
1001 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002 case PyUnicode_2BYTE_KIND:
1003 if ((Py_UCS2) ch != ch)
1004 return -1;
1005 if (direction > 0)
1006 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007 else
1008 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009 case PyUnicode_4BYTE_KIND:
1010 if (direction > 0)
1011 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012 else
1013 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014 default:
1015 Py_UNREACHABLE();
1016 }
1017}
1018
1019#ifdef Py_DEBUG
1020/* Fill the data of a Unicode string with invalid characters to detect bugs
1021 earlier.
1022
1023 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1024 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1025 invalid character in Unicode 6.0. */
1026static void
1027unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1028{
1029 int kind = PyUnicode_KIND(unicode);
1030 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1031 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1032 if (length <= old_length)
1033 return;
1034 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1035}
1036#endif
1037
/* Resize a compact string to `length` characters by reallocating the object
   itself.

   unicode: ready, compact and modifiable string (asserted).
   length:  new length in characters.

   Returns the resized object, which is the pointer returned by
   PyObject_Realloc() and may therefore differ from `unicode`.  Returns NULL
   with MemoryError set if the new size would overflow Py_ssize_t or if the
   reallocation fails.  A separately allocated UTF-8 buffer is freed before
   the reallocation is attempted. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    char_size = PyUnicode_KIND(unicode);
    /* The header which precedes the characters depends on the layout. */
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Remember whether wstr aliases the character data: the pointer has to
       be updated after the reallocation. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Reject lengths for which struct_size + (length + 1) * char_size would
       exceed PY_SSIZE_T_MAX. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* The "+ 1" accounts for the trailing null character. */
    new_size = (struct_size + (length + 1) * char_size);

    /* Drop a separately allocated UTF-8 representation. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): this bookkeeping presumably balances the
       _Py_NewReference() calls below -- confirm. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* The original object is registered again before the error is
           reported. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the data: point it to the data of the new object. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* Drop a separately allocated wstr representation. */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Write the trailing null character. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1108
/* Resize a non-compact string in place so that it holds `length` characters.

   The caller must hold the only reference to the string (asserted below).
   If the string is ready, its character buffer is reallocated first; a wstr
   buffer that exists and is not shared with the character buffer is then
   reallocated as well.

   Return 0 on success, or -1 with MemoryError set on failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* (length + 1) * char_size must not overflow Py_ssize_t */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* Release a separately allocated UTF-8 buffer.  This happens before
           the reallocation, so the buffer is gone even if that fails. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_Free(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* On failure only the local `data` is overwritten; the object still
           references the original buffer. */
        data = (PyObject *)PyObject_Realloc(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* Re-point the representations that alias the character buffer. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the terminating null character. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a separately allocated wstr buffer also needs to be
           resized below. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* Reached for strings that are not ready, and for ready strings that own
       a wstr buffer distinct from the character buffer. */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_Realloc(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1187
1188static PyObject*
1189resize_copy(PyObject *unicode, Py_ssize_t length)
1190{
1191 Py_ssize_t copy_length;
1192 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193 PyObject *copy;
1194
1195 assert(PyUnicode_IS_READY(unicode));
1196
1197 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198 if (copy == NULL)
1199 return NULL;
1200
1201 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1203 return copy;
1204 }
1205 else {
1206 PyObject *w;
1207
1208 w = (PyObject*)_PyUnicode_New(length);
1209 if (w == NULL)
1210 return NULL;
1211 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212 copy_length = Py_MIN(copy_length, length);
1213 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214 copy_length * sizeof(wchar_t));
1215 return w;
1216 }
1217}
1218
1219/* We allocate one more byte to make sure the string is
1220 Ux0000 terminated; some code (e.g. new_identifier)
1221 relies on that.
1222
1223 XXX This allocator could further be enhanced by assuring that the
1224 free list never reduces its size below 1.
1225
1226*/
1227
1228static PyUnicodeObject *
1229_PyUnicode_New(Py_ssize_t length)
1230{
1231 PyUnicodeObject *unicode;
1232 size_t new_size;
1233
1234 /* Optimization for empty strings */
1235 if (length == 0) {
1236 return (PyUnicodeObject *)unicode_new_empty();
1237 }
1238
1239 /* Ensure we won't overflow the size. */
1240 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1241 return (PyUnicodeObject *)PyErr_NoMemory();
1242 }
1243 if (length < 0) {
1244 PyErr_SetString(PyExc_SystemError,
1245 "Negative size passed to _PyUnicode_New");
1246 return NULL;
1247 }
1248
1249 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1250 if (unicode == NULL)
1251 return NULL;
1252 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1253
1254 _PyUnicode_WSTR_LENGTH(unicode) = length;
1255 _PyUnicode_HASH(unicode) = -1;
1256 _PyUnicode_STATE(unicode).interned = 0;
1257 _PyUnicode_STATE(unicode).kind = 0;
1258 _PyUnicode_STATE(unicode).compact = 0;
1259 _PyUnicode_STATE(unicode).ready = 0;
1260 _PyUnicode_STATE(unicode).ascii = 0;
1261 _PyUnicode_DATA_ANY(unicode) = NULL;
1262 _PyUnicode_LENGTH(unicode) = 0;
1263 _PyUnicode_UTF8(unicode) = NULL;
1264 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1265
1266 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1267 if (!_PyUnicode_WSTR(unicode)) {
1268 Py_DECREF(unicode);
1269 PyErr_NoMemory();
1270 return NULL;
1271 }
1272
1273 /* Initialize the first element to guard against cases where
1274 * the caller fails before initializing str -- unicode_resize()
1275 * reads str[0], and the Keep-Alive optimization can keep memory
1276 * allocated for str alive across a call to unicode_dealloc(unicode).
1277 * We don't want unicode_resize to read uninitialized memory in
1278 * that case.
1279 */
1280 _PyUnicode_WSTR(unicode)[0] = 0;
1281 _PyUnicode_WSTR(unicode)[length] = 0;
1282
1283 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1284 return unicode;
1285}
1286
1287static const char*
1288unicode_kind_name(PyObject *unicode)
1289{
1290 /* don't check consistency: unicode_kind_name() is called from
1291 _PyUnicode_Dump() */
1292 if (!PyUnicode_IS_COMPACT(unicode))
1293 {
1294 if (!PyUnicode_IS_READY(unicode))
1295 return "wstr";
1296 switch (PyUnicode_KIND(unicode))
1297 {
1298 case PyUnicode_1BYTE_KIND:
1299 if (PyUnicode_IS_ASCII(unicode))
1300 return "legacy ascii";
1301 else
1302 return "legacy latin1";
1303 case PyUnicode_2BYTE_KIND:
1304 return "legacy UCS2";
1305 case PyUnicode_4BYTE_KIND:
1306 return "legacy UCS4";
1307 default:
1308 return "<legacy invalid kind>";
1309 }
1310 }
1311 assert(PyUnicode_IS_READY(unicode));
1312 switch (PyUnicode_KIND(unicode)) {
1313 case PyUnicode_1BYTE_KIND:
1314 if (PyUnicode_IS_ASCII(unicode))
1315 return "ascii";
1316 else
1317 return "latin1";
1318 case PyUnicode_2BYTE_KIND:
1319 return "UCS2";
1320 case PyUnicode_4BYTE_KIND:
1321 return "UCS4";
1322 default:
1323 return "<invalid compact kind>";
1324 }
1325}
1326
1327#ifdef Py_DEBUG
1328/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
const char *_PyUnicode_utf8(void *unicode_raw){
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1333
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1338const void *_PyUnicode_data(void *unicode_raw) {
1339 PyObject *unicode = _PyObject_CAST(unicode_raw);
1340 printf("obj %p\n", (void*)unicode);
1341 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1342 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1343 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1344 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1345 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1346 return PyUnicode_DATA(unicode);
1347}
1348
1349void
1350_PyUnicode_Dump(PyObject *op)
1351{
1352 PyASCIIObject *ascii = (PyASCIIObject *)op;
1353 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1354 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1355 const void *data;
1356
1357 if (ascii->state.compact)
1358 {
1359 if (ascii->state.ascii)
1360 data = (ascii + 1);
1361 else
1362 data = (compact + 1);
1363 }
1364 else
1365 data = unicode->data.any;
1366 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1367
1368 if (ascii->wstr == data)
1369 printf("shared ");
1370 printf("wstr=%p", (void *)ascii->wstr);
1371
1372 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1373 printf(" (%zu), ", compact->wstr_length);
1374 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1375 printf("shared ");
1376 }
1377 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1378 }
1379 printf(", data=%p\n", data);
1380}
1381#endif
1382
1383static int
1384unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1385{
1386 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1387 // optimized to always use state->empty_string without having to check if
1388 // it is NULL or not.
1389 PyObject *empty = PyUnicode_New(1, 0);
1390 if (empty == NULL) {
1391 return -1;
1392 }
1393 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1394 _PyUnicode_LENGTH(empty) = 0;
1395 assert(_PyUnicode_CheckConsistency(empty, 1));
1396
1397 assert(state->empty_string == NULL);
1398 state->empty_string = empty;
1399 return 0;
1400}
1401
1402
1403PyObject *
1404PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1405{
1406 /* Optimization for empty strings */
1407 if (size == 0) {
1408 return unicode_new_empty();
1409 }
1410
1411 PyObject *obj;
1412 PyCompactUnicodeObject *unicode;
1413 void *data;
1414 enum PyUnicode_Kind kind;
1415 int is_sharing, is_ascii;
1416 Py_ssize_t char_size;
1417 Py_ssize_t struct_size;
1418
1419 is_ascii = 0;
1420 is_sharing = 0;
1421 struct_size = sizeof(PyCompactUnicodeObject);
1422 if (maxchar < 128) {
1423 kind = PyUnicode_1BYTE_KIND;
1424 char_size = 1;
1425 is_ascii = 1;
1426 struct_size = sizeof(PyASCIIObject);
1427 }
1428 else if (maxchar < 256) {
1429 kind = PyUnicode_1BYTE_KIND;
1430 char_size = 1;
1431 }
1432 else if (maxchar < 65536) {
1433 kind = PyUnicode_2BYTE_KIND;
1434 char_size = 2;
1435 if (sizeof(wchar_t) == 2)
1436 is_sharing = 1;
1437 }
1438 else {
1439 if (maxchar > MAX_UNICODE) {
1440 PyErr_SetString(PyExc_SystemError,
1441 "invalid maximum character passed to PyUnicode_New");
1442 return NULL;
1443 }
1444 kind = PyUnicode_4BYTE_KIND;
1445 char_size = 4;
1446 if (sizeof(wchar_t) == 4)
1447 is_sharing = 1;
1448 }
1449
1450 /* Ensure we won't overflow the size. */
1451 if (size < 0) {
1452 PyErr_SetString(PyExc_SystemError,
1453 "Negative size passed to PyUnicode_New");
1454 return NULL;
1455 }
1456 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1457 return PyErr_NoMemory();
1458
1459 /* Duplicated allocation code from _PyObject_New() instead of a call to
1460 * PyObject_New() so we are able to allocate space for the object and
1461 * it's data buffer.
1462 */
1463 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1464 if (obj == NULL) {
1465 return PyErr_NoMemory();
1466 }
1467 _PyObject_Init(obj, &PyUnicode_Type);
1468
1469 unicode = (PyCompactUnicodeObject *)obj;
1470 if (is_ascii)
1471 data = ((PyASCIIObject*)obj) + 1;
1472 else
1473 data = unicode + 1;
1474 _PyUnicode_LENGTH(unicode) = size;
1475 _PyUnicode_HASH(unicode) = -1;
1476 _PyUnicode_STATE(unicode).interned = 0;
1477 _PyUnicode_STATE(unicode).kind = kind;
1478 _PyUnicode_STATE(unicode).compact = 1;
1479 _PyUnicode_STATE(unicode).ready = 1;
1480 _PyUnicode_STATE(unicode).ascii = is_ascii;
1481 if (is_ascii) {
1482 ((char*)data)[size] = 0;
1483 _PyUnicode_WSTR(unicode) = NULL;
1484 }
1485 else if (kind == PyUnicode_1BYTE_KIND) {
1486 ((char*)data)[size] = 0;
1487 _PyUnicode_WSTR(unicode) = NULL;
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 unicode->utf8 = NULL;
1490 unicode->utf8_length = 0;
1491 }
1492 else {
1493 unicode->utf8 = NULL;
1494 unicode->utf8_length = 0;
1495 if (kind == PyUnicode_2BYTE_KIND)
1496 ((Py_UCS2*)data)[size] = 0;
1497 else /* kind == PyUnicode_4BYTE_KIND */
1498 ((Py_UCS4*)data)[size] = 0;
1499 if (is_sharing) {
1500 _PyUnicode_WSTR_LENGTH(unicode) = size;
1501 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1502 }
1503 else {
1504 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1505 _PyUnicode_WSTR(unicode) = NULL;
1506 }
1507 }
1508#ifdef Py_DEBUG
1509 unicode_fill_invalid((PyObject*)unicode, 0);
1510#endif
1511 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1512 return obj;
1513}
1514
1515#if SIZEOF_WCHAR_T == 2
1516/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1517 will decode surrogate pairs, the other conversions are implemented as macros
1518 for efficiency.
1519
1520 This function assumes that unicode can hold one more code point than wstr
1521 characters for a terminating null character. */
1522static void
1523unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1524 PyObject *unicode)
1525{
1526 const wchar_t *iter;
1527 Py_UCS4 *ucs4_out;
1528
1529 assert(unicode != NULL);
1530 assert(_PyUnicode_CHECK(unicode));
1531 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1532 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1533
1534 for (iter = begin; iter < end; ) {
1535 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1536 _PyUnicode_GET_LENGTH(unicode)));
1537 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1538 && (iter+1) < end
1539 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1540 {
1541 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1542 iter += 2;
1543 }
1544 else {
1545 *ucs4_out++ = *iter;
1546 iter++;
1547 }
1548 }
1549 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1550 _PyUnicode_GET_LENGTH(unicode)));
1551
1552}
1553#endif
1554
1555static int
1556unicode_check_modifiable(PyObject *unicode)
1557{
1558 if (!unicode_modifiable(unicode)) {
1559 PyErr_SetString(PyExc_SystemError,
1560 "Cannot modify a string currently used");
1561 return -1;
1562 }
1563 return 0;
1564}
1565
1566static int
1567_copy_characters(PyObject *to, Py_ssize_t to_start,
1568 PyObject *from, Py_ssize_t from_start,
1569 Py_ssize_t how_many, int check_maxchar)
1570{
1571 unsigned int from_kind, to_kind;
1572 const void *from_data;
1573 void *to_data;
1574
1575 assert(0 <= how_many);
1576 assert(0 <= from_start);
1577 assert(0 <= to_start);
1578 assert(PyUnicode_Check(from));
1579 assert(PyUnicode_IS_READY(from));
1580 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1581
1582 assert(PyUnicode_Check(to));
1583 assert(PyUnicode_IS_READY(to));
1584 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1585
1586 if (how_many == 0)
1587 return 0;
1588
1589 from_kind = PyUnicode_KIND(from);
1590 from_data = PyUnicode_DATA(from);
1591 to_kind = PyUnicode_KIND(to);
1592 to_data = PyUnicode_DATA(to);
1593
1594#ifdef Py_DEBUG
1595 if (!check_maxchar
1596 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1597 {
1598 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1599 Py_UCS4 ch;
1600 Py_ssize_t i;
1601 for (i=0; i < how_many; i++) {
1602 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1603 assert(ch <= to_maxchar);
1604 }
1605 }
1606#endif
1607
1608 if (from_kind == to_kind) {
1609 if (check_maxchar
1610 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1611 {
1612 /* Writing Latin-1 characters into an ASCII string requires to
1613 check that all written characters are pure ASCII */
1614 Py_UCS4 max_char;
1615 max_char = ucs1lib_find_max_char(from_data,
1616 (const Py_UCS1*)from_data + how_many);
1617 if (max_char >= 128)
1618 return -1;
1619 }
1620 memcpy((char*)to_data + to_kind * to_start,
1621 (const char*)from_data + from_kind * from_start,
1622 to_kind * how_many);
1623 }
1624 else if (from_kind == PyUnicode_1BYTE_KIND
1625 && to_kind == PyUnicode_2BYTE_KIND)
1626 {
1627 _PyUnicode_CONVERT_BYTES(
1628 Py_UCS1, Py_UCS2,
1629 PyUnicode_1BYTE_DATA(from) + from_start,
1630 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1631 PyUnicode_2BYTE_DATA(to) + to_start
1632 );
1633 }
1634 else if (from_kind == PyUnicode_1BYTE_KIND
1635 && to_kind == PyUnicode_4BYTE_KIND)
1636 {
1637 _PyUnicode_CONVERT_BYTES(
1638 Py_UCS1, Py_UCS4,
1639 PyUnicode_1BYTE_DATA(from) + from_start,
1640 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1641 PyUnicode_4BYTE_DATA(to) + to_start
1642 );
1643 }
1644 else if (from_kind == PyUnicode_2BYTE_KIND
1645 && to_kind == PyUnicode_4BYTE_KIND)
1646 {
1647 _PyUnicode_CONVERT_BYTES(
1648 Py_UCS2, Py_UCS4,
1649 PyUnicode_2BYTE_DATA(from) + from_start,
1650 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1651 PyUnicode_4BYTE_DATA(to) + to_start
1652 );
1653 }
1654 else {
1655 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1656
1657 if (!check_maxchar) {
1658 if (from_kind == PyUnicode_2BYTE_KIND
1659 && to_kind == PyUnicode_1BYTE_KIND)
1660 {
1661 _PyUnicode_CONVERT_BYTES(
1662 Py_UCS2, Py_UCS1,
1663 PyUnicode_2BYTE_DATA(from) + from_start,
1664 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1665 PyUnicode_1BYTE_DATA(to) + to_start
1666 );
1667 }
1668 else if (from_kind == PyUnicode_4BYTE_KIND
1669 && to_kind == PyUnicode_1BYTE_KIND)
1670 {
1671 _PyUnicode_CONVERT_BYTES(
1672 Py_UCS4, Py_UCS1,
1673 PyUnicode_4BYTE_DATA(from) + from_start,
1674 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1675 PyUnicode_1BYTE_DATA(to) + to_start
1676 );
1677 }
1678 else if (from_kind == PyUnicode_4BYTE_KIND
1679 && to_kind == PyUnicode_2BYTE_KIND)
1680 {
1681 _PyUnicode_CONVERT_BYTES(
1682 Py_UCS4, Py_UCS2,
1683 PyUnicode_4BYTE_DATA(from) + from_start,
1684 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1685 PyUnicode_2BYTE_DATA(to) + to_start
1686 );
1687 }
1688 else {
1689 Py_UNREACHABLE();
1690 }
1691 }
1692 else {
1693 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1694 Py_UCS4 ch;
1695 Py_ssize_t i;
1696
1697 for (i=0; i < how_many; i++) {
1698 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1699 if (ch > to_maxchar)
1700 return -1;
1701 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1702 }
1703 }
1704 }
1705 return 0;
1706}
1707
/* Copy `how_many` characters from `from` (starting at `from_start`) into
   `to` (starting at `to_start`) by calling _copy_characters() with
   check_maxchar set to 0.  The result of that call is discarded, so this
   function cannot report an error. */
void
_PyUnicode_FastCopyCharacters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
{
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
}
1715
1716Py_ssize_t
1717PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1718 PyObject *from, Py_ssize_t from_start,
1719 Py_ssize_t how_many)
1720{
1721 int err;
1722
1723 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1724 PyErr_BadInternalCall();
1725 return -1;
1726 }
1727
1728 if (PyUnicode_READY(from) == -1)
1729 return -1;
1730 if (PyUnicode_READY(to) == -1)
1731 return -1;
1732
1733 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1734 PyErr_SetString(PyExc_IndexError, "string index out of range");
1735 return -1;
1736 }
1737 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1738 PyErr_SetString(PyExc_IndexError, "string index out of range");
1739 return -1;
1740 }
1741 if (how_many < 0) {
1742 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1743 return -1;
1744 }
1745 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1746 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1747 PyErr_Format(PyExc_SystemError,
1748 "Cannot write %zi characters at %zi "
1749 "in a string of %zi characters",
1750 how_many, to_start, PyUnicode_GET_LENGTH(to));
1751 return -1;
1752 }
1753
1754 if (how_many == 0)
1755 return 0;
1756
1757 if (unicode_check_modifiable(to))
1758 return -1;
1759
1760 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1761 if (err) {
1762 PyErr_Format(PyExc_SystemError,
1763 "Cannot copy %s characters "
1764 "into a string of %s characters",
1765 unicode_kind_name(from),
1766 unicode_kind_name(to));
1767 return -1;
1768 }
1769 return how_many;
1770}
1771
1772/* Find the maximum code point and count the number of surrogate pairs so a
1773 correct string length can be computed before converting a string to UCS4.
1774 This function counts single surrogates as a character and not as a pair.
1775
1776 Return 0 on success, or -1 on error. */
1777static int
1778find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1779 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1780{
1781 const wchar_t *iter;
1782 Py_UCS4 ch;
1783
1784 assert(num_surrogates != NULL && maxchar != NULL);
1785 *num_surrogates = 0;
1786 *maxchar = 0;
1787
1788 for (iter = begin; iter < end; ) {
1789#if SIZEOF_WCHAR_T == 2
1790 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1791 && (iter+1) < end
1792 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1793 {
1794 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1795 ++(*num_surrogates);
1796 iter += 2;
1797 }
1798 else
1799#endif
1800 {
1801 ch = *iter;
1802 iter++;
1803 }
1804 if (ch > *maxchar) {
1805 *maxchar = ch;
1806 if (*maxchar > MAX_UNICODE) {
1807 PyErr_Format(PyExc_ValueError,
1808 "character U+%x is not in range [U+0000; U+%x]",
1809 ch, MAX_UNICODE);
1810 return -1;
1811 }
1812 }
1813 }
1814 return 0;
1815}
1816
/* Build the canonical representation of a string that so far only has a
   wstr buffer, choosing the 1-, 2- or 4-byte kind from the largest code
   point found in that buffer.

   Depending on the kind and on sizeof(wchar_t), the wstr buffer is either
   reused as the character data or converted into a newly allocated buffer
   and then freed.

   Return 0 on success, or -1 with an exception set if the buffer contains
   an out-of-range character or memory allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: narrow into a new 1-byte buffer and drop wstr. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: reuse the wstr buffer as data. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1945
/* Deallocator for string objects.

   A mortal interned string is first removed from the `interned` dictionary
   (when INTERNED_STRINGS is defined).  Then the wstr and UTF-8 buffers that
   the HAS_*_MEMORY macros report as owned, and the data buffer of a
   non-compact string, are freed before the object memory is handed to
   tp_free. */
static void
unicode_dealloc(PyObject *unicode)
{
    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;

    case SSTATE_INTERNED_MORTAL:
    {
#ifdef INTERNED_STRINGS
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
           references (key and value) which were ignored by
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
           to prevent calling unicode_dealloc() again. Adjust refcnt after
           PyDict_DelItem(). */
        assert(Py_REFCNT(unicode) == 0);
        Py_SET_REFCNT(unicode, 3);
        if (PyDict_DelItem(interned, unicode) != 0) {
            /* A deallocator cannot propagate the error: report it as
               unraisable and carry on. */
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                      NULL);
        }
        assert(Py_REFCNT(unicode) == 1);
        Py_SET_REFCNT(unicode, 0);
#endif
        break;
    }

    case SSTATE_INTERNED_IMMORTAL:
        _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
        break;

    default:
        Py_UNREACHABLE();
    }

    /* Release the auxiliary buffers. */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
    }
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
    }
    /* Only non-compact strings keep their characters in a separate buffer. */
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1993
1994#ifdef Py_DEBUG
1995static int
1996unicode_is_singleton(PyObject *unicode)
1997{
1998 struct _Py_unicode_state *state = get_unicode_state();
1999 if (unicode == state->empty_string) {
2000 return 1;
2001 }
2002 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
2003 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
2004 {
2005 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
2006 if (ch < 256 && state->latin1[ch] == unicode) {
2007 return 1;
2008 }
2009 }
2010 return 0;
2011}
2012#endif
2013
2014static int
2015unicode_modifiable(PyObject *unicode)
2016{
2017 assert(_PyUnicode_CHECK(unicode));
2018 if (Py_REFCNT(unicode) != 1)
2019 return 0;
2020 if (_PyUnicode_HASH(unicode) != -1)
2021 return 0;
2022 if (PyUnicode_CHECK_INTERNED(unicode))
2023 return 0;
2024 if (!PyUnicode_CheckExact(unicode))
2025 return 0;
2026#ifdef Py_DEBUG
2027 /* singleton refcount is greater than 1 */
2028 assert(!unicode_is_singleton(unicode));
2029#endif
2030 return 1;
2031}
2032
2033static int
2034unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2035{
2036 PyObject *unicode;
2037 Py_ssize_t old_length;
2038
2039 assert(p_unicode != NULL);
2040 unicode = *p_unicode;
2041
2042 assert(unicode != NULL);
2043 assert(PyUnicode_Check(unicode));
2044 assert(0 <= length);
2045
2046 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2047 old_length = PyUnicode_WSTR_LENGTH(unicode);
2048 else
2049 old_length = PyUnicode_GET_LENGTH(unicode);
2050 if (old_length == length)
2051 return 0;
2052
2053 if (length == 0) {
2054 PyObject *empty = unicode_new_empty();
2055 Py_SETREF(*p_unicode, empty);
2056 return 0;
2057 }
2058
2059 if (!unicode_modifiable(unicode)) {
2060 PyObject *copy = resize_copy(unicode, length);
2061 if (copy == NULL)
2062 return -1;
2063 Py_SETREF(*p_unicode, copy);
2064 return 0;
2065 }
2066
2067 if (PyUnicode_IS_COMPACT(unicode)) {
2068 PyObject *new_unicode = resize_compact(unicode, length);
2069 if (new_unicode == NULL)
2070 return -1;
2071 *p_unicode = new_unicode;
2072 return 0;
2073 }
2074 return resize_inplace(unicode, length);
2075}
2076
2077int
2078PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2079{
2080 PyObject *unicode;
2081 if (p_unicode == NULL) {
2082 PyErr_BadInternalCall();
2083 return -1;
2084 }
2085 unicode = *p_unicode;
2086 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2087 {
2088 PyErr_BadInternalCall();
2089 return -1;
2090 }
2091 return unicode_resize(p_unicode, length);
2092}
2093
2094/* Copy an ASCII or latin1 char* string into a Python Unicode string.
2095
2096 WARNING: The function doesn't copy the terminating null character and
2097 doesn't check the maximum character (may write a latin1 character in an
2098 ASCII string). */
2099static void
2100unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2101 const char *str, Py_ssize_t len)
2102{
2103 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2104 const void *data = PyUnicode_DATA(unicode);
2105 const char *end = str + len;
2106
2107 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2108 switch (kind) {
2109 case PyUnicode_1BYTE_KIND: {
2110#ifdef Py_DEBUG
2111 if (PyUnicode_IS_ASCII(unicode)) {
2112 Py_UCS4 maxchar = ucs1lib_find_max_char(
2113 (const Py_UCS1*)str,
2114 (const Py_UCS1*)str + len);
2115 assert(maxchar < 128);
2116 }
2117#endif
2118 memcpy((char *) data + index, str, len);
2119 break;
2120 }
2121 case PyUnicode_2BYTE_KIND: {
2122 Py_UCS2 *start = (Py_UCS2 *)data + index;
2123 Py_UCS2 *ucs2 = start;
2124
2125 for (; str < end; ++ucs2, ++str)
2126 *ucs2 = (Py_UCS2)*str;
2127
2128 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2129 break;
2130 }
2131 case PyUnicode_4BYTE_KIND: {
2132 Py_UCS4 *start = (Py_UCS4 *)data + index;
2133 Py_UCS4 *ucs4 = start;
2134
2135 for (; str < end; ++ucs4, ++str)
2136 *ucs4 = (Py_UCS4)*str;
2137
2138 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2139 break;
2140 }
2141 default:
2142 Py_UNREACHABLE();
2143 }
2144}
2145
2146static PyObject*
2147get_latin1_char(Py_UCS1 ch)
2148{
2149 struct _Py_unicode_state *state = get_unicode_state();
2150
2151 PyObject *unicode = state->latin1[ch];
2152 if (unicode) {
2153 Py_INCREF(unicode);
2154 return unicode;
2155 }
2156
2157 unicode = PyUnicode_New(1, ch);
2158 if (!unicode) {
2159 return NULL;
2160 }
2161
2162 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2163 assert(_PyUnicode_CheckConsistency(unicode, 1));
2164
2165 Py_INCREF(unicode);
2166 state->latin1[ch] = unicode;
2167 return unicode;
2168}
2169
2170static PyObject*
2171unicode_char(Py_UCS4 ch)
2172{
2173 PyObject *unicode;
2174
2175 assert(ch <= MAX_UNICODE);
2176
2177 if (ch < 256) {
2178 return get_latin1_char(ch);
2179 }
2180
2181 unicode = PyUnicode_New(1, ch);
2182 if (unicode == NULL)
2183 return NULL;
2184
2185 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2186 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2187 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2188 } else {
2189 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2190 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2191 }
2192 assert(_PyUnicode_CheckConsistency(unicode, 1));
2193 return unicode;
2194}
2195
2196PyObject *
2197PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2198{
2199 if (u == NULL) {
2200 if (size > 0) {
2201 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2202 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2203 "use PyUnicode_New() instead", 1) < 0) {
2204 return NULL;
2205 }
2206 }
2207 return (PyObject*)_PyUnicode_New(size);
2208 }
2209
2210 if (size < 0) {
2211 PyErr_BadInternalCall();
2212 return NULL;
2213 }
2214
2215 return PyUnicode_FromWideChar(u, size);
2216}
2217
2218PyObject *
2219PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2220{
2221 PyObject *unicode;
2222 Py_UCS4 maxchar = 0;
2223 Py_ssize_t num_surrogates;
2224
2225 if (u == NULL && size != 0) {
2226 PyErr_BadInternalCall();
2227 return NULL;
2228 }
2229
2230 if (size == -1) {
2231 size = wcslen(u);
2232 }
2233
2234 /* If the Unicode data is known at construction time, we can apply
2235 some optimizations which share commonly used objects. */
2236
2237 /* Optimization for empty strings */
2238 if (size == 0)
2239 _Py_RETURN_UNICODE_EMPTY();
2240
2241#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2242 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2243 non-Unicode locales and hence needs conversion to UCS-4 first. */
2244 if (_Py_LocaleUsesNonUnicodeWchar()) {
2245 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2246 if (!converted) {
2247 return NULL;
2248 }
2249 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2250 PyMem_Free(converted);
2251 return unicode;
2252 }
2253#endif
2254
2255 /* Single character Unicode objects in the Latin-1 range are
2256 shared when using this constructor */
2257 if (size == 1 && (Py_UCS4)*u < 256)
2258 return get_latin1_char((unsigned char)*u);
2259
2260 /* If not empty and not single character, copy the Unicode data
2261 into the new object */
2262 if (find_maxchar_surrogates(u, u + size,
2263 &maxchar, &num_surrogates) == -1)
2264 return NULL;
2265
2266 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2267 if (!unicode)
2268 return NULL;
2269
2270 switch (PyUnicode_KIND(unicode)) {
2271 case PyUnicode_1BYTE_KIND:
2272 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2273 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2274 break;
2275 case PyUnicode_2BYTE_KIND:
2276#if Py_UNICODE_SIZE == 2
2277 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2278#else
2279 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2280 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2281#endif
2282 break;
2283 case PyUnicode_4BYTE_KIND:
2284#if SIZEOF_WCHAR_T == 2
2285 /* This is the only case which has to process surrogates, thus
2286 a simple copy loop is not enough and we need a function. */
2287 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2288#else
2289 assert(num_surrogates == 0);
2290 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2291#endif
2292 break;
2293 default:
2294 Py_UNREACHABLE();
2295 }
2296
2297 return unicode_result(unicode);
2298}
2299
2300PyObject *
2301PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2302{
2303 if (size < 0) {
2304 PyErr_SetString(PyExc_SystemError,
2305 "Negative size passed to PyUnicode_FromStringAndSize");
2306 return NULL;
2307 }
2308 if (u != NULL) {
2309 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2310 }
2311 else {
2312 if (size > 0) {
2313 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2314 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2315 "use PyUnicode_New() instead", 1) < 0) {
2316 return NULL;
2317 }
2318 }
2319 return (PyObject *)_PyUnicode_New(size);
2320 }
2321}
2322
2323PyObject *
2324PyUnicode_FromString(const char *u)
2325{
2326 size_t size = strlen(u);
2327 if (size > PY_SSIZE_T_MAX) {
2328 PyErr_SetString(PyExc_OverflowError, "input too long");
2329 return NULL;
2330 }
2331 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2332}
2333
2334
2335PyObject *
2336_PyUnicode_FromId(_Py_Identifier *id)
2337{
2338 PyInterpreterState *interp = _PyInterpreterState_GET();
2339 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2340
2341 Py_ssize_t index = _Py_atomic_size_get(&id->index);
2342 if (index < 0) {
2343 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2344
2345 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2346 // Check again to detect concurrent access. Another thread can have
2347 // initialized the index while this thread waited for the lock.
2348 index = _Py_atomic_size_get(&id->index);
2349 if (index < 0) {
2350 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2351 index = rt_ids->next_index;
2352 rt_ids->next_index++;
2353 _Py_atomic_size_set(&id->index, index);
2354 }
2355 PyThread_release_lock(rt_ids->lock);
2356 }
2357 assert(index >= 0);
2358
2359 PyObject *obj;
2360 if (index < ids->size) {
2361 obj = ids->array[index];
2362 if (obj) {
2363 // Return a borrowed reference
2364 return obj;
2365 }
2366 }
2367
2368 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2369 NULL, NULL);
2370 if (!obj) {
2371 return NULL;
2372 }
2373 PyUnicode_InternInPlace(&obj);
2374
2375 if (index >= ids->size) {
2376 // Overallocate to reduce the number of realloc
2377 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2378 Py_ssize_t item_size = sizeof(ids->array[0]);
2379 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2380 if (new_array == NULL) {
2381 PyErr_NoMemory();
2382 return NULL;
2383 }
2384 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2385 ids->array = new_array;
2386 ids->size = new_size;
2387 }
2388
2389 // The array stores a strong reference
2390 ids->array[index] = obj;
2391
2392 // Return a borrowed reference
2393 return obj;
2394}
2395
2396
2397static void
2398unicode_clear_identifiers(struct _Py_unicode_state *state)
2399{
2400 struct _Py_unicode_ids *ids = &state->ids;
2401 for (Py_ssize_t i=0; i < ids->size; i++) {
2402 Py_XDECREF(ids->array[i]);
2403 }
2404 ids->size = 0;
2405 PyMem_Free(ids->array);
2406 ids->array = NULL;
2407 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2408 // after Py_Finalize().
2409}
2410
2411
2412/* Internal function, doesn't check maximum character */
2413
2414PyObject*
2415_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2416{
2417 const unsigned char *s = (const unsigned char *)buffer;
2418 PyObject *unicode;
2419 if (size == 1) {
2420#ifdef Py_DEBUG
2421 assert((unsigned char)s[0] < 128);
2422#endif
2423 return get_latin1_char(s[0]);
2424 }
2425 unicode = PyUnicode_New(size, 127);
2426 if (!unicode)
2427 return NULL;
2428 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2429 assert(_PyUnicode_CheckConsistency(unicode, 1));
2430 return unicode;
2431}
2432
2433static Py_UCS4
2434kind_maxchar_limit(unsigned int kind)
2435{
2436 switch (kind) {
2437 case PyUnicode_1BYTE_KIND:
2438 return 0x80;
2439 case PyUnicode_2BYTE_KIND:
2440 return 0x100;
2441 case PyUnicode_4BYTE_KIND:
2442 return 0x10000;
2443 default:
2444 Py_UNREACHABLE();
2445 }
2446}
2447
2448static PyObject*
2449_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2450{
2451 PyObject *res;
2452 unsigned char max_char;
2453
2454 if (size == 0) {
2455 _Py_RETURN_UNICODE_EMPTY();
2456 }
2457 assert(size > 0);
2458 if (size == 1) {
2459 return get_latin1_char(u[0]);
2460 }
2461
2462 max_char = ucs1lib_find_max_char(u, u + size);
2463 res = PyUnicode_New(size, max_char);
2464 if (!res)
2465 return NULL;
2466 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2467 assert(_PyUnicode_CheckConsistency(res, 1));
2468 return res;
2469}
2470
2471static PyObject*
2472_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2473{
2474 PyObject *res;
2475 Py_UCS2 max_char;
2476
2477 if (size == 0)
2478 _Py_RETURN_UNICODE_EMPTY();
2479 assert(size > 0);
2480 if (size == 1)
2481 return unicode_char(u[0]);
2482
2483 max_char = ucs2lib_find_max_char(u, u + size);
2484 res = PyUnicode_New(size, max_char);
2485 if (!res)
2486 return NULL;
2487 if (max_char >= 256)
2488 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2489 else {
2490 _PyUnicode_CONVERT_BYTES(
2491 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2492 }
2493 assert(_PyUnicode_CheckConsistency(res, 1));
2494 return res;
2495}
2496
2497static PyObject*
2498_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2499{
2500 PyObject *res;
2501 Py_UCS4 max_char;
2502
2503 if (size == 0)
2504 _Py_RETURN_UNICODE_EMPTY();
2505 assert(size > 0);
2506 if (size == 1)
2507 return unicode_char(u[0]);
2508
2509 max_char = ucs4lib_find_max_char(u, u + size);
2510 res = PyUnicode_New(size, max_char);
2511 if (!res)
2512 return NULL;
2513 if (max_char < 256)
2514 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2515 PyUnicode_1BYTE_DATA(res));
2516 else if (max_char < 0x10000)
2517 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2518 PyUnicode_2BYTE_DATA(res));
2519 else
2520 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2521 assert(_PyUnicode_CheckConsistency(res, 1));
2522 return res;
2523}
2524
2525PyObject*
2526PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2527{
2528 if (size < 0) {
2529 PyErr_SetString(PyExc_ValueError, "size must be positive");
2530 return NULL;
2531 }
2532 switch (kind) {
2533 case PyUnicode_1BYTE_KIND:
2534 return _PyUnicode_FromUCS1(buffer, size);
2535 case PyUnicode_2BYTE_KIND:
2536 return _PyUnicode_FromUCS2(buffer, size);
2537 case PyUnicode_4BYTE_KIND:
2538 return _PyUnicode_FromUCS4(buffer, size);
2539 default:
2540 PyErr_SetString(PyExc_SystemError, "invalid kind");
2541 return NULL;
2542 }
2543}
2544
2545Py_UCS4
2546_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2547{
2548 enum PyUnicode_Kind kind;
2549 const void *startptr, *endptr;
2550
2551 assert(PyUnicode_IS_READY(unicode));
2552 assert(0 <= start);
2553 assert(end <= PyUnicode_GET_LENGTH(unicode));
2554 assert(start <= end);
2555
2556 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2557 return PyUnicode_MAX_CHAR_VALUE(unicode);
2558
2559 if (start == end)
2560 return 127;
2561
2562 if (PyUnicode_IS_ASCII(unicode))
2563 return 127;
2564
2565 kind = PyUnicode_KIND(unicode);
2566 startptr = PyUnicode_DATA(unicode);
2567 endptr = (char *)startptr + end * kind;
2568 startptr = (char *)startptr + start * kind;
2569 switch(kind) {
2570 case PyUnicode_1BYTE_KIND:
2571 return ucs1lib_find_max_char(startptr, endptr);
2572 case PyUnicode_2BYTE_KIND:
2573 return ucs2lib_find_max_char(startptr, endptr);
2574 case PyUnicode_4BYTE_KIND:
2575 return ucs4lib_find_max_char(startptr, endptr);
2576 default:
2577 Py_UNREACHABLE();
2578 }
2579}
2580
2581/* Ensure that a string uses the most efficient storage, if it is not the
2582 case: create a new string with of the right kind. Write NULL into *p_unicode
2583 on error. */
2584static void
2585unicode_adjust_maxchar(PyObject **p_unicode)
2586{
2587 PyObject *unicode, *copy;
2588 Py_UCS4 max_char;
2589 Py_ssize_t len;
2590 unsigned int kind;
2591
2592 assert(p_unicode != NULL);
2593 unicode = *p_unicode;
2594 assert(PyUnicode_IS_READY(unicode));
2595 if (PyUnicode_IS_ASCII(unicode))
2596 return;
2597
2598 len = PyUnicode_GET_LENGTH(unicode);
2599 kind = PyUnicode_KIND(unicode);
2600 if (kind == PyUnicode_1BYTE_KIND) {
2601 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2602 max_char = ucs1lib_find_max_char(u, u + len);
2603 if (max_char >= 128)
2604 return;
2605 }
2606 else if (kind == PyUnicode_2BYTE_KIND) {
2607 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2608 max_char = ucs2lib_find_max_char(u, u + len);
2609 if (max_char >= 256)
2610 return;
2611 }
2612 else if (kind == PyUnicode_4BYTE_KIND) {
2613 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2614 max_char = ucs4lib_find_max_char(u, u + len);
2615 if (max_char >= 0x10000)
2616 return;
2617 }
2618 else
2619 Py_UNREACHABLE();
2620
2621 copy = PyUnicode_New(len, max_char);
2622 if (copy != NULL)
2623 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2624 Py_DECREF(unicode);
2625 *p_unicode = copy;
2626}
2627
2628PyObject*
2629_PyUnicode_Copy(PyObject *unicode)
2630{
2631 Py_ssize_t length;
2632 PyObject *copy;
2633
2634 if (!PyUnicode_Check(unicode)) {
2635 PyErr_BadInternalCall();
2636 return NULL;
2637 }
2638 if (PyUnicode_READY(unicode) == -1)
2639 return NULL;
2640
2641 length = PyUnicode_GET_LENGTH(unicode);
2642 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2643 if (!copy)
2644 return NULL;
2645 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2646
2647 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2648 length * PyUnicode_KIND(unicode));
2649 assert(_PyUnicode_CheckConsistency(copy, 1));
2650 return copy;
2651}
2652
2653
2654/* Widen Unicode objects to larger buffers. Don't write terminating null
2655 character. Return NULL on error. */
2656
2657static void*
2658unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2659{
2660 void *result;
2661
2662 assert(skind < kind);
2663 switch (kind) {
2664 case PyUnicode_2BYTE_KIND:
2665 result = PyMem_New(Py_UCS2, len);
2666 if (!result)
2667 return PyErr_NoMemory();
2668 assert(skind == PyUnicode_1BYTE_KIND);
2669 _PyUnicode_CONVERT_BYTES(
2670 Py_UCS1, Py_UCS2,
2671 (const Py_UCS1 *)data,
2672 ((const Py_UCS1 *)data) + len,
2673 result);
2674 return result;
2675 case PyUnicode_4BYTE_KIND:
2676 result = PyMem_New(Py_UCS4, len);
2677 if (!result)
2678 return PyErr_NoMemory();
2679 if (skind == PyUnicode_2BYTE_KIND) {
2680 _PyUnicode_CONVERT_BYTES(
2681 Py_UCS2, Py_UCS4,
2682 (const Py_UCS2 *)data,
2683 ((const Py_UCS2 *)data) + len,
2684 result);
2685 }
2686 else {
2687 assert(skind == PyUnicode_1BYTE_KIND);
2688 _PyUnicode_CONVERT_BYTES(
2689 Py_UCS1, Py_UCS4,
2690 (const Py_UCS1 *)data,
2691 ((const Py_UCS1 *)data) + len,
2692 result);
2693 }
2694 return result;
2695 default:
2696 Py_UNREACHABLE();
2697 return NULL;
2698 }
2699}
2700
2701static Py_UCS4*
2702as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2703 int copy_null)
2704{
2705 int kind;
2706 const void *data;
2707 Py_ssize_t len, targetlen;
2708 if (PyUnicode_READY(string) == -1)
2709 return NULL;
2710 kind = PyUnicode_KIND(string);
2711 data = PyUnicode_DATA(string);
2712 len = PyUnicode_GET_LENGTH(string);
2713 targetlen = len;
2714 if (copy_null)
2715 targetlen++;
2716 if (!target) {
2717 target = PyMem_New(Py_UCS4, targetlen);
2718 if (!target) {
2719 PyErr_NoMemory();
2720 return NULL;
2721 }
2722 }
2723 else {
2724 if (targetsize < targetlen) {
2725 PyErr_Format(PyExc_SystemError,
2726 "string is longer than the buffer");
2727 if (copy_null && 0 < targetsize)
2728 target[0] = 0;
2729 return NULL;
2730 }
2731 }
2732 if (kind == PyUnicode_1BYTE_KIND) {
2733 const Py_UCS1 *start = (const Py_UCS1 *) data;
2734 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2735 }
2736 else if (kind == PyUnicode_2BYTE_KIND) {
2737 const Py_UCS2 *start = (const Py_UCS2 *) data;
2738 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2739 }
2740 else if (kind == PyUnicode_4BYTE_KIND) {
2741 memcpy(target, data, len * sizeof(Py_UCS4));
2742 }
2743 else {
2744 Py_UNREACHABLE();
2745 }
2746 if (copy_null)
2747 target[len] = 0;
2748 return target;
2749}
2750
2751Py_UCS4*
2752PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2753 int copy_null)
2754{
2755 if (target == NULL || targetsize < 0) {
2756 PyErr_BadInternalCall();
2757 return NULL;
2758 }
2759 return as_ucs4(string, target, targetsize, copy_null);
2760}
2761
2762Py_UCS4*
2763PyUnicode_AsUCS4Copy(PyObject *string)
2764{
2765 return as_ucs4(string, NULL, 0, 1);
2766}
2767
/* Size of the char buffers used for the output of %lld or %p.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256).  The constant 2 covers
   the sign plus one more character (presumably the terminating null
   written by sprintf() -- confirm). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
2772
2773static int
2774unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2775 Py_ssize_t width, Py_ssize_t precision)
2776{
2777 Py_ssize_t length, fill, arglen;
2778 Py_UCS4 maxchar;
2779
2780 if (PyUnicode_READY(str) == -1)
2781 return -1;
2782
2783 length = PyUnicode_GET_LENGTH(str);
2784 if ((precision == -1 || precision >= length)
2785 && width <= length)
2786 return _PyUnicodeWriter_WriteStr(writer, str);
2787
2788 if (precision != -1)
2789 length = Py_MIN(precision, length);
2790
2791 arglen = Py_MAX(length, width);
2792 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2793 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2794 else
2795 maxchar = writer->maxchar;
2796
2797 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2798 return -1;
2799
2800 if (width > length) {
2801 fill = width - length;
2802 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2803 return -1;
2804 writer->pos += fill;
2805 }
2806
2807 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2808 str, 0, length);
2809 writer->pos += length;
2810 return 0;
2811}
2812
2813static int
2814unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2815 Py_ssize_t width, Py_ssize_t precision)
2816{
2817 /* UTF-8 */
2818 Py_ssize_t length;
2819 PyObject *unicode;
2820 int res;
2821
2822 if (precision == -1) {
2823 length = strlen(str);
2824 }
2825 else {
2826 length = 0;
2827 while (length < precision && str[length]) {
2828 length++;
2829 }
2830 }
2831 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2832 if (unicode == NULL)
2833 return -1;
2834
2835 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2836 Py_DECREF(unicode);
2837 return res;
2838}
2839
2840static const char*
2841unicode_fromformat_arg(_PyUnicodeWriter *writer,
2842 const char *f, va_list *vargs)
2843{
2844 const char *p;
2845 Py_ssize_t len;
2846 int zeropad;
2847 Py_ssize_t width;
2848 Py_ssize_t precision;
2849 int longflag;
2850 int longlongflag;
2851 int size_tflag;
2852 Py_ssize_t fill;
2853
2854 p = f;
2855 f++;
2856 zeropad = 0;
2857 if (*f == '0') {
2858 zeropad = 1;
2859 f++;
2860 }
2861
2862 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2863 width = -1;
2864 if (Py_ISDIGIT((unsigned)*f)) {
2865 width = *f - '0';
2866 f++;
2867 while (Py_ISDIGIT((unsigned)*f)) {
2868 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2869 PyErr_SetString(PyExc_ValueError,
2870 "width too big");
2871 return NULL;
2872 }
2873 width = (width * 10) + (*f - '0');
2874 f++;
2875 }
2876 }
2877 precision = -1;
2878 if (*f == '.') {
2879 f++;
2880 if (Py_ISDIGIT((unsigned)*f)) {
2881 precision = (*f - '0');
2882 f++;
2883 while (Py_ISDIGIT((unsigned)*f)) {
2884 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2885 PyErr_SetString(PyExc_ValueError,
2886 "precision too big");
2887 return NULL;
2888 }
2889 precision = (precision * 10) + (*f - '0');
2890 f++;
2891 }
2892 }
2893 if (*f == '%') {
2894 /* "%.3%s" => f points to "3" */
2895 f--;
2896 }
2897 }
2898 if (*f == '\0') {
2899 /* bogus format "%.123" => go backward, f points to "3" */
2900 f--;
2901 }
2902
2903 /* Handle %ld, %lu, %lld and %llu. */
2904 longflag = 0;
2905 longlongflag = 0;
2906 size_tflag = 0;
2907 if (*f == 'l') {
2908 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2909 longflag = 1;
2910 ++f;
2911 }
2912 else if (f[1] == 'l' &&
2913 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2914 longlongflag = 1;
2915 f += 2;
2916 }
2917 }
2918 /* handle the size_t flag. */
2919 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2920 size_tflag = 1;
2921 ++f;
2922 }
2923
2924 if (f[1] == '\0')
2925 writer->overallocate = 0;
2926
2927 switch (*f) {
2928 case 'c':
2929 {
2930 int ordinal = va_arg(*vargs, int);
2931 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2932 PyErr_SetString(PyExc_OverflowError,
2933 "character argument not in range(0x110000)");
2934 return NULL;
2935 }
2936 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2937 return NULL;
2938 break;
2939 }
2940
2941 case 'i':
2942 case 'd':
2943 case 'u':
2944 case 'x':
2945 {
2946 /* used by sprintf */
2947 char buffer[MAX_LONG_LONG_CHARS];
2948 Py_ssize_t arglen;
2949
2950 if (*f == 'u') {
2951 if (longflag) {
2952 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2953 }
2954 else if (longlongflag) {
2955 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2956 }
2957 else if (size_tflag) {
2958 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2959 }
2960 else {
2961 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2962 }
2963 }
2964 else if (*f == 'x') {
2965 len = sprintf(buffer, "%x", va_arg(*vargs, int));
2966 }
2967 else {
2968 if (longflag) {
2969 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2970 }
2971 else if (longlongflag) {
2972 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2973 }
2974 else if (size_tflag) {
2975 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2976 }
2977 else {
2978 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2979 }
2980 }
2981 assert(len >= 0);
2982
2983 if (precision < len)
2984 precision = len;
2985
2986 arglen = Py_MAX(precision, width);
2987 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2988 return NULL;
2989
2990 if (width > precision) {
2991 Py_UCS4 fillchar;
2992 fill = width - precision;
2993 fillchar = zeropad?'0':' ';
2994 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2995 return NULL;
2996 writer->pos += fill;
2997 }
2998 if (precision > len) {
2999 fill = precision - len;
3000 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
3001 return NULL;
3002 writer->pos += fill;
3003 }
3004
3005 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
3006 return NULL;
3007 break;
3008 }
3009
3010 case 'p':
3011 {
3012 char number[MAX_LONG_LONG_CHARS];
3013
3014 len = sprintf(number, "%p", va_arg(*vargs, void*));
3015 assert(len >= 0);
3016
3017 /* %p is ill-defined: ensure leading 0x. */
3018 if (number[1] == 'X')
3019 number[1] = 'x';
3020 else if (number[1] != 'x') {
3021 memmove(number + 2, number,
3022 strlen(number) + 1);
3023 number[0] = '0';
3024 number[1] = 'x';
3025 len += 2;
3026 }
3027
3028 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
3029 return NULL;
3030 break;
3031 }
3032
3033 case 's':
3034 {
3035 /* UTF-8 */
3036 const char *s = va_arg(*vargs, const char*);
3037 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
3038 return NULL;
3039 break;
3040 }
3041
3042 case 'U':
3043 {
3044 PyObject *obj = va_arg(*vargs, PyObject *);
3045 assert(obj && _PyUnicode_CHECK(obj));
3046
3047 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3048 return NULL;
3049 break;
3050 }
3051
3052 case 'V':
3053 {
3054 PyObject *obj = va_arg(*vargs, PyObject *);
3055 const char *str = va_arg(*vargs, const char *);
3056 if (obj) {
3057 assert(_PyUnicode_CHECK(obj));
3058 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3059 return NULL;
3060 }
3061 else {
3062 assert(str != NULL);
3063 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
3064 return NULL;
3065 }
3066 break;
3067 }
3068
3069 case 'S':
3070 {
3071 PyObject *obj = va_arg(*vargs, PyObject *);
3072 PyObject *str;
3073 assert(obj);
3074 str = PyObject_Str(obj);
3075 if (!str)
3076 return NULL;
3077 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
3078 Py_DECREF(str);
3079 return NULL;
3080 }
3081 Py_DECREF(str);
3082 break;
3083 }
3084
3085 case 'R':
3086 {
3087 PyObject *obj = va_arg(*vargs, PyObject *);
3088 PyObject *repr;
3089 assert(obj);
3090 repr = PyObject_Repr(obj);
3091 if (!repr)
3092 return NULL;
3093 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
3094 Py_DECREF(repr);
3095 return NULL;
3096 }
3097 Py_DECREF(repr);
3098 break;
3099 }
3100
3101 case 'A':
3102 {
3103 PyObject *obj = va_arg(*vargs, PyObject *);
3104 PyObject *ascii;
3105 assert(obj);
3106 ascii = PyObject_ASCII(obj);
3107 if (!ascii)
3108 return NULL;
3109 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3110 Py_DECREF(ascii);
3111 return NULL;
3112 }
3113 Py_DECREF(ascii);
3114 break;
3115 }
3116
3117 case '%':
3118 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3119 return NULL;
3120 break;
3121
3122 default:
3123 /* if we stumble upon an unknown formatting code, copy the rest
3124 of the format string to the output string. (we cannot just
3125 skip the code, since there's no way to know what's in the
3126 argument list) */
3127 len = strlen(p);
3128 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3129 return NULL;
3130 f = p+len;
3131 return f;
3132 }
3133
3134 f++;
3135 return f;
3136}
3137
3138PyObject *
3139PyUnicode_FromFormatV(const char *format, va_list vargs)
3140{
3141 va_list vargs2;
3142 const char *f;
3143 _PyUnicodeWriter writer;
3144
3145 _PyUnicodeWriter_Init(&writer);
3146 writer.min_length = strlen(format) + 100;
3147 writer.overallocate = 1;
3148
3149 // Copy varags to be able to pass a reference to a subfunction.
3150 va_copy(vargs2, vargs);
3151
3152 for (f = format; *f; ) {
3153 if (*f == '%') {
3154 f = unicode_fromformat_arg(&writer, f, &vargs2);
3155 if (f == NULL)
3156 goto fail;
3157 }
3158 else {
3159 const char *p;
3160 Py_ssize_t len;
3161
3162 p = f;
3163 do
3164 {
3165 if ((unsigned char)*p > 127) {
3166 PyErr_Format(PyExc_ValueError,
3167 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3168 "string, got a non-ASCII byte: 0x%02x",
3169 (unsigned char)*p);
3170 goto fail;
3171 }
3172 p++;
3173 }
3174 while (*p != '\0' && *p != '%');
3175 len = p - f;
3176
3177 if (*p == '\0')
3178 writer.overallocate = 0;
3179
3180 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3181 goto fail;
3182
3183 f = p;
3184 }
3185 }
3186 va_end(vargs2);
3187 return _PyUnicodeWriter_Finish(&writer);
3188
3189 fail:
3190 va_end(vargs2);
3191 _PyUnicodeWriter_Dealloc(&writer);
3192 return NULL;
3193}
3194
3195PyObject *
3196PyUnicode_FromFormat(const char *format, ...)
3197{
3198 PyObject* ret;
3199 va_list vargs;
3200
3201#ifdef HAVE_STDARG_PROTOTYPES
3202 va_start(vargs, format);
3203#else
3204 va_start(vargs);
3205#endif
3206 ret = PyUnicode_FromFormatV(format, vargs);
3207 va_end(vargs);
3208 return ret;
3209}
3210
3211static Py_ssize_t
3212unicode_get_widechar_size(PyObject *unicode)
3213{
3214 Py_ssize_t res;
3215
3216 assert(unicode != NULL);
3217 assert(_PyUnicode_CHECK(unicode));
3218
3219#if USE_UNICODE_WCHAR_CACHE
3220 if (_PyUnicode_WSTR(unicode) != NULL) {
3221 return PyUnicode_WSTR_LENGTH(unicode);
3222 }
3223#endif /* USE_UNICODE_WCHAR_CACHE */
3224 assert(PyUnicode_IS_READY(unicode));
3225
3226 res = _PyUnicode_LENGTH(unicode);
3227#if SIZEOF_WCHAR_T == 2
3228 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3229 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3230 const Py_UCS4 *end = s + res;
3231 for (; s < end; ++s) {
3232 if (*s > 0xFFFF) {
3233 ++res;
3234 }
3235 }
3236 }
3237#endif
3238 return res;
3239}
3240
3241static void
3242unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3243{
3244 assert(unicode != NULL);
3245 assert(_PyUnicode_CHECK(unicode));
3246
3247#if USE_UNICODE_WCHAR_CACHE
3248 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3249 if (wstr != NULL) {
3250 memcpy(w, wstr, size * sizeof(wchar_t));
3251 return;
3252 }
3253#else /* USE_UNICODE_WCHAR_CACHE */
3254 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3255 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3256 return;
3257 }
3258#endif /* USE_UNICODE_WCHAR_CACHE */
3259 assert(PyUnicode_IS_READY(unicode));
3260
3261 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3262 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3263 for (; size--; ++s, ++w) {
3264 *w = *s;
3265 }
3266 }
3267 else {
3268#if SIZEOF_WCHAR_T == 4
3269 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3270 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3271 for (; size--; ++s, ++w) {
3272 *w = *s;
3273 }
3274#else
3275 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3276 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3277 for (; size--; ++s, ++w) {
3278 Py_UCS4 ch = *s;
3279 if (ch > 0xFFFF) {
3280 assert(ch <= MAX_UNICODE);
3281 /* encode surrogate pair in this case */
3282 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3283 if (!size--)
3284 break;
3285 *w = Py_UNICODE_LOW_SURROGATE(ch);
3286 }
3287 else {
3288 *w = ch;
3289 }
3290 }
3291#endif
3292 }
3293}
3294
3295#ifdef HAVE_WCHAR_H
3296
3297/* Convert a Unicode object to a wide character string.
3298
3299 - If w is NULL: return the number of wide characters (including the null
3300 character) required to convert the unicode object. Ignore size argument.
3301
3302 - Otherwise: return the number of wide characters (excluding the null
3303 character) written into w. Write at most size wide characters (including
3304 the null character). */
3305Py_ssize_t
3306PyUnicode_AsWideChar(PyObject *unicode,
3307 wchar_t *w,
3308 Py_ssize_t size)
3309{
3310 Py_ssize_t res;
3311
3312 if (unicode == NULL) {
3313 PyErr_BadInternalCall();
3314 return -1;
3315 }
3316 if (!PyUnicode_Check(unicode)) {
3317 PyErr_BadArgument();
3318 return -1;
3319 }
3320
3321 res = unicode_get_widechar_size(unicode);
3322 if (w == NULL) {
3323 return res + 1;
3324 }
3325
3326 if (size > res) {
3327 size = res + 1;
3328 }
3329 else {
3330 res = size;
3331 }
3332 unicode_copy_as_widechar(unicode, w, size);
3333
3334#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3335 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3336 non-Unicode locales and hence needs conversion first. */
3337 if (_Py_LocaleUsesNonUnicodeWchar()) {
3338 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3339 return -1;
3340 }
3341 }
3342#endif
3343
3344 return res;
3345}
3346
3347wchar_t*
3348PyUnicode_AsWideCharString(PyObject *unicode,
3349 Py_ssize_t *size)
3350{
3351 wchar_t *buffer;
3352 Py_ssize_t buflen;
3353
3354 if (unicode == NULL) {
3355 PyErr_BadInternalCall();
3356 return NULL;
3357 }
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362
3363 buflen = unicode_get_widechar_size(unicode);
3364 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3365 if (buffer == NULL) {
3366 PyErr_NoMemory();
3367 return NULL;
3368 }
3369 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3370
3371#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3372 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3373 non-Unicode locales and hence needs conversion first. */
3374 if (_Py_LocaleUsesNonUnicodeWchar()) {
3375 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3376 return NULL;
3377 }
3378 }
3379#endif
3380
3381 if (size != NULL) {
3382 *size = buflen;
3383 }
3384 else if (wcslen(buffer) != (size_t)buflen) {
3385 PyMem_Free(buffer);
3386 PyErr_SetString(PyExc_ValueError,
3387 "embedded null character");
3388 return NULL;
3389 }
3390 return buffer;
3391}
3392
3393#endif /* HAVE_WCHAR_H */
3394
3395int
3396_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3397{
3398 wchar_t **p = (wchar_t **)ptr;
3399 if (obj == NULL) {
3400#if !USE_UNICODE_WCHAR_CACHE
3401 PyMem_Free(*p);
3402#endif /* USE_UNICODE_WCHAR_CACHE */
3403 *p = NULL;
3404 return 1;
3405 }
3406 if (PyUnicode_Check(obj)) {
3407#if USE_UNICODE_WCHAR_CACHE
3408 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3409 if (*p == NULL) {
3410 return 0;
3411 }
3412 return 1;
3413#else /* USE_UNICODE_WCHAR_CACHE */
3414 *p = PyUnicode_AsWideCharString(obj, NULL);
3415 if (*p == NULL) {
3416 return 0;
3417 }
3418 return Py_CLEANUP_SUPPORTED;
3419#endif /* USE_UNICODE_WCHAR_CACHE */
3420 }
3421 PyErr_Format(PyExc_TypeError,
3422 "argument must be str, not %.50s",
3423 Py_TYPE(obj)->tp_name);
3424 return 0;
3425}
3426
3427int
3428_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3429{
3430 wchar_t **p = (wchar_t **)ptr;
3431 if (obj == NULL) {
3432#if !USE_UNICODE_WCHAR_CACHE
3433 PyMem_Free(*p);
3434#endif /* USE_UNICODE_WCHAR_CACHE */
3435 *p = NULL;
3436 return 1;
3437 }
3438 if (obj == Py_None) {
3439 *p = NULL;
3440 return 1;
3441 }
3442 if (PyUnicode_Check(obj)) {
3443#if USE_UNICODE_WCHAR_CACHE
3444 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3445 if (*p == NULL) {
3446 return 0;
3447 }
3448 return 1;
3449#else /* USE_UNICODE_WCHAR_CACHE */
3450 *p = PyUnicode_AsWideCharString(obj, NULL);
3451 if (*p == NULL) {
3452 return 0;
3453 }
3454 return Py_CLEANUP_SUPPORTED;
3455#endif /* USE_UNICODE_WCHAR_CACHE */
3456 }
3457 PyErr_Format(PyExc_TypeError,
3458 "argument must be str or None, not %.50s",
3459 Py_TYPE(obj)->tp_name);
3460 return 0;
3461}
3462
3463PyObject *
3464PyUnicode_FromOrdinal(int ordinal)
3465{
3466 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3467 PyErr_SetString(PyExc_ValueError,
3468 "chr() arg not in range(0x110000)");
3469 return NULL;
3470 }
3471
3472 return unicode_char((Py_UCS4)ordinal);
3473}
3474
3475PyObject *
3476PyUnicode_FromObject(PyObject *obj)
3477{
3478 /* XXX Perhaps we should make this API an alias of
3479 PyObject_Str() instead ?! */
3480 if (PyUnicode_CheckExact(obj)) {
3481 if (PyUnicode_READY(obj) == -1)
3482 return NULL;
3483 Py_INCREF(obj);
3484 return obj;
3485 }
3486 if (PyUnicode_Check(obj)) {
3487 /* For a Unicode subtype that's not a Unicode object,
3488 return a true Unicode object with the same data. */
3489 return _PyUnicode_Copy(obj);
3490 }
3491 PyErr_Format(PyExc_TypeError,
3492 "Can't convert '%.100s' object to str implicitly",
3493 Py_TYPE(obj)->tp_name);
3494 return NULL;
3495}
3496
3497PyObject *
3498PyUnicode_FromEncodedObject(PyObject *obj,
3499 const char *encoding,
3500 const char *errors)
3501{
3502 Py_buffer buffer;
3503 PyObject *v;
3504
3505 if (obj == NULL) {
3506 PyErr_BadInternalCall();
3507 return NULL;
3508 }
3509
3510 /* Decoding bytes objects is the most common case and should be fast */
3511 if (PyBytes_Check(obj)) {
3512 if (PyBytes_GET_SIZE(obj) == 0) {
3513 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514 return NULL;
3515 }
3516 _Py_RETURN_UNICODE_EMPTY();
3517 }
3518 return PyUnicode_Decode(
3519 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3520 encoding, errors);
3521 }
3522
3523 if (PyUnicode_Check(obj)) {
3524 PyErr_SetString(PyExc_TypeError,
3525 "decoding str is not supported");
3526 return NULL;
3527 }
3528
3529 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3530 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3531 PyErr_Format(PyExc_TypeError,
3532 "decoding to str: need a bytes-like object, %.80s found",
3533 Py_TYPE(obj)->tp_name);
3534 return NULL;
3535 }
3536
3537 if (buffer.len == 0) {
3538 PyBuffer_Release(&buffer);
3539 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3540 return NULL;
3541 }
3542 _Py_RETURN_UNICODE_EMPTY();
3543 }
3544
3545 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3546 PyBuffer_Release(&buffer);
3547 return v;
3548}
3549
/* Normalize an encoding name into the buffer lower (lower_len bytes).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_', except that a leading
   run produces nothing and a trailing run is dropped.  Similar to
   encodings.normalize_encoding(), but also converts to lowercase.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters plus the terminating NUL). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char ch = *in;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Remember the separator; it is only emitted if more name
               characters follow. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3598
3599PyObject *
3600PyUnicode_Decode(const char *s,
3601 Py_ssize_t size,
3602 const char *encoding,
3603 const char *errors)
3604{
3605 PyObject *buffer = NULL, *unicode;
3606 Py_buffer info;
3607 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3608
3609 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3610 return NULL;
3611 }
3612
3613 if (size == 0) {
3614 _Py_RETURN_UNICODE_EMPTY();
3615 }
3616
3617 if (encoding == NULL) {
3618 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3619 }
3620
3621 /* Shortcuts for common default encodings */
3622 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3623 char *lower = buflower;
3624
3625 /* Fast paths */
3626 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3627 lower += 3;
3628 if (*lower == '_') {
3629 /* Match "utf8" and "utf_8" */
3630 lower++;
3631 }
3632
3633 if (lower[0] == '8' && lower[1] == 0) {
3634 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3635 }
3636 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3637 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3638 }
3639 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3640 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3641 }
3642 }
3643 else {
3644 if (strcmp(lower, "ascii") == 0
3645 || strcmp(lower, "us_ascii") == 0) {
3646 return PyUnicode_DecodeASCII(s, size, errors);
3647 }
3648 #ifdef MS_WINDOWS
3649 else if (strcmp(lower, "mbcs") == 0) {
3650 return PyUnicode_DecodeMBCS(s, size, errors);
3651 }
3652 #endif
3653 else if (strcmp(lower, "latin1") == 0
3654 || strcmp(lower, "latin_1") == 0
3655 || strcmp(lower, "iso_8859_1") == 0
3656 || strcmp(lower, "iso8859_1") == 0) {
3657 return PyUnicode_DecodeLatin1(s, size, errors);
3658 }
3659 }
3660 }
3661
3662 /* Decode via the codec registry */
3663 buffer = NULL;
3664 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3665 goto onError;
3666 buffer = PyMemoryView_FromBuffer(&info);
3667 if (buffer == NULL)
3668 goto onError;
3669 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3670 if (unicode == NULL)
3671 goto onError;
3672 if (!PyUnicode_Check(unicode)) {
3673 PyErr_Format(PyExc_TypeError,
3674 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3675 "use codecs.decode() to decode to arbitrary types",
3676 encoding,
3677 Py_TYPE(unicode)->tp_name);
3678 Py_DECREF(unicode);
3679 goto onError;
3680 }
3681 Py_DECREF(buffer);
3682 return unicode_result(unicode);
3683
3684 onError:
3685 Py_XDECREF(buffer);
3686 return NULL;
3687}
3688
3689PyObject *
3690PyUnicode_AsDecodedObject(PyObject *unicode,
3691 const char *encoding,
3692 const char *errors)
3693{
3694 if (!PyUnicode_Check(unicode)) {
3695 PyErr_BadArgument();
3696 return NULL;
3697 }
3698
3699 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3700 "PyUnicode_AsDecodedObject() is deprecated; "
3701 "use PyCodec_Decode() to decode from str", 1) < 0)
3702 return NULL;
3703
3704 if (encoding == NULL)
3705 encoding = PyUnicode_GetDefaultEncoding();
3706
3707 /* Decode via the codec registry */
3708 return PyCodec_Decode(unicode, encoding, errors);
3709}
3710
3711PyObject *
3712PyUnicode_AsDecodedUnicode(PyObject *unicode,
3713 const char *encoding,
3714 const char *errors)
3715{
3716 PyObject *v;
3717
3718 if (!PyUnicode_Check(unicode)) {
3719 PyErr_BadArgument();
3720 goto onError;
3721 }
3722
3723 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3724 "PyUnicode_AsDecodedUnicode() is deprecated; "
3725 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3726 return NULL;
3727
3728 if (encoding == NULL)
3729 encoding = PyUnicode_GetDefaultEncoding();
3730
3731 /* Decode via the codec registry */
3732 v = PyCodec_Decode(unicode, encoding, errors);
3733 if (v == NULL)
3734 goto onError;
3735 if (!PyUnicode_Check(v)) {
3736 PyErr_Format(PyExc_TypeError,
3737 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3738 "use codecs.decode() to decode to arbitrary types",
3739 encoding,
3740 Py_TYPE(unicode)->tp_name);
3741 Py_DECREF(v);
3742 goto onError;
3743 }
3744 return unicode_result(v);
3745
3746 onError:
3747 return NULL;
3748}
3749
3750PyObject *
3751PyUnicode_Encode(const Py_UNICODE *s,
3752 Py_ssize_t size,
3753 const char *encoding,
3754 const char *errors)
3755{
3756 PyObject *v, *unicode;
3757
3758 unicode = PyUnicode_FromWideChar(s, size);
3759 if (unicode == NULL)
3760 return NULL;
3761 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3762 Py_DECREF(unicode);
3763 return v;
3764}
3765
3766PyObject *
3767PyUnicode_AsEncodedObject(PyObject *unicode,
3768 const char *encoding,
3769 const char *errors)
3770{
3771 PyObject *v;
3772
3773 if (!PyUnicode_Check(unicode)) {
3774 PyErr_BadArgument();
3775 goto onError;
3776 }
3777
3778 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3779 "PyUnicode_AsEncodedObject() is deprecated; "
3780 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3781 "or PyCodec_Encode() for generic encoding", 1) < 0)
3782 return NULL;
3783
3784 if (encoding == NULL)
3785 encoding = PyUnicode_GetDefaultEncoding();
3786
3787 /* Encode via the codec registry */
3788 v = PyCodec_Encode(unicode, encoding, errors);
3789 if (v == NULL)
3790 goto onError;
3791 return v;
3792
3793 onError:
3794 return NULL;
3795}
3796
3797
3798static PyObject *
3799unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3800 int current_locale)
3801{
3802 Py_ssize_t wlen;
3803 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3804 if (wstr == NULL) {
3805 return NULL;
3806 }
3807
3808 if ((size_t)wlen != wcslen(wstr)) {
3809 PyErr_SetString(PyExc_ValueError, "embedded null character");
3810 PyMem_Free(wstr);
3811 return NULL;
3812 }
3813
3814 char *str;
3815 size_t error_pos;
3816 const char *reason;
3817 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3818 current_locale, error_handler);
3819 PyMem_Free(wstr);
3820
3821 if (res != 0) {
3822 if (res == -2) {
3823 PyObject *exc;
3824 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3825 "locale", unicode,
3826 (Py_ssize_t)error_pos,
3827 (Py_ssize_t)(error_pos+1),
3828 reason);
3829 if (exc != NULL) {
3830 PyCodec_StrictErrors(exc);
3831 Py_DECREF(exc);
3832 }
3833 }
3834 else if (res == -3) {
3835 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3836 }
3837 else {
3838 PyErr_NoMemory();
3839 }
3840 return NULL;
3841 }
3842
3843 PyObject *bytes = PyBytes_FromString(str);
3844 PyMem_RawFree(str);
3845 return bytes;
3846}
3847
3848PyObject *
3849PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3850{
3851 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3852 return unicode_encode_locale(unicode, error_handler, 1);
3853}
3854
3855PyObject *
3856PyUnicode_EncodeFSDefault(PyObject *unicode)
3857{
3858 PyInterpreterState *interp = _PyInterpreterState_GET();
3859 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3860 if (fs_codec->utf8) {
3861 return unicode_encode_utf8(unicode,
3862 fs_codec->error_handler,
3863 fs_codec->errors);
3864 }
3865#ifndef _Py_FORCE_UTF8_FS_ENCODING
3866 else if (fs_codec->encoding) {
3867 return PyUnicode_AsEncodedString(unicode,
3868 fs_codec->encoding,
3869 fs_codec->errors);
3870 }
3871#endif
3872 else {
3873 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3874 machinery is not ready and so cannot be used:
3875 use wcstombs() in this case. */
3876 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3877 const wchar_t *filesystem_errors = config->filesystem_errors;
3878 assert(filesystem_errors != NULL);
3879 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3880 assert(errors != _Py_ERROR_UNKNOWN);
3881#ifdef _Py_FORCE_UTF8_FS_ENCODING
3882 return unicode_encode_utf8(unicode, errors, NULL);
3883#else
3884 return unicode_encode_locale(unicode, errors, 0);
3885#endif
3886 }
3887}
3888
3889PyObject *
3890PyUnicode_AsEncodedString(PyObject *unicode,
3891 const char *encoding,
3892 const char *errors)
3893{
3894 PyObject *v;
3895 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3896
3897 if (!PyUnicode_Check(unicode)) {
3898 PyErr_BadArgument();
3899 return NULL;
3900 }
3901
3902 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3903 return NULL;
3904 }
3905
3906 if (encoding == NULL) {
3907 return _PyUnicode_AsUTF8String(unicode, errors);
3908 }
3909
3910 /* Shortcuts for common default encodings */
3911 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3912 char *lower = buflower;
3913
3914 /* Fast paths */
3915 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3916 lower += 3;
3917 if (*lower == '_') {
3918 /* Match "utf8" and "utf_8" */
3919 lower++;
3920 }
3921
3922 if (lower[0] == '8' && lower[1] == 0) {
3923 return _PyUnicode_AsUTF8String(unicode, errors);
3924 }
3925 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3926 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3927 }
3928 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3929 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3930 }
3931 }
3932 else {
3933 if (strcmp(lower, "ascii") == 0
3934 || strcmp(lower, "us_ascii") == 0) {
3935 return _PyUnicode_AsASCIIString(unicode, errors);
3936 }
3937#ifdef MS_WINDOWS
3938 else if (strcmp(lower, "mbcs") == 0) {
3939 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3940 }
3941#endif
3942 else if (strcmp(lower, "latin1") == 0 ||
3943 strcmp(lower, "latin_1") == 0 ||
3944 strcmp(lower, "iso_8859_1") == 0 ||
3945 strcmp(lower, "iso8859_1") == 0) {
3946 return _PyUnicode_AsLatin1String(unicode, errors);
3947 }
3948 }
3949 }
3950
3951 /* Encode via the codec registry */
3952 v = _PyCodec_EncodeText(unicode, encoding, errors);
3953 if (v == NULL)
3954 return NULL;
3955
3956 /* The normal path */
3957 if (PyBytes_Check(v))
3958 return v;
3959
3960 /* If the codec returns a buffer, raise a warning and convert to bytes */
3961 if (PyByteArray_Check(v)) {
3962 int error;
3963 PyObject *b;
3964
3965 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3966 "encoder %s returned bytearray instead of bytes; "
3967 "use codecs.encode() to encode to arbitrary types",
3968 encoding);
3969 if (error) {
3970 Py_DECREF(v);
3971 return NULL;
3972 }
3973
3974 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3975 PyByteArray_GET_SIZE(v));
3976 Py_DECREF(v);
3977 return b;
3978 }
3979
3980 PyErr_Format(PyExc_TypeError,
3981 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3982 "use codecs.encode() to encode to arbitrary types",
3983 encoding,
3984 Py_TYPE(v)->tp_name);
3985 Py_DECREF(v);
3986 return NULL;
3987}
3988
3989PyObject *
3990PyUnicode_AsEncodedUnicode(PyObject *unicode,
3991 const char *encoding,
3992 const char *errors)
3993{
3994 PyObject *v;
3995
3996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 goto onError;
3999 }
4000
4001 if (PyErr_WarnEx(PyExc_DeprecationWarning,
4002 "PyUnicode_AsEncodedUnicode() is deprecated; "
4003 "use PyCodec_Encode() to encode from str to str", 1) < 0)
4004 return NULL;
4005
4006 if (encoding == NULL)
4007 encoding = PyUnicode_GetDefaultEncoding();
4008
4009 /* Encode via the codec registry */
4010 v = PyCodec_Encode(unicode, encoding, errors);
4011 if (v == NULL)
4012 goto onError;
4013 if (!PyUnicode_Check(v)) {
4014 PyErr_Format(PyExc_TypeError,
4015 "'%.400s' encoder returned '%.400s' instead of 'str'; "
4016 "use codecs.encode() to encode to arbitrary types",
4017 encoding,
4018 Py_TYPE(v)->tp_name);
4019 Py_DECREF(v);
4020 goto onError;
4021 }
4022 return v;
4023
4024 onError:
4025 return NULL;
4026}
4027
4028static PyObject*
4029unicode_decode_locale(const char *str, Py_ssize_t len,
4030 _Py_error_handler errors, int current_locale)
4031{
4032 if (str[len] != '\0' || (size_t)len != strlen(str)) {
4033 PyErr_SetString(PyExc_ValueError, "embedded null byte");
4034 return NULL;
4035 }
4036
4037 wchar_t *wstr;
4038 size_t wlen;
4039 const char *reason;
4040 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4041 current_locale, errors);
4042 if (res != 0) {
4043 if (res == -2) {
4044 PyObject *exc;
4045 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4046 "locale", str, len,
4047 (Py_ssize_t)wlen,
4048 (Py_ssize_t)(wlen + 1),
4049 reason);
4050 if (exc != NULL) {
4051 PyCodec_StrictErrors(exc);
4052 Py_DECREF(exc);
4053 }
4054 }
4055 else if (res == -3) {
4056 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4057 }
4058 else {
4059 PyErr_NoMemory();
4060 }
4061 return NULL;
4062 }
4063
4064 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4065 PyMem_RawFree(wstr);
4066 return unicode;
4067}
4068
4069PyObject*
4070PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4071 const char *errors)
4072{
4073 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4074 return unicode_decode_locale(str, len, error_handler, 1);
4075}
4076
4077PyObject*
4078PyUnicode_DecodeLocale(const char *str, const char *errors)
4079{
4080 Py_ssize_t size = (Py_ssize_t)strlen(str);
4081 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4082 return unicode_decode_locale(str, size, error_handler, 1);
4083}
4084
4085
4086PyObject*
4087PyUnicode_DecodeFSDefault(const char *s) {
4088 Py_ssize_t size = (Py_ssize_t)strlen(s);
4089 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4090}
4091
4092PyObject*
4093PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4094{
4095 PyInterpreterState *interp = _PyInterpreterState_GET();
4096 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4097 if (fs_codec->utf8) {
4098 return unicode_decode_utf8(s, size,
4099 fs_codec->error_handler,
4100 fs_codec->errors,
4101 NULL);
4102 }
4103#ifndef _Py_FORCE_UTF8_FS_ENCODING
4104 else if (fs_codec->encoding) {
4105 return PyUnicode_Decode(s, size,
4106 fs_codec->encoding,
4107 fs_codec->errors);
4108 }
4109#endif
4110 else {
4111 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4112 machinery is not ready and so cannot be used:
4113 use mbstowcs() in this case. */
4114 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4115 const wchar_t *filesystem_errors = config->filesystem_errors;
4116 assert(filesystem_errors != NULL);
4117 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4118 assert(errors != _Py_ERROR_UNKNOWN);
4119#ifdef _Py_FORCE_UTF8_FS_ENCODING
4120 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4121#else
4122 return unicode_decode_locale(s, size, errors, 0);
4123#endif
4124 }
4125}
4126
4127
4128int
4129PyUnicode_FSConverter(PyObject* arg, void* addr)
4130{
4131 PyObject *path = NULL;
4132 PyObject *output = NULL;
4133 Py_ssize_t size;
4134 const char *data;
4135 if (arg == NULL) {
4136 Py_DECREF(*(PyObject**)addr);
4137 *(PyObject**)addr = NULL;
4138 return 1;
4139 }
4140 path = PyOS_FSPath(arg);
4141 if (path == NULL) {
4142 return 0;
4143 }
4144 if (PyBytes_Check(path)) {
4145 output = path;
4146 }
4147 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4148 output = PyUnicode_EncodeFSDefault(path);
4149 Py_DECREF(path);
4150 if (!output) {
4151 return 0;
4152 }
4153 assert(PyBytes_Check(output));
4154 }
4155
4156 size = PyBytes_GET_SIZE(output);
4157 data = PyBytes_AS_STRING(output);
4158 if ((size_t)size != strlen(data)) {
4159 PyErr_SetString(PyExc_ValueError, "embedded null byte");
4160 Py_DECREF(output);
4161 return 0;
4162 }
4163 *(PyObject**)addr = output;
4164 return Py_CLEANUP_SUPPORTED;
4165}
4166
4167
4168int
4169PyUnicode_FSDecoder(PyObject* arg, void* addr)
4170{
4171 int is_buffer = 0;
4172 PyObject *path = NULL;
4173 PyObject *output = NULL;
4174 if (arg == NULL) {
4175 Py_DECREF(*(PyObject**)addr);
4176 *(PyObject**)addr = NULL;
4177 return 1;
4178 }
4179
4180 is_buffer = PyObject_CheckBuffer(arg);
4181 if (!is_buffer) {
4182 path = PyOS_FSPath(arg);
4183 if (path == NULL) {
4184 return 0;
4185 }
4186 }
4187 else {
4188 path = arg;
4189 Py_INCREF(arg);
4190 }
4191
4192 if (PyUnicode_Check(path)) {
4193 output = path;
4194 }
4195 else if (PyBytes_Check(path) || is_buffer) {
4196 PyObject *path_bytes = NULL;
4197
4198 if (!PyBytes_Check(path) &&
4199 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4200 "path should be string, bytes, or os.PathLike, not %.200s",
4201 Py_TYPE(arg)->tp_name)) {
4202 Py_DECREF(path);
4203 return 0;
4204 }
4205 path_bytes = PyBytes_FromObject(path);
4206 Py_DECREF(path);
4207 if (!path_bytes) {
4208 return 0;
4209 }
4210 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4211 PyBytes_GET_SIZE(path_bytes));
4212 Py_DECREF(path_bytes);
4213 if (!output) {
4214 return 0;
4215 }
4216 }
4217 else {
4218 PyErr_Format(PyExc_TypeError,
4219 "path should be string, bytes, or os.PathLike, not %.200s",
4220 Py_TYPE(arg)->tp_name);
4221 Py_DECREF(path);
4222 return 0;
4223 }
4224 if (PyUnicode_READY(output) == -1) {
4225 Py_DECREF(output);
4226 return 0;
4227 }
4228 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4229 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4230 PyErr_SetString(PyExc_ValueError, "embedded null character");
4231 Py_DECREF(output);
4232 return 0;
4233 }
4234 *(PyObject**)addr = output;
4235 return Py_CLEANUP_SUPPORTED;
4236}
4237
4238
4239static int unicode_fill_utf8(PyObject *unicode);
4240
4241const char *
4242PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4243{
4244 if (!PyUnicode_Check(unicode)) {
4245 PyErr_BadArgument();
4246 return NULL;
4247 }
4248 if (PyUnicode_READY(unicode) == -1)
4249 return NULL;
4250
4251 if (PyUnicode_UTF8(unicode) == NULL) {
4252 if (unicode_fill_utf8(unicode) == -1) {
4253 return NULL;
4254 }
4255 }
4256
4257 if (psize)
4258 *psize = PyUnicode_UTF8_LENGTH(unicode);
4259 return PyUnicode_UTF8(unicode);
4260}
4261
4262const char *
4263PyUnicode_AsUTF8(PyObject *unicode)
4264{
4265 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4266}
4267
4268Py_UNICODE *
4269PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4270{
4271 if (!PyUnicode_Check(unicode)) {
4272 PyErr_BadArgument();
4273 return NULL;
4274 }
4275 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4276 if (w == NULL) {
4277 /* Non-ASCII compact unicode object */
4278 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4279 assert(PyUnicode_IS_READY(unicode));
4280
4281 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4282 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4283 PyErr_NoMemory();
4284 return NULL;
4285 }
4286 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4287 if (w == NULL) {
4288 PyErr_NoMemory();
4289 return NULL;
4290 }
4291 unicode_copy_as_widechar(unicode, w, wlen + 1);
4292 _PyUnicode_WSTR(unicode) = w;
4293 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4294 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4295 }
4296 }
4297 if (size != NULL)
4298 *size = PyUnicode_WSTR_LENGTH(unicode);
4299 return w;
4300}
4301
4302/* Deprecated APIs */
4303
4304_Py_COMP_DIAG_PUSH
4305_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4306
4307Py_UNICODE *
4308PyUnicode_AsUnicode(PyObject *unicode)
4309{
4310 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4311}
4312
4313const Py_UNICODE *
4314_PyUnicode_AsUnicode(PyObject *unicode)
4315{
4316 Py_ssize_t size;
4317 const Py_UNICODE *wstr;
4318
4319 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4320 if (wstr && wcslen(wstr) != (size_t)size) {
4321 PyErr_SetString(PyExc_ValueError, "embedded null character");
4322 return NULL;
4323 }
4324 return wstr;
4325}
4326
4327
4328Py_ssize_t
4329PyUnicode_GetSize(PyObject *unicode)
4330{
4331 if (!PyUnicode_Check(unicode)) {
4332 PyErr_BadArgument();
4333 goto onError;
4334 }
4335 if (_PyUnicode_WSTR(unicode) == NULL) {
4336 if (PyUnicode_AsUnicode(unicode) == NULL)
4337 goto onError;
4338 }
4339 return PyUnicode_WSTR_LENGTH(unicode);
4340
4341 onError:
4342 return -1;
4343}
4344
4345_Py_COMP_DIAG_POP
4346
4347Py_ssize_t
4348PyUnicode_GetLength(PyObject *unicode)
4349{
4350 if (!PyUnicode_Check(unicode)) {
4351 PyErr_BadArgument();
4352 return -1;
4353 }
4354 if (PyUnicode_READY(unicode) == -1)
4355 return -1;
4356 return PyUnicode_GET_LENGTH(unicode);
4357}
4358
4359Py_UCS4
4360PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4361{
4362 const void *data;
4363 int kind;
4364
4365 if (!PyUnicode_Check(unicode)) {
4366 PyErr_BadArgument();
4367 return (Py_UCS4)-1;
4368 }
4369 if (PyUnicode_READY(unicode) == -1) {
4370 return (Py_UCS4)-1;
4371 }
4372 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4373 PyErr_SetString(PyExc_IndexError, "string index out of range");
4374 return (Py_UCS4)-1;
4375 }
4376 data = PyUnicode_DATA(unicode);
4377 kind = PyUnicode_KIND(unicode);
4378 return PyUnicode_READ(kind, data, index);
4379}
4380
4381int
4382PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4383{
4384 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4385 PyErr_BadArgument();
4386 return -1;
4387 }
4388 assert(PyUnicode_IS_READY(unicode));
4389 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4390 PyErr_SetString(PyExc_IndexError, "string index out of range");
4391 return -1;
4392 }
4393 if (unicode_check_modifiable(unicode))
4394 return -1;
4395 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4396 PyErr_SetString(PyExc_ValueError, "character out of range");
4397 return -1;
4398 }
4399 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4400 index, ch);
4401 return 0;
4402}
4403
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4409
4410/* create or adjust a UnicodeDecodeError */
4411static void
4412make_decode_exception(PyObject **exceptionObject,
4413 const char *encoding,
4414 const char *input, Py_ssize_t length,
4415 Py_ssize_t startpos, Py_ssize_t endpos,
4416 const char *reason)
4417{
4418 if (*exceptionObject == NULL) {
4419 *exceptionObject = PyUnicodeDecodeError_Create(
4420 encoding, input, length, startpos, endpos, reason);
4421 }
4422 else {
4423 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4424 goto onError;
4425 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4426 goto onError;
4427 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4428 goto onError;
4429 }
4430 return;
4431
4432onError:
4433 Py_CLEAR(*exceptionObject);
4434}
4435
4436#ifdef MS_WINDOWS
4437static int
4438widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4439{
4440 if (newsize > *size) {
4441 wchar_t *newbuf = *buf;
4442 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4443 PyErr_NoMemory();
4444 return -1;
4445 }
4446 *buf = newbuf;
4447 }
4448 *size = newsize;
4449 return 0;
4450}
4451
4452/* error handling callback helper:
4453 build arguments, call the callback and check the arguments,
4454 if no exception occurred, copy the replacement to the output
4455 and adjust various state variables.
4456 return 0 on success, -1 on error
4457*/
4458
4459static int
4460unicode_decode_call_errorhandler_wchar(
4461 const char *errors, PyObject **errorHandler,
4462 const char *encoding, const char *reason,
4463 const char **input, const char **inend, Py_ssize_t *startinpos,
4464 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4465 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4466{
4467 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4468
4469 PyObject *restuple = NULL;
4470 PyObject *repunicode = NULL;
4471 Py_ssize_t outsize;
4472 Py_ssize_t insize;
4473 Py_ssize_t requiredsize;
4474 Py_ssize_t newpos;
4475 PyObject *inputobj = NULL;
4476 Py_ssize_t repwlen;
4477
4478 if (*errorHandler == NULL) {
4479 *errorHandler = PyCodec_LookupError(errors);
4480 if (*errorHandler == NULL)
4481 goto onError;
4482 }
4483
4484 make_decode_exception(exceptionObject,
4485 encoding,
4486 *input, *inend - *input,
4487 *startinpos, *endinpos,
4488 reason);
4489 if (*exceptionObject == NULL)
4490 goto onError;
4491
4492 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4493 if (restuple == NULL)
4494 goto onError;
4495 if (!PyTuple_Check(restuple)) {
4496 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4497 goto onError;
4498 }
4499 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4500 goto onError;
4501
4502 /* Copy back the bytes variables, which might have been modified by the
4503 callback */
4504 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4505 if (!inputobj)
4506 goto onError;
4507 *input = PyBytes_AS_STRING(inputobj);
4508 insize = PyBytes_GET_SIZE(inputobj);
4509 *inend = *input + insize;
4510 /* we can DECREF safely, as the exception has another reference,
4511 so the object won't go away. */
4512 Py_DECREF(inputobj);
4513
4514 if (newpos<0)
4515 newpos = insize+newpos;
4516 if (newpos<0 || newpos>insize) {
4517 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4518 goto onError;
4519 }
4520
4521#if USE_UNICODE_WCHAR_CACHE
4522_Py_COMP_DIAG_PUSH
4523_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4524 repwlen = PyUnicode_GetSize(repunicode);
4525 if (repwlen < 0)
4526 goto onError;
4527_Py_COMP_DIAG_POP
4528#else /* USE_UNICODE_WCHAR_CACHE */
4529 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4530 if (repwlen < 0)
4531 goto onError;
4532 repwlen--;
4533#endif /* USE_UNICODE_WCHAR_CACHE */
4534 /* need more space? (at least enough for what we
4535 have+the replacement+the rest of the string (starting
4536 at the new input position), so we won't have to check space
4537 when there are no errors in the rest of the string) */
4538 requiredsize = *outpos;
4539 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4540 goto overflow;
4541 requiredsize += repwlen;
4542 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4543 goto overflow;
4544 requiredsize += insize - newpos;
4545 outsize = *bufsize;
4546 if (requiredsize > outsize) {
4547 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4548 requiredsize = 2*outsize;
4549 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4550 goto onError;
4551 }
4552 }
4553 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4554 *outpos += repwlen;
4555 *endinpos = newpos;
4556 *inptr = *input + newpos;
4557
4558 /* we made it! */
4559 Py_DECREF(restuple);
4560 return 0;
4561
4562 overflow:
4563 PyErr_SetString(PyExc_OverflowError,
4564 "decoded result is too long for a Python string");
4565
4566 onError:
4567 Py_XDECREF(restuple);
4568 return -1;
4569}
4570#endif /* MS_WINDOWS */
4571
/* Run the codec error callback for one decoding error and append the
   replacement string it returns to *writer.

   errors, errorHandler: name of the error handler and a cache slot for the
       looked-up callback; the lookup only happens while *errorHandler is
       NULL.
   encoding, reason: forwarded to make_decode_exception().
   input, inend: in/out pointers to the start and end of the input buffer.
       They are re-read from the exception object after the callback ran.
   startinpos, endinpos: offsets of the offending byte range.  On success
       *endinpos is overwritten with the position where decoding resumes.
   exceptionObject: in/out exception instance handed to the callback.
   inptr: on success, set to *input + the resume position.
   writer: output writer that receives the replacement string.

   Return 0 on success and -1 on failure (every failure path follows a call
   that is expected to have set a Python exception). */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format; the text after "Un;" is the error message,
       which is why &argparse[3] is reused as a stand-alone message below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the callback up once and cache it in the caller's slot. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* A NULL *exceptionObject afterwards is treated as failure. */
    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is not DECREF'ed separately below; only restuple is
       released. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes that followed the error range in the *old*
       buffer; must be computed before *input / *inend are replaced. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative resume position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Raise the writer's minimum length by replen - 1 when the replacement
       is longer than one character. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input left to decode than before the callback ran (the handler
       moved the position backwards or swapped in a longer buffer). */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Tell the caller where to resume decoding. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4671
4672/* --- UTF-7 Codec -------------------------------------------------------- */
4673
4674/* See RFC2152 for details. We encode conservatively and decode liberally. */
4675
/* Small helper macros for the base-64 parts of UTF-7.  Each of them may
   evaluate its argument several times, so only pass expressions that have
   no side effects. */

/* IS_BASE64: true iff c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* FROM_BASE64: 6-bit value of the base-64 character c.  The caller must have
   checked IS_BASE64(c) first; any other input yields 63. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)

/* TO_BASE64: base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                           \
     "abcdefghijklmnopqrstuvwxyz"                           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met in UTF-7 input stands for itself.
 * Decoding is permissive: every ASCII byte is taken literally, except
 * '+', which opens a base-64 run. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4706
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : code point to classify; anything outside 1..127 is never direct.
 * directO  : non-zero if "Set O" characters are written as themselves.
 * directWS : non-zero if whitespace characters are written as themselves.
 *
 * The flag arguments are parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their meaning.  c is evaluated more
 * than once, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4752
4753PyObject *
4754PyUnicode_DecodeUTF7(const char *s,
4755 Py_ssize_t size,
4756 const char *errors)
4757{
4758 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4759}
4760
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : input bytes and their count.
 * errors   : name of the error handler, passed on to
 *            unicode_decode_call_errorhandler_writer().
 * consumed : if NULL, the input is treated as complete and an unfinished
 *            shift sequence with leftover state is reported as an error.
 *            If non-NULL, it receives the number of input bytes consumed;
 *            when the input ends inside a shift sequence this is the
 *            offset of the '+' that opened it, and the output produced
 *            since then is dropped.
 *
 * Returns a new str object, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 run */
    Py_ssize_t shiftOutStart;       /* writer.pos when the current run began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0; /* pending bits, most recent in the low end */
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also reached via goto from the end-of-input error handling below
           when the error handler moved the read position back. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it as is and
                               handle outCh normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only flushed when the
                   terminating byte is one that decodes directly. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts: used for error reporting
               and for *consumed when the input ends inside the sequence. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The error range is [startinpos, s); the handler may update
           starts, e and s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have asked to resume before the end. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Report only the input before the '+' as consumed. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the first shiftOutStart characters
                   only instead of finishing the writer.
                   NOTE(review): presumably so the result's character width
                   is derived from the kept prefix alone -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4964
4965
/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL on failure.

   base64SetO       : if non-zero, "Set O" characters are base-64 encoded
                      instead of being written as themselves.
   base64WhiteSpace : if non-zero, whitespace characters are base-64 encoded
                      instead of being written as themselves.
   errors           : accepted for interface symmetry; not used by this
                      function.

   The output buffer is allocated for 8 bytes per input character and shrunk
   to the real size at the end. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                /* non-zero while inside a base-64 run */
    Py_ssize_t i;
    unsigned int base64bits = 0;    /* number of bits not yet written out */
    unsigned long base64buffer = 0; /* pending bits live in the low end */
    char * out;
    const char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    /* Pad the leftover bits with zeros on the right to a
                       full 6-bit group. */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* Open a base-64 run and encode ch inside it. */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append ch to the bit buffer as one or two 16-bit units and flush
           every complete 6-bit group. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush leftover bits and close a base-64 run that is still open. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* Shrink the over-allocated buffer to the bytes actually written. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
5067PyObject *
5068PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5069 Py_ssize_t size,
5070 int base64SetO,
5071 int base64WhiteSpace,
5072 const char *errors)
5073{
5074 PyObject *result;
5075 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5076 if (tmp == NULL)
5077 return NULL;
5078 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
5079 base64WhiteSpace, errors);
5080 Py_DECREF(tmp);
5081 return result;
5082}
5083
5084#undef IS_BASE64
5085#undef FROM_BASE64
5086#undef TO_BASE64
5087#undef DECODE_DIRECT
5088#undef ENCODE_DIRECT
5089
5090/* --- UTF-8 Codec -------------------------------------------------------- */
5091
5092PyObject *
5093PyUnicode_DecodeUTF8(const char *s,
5094 Py_ssize_t size,
5095 const char *errors)
5096{
5097 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5098}
5099
5100#include "stringlib/asciilib.h"
5101#include "stringlib/codecs.h"
5102#include "stringlib/undef.h"
5103
5104#include "stringlib/ucs1lib.h"
5105#include "stringlib/codecs.h"
5106#include "stringlib/undef.h"
5107
5108#include "stringlib/ucs2lib.h"
5109#include "stringlib/codecs.h"
5110#include "stringlib/undef.h"
5111
5112#include "stringlib/ucs4lib.h"
5113#include "stringlib/codecs.h"
5114#include "stringlib/undef.h"
5115
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.  The mask has bit 0x80 set in every byte,
   so `value & ASCII_CHAR_MASK` is non-zero iff at least one byte of
   `value` is >= 0x80.  Only 4- and 8-byte size_t are supported. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5125
5126static Py_ssize_t
5127ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5128{
5129 const char *p = start;
5130
5131#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5132 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5133 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5134 /* Fast path, see in STRINGLIB(utf8_decode) for
5135 an explanation. */
5136 /* Help allocation */
5137 const char *_p = p;
5138 Py_UCS1 * q = dest;
5139 while (_p + SIZEOF_SIZE_T <= end) {
5140 size_t value = *(const size_t *) _p;
5141 if (value & ASCII_CHAR_MASK)
5142 break;
5143 *((size_t *)q) = value;
5144 _p += SIZEOF_SIZE_T;
5145 q += SIZEOF_SIZE_T;
5146 }
5147 p = _p;
5148 while (p < end) {
5149 if ((unsigned char)*p & 0x80)
5150 break;
5151 *q++ = *p++;
5152 }
5153 return p - start;
5154 }
5155#endif
5156 while (p < end) {
5157 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5158 for an explanation. */
5159 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5160 /* Help allocation */
5161 const char *_p = p;
5162 while (_p + SIZEOF_SIZE_T <= end) {
5163 size_t value = *(const size_t *) _p;
5164 if (value & ASCII_CHAR_MASK)
5165 break;
5166 _p += SIZEOF_SIZE_T;
5167 }
5168 p = _p;
5169 if (_p == end)
5170 break;
5171 }
5172 if ((unsigned char)*p & 0x80)
5173 break;
5174 ++p;
5175 }
5176 memcpy(dest, start, p - start);
5177 return p - start;
5178}
5179
/* Decode `size` bytes of UTF-8 at `s` into a new str object.

   error_handler : pre-resolved handler; _Py_ERROR_UNKNOWN means "resolve
                   from `errors` on the first decoding error".
   errors        : name of the error handler.
   consumed      : if non-NULL, receives the number of input bytes consumed;
                   in that case an incomplete sequence at the end of the
                   input stops decoding instead of raising an error.

   Returns a new reference, or NULL on failure. */
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }

    const char *starts = s;
    const char *end = s + size;

    // fast path: try ASCII string.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    if (s == end) {
        return u;
    }

    // Use _PyUnicodeWriter after fast path is failed.
    // The writer takes over `u`; the ASCII prefix decoded above is kept by
    // starting the write position right after it.
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = s - starts;

    Py_ssize_t startinpos, endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Pick the decoder specialised for the writer's current character
           width. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Values 0..4 are status codes from the decoder; any other value
           is a character the decoder could not store itself and that is
           written through the writer. */
        switch (ch) {
        case 0:
            /* Either all input was decoded, or it ends in the middle of a
               sequence (only an error when `consumed` is NULL). */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            /* The offending range covers ch - 1 bytes. */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Resolve the handler name lazily, on the first error only. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* Skip the offending bytes. */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each offending byte b is written as the code point
               0xDC00 + b, which needs at least 2-byte storage. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* Any other handler goes through the generic callback, which
               may update starts, end and s. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5325
5326
5327PyObject *
5328PyUnicode_DecodeUTF8Stateful(const char *s,
5329 Py_ssize_t size,
5330 const char *errors,
5331 Py_ssize_t *consumed)
5332{
5333 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5334}
5335
5336
5337/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5338 non-zero, use strict error handler otherwise.
5339
5340 On success, write a pointer to a newly allocated wide character string into
5341 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5342 (in number of wchar_t units) into *wlen (if wlen is set).
5343
5344 On memory allocation failure, return -1.
5345
5346 On decoding error (if surrogateescape is zero), return -2. If wlen is
5347 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5348 is not NULL, write the decoding error message into *reason. */
5349int
5350_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5351 const char **reason, _Py_error_handler errors)
5352{
5353 const char *orig_s = s;
5354 const char *e;
5355 wchar_t *unicode;
5356 Py_ssize_t outpos;
5357
5358 int surrogateescape = 0;
5359 int surrogatepass = 0;
5360 switch (errors)
5361 {
5362 case _Py_ERROR_STRICT:
5363 break;
5364 case _Py_ERROR_SURROGATEESCAPE:
5365 surrogateescape = 1;
5366 break;
5367 case _Py_ERROR_SURROGATEPASS:
5368 surrogatepass = 1;
5369 break;
5370 default:
5371 return -3;
5372 }
5373
5374 /* Note: size will always be longer than the resulting Unicode
5375 character count */
5376 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5377 return -1;
5378 }
5379
5380 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5381 if (!unicode) {
5382 return -1;
5383 }
5384
5385 /* Unpack UTF-8 encoded data */
5386 e = s + size;
5387 outpos = 0;
5388 while (s < e) {
5389 Py_UCS4 ch;
5390#if SIZEOF_WCHAR_T == 4
5391 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5392#else
5393 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5394#endif
5395 if (ch > 0xFF) {
5396#if SIZEOF_WCHAR_T == 4
5397 Py_UNREACHABLE();
5398#else
5399 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5400 /* write a surrogate pair */
5401 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5402 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5403#endif
5404 }
5405 else {
5406 if (!ch && s == e) {
5407 break;
5408 }
5409
5410 if (surrogateescape) {
5411 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5412 }
5413 else {
5414 /* Is it a valid three-byte code? */
5415 if (surrogatepass
5416 && (e - s) >= 3
5417 && (s[0] & 0xf0) == 0xe0
5418 && (s[1] & 0xc0) == 0x80
5419 && (s[2] & 0xc0) == 0x80)
5420 {
5421 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5422 s += 3;
5423 unicode[outpos++] = ch;
5424 }
5425 else {
5426 PyMem_RawFree(unicode );
5427 if (reason != NULL) {
5428 switch (ch) {
5429 case 0:
5430 *reason = "unexpected end of data";
5431 break;
5432 case 1:
5433 *reason = "invalid start byte";
5434 break;
5435 /* 2, 3, 4 */
5436 default:
5437 *reason = "invalid continuation byte";
5438 break;
5439 }
5440 }
5441 if (wlen != NULL) {
5442 *wlen = s - orig_s;
5443 }
5444 return -2;
5445 }
5446 }
5447 }
5448 }
5449 unicode[outpos] = L'\0';
5450 if (wlen) {
5451 *wlen = outpos;
5452 }
5453 *wstr = unicode;
5454 return 0;
5455}
5456
5457
5458wchar_t*
5459_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5460 size_t *wlen)
5461{
5462 wchar_t *wstr;
5463 int res = _Py_DecodeUTF8Ex(arg, arglen,
5464 &wstr, wlen,
5465 NULL, _Py_ERROR_SURROGATEESCAPE);
5466 if (res != 0) {
5467 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5468 assert(res != -3);
5469 if (wlen) {
5470 *wlen = (size_t)res;
5471 }
5472 return NULL;
5473 }
5474 return wstr;
5475}
5476
5477
/* UTF-8 encoder for a NUL-terminated wide character string.

   `errors` selects the error handler: _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS.  Any other value
   makes the function return -3.

   On success, return 0 and write the newly allocated, NUL-terminated
   character string into *str.  The memory comes from PyMem_RawMalloc() when
   raw_malloc is non-zero (free it with PyMem_RawFree()), otherwise from
   PyMem_Malloc() (free it with PyMem_Free()).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.  If shrinking the buffer at the
   end fails, *error_pos is additionally set to (size_t)-1 (if error_pos is
   set). */
int
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
                 const char **reason, int raw_malloc, _Py_error_handler errors)
{
    /* Worst case: every input unit needs 4 output bytes. */
    const Py_ssize_t max_char_size = 4;
    Py_ssize_t len = wcslen(text);

    assert(len >= 0);

    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        return -3;
    }

    /* Guard the (len + 1) * max_char_size computation against overflow. */
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
        return -1;
    }
    char *bytes;
    if (raw_malloc) {
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
    }
    else {
        bytes = PyMem_Malloc((len + 1) * max_char_size);
    }
    if (bytes == NULL) {
        return -1;
    }

    char *p = bytes;
    Py_ssize_t i;
    for (i = 0; i < len; ) {
        /* Index of the unit being encoded, reported on error. */
        Py_ssize_t ch_pos = i;
        Py_UCS4 ch = text[i];
        i++;
#if Py_UNICODE_SIZE == 2
        /* Combine a surrogate pair into one code point. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
            && i < len
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
            i++;
        }
#endif

        if (ch < 0x80) {
            /* Encode ASCII */
            *p++ = (char) ch;

        }
        else if (ch < 0x0800) {
            /* Encode Latin-1 */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
            /* surrogateescape error handler */
            /* Only U+DC80..U+DCFF can be turned back into a byte, and only
               with surrogateescape; everything else is an error. */
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
                if (error_pos != NULL) {
                    *error_pos = (size_t)ch_pos;
                }
                if (reason != NULL) {
                    *reason = "encoding error";
                }
                if (raw_malloc) {
                    PyMem_RawFree(bytes);
                }
                else {
                    PyMem_Free(bytes);
                }
                return -2;
            }
            /* Emit the original byte: the low 8 bits of the code point. */
            *p++ = (char)(ch & 0xff);
        }
        else if (ch < 0x10000) {
            /* Three-byte form; with surrogatepass this branch also encodes
               surrogates. */
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {  /* ch >= 0x10000 */
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }
    *p++ = '\0';

    /* Shrink the buffer to the bytes actually used (terminator included). */
    size_t final_size = (p - bytes);
    char *bytes2;
    if (raw_malloc) {
        bytes2 = PyMem_RawRealloc(bytes, final_size);
    }
    else {
        bytes2 = PyMem_Realloc(bytes, final_size);
    }
    if (bytes2 == NULL) {
        if (error_pos != NULL) {
            *error_pos = (size_t)-1;
        }
        if (raw_malloc) {
            PyMem_RawFree(bytes);
        }
        else {
            PyMem_Free(bytes);
        }
        return -1;
    }
    *str = bytes2;
    return 0;
}
5611
5612
5613/* Primary internal function which creates utf8 encoded bytes objects.
5614
5615 Allocation strategy: if the string is short, convert into a stack buffer
5616 and allocate exactly as much space needed at the end. Else allocate the
5617 maximum possible needed (4 result bytes per Unicode character), and return
5618 the excess memory at the end.
5619*/
5620static PyObject *
5621unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5622 const char *errors)
5623{
5624 if (!PyUnicode_Check(unicode)) {
5625 PyErr_BadArgument();
5626 return NULL;
5627 }
5628
5629 if (PyUnicode_READY(unicode) == -1)
5630 return NULL;
5631
5632 if (PyUnicode_UTF8(unicode))
5633 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5634 PyUnicode_UTF8_LENGTH(unicode));
5635
5636 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5637 const void *data = PyUnicode_DATA(unicode);
5638 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5639
5640 _PyBytesWriter writer;
5641 char *end;
5642
5643 switch (kind) {
5644 default:
5645 Py_UNREACHABLE();
5646 case PyUnicode_1BYTE_KIND:
5647 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5648 assert(!PyUnicode_IS_ASCII(unicode));
5649 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5650 break;
5651 case PyUnicode_2BYTE_KIND:
5652 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5653 break;
5654 case PyUnicode_4BYTE_KIND:
5655 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5656 break;
5657 }
5658
5659 if (end == NULL) {
5660 _PyBytesWriter_Dealloc(&writer);
5661 return NULL;
5662 }
5663 return _PyBytesWriter_Finish(&writer, end);
5664}
5665
5666static int
5667unicode_fill_utf8(PyObject *unicode)
5668{
5669 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5670 assert(!PyUnicode_IS_ASCII(unicode));
5671
5672 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5673 const void *data = PyUnicode_DATA(unicode);
5674 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5675
5676 _PyBytesWriter writer;
5677 char *end;
5678
5679 switch (kind) {
5680 default:
5681 Py_UNREACHABLE();
5682 case PyUnicode_1BYTE_KIND:
5683 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5684 _Py_ERROR_STRICT, NULL);
5685 break;
5686 case PyUnicode_2BYTE_KIND:
5687 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5688 _Py_ERROR_STRICT, NULL);
5689 break;
5690 case PyUnicode_4BYTE_KIND:
5691 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5692 _Py_ERROR_STRICT, NULL);
5693 break;
5694 }
5695 if (end == NULL) {
5696 _PyBytesWriter_Dealloc(&writer);
5697 return -1;
5698 }
5699
5700 const char *start = writer.use_small_buffer ? writer.small_buffer :
5701 PyBytes_AS_STRING(writer.buffer);
5702 Py_ssize_t len = end - start;
5703
5704 char *cache = PyObject_Malloc(len + 1);
5705 if (cache == NULL) {
5706 _PyBytesWriter_Dealloc(&writer);
5707 PyErr_NoMemory();
5708 return -1;
5709 }
5710 _PyUnicode_UTF8(unicode) = cache;
5711 _PyUnicode_UTF8_LENGTH(unicode) = len;
5712 memcpy(cache, start, len);
5713 cache[len] = '\0';
5714 _PyBytesWriter_Dealloc(&writer);
5715 return 0;
5716}
5717
5718PyObject *
5719_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5720{
5721 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5722}
5723
5724
5725PyObject *
5726PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5727 Py_ssize_t size,
5728 const char *errors)
5729{
5730 PyObject *v, *unicode;
5731
5732 unicode = PyUnicode_FromWideChar(s, size);
5733 if (unicode == NULL)
5734 return NULL;
5735 v = _PyUnicode_AsUTF8String(unicode, errors);
5736 Py_DECREF(unicode);
5737 return v;
5738}
5739
5740PyObject *
5741PyUnicode_AsUTF8String(PyObject *unicode)
5742{
5743 return _PyUnicode_AsUTF8String(unicode, NULL);
5744}
5745
5746/* --- UTF-32 Codec ------------------------------------------------------- */
5747
5748PyObject *
5749PyUnicode_DecodeUTF32(const char *s,
5750 Py_ssize_t size,
5751 const char *errors,
5752 int *byteorder)
5753{
5754 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5755}
5756
5757PyObject *
5758PyUnicode_DecodeUTF32Stateful(const char *s,
5759 Py_ssize_t size,
5760 const char *errors,
5761 int *byteorder,
5762 Py_ssize_t *consumed)
5763{
5764 const char *starts = s;
5765 Py_ssize_t startinpos;
5766 Py_ssize_t endinpos;
5767 _PyUnicodeWriter writer;
5768 const unsigned char *q, *e;
5769 int le, bo = 0; /* assume native ordering by default */
5770 const char *encoding;
5771 const char *errmsg = "";
5772 PyObject *errorHandler = NULL;
5773 PyObject *exc = NULL;
5774
5775 q = (const unsigned char *)s;
5776 e = q + size;
5777
5778 if (byteorder)
5779 bo = *byteorder;
5780
5781 /* Check for BOM marks (U+FEFF) in the input and adjust current
5782 byte order setting accordingly. In native mode, the leading BOM
5783 mark is skipped, in all other modes, it is copied to the output
5784 stream as-is (giving a ZWNBSP character). */
5785 if (bo == 0 && size >= 4) {
5786 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5787 if (bom == 0x0000FEFF) {
5788 bo = -1;
5789 q += 4;
5790 }
5791 else if (bom == 0xFFFE0000) {
5792 bo = 1;
5793 q += 4;
5794 }
5795 if (byteorder)
5796 *byteorder = bo;
5797 }
5798
5799 if (q == e) {
5800 if (consumed)
5801 *consumed = size;
5802 _Py_RETURN_UNICODE_EMPTY();
5803 }
5804
5805#ifdef WORDS_BIGENDIAN
5806 le = bo < 0;
5807#else
5808 le = bo <= 0;
5809#endif
5810 encoding = le ? "utf-32-le" : "utf-32-be";
5811
5812 _PyUnicodeWriter_Init(&writer);
5813 writer.min_length = (e - q + 3) / 4;
5814 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5815 goto onError;
5816
5817 while (1) {
5818 Py_UCS4 ch = 0;
5819 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5820
5821 if (e - q >= 4) {
5822 enum PyUnicode_Kind kind = writer.kind;
5823 void *data = writer.data;
5824 const unsigned char *last = e - 4;
5825 Py_ssize_t pos = writer.pos;
5826 if (le) {
5827 do {
5828 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5829 if (ch > maxch)
5830 break;
5831 if (kind != PyUnicode_1BYTE_KIND &&
5832 Py_UNICODE_IS_SURROGATE(ch))
5833 break;
5834 PyUnicode_WRITE(kind, data, pos++, ch);
5835 q += 4;
5836 } while (q <= last);
5837 }
5838 else {
5839 do {
5840 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5841 if (ch > maxch)
5842 break;
5843 if (kind != PyUnicode_1BYTE_KIND &&
5844 Py_UNICODE_IS_SURROGATE(ch))
5845 break;
5846 PyUnicode_WRITE(kind, data, pos++, ch);
5847 q += 4;
5848 } while (q <= last);
5849 }
5850 writer.pos = pos;
5851 }
5852
5853 if (Py_UNICODE_IS_SURROGATE(ch)) {
5854 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5855 startinpos = ((const char *)q) - starts;
5856 endinpos = startinpos + 4;
5857 }
5858 else if (ch <= maxch) {
5859 if (q == e || consumed)
5860 break;
5861 /* remaining bytes at the end? (size should be divisible by 4) */
5862 errmsg = "truncated data";
5863 startinpos = ((const char *)q) - starts;
5864 endinpos = ((const char *)e) - starts;
5865 }
5866 else {
5867 if (ch < 0x110000) {
5868 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5869 goto onError;
5870 q += 4;
5871 continue;
5872 }
5873 errmsg = "code point not in range(0x110000)";
5874 startinpos = ((const char *)q) - starts;
5875 endinpos = startinpos + 4;
5876 }
5877
5878 /* The remaining input chars are ignored if the callback
5879 chooses to skip the input */
5880 if (unicode_decode_call_errorhandler_writer(
5881 errors, &errorHandler,
5882 encoding, errmsg,
5883 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5884 &writer))
5885 goto onError;
5886 }
5887
5888 if (consumed)
5889 *consumed = (const char *)q-starts;
5890
5891 Py_XDECREF(errorHandler);
5892 Py_XDECREF(exc);
5893 return _PyUnicodeWriter_Finish(&writer);
5894
5895 onError:
5896 _PyUnicodeWriter_Dealloc(&writer);
5897 Py_XDECREF(errorHandler);
5898 Py_XDECREF(exc);
5899 return NULL;
5900}
5901
5902PyObject *
5903_PyUnicode_EncodeUTF32(PyObject *str,
5904 const char *errors,
5905 int byteorder)
5906{
5907 enum PyUnicode_Kind kind;
5908 const void *data;
5909 Py_ssize_t len;
5910 PyObject *v;
5911 uint32_t *out;
5912#if PY_LITTLE_ENDIAN
5913 int native_ordering = byteorder <= 0;
5914#else
5915 int native_ordering = byteorder >= 0;
5916#endif
5917 const char *encoding;
5918 Py_ssize_t nsize, pos;
5919 PyObject *errorHandler = NULL;
5920 PyObject *exc = NULL;
5921 PyObject *rep = NULL;
5922
5923 if (!PyUnicode_Check(str)) {
5924 PyErr_BadArgument();
5925 return NULL;
5926 }
5927 if (PyUnicode_READY(str) == -1)
5928 return NULL;
5929 kind = PyUnicode_KIND(str);
5930 data = PyUnicode_DATA(str);
5931 len = PyUnicode_GET_LENGTH(str);
5932
5933 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5934 return PyErr_NoMemory();
5935 nsize = len + (byteorder == 0);
5936 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5937 if (v == NULL)
5938 return NULL;
5939
5940 /* output buffer is 4-bytes aligned */
5941 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5942 out = (uint32_t *)PyBytes_AS_STRING(v);
5943 if (byteorder == 0)
5944 *out++ = 0xFEFF;
5945 if (len == 0)
5946 goto done;
5947
5948 if (byteorder == -1)
5949 encoding = "utf-32-le";
5950 else if (byteorder == 1)
5951 encoding = "utf-32-be";
5952 else
5953 encoding = "utf-32";
5954
5955 if (kind == PyUnicode_1BYTE_KIND) {
5956 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5957 goto done;
5958 }
5959
5960 pos = 0;
5961 while (pos < len) {
5962 Py_ssize_t newpos, repsize, moreunits;
5963
5964 if (kind == PyUnicode_2BYTE_KIND) {
5965 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5966 &out, native_ordering);
5967 }
5968 else {
5969 assert(kind == PyUnicode_4BYTE_KIND);
5970 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5971 &out, native_ordering);
5972 }
5973 if (pos == len)
5974 break;
5975
5976 rep = unicode_encode_call_errorhandler(
5977 errors, &errorHandler,
5978 encoding, "surrogates not allowed",
5979 str, &exc, pos, pos + 1, &newpos);
5980 if (!rep)
5981 goto error;
5982
5983 if (PyBytes_Check(rep)) {
5984 repsize = PyBytes_GET_SIZE(rep);
5985 if (repsize & 3) {
5986 raise_encode_exception(&exc, encoding,
5987 str, pos, pos + 1,
5988 "surrogates not allowed");
5989 goto error;
5990 }
5991 moreunits = repsize / 4;
5992 }
5993 else {
5994 assert(PyUnicode_Check(rep));
5995 if (PyUnicode_READY(rep) < 0)
5996 goto error;
5997 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5998 if (!PyUnicode_IS_ASCII(rep)) {
5999 raise_encode_exception(&exc, encoding,
6000 str, pos, pos + 1,
6001 "surrogates not allowed");
6002 goto error;
6003 }
6004 }
6005 moreunits += pos - newpos;
6006 pos = newpos;
6007
6008 /* four bytes are reserved for each surrogate */
6009 if (moreunits > 0) {
6010 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
6011 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
6012 /* integer overflow */
6013 PyErr_NoMemory();
6014 goto error;
6015 }
6016 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
6017 goto error;
6018 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
6019 }
6020
6021 if (PyBytes_Check(rep)) {
6022 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6023 out += repsize / 4;
6024 } else /* rep is unicode */ {
6025 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6026 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6027 &out, native_ordering);
6028 }
6029
6030 Py_CLEAR(rep);
6031 }
6032
6033 /* Cut back to size actually needed. This is necessary for, for example,
6034 encoding of a string containing isolated surrogates and the 'ignore'
6035 handler is used. */
6036 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6037 if (nsize != PyBytes_GET_SIZE(v))
6038 _PyBytes_Resize(&v, nsize);
6039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
6041 done:
6042 return v;
6043 error:
6044 Py_XDECREF(rep);
6045 Py_XDECREF(errorHandler);
6046 Py_XDECREF(exc);
6047 Py_XDECREF(v);
6048 return NULL;
6049}
6050
6051PyObject *
6052PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6053 Py_ssize_t size,
6054 const char *errors,
6055 int byteorder)
6056{
6057 PyObject *result;
6058 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6059 if (tmp == NULL)
6060 return NULL;
6061 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6062 Py_DECREF(tmp);
6063 return result;
6064}
6065
6066PyObject *
6067PyUnicode_AsUTF32String(PyObject *unicode)
6068{
6069 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6070}
6071
6072/* --- UTF-16 Codec ------------------------------------------------------- */
6073
6074PyObject *
6075PyUnicode_DecodeUTF16(const char *s,
6076 Py_ssize_t size,
6077 const char *errors,
6078 int *byteorder)
6079{
6080 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6081}
6082
6083PyObject *
6084PyUnicode_DecodeUTF16Stateful(const char *s,
6085 Py_ssize_t size,
6086 const char *errors,
6087 int *byteorder,
6088 Py_ssize_t *consumed)
6089{
6090 const char *starts = s;
6091 Py_ssize_t startinpos;
6092 Py_ssize_t endinpos;
6093 _PyUnicodeWriter writer;
6094 const unsigned char *q, *e;
6095 int bo = 0; /* assume native ordering by default */
6096 int native_ordering;
6097 const char *errmsg = "";
6098 PyObject *errorHandler = NULL;
6099 PyObject *exc = NULL;
6100 const char *encoding;
6101
6102 q = (const unsigned char *)s;
6103 e = q + size;
6104
6105 if (byteorder)
6106 bo = *byteorder;
6107
6108 /* Check for BOM marks (U+FEFF) in the input and adjust current
6109 byte order setting accordingly. In native mode, the leading BOM
6110 mark is skipped, in all other modes, it is copied to the output
6111 stream as-is (giving a ZWNBSP character). */
6112 if (bo == 0 && size >= 2) {
6113 const Py_UCS4 bom = (q[1] << 8) | q[0];
6114 if (bom == 0xFEFF) {
6115 q += 2;
6116 bo = -1;
6117 }
6118 else if (bom == 0xFFFE) {
6119 q += 2;
6120 bo = 1;
6121 }
6122 if (byteorder)
6123 *byteorder = bo;
6124 }
6125
6126 if (q == e) {
6127 if (consumed)
6128 *consumed = size;
6129 _Py_RETURN_UNICODE_EMPTY();
6130 }
6131
6132#if PY_LITTLE_ENDIAN
6133 native_ordering = bo <= 0;
6134 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6135#else
6136 native_ordering = bo >= 0;
6137 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6138#endif
6139
6140 /* Note: size will always be longer than the resulting Unicode
6141 character count normally. Error handler will take care of
6142 resizing when needed. */
6143 _PyUnicodeWriter_Init(&writer);
6144 writer.min_length = (e - q + 1) / 2;
6145 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6146 goto onError;
6147
6148 while (1) {
6149 Py_UCS4 ch = 0;
6150 if (e - q >= 2) {
6151 int kind = writer.kind;
6152 if (kind == PyUnicode_1BYTE_KIND) {
6153 if (PyUnicode_IS_ASCII(writer.buffer))
6154 ch = asciilib_utf16_decode(&q, e,
6155 (Py_UCS1*)writer.data, &writer.pos,
6156 native_ordering);
6157 else
6158 ch = ucs1lib_utf16_decode(&q, e,
6159 (Py_UCS1*)writer.data, &writer.pos,
6160 native_ordering);
6161 } else if (kind == PyUnicode_2BYTE_KIND) {
6162 ch = ucs2lib_utf16_decode(&q, e,
6163 (Py_UCS2*)writer.data, &writer.pos,
6164 native_ordering);
6165 } else {
6166 assert(kind == PyUnicode_4BYTE_KIND);
6167 ch = ucs4lib_utf16_decode(&q, e,
6168 (Py_UCS4*)writer.data, &writer.pos,
6169 native_ordering);
6170 }
6171 }
6172
6173 switch (ch)
6174 {
6175 case 0:
6176 /* remaining byte at the end? (size should be even) */
6177 if (q == e || consumed)
6178 goto End;
6179 errmsg = "truncated data";
6180 startinpos = ((const char *)q) - starts;
6181 endinpos = ((const char *)e) - starts;
6182 break;
6183 /* The remaining input chars are ignored if the callback
6184 chooses to skip the input */
6185 case 1:
6186 q -= 2;
6187 if (consumed)
6188 goto End;
6189 errmsg = "unexpected end of data";
6190 startinpos = ((const char *)q) - starts;
6191 endinpos = ((const char *)e) - starts;
6192 break;
6193 case 2:
6194 errmsg = "illegal encoding";
6195 startinpos = ((const char *)q) - 2 - starts;
6196 endinpos = startinpos + 2;
6197 break;
6198 case 3:
6199 errmsg = "illegal UTF-16 surrogate";
6200 startinpos = ((const char *)q) - 4 - starts;
6201 endinpos = startinpos + 2;
6202 break;
6203 default:
6204 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6205 goto onError;
6206 continue;
6207 }
6208
6209 if (unicode_decode_call_errorhandler_writer(
6210 errors,
6211 &errorHandler,
6212 encoding, errmsg,
6213 &starts,
6214 (const char **)&e,
6215 &startinpos,
6216 &endinpos,
6217 &exc,
6218 (const char **)&q,
6219 &writer))
6220 goto onError;
6221 }
6222
6223End:
6224 if (consumed)
6225 *consumed = (const char *)q-starts;
6226
6227 Py_XDECREF(errorHandler);
6228 Py_XDECREF(exc);
6229 return _PyUnicodeWriter_Finish(&writer);
6230
6231 onError:
6232 _PyUnicodeWriter_Dealloc(&writer);
6233 Py_XDECREF(errorHandler);
6234 Py_XDECREF(exc);
6235 return NULL;
6236}
6237
6238PyObject *
6239_PyUnicode_EncodeUTF16(PyObject *str,
6240 const char *errors,
6241 int byteorder)
6242{
6243 enum PyUnicode_Kind kind;
6244 const void *data;
6245 Py_ssize_t len;
6246 PyObject *v;
6247 unsigned short *out;
6248 Py_ssize_t pairs;
6249#if PY_BIG_ENDIAN
6250 int native_ordering = byteorder >= 0;
6251#else
6252 int native_ordering = byteorder <= 0;
6253#endif
6254 const char *encoding;
6255 Py_ssize_t nsize, pos;
6256 PyObject *errorHandler = NULL;
6257 PyObject *exc = NULL;
6258 PyObject *rep = NULL;
6259
6260 if (!PyUnicode_Check(str)) {
6261 PyErr_BadArgument();
6262 return NULL;
6263 }
6264 if (PyUnicode_READY(str) == -1)
6265 return NULL;
6266 kind = PyUnicode_KIND(str);
6267 data = PyUnicode_DATA(str);
6268 len = PyUnicode_GET_LENGTH(str);
6269
6270 pairs = 0;
6271 if (kind == PyUnicode_4BYTE_KIND) {
6272 const Py_UCS4 *in = (const Py_UCS4 *)data;
6273 const Py_UCS4 *end = in + len;
6274 while (in < end) {
6275 if (*in++ >= 0x10000) {
6276 pairs++;
6277 }
6278 }
6279 }
6280 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6281 return PyErr_NoMemory();
6282 }
6283 nsize = len + pairs + (byteorder == 0);
6284 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6285 if (v == NULL) {
6286 return NULL;
6287 }
6288
6289 /* output buffer is 2-bytes aligned */
6290 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6291 out = (unsigned short *)PyBytes_AS_STRING(v);
6292 if (byteorder == 0) {
6293 *out++ = 0xFEFF;
6294 }
6295 if (len == 0) {
6296 goto done;
6297 }
6298
6299 if (kind == PyUnicode_1BYTE_KIND) {
6300 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6301 goto done;
6302 }
6303
6304 if (byteorder < 0) {
6305 encoding = "utf-16-le";
6306 }
6307 else if (byteorder > 0) {
6308 encoding = "utf-16-be";
6309 }
6310 else {
6311 encoding = "utf-16";
6312 }
6313
6314 pos = 0;
6315 while (pos < len) {
6316 Py_ssize_t newpos, repsize, moreunits;
6317
6318 if (kind == PyUnicode_2BYTE_KIND) {
6319 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6320 &out, native_ordering);
6321 }
6322 else {
6323 assert(kind == PyUnicode_4BYTE_KIND);
6324 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6325 &out, native_ordering);
6326 }
6327 if (pos == len)
6328 break;
6329
6330 rep = unicode_encode_call_errorhandler(
6331 errors, &errorHandler,
6332 encoding, "surrogates not allowed",
6333 str, &exc, pos, pos + 1, &newpos);
6334 if (!rep)
6335 goto error;
6336
6337 if (PyBytes_Check(rep)) {
6338 repsize = PyBytes_GET_SIZE(rep);
6339 if (repsize & 1) {
6340 raise_encode_exception(&exc, encoding,
6341 str, pos, pos + 1,
6342 "surrogates not allowed");
6343 goto error;
6344 }
6345 moreunits = repsize / 2;
6346 }
6347 else {
6348 assert(PyUnicode_Check(rep));
6349 if (PyUnicode_READY(rep) < 0)
6350 goto error;
6351 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6352 if (!PyUnicode_IS_ASCII(rep)) {
6353 raise_encode_exception(&exc, encoding,
6354 str, pos, pos + 1,
6355 "surrogates not allowed");
6356 goto error;
6357 }
6358 }
6359 moreunits += pos - newpos;
6360 pos = newpos;
6361
6362 /* two bytes are reserved for each surrogate */
6363 if (moreunits > 0) {
6364 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6365 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6366 /* integer overflow */
6367 PyErr_NoMemory();
6368 goto error;
6369 }
6370 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6371 goto error;
6372 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6373 }
6374
6375 if (PyBytes_Check(rep)) {
6376 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6377 out += repsize / 2;
6378 } else /* rep is unicode */ {
6379 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6380 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6381 &out, native_ordering);
6382 }
6383
6384 Py_CLEAR(rep);
6385 }
6386
6387 /* Cut back to size actually needed. This is necessary for, for example,
6388 encoding of a string containing isolated surrogates and the 'ignore' handler
6389 is used. */
6390 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6391 if (nsize != PyBytes_GET_SIZE(v))
6392 _PyBytes_Resize(&v, nsize);
6393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
6395 done:
6396 return v;
6397 error:
6398 Py_XDECREF(rep);
6399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
6401 Py_XDECREF(v);
6402 return NULL;
6403#undef STORECHAR
6404}
6405
6406PyObject *
6407PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6408 Py_ssize_t size,
6409 const char *errors,
6410 int byteorder)
6411{
6412 PyObject *result;
6413 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6414 if (tmp == NULL)
6415 return NULL;
6416 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6417 Py_DECREF(tmp);
6418 return result;
6419}
6420
6421PyObject *
6422PyUnicode_AsUTF16String(PyObject *unicode)
6423{
6424 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6425}
6426
6427/* --- Unicode Escape Codec ----------------------------------------------- */
6428
6429static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6430
6431PyObject *
6432_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6433 Py_ssize_t size,
6434 const char *errors,
6435 Py_ssize_t *consumed,
6436 const char **first_invalid_escape)
6437{
6438 const char *starts = s;
6439 _PyUnicodeWriter writer;
6440 const char *end;
6441 PyObject *errorHandler = NULL;
6442 PyObject *exc = NULL;
6443
6444 // so we can remember if we've seen an invalid escape char or not
6445 *first_invalid_escape = NULL;
6446
6447 if (size == 0) {
6448 if (consumed) {
6449 *consumed = 0;
6450 }
6451 _Py_RETURN_UNICODE_EMPTY();
6452 }
6453 /* Escaped strings will always be longer than the resulting
6454 Unicode string, so we start with size here and then reduce the
6455 length after conversion to the true value.
6456 (but if the error callback returns a long replacement string
6457 we'll have to allocate more space) */
6458 _PyUnicodeWriter_Init(&writer);
6459 writer.min_length = size;
6460 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6461 goto onError;
6462 }
6463
6464 end = s + size;
6465 while (s < end) {
6466 unsigned char c = (unsigned char) *s++;
6467 Py_UCS4 ch;
6468 int count;
6469 const char *message;
6470
6471#define WRITE_ASCII_CHAR(ch) \
6472 do { \
6473 assert(ch <= 127); \
6474 assert(writer.pos < writer.size); \
6475 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6476 } while(0)
6477
6478#define WRITE_CHAR(ch) \
6479 do { \
6480 if (ch <= writer.maxchar) { \
6481 assert(writer.pos < writer.size); \
6482 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6483 } \
6484 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6485 goto onError; \
6486 } \
6487 } while(0)
6488
6489 /* Non-escape characters are interpreted as Unicode ordinals */
6490 if (c != '\\') {
6491 WRITE_CHAR(c);
6492 continue;
6493 }
6494
6495 Py_ssize_t startinpos = s - starts - 1;
6496 /* \ - Escapes */
6497 if (s >= end) {
6498 message = "\\ at end of string";
6499 goto incomplete;
6500 }
6501 c = (unsigned char) *s++;
6502
6503 assert(writer.pos < writer.size);
6504 switch (c) {
6505
6506 /* \x escapes */
6507 case '\n': continue;
6508 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6509 case '\'': WRITE_ASCII_CHAR('\''); continue;
6510 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6511 case 'b': WRITE_ASCII_CHAR('\b'); continue;
6512 /* FF */
6513 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6514 case 't': WRITE_ASCII_CHAR('\t'); continue;
6515 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6516 case 'r': WRITE_ASCII_CHAR('\r'); continue;
6517 /* VT */
6518 case 'v': WRITE_ASCII_CHAR('\013'); continue;
6519 /* BEL, not classic C */
6520 case 'a': WRITE_ASCII_CHAR('\007'); continue;
6521
6522 /* \OOO (octal) escapes */
6523 case '0': case '1': case '2': case '3':
6524 case '4': case '5': case '6': case '7':
6525 ch = c - '0';
6526 if (s < end && '0' <= *s && *s <= '7') {
6527 ch = (ch<<3) + *s++ - '0';
6528 if (s < end && '0' <= *s && *s <= '7') {
6529 ch = (ch<<3) + *s++ - '0';
6530 }
6531 }
6532 WRITE_CHAR(ch);
6533 continue;
6534
6535 /* hex escapes */
6536 /* \xXX */
6537 case 'x':
6538 count = 2;
6539 message = "truncated \\xXX escape";
6540 goto hexescape;
6541
6542 /* \uXXXX */
6543 case 'u':
6544 count = 4;
6545 message = "truncated \\uXXXX escape";
6546 goto hexescape;
6547
6548 /* \UXXXXXXXX */
6549 case 'U':
6550 count = 8;
6551 message = "truncated \\UXXXXXXXX escape";
6552 hexescape:
6553 for (ch = 0; count; ++s, --count) {
6554 if (s >= end) {
6555 goto incomplete;
6556 }
6557 c = (unsigned char)*s;
6558 ch <<= 4;
6559 if (c >= '0' && c <= '9') {
6560 ch += c - '0';
6561 }
6562 else if (c >= 'a' && c <= 'f') {
6563 ch += c - ('a' - 10);
6564 }
6565 else if (c >= 'A' && c <= 'F') {
6566 ch += c - ('A' - 10);
6567 }
6568 else {
6569 goto error;
6570 }
6571 }
6572
6573 /* when we get here, ch is a 32-bit unicode character */
6574 if (ch > MAX_UNICODE) {
6575 message = "illegal Unicode character";
6576 goto error;
6577 }
6578
6579 WRITE_CHAR(ch);
6580 continue;
6581
6582 /* \N{name} */
6583 case 'N':
6584 if (ucnhash_capi == NULL) {
6585 /* load the unicode data module */
6586 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6587 PyUnicodeData_CAPSULE_NAME, 1);
6588 if (ucnhash_capi == NULL) {
6589 PyErr_SetString(
6590 PyExc_UnicodeError,
6591 "\\N escapes not supported (can't load unicodedata module)"
6592 );
6593 goto onError;
6594 }
6595 }
6596
6597 message = "malformed \\N character escape";
6598 if (s >= end) {
6599 goto incomplete;
6600 }
6601 if (*s == '{') {
6602 const char *start = ++s;
6603 size_t namelen;
6604 /* look for the closing brace */
6605 while (s < end && *s != '}')
6606 s++;
6607 if (s >= end) {
6608 goto incomplete;
6609 }
6610 namelen = s - start;
6611 if (namelen) {
6612 /* found a name. look it up in the unicode database */
6613 s++;
6614 ch = 0xffffffff; /* in case 'getcode' messes up */
6615 if (namelen <= INT_MAX &&
6616 ucnhash_capi->getcode(start, (int)namelen,
6617 &ch, 0)) {
6618 assert(ch <= MAX_UNICODE);
6619 WRITE_CHAR(ch);
6620 continue;
6621 }
6622 message = "unknown Unicode character name";
6623 }
6624 }
6625 goto error;
6626
6627 default:
6628 if (*first_invalid_escape == NULL) {
6629 *first_invalid_escape = s-1; /* Back up one char, since we've
6630 already incremented s. */
6631 }
6632 WRITE_ASCII_CHAR('\\');
6633 WRITE_CHAR(c);
6634 continue;
6635 }
6636
6637 incomplete:
6638 if (consumed) {
6639 *consumed = startinpos;
6640 break;
6641 }
6642 error:;
6643 Py_ssize_t endinpos = s-starts;
6644 writer.min_length = end - s + writer.pos;
6645 if (unicode_decode_call_errorhandler_writer(
6646 errors, &errorHandler,
6647 "unicodeescape", message,
6648 &starts, &end, &startinpos, &endinpos, &exc, &s,
6649 &writer)) {
6650 goto onError;
6651 }
6652 assert(end - s <= writer.size - writer.pos);
6653
6654#undef WRITE_ASCII_CHAR
6655#undef WRITE_CHAR
6656 }
6657
6658 Py_XDECREF(errorHandler);
6659 Py_XDECREF(exc);
6660 return _PyUnicodeWriter_Finish(&writer);
6661
6662 onError:
6663 _PyUnicodeWriter_Dealloc(&writer);
6664 Py_XDECREF(errorHandler);
6665 Py_XDECREF(exc);
6666 return NULL;
6667}
6668
6669PyObject *
6670_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6671 Py_ssize_t size,
6672 const char *errors,
6673 Py_ssize_t *consumed)
6674{
6675 const char *first_invalid_escape;
6676 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6677 consumed,
6678 &first_invalid_escape);
6679 if (result == NULL)
6680 return NULL;
6681 if (first_invalid_escape != NULL) {
6682 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6683 "invalid escape sequence '\\%c'",
6684 (unsigned char)*first_invalid_escape) < 0) {
6685 Py_DECREF(result);
6686 return NULL;
6687 }
6688 }
6689 return result;
6690}
6691
6692PyObject *
6693PyUnicode_DecodeUnicodeEscape(const char *s,
6694 Py_ssize_t size,
6695 const char *errors)
6696{
6697 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6698}
6699
6700/* Return a Unicode-Escape string version of the Unicode object. */
6701
6702PyObject *
6703PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6704{
6705 Py_ssize_t i, len;
6706 PyObject *repr;
6707 char *p;
6708 enum PyUnicode_Kind kind;
6709 const void *data;
6710 Py_ssize_t expandsize;
6711
6712 /* Initial allocation is based on the longest-possible character
6713 escape.
6714
6715 For UCS1 strings it's '\xxx', 4 bytes per source character.
6716 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6717 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6718 */
6719
6720 if (!PyUnicode_Check(unicode)) {
6721 PyErr_BadArgument();
6722 return NULL;
6723 }
6724 if (PyUnicode_READY(unicode) == -1) {
6725 return NULL;
6726 }
6727
6728 len = PyUnicode_GET_LENGTH(unicode);
6729 if (len == 0) {
6730 return PyBytes_FromStringAndSize(NULL, 0);
6731 }
6732
6733 kind = PyUnicode_KIND(unicode);
6734 data = PyUnicode_DATA(unicode);
6735 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6736 bytes, and 1 byte characters 4. */
6737 expandsize = kind * 2 + 2;
6738 if (len > PY_SSIZE_T_MAX / expandsize) {
6739 return PyErr_NoMemory();
6740 }
6741 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6742 if (repr == NULL) {
6743 return NULL;
6744 }
6745
6746 p = PyBytes_AS_STRING(repr);
6747 for (i = 0; i < len; i++) {
6748 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6749
6750 /* U+0000-U+00ff range */
6751 if (ch < 0x100) {
6752 if (ch >= ' ' && ch < 127) {
6753 if (ch != '\\') {
6754 /* Copy printable US ASCII as-is */
6755 *p++ = (char) ch;
6756 }
6757 /* Escape backslashes */
6758 else {
6759 *p++ = '\\';
6760 *p++ = '\\';
6761 }
6762 }
6763
6764 /* Map special whitespace to '\t', \n', '\r' */
6765 else if (ch == '\t') {
6766 *p++ = '\\';
6767 *p++ = 't';
6768 }
6769 else if (ch == '\n') {
6770 *p++ = '\\';
6771 *p++ = 'n';
6772 }
6773 else if (ch == '\r') {
6774 *p++ = '\\';
6775 *p++ = 'r';
6776 }
6777
6778 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6779 else {
6780 *p++ = '\\';
6781 *p++ = 'x';
6782 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6783 *p++ = Py_hexdigits[ch & 0x000F];
6784 }
6785 }
6786 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6787 else if (ch < 0x10000) {
6788 *p++ = '\\';
6789 *p++ = 'u';
6790 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6791 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6792 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6793 *p++ = Py_hexdigits[ch & 0x000F];
6794 }
6795 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6796 else {
6797
6798 /* Make sure that the first two digits are zero */
6799 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6800 *p++ = '\\';
6801 *p++ = 'U';
6802 *p++ = '0';
6803 *p++ = '0';
6804 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6805 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6806 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6807 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6808 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6809 *p++ = Py_hexdigits[ch & 0x0000000F];
6810 }
6811 }
6812
6813 assert(p - PyBytes_AS_STRING(repr) > 0);
6814 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6815 return NULL;
6816 }
6817 return repr;
6818}
6819
6820PyObject *
6821PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6822 Py_ssize_t size)
6823{
6824 PyObject *result;
6825 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6826 if (tmp == NULL) {
6827 return NULL;
6828 }
6829
6830 result = PyUnicode_AsUnicodeEscapeString(tmp);
6831 Py_DECREF(tmp);
6832 return result;
6833}
6834
6835/* --- Raw Unicode Escape Codec ------------------------------------------- */
6836
6837PyObject *
6838_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6839 Py_ssize_t size,
6840 const char *errors,
6841 Py_ssize_t *consumed)
6842{
6843 const char *starts = s;
6844 _PyUnicodeWriter writer;
6845 const char *end;
6846 PyObject *errorHandler = NULL;
6847 PyObject *exc = NULL;
6848
6849 if (size == 0) {
6850 if (consumed) {
6851 *consumed = 0;
6852 }
6853 _Py_RETURN_UNICODE_EMPTY();
6854 }
6855
6856 /* Escaped strings will always be longer than the resulting
6857 Unicode string, so we start with size here and then reduce the
6858 length after conversion to the true value. (But decoding error
6859 handler might have to resize the string) */
6860 _PyUnicodeWriter_Init(&writer);
6861 writer.min_length = size;
6862 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6863 goto onError;
6864 }
6865
6866 end = s + size;
6867 while (s < end) {
6868 unsigned char c = (unsigned char) *s++;
6869 Py_UCS4 ch;
6870 int count;
6871 const char *message;
6872
6873#define WRITE_CHAR(ch) \
6874 do { \
6875 if (ch <= writer.maxchar) { \
6876 assert(writer.pos < writer.size); \
6877 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6878 } \
6879 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6880 goto onError; \
6881 } \
6882 } while(0)
6883
6884 /* Non-escape characters are interpreted as Unicode ordinals */
6885 if (c != '\\' || (s >= end && !consumed)) {
6886 WRITE_CHAR(c);
6887 continue;
6888 }
6889
6890 Py_ssize_t startinpos = s - starts - 1;
6891 /* \ - Escapes */
6892 if (s >= end) {
6893 assert(consumed);
6894 // Set message to silent compiler warning.
6895 // Actually it is never used.
6896 message = "\\ at end of string";
6897 goto incomplete;
6898 }
6899
6900 c = (unsigned char) *s++;
6901 if (c == 'u') {
6902 count = 4;
6903 message = "truncated \\uXXXX escape";
6904 }
6905 else if (c == 'U') {
6906 count = 8;
6907 message = "truncated \\UXXXXXXXX escape";
6908 }
6909 else {
6910 assert(writer.pos < writer.size);
6911 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6912 WRITE_CHAR(c);
6913 continue;
6914 }
6915
6916 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6917 for (ch = 0; count; ++s, --count) {
6918 if (s >= end) {
6919 goto incomplete;
6920 }
6921 c = (unsigned char)*s;
6922 ch <<= 4;
6923 if (c >= '0' && c <= '9') {
6924 ch += c - '0';
6925 }
6926 else if (c >= 'a' && c <= 'f') {
6927 ch += c - ('a' - 10);
6928 }
6929 else if (c >= 'A' && c <= 'F') {
6930 ch += c - ('A' - 10);
6931 }
6932 else {
6933 goto error;
6934 }
6935 }
6936 if (ch > MAX_UNICODE) {
6937 message = "\\Uxxxxxxxx out of range";
6938 goto error;
6939 }
6940 WRITE_CHAR(ch);
6941 continue;
6942
6943 incomplete:
6944 if (consumed) {
6945 *consumed = startinpos;
6946 break;
6947 }
6948 error:;
6949 Py_ssize_t endinpos = s-starts;
6950 writer.min_length = end - s + writer.pos;
6951 if (unicode_decode_call_errorhandler_writer(
6952 errors, &errorHandler,
6953 "rawunicodeescape", message,
6954 &starts, &end, &startinpos, &endinpos, &exc, &s,
6955 &writer)) {
6956 goto onError;
6957 }
6958 assert(end - s <= writer.size - writer.pos);
6959
6960#undef WRITE_CHAR
6961 }
6962 Py_XDECREF(errorHandler);
6963 Py_XDECREF(exc);
6964 return _PyUnicodeWriter_Finish(&writer);
6965
6966 onError:
6967 _PyUnicodeWriter_Dealloc(&writer);
6968 Py_XDECREF(errorHandler);
6969 Py_XDECREF(exc);
6970 return NULL;
6971}
6972
6973PyObject *
6974PyUnicode_DecodeRawUnicodeEscape(const char *s,
6975 Py_ssize_t size,
6976 const char *errors)
6977{
6978 return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6979}
6980
6981
6982PyObject *
6983PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6984{
6985 PyObject *repr;
6986 char *p;
6987 Py_ssize_t expandsize, pos;
6988 int kind;
6989 const void *data;
6990 Py_ssize_t len;
6991
6992 if (!PyUnicode_Check(unicode)) {
6993 PyErr_BadArgument();
6994 return NULL;
6995 }
6996 if (PyUnicode_READY(unicode) == -1) {
6997 return NULL;
6998 }
6999 kind = PyUnicode_KIND(unicode);
7000 data = PyUnicode_DATA(unicode);
7001 len = PyUnicode_GET_LENGTH(unicode);
7002 if (kind == PyUnicode_1BYTE_KIND) {
7003 return PyBytes_FromStringAndSize(data, len);
7004 }
7005
7006 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
7007 bytes, and 1 byte characters 4. */
7008 expandsize = kind * 2 + 2;
7009
7010 if (len > PY_SSIZE_T_MAX / expandsize) {
7011 return PyErr_NoMemory();
7012 }
7013 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
7014 if (repr == NULL) {
7015 return NULL;
7016 }
7017 if (len == 0) {
7018 return repr;
7019 }
7020
7021 p = PyBytes_AS_STRING(repr);
7022 for (pos = 0; pos < len; pos++) {
7023 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7024
7025 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7026 if (ch < 0x100) {
7027 *p++ = (char) ch;
7028 }
7029 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
7030 else if (ch < 0x10000) {
7031 *p++ = '\\';
7032 *p++ = 'u';
7033 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7034 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7035 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7036 *p++ = Py_hexdigits[ch & 15];
7037 }
7038 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7039 else {
7040 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7041 *p++ = '\\';
7042 *p++ = 'U';
7043 *p++ = '0';
7044 *p++ = '0';
7045 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7046 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7047 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7048 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7049 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7050 *p++ = Py_hexdigits[ch & 15];
7051 }
7052 }
7053
7054 assert(p > PyBytes_AS_STRING(repr));
7055 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7056 return NULL;
7057 }
7058 return repr;
7059}
7060
7061PyObject *
7062PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7063 Py_ssize_t size)
7064{
7065 PyObject *result;
7066 PyObject *tmp = PyUnicode_FromWideChar(s, size);
7067 if (tmp == NULL)
7068 return NULL;
7069 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7070 Py_DECREF(tmp);
7071 return result;
7072}
7073
7074/* --- Latin-1 Codec ------------------------------------------------------ */
7075
7076PyObject *
7077PyUnicode_DecodeLatin1(const char *s,
7078 Py_ssize_t size,
7079 const char *errors)
7080{
7081 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7082 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7083}
7084
7085/* create or adjust a UnicodeEncodeError */
7086static void
7087make_encode_exception(PyObject **exceptionObject,
7088 const char *encoding,
7089 PyObject *unicode,
7090 Py_ssize_t startpos, Py_ssize_t endpos,
7091 const char *reason)
7092{
7093 if (*exceptionObject == NULL) {
7094 *exceptionObject = PyObject_CallFunction(
7095 PyExc_UnicodeEncodeError, "sOnns",
7096 encoding, unicode, startpos, endpos, reason);
7097 }
7098 else {
7099 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7100 goto onError;
7101 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7102 goto onError;
7103 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7104 goto onError;
7105 return;
7106 onError:
7107 Py_CLEAR(*exceptionObject);
7108 }
7109}
7110
7111/* raises a UnicodeEncodeError */
7112static void
7113raise_encode_exception(PyObject **exceptionObject,
7114 const char *encoding,
7115 PyObject *unicode,
7116 Py_ssize_t startpos, Py_ssize_t endpos,
7117 const char *reason)
7118{
7119 make_encode_exception(exceptionObject,
7120 encoding, unicode, startpos, endpos, reason);
7121 if (*exceptionObject != NULL)
7122 PyCodec_StrictErrors(*exceptionObject);
7123}
7124
7125/* error handling callback helper:
7126 build arguments, call the callback and check the arguments,
7127 put the result into newpos and return the replacement string, which
7128 has to be freed by the caller */
7129static PyObject *
7130unicode_encode_call_errorhandler(const char *errors,
7131 PyObject **errorHandler,
7132 const char *encoding, const char *reason,
7133 PyObject *unicode, PyObject **exceptionObject,
7134 Py_ssize_t startpos, Py_ssize_t endpos,
7135 Py_ssize_t *newpos)
7136{
7137 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7138 Py_ssize_t len;
7139 PyObject *restuple;
7140 PyObject *resunicode;
7141
7142 if (*errorHandler == NULL) {
7143 *errorHandler = PyCodec_LookupError(errors);
7144 if (*errorHandler == NULL)
7145 return NULL;
7146 }
7147
7148 if (PyUnicode_READY(unicode) == -1)
7149 return NULL;
7150 len = PyUnicode_GET_LENGTH(unicode);
7151
7152 make_encode_exception(exceptionObject,
7153 encoding, unicode, startpos, endpos, reason);
7154 if (*exceptionObject == NULL)
7155 return NULL;
7156
7157 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7158 if (restuple == NULL)
7159 return NULL;
7160 if (!PyTuple_Check(restuple)) {
7161 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7162 Py_DECREF(restuple);
7163 return NULL;
7164 }
7165 if (!PyArg_ParseTuple(restuple, argparse,
7166 &resunicode, newpos)) {
7167 Py_DECREF(restuple);
7168 return NULL;
7169 }
7170 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7171 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7172 Py_DECREF(restuple);
7173 return NULL;
7174 }
7175 if (*newpos<0)
7176 *newpos = len + *newpos;
7177 if (*newpos<0 || *newpos>len) {
7178 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7179 Py_DECREF(restuple);
7180 return NULL;
7181 }
7182 Py_INCREF(resunicode);
7183 Py_DECREF(restuple);
7184 return resunicode;
7185}
7186
/* Shared encoder for the single-byte codecs.

   Every character of `unicode` that is < `limit` is written out as one byte;
   runs of characters >= `limit` are handed to the error handler selected by
   `errors`.  The callers in this file pass limit == 256 (Latin-1) or
   limit == 128 (ASCII); any limit other than 256 is reported as "ascii" in
   error messages.

   Returns a new bytes object, or NULL with an exception set on failure. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character; this fits in the space
                   already reserved for the run */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                /* skip the whole run without writing anything (more) */
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Turn each U+DC80..U+DCFF surrogate back into the byte
                   0x80..0xFF; stop at the first character of the run that
                   is outside that range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                /* whole run handled: nothing is left for the generic path */
                if (i >= collend)
                    break;
                /* otherwise pass the remainder of the run to the generic
                   error handler below */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                /* generic path: call the registered error handler object */
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler resumes before the failing run, so the
                       characters in [newpos, collstart) are processed again;
                       ask the writer for that many extra bytes. */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    /* A str replacement must itself be encodable: 1-byte
                       kind for limit 256, pure ASCII otherwise. */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                /* continue encoding at the position chosen by the handler */
                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    /* shared cleanup for every failure after the writer was allocated */
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7368
7369/* Deprecated */
7370PyObject *
7371PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7372 Py_ssize_t size,
7373 const char *errors)
7374{
7375 PyObject *result;
7376 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7377 if (unicode == NULL)
7378 return NULL;
7379 result = unicode_encode_ucs1(unicode, errors, 256);
7380 Py_DECREF(unicode);
7381 return result;
7382}
7383
7384PyObject *
7385_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7386{
7387 if (!PyUnicode_Check(unicode)) {
7388 PyErr_BadArgument();
7389 return NULL;
7390 }
7391 if (PyUnicode_READY(unicode) == -1)
7392 return NULL;
7393 /* Fast path: if it is a one-byte string, construct
7394 bytes object directly. */
7395 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7396 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7397 PyUnicode_GET_LENGTH(unicode));
7398 /* Non-Latin-1 characters present. Defer to above function to
7399 raise the exception. */
7400 return unicode_encode_ucs1(unicode, errors, 256);
7401}
7402
7403PyObject*
7404PyUnicode_AsLatin1String(PyObject *unicode)
7405{
7406 return _PyUnicode_AsLatin1String(unicode, NULL);
7407}
7408
7409/* --- 7-bit ASCII Codec -------------------------------------------------- */
7410
/* Decode `size` bytes at `s` as 7-bit ASCII.

   Bytes below 128 map to the code point with the same ordinal.  Any other
   byte is passed to the error handler selected by `errors`; "replace",
   "surrogateescape" and "ignore" are handled inline, everything else goes
   through the generic decode error-handler helper.

   Returns a new str object, or NULL with an exception set on failure. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    const char *e = s + size;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        return get_latin1_char((unsigned char)s[0]);
    }

    // Shortcut for simple case
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    /* If ascii_decode() reports `size` bytes decoded, the whole input was
       accepted and `u` is the finished result. */
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
    if (outpos == size) {
        return u;
    }

    /* Slow path: continue from where ascii_decode() stopped, with `u` as
       the writer's buffer. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = outpos;

    s += outpos;
    /* Local copies of the writer's kind/data; they are refreshed after every
       call below that may change the writer's buffer. */
    int kind = writer.kind;
    void *data = writer.data;
    Py_ssize_t startinpos, endinpos;

    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        /* resolve the handler name only once, on the first error */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            /* "replace" writes U+FFFD; "surrogateescape" writes the byte
               value offset by 0xdc00 */
            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* drop the offending byte */
            ++s;
            break;

        default:
            /* report exactly one offending byte to the generic handler */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7509
7510/* Deprecated */
7511PyObject *
7512PyUnicode_EncodeASCII(const Py_UNICODE *p,
7513 Py_ssize_t size,
7514 const char *errors)
7515{
7516 PyObject *result;
7517 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7518 if (unicode == NULL)
7519 return NULL;
7520 result = unicode_encode_ucs1(unicode, errors, 128);
7521 Py_DECREF(unicode);
7522 return result;
7523}
7524
7525PyObject *
7526_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7527{
7528 if (!PyUnicode_Check(unicode)) {
7529 PyErr_BadArgument();
7530 return NULL;
7531 }
7532 if (PyUnicode_READY(unicode) == -1)
7533 return NULL;
7534 /* Fast path: if it is an ASCII-only string, construct bytes object
7535 directly. Else defer to above function to raise the exception. */
7536 if (PyUnicode_IS_ASCII(unicode))
7537 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7538 PyUnicode_GET_LENGTH(unicode));
7539 return unicode_encode_ucs1(unicode, errors, 128);
7540}
7541
7542PyObject *
7543PyUnicode_AsASCIIString(PyObject *unicode)
7544{
7545 return _PyUnicode_AsASCIIString(unicode, NULL);
7546}
7547
7548#ifdef MS_WINDOWS
7549
7550/* --- MBCS codecs for Windows -------------------------------------------- */
7551
7552#if SIZEOF_INT < SIZEOF_SIZE_T
7553#define NEED_RETRY
7554#endif
7555
7556/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7557 transcoding from UTF-16), but INT_MAX / 4 performs better in
7558 both cases also and avoids partial characters overrunning the
7559 length limit in MultiByteToWideChar on Windows */
7560#define DECODING_CHUNK_SIZE (INT_MAX/4)
7561
7562#ifndef WC_ERR_INVALID_CHARS
7563# define WC_ERR_INVALID_CHARS 0x0080
7564#endif
7565
7566static const char*
7567code_page_name(UINT code_page, PyObject **obj)
7568{
7569 *obj = NULL;
7570 if (code_page == CP_ACP)
7571 return "mbcs";
7572 if (code_page == CP_UTF7)
7573 return "CP_UTF7";
7574 if (code_page == CP_UTF8)
7575 return "CP_UTF8";
7576
7577 *obj = PyBytes_FromFormat("cp%u", code_page);
7578 if (*obj == NULL)
7579 return NULL;
7580 return PyBytes_AS_STRING(*obj);
7581}
7582
7583static DWORD
7584decode_code_page_flags(UINT code_page)
7585{
7586 if (code_page == CP_UTF7) {
7587 /* The CP_UTF7 decoder only supports flags=0 */
7588 return 0;
7589 }
7590 else
7591 return MB_ERR_INVALID_CHARS;
7592}
7593
7594/*
7595 * Decode a byte string from a Windows code page into unicode object in strict
7596 * mode.
7597 *
7598 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7599 * OSError and returns -1 on other error.
7600 */
7601static int
7602decode_code_page_strict(UINT code_page,
7603 wchar_t **buf,
7604 Py_ssize_t *bufsize,
7605 const char *in,
7606 int insize)
7607{
7608 DWORD flags = MB_ERR_INVALID_CHARS;
7609 wchar_t *out;
7610 DWORD outsize;
7611
7612 /* First get the size of the result */
7613 assert(insize > 0);
7614 while ((outsize = MultiByteToWideChar(code_page, flags,
7615 in, insize, NULL, 0)) <= 0)
7616 {
7617 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7618 goto error;
7619 }
7620 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7621 flags = 0;
7622 }
7623
7624 /* Extend a wchar_t* buffer */
7625 Py_ssize_t n = *bufsize; /* Get the current length */
7626 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7627 return -1;
7628 }
7629 out = *buf + n;
7630
7631 /* Do the conversion */
7632 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7633 if (outsize <= 0)
7634 goto error;
7635 return insize;
7636
7637error:
7638 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7639 return -2;
7640 PyErr_SetFromWindowsErr(0);
7641 return -1;
7642}
7643
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time and appended to the wchar_t
 * buffer *buf (current length *bufsize); bytes that cannot be decoded are
 * passed to the error handler named by `errors`.  When `errors` is NULL or
 * "strict" and `final` is true, a UnicodeDecodeError is raised right away.
 * When `final` is false, an undecodable sequence reaching the end of the
 * input stops decoding without an error and is left unconsumed.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        /* NOTE: the exception is built with start == end == 0; the exact
           failing position is not computed here. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* reserve room for Py_ARRAY_LENGTH(buffer) wide chars per input byte,
       after checking that the size computation cannot overflow */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* try 1 byte, then 2, ... until the conversion succeeds or the
           candidate sequence gets too long / runs past the input */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* retry with the same insize */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* report a single undecodable byte to the error handler; the
               handler may move `in` and reallocate *buf, so `out` is
               recomputed from `outpos` afterwards */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* success: consume the bytes and append the decoded wide
               character(s) */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    /* shared exit for success and failure; `ret` is still -1 on failure */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7773
/* Decode `size` bytes at `s` from the Windows code page `code_page`.

   Each chunk is first decoded in strict mode; if that reports a decode
   error (-2), the chunk is decoded again through the error-handler path.
   When NEED_RETRY is defined, the input is processed in chunks of at most
   DECODING_CHUNK_SIZE bytes.

   If `consumed` is not NULL it receives the number of bytes decoded and the
   last chunk is decoded as non-final; if it is NULL the last chunk is
   decoded as final.

   Returns a new str object, or NULL with an exception set on failure. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        /* more input than fits in one chunk: decode a full, non-final chunk
           and come back for the rest */
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* last (or only) chunk */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        if (chunk_size == 0 && done) {
            /* nothing left to decode: either finish with what was collected
               so far or return the empty string */
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        /* -2 means a decode error: redo the chunk with error handling */
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        s += converted;
        size -= converted;
    } while (!done);

    /* build the result from the accumulated wide characters */
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}
7841
7842PyObject *
7843PyUnicode_DecodeCodePageStateful(int code_page,
7844 const char *s,
7845 Py_ssize_t size,
7846 const char *errors,
7847 Py_ssize_t *consumed)
7848{
7849 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7850}
7851
7852PyObject *
7853PyUnicode_DecodeMBCSStateful(const char *s,
7854 Py_ssize_t size,
7855 const char *errors,
7856 Py_ssize_t *consumed)
7857{
7858 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7859}
7860
7861PyObject *
7862PyUnicode_DecodeMBCS(const char *s,
7863 Py_ssize_t size,
7864 const char *errors)
7865{
7866 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7867}
7868
7869static DWORD
7870encode_code_page_flags(UINT code_page, const char *errors)
7871{
7872 if (code_page == CP_UTF8) {
7873 return WC_ERR_INVALID_CHARS;
7874 }
7875 else if (code_page == CP_UTF7) {
7876 /* CP_UTF7 only supports flags=0 */
7877 return 0;
7878 }
7879 else {
7880 if (errors != NULL && strcmp(errors, "replace") == 0)
7881 return 0;
7882 else
7883 return WC_NO_BEST_FIT_CHARS;
7884 }
7885}
7886
/*
 * Encode a Unicode string to a Windows code page into a byte string in strict
 * mode.
 *
 * Encodes the `len` characters of `unicode` starting at `offset` and stores
 * them in *outbytes: a new bytes object is created if *outbytes is NULL,
 * otherwise the existing one is extended.  The `errors` argument is not
 * consulted; the flags are computed as for errors == NULL.
 *
 * Returns 0 on success, returns -2 on encode error, or returns -1 on other
 * error (an OSError is raised for Windows API failures).
 */
static int
encode_code_page_strict(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t offset, int len,
                        const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar = &usedDefaultChar;
    int outsize;
    wchar_t *p;
    Py_ssize_t size;
    const DWORD flags = encode_code_page_flags(code_page, NULL);
    char *out;
    /* Create a substring so that we can get the UTF-16 representation
       of just the slice under consideration. */
    PyObject *substring;
    int ret = -1;

    assert(len > 0);

    /* the "used default char" out-parameter is not passed for CP_UTF8 and
       CP_UTF7 */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    substring = PyUnicode_Substring(unicode, offset, offset+len);
    if (substring == NULL)
        return -1;
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    /* here `substring` is kept alive until `done:` and `p` is not freed
       separately */
    p = PyUnicode_AsUnicodeAndSize(substring, &size);
    if (p == NULL) {
        Py_DECREF(substring);
        return -1;
    }
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* here `p` is a separate allocation released with PyMem_Free() at
       `done:`, so the substring can be dropped immediately */
    p = PyUnicode_AsWideCharString(substring, &size);
    Py_CLEAR(substring);
    if (p == NULL) {
        return -1;
    }
#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(size <= INT_MAX);

    /* First get the size of the result */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  NULL, 0,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL) {
            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        const Py_ssize_t n = PyBytes_Size(*outbytes);
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            goto done;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
            goto done;
        }
        /* append after the existing n bytes */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Do the conversion */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  out, outsize,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }
    ret = 0;

done:
    /* release whichever resource backs `p` in this build configuration */
#if USE_UNICODE_WCHAR_CACHE
    Py_DECREF(substring);
#else /* USE_UNICODE_WCHAR_CACHE */
    PyMem_Free(p);
#endif /* USE_UNICODE_WCHAR_CACHE */
    return ret;

error:
    /* a translation failure is an encode error (-2); anything else becomes
       an OSError with ret left at -1 */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
        ret = -2;
        goto done;
    }
    PyErr_SetFromWindowsErr(0);
    goto done;
}
8002
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Encodes the `insize` characters of `unicode` starting at `unicode_offset`
 * one character at a time and stores them in *outbytes (created if NULL,
 * extended otherwise).  Characters that cannot be encoded are passed to the
 * error handler named by `errors`; when `errors` is NULL or "strict" a
 * UnicodeEncodeError is raised right away.
 *
 * Returns 0 on success, or returns -1 with an exception set (OSError for
 * Windows API failures) on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        /* NOTE: the exception is built with start == end == 0; the exact
           failing position is not computed here. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* the "used default char" out-parameter is not passed for CP_UTF8 and
       CP_UTF7 */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* reserve sizeof(buffer) output bytes per input character, after
       checking that the multiplication cannot overflow */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* build the UTF-16 form of this code point: one unit, or a
           surrogate pair for code points >= 0x10000 */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* success only if no default character was substituted */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* the character could not be encoded: ask the error handler for a
           replacement for the single character at `pos` */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* extra output space needed: replacement length plus however far
           the handler moved the position backwards (pos - newpos) */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* resizing may move the buffer, so `out` is re-derived from
                   its offset */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            /* a bytes replacement is copied to the output verbatim */
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            /* a str replacement must be pure ASCII; each character is
               written as one byte */
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        /* continue at the position chosen by the handler */
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* trim the over-allocated bytes object to the bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    /* shared exit for success and failure; `ret` is still -1 on failure */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8193
8194static PyObject *
8195encode_code_page(int code_page,
8196 PyObject *unicode,
8197 const char *errors)
8198{
8199 Py_ssize_t len;
8200 PyObject *outbytes = NULL;
8201 Py_ssize_t offset;
8202 int chunk_len, ret, done;
8203
8204 if (!PyUnicode_Check(unicode)) {
8205 PyErr_BadArgument();
8206 return NULL;
8207 }
8208
8209 if (PyUnicode_READY(unicode) == -1)
8210 return NULL;
8211 len = PyUnicode_GET_LENGTH(unicode);
8212
8213 if (code_page < 0) {
8214 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8215 return NULL;
8216 }
8217
8218 if (len == 0)
8219 return PyBytes_FromStringAndSize(NULL, 0);
8220
8221 offset = 0;
8222 do
8223 {
8224#ifdef NEED_RETRY
8225 if (len > DECODING_CHUNK_SIZE) {
8226 chunk_len = DECODING_CHUNK_SIZE;
8227 done = 0;
8228 }
8229 else
8230#endif
8231 {
8232 chunk_len = (int)len;
8233 done = 1;
8234 }
8235
8236 ret = encode_code_page_strict(code_page, &outbytes,
8237 unicode, offset, chunk_len,
8238 errors);
8239 if (ret == -2)
8240 ret = encode_code_page_errors(code_page, &outbytes,
8241 unicode, offset,
8242 chunk_len, errors);
8243 if (ret < 0) {
8244 Py_XDECREF(outbytes);
8245 return NULL;
8246 }
8247
8248 offset += chunk_len;
8249 len -= chunk_len;
8250 } while (!done);
8251
8252 return outbytes;
8253}
8254
8255PyObject *
8256PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8257 Py_ssize_t size,
8258 const char *errors)
8259{
8260 PyObject *unicode, *res;
8261 unicode = PyUnicode_FromWideChar(p, size);
8262 if (unicode == NULL)
8263 return NULL;
8264 res = encode_code_page(CP_ACP, unicode, errors);
8265 Py_DECREF(unicode);
8266 return res;
8267}
8268
8269PyObject *
8270PyUnicode_EncodeCodePage(int code_page,
8271 PyObject *unicode,
8272 const char *errors)
8273{
8274 return encode_code_page(code_page, unicode, errors);
8275}
8276
8277PyObject *
8278PyUnicode_AsMBCSString(PyObject *unicode)
8279{
8280 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8281}
8282
8283#undef NEED_RETRY
8284
8285#endif /* MS_WINDOWS */
8286
8287/* --- Character Mapping Codec -------------------------------------------- */
8288
/* Decode the `size` bytes at `s` into `writer`, using the str object
   `mapping` as a table indexed by byte value.

   In the generic path, a byte whose value is not below the table length,
   or whose table entry is 0xFFFE, is an undefined mapping and is handed to
   the `errors` handler.  Two specialised loops handle tables of at least
   256 entries stored as 1-byte or 2-byte characters.

   Return 0 on success and -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Ask the writer to accept characters up to 0xff, then
                   refresh the cached fields from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL here: nothing to release. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a 2-byte table; which one runs depends on the
               writer's current kind.  Each loop leaves via "goto Error"
               (with ch and x already loaded) when it meets a character it
               cannot store directly. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per iteration. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also entered by goto from the loops above.  Only 0xfffe counts as
           undefined; any other x falls through to the generic write. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the handler call above, so it is
               not advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8407
8408static int
8409charmap_decode_mapping(const char *s,
8410 Py_ssize_t size,
8411 PyObject *mapping,
8412 const char *errors,
8413 _PyUnicodeWriter *writer)
8414{
8415 const char *starts = s;
8416 const char *e;
8417 Py_ssize_t startinpos, endinpos;
8418 PyObject *errorHandler = NULL, *exc = NULL;
8419 unsigned char ch;
8420 PyObject *key, *item = NULL;
8421
8422 e = s + size;
8423
8424 while (s < e) {
8425 ch = *s;
8426
8427 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8428 key = PyLong_FromLong((long)ch);
8429 if (key == NULL)
8430 goto onError;
8431
8432 item = PyObject_GetItem(mapping, key);
8433 Py_DECREF(key);
8434 if (item == NULL) {
8435 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8436 /* No mapping found means: mapping is undefined. */
8437 PyErr_Clear();
8438 goto Undefined;
8439 } else
8440 goto onError;
8441 }
8442
8443 /* Apply mapping */
8444 if (item == Py_None)
8445 goto Undefined;
8446 if (PyLong_Check(item)) {
8447 long value = PyLong_AS_LONG(item);
8448 if (value == 0xFFFE)
8449 goto Undefined;
8450 if (value < 0 || value > MAX_UNICODE) {
8451 PyErr_Format(PyExc_TypeError,
8452 "character mapping must be in range(0x%x)",
8453 (unsigned long)MAX_UNICODE + 1);
8454 goto onError;
8455 }
8456
8457 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8458 goto onError;
8459 }
8460 else if (PyUnicode_Check(item)) {
8461 if (PyUnicode_READY(item) == -1)
8462 goto onError;
8463 if (PyUnicode_GET_LENGTH(item) == 1) {
8464 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8465 if (value == 0xFFFE)
8466 goto Undefined;
8467 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8468 goto onError;
8469 }
8470 else {
8471 writer->overallocate = 1;
8472 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8473 goto onError;
8474 }
8475 }
8476 else {
8477 /* wrong return value */
8478 PyErr_SetString(PyExc_TypeError,
8479 "character mapping must return integer, None or str");
8480 goto onError;
8481 }
8482 Py_CLEAR(item);
8483 ++s;
8484 continue;
8485
8486Undefined:
8487 /* undefined mapping */
8488 Py_CLEAR(item);
8489 startinpos = s-starts;
8490 endinpos = startinpos+1;
8491 if (unicode_decode_call_errorhandler_writer(
8492 errors, &errorHandler,
8493 "charmap", "character maps to <undefined>",
8494 &starts, &e, &startinpos, &endinpos, &exc, &s,
8495 writer)) {
8496 goto onError;
8497 }
8498 }
8499 Py_XDECREF(errorHandler);
8500 Py_XDECREF(exc);
8501 return 0;
8502
8503onError:
8504 Py_XDECREF(item);
8505 Py_XDECREF(errorHandler);
8506 Py_XDECREF(exc);
8507 return -1;
8508}
8509
8510PyObject *
8511PyUnicode_DecodeCharmap(const char *s,
8512 Py_ssize_t size,
8513 PyObject *mapping,
8514 const char *errors)
8515{
8516 _PyUnicodeWriter writer;
8517
8518 /* Default to Latin-1 */
8519 if (mapping == NULL)
8520 return PyUnicode_DecodeLatin1(s, size, errors);
8521
8522 if (size == 0)
8523 _Py_RETURN_UNICODE_EMPTY();
8524 _PyUnicodeWriter_Init(&writer);
8525 writer.min_length = size;
8526 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8527 goto onError;
8528
8529 if (PyUnicode_CheckExact(mapping)) {
8530 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8531 goto onError;
8532 }
8533 else {
8534 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8535 goto onError;
8536 }
8537 return _PyUnicodeWriter_Finish(&writer);
8538
8539 onError:
8540 _PyUnicodeWriter_Dealloc(&writer);
8541 return NULL;
8542}
8543
8544/* Charmap encoding: the lookup table */
8545
/* Three-level lookup table built by PyUnicode_BuildEncodingMap() and read
   by encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (ch >> 11); 0xFF marks an unused slot, any other value
       selects a 16-entry level-2 block. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
    /* Variable-length tail: 16*count2 bytes of level-2 data followed by
       128*count3 bytes of level-3 data.  Declared with one element; the
       allocation and size computations subtract that byte again. */
    unsigned char level23[1];
};
8552
8553static PyObject*
8554encoding_map_size(PyObject *obj, PyObject* args)
8555{
8556 struct encoding_map *map = (struct encoding_map*)obj;
8557 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8558 128*map->count3);
8559}
8560
8561static PyMethodDef encoding_map_methods[] = {
8562 {"size", encoding_map_size, METH_NOARGS,
8563 PyDoc_STR("Return the size (in bytes) of this object") },
8564 { 0 }
8565};
8566
/* Type object for struct encoding_map instances.  Only the name, the basic
   size, the default flags and the method table are filled in; every other
   slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    0,                      /*tp_dealloc*/
    0,                      /*tp_vectorcall_offset*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_as_async*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8610
/* Build an encoding lookup object from the decoding table `string`, where
   the character at index i is what byte value i decodes to.  Only the
   first 256 characters are considered.

   The result is either an EncodingMap trie (struct encoding_map) or, when
   the table does not fit that structure, a dict mapping each character's
   code point to its index.  Return NULL on failure; a non-str or empty
   argument is reported with PyErr_BadArgument(). */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    const void *data;
    Py_ssize_t length;
    Py_UCS4 ch;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    length = Py_MIN(length, 256);
    /* 0xFF means "no block assigned yet" in both scratch tables. */
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < length; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* Block numbers are stored in bytes where 0xFF is the "unused" marker,
       so the counts must stay below it. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: plain dict {code point: index}.  Note that this local
           `result` shadows the outer one. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The "- 1" accounts for the one byte of level23 already included in
       sizeof(struct encoding_map). */
    result = PyObject_Malloc(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    /* Level 2 starts out all "unused" (0xFF); level 3 starts out all 0,
       which encoding_map_lookup() treats as "no mapping". */
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Second pass: hand out level-3 block numbers again, in the same
       order of first use, and record each character's index. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        int o1, o2, o3, i2, i3;
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8722
8723static int
8724encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8725{
8726 struct encoding_map *map = (struct encoding_map*)mapping;
8727 int l1 = c>>11;
8728 int l2 = (c>>7) & 0xF;
8729 int l3 = c & 0x7F;
8730 int i;
8731
8732 if (c > 0xFFFF)
8733 return -1;
8734 if (c == 0)
8735 return 0;
8736 /* level 1*/
8737 i = map->level1[l1];
8738 if (i == 0xFF) {
8739 return -1;
8740 }
8741 /* level 2*/
8742 i = map->level23[16*i+l2];
8743 if (i == 0xFF) {
8744 return -1;
8745 }
8746 /* level 3 */
8747 i = map->level23[16*map->count2 + 128*i + l3];
8748 if (i == 0) {
8749 return -1;
8750 }
8751 return i;
8752}
8753
8754/* Lookup the character ch in the mapping. If the character
8755 can't be found, Py_None is returned (or NULL, if another
8756 error occurred). */
8757static PyObject *
8758charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8759{
8760 PyObject *w = PyLong_FromLong((long)c);
8761 PyObject *x;
8762
8763 if (w == NULL)
8764 return NULL;
8765 x = PyObject_GetItem(mapping, w);
8766 Py_DECREF(w);
8767 if (x == NULL) {
8768 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8769 /* No mapping found means: mapping is undefined. */
8770 PyErr_Clear();
8771 Py_RETURN_NONE;
8772 } else
8773 return NULL;
8774 }
8775 else if (x == Py_None)
8776 return x;
8777 else if (PyLong_Check(x)) {
8778 long value = PyLong_AS_LONG(x);
8779 if (value < 0 || value > 255) {
8780 PyErr_SetString(PyExc_TypeError,
8781 "character mapping must be in range(256)");
8782 Py_DECREF(x);
8783 return NULL;
8784 }
8785 return x;
8786 }
8787 else if (PyBytes_Check(x))
8788 return x;
8789 else {
8790 /* wrong return value */
8791 PyErr_Format(PyExc_TypeError,
8792 "character mapping must return integer, bytes or None, not %.400s",
8793 Py_TYPE(x)->tp_name);
8794 Py_DECREF(x);
8795 return NULL;
8796 }
8797}
8798
8799static int
8800charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8801{
8802 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8803 /* exponentially overallocate to minimize reallocations */
8804 if (requiredsize < 2*outsize)
8805 requiredsize = 2*outsize;
8806 if (_PyBytes_Resize(outobj, requiredsize))
8807 return -1;
8808 return 0;
8809}
8810
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
8814/* lookup the character, put the result in the output string and adjust
8815 various state variables. Resize the output bytes object if not enough
8816 space is available. Return a new reference to the object that
8817 was put in the output buffer, or Py_None, if the mapping was undefined
8818 (in which case no character was written) or NULL, if a
8819 reallocation error occurred. The caller must decref the result */
8820static charmapencode_result
8821charmapencode_output(Py_UCS4 c, PyObject *mapping,
8822 PyObject **outobj, Py_ssize_t *outpos)
8823{
8824 PyObject *rep;
8825 char *outstart;
8826 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8827
8828 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8829 int res = encoding_map_lookup(c, mapping);
8830 Py_ssize_t requiredsize = *outpos+1;
8831 if (res == -1)
8832 return enc_FAILED;
8833 if (outsize<requiredsize)
8834 if (charmapencode_resize(outobj, outpos, requiredsize))
8835 return enc_EXCEPTION;
8836 outstart = PyBytes_AS_STRING(*outobj);
8837 outstart[(*outpos)++] = (char)res;
8838 return enc_SUCCESS;
8839 }
8840
8841 rep = charmapencode_lookup(c, mapping);
8842 if (rep==NULL)
8843 return enc_EXCEPTION;
8844 else if (rep==Py_None) {
8845 Py_DECREF(rep);
8846 return enc_FAILED;
8847 } else {
8848 if (PyLong_Check(rep)) {
8849 Py_ssize_t requiredsize = *outpos+1;
8850 if (outsize<requiredsize)
8851 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8852 Py_DECREF(rep);
8853 return enc_EXCEPTION;
8854 }
8855 outstart = PyBytes_AS_STRING(*outobj);
8856 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8857 }
8858 else {
8859 const char *repchars = PyBytes_AS_STRING(rep);
8860 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8861 Py_ssize_t requiredsize = *outpos+repsize;
8862 if (outsize<requiredsize)
8863 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8864 Py_DECREF(rep);
8865 return enc_EXCEPTION;
8866 }
8867 outstart = PyBytes_AS_STRING(*outobj);
8868 memcpy(outstart + *outpos, repchars, repsize);
8869 *outpos += repsize;
8870 }
8871 }
8872 Py_DECREF(rep);
8873 return enc_SUCCESS;
8874}
8875
/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   On entry *inpos is the index of a character in `unicode` that `mapping`
   could not encode.  The run of consecutive unencodable characters starting
   there is collected and then handled according to `errors`; any
   replacement is appended to *res at *respos and *inpos is moved past the
   handled input.  *error_handler and *error_handler_obj cache the resolved
   handler between calls; *exceptionObject is passed to the routines that
   raise or call the handler. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            /* Trie mapping: -1 from the lookup means "unencodable". */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* Generic mapping: None from the lookup means "unencodable". */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* Emit one '?' per unencodable character; '?' itself goes through
           the mapping and may turn out to be unencodable too. */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: "&#<decimal code point>;" for each
           collected character, encoded through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* Any other handler: call it and use what it returns. */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the handler returned a str, so each of its
           characters is encoded through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
9023
9024PyObject *
9025_PyUnicode_EncodeCharmap(PyObject *unicode,
9026 PyObject *mapping,
9027 const char *errors)
9028{
9029 /* output object */
9030 PyObject *res = NULL;
9031 /* current input position */
9032 Py_ssize_t inpos = 0;
9033 Py_ssize_t size;
9034 /* current output position */
9035 Py_ssize_t respos = 0;
9036 PyObject *error_handler_obj = NULL;
9037 PyObject *exc = NULL;
9038 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
9039 const void *data;
9040 int kind;
9041
9042 if (PyUnicode_READY(unicode) == -1)
9043 return NULL;
9044 size = PyUnicode_GET_LENGTH(unicode);
9045 data = PyUnicode_DATA(unicode);
9046 kind = PyUnicode_KIND(unicode);
9047
9048 /* Default to Latin-1 */
9049 if (mapping == NULL)
9050 return unicode_encode_ucs1(unicode, errors, 256);
9051
9052 /* allocate enough for a simple encoding without
9053 replacements, if we need more, we'll resize */
9054 res = PyBytes_FromStringAndSize(NULL, size);
9055 if (res == NULL)
9056 goto onError;
9057 if (size == 0)
9058 return res;
9059
9060 while (inpos<size) {
9061 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9062 /* try to encode it */
9063 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
9064 if (x==enc_EXCEPTION) /* error */
9065 goto onError;
9066 if (x==enc_FAILED) { /* unencodable character */
9067 if (charmap_encoding_error(unicode, &inpos, mapping,
9068 &exc,
9069 &error_handler, &error_handler_obj, errors,
9070 &res, &respos)) {
9071 goto onError;
9072 }
9073 }
9074 else
9075 /* done with this character => adjust input position */
9076 ++inpos;
9077 }
9078
9079 /* Resize if we allocated to much */
9080 if (respos<PyBytes_GET_SIZE(res))
9081 if (_PyBytes_Resize(&res, respos) < 0)
9082 goto onError;
9083
9084 Py_XDECREF(exc);
9085 Py_XDECREF(error_handler_obj);
9086 return res;
9087
9088 onError:
9089 Py_XDECREF(res);
9090 Py_XDECREF(exc);
9091 Py_XDECREF(error_handler_obj);
9092 return NULL;
9093}
9094
9095/* Deprecated */
9096PyObject *
9097PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9098 Py_ssize_t size,
9099 PyObject *mapping,
9100 const char *errors)
9101{
9102 PyObject *result;
9103 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9104 if (unicode == NULL)
9105 return NULL;
9106 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9107 Py_DECREF(unicode);
9108 return result;
9109}
9110
9111PyObject *
9112PyUnicode_AsCharmapString(PyObject *unicode,
9113 PyObject *mapping)
9114{
9115 if (!PyUnicode_Check(unicode) || mapping == NULL) {
9116 PyErr_BadArgument();
9117 return NULL;
9118 }
9119 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9120}
9121
9122/* create or adjust a UnicodeTranslateError */
9123static void
9124make_translate_exception(PyObject **exceptionObject,
9125 PyObject *unicode,
9126 Py_ssize_t startpos, Py_ssize_t endpos,
9127 const char *reason)
9128{
9129 if (*exceptionObject == NULL) {
9130 *exceptionObject = _PyUnicodeTranslateError_Create(
9131 unicode, startpos, endpos, reason);
9132 }
9133 else {
9134 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9135 goto onError;
9136 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9137 goto onError;
9138 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9139 goto onError;
9140 return;
9141 onError:
9142 Py_CLEAR(*exceptionObject);
9143 }
9144}
9145
9146/* error handling callback helper:
9147 build arguments, call the callback and check the arguments,
9148 put the result into newpos and return the replacement string, which
9149 has to be freed by the caller */
9150static PyObject *
9151unicode_translate_call_errorhandler(const char *errors,
9152 PyObject **errorHandler,
9153 const char *reason,
9154 PyObject *unicode, PyObject **exceptionObject,
9155 Py_ssize_t startpos, Py_ssize_t endpos,
9156 Py_ssize_t *newpos)
9157{
9158 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9159
9160 Py_ssize_t i_newpos;
9161 PyObject *restuple;
9162 PyObject *resunicode;
9163
9164 if (*errorHandler == NULL) {
9165 *errorHandler = PyCodec_LookupError(errors);
9166 if (*errorHandler == NULL)
9167 return NULL;
9168 }
9169
9170 make_translate_exception(exceptionObject,
9171 unicode, startpos, endpos, reason);
9172 if (*exceptionObject == NULL)
9173 return NULL;
9174
9175 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9176 if (restuple == NULL)
9177 return NULL;
9178 if (!PyTuple_Check(restuple)) {
9179 PyErr_SetString(PyExc_TypeError, &argparse[3]);
9180 Py_DECREF(restuple);
9181 return NULL;
9182 }
9183 if (!PyArg_ParseTuple(restuple, argparse,
9184 &resunicode, &i_newpos)) {
9185 Py_DECREF(restuple);
9186 return NULL;
9187 }
9188 if (i_newpos<0)
9189 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9190 else
9191 *newpos = i_newpos;
9192 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9193 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9194 Py_DECREF(restuple);
9195 return NULL;
9196 }
9197 Py_INCREF(resunicode);
9198 Py_DECREF(restuple);
9199 return resunicode;
9200}
9201
9202/* Lookup the character ch in the mapping and put the result in result,
9203 which must be decrefed by the caller.
9204 Return 0 on success, -1 on error */
9205static int
9206charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
9207{
9208 PyObject *w = PyLong_FromLong((long)c);
9209 PyObject *x;
9210
9211 if (w == NULL)
9212 return -1;
9213 x = PyObject_GetItem(mapping, w);
9214 Py_DECREF(w);
9215 if (x == NULL) {
9216 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9217 /* No mapping found means: use 1:1 mapping. */
9218 PyErr_Clear();
9219 *result = NULL;
9220 return 0;
9221 } else
9222 return -1;
9223 }
9224 else if (x == Py_None) {
9225 *result = x;
9226 return 0;
9227 }
9228 else if (PyLong_Check(x)) {
9229 long value = PyLong_AS_LONG(x);
9230 if (value < 0 || value > MAX_UNICODE) {
9231 PyErr_Format(PyExc_ValueError,
9232 "character mapping must be in range(0x%x)",
9233 MAX_UNICODE+1);
9234 Py_DECREF(x);
9235 return -1;
9236 }
9237 *result = x;
9238 return 0;
9239 }
9240 else if (PyUnicode_Check(x)) {
9241 *result = x;
9242 return 0;
9243 }
9244 else {
9245 /* wrong return value */
9246 PyErr_SetString(PyExc_TypeError,
9247 "character mapping must return integer, None or str");
9248 Py_DECREF(x);
9249 return -1;
9250 }
9251}
9252
9253/* lookup the character, write the result into the writer.
9254 Return 1 if the result was written into the writer, return 0 if the mapping
9255 was undefined, raise an exception return -1 on error. */
9256static int
9257charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9258 _PyUnicodeWriter *writer)
9259{
9260 PyObject *item;
9261
9262 if (charmaptranslate_lookup(ch, mapping, &item))
9263 return -1;
9264
9265 if (item == NULL) {
9266 /* not found => default to 1:1 mapping */
9267 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9268 return -1;
9269 }
9270 return 1;
9271 }
9272
9273 if (item == Py_None) {
9274 Py_DECREF(item);
9275 return 0;
9276 }
9277
9278 if (PyLong_Check(item)) {
9279 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9280 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9281 used it */
9282 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9283 Py_DECREF(item);
9284 return -1;
9285 }
9286 Py_DECREF(item);
9287 return 1;
9288 }
9289
9290 if (!PyUnicode_Check(item)) {
9291 Py_DECREF(item);
9292 return -1;
9293 }
9294
9295 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9296 Py_DECREF(item);
9297 return -1;
9298 }
9299
9300 Py_DECREF(item);
9301 return 1;
9302}
9303
9304static int
9305unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9306 Py_UCS1 *translate)
9307{
9308 PyObject *item = NULL;
9309 int ret = 0;
9310
9311 if (charmaptranslate_lookup(ch, mapping, &item)) {
9312 return -1;
9313 }
9314
9315 if (item == Py_None) {
9316 /* deletion */
9317 translate[ch] = 0xfe;
9318 }
9319 else if (item == NULL) {
9320 /* not found => default to 1:1 mapping */
9321 translate[ch] = ch;
9322 return 1;
9323 }
9324 else if (PyLong_Check(item)) {
9325 long replace = PyLong_AS_LONG(item);
9326 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9327 used it */
9328 if (127 < replace) {
9329 /* invalid character or character outside ASCII:
9330 skip the fast translate */
9331 goto exit;
9332 }
9333 translate[ch] = (Py_UCS1)replace;
9334 }
9335 else if (PyUnicode_Check(item)) {
9336 Py_UCS4 replace;
9337
9338 if (PyUnicode_READY(item) == -1) {
9339 Py_DECREF(item);
9340 return -1;
9341 }
9342 if (PyUnicode_GET_LENGTH(item) != 1)
9343 goto exit;
9344
9345 replace = PyUnicode_READ_CHAR(item, 0);
9346 if (replace > 127)
9347 goto exit;
9348 translate[ch] = (Py_UCS1)replace;
9349 }
9350 else {
9351 /* not None, NULL, long or unicode */
9352 goto exit;
9353 }
9354 ret = 1;
9355
9356 exit:
9357 Py_DECREF(item);
9358 return ret;
9359}
9360
9361/* Fast path for ascii => ascii translation. Return 1 if the whole string
9362 was translated into writer, return 0 if the input string was partially
9363 translated into writer, raise an exception and return -1 on error. */
9364static int
9365unicode_fast_translate(PyObject *input, PyObject *mapping,
9366 _PyUnicodeWriter *writer, int ignore,
9367 Py_ssize_t *input_pos)
9368{
9369 Py_UCS1 ascii_table[128], ch, ch2;
9370 Py_ssize_t len;
9371 const Py_UCS1 *in, *end;
9372 Py_UCS1 *out;
9373 int res = 0;
9374
9375 len = PyUnicode_GET_LENGTH(input);
9376
9377 memset(ascii_table, 0xff, 128);
9378
9379 in = PyUnicode_1BYTE_DATA(input);
9380 end = in + len;
9381
9382 assert(PyUnicode_IS_ASCII(writer->buffer));
9383 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9384 out = PyUnicode_1BYTE_DATA(writer->buffer);
9385
9386 for (; in < end; in++) {
9387 ch = *in;
9388 ch2 = ascii_table[ch];
9389 if (ch2 == 0xff) {
9390 int translate = unicode_fast_translate_lookup(mapping, ch,
9391 ascii_table);
9392 if (translate < 0)
9393 return -1;
9394 if (translate == 0)
9395 goto exit;
9396 ch2 = ascii_table[ch];
9397 }
9398 if (ch2 == 0xfe) {
9399 if (ignore)
9400 continue;
9401 goto exit;
9402 }
9403 assert(ch2 < 128);
9404 *out = ch2;
9405 out++;
9406 }
9407 res = 1;
9408
9409exit:
9410 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9411 *input_pos = in - PyUnicode_1BYTE_DATA(input);
9412 return res;
9413}
9414
9415static PyObject *
9416_PyUnicode_TranslateCharmap(PyObject *input,
9417 PyObject *mapping,
9418 const char *errors)
9419{
9420 /* input object */
9421 const void *data;
9422 Py_ssize_t size, i;
9423 int kind;
9424 /* output buffer */
9425 _PyUnicodeWriter writer;
9426 /* error handler */
9427 const char *reason = "character maps to <undefined>";
9428 PyObject *errorHandler = NULL;
9429 PyObject *exc = NULL;
9430 int ignore;
9431 int res;
9432
9433 if (mapping == NULL) {
9434 PyErr_BadArgument();
9435 return NULL;
9436 }
9437
9438 if (PyUnicode_READY(input) == -1)
9439 return NULL;
9440 data = PyUnicode_DATA(input);
9441 kind = PyUnicode_KIND(input);
9442 size = PyUnicode_GET_LENGTH(input);
9443
9444 if (size == 0)
9445 return PyUnicode_FromObject(input);
9446
9447 /* allocate enough for a simple 1:1 translation without
9448 replacements, if we need more, we'll resize */
9449 _PyUnicodeWriter_Init(&writer);
9450 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9451 goto onError;
9452
9453 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9454
9455 if (PyUnicode_READY(input) == -1)
9456 return NULL;
9457 if (PyUnicode_IS_ASCII(input)) {
9458 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9459 if (res < 0) {
9460 _PyUnicodeWriter_Dealloc(&writer);
9461 return NULL;
9462 }
9463 if (res == 1)
9464 return _PyUnicodeWriter_Finish(&writer);
9465 }
9466 else {
9467 i = 0;
9468 }
9469
9470 while (i<size) {
9471 /* try to encode it */
9472 int translate;
9473 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9474 Py_ssize_t newpos;
9475 /* startpos for collecting untranslatable chars */
9476 Py_ssize_t collstart;
9477 Py_ssize_t collend;
9478 Py_UCS4 ch;
9479
9480 ch = PyUnicode_READ(kind, data, i);
9481 translate = charmaptranslate_output(ch, mapping, &writer);
9482 if (translate < 0)
9483 goto onError;
9484
9485 if (translate != 0) {
9486 /* it worked => adjust input pointer */
9487 ++i;
9488 continue;
9489 }
9490
9491 /* untranslatable character */
9492 collstart = i;
9493 collend = i+1;
9494
9495 /* find all untranslatable characters */
9496 while (collend < size) {
9497 PyObject *x;
9498 ch = PyUnicode_READ(kind, data, collend);
9499 if (charmaptranslate_lookup(ch, mapping, &x))
9500 goto onError;
9501 Py_XDECREF(x);
9502 if (x != Py_None)
9503 break;
9504 ++collend;
9505 }
9506
9507 if (ignore) {
9508 i = collend;
9509 }
9510 else {
9511 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9512 reason, input, &exc,
9513 collstart, collend, &newpos);
9514 if (repunicode == NULL)
9515 goto onError;
9516 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9517 Py_DECREF(repunicode);
9518 goto onError;
9519 }
9520 Py_DECREF(repunicode);
9521 i = newpos;
9522 }
9523 }
9524 Py_XDECREF(exc);
9525 Py_XDECREF(errorHandler);
9526 return _PyUnicodeWriter_Finish(&writer);
9527
9528 onError:
9529 _PyUnicodeWriter_Dealloc(&writer);
9530 Py_XDECREF(exc);
9531 Py_XDECREF(errorHandler);
9532 return NULL;
9533}
9534
9535/* Deprecated. Use PyUnicode_Translate instead. */
9536PyObject *
9537PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9538 Py_ssize_t size,
9539 PyObject *mapping,
9540 const char *errors)
9541{
9542 PyObject *result;
9543 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9544 if (!unicode)
9545 return NULL;
9546 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9547 Py_DECREF(unicode);
9548 return result;
9549}
9550
9551PyObject *
9552PyUnicode_Translate(PyObject *str,
9553 PyObject *mapping,
9554 const char *errors)
9555{
9556 if (ensure_unicode(str) < 0)
9557 return NULL;
9558 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9559}
9560
9561PyObject *
9562_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9563{
9564 if (!PyUnicode_Check(unicode)) {
9565 PyErr_BadInternalCall();
9566 return NULL;
9567 }
9568 if (PyUnicode_READY(unicode) == -1)
9569 return NULL;
9570 if (PyUnicode_IS_ASCII(unicode)) {
9571 /* If the string is already ASCII, just return the same string */
9572 Py_INCREF(unicode);
9573 return unicode;
9574 }
9575
9576 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9577 PyObject *result = PyUnicode_New(len, 127);
9578 if (result == NULL) {
9579 return NULL;
9580 }
9581
9582 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9583 int kind = PyUnicode_KIND(unicode);
9584 const void *data = PyUnicode_DATA(unicode);
9585 Py_ssize_t i;
9586 for (i = 0; i < len; ++i) {
9587 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9588 if (ch < 127) {
9589 out[i] = ch;
9590 }
9591 else if (Py_UNICODE_ISSPACE(ch)) {
9592 out[i] = ' ';
9593 }
9594 else {
9595 int decimal = Py_UNICODE_TODECIMAL(ch);
9596 if (decimal < 0) {
9597 out[i] = '?';
9598 out[i+1] = '\0';
9599 _PyUnicode_LENGTH(result) = i + 1;
9600 break;
9601 }
9602 out[i] = '0' + decimal;
9603 }
9604 }
9605
9606 assert(_PyUnicode_CheckConsistency(result, 1));
9607 return result;
9608}
9609
9610PyObject *
9611PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9612 Py_ssize_t length)
9613{
9614 PyObject *decimal;
9615 Py_ssize_t i;
9616 Py_UCS4 maxchar;
9617 enum PyUnicode_Kind kind;
9618 const void *data;
9619
9620 maxchar = 127;
9621 for (i = 0; i < length; i++) {
9622 Py_UCS4 ch = s[i];
9623 if (ch > 127) {
9624 int decimal = Py_UNICODE_TODECIMAL(ch);
9625 if (decimal >= 0)
9626 ch = '0' + decimal;
9627 maxchar = Py_MAX(maxchar, ch);
9628 }
9629 }
9630
9631 /* Copy to a new string */
9632 decimal = PyUnicode_New(length, maxchar);
9633 if (decimal == NULL)
9634 return decimal;
9635 kind = PyUnicode_KIND(decimal);
9636 data = PyUnicode_DATA(decimal);
9637 /* Iterate over code points */
9638 for (i = 0; i < length; i++) {
9639 Py_UCS4 ch = s[i];
9640 if (ch > 127) {
9641 int decimal = Py_UNICODE_TODECIMAL(ch);
9642 if (decimal >= 0)
9643 ch = '0' + decimal;
9644 }
9645 PyUnicode_WRITE(kind, data, i, ch);
9646 }
9647 return unicode_result(decimal);
9648}
9649/* --- Decimal Encoder ---------------------------------------------------- */
9650
9651int
9652PyUnicode_EncodeDecimal(Py_UNICODE *s,
9653 Py_ssize_t length,
9654 char *output,
9655 const char *errors)
9656{
9657 PyObject *unicode;
9658 Py_ssize_t i;
9659 enum PyUnicode_Kind kind;
9660 const void *data;
9661
9662 if (output == NULL) {
9663 PyErr_BadArgument();
9664 return -1;
9665 }
9666
9667 unicode = PyUnicode_FromWideChar(s, length);
9668 if (unicode == NULL)
9669 return -1;
9670
9671 kind = PyUnicode_KIND(unicode);
9672 data = PyUnicode_DATA(unicode);
9673
9674 for (i=0; i < length; ) {
9675 PyObject *exc;
9676 Py_UCS4 ch;
9677 int decimal;
9678 Py_ssize_t startpos;
9679
9680 ch = PyUnicode_READ(kind, data, i);
9681
9682 if (Py_UNICODE_ISSPACE(ch)) {
9683 *output++ = ' ';
9684 i++;
9685 continue;
9686 }
9687 decimal = Py_UNICODE_TODECIMAL(ch);
9688 if (decimal >= 0) {
9689 *output++ = '0' + decimal;
9690 i++;
9691 continue;
9692 }
9693 if (0 < ch && ch < 256) {
9694 *output++ = (char)ch;
9695 i++;
9696 continue;
9697 }
9698
9699 startpos = i;
9700 exc = NULL;
9701 raise_encode_exception(&exc, "decimal", unicode,
9702 startpos, startpos+1,
9703 "invalid decimal Unicode string");
9704 Py_XDECREF(exc);
9705 Py_DECREF(unicode);
9706 return -1;
9707 }
9708 /* 0-terminate the output string */
9709 *output++ = '\0';
9710 Py_DECREF(unicode);
9711 return 0;
9712}
9713
9714/* --- Helpers ------------------------------------------------------------ */
9715
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len` when larger; a negative `end` or `start` is
   made relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues; all three arguments may be evaluated more than once.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be used safely in an unbraced if/else;
   invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9730
9731static Py_ssize_t
9732any_find_slice(PyObject* s1, PyObject* s2,
9733 Py_ssize_t start,
9734 Py_ssize_t end,
9735 int direction)
9736{
9737 int kind1, kind2;
9738 const void *buf1, *buf2;
9739 Py_ssize_t len1, len2, result;
9740
9741 kind1 = PyUnicode_KIND(s1);
9742 kind2 = PyUnicode_KIND(s2);
9743 if (kind1 < kind2)
9744 return -1;
9745
9746 len1 = PyUnicode_GET_LENGTH(s1);
9747 len2 = PyUnicode_GET_LENGTH(s2);
9748 ADJUST_INDICES(start, end, len1);
9749 if (end - start < len2)
9750 return -1;
9751
9752 buf1 = PyUnicode_DATA(s1);
9753 buf2 = PyUnicode_DATA(s2);
9754 if (len2 == 1) {
9755 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9756 result = findchar((const char *)buf1 + kind1*start,
9757 kind1, end - start, ch, direction);
9758 if (result == -1)
9759 return -1;
9760 else
9761 return start + result;
9762 }
9763
9764 if (kind2 != kind1) {
9765 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9766 if (!buf2)
9767 return -2;
9768 }
9769
9770 if (direction > 0) {
9771 switch (kind1) {
9772 case PyUnicode_1BYTE_KIND:
9773 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9774 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9775 else
9776 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9777 break;
9778 case PyUnicode_2BYTE_KIND:
9779 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9780 break;
9781 case PyUnicode_4BYTE_KIND:
9782 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9783 break;
9784 default:
9785 Py_UNREACHABLE();
9786 }
9787 }
9788 else {
9789 switch (kind1) {
9790 case PyUnicode_1BYTE_KIND:
9791 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9792 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9793 else
9794 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9795 break;
9796 case PyUnicode_2BYTE_KIND:
9797 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9798 break;
9799 case PyUnicode_4BYTE_KIND:
9800 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9801 break;
9802 default:
9803 Py_UNREACHABLE();
9804 }
9805 }
9806
9807 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9808 if (kind2 != kind1)
9809 PyMem_Free((void *)buf2);
9810
9811 return result;
9812}
9813
9814/* _PyUnicode_InsertThousandsGrouping() helper functions */
9815#include "stringlib/localeutil.h"
9816
9817/**
9818 * InsertThousandsGrouping:
9819 * @writer: Unicode writer.
9820 * @n_buffer: Number of characters in @buffer.
9821 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9822 * @d_pos: Start of digits string.
9823 * @n_digits: The number of digits in the string, in which we want
9824 * to put the grouping chars.
9825 * @min_width: The minimum width of the digits in the output string.
9826 * Output will be zero-padded on the left to fill.
9827 * @grouping: see definition in localeconv().
9828 * @thousands_sep: see definition in localeconv().
9829 *
9830 * There are 2 modes: counting and filling. If @writer is NULL,
9831 * we are in counting mode, else filling mode.
9832 * If counting, the required buffer size is returned.
9833 * If filling, we know the buffer will be large enough, so we don't
9834 * need to pass in the buffer size.
9835 * Inserts thousand grouping characters (as defined by grouping and
9836 * thousands_sep) into @writer.
9837 *
9838 * Return value: -1 on error, number of characters otherwise.
9839 **/
9840Py_ssize_t
9841_PyUnicode_InsertThousandsGrouping(
9842 _PyUnicodeWriter *writer,
9843 Py_ssize_t n_buffer,
9844 PyObject *digits,
9845 Py_ssize_t d_pos,
9846 Py_ssize_t n_digits,
9847 Py_ssize_t min_width,
9848 const char *grouping,
9849 PyObject *thousands_sep,
9850 Py_UCS4 *maxchar)
9851{
9852 min_width = Py_MAX(0, min_width);
9853 if (writer) {
9854 assert(digits != NULL);
9855 assert(maxchar == NULL);
9856 }
9857 else {
9858 assert(digits == NULL);
9859 assert(maxchar != NULL);
9860 }
9861 assert(0 <= d_pos);
9862 assert(0 <= n_digits);
9863 assert(grouping != NULL);
9864
9865 if (digits != NULL) {
9866 if (PyUnicode_READY(digits) == -1) {
9867 return -1;
9868 }
9869 }
9870 if (PyUnicode_READY(thousands_sep) == -1) {
9871 return -1;
9872 }
9873
9874 Py_ssize_t count = 0;
9875 Py_ssize_t n_zeros;
9876 int loop_broken = 0;
9877 int use_separator = 0; /* First time through, don't append the
9878 separator. They only go between
9879 groups. */
9880 Py_ssize_t buffer_pos;
9881 Py_ssize_t digits_pos;
9882 Py_ssize_t len;
9883 Py_ssize_t n_chars;
9884 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9885 be looked at */
9886 /* A generator that returns all of the grouping widths, until it
9887 returns 0. */
9888 GroupGenerator groupgen;
9889 GroupGenerator_init(&groupgen, grouping);
9890 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9891
9892 /* if digits are not grouped, thousands separator
9893 should be an empty string */
9894 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9895
9896 digits_pos = d_pos + n_digits;
9897 if (writer) {
9898 buffer_pos = writer->pos + n_buffer;
9899 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9900 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9901 }
9902 else {
9903 buffer_pos = n_buffer;
9904 }
9905
9906 if (!writer) {
9907 *maxchar = 127;
9908 }
9909
9910 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9911 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9912 n_zeros = Py_MAX(0, len - remaining);
9913 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9914
9915 /* Use n_zero zero's and n_chars chars */
9916
9917 /* Count only, don't do anything. */
9918 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9919
9920 /* Copy into the writer. */
9921 InsertThousandsGrouping_fill(writer, &buffer_pos,
9922 digits, &digits_pos,
9923 n_chars, n_zeros,
9924 use_separator ? thousands_sep : NULL,
9925 thousands_sep_len, maxchar);
9926
9927 /* Use a separator next time. */
9928 use_separator = 1;
9929
9930 remaining -= n_chars;
9931 min_width -= len;
9932
9933 if (remaining <= 0 && min_width <= 0) {
9934 loop_broken = 1;
9935 break;
9936 }
9937 min_width -= thousands_sep_len;
9938 }
9939 if (!loop_broken) {
9940 /* We left the loop without using a break statement. */
9941
9942 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9943 n_zeros = Py_MAX(0, len - remaining);
9944 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9945
9946 /* Use n_zero zero's and n_chars chars */
9947 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9948
9949 /* Copy into the writer. */
9950 InsertThousandsGrouping_fill(writer, &buffer_pos,
9951 digits, &digits_pos,
9952 n_chars, n_zeros,
9953 use_separator ? thousands_sep : NULL,
9954 thousands_sep_len, maxchar);
9955 }
9956 return count;
9957}
9958
9959
9960Py_ssize_t
9961PyUnicode_Count(PyObject *str,
9962 PyObject *substr,
9963 Py_ssize_t start,
9964 Py_ssize_t end)
9965{
9966 Py_ssize_t result;
9967 int kind1, kind2;
9968 const void *buf1 = NULL, *buf2 = NULL;
9969 Py_ssize_t len1, len2;
9970
9971 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9972 return -1;
9973
9974 kind1 = PyUnicode_KIND(str);
9975 kind2 = PyUnicode_KIND(substr);
9976 if (kind1 < kind2)
9977 return 0;
9978
9979 len1 = PyUnicode_GET_LENGTH(str);
9980 len2 = PyUnicode_GET_LENGTH(substr);
9981 ADJUST_INDICES(start, end, len1);
9982 if (end - start < len2)
9983 return 0;
9984
9985 buf1 = PyUnicode_DATA(str);
9986 buf2 = PyUnicode_DATA(substr);
9987 if (kind2 != kind1) {
9988 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9989 if (!buf2)
9990 goto onError;
9991 }
9992
9993 switch (kind1) {
9994 case PyUnicode_1BYTE_KIND:
9995 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9996 result = asciilib_count(
9997 ((const Py_UCS1*)buf1) + start, end - start,
9998 buf2, len2, PY_SSIZE_T_MAX
9999 );
10000 else
10001 result = ucs1lib_count(
10002 ((const Py_UCS1*)buf1) + start, end - start,
10003 buf2, len2, PY_SSIZE_T_MAX
10004 );
10005 break;
10006 case PyUnicode_2BYTE_KIND:
10007 result = ucs2lib_count(
10008 ((const Py_UCS2*)buf1) + start, end - start,
10009 buf2, len2, PY_SSIZE_T_MAX
10010 );
10011 break;
10012 case PyUnicode_4BYTE_KIND:
10013 result = ucs4lib_count(
10014 ((const Py_UCS4*)buf1) + start, end - start,
10015 buf2, len2, PY_SSIZE_T_MAX
10016 );
10017 break;
10018 default:
10019 Py_UNREACHABLE();
10020 }
10021
10022 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10023 if (kind2 != kind1)
10024 PyMem_Free((void *)buf2);
10025
10026 return result;
10027 onError:
10028 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10029 if (kind2 != kind1)
10030 PyMem_Free((void *)buf2);
10031 return -1;
10032}
10033
10034Py_ssize_t
10035PyUnicode_Find(PyObject *str,
10036 PyObject *substr,
10037 Py_ssize_t start,
10038 Py_ssize_t end,
10039 int direction)
10040{
10041 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10042 return -2;
10043
10044 return any_find_slice(str, substr, start, end, direction);
10045}
10046
10047Py_ssize_t
10048PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
10049 Py_ssize_t start, Py_ssize_t end,
10050 int direction)
10051{
10052 int kind;
10053 Py_ssize_t len, result;
10054 if (PyUnicode_READY(str) == -1)
10055 return -2;
10056 len = PyUnicode_GET_LENGTH(str);
10057 ADJUST_INDICES(start, end, len);
10058 if (end - start < 1)
10059 return -1;
10060 kind = PyUnicode_KIND(str);
10061 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10062 kind, end-start, ch, direction);
10063 if (result == -1)
10064 return -1;
10065 else
10066 return start + result;
10067}
10068
10069static int
10070tailmatch(PyObject *self,
10071 PyObject *substring,
10072 Py_ssize_t start,
10073 Py_ssize_t end,
10074 int direction)
10075{
10076 int kind_self;
10077 int kind_sub;
10078 const void *data_self;
10079 const void *data_sub;
10080 Py_ssize_t offset;
10081 Py_ssize_t i;
10082 Py_ssize_t end_sub;
10083
10084 if (PyUnicode_READY(self) == -1 ||
10085 PyUnicode_READY(substring) == -1)
10086 return -1;
10087
10088 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10089 end -= PyUnicode_GET_LENGTH(substring);
10090 if (end < start)
10091 return 0;
10092
10093 if (PyUnicode_GET_LENGTH(substring) == 0)
10094 return 1;
10095
10096 kind_self = PyUnicode_KIND(self);
10097 data_self = PyUnicode_DATA(self);
10098 kind_sub = PyUnicode_KIND(substring);
10099 data_sub = PyUnicode_DATA(substring);
10100 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10101
10102 if (direction > 0)
10103 offset = end;
10104 else
10105 offset = start;
10106
10107 if (PyUnicode_READ(kind_self, data_self, offset) ==
10108 PyUnicode_READ(kind_sub, data_sub, 0) &&
10109 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10110 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10111 /* If both are of the same kind, memcmp is sufficient */
10112 if (kind_self == kind_sub) {
10113 return ! memcmp((char *)data_self +
10114 (offset * PyUnicode_KIND(substring)),
10115 data_sub,
10116 PyUnicode_GET_LENGTH(substring) *
10117 PyUnicode_KIND(substring));
10118 }
10119 /* otherwise we have to compare each character by first accessing it */
10120 else {
10121 /* We do not need to compare 0 and len(substring)-1 because
10122 the if statement above ensured already that they are equal
10123 when we end up here. */
10124 for (i = 1; i < end_sub; ++i) {
10125 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10126 PyUnicode_READ(kind_sub, data_sub, i))
10127 return 0;
10128 }
10129 return 1;
10130 }
10131 }
10132
10133 return 0;
10134}
10135
10136Py_ssize_t
10137PyUnicode_Tailmatch(PyObject *str,
10138 PyObject *substr,
10139 Py_ssize_t start,
10140 Py_ssize_t end,
10141 int direction)
10142{
10143 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10144 return -1;
10145
10146 return tailmatch(str, substr, start, end, direction);
10147}
10148
10149static PyObject *
10150ascii_upper_or_lower(PyObject *self, int lower)
10151{
10152 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
10153 const char *data = PyUnicode_DATA(self);
10154 char *resdata;
10155 PyObject *res;
10156
10157 res = PyUnicode_New(len, 127);
10158 if (res == NULL)
10159 return NULL;
10160 resdata = PyUnicode_DATA(res);
10161 if (lower)
10162 _Py_bytes_lower(resdata, data, len);
10163 else
10164 _Py_bytes_upper(resdata, data, len);
10165 return res;
10166}
10167
10168static Py_UCS4
10169handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10170{
10171 Py_ssize_t j;
10172 int final_sigma;
10173 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
10174 /* U+03A3 is in the Final_Sigma context when, it is found like this:
10175
10176 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10177
10178 where ! is a negation and \p{xxx} is a character with property xxx.
10179 */
10180 for (j = i - 1; j >= 0; j--) {
10181 c = PyUnicode_READ(kind, data, j);
10182 if (!_PyUnicode_IsCaseIgnorable(c))
10183 break;
10184 }
10185 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10186 if (final_sigma) {
10187 for (j = i + 1; j < length; j++) {
10188 c = PyUnicode_READ(kind, data, j);
10189 if (!_PyUnicode_IsCaseIgnorable(c))
10190 break;
10191 }
10192 final_sigma = j == length || !_PyUnicode_IsCased(c);
10193 }
10194 return (final_sigma) ? 0x3C2 : 0x3C3;
10195}
10196
10197static int
10198lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10199 Py_UCS4 c, Py_UCS4 *mapped)
10200{
10201 /* Obscure special case. */
10202 if (c == 0x3A3) {
10203 mapped[0] = handle_capital_sigma(kind, data, length, i);
10204 return 1;
10205 }
10206 return _PyUnicode_ToLowerFull(c, mapped);
10207}
10208
10209static Py_ssize_t
10210do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10211{
10212 Py_ssize_t i, k = 0;
10213 int n_res, j;
10214 Py_UCS4 c, mapped[3];
10215
10216 c = PyUnicode_READ(kind, data, 0);
10217 n_res = _PyUnicode_ToTitleFull(c, mapped);
10218 for (j = 0; j < n_res; j++) {
10219 *maxchar = Py_MAX(*maxchar, mapped[j]);
10220 res[k++] = mapped[j];
10221 }
10222 for (i = 1; i < length; i++) {
10223 c = PyUnicode_READ(kind, data, i);
10224 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10225 for (j = 0; j < n_res; j++) {
10226 *maxchar = Py_MAX(*maxchar, mapped[j]);
10227 res[k++] = mapped[j];
10228 }
10229 }
10230 return k;
10231}
10232
10233static Py_ssize_t
10234do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10235 Py_ssize_t i, k = 0;
10236
10237 for (i = 0; i < length; i++) {
10238 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10239 int n_res, j;
10240 if (Py_UNICODE_ISUPPER(c)) {
10241 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10242 }
10243 else if (Py_UNICODE_ISLOWER(c)) {
10244 n_res = _PyUnicode_ToUpperFull(c, mapped);
10245 }
10246 else {
10247 n_res = 1;
10248 mapped[0] = c;
10249 }
10250 for (j = 0; j < n_res; j++) {
10251 *maxchar = Py_MAX(*maxchar, mapped[j]);
10252 res[k++] = mapped[j];
10253 }
10254 }
10255 return k;
10256}
10257
10258static Py_ssize_t
10259do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10260 Py_UCS4 *maxchar, int lower)
10261{
10262 Py_ssize_t i, k = 0;
10263
10264 for (i = 0; i < length; i++) {
10265 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10266 int n_res, j;
10267 if (lower)
10268 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10269 else
10270 n_res = _PyUnicode_ToUpperFull(c, mapped);
10271 for (j = 0; j < n_res; j++) {
10272 *maxchar = Py_MAX(*maxchar, mapped[j]);
10273 res[k++] = mapped[j];
10274 }
10275 }
10276 return k;
10277}
10278
10279static Py_ssize_t
10280do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10281{
10282 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10283}
10284
10285static Py_ssize_t
10286do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10287{
10288 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10289}
10290
10291static Py_ssize_t
10292do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10293{
10294 Py_ssize_t i, k = 0;
10295
10296 for (i = 0; i < length; i++) {
10297 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10298 Py_UCS4 mapped[3];
10299 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10300 for (j = 0; j < n_res; j++) {
10301 *maxchar = Py_MAX(*maxchar, mapped[j]);
10302 res[k++] = mapped[j];
10303 }
10304 }
10305 return k;
10306}
10307
10308static Py_ssize_t
10309do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10310{
10311 Py_ssize_t i, k = 0;
10312 int previous_is_cased;
10313
10314 previous_is_cased = 0;
10315 for (i = 0; i < length; i++) {
10316 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10317 Py_UCS4 mapped[3];
10318 int n_res, j;
10319
10320 if (previous_is_cased)
10321 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10322 else
10323 n_res = _PyUnicode_ToTitleFull(c, mapped);
10324
10325 for (j = 0; j < n_res; j++) {
10326 *maxchar = Py_MAX(*maxchar, mapped[j]);
10327 res[k++] = mapped[j];
10328 }
10329
10330 previous_is_cased = _PyUnicode_IsCased(c);
10331 }
10332 return k;
10333}
10334
10335static PyObject *
10336case_operation(PyObject *self,
10337 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10338{
10339 PyObject *res = NULL;
10340 Py_ssize_t length, newlength = 0;
10341 int kind, outkind;
10342 const void *data;
10343 void *outdata;
10344 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10345
10346 assert(PyUnicode_IS_READY(self));
10347
10348 kind = PyUnicode_KIND(self);
10349 data = PyUnicode_DATA(self);
10350 length = PyUnicode_GET_LENGTH(self);
10351 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10352 PyErr_SetString(PyExc_OverflowError, "string is too long");
10353 return NULL;
10354 }
10355 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10356 if (tmp == NULL)
10357 return PyErr_NoMemory();
10358 newlength = perform(kind, data, length, tmp, &maxchar);
10359 res = PyUnicode_New(newlength, maxchar);
10360 if (res == NULL)
10361 goto leave;
10362 tmpend = tmp + newlength;
10363 outdata = PyUnicode_DATA(res);
10364 outkind = PyUnicode_KIND(res);
10365 switch (outkind) {
10366 case PyUnicode_1BYTE_KIND:
10367 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10368 break;
10369 case PyUnicode_2BYTE_KIND:
10370 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10371 break;
10372 case PyUnicode_4BYTE_KIND:
10373 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10374 break;
10375 default:
10376 Py_UNREACHABLE();
10377 }
10378 leave:
10379 PyMem_Free(tmp);
10380 return res;
10381}
10382
10383PyObject *
10384PyUnicode_Join(PyObject *separator, PyObject *seq)
10385{
10386 PyObject *res;
10387 PyObject *fseq;
10388 Py_ssize_t seqlen;
10389 PyObject **items;
10390
10391 fseq = PySequence_Fast(seq, "can only join an iterable");
10392 if (fseq == NULL) {
10393 return NULL;
10394 }
10395
10396 /* NOTE: the following code can't call back into Python code,
10397 * so we are sure that fseq won't be mutated.
10398 */
10399
10400 items = PySequence_Fast_ITEMS(fseq);
10401 seqlen = PySequence_Fast_GET_SIZE(fseq);
10402 res = _PyUnicode_JoinArray(separator, items, seqlen);
10403 Py_DECREF(fseq);
10404 return res;
10405}
10406
10407PyObject *
10408_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10409{
10410 PyObject *res = NULL; /* the result */
10411 PyObject *sep = NULL;
10412 Py_ssize_t seplen;
10413 PyObject *item;
10414 Py_ssize_t sz, i, res_offset;
10415 Py_UCS4 maxchar;
10416 Py_UCS4 item_maxchar;
10417 int use_memcpy;
10418 unsigned char *res_data = NULL, *sep_data = NULL;
10419 PyObject *last_obj;
10420 unsigned int kind = 0;
10421
10422 /* If empty sequence, return u"". */
10423 if (seqlen == 0) {
10424 _Py_RETURN_UNICODE_EMPTY();
10425 }
10426
10427 /* If singleton sequence with an exact Unicode, return that. */
10428 last_obj = NULL;
10429 if (seqlen == 1) {
10430 if (PyUnicode_CheckExact(items[0])) {
10431 res = items[0];
10432 Py_INCREF(res);
10433 return res;
10434 }
10435 seplen = 0;
10436 maxchar = 0;
10437 }
10438 else {
10439 /* Set up sep and seplen */
10440 if (separator == NULL) {
10441 /* fall back to a blank space separator */
10442 sep = PyUnicode_FromOrdinal(' ');
10443 if (!sep)
10444 goto onError;
10445 seplen = 1;
10446 maxchar = 32;
10447 }
10448 else {
10449 if (!PyUnicode_Check(separator)) {
10450 PyErr_Format(PyExc_TypeError,
10451 "separator: expected str instance,"
10452 " %.80s found",
10453 Py_TYPE(separator)->tp_name);
10454 goto onError;
10455 }
10456 if (PyUnicode_READY(separator))
10457 goto onError;
10458 sep = separator;
10459 seplen = PyUnicode_GET_LENGTH(separator);
10460 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10461 /* inc refcount to keep this code path symmetric with the
10462 above case of a blank separator */
10463 Py_INCREF(sep);
10464 }
10465 last_obj = sep;
10466 }
10467
10468 /* There are at least two things to join, or else we have a subclass
10469 * of str in the sequence.
10470 * Do a pre-pass to figure out the total amount of space we'll
10471 * need (sz), and see whether all argument are strings.
10472 */
10473 sz = 0;
10474#ifdef Py_DEBUG
10475 use_memcpy = 0;
10476#else
10477 use_memcpy = 1;
10478#endif
10479 for (i = 0; i < seqlen; i++) {
10480 size_t add_sz;
10481 item = items[i];
10482 if (!PyUnicode_Check(item)) {
10483 PyErr_Format(PyExc_TypeError,
10484 "sequence item %zd: expected str instance,"
10485 " %.80s found",
10486 i, Py_TYPE(item)->tp_name);
10487 goto onError;
10488 }
10489 if (PyUnicode_READY(item) == -1)
10490 goto onError;
10491 add_sz = PyUnicode_GET_LENGTH(item);
10492 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10493 maxchar = Py_MAX(maxchar, item_maxchar);
10494 if (i != 0) {
10495 add_sz += seplen;
10496 }
10497 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10498 PyErr_SetString(PyExc_OverflowError,
10499 "join() result is too long for a Python string");
10500 goto onError;
10501 }
10502 sz += add_sz;
10503 if (use_memcpy && last_obj != NULL) {
10504 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10505 use_memcpy = 0;
10506 }
10507 last_obj = item;
10508 }
10509
10510 res = PyUnicode_New(sz, maxchar);
10511 if (res == NULL)
10512 goto onError;
10513
10514 /* Catenate everything. */
10515#ifdef Py_DEBUG
10516 use_memcpy = 0;
10517#else
10518 if (use_memcpy) {
10519 res_data = PyUnicode_1BYTE_DATA(res);
10520 kind = PyUnicode_KIND(res);
10521 if (seplen != 0)
10522 sep_data = PyUnicode_1BYTE_DATA(sep);
10523 }
10524#endif
10525 if (use_memcpy) {
10526 for (i = 0; i < seqlen; ++i) {
10527 Py_ssize_t itemlen;
10528 item = items[i];
10529
10530 /* Copy item, and maybe the separator. */
10531 if (i && seplen != 0) {
10532 memcpy(res_data,
10533 sep_data,
10534 kind * seplen);
10535 res_data += kind * seplen;
10536 }
10537
10538 itemlen = PyUnicode_GET_LENGTH(item);
10539 if (itemlen != 0) {
10540 memcpy(res_data,
10541 PyUnicode_DATA(item),
10542 kind * itemlen);
10543 res_data += kind * itemlen;
10544 }
10545 }
10546 assert(res_data == PyUnicode_1BYTE_DATA(res)
10547 + kind * PyUnicode_GET_LENGTH(res));
10548 }
10549 else {
10550 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10551 Py_ssize_t itemlen;
10552 item = items[i];
10553
10554 /* Copy item, and maybe the separator. */
10555 if (i && seplen != 0) {
10556 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10557 res_offset += seplen;
10558 }
10559
10560 itemlen = PyUnicode_GET_LENGTH(item);
10561 if (itemlen != 0) {
10562 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10563 res_offset += itemlen;
10564 }
10565 }
10566 assert(res_offset == PyUnicode_GET_LENGTH(res));
10567 }
10568
10569 Py_XDECREF(sep);
10570 assert(_PyUnicode_CheckConsistency(res, 1));
10571 return res;
10572
10573 onError:
10574 Py_XDECREF(sep);
10575 Py_XDECREF(res);
10576 return NULL;
10577}
10578
10579void
10580_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10581 Py_UCS4 fill_char)
10582{
10583 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10584 void *data = PyUnicode_DATA(unicode);
10585 assert(PyUnicode_IS_READY(unicode));
10586 assert(unicode_modifiable(unicode));
10587 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10588 assert(start >= 0);
10589 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10590 unicode_fill(kind, data, fill_char, start, length);
10591}
10592
10593Py_ssize_t
10594PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10595 Py_UCS4 fill_char)
10596{
10597 Py_ssize_t maxlen;
10598
10599 if (!PyUnicode_Check(unicode)) {
10600 PyErr_BadInternalCall();
10601 return -1;
10602 }
10603 if (PyUnicode_READY(unicode) == -1)
10604 return -1;
10605 if (unicode_check_modifiable(unicode))
10606 return -1;
10607
10608 if (start < 0) {
10609 PyErr_SetString(PyExc_IndexError, "string index out of range");
10610 return -1;
10611 }
10612 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10613 PyErr_SetString(PyExc_ValueError,
10614 "fill character is bigger than "
10615 "the string maximum character");
10616 return -1;
10617 }
10618
10619 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10620 length = Py_MIN(maxlen, length);
10621 if (length <= 0)
10622 return 0;
10623
10624 _PyUnicode_FastFill(unicode, start, length, fill_char);
10625 return length;
10626}
10627
10628static PyObject *
10629pad(PyObject *self,
10630 Py_ssize_t left,
10631 Py_ssize_t right,
10632 Py_UCS4 fill)
10633{
10634 PyObject *u;
10635 Py_UCS4 maxchar;
10636 int kind;
10637 void *data;
10638
10639 if (left < 0)
10640 left = 0;
10641 if (right < 0)
10642 right = 0;
10643
10644 if (left == 0 && right == 0)
10645 return unicode_result_unchanged(self);
10646
10647 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10648 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10649 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10650 return NULL;
10651 }
10652 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10653 maxchar = Py_MAX(maxchar, fill);
10654 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10655 if (!u)
10656 return NULL;
10657
10658 kind = PyUnicode_KIND(u);
10659 data = PyUnicode_DATA(u);
10660 if (left)
10661 unicode_fill(kind, data, fill, 0, left);
10662 if (right)
10663 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10664 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10665 assert(_PyUnicode_CheckConsistency(u, 1));
10666 return u;
10667}
10668
10669PyObject *
10670PyUnicode_Splitlines(PyObject *string, int keepends)
10671{
10672 PyObject *list;
10673
10674 if (ensure_unicode(string) < 0)
10675 return NULL;
10676
10677 switch (PyUnicode_KIND(string)) {
10678 case PyUnicode_1BYTE_KIND:
10679 if (PyUnicode_IS_ASCII(string))
10680 list = asciilib_splitlines(
10681 string, PyUnicode_1BYTE_DATA(string),
10682 PyUnicode_GET_LENGTH(string), keepends);
10683 else
10684 list = ucs1lib_splitlines(
10685 string, PyUnicode_1BYTE_DATA(string),
10686 PyUnicode_GET_LENGTH(string), keepends);
10687 break;
10688 case PyUnicode_2BYTE_KIND:
10689 list = ucs2lib_splitlines(
10690 string, PyUnicode_2BYTE_DATA(string),
10691 PyUnicode_GET_LENGTH(string), keepends);
10692 break;
10693 case PyUnicode_4BYTE_KIND:
10694 list = ucs4lib_splitlines(
10695 string, PyUnicode_4BYTE_DATA(string),
10696 PyUnicode_GET_LENGTH(string), keepends);
10697 break;
10698 default:
10699 Py_UNREACHABLE();
10700 }
10701 return list;
10702}
10703
10704static PyObject *
10705split(PyObject *self,
10706 PyObject *substring,
10707 Py_ssize_t maxcount)
10708{
10709 int kind1, kind2;
10710 const void *buf1, *buf2;
10711 Py_ssize_t len1, len2;
10712 PyObject* out;
10713
10714 if (maxcount < 0)
10715 maxcount = PY_SSIZE_T_MAX;
10716
10717 if (PyUnicode_READY(self) == -1)
10718 return NULL;
10719
10720 if (substring == NULL)
10721 switch (PyUnicode_KIND(self)) {
10722 case PyUnicode_1BYTE_KIND:
10723 if (PyUnicode_IS_ASCII(self))
10724 return asciilib_split_whitespace(
10725 self, PyUnicode_1BYTE_DATA(self),
10726 PyUnicode_GET_LENGTH(self), maxcount
10727 );
10728 else
10729 return ucs1lib_split_whitespace(
10730 self, PyUnicode_1BYTE_DATA(self),
10731 PyUnicode_GET_LENGTH(self), maxcount
10732 );
10733 case PyUnicode_2BYTE_KIND:
10734 return ucs2lib_split_whitespace(
10735 self, PyUnicode_2BYTE_DATA(self),
10736 PyUnicode_GET_LENGTH(self), maxcount
10737 );
10738 case PyUnicode_4BYTE_KIND:
10739 return ucs4lib_split_whitespace(
10740 self, PyUnicode_4BYTE_DATA(self),
10741 PyUnicode_GET_LENGTH(self), maxcount
10742 );
10743 default:
10744 Py_UNREACHABLE();
10745 }
10746
10747 if (PyUnicode_READY(substring) == -1)
10748 return NULL;
10749
10750 kind1 = PyUnicode_KIND(self);
10751 kind2 = PyUnicode_KIND(substring);
10752 len1 = PyUnicode_GET_LENGTH(self);
10753 len2 = PyUnicode_GET_LENGTH(substring);
10754 if (kind1 < kind2 || len1 < len2) {
10755 out = PyList_New(1);
10756 if (out == NULL)
10757 return NULL;
10758 Py_INCREF(self);
10759 PyList_SET_ITEM(out, 0, self);
10760 return out;
10761 }
10762 buf1 = PyUnicode_DATA(self);
10763 buf2 = PyUnicode_DATA(substring);
10764 if (kind2 != kind1) {
10765 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10766 if (!buf2)
10767 return NULL;
10768 }
10769
10770 switch (kind1) {
10771 case PyUnicode_1BYTE_KIND:
10772 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10773 out = asciilib_split(
10774 self, buf1, len1, buf2, len2, maxcount);
10775 else
10776 out = ucs1lib_split(
10777 self, buf1, len1, buf2, len2, maxcount);
10778 break;
10779 case PyUnicode_2BYTE_KIND:
10780 out = ucs2lib_split(
10781 self, buf1, len1, buf2, len2, maxcount);
10782 break;
10783 case PyUnicode_4BYTE_KIND:
10784 out = ucs4lib_split(
10785 self, buf1, len1, buf2, len2, maxcount);
10786 break;
10787 default:
10788 out = NULL;
10789 }
10790 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10791 if (kind2 != kind1)
10792 PyMem_Free((void *)buf2);
10793 return out;
10794}
10795
10796static PyObject *
10797rsplit(PyObject *self,
10798 PyObject *substring,
10799 Py_ssize_t maxcount)
10800{
10801 int kind1, kind2;
10802 const void *buf1, *buf2;
10803 Py_ssize_t len1, len2;
10804 PyObject* out;
10805
10806 if (maxcount < 0)
10807 maxcount = PY_SSIZE_T_MAX;
10808
10809 if (PyUnicode_READY(self) == -1)
10810 return NULL;
10811
10812 if (substring == NULL)
10813 switch (PyUnicode_KIND(self)) {
10814 case PyUnicode_1BYTE_KIND:
10815 if (PyUnicode_IS_ASCII(self))
10816 return asciilib_rsplit_whitespace(
10817 self, PyUnicode_1BYTE_DATA(self),
10818 PyUnicode_GET_LENGTH(self), maxcount
10819 );
10820 else
10821 return ucs1lib_rsplit_whitespace(
10822 self, PyUnicode_1BYTE_DATA(self),
10823 PyUnicode_GET_LENGTH(self), maxcount
10824 );
10825 case PyUnicode_2BYTE_KIND:
10826 return ucs2lib_rsplit_whitespace(
10827 self, PyUnicode_2BYTE_DATA(self),
10828 PyUnicode_GET_LENGTH(self), maxcount
10829 );
10830 case PyUnicode_4BYTE_KIND:
10831 return ucs4lib_rsplit_whitespace(
10832 self, PyUnicode_4BYTE_DATA(self),
10833 PyUnicode_GET_LENGTH(self), maxcount
10834 );
10835 default:
10836 Py_UNREACHABLE();
10837 }
10838
10839 if (PyUnicode_READY(substring) == -1)
10840 return NULL;
10841
10842 kind1 = PyUnicode_KIND(self);
10843 kind2 = PyUnicode_KIND(substring);
10844 len1 = PyUnicode_GET_LENGTH(self);
10845 len2 = PyUnicode_GET_LENGTH(substring);
10846 if (kind1 < kind2 || len1 < len2) {
10847 out = PyList_New(1);
10848 if (out == NULL)
10849 return NULL;
10850 Py_INCREF(self);
10851 PyList_SET_ITEM(out, 0, self);
10852 return out;
10853 }
10854 buf1 = PyUnicode_DATA(self);
10855 buf2 = PyUnicode_DATA(substring);
10856 if (kind2 != kind1) {
10857 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10858 if (!buf2)
10859 return NULL;
10860 }
10861
10862 switch (kind1) {
10863 case PyUnicode_1BYTE_KIND:
10864 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10865 out = asciilib_rsplit(
10866 self, buf1, len1, buf2, len2, maxcount);
10867 else
10868 out = ucs1lib_rsplit(
10869 self, buf1, len1, buf2, len2, maxcount);
10870 break;
10871 case PyUnicode_2BYTE_KIND:
10872 out = ucs2lib_rsplit(
10873 self, buf1, len1, buf2, len2, maxcount);
10874 break;
10875 case PyUnicode_4BYTE_KIND:
10876 out = ucs4lib_rsplit(
10877 self, buf1, len1, buf2, len2, maxcount);
10878 break;
10879 default:
10880 out = NULL;
10881 }
10882 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10883 if (kind2 != kind1)
10884 PyMem_Free((void *)buf2);
10885 return out;
10886}
10887
10888static Py_ssize_t
10889anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10890 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10891{
10892 switch (kind) {
10893 case PyUnicode_1BYTE_KIND:
10894 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10895 return asciilib_find(buf1, len1, buf2, len2, offset);
10896 else
10897 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10898 case PyUnicode_2BYTE_KIND:
10899 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10900 case PyUnicode_4BYTE_KIND:
10901 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10902 }
10903 Py_UNREACHABLE();
10904}
10905
10906static Py_ssize_t
10907anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10908 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10909{
10910 switch (kind) {
10911 case PyUnicode_1BYTE_KIND:
10912 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10913 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10914 else
10915 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10916 case PyUnicode_2BYTE_KIND:
10917 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10918 case PyUnicode_4BYTE_KIND:
10919 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10920 }
10921 Py_UNREACHABLE();
10922}
10923
10924static void
10925replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10926 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10927{
10928 int kind = PyUnicode_KIND(u);
10929 void *data = PyUnicode_DATA(u);
10930 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10931 if (kind == PyUnicode_1BYTE_KIND) {
10932 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10933 (Py_UCS1 *)data + len,
10934 u1, u2, maxcount);
10935 }
10936 else if (kind == PyUnicode_2BYTE_KIND) {
10937 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10938 (Py_UCS2 *)data + len,
10939 u1, u2, maxcount);
10940 }
10941 else {
10942 assert(kind == PyUnicode_4BYTE_KIND);
10943 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10944 (Py_UCS4 *)data + len,
10945 u1, u2, maxcount);
10946 }
10947}
10948
10949static PyObject *
10950replace(PyObject *self, PyObject *str1,
10951 PyObject *str2, Py_ssize_t maxcount)
10952{
10953 PyObject *u;
10954 const char *sbuf = PyUnicode_DATA(self);
10955 const void *buf1 = PyUnicode_DATA(str1);
10956 const void *buf2 = PyUnicode_DATA(str2);
10957 int srelease = 0, release1 = 0, release2 = 0;
10958 int skind = PyUnicode_KIND(self);
10959 int kind1 = PyUnicode_KIND(str1);
10960 int kind2 = PyUnicode_KIND(str2);
10961 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10962 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10963 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10964 int mayshrink;
10965 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10966
10967 if (slen < len1)
10968 goto nothing;
10969
10970 if (maxcount < 0)
10971 maxcount = PY_SSIZE_T_MAX;
10972 else if (maxcount == 0)
10973 goto nothing;
10974
10975 if (str1 == str2)
10976 goto nothing;
10977
10978 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10979 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10980 if (maxchar < maxchar_str1)
10981 /* substring too wide to be present */
10982 goto nothing;
10983 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10984 /* Replacing str1 with str2 may cause a maxchar reduction in the
10985 result string. */
10986 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10987 maxchar = Py_MAX(maxchar, maxchar_str2);
10988
10989 if (len1 == len2) {
10990 /* same length */
10991 if (len1 == 0)
10992 goto nothing;
10993 if (len1 == 1) {
10994 /* replace characters */
10995 Py_UCS4 u1, u2;
10996 Py_ssize_t pos;
10997
10998 u1 = PyUnicode_READ(kind1, buf1, 0);
10999 pos = findchar(sbuf, skind, slen, u1, 1);
11000 if (pos < 0)
11001 goto nothing;
11002 u2 = PyUnicode_READ(kind2, buf2, 0);
11003 u = PyUnicode_New(slen, maxchar);
11004 if (!u)
11005 goto error;
11006
11007 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
11008 replace_1char_inplace(u, pos, u1, u2, maxcount);
11009 }
11010 else {
11011 int rkind = skind;
11012 char *res;
11013 Py_ssize_t i;
11014
11015 if (kind1 < rkind) {
11016 /* widen substring */
11017 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11018 if (!buf1) goto error;
11019 release1 = 1;
11020 }
11021 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
11022 if (i < 0)
11023 goto nothing;
11024 if (rkind > kind2) {
11025 /* widen replacement */
11026 buf2 = unicode_askind(kind2, buf2, len2, rkind);
11027 if (!buf2) goto error;
11028 release2 = 1;
11029 }
11030 else if (rkind < kind2) {
11031 /* widen self and buf1 */
11032 rkind = kind2;
11033 if (release1) {
11034 assert(buf1 != PyUnicode_DATA(str1));
11035 PyMem_Free((void *)buf1);
11036 buf1 = PyUnicode_DATA(str1);
11037 release1 = 0;
11038 }
11039 sbuf = unicode_askind(skind, sbuf, slen, rkind);
11040 if (!sbuf) goto error;
11041 srelease = 1;
11042 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11043 if (!buf1) goto error;
11044 release1 = 1;
11045 }
11046 u = PyUnicode_New(slen, maxchar);
11047 if (!u)
11048 goto error;
11049 assert(PyUnicode_KIND(u) == rkind);
11050 res = PyUnicode_DATA(u);
11051
11052 memcpy(res, sbuf, rkind * slen);
11053 /* change everything in-place, starting with this one */
11054 memcpy(res + rkind * i,
11055 buf2,
11056 rkind * len2);
11057 i += len1;
11058
11059 while ( --maxcount > 0) {
11060 i = anylib_find(rkind, self,
11061 sbuf+rkind*i, slen-i,
11062 str1, buf1, len1, i);
11063 if (i == -1)
11064 break;
11065 memcpy(res + rkind * i,
11066 buf2,
11067 rkind * len2);
11068 i += len1;
11069 }
11070 }
11071 }
11072 else {
11073 Py_ssize_t n, i, j, ires;
11074 Py_ssize_t new_size;
11075 int rkind = skind;
11076 char *res;
11077
11078 if (kind1 < rkind) {
11079 /* widen substring */
11080 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11081 if (!buf1) goto error;
11082 release1 = 1;
11083 }
11084 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
11085 if (n == 0)
11086 goto nothing;
11087 if (kind2 < rkind) {
11088 /* widen replacement */
11089 buf2 = unicode_askind(kind2, buf2, len2, rkind);
11090 if (!buf2) goto error;
11091 release2 = 1;
11092 }
11093 else if (kind2 > rkind) {
11094 /* widen self and buf1 */
11095 rkind = kind2;
11096 sbuf = unicode_askind(skind, sbuf, slen, rkind);
11097 if (!sbuf) goto error;
11098 srelease = 1;
11099 if (release1) {
11100 assert(buf1 != PyUnicode_DATA(str1));
11101 PyMem_Free((void *)buf1);
11102 buf1 = PyUnicode_DATA(str1);
11103 release1 = 0;
11104 }
11105 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11106 if (!buf1) goto error;
11107 release1 = 1;
11108 }
11109 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
11110 PyUnicode_GET_LENGTH(str1)); */
11111 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
11112 PyErr_SetString(PyExc_OverflowError,
11113 "replace string is too long");
11114 goto error;
11115 }
11116 new_size = slen + n * (len2 - len1);
11117 if (new_size == 0) {
11118 u = unicode_new_empty();
11119 goto done;
11120 }
11121 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
11122 PyErr_SetString(PyExc_OverflowError,
11123 "replace string is too long");
11124 goto error;
11125 }
11126 u = PyUnicode_New(new_size, maxchar);
11127 if (!u)
11128 goto error;
11129 assert(PyUnicode_KIND(u) == rkind);
11130 res = PyUnicode_DATA(u);
11131 ires = i = 0;
11132 if (len1 > 0) {
11133 while (n-- > 0) {
11134 /* look for next match */
11135 j = anylib_find(rkind, self,
11136 sbuf + rkind * i, slen-i,
11137 str1, buf1, len1, i);
11138 if (j == -1)
11139 break;
11140 else if (j > i) {
11141 /* copy unchanged part [i:j] */
11142 memcpy(res + rkind * ires,
11143 sbuf + rkind * i,
11144 rkind * (j-i));
11145 ires += j - i;
11146 }
11147 /* copy substitution string */
11148 if (len2 > 0) {
11149 memcpy(res + rkind * ires,
11150 buf2,
11151 rkind * len2);
11152 ires += len2;
11153 }
11154 i = j + len1;
11155 }
11156 if (i < slen)
11157 /* copy tail [i:] */
11158 memcpy(res + rkind * ires,
11159 sbuf + rkind * i,
11160 rkind * (slen-i));
11161 }
11162 else {
11163 /* interleave */
11164 while (n > 0) {
11165 memcpy(res + rkind * ires,
11166 buf2,
11167 rkind * len2);
11168 ires += len2;
11169 if (--n <= 0)
11170 break;
11171 memcpy(res + rkind * ires,
11172 sbuf + rkind * i,
11173 rkind);
11174 ires++;
11175 i++;
11176 }
11177 memcpy(res + rkind * ires,
11178 sbuf + rkind * i,
11179 rkind * (slen-i));
11180 }
11181 }
11182
11183 if (mayshrink) {
11184 unicode_adjust_maxchar(&u);
11185 if (u == NULL)
11186 goto error;
11187 }
11188
11189 done:
11190 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11191 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11192 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11193 if (srelease)
11194 PyMem_Free((void *)sbuf);
11195 if (release1)
11196 PyMem_Free((void *)buf1);
11197 if (release2)
11198 PyMem_Free((void *)buf2);
11199 assert(_PyUnicode_CheckConsistency(u, 1));
11200 return u;
11201
11202 nothing:
11203 /* nothing to replace; return original string (when possible) */
11204 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11205 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11206 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11207 if (srelease)
11208 PyMem_Free((void *)sbuf);
11209 if (release1)
11210 PyMem_Free((void *)buf1);
11211 if (release2)
11212 PyMem_Free((void *)buf2);
11213 return unicode_result_unchanged(self);
11214
11215 error:
11216 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11217 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11218 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11219 if (srelease)
11220 PyMem_Free((void *)sbuf);
11221 if (release1)
11222 PyMem_Free((void *)buf1);
11223 if (release2)
11224 PyMem_Free((void *)buf2);
11225 return NULL;
11226}
11227
11228/* --- Unicode Object Methods --------------------------------------------- */
11229
11230/*[clinic input]
11231str.title as unicode_title
11232
11233Return a version of the string where each word is titlecased.
11234
11235More specifically, words start with uppercased characters and all remaining
11236cased characters have lower case.
11237[clinic start generated code]*/
11238
11239static PyObject *
11240unicode_title_impl(PyObject *self)
11241/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
11242{
11243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245 return case_operation(self, do_title);
11246}
11247
11248/*[clinic input]
11249str.capitalize as unicode_capitalize
11250
11251Return a capitalized version of the string.
11252
11253More specifically, make the first character have upper case and the rest lower
11254case.
11255[clinic start generated code]*/
11256
11257static PyObject *
11258unicode_capitalize_impl(PyObject *self)
11259/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
11260{
11261 if (PyUnicode_READY(self) == -1)
11262 return NULL;
11263 if (PyUnicode_GET_LENGTH(self) == 0)
11264 return unicode_result_unchanged(self);
11265 return case_operation(self, do_capitalize);
11266}
11267
11268/*[clinic input]
11269str.casefold as unicode_casefold
11270
11271Return a version of the string suitable for caseless comparisons.
11272[clinic start generated code]*/
11273
11274static PyObject *
11275unicode_casefold_impl(PyObject *self)
11276/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
11277{
11278 if (PyUnicode_READY(self) == -1)
11279 return NULL;
11280 if (PyUnicode_IS_ASCII(self))
11281 return ascii_upper_or_lower(self, 1);
11282 return case_operation(self, do_casefold);
11283}
11284
11285
11286/* Argument converter. Accepts a single Unicode character. */
11287
11288static int
11289convert_uc(PyObject *obj, void *addr)
11290{
11291 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11292
11293 if (!PyUnicode_Check(obj)) {
11294 PyErr_Format(PyExc_TypeError,
11295 "The fill character must be a unicode character, "
11296 "not %.100s", Py_TYPE(obj)->tp_name);
11297 return 0;
11298 }
11299 if (PyUnicode_READY(obj) < 0)
11300 return 0;
11301 if (PyUnicode_GET_LENGTH(obj) != 1) {
11302 PyErr_SetString(PyExc_TypeError,
11303 "The fill character must be exactly one character long");
11304 return 0;
11305 }
11306 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11307 return 1;
11308}
11309
11310/*[clinic input]
11311str.center as unicode_center
11312
11313 width: Py_ssize_t
11314 fillchar: Py_UCS4 = ' '
11315 /
11316
11317Return a centered string of length width.
11318
11319Padding is done using the specified fill character (default is a space).
11320[clinic start generated code]*/
11321
11322static PyObject *
11323unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11324/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11325{
11326 Py_ssize_t marg, left;
11327
11328 if (PyUnicode_READY(self) == -1)
11329 return NULL;
11330
11331 if (PyUnicode_GET_LENGTH(self) >= width)
11332 return unicode_result_unchanged(self);
11333
11334 marg = width - PyUnicode_GET_LENGTH(self);
11335 left = marg / 2 + (marg & width & 1);
11336
11337 return pad(self, left, marg - left, fillchar);
11338}
11339
11340/* This function assumes that str1 and str2 are readied by the caller. */
11341
11342static int
11343unicode_compare(PyObject *str1, PyObject *str2)
11344{
11345#define COMPARE(TYPE1, TYPE2) \
11346 do { \
11347 TYPE1* p1 = (TYPE1 *)data1; \
11348 TYPE2* p2 = (TYPE2 *)data2; \
11349 TYPE1* end = p1 + len; \
11350 Py_UCS4 c1, c2; \
11351 for (; p1 != end; p1++, p2++) { \
11352 c1 = *p1; \
11353 c2 = *p2; \
11354 if (c1 != c2) \
11355 return (c1 < c2) ? -1 : 1; \
11356 } \
11357 } \
11358 while (0)
11359
11360 int kind1, kind2;
11361 const void *data1, *data2;
11362 Py_ssize_t len1, len2, len;
11363
11364 kind1 = PyUnicode_KIND(str1);
11365 kind2 = PyUnicode_KIND(str2);
11366 data1 = PyUnicode_DATA(str1);
11367 data2 = PyUnicode_DATA(str2);
11368 len1 = PyUnicode_GET_LENGTH(str1);
11369 len2 = PyUnicode_GET_LENGTH(str2);
11370 len = Py_MIN(len1, len2);
11371
11372 switch(kind1) {
11373 case PyUnicode_1BYTE_KIND:
11374 {
11375 switch(kind2) {
11376 case PyUnicode_1BYTE_KIND:
11377 {
11378 int cmp = memcmp(data1, data2, len);
11379 /* normalize result of memcmp() into the range [-1; 1] */
11380 if (cmp < 0)
11381 return -1;
11382 if (cmp > 0)
11383 return 1;
11384 break;
11385 }
11386 case PyUnicode_2BYTE_KIND:
11387 COMPARE(Py_UCS1, Py_UCS2);
11388 break;
11389 case PyUnicode_4BYTE_KIND:
11390 COMPARE(Py_UCS1, Py_UCS4);
11391 break;
11392 default:
11393 Py_UNREACHABLE();
11394 }
11395 break;
11396 }
11397 case PyUnicode_2BYTE_KIND:
11398 {
11399 switch(kind2) {
11400 case PyUnicode_1BYTE_KIND:
11401 COMPARE(Py_UCS2, Py_UCS1);
11402 break;
11403 case PyUnicode_2BYTE_KIND:
11404 {
11405 COMPARE(Py_UCS2, Py_UCS2);
11406 break;
11407 }
11408 case PyUnicode_4BYTE_KIND:
11409 COMPARE(Py_UCS2, Py_UCS4);
11410 break;
11411 default:
11412 Py_UNREACHABLE();
11413 }
11414 break;
11415 }
11416 case PyUnicode_4BYTE_KIND:
11417 {
11418 switch(kind2) {
11419 case PyUnicode_1BYTE_KIND:
11420 COMPARE(Py_UCS4, Py_UCS1);
11421 break;
11422 case PyUnicode_2BYTE_KIND:
11423 COMPARE(Py_UCS4, Py_UCS2);
11424 break;
11425 case PyUnicode_4BYTE_KIND:
11426 {
11427#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11428 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11429 /* normalize result of wmemcmp() into the range [-1; 1] */
11430 if (cmp < 0)
11431 return -1;
11432 if (cmp > 0)
11433 return 1;
11434#else
11435 COMPARE(Py_UCS4, Py_UCS4);
11436#endif
11437 break;
11438 }
11439 default:
11440 Py_UNREACHABLE();
11441 }
11442 break;
11443 }
11444 default:
11445 Py_UNREACHABLE();
11446 }
11447
11448 if (len1 == len2)
11449 return 0;
11450 if (len1 < len2)
11451 return -1;
11452 else
11453 return 1;
11454
11455#undef COMPARE
11456}
11457
11458static int
11459unicode_compare_eq(PyObject *str1, PyObject *str2)
11460{
11461 int kind;
11462 const void *data1, *data2;
11463 Py_ssize_t len;
11464 int cmp;
11465
11466 len = PyUnicode_GET_LENGTH(str1);
11467 if (PyUnicode_GET_LENGTH(str2) != len)
11468 return 0;
11469 kind = PyUnicode_KIND(str1);
11470 if (PyUnicode_KIND(str2) != kind)
11471 return 0;
11472 data1 = PyUnicode_DATA(str1);
11473 data2 = PyUnicode_DATA(str2);
11474
11475 cmp = memcmp(data1, data2, len * kind);
11476 return (cmp == 0);
11477}
11478
11479
11480int
11481PyUnicode_Compare(PyObject *left, PyObject *right)
11482{
11483 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11484 if (PyUnicode_READY(left) == -1 ||
11485 PyUnicode_READY(right) == -1)
11486 return -1;
11487
11488 /* a string is equal to itself */
11489 if (left == right)
11490 return 0;
11491
11492 return unicode_compare(left, right);
11493 }
11494 PyErr_Format(PyExc_TypeError,
11495 "Can't compare %.100s and %.100s",
11496 Py_TYPE(left)->tp_name,
11497 Py_TYPE(right)->tp_name);
11498 return -1;
11499}
11500
11501int
11502PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11503{
11504 Py_ssize_t i;
11505 int kind;
11506 Py_UCS4 chr;
11507 const unsigned char *ustr = (const unsigned char *)str;
11508
11509 assert(_PyUnicode_CHECK(uni));
11510 if (!PyUnicode_IS_READY(uni)) {
11511 const wchar_t *ws = _PyUnicode_WSTR(uni);
11512 /* Compare Unicode string and source character set string */
11513 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11514 if (chr != ustr[i])
11515 return (chr < ustr[i]) ? -1 : 1;
11516 }
11517 /* This check keeps Python strings that end in '\0' from comparing equal
11518 to C strings identical up to that point. */
11519 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11520 return 1; /* uni is longer */
11521 if (ustr[i])
11522 return -1; /* str is longer */
11523 return 0;
11524 }
11525 kind = PyUnicode_KIND(uni);
11526 if (kind == PyUnicode_1BYTE_KIND) {
11527 const void *data = PyUnicode_1BYTE_DATA(uni);
11528 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11529 size_t len, len2 = strlen(str);
11530 int cmp;
11531
11532 len = Py_MIN(len1, len2);
11533 cmp = memcmp(data, str, len);
11534 if (cmp != 0) {
11535 if (cmp < 0)
11536 return -1;
11537 else
11538 return 1;
11539 }
11540 if (len1 > len2)
11541 return 1; /* uni is longer */
11542 if (len1 < len2)
11543 return -1; /* str is longer */
11544 return 0;
11545 }
11546 else {
11547 const void *data = PyUnicode_DATA(uni);
11548 /* Compare Unicode string and source character set string */
11549 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11550 if (chr != (unsigned char)str[i])
11551 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11552 /* This check keeps Python strings that end in '\0' from comparing equal
11553 to C strings identical up to that point. */
11554 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11555 return 1; /* uni is longer */
11556 if (str[i])
11557 return -1; /* str is longer */
11558 return 0;
11559 }
11560}
11561
11562static int
11563non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11564{
11565 size_t i, len;
11566 const wchar_t *p;
11567 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11568 if (strlen(str) != len)
11569 return 0;
11570 p = _PyUnicode_WSTR(unicode);
11571 assert(p);
11572 for (i = 0; i < len; i++) {
11573 unsigned char c = (unsigned char)str[i];
11574 if (c >= 128 || p[i] != (wchar_t)c)
11575 return 0;
11576 }
11577 return 1;
11578}
11579
11580int
11581_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11582{
11583 size_t len;
11584 assert(_PyUnicode_CHECK(unicode));
11585 assert(str);
11586#ifndef NDEBUG
11587 for (const char *p = str; *p; p++) {
11588 assert((unsigned char)*p < 128);
11589 }
11590#endif
11591 if (PyUnicode_READY(unicode) == -1) {
11592 /* Memory error or bad data */
11593 PyErr_Clear();
11594 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11595 }
11596 if (!PyUnicode_IS_ASCII(unicode))
11597 return 0;
11598 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11599 return strlen(str) == len &&
11600 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11601}
11602
11603int
11604_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11605{
11606 PyObject *right_uni;
11607
11608 assert(_PyUnicode_CHECK(left));
11609 assert(right->string);
11610#ifndef NDEBUG
11611 for (const char *p = right->string; *p; p++) {
11612 assert((unsigned char)*p < 128);
11613 }
11614#endif
11615
11616 if (PyUnicode_READY(left) == -1) {
11617 /* memory error or bad data */
11618 PyErr_Clear();
11619 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11620 }
11621
11622 if (!PyUnicode_IS_ASCII(left))
11623 return 0;
11624
11625 right_uni = _PyUnicode_FromId(right); /* borrowed */
11626 if (right_uni == NULL) {
11627 /* memory error or bad data */
11628 PyErr_Clear();
11629 return _PyUnicode_EqualToASCIIString(left, right->string);
11630 }
11631
11632 if (left == right_uni)
11633 return 1;
11634
11635 if (PyUnicode_CHECK_INTERNED(left))
11636 return 0;
11637
11638#ifdef INTERNED_STRINGS
11639 assert(_PyUnicode_HASH(right_uni) != -1);
11640 Py_hash_t hash = _PyUnicode_HASH(left);
11641 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11642 return 0;
11643 }
11644#endif
11645
11646 return unicode_compare_eq(left, right_uni);
11647}
11648
11649PyObject *
11650PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11651{
11652 int result;
11653
11654 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11655 Py_RETURN_NOTIMPLEMENTED;
11656
11657 if (PyUnicode_READY(left) == -1 ||
11658 PyUnicode_READY(right) == -1)
11659 return NULL;
11660
11661 if (left == right) {
11662 switch (op) {
11663 case Py_EQ:
11664 case Py_LE:
11665 case Py_GE:
11666 /* a string is equal to itself */
11667 Py_RETURN_TRUE;
11668 case Py_NE:
11669 case Py_LT:
11670 case Py_GT:
11671 Py_RETURN_FALSE;
11672 default:
11673 PyErr_BadArgument();
11674 return NULL;
11675 }
11676 }
11677 else if (op == Py_EQ || op == Py_NE) {
11678 result = unicode_compare_eq(left, right);
11679 result ^= (op == Py_NE);
11680 return PyBool_FromLong(result);
11681 }
11682 else {
11683 result = unicode_compare(left, right);
11684 Py_RETURN_RICHCOMPARE(result, 0, op);
11685 }
11686}
11687
11688int
11689_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11690{
11691 return unicode_eq(aa, bb);
11692}
11693
11694int
11695PyUnicode_Contains(PyObject *str, PyObject *substr)
11696{
11697 int kind1, kind2;
11698 const void *buf1, *buf2;
11699 Py_ssize_t len1, len2;
11700 int result;
11701
11702 if (!PyUnicode_Check(substr)) {
11703 PyErr_Format(PyExc_TypeError,
11704 "'in <string>' requires string as left operand, not %.100s",
11705 Py_TYPE(substr)->tp_name);
11706 return -1;
11707 }
11708 if (PyUnicode_READY(substr) == -1)
11709 return -1;
11710 if (ensure_unicode(str) < 0)
11711 return -1;
11712
11713 kind1 = PyUnicode_KIND(str);
11714 kind2 = PyUnicode_KIND(substr);
11715 if (kind1 < kind2)
11716 return 0;
11717 len1 = PyUnicode_GET_LENGTH(str);
11718 len2 = PyUnicode_GET_LENGTH(substr);
11719 if (len1 < len2)
11720 return 0;
11721 buf1 = PyUnicode_DATA(str);
11722 buf2 = PyUnicode_DATA(substr);
11723 if (len2 == 1) {
11724 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11725 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11726 return result;
11727 }
11728 if (kind2 != kind1) {
11729 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11730 if (!buf2)
11731 return -1;
11732 }
11733
11734 switch (kind1) {
11735 case PyUnicode_1BYTE_KIND:
11736 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11737 break;
11738 case PyUnicode_2BYTE_KIND:
11739 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11740 break;
11741 case PyUnicode_4BYTE_KIND:
11742 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11743 break;
11744 default:
11745 Py_UNREACHABLE();
11746 }
11747
11748 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11749 if (kind2 != kind1)
11750 PyMem_Free((void *)buf2);
11751
11752 return result;
11753}
11754
/* Concatenate two Unicode objects, giving a new Unicode object. */
11756
11757PyObject *
11758PyUnicode_Concat(PyObject *left, PyObject *right)
11759{
11760 PyObject *result;
11761 Py_UCS4 maxchar, maxchar2;
11762 Py_ssize_t left_len, right_len, new_len;
11763
11764 if (ensure_unicode(left) < 0)
11765 return NULL;
11766
11767 if (!PyUnicode_Check(right)) {
11768 PyErr_Format(PyExc_TypeError,
11769 "can only concatenate str (not \"%.200s\") to str",
11770 Py_TYPE(right)->tp_name);
11771 return NULL;
11772 }
11773 if (PyUnicode_READY(right) < 0)
11774 return NULL;
11775
11776 /* Shortcuts */
11777 PyObject *empty = unicode_get_empty(); // Borrowed reference
11778 if (left == empty) {
11779 return PyUnicode_FromObject(right);
11780 }
11781 if (right == empty) {
11782 return PyUnicode_FromObject(left);
11783 }
11784
11785 left_len = PyUnicode_GET_LENGTH(left);
11786 right_len = PyUnicode_GET_LENGTH(right);
11787 if (left_len > PY_SSIZE_T_MAX - right_len) {
11788 PyErr_SetString(PyExc_OverflowError,
11789 "strings are too large to concat");
11790 return NULL;
11791 }
11792 new_len = left_len + right_len;
11793
11794 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11795 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11796 maxchar = Py_MAX(maxchar, maxchar2);
11797
11798 /* Concat the two Unicode strings */
11799 result = PyUnicode_New(new_len, maxchar);
11800 if (result == NULL)
11801 return NULL;
11802 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11803 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11804 assert(_PyUnicode_CheckConsistency(result, 1));
11805 return result;
11806}
11807
11808void
11809PyUnicode_Append(PyObject **p_left, PyObject *right)
11810{
11811 PyObject *left, *res;
11812 Py_UCS4 maxchar, maxchar2;
11813 Py_ssize_t left_len, right_len, new_len;
11814
11815 if (p_left == NULL) {
11816 if (!PyErr_Occurred())
11817 PyErr_BadInternalCall();
11818 return;
11819 }
11820 left = *p_left;
11821 if (right == NULL || left == NULL
11822 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11823 if (!PyErr_Occurred())
11824 PyErr_BadInternalCall();
11825 goto error;
11826 }
11827
11828 if (PyUnicode_READY(left) == -1)
11829 goto error;
11830 if (PyUnicode_READY(right) == -1)
11831 goto error;
11832
11833 /* Shortcuts */
11834 PyObject *empty = unicode_get_empty(); // Borrowed reference
11835 if (left == empty) {
11836 Py_DECREF(left);
11837 Py_INCREF(right);
11838 *p_left = right;
11839 return;
11840 }
11841 if (right == empty) {
11842 return;
11843 }
11844
11845 left_len = PyUnicode_GET_LENGTH(left);
11846 right_len = PyUnicode_GET_LENGTH(right);
11847 if (left_len > PY_SSIZE_T_MAX - right_len) {
11848 PyErr_SetString(PyExc_OverflowError,
11849 "strings are too large to concat");
11850 goto error;
11851 }
11852 new_len = left_len + right_len;
11853
11854 if (unicode_modifiable(left)
11855 && PyUnicode_CheckExact(right)
11856 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11857 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11858 to change the structure size, but characters are stored just after
11859 the structure, and so it requires to move all characters which is
11860 not so different than duplicating the string. */
11861 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11862 {
11863 /* append inplace */
11864 if (unicode_resize(p_left, new_len) != 0)
11865 goto error;
11866
11867 /* copy 'right' into the newly allocated area of 'left' */
11868 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11869 }
11870 else {
11871 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11872 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11873 maxchar = Py_MAX(maxchar, maxchar2);
11874
11875 /* Concat the two Unicode strings */
11876 res = PyUnicode_New(new_len, maxchar);
11877 if (res == NULL)
11878 goto error;
11879 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11880 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11881 Py_DECREF(left);
11882 *p_left = res;
11883 }
11884 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11885 return;
11886
11887error:
11888 Py_CLEAR(*p_left);
11889}
11890
11891void
11892PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11893{
11894 PyUnicode_Append(pleft, right);
11895 Py_XDECREF(right);
11896}
11897
11898/*
11899Wraps stringlib_parse_args_finds() and additionally ensures that the
11900first argument is a unicode object.
11901*/
11902
11903static inline int
11904parse_args_finds_unicode(const char * function_name, PyObject *args,
11905 PyObject **substring,
11906 Py_ssize_t *start, Py_ssize_t *end)
11907{
11908 if(stringlib_parse_args_finds(function_name, args, substring,
11909 start, end)) {
11910 if (ensure_unicode(*substring) < 0)
11911 return 0;
11912 return 1;
11913 }
11914 return 0;
11915}
11916
11917PyDoc_STRVAR(count__doc__,
11918 "S.count(sub[, start[, end]]) -> int\n\
11919\n\
11920Return the number of non-overlapping occurrences of substring sub in\n\
11921string S[start:end]. Optional arguments start and end are\n\
11922interpreted as in slice notation.");
11923
11924static PyObject *
11925unicode_count(PyObject *self, PyObject *args)
11926{
11927 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11928 Py_ssize_t start = 0;
11929 Py_ssize_t end = PY_SSIZE_T_MAX;
11930 PyObject *result;
11931 int kind1, kind2;
11932 const void *buf1, *buf2;
11933 Py_ssize_t len1, len2, iresult;
11934
11935 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11936 return NULL;
11937
11938 kind1 = PyUnicode_KIND(self);
11939 kind2 = PyUnicode_KIND(substring);
11940 if (kind1 < kind2)
11941 return PyLong_FromLong(0);
11942
11943 len1 = PyUnicode_GET_LENGTH(self);
11944 len2 = PyUnicode_GET_LENGTH(substring);
11945 ADJUST_INDICES(start, end, len1);
11946 if (end - start < len2)
11947 return PyLong_FromLong(0);
11948
11949 buf1 = PyUnicode_DATA(self);
11950 buf2 = PyUnicode_DATA(substring);
11951 if (kind2 != kind1) {
11952 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11953 if (!buf2)
11954 return NULL;
11955 }
11956 switch (kind1) {
11957 case PyUnicode_1BYTE_KIND:
11958 iresult = ucs1lib_count(
11959 ((const Py_UCS1*)buf1) + start, end - start,
11960 buf2, len2, PY_SSIZE_T_MAX
11961 );
11962 break;
11963 case PyUnicode_2BYTE_KIND:
11964 iresult = ucs2lib_count(
11965 ((const Py_UCS2*)buf1) + start, end - start,
11966 buf2, len2, PY_SSIZE_T_MAX
11967 );
11968 break;
11969 case PyUnicode_4BYTE_KIND:
11970 iresult = ucs4lib_count(
11971 ((const Py_UCS4*)buf1) + start, end - start,
11972 buf2, len2, PY_SSIZE_T_MAX
11973 );
11974 break;
11975 default:
11976 Py_UNREACHABLE();
11977 }
11978
11979 result = PyLong_FromSsize_t(iresult);
11980
11981 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11982 if (kind2 != kind1)
11983 PyMem_Free((void *)buf2);
11984
11985 return result;
11986}
11987
11988/*[clinic input]
11989str.encode as unicode_encode
11990
11991 encoding: str(c_default="NULL") = 'utf-8'
11992 The encoding in which to encode the string.
11993 errors: str(c_default="NULL") = 'strict'
11994 The error handling scheme to use for encoding errors.
11995 The default is 'strict' meaning that encoding errors raise a
11996 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11997 'xmlcharrefreplace' as well as any other name registered with
11998 codecs.register_error that can handle UnicodeEncodeErrors.
11999
12000Encode the string using the codec registered for encoding.
12001[clinic start generated code]*/
12002
12003static PyObject *
12004unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
12005/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
12006{
12007 return PyUnicode_AsEncodedString(self, encoding, errors);
12008}
12009
12010/*[clinic input]
12011str.expandtabs as unicode_expandtabs
12012
12013 tabsize: int = 8
12014
12015Return a copy where all tab characters are expanded using spaces.
12016
12017If tabsize is not given, a tab size of 8 characters is assumed.
12018[clinic start generated code]*/
12019
12020static PyObject *
12021unicode_expandtabs_impl(PyObject *self, int tabsize)
12022/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
12023{
12024 Py_ssize_t i, j, line_pos, src_len, incr;
12025 Py_UCS4 ch;
12026 PyObject *u;
12027 const void *src_data;
12028 void *dest_data;
12029 int kind;
12030 int found;
12031
12032 if (PyUnicode_READY(self) == -1)
12033 return NULL;
12034
12035 /* First pass: determine size of output string */
12036 src_len = PyUnicode_GET_LENGTH(self);
12037 i = j = line_pos = 0;
12038 kind = PyUnicode_KIND(self);
12039 src_data = PyUnicode_DATA(self);
12040 found = 0;
12041 for (; i < src_len; i++) {
12042 ch = PyUnicode_READ(kind, src_data, i);
12043 if (ch == '\t') {
12044 found = 1;
12045 if (tabsize > 0) {
12046 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
12047 if (j > PY_SSIZE_T_MAX - incr)
12048 goto overflow;
12049 line_pos += incr;
12050 j += incr;
12051 }
12052 }
12053 else {
12054 if (j > PY_SSIZE_T_MAX - 1)
12055 goto overflow;
12056 line_pos++;
12057 j++;
12058 if (ch == '\n' || ch == '\r')
12059 line_pos = 0;
12060 }
12061 }
12062 if (!found)
12063 return unicode_result_unchanged(self);
12064
12065 /* Second pass: create output string and fill it */
12066 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
12067 if (!u)
12068 return NULL;
12069 dest_data = PyUnicode_DATA(u);
12070
12071 i = j = line_pos = 0;
12072
12073 for (; i < src_len; i++) {
12074 ch = PyUnicode_READ(kind, src_data, i);
12075 if (ch == '\t') {
12076 if (tabsize > 0) {
12077 incr = tabsize - (line_pos % tabsize);
12078 line_pos += incr;
12079 unicode_fill(kind, dest_data, ' ', j, incr);
12080 j += incr;
12081 }
12082 }
12083 else {
12084 line_pos++;
12085 PyUnicode_WRITE(kind, dest_data, j, ch);
12086 j++;
12087 if (ch == '\n' || ch == '\r')
12088 line_pos = 0;
12089 }
12090 }
12091 assert (j == PyUnicode_GET_LENGTH(u));
12092 return unicode_result(u);
12093
12094 overflow:
12095 PyErr_SetString(PyExc_OverflowError, "new string is too long");
12096 return NULL;
12097}
12098
12099PyDoc_STRVAR(find__doc__,
12100 "S.find(sub[, start[, end]]) -> int\n\
12101\n\
12102Return the lowest index in S where substring sub is found,\n\
12103such that sub is contained within S[start:end]. Optional\n\
12104arguments start and end are interpreted as in slice notation.\n\
12105\n\
12106Return -1 on failure.");
12107
12108static PyObject *
12109unicode_find(PyObject *self, PyObject *args)
12110{
12111 /* initialize variables to prevent gcc warning */
12112 PyObject *substring = NULL;
12113 Py_ssize_t start = 0;
12114 Py_ssize_t end = 0;
12115 Py_ssize_t result;
12116
12117 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
12118 return NULL;
12119
12120 if (PyUnicode_READY(self) == -1)
12121 return NULL;
12122
12123 result = any_find_slice(self, substring, start, end, 1);
12124
12125 if (result == -2)
12126 return NULL;
12127
12128 return PyLong_FromSsize_t(result);
12129}
12130
12131static PyObject *
12132unicode_getitem(PyObject *self, Py_ssize_t index)
12133{
12134 const void *data;
12135 enum PyUnicode_Kind kind;
12136 Py_UCS4 ch;
12137
12138 if (!PyUnicode_Check(self)) {
12139 PyErr_BadArgument();
12140 return NULL;
12141 }
12142 if (PyUnicode_READY(self) == -1) {
12143 return NULL;
12144 }
12145 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12146 PyErr_SetString(PyExc_IndexError, "string index out of range");
12147 return NULL;
12148 }
12149 kind = PyUnicode_KIND(self);
12150 data = PyUnicode_DATA(self);
12151 ch = PyUnicode_READ(kind, data, index);
12152 return unicode_char(ch);
12153}
12154
12155/* Believe it or not, this produces the same value for ASCII strings
12156 as bytes_hash(). */
12157static Py_hash_t
12158unicode_hash(PyObject *self)
12159{
12160 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
12161
12162#ifdef Py_DEBUG
12163 assert(_Py_HashSecret_Initialized);
12164#endif
12165 if (_PyUnicode_HASH(self) != -1)
12166 return _PyUnicode_HASH(self);
12167 if (PyUnicode_READY(self) == -1)
12168 return -1;
12169
12170 x = _Py_HashBytes(PyUnicode_DATA(self),
12171 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
12172 _PyUnicode_HASH(self) = x;
12173 return x;
12174}
12175
12176PyDoc_STRVAR(index__doc__,
12177 "S.index(sub[, start[, end]]) -> int\n\
12178\n\
12179Return the lowest index in S where substring sub is found,\n\
12180such that sub is contained within S[start:end]. Optional\n\
12181arguments start and end are interpreted as in slice notation.\n\
12182\n\
12183Raises ValueError when the substring is not found.");
12184
12185static PyObject *
12186unicode_index(PyObject *self, PyObject *args)
12187{
12188 /* initialize variables to prevent gcc warning */
12189 Py_ssize_t result;
12190 PyObject *substring = NULL;
12191 Py_ssize_t start = 0;
12192 Py_ssize_t end = 0;
12193
12194 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
12195 return NULL;
12196
12197 if (PyUnicode_READY(self) == -1)
12198 return NULL;
12199
12200 result = any_find_slice(self, substring, start, end, 1);
12201
12202 if (result == -2)
12203 return NULL;
12204
12205 if (result < 0) {
12206 PyErr_SetString(PyExc_ValueError, "substring not found");
12207 return NULL;
12208 }
12209
12210 return PyLong_FromSsize_t(result);
12211}
12212
12213/*[clinic input]
12214str.isascii as unicode_isascii
12215
12216Return True if all characters in the string are ASCII, False otherwise.
12217
12218ASCII characters have code points in the range U+0000-U+007F.
12219Empty string is ASCII too.
12220[clinic start generated code]*/
12221
12222static PyObject *
12223unicode_isascii_impl(PyObject *self)
12224/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12225{
12226 if (PyUnicode_READY(self) == -1) {
12227 return NULL;
12228 }
12229 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12230}
12231
12232/*[clinic input]
12233str.islower as unicode_islower
12234
12235Return True if the string is a lowercase string, False otherwise.
12236
12237A string is lowercase if all cased characters in the string are lowercase and
12238there is at least one cased character in the string.
12239[clinic start generated code]*/
12240
12241static PyObject *
12242unicode_islower_impl(PyObject *self)
12243/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
12244{
12245 Py_ssize_t i, length;
12246 int kind;
12247 const void *data;
12248 int cased;
12249
12250 if (PyUnicode_READY(self) == -1)
12251 return NULL;
12252 length = PyUnicode_GET_LENGTH(self);
12253 kind = PyUnicode_KIND(self);
12254 data = PyUnicode_DATA(self);
12255
12256 /* Shortcut for single character strings */
12257 if (length == 1)
12258 return PyBool_FromLong(
12259 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
12260
12261 /* Special case for empty strings */
12262 if (length == 0)
12263 Py_RETURN_FALSE;
12264
12265 cased = 0;
12266 for (i = 0; i < length; i++) {
12267 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12268
12269 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
12270 Py_RETURN_FALSE;
12271 else if (!cased && Py_UNICODE_ISLOWER(ch))
12272 cased = 1;
12273 }
12274 return PyBool_FromLong(cased);
12275}
12276
12277/*[clinic input]
12278str.isupper as unicode_isupper
12279
12280Return True if the string is an uppercase string, False otherwise.
12281
12282A string is uppercase if all cased characters in the string are uppercase and
12283there is at least one cased character in the string.
12284[clinic start generated code]*/
12285
12286static PyObject *
12287unicode_isupper_impl(PyObject *self)
12288/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
12289{
12290 Py_ssize_t i, length;
12291 int kind;
12292 const void *data;
12293 int cased;
12294
12295 if (PyUnicode_READY(self) == -1)
12296 return NULL;
12297 length = PyUnicode_GET_LENGTH(self);
12298 kind = PyUnicode_KIND(self);
12299 data = PyUnicode_DATA(self);
12300
12301 /* Shortcut for single character strings */
12302 if (length == 1)
12303 return PyBool_FromLong(
12304 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
12305
12306 /* Special case for empty strings */
12307 if (length == 0)
12308 Py_RETURN_FALSE;
12309
12310 cased = 0;
12311 for (i = 0; i < length; i++) {
12312 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12313
12314 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12315 Py_RETURN_FALSE;
12316 else if (!cased && Py_UNICODE_ISUPPER(ch))
12317 cased = 1;
12318 }
12319 return PyBool_FromLong(cased);
12320}
12321
12322/*[clinic input]
12323str.istitle as unicode_istitle
12324
12325Return True if the string is a title-cased string, False otherwise.
12326
12327In a title-cased string, upper- and title-case characters may only
12328follow uncased characters and lowercase characters only cased ones.
12329[clinic start generated code]*/
12330
12331static PyObject *
12332unicode_istitle_impl(PyObject *self)
12333/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12334{
12335 Py_ssize_t i, length;
12336 int kind;
12337 const void *data;
12338 int cased, previous_is_cased;
12339
12340 if (PyUnicode_READY(self) == -1)
12341 return NULL;
12342 length = PyUnicode_GET_LENGTH(self);
12343 kind = PyUnicode_KIND(self);
12344 data = PyUnicode_DATA(self);
12345
12346 /* Shortcut for single character strings */
12347 if (length == 1) {
12348 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12349 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12350 (Py_UNICODE_ISUPPER(ch) != 0));
12351 }
12352
12353 /* Special case for empty strings */
12354 if (length == 0)
12355 Py_RETURN_FALSE;
12356
12357 cased = 0;
12358 previous_is_cased = 0;
12359 for (i = 0; i < length; i++) {
12360 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12361
12362 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12363 if (previous_is_cased)
12364 Py_RETURN_FALSE;
12365 previous_is_cased = 1;
12366 cased = 1;
12367 }
12368 else if (Py_UNICODE_ISLOWER(ch)) {
12369 if (!previous_is_cased)
12370 Py_RETURN_FALSE;
12371 previous_is_cased = 1;
12372 cased = 1;
12373 }
12374 else
12375 previous_is_cased = 0;
12376 }
12377 return PyBool_FromLong(cased);
12378}
12379
12380/*[clinic input]
12381str.isspace as unicode_isspace
12382
12383Return True if the string is a whitespace string, False otherwise.
12384
12385A string is whitespace if all characters in the string are whitespace and there
12386is at least one character in the string.
12387[clinic start generated code]*/
12388
12389static PyObject *
12390unicode_isspace_impl(PyObject *self)
12391/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12392{
12393 Py_ssize_t i, length;
12394 int kind;
12395 const void *data;
12396
12397 if (PyUnicode_READY(self) == -1)
12398 return NULL;
12399 length = PyUnicode_GET_LENGTH(self);
12400 kind = PyUnicode_KIND(self);
12401 data = PyUnicode_DATA(self);
12402
12403 /* Shortcut for single character strings */
12404 if (length == 1)
12405 return PyBool_FromLong(
12406 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12407
12408 /* Special case for empty strings */
12409 if (length == 0)
12410 Py_RETURN_FALSE;
12411
12412 for (i = 0; i < length; i++) {
12413 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12414 if (!Py_UNICODE_ISSPACE(ch))
12415 Py_RETURN_FALSE;
12416 }
12417 Py_RETURN_TRUE;
12418}
12419
12420/*[clinic input]
12421str.isalpha as unicode_isalpha
12422
12423Return True if the string is an alphabetic string, False otherwise.
12424
12425A string is alphabetic if all characters in the string are alphabetic and there
12426is at least one character in the string.
12427[clinic start generated code]*/
12428
12429static PyObject *
12430unicode_isalpha_impl(PyObject *self)
12431/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12432{
12433 Py_ssize_t i, length;
12434 int kind;
12435 const void *data;
12436
12437 if (PyUnicode_READY(self) == -1)
12438 return NULL;
12439 length = PyUnicode_GET_LENGTH(self);
12440 kind = PyUnicode_KIND(self);
12441 data = PyUnicode_DATA(self);
12442
12443 /* Shortcut for single character strings */
12444 if (length == 1)
12445 return PyBool_FromLong(
12446 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12447
12448 /* Special case for empty strings */
12449 if (length == 0)
12450 Py_RETURN_FALSE;
12451
12452 for (i = 0; i < length; i++) {
12453 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12454 Py_RETURN_FALSE;
12455 }
12456 Py_RETURN_TRUE;
12457}
12458
12459/*[clinic input]
12460str.isalnum as unicode_isalnum
12461
12462Return True if the string is an alpha-numeric string, False otherwise.
12463
12464A string is alpha-numeric if all characters in the string are alpha-numeric and
12465there is at least one character in the string.
12466[clinic start generated code]*/
12467
12468static PyObject *
12469unicode_isalnum_impl(PyObject *self)
12470/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12471{
12472 int kind;
12473 const void *data;
12474 Py_ssize_t len, i;
12475
12476 if (PyUnicode_READY(self) == -1)
12477 return NULL;
12478
12479 kind = PyUnicode_KIND(self);
12480 data = PyUnicode_DATA(self);
12481 len = PyUnicode_GET_LENGTH(self);
12482
12483 /* Shortcut for single character strings */
12484 if (len == 1) {
12485 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12486 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12487 }
12488
12489 /* Special case for empty strings */
12490 if (len == 0)
12491 Py_RETURN_FALSE;
12492
12493 for (i = 0; i < len; i++) {
12494 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12495 if (!Py_UNICODE_ISALNUM(ch))
12496 Py_RETURN_FALSE;
12497 }
12498 Py_RETURN_TRUE;
12499}
12500
12501/*[clinic input]
12502str.isdecimal as unicode_isdecimal
12503
12504Return True if the string is a decimal string, False otherwise.
12505
12506A string is a decimal string if all characters in the string are decimal and
12507there is at least one character in the string.
12508[clinic start generated code]*/
12509
12510static PyObject *
12511unicode_isdecimal_impl(PyObject *self)
12512/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12513{
12514 Py_ssize_t i, length;
12515 int kind;
12516 const void *data;
12517
12518 if (PyUnicode_READY(self) == -1)
12519 return NULL;
12520 length = PyUnicode_GET_LENGTH(self);
12521 kind = PyUnicode_KIND(self);
12522 data = PyUnicode_DATA(self);
12523
12524 /* Shortcut for single character strings */
12525 if (length == 1)
12526 return PyBool_FromLong(
12527 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12528
12529 /* Special case for empty strings */
12530 if (length == 0)
12531 Py_RETURN_FALSE;
12532
12533 for (i = 0; i < length; i++) {
12534 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12535 Py_RETURN_FALSE;
12536 }
12537 Py_RETURN_TRUE;
12538}
12539
12540/*[clinic input]
12541str.isdigit as unicode_isdigit
12542
12543Return True if the string is a digit string, False otherwise.
12544
12545A string is a digit string if all characters in the string are digits and there
12546is at least one character in the string.
12547[clinic start generated code]*/
12548
12549static PyObject *
12550unicode_isdigit_impl(PyObject *self)
12551/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12552{
12553 Py_ssize_t i, length;
12554 int kind;
12555 const void *data;
12556
12557 if (PyUnicode_READY(self) == -1)
12558 return NULL;
12559 length = PyUnicode_GET_LENGTH(self);
12560 kind = PyUnicode_KIND(self);
12561 data = PyUnicode_DATA(self);
12562
12563 /* Shortcut for single character strings */
12564 if (length == 1) {
12565 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12566 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12567 }
12568
12569 /* Special case for empty strings */
12570 if (length == 0)
12571 Py_RETURN_FALSE;
12572
12573 for (i = 0; i < length; i++) {
12574 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12575 Py_RETURN_FALSE;
12576 }
12577 Py_RETURN_TRUE;
12578}
12579
12580/*[clinic input]
12581str.isnumeric as unicode_isnumeric
12582
12583Return True if the string is a numeric string, False otherwise.
12584
12585A string is numeric if all characters in the string are numeric and there is at
12586least one character in the string.
12587[clinic start generated code]*/
12588
12589static PyObject *
12590unicode_isnumeric_impl(PyObject *self)
12591/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12592{
12593 Py_ssize_t i, length;
12594 int kind;
12595 const void *data;
12596
12597 if (PyUnicode_READY(self) == -1)
12598 return NULL;
12599 length = PyUnicode_GET_LENGTH(self);
12600 kind = PyUnicode_KIND(self);
12601 data = PyUnicode_DATA(self);
12602
12603 /* Shortcut for single character strings */
12604 if (length == 1)
12605 return PyBool_FromLong(
12606 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12607
12608 /* Special case for empty strings */
12609 if (length == 0)
12610 Py_RETURN_FALSE;
12611
12612 for (i = 0; i < length; i++) {
12613 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12614 Py_RETURN_FALSE;
12615 }
12616 Py_RETURN_TRUE;
12617}
12618
12619Py_ssize_t
12620_PyUnicode_ScanIdentifier(PyObject *self)
12621{
12622 Py_ssize_t i;
12623 if (PyUnicode_READY(self) == -1)
12624 return -1;
12625
12626 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12627 if (len == 0) {
12628 /* an empty string is not a valid identifier */
12629 return 0;
12630 }
12631
12632 int kind = PyUnicode_KIND(self);
12633 const void *data = PyUnicode_DATA(self);
12634 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12635 /* PEP 3131 says that the first character must be in
12636 XID_Start and subsequent characters in XID_Continue,
12637 and for the ASCII range, the 2.x rules apply (i.e
12638 start with letters and underscore, continue with
12639 letters, digits, underscore). However, given the current
12640 definition of XID_Start and XID_Continue, it is sufficient
12641 to check just for these, except that _ must be allowed
12642 as starting an identifier. */
12643 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12644 return 0;
12645 }
12646
12647 for (i = 1; i < len; i++) {
12648 ch = PyUnicode_READ(kind, data, i);
12649 if (!_PyUnicode_IsXidContinue(ch)) {
12650 return i;
12651 }
12652 }
12653 return i;
12654}
12655
12656int
12657PyUnicode_IsIdentifier(PyObject *self)
12658{
12659 if (PyUnicode_IS_READY(self)) {
12660 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12661 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12662 /* an empty string is not a valid identifier */
12663 return len && i == len;
12664 }
12665 else {
12666_Py_COMP_DIAG_PUSH
12667_Py_COMP_DIAG_IGNORE_DEPR_DECLS
12668 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12669 if (len == 0) {
12670 /* an empty string is not a valid identifier */
12671 return 0;
12672 }
12673
12674 const wchar_t *wstr = _PyUnicode_WSTR(self);
12675 Py_UCS4 ch = wstr[i++];
12676#if SIZEOF_WCHAR_T == 2
12677 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12678 && i < len
12679 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12680 {
12681 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12682 i++;
12683 }
12684#endif
12685 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12686 return 0;
12687 }
12688
12689 while (i < len) {
12690 ch = wstr[i++];
12691#if SIZEOF_WCHAR_T == 2
12692 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12693 && i < len
12694 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12695 {
12696 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12697 i++;
12698 }
12699#endif
12700 if (!_PyUnicode_IsXidContinue(ch)) {
12701 return 0;
12702 }
12703 }
12704 return 1;
12705_Py_COMP_DIAG_POP
12706 }
12707}
12708
12709/*[clinic input]
12710str.isidentifier as unicode_isidentifier
12711
12712Return True if the string is a valid Python identifier, False otherwise.
12713
12714Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12715such as "def" or "class".
12716[clinic start generated code]*/
12717
12718static PyObject *
12719unicode_isidentifier_impl(PyObject *self)
12720/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12721{
12722 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12723}
12724
12725/*[clinic input]
12726str.isprintable as unicode_isprintable
12727
12728Return True if the string is printable, False otherwise.
12729
12730A string is printable if all of its characters are considered printable in
12731repr() or if it is empty.
12732[clinic start generated code]*/
12733
12734static PyObject *
12735unicode_isprintable_impl(PyObject *self)
12736/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12737{
12738 Py_ssize_t i, length;
12739 int kind;
12740 const void *data;
12741
12742 if (PyUnicode_READY(self) == -1)
12743 return NULL;
12744 length = PyUnicode_GET_LENGTH(self);
12745 kind = PyUnicode_KIND(self);
12746 data = PyUnicode_DATA(self);
12747
12748 /* Shortcut for single character strings */
12749 if (length == 1)
12750 return PyBool_FromLong(
12751 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12752
12753 for (i = 0; i < length; i++) {
12754 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12755 Py_RETURN_FALSE;
12756 }
12757 }
12758 Py_RETURN_TRUE;
12759}
12760
12761/*[clinic input]
12762str.join as unicode_join
12763
12764 iterable: object
12765 /
12766
12767Concatenate any number of strings.
12768
12769The string whose method is called is inserted in between each given string.
12770The result is returned as a new string.
12771
12772Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12773[clinic start generated code]*/
12774
12775static PyObject *
12776unicode_join(PyObject *self, PyObject *iterable)
12777/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12778{
12779 return PyUnicode_Join(self, iterable);
12780}
12781
12782static Py_ssize_t
12783unicode_length(PyObject *self)
12784{
12785 if (PyUnicode_READY(self) == -1)
12786 return -1;
12787 return PyUnicode_GET_LENGTH(self);
12788}
12789
12790/*[clinic input]
12791str.ljust as unicode_ljust
12792
12793 width: Py_ssize_t
12794 fillchar: Py_UCS4 = ' '
12795 /
12796
12797Return a left-justified string of length width.
12798
12799Padding is done using the specified fill character (default is a space).
12800[clinic start generated code]*/
12801
12802static PyObject *
12803unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12804/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12805{
12806 if (PyUnicode_READY(self) == -1)
12807 return NULL;
12808
12809 if (PyUnicode_GET_LENGTH(self) >= width)
12810 return unicode_result_unchanged(self);
12811
12812 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12813}
12814
12815/*[clinic input]
12816str.lower as unicode_lower
12817
12818Return a copy of the string converted to lowercase.
12819[clinic start generated code]*/
12820
12821static PyObject *
12822unicode_lower_impl(PyObject *self)
12823/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12824{
12825 if (PyUnicode_READY(self) == -1)
12826 return NULL;
12827 if (PyUnicode_IS_ASCII(self))
12828 return ascii_upper_or_lower(self, 1);
12829 return case_operation(self, do_lower);
12830}
12831
/* Selector values for the strip family.  They double as indexes into
   stripfuncnames below, so both must be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the characters and
   the pointers are const: the table is never modified at run time. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip selector i. */
#define STRIPNAME(i) (stripfuncnames[i])
12840
12841/* externally visible for str.strip(unicode) */
12842PyObject *
12843_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12844{
12845 const void *data;
12846 int kind;
12847 Py_ssize_t i, j, len;
12848 BLOOM_MASK sepmask;
12849 Py_ssize_t seplen;
12850
12851 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12852 return NULL;
12853
12854 kind = PyUnicode_KIND(self);
12855 data = PyUnicode_DATA(self);
12856 len = PyUnicode_GET_LENGTH(self);
12857 seplen = PyUnicode_GET_LENGTH(sepobj);
12858 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12859 PyUnicode_DATA(sepobj),
12860 seplen);
12861
12862 i = 0;
12863 if (striptype != RIGHTSTRIP) {
12864 while (i < len) {
12865 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12866 if (!BLOOM(sepmask, ch))
12867 break;
12868 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12869 break;
12870 i++;
12871 }
12872 }
12873
12874 j = len;
12875 if (striptype != LEFTSTRIP) {
12876 j--;
12877 while (j >= i) {
12878 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12879 if (!BLOOM(sepmask, ch))
12880 break;
12881 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12882 break;
12883 j--;
12884 }
12885
12886 j++;
12887 }
12888
12889 return PyUnicode_Substring(self, i, j);
12890}
12891
12892PyObject*
12893PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12894{
12895 const unsigned char *data;
12896 int kind;
12897 Py_ssize_t length;
12898
12899 if (PyUnicode_READY(self) == -1)
12900 return NULL;
12901
12902 length = PyUnicode_GET_LENGTH(self);
12903 end = Py_MIN(end, length);
12904
12905 if (start == 0 && end == length)
12906 return unicode_result_unchanged(self);
12907
12908 if (start < 0 || end < 0) {
12909 PyErr_SetString(PyExc_IndexError, "string index out of range");
12910 return NULL;
12911 }
12912 if (start >= length || end < start)
12913 _Py_RETURN_UNICODE_EMPTY();
12914
12915 length = end - start;
12916 if (PyUnicode_IS_ASCII(self)) {
12917 data = PyUnicode_1BYTE_DATA(self);
12918 return _PyUnicode_FromASCII((const char*)(data + start), length);
12919 }
12920 else {
12921 kind = PyUnicode_KIND(self);
12922 data = PyUnicode_1BYTE_DATA(self);
12923 return PyUnicode_FromKindAndData(kind,
12924 data + kind * start,
12925 length);
12926 }
12927}
12928
12929static PyObject *
12930do_strip(PyObject *self, int striptype)
12931{
12932 Py_ssize_t len, i, j;
12933
12934 if (PyUnicode_READY(self) == -1)
12935 return NULL;
12936
12937 len = PyUnicode_GET_LENGTH(self);
12938
12939 if (PyUnicode_IS_ASCII(self)) {
12940 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12941
12942 i = 0;
12943 if (striptype != RIGHTSTRIP) {
12944 while (i < len) {
12945 Py_UCS1 ch = data[i];
12946 if (!_Py_ascii_whitespace[ch])
12947 break;
12948 i++;
12949 }
12950 }
12951
12952 j = len;
12953 if (striptype != LEFTSTRIP) {
12954 j--;
12955 while (j >= i) {
12956 Py_UCS1 ch = data[j];
12957 if (!_Py_ascii_whitespace[ch])
12958 break;
12959 j--;
12960 }
12961 j++;
12962 }
12963 }
12964 else {
12965 int kind = PyUnicode_KIND(self);
12966 const void *data = PyUnicode_DATA(self);
12967
12968 i = 0;
12969 if (striptype != RIGHTSTRIP) {
12970 while (i < len) {
12971 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12972 if (!Py_UNICODE_ISSPACE(ch))
12973 break;
12974 i++;
12975 }
12976 }
12977
12978 j = len;
12979 if (striptype != LEFTSTRIP) {
12980 j--;
12981 while (j >= i) {
12982 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12983 if (!Py_UNICODE_ISSPACE(ch))
12984 break;
12985 j--;
12986 }
12987 j++;
12988 }
12989 }
12990
12991 return PyUnicode_Substring(self, i, j);
12992}
12993
12994
12995static PyObject *
12996do_argstrip(PyObject *self, int striptype, PyObject *sep)
12997{
12998 if (sep != Py_None) {
12999 if (PyUnicode_Check(sep))
13000 return _PyUnicode_XStrip(self, striptype, sep);
13001 else {
13002 PyErr_Format(PyExc_TypeError,
13003 "%s arg must be None or str",
13004 STRIPNAME(striptype));
13005 return NULL;
13006 }
13007 }
13008
13009 return do_strip(self, striptype);
13010}
13011
13012
13013/*[clinic input]
13014str.strip as unicode_strip
13015
13016 chars: object = None
13017 /
13018
13019Return a copy of the string with leading and trailing whitespace removed.
13020
13021If chars is given and not None, remove characters in chars instead.
13022[clinic start generated code]*/
13023
13024static PyObject *
13025unicode_strip_impl(PyObject *self, PyObject *chars)
13026/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
13027{
13028 return do_argstrip(self, BOTHSTRIP, chars);
13029}
13030
13031
13032/*[clinic input]
13033str.lstrip as unicode_lstrip
13034
13035 chars: object = None
13036 /
13037
13038Return a copy of the string with leading whitespace removed.
13039
13040If chars is given and not None, remove characters in chars instead.
13041[clinic start generated code]*/
13042
13043static PyObject *
13044unicode_lstrip_impl(PyObject *self, PyObject *chars)
13045/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
13046{
13047 return do_argstrip(self, LEFTSTRIP, chars);
13048}
13049
13050
13051/*[clinic input]
13052str.rstrip as unicode_rstrip
13053
13054 chars: object = None
13055 /
13056
13057Return a copy of the string with trailing whitespace removed.
13058
13059If chars is given and not None, remove characters in chars instead.
13060[clinic start generated code]*/
13061
13062static PyObject *
13063unicode_rstrip_impl(PyObject *self, PyObject *chars)
13064/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
13065{
13066 return do_argstrip(self, RIGHTSTRIP, chars);
13067}
13068
13069
13070static PyObject*
13071unicode_repeat(PyObject *str, Py_ssize_t len)
13072{
13073 PyObject *u;
13074 Py_ssize_t nchars, n;
13075
13076 if (len < 1)
13077 _Py_RETURN_UNICODE_EMPTY();
13078
13079 /* no repeat, return original string */
13080 if (len == 1)
13081 return unicode_result_unchanged(str);
13082
13083 if (PyUnicode_READY(str) == -1)
13084 return NULL;
13085
13086 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
13087 PyErr_SetString(PyExc_OverflowError,
13088 "repeated string is too long");
13089 return NULL;
13090 }
13091 nchars = len * PyUnicode_GET_LENGTH(str);
13092
13093 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
13094 if (!u)
13095 return NULL;
13096 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
13097
13098 if (PyUnicode_GET_LENGTH(str) == 1) {
13099 int kind = PyUnicode_KIND(str);
13100 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
13101 if (kind == PyUnicode_1BYTE_KIND) {
13102 void *to = PyUnicode_DATA(u);
13103 memset(to, (unsigned char)fill_char, len);
13104 }
13105 else if (kind == PyUnicode_2BYTE_KIND) {
13106 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
13107 for (n = 0; n < len; ++n)
13108 ucs2[n] = fill_char;
13109 } else {
13110 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13111 assert(kind == PyUnicode_4BYTE_KIND);
13112 for (n = 0; n < len; ++n)
13113 ucs4[n] = fill_char;
13114 }
13115 }
13116 else {
13117 /* number of characters copied this far */
13118 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
13119 Py_ssize_t char_size = PyUnicode_KIND(str);
13120 char *to = (char *) PyUnicode_DATA(u);
13121 memcpy(to, PyUnicode_DATA(str),
13122 PyUnicode_GET_LENGTH(str) * char_size);
13123 while (done < nchars) {
13124 n = (done <= nchars-done) ? done : nchars-done;
13125 memcpy(to + (done * char_size), to, n * char_size);
13126 done += n;
13127 }
13128 }
13129
13130 assert(_PyUnicode_CheckConsistency(u, 1));
13131 return u;
13132}
13133
13134PyObject *
13135PyUnicode_Replace(PyObject *str,
13136 PyObject *substr,
13137 PyObject *replstr,
13138 Py_ssize_t maxcount)
13139{
13140 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13141 ensure_unicode(replstr) < 0)
13142 return NULL;
13143 return replace(str, substr, replstr, maxcount);
13144}
13145
13146/*[clinic input]
13147str.replace as unicode_replace
13148
13149 old: unicode
13150 new: unicode
13151 count: Py_ssize_t = -1
13152 Maximum number of occurrences to replace.
13153 -1 (the default value) means replace all occurrences.
13154 /
13155
13156Return a copy with all occurrences of substring old replaced by new.
13157
13158If the optional argument count is given, only the first count occurrences are
13159replaced.
13160[clinic start generated code]*/
13161
13162static PyObject *
13163unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13164 Py_ssize_t count)
13165/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
13166{
13167 if (PyUnicode_READY(self) == -1)
13168 return NULL;
13169 return replace(self, old, new, count);
13170}
13171
13172/*[clinic input]
13173str.removeprefix as unicode_removeprefix
13174
13175 prefix: unicode
13176 /
13177
13178Return a str with the given prefix string removed if present.
13179
13180If the string starts with the prefix string, return string[len(prefix):].
13181Otherwise, return a copy of the original string.
13182[clinic start generated code]*/
13183
13184static PyObject *
13185unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13186/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13187{
13188 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13189 if (match == -1) {
13190 return NULL;
13191 }
13192 if (match) {
13193 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13194 PyUnicode_GET_LENGTH(self));
13195 }
13196 return unicode_result_unchanged(self);
13197}
13198
13199/*[clinic input]
13200str.removesuffix as unicode_removesuffix
13201
13202 suffix: unicode
13203 /
13204
13205Return a str with the given suffix string removed if present.
13206
13207If the string ends with the suffix string and that suffix is not empty,
13208return string[:-len(suffix)]. Otherwise, return a copy of the original
13209string.
13210[clinic start generated code]*/
13211
13212static PyObject *
13213unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13214/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13215{
13216 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13217 if (match == -1) {
13218 return NULL;
13219 }
13220 if (match) {
13221 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13222 - PyUnicode_GET_LENGTH(suffix));
13223 }
13224 return unicode_result_unchanged(self);
13225}
13226
13227static PyObject *
13228unicode_repr(PyObject *unicode)
13229{
13230 PyObject *repr;
13231 Py_ssize_t isize;
13232 Py_ssize_t osize, squote, dquote, i, o;
13233 Py_UCS4 max, quote;
13234 int ikind, okind, unchanged;
13235 const void *idata;
13236 void *odata;
13237
13238 if (PyUnicode_READY(unicode) == -1)
13239 return NULL;
13240
13241 isize = PyUnicode_GET_LENGTH(unicode);
13242 idata = PyUnicode_DATA(unicode);
13243
13244 /* Compute length of output, quote characters, and
13245 maximum character */
13246 osize = 0;
13247 max = 127;
13248 squote = dquote = 0;
13249 ikind = PyUnicode_KIND(unicode);
13250 for (i = 0; i < isize; i++) {
13251 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13252 Py_ssize_t incr = 1;
13253 switch (ch) {
13254 case '\'': squote++; break;
13255 case '"': dquote++; break;
13256 case '\\': case '\t': case '\r': case '\n':
13257 incr = 2;
13258 break;
13259 default:
13260 /* Fast-path ASCII */
13261 if (ch < ' ' || ch == 0x7f)
13262 incr = 4; /* \xHH */
13263 else if (ch < 0x7f)
13264 ;
13265 else if (Py_UNICODE_ISPRINTABLE(ch))
13266 max = ch > max ? ch : max;
13267 else if (ch < 0x100)
13268 incr = 4; /* \xHH */
13269 else if (ch < 0x10000)
13270 incr = 6; /* \uHHHH */
13271 else
13272 incr = 10; /* \uHHHHHHHH */
13273 }
13274 if (osize > PY_SSIZE_T_MAX - incr) {
13275 PyErr_SetString(PyExc_OverflowError,
13276 "string is too long to generate repr");
13277 return NULL;
13278 }
13279 osize += incr;
13280 }
13281
13282 quote = '\'';
13283 unchanged = (osize == isize);
13284 if (squote) {
13285 unchanged = 0;
13286 if (dquote)
13287 /* Both squote and dquote present. Use squote,
13288 and escape them */
13289 osize += squote;
13290 else
13291 quote = '"';
13292 }
13293 osize += 2; /* quotes */
13294
13295 repr = PyUnicode_New(osize, max);
13296 if (repr == NULL)
13297 return NULL;
13298 okind = PyUnicode_KIND(repr);
13299 odata = PyUnicode_DATA(repr);
13300
13301 PyUnicode_WRITE(okind, odata, 0, quote);
13302 PyUnicode_WRITE(okind, odata, osize-1, quote);
13303 if (unchanged) {
13304 _PyUnicode_FastCopyCharacters(repr, 1,
13305 unicode, 0,
13306 isize);
13307 }
13308 else {
13309 for (i = 0, o = 1; i < isize; i++) {
13310 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13311
13312 /* Escape quotes and backslashes */
13313 if ((ch == quote) || (ch == '\\')) {
13314 PyUnicode_WRITE(okind, odata, o++, '\\');
13315 PyUnicode_WRITE(okind, odata, o++, ch);
13316 continue;
13317 }
13318
13319 /* Map special whitespace to '\t', \n', '\r' */
13320 if (ch == '\t') {
13321 PyUnicode_WRITE(okind, odata, o++, '\\');
13322 PyUnicode_WRITE(okind, odata, o++, 't');
13323 }
13324 else if (ch == '\n') {
13325 PyUnicode_WRITE(okind, odata, o++, '\\');
13326 PyUnicode_WRITE(okind, odata, o++, 'n');
13327 }
13328 else if (ch == '\r') {
13329 PyUnicode_WRITE(okind, odata, o++, '\\');
13330 PyUnicode_WRITE(okind, odata, o++, 'r');
13331 }
13332
13333 /* Map non-printable US ASCII to '\xhh' */
13334 else if (ch < ' ' || ch == 0x7F) {
13335 PyUnicode_WRITE(okind, odata, o++, '\\');
13336 PyUnicode_WRITE(okind, odata, o++, 'x');
13337 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13338 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13339 }
13340
13341 /* Copy ASCII characters as-is */
13342 else if (ch < 0x7F) {
13343 PyUnicode_WRITE(okind, odata, o++, ch);
13344 }
13345
13346 /* Non-ASCII characters */
13347 else {
13348 /* Map Unicode whitespace and control characters
13349 (categories Z* and C* except ASCII space)
13350 */
13351 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13352 PyUnicode_WRITE(okind, odata, o++, '\\');
13353 /* Map 8-bit characters to '\xhh' */
13354 if (ch <= 0xff) {
13355 PyUnicode_WRITE(okind, odata, o++, 'x');
13356 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13357 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13358 }
13359 /* Map 16-bit characters to '\uxxxx' */
13360 else if (ch <= 0xffff) {
13361 PyUnicode_WRITE(okind, odata, o++, 'u');
13362 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13363 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13364 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13365 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13366 }
13367 /* Map 21-bit characters to '\U00xxxxxx' */
13368 else {
13369 PyUnicode_WRITE(okind, odata, o++, 'U');
13370 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13371 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13372 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13373 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13374 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13375 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13376 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13377 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13378 }
13379 }
13380 /* Copy characters as-is */
13381 else {
13382 PyUnicode_WRITE(okind, odata, o++, ch);
13383 }
13384 }
13385 }
13386 }
13387 /* Closing quote already added at the beginning */
13388 assert(_PyUnicode_CheckConsistency(repr, 1));
13389 return repr;
13390}
13391
13392PyDoc_STRVAR(rfind__doc__,
13393 "S.rfind(sub[, start[, end]]) -> int\n\
13394\n\
13395Return the highest index in S where substring sub is found,\n\
13396such that sub is contained within S[start:end]. Optional\n\
13397arguments start and end are interpreted as in slice notation.\n\
13398\n\
13399Return -1 on failure.");
13400
13401static PyObject *
13402unicode_rfind(PyObject *self, PyObject *args)
13403{
13404 /* initialize variables to prevent gcc warning */
13405 PyObject *substring = NULL;
13406 Py_ssize_t start = 0;
13407 Py_ssize_t end = 0;
13408 Py_ssize_t result;
13409
13410 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13411 return NULL;
13412
13413 if (PyUnicode_READY(self) == -1)
13414 return NULL;
13415
13416 result = any_find_slice(self, substring, start, end, -1);
13417
13418 if (result == -2)
13419 return NULL;
13420
13421 return PyLong_FromSsize_t(result);
13422}
13423
13424PyDoc_STRVAR(rindex__doc__,
13425 "S.rindex(sub[, start[, end]]) -> int\n\
13426\n\
13427Return the highest index in S where substring sub is found,\n\
13428such that sub is contained within S[start:end]. Optional\n\
13429arguments start and end are interpreted as in slice notation.\n\
13430\n\
13431Raises ValueError when the substring is not found.");
13432
13433static PyObject *
13434unicode_rindex(PyObject *self, PyObject *args)
13435{
13436 /* initialize variables to prevent gcc warning */
13437 PyObject *substring = NULL;
13438 Py_ssize_t start = 0;
13439 Py_ssize_t end = 0;
13440 Py_ssize_t result;
13441
13442 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13443 return NULL;
13444
13445 if (PyUnicode_READY(self) == -1)
13446 return NULL;
13447
13448 result = any_find_slice(self, substring, start, end, -1);
13449
13450 if (result == -2)
13451 return NULL;
13452
13453 if (result < 0) {
13454 PyErr_SetString(PyExc_ValueError, "substring not found");
13455 return NULL;
13456 }
13457
13458 return PyLong_FromSsize_t(result);
13459}
13460
13461/*[clinic input]
13462str.rjust as unicode_rjust
13463
13464 width: Py_ssize_t
13465 fillchar: Py_UCS4 = ' '
13466 /
13467
13468Return a right-justified string of length width.
13469
13470Padding is done using the specified fill character (default is a space).
13471[clinic start generated code]*/
13472
13473static PyObject *
13474unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13475/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13476{
13477 if (PyUnicode_READY(self) == -1)
13478 return NULL;
13479
13480 if (PyUnicode_GET_LENGTH(self) >= width)
13481 return unicode_result_unchanged(self);
13482
13483 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13484}
13485
13486PyObject *
13487PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13488{
13489 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13490 return NULL;
13491
13492 return split(s, sep, maxsplit);
13493}
13494
13495/*[clinic input]
13496str.split as unicode_split
13497
13498 sep: object = None
13499 The separator used to split the string.
13500
13501 When set to None (the default value), will split on any whitespace
13502 character (including \\n \\r \\t \\f and spaces) and will discard
13503 empty strings from the result.
13504 maxsplit: Py_ssize_t = -1
13505 Maximum number of splits (starting from the left).
13506 -1 (the default value) means no limit.
13507
13508Return a list of the substrings in the string, using sep as the separator string.
13509
13510Note, str.split() is mainly useful for data that has been intentionally
13511delimited. With natural text that includes punctuation, consider using
13512the regular expression module.
13513
13514[clinic start generated code]*/
13515
13516static PyObject *
13517unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13518/*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13519{
13520 if (sep == Py_None)
13521 return split(self, NULL, maxsplit);
13522 if (PyUnicode_Check(sep))
13523 return split(self, sep, maxsplit);
13524
13525 PyErr_Format(PyExc_TypeError,
13526 "must be str or None, not %.100s",
13527 Py_TYPE(sep)->tp_name);
13528 return NULL;
13529}
13530
13531PyObject *
13532PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13533{
13534 PyObject* out;
13535 int kind1, kind2;
13536 const void *buf1, *buf2;
13537 Py_ssize_t len1, len2;
13538
13539 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13540 return NULL;
13541
13542 kind1 = PyUnicode_KIND(str_obj);
13543 kind2 = PyUnicode_KIND(sep_obj);
13544 len1 = PyUnicode_GET_LENGTH(str_obj);
13545 len2 = PyUnicode_GET_LENGTH(sep_obj);
13546 if (kind1 < kind2 || len1 < len2) {
13547 PyObject *empty = unicode_get_empty(); // Borrowed reference
13548 return PyTuple_Pack(3, str_obj, empty, empty);
13549 }
13550 buf1 = PyUnicode_DATA(str_obj);
13551 buf2 = PyUnicode_DATA(sep_obj);
13552 if (kind2 != kind1) {
13553 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13554 if (!buf2)
13555 return NULL;
13556 }
13557
13558 switch (kind1) {
13559 case PyUnicode_1BYTE_KIND:
13560 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13561 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13562 else
13563 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13564 break;
13565 case PyUnicode_2BYTE_KIND:
13566 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13567 break;
13568 case PyUnicode_4BYTE_KIND:
13569 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13570 break;
13571 default:
13572 Py_UNREACHABLE();
13573 }
13574
13575 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13576 if (kind2 != kind1)
13577 PyMem_Free((void *)buf2);
13578
13579 return out;
13580}
13581
13582
13583PyObject *
13584PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13585{
13586 PyObject* out;
13587 int kind1, kind2;
13588 const void *buf1, *buf2;
13589 Py_ssize_t len1, len2;
13590
13591 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13592 return NULL;
13593
13594 kind1 = PyUnicode_KIND(str_obj);
13595 kind2 = PyUnicode_KIND(sep_obj);
13596 len1 = PyUnicode_GET_LENGTH(str_obj);
13597 len2 = PyUnicode_GET_LENGTH(sep_obj);
13598 if (kind1 < kind2 || len1 < len2) {
13599 PyObject *empty = unicode_get_empty(); // Borrowed reference
13600 return PyTuple_Pack(3, empty, empty, str_obj);
13601 }
13602 buf1 = PyUnicode_DATA(str_obj);
13603 buf2 = PyUnicode_DATA(sep_obj);
13604 if (kind2 != kind1) {
13605 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13606 if (!buf2)
13607 return NULL;
13608 }
13609
13610 switch (kind1) {
13611 case PyUnicode_1BYTE_KIND:
13612 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13613 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13614 else
13615 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13616 break;
13617 case PyUnicode_2BYTE_KIND:
13618 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13619 break;
13620 case PyUnicode_4BYTE_KIND:
13621 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13622 break;
13623 default:
13624 Py_UNREACHABLE();
13625 }
13626
13627 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13628 if (kind2 != kind1)
13629 PyMem_Free((void *)buf2);
13630
13631 return out;
13632}
13633
13634/*[clinic input]
13635str.partition as unicode_partition
13636
13637 sep: object
13638 /
13639
13640Partition the string into three parts using the given separator.
13641
13642This will search for the separator in the string. If the separator is found,
13643returns a 3-tuple containing the part before the separator, the separator
13644itself, and the part after it.
13645
13646If the separator is not found, returns a 3-tuple containing the original string
13647and two empty strings.
13648[clinic start generated code]*/
13649
13650static PyObject *
13651unicode_partition(PyObject *self, PyObject *sep)
13652/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13653{
13654 return PyUnicode_Partition(self, sep);
13655}
13656
13657/*[clinic input]
13658str.rpartition as unicode_rpartition = str.partition
13659
13660Partition the string into three parts using the given separator.
13661
13662This will search for the separator in the string, starting at the end. If
13663the separator is found, returns a 3-tuple containing the part before the
13664separator, the separator itself, and the part after it.
13665
13666If the separator is not found, returns a 3-tuple containing two empty strings
13667and the original string.
13668[clinic start generated code]*/
13669
13670static PyObject *
13671unicode_rpartition(PyObject *self, PyObject *sep)
13672/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13673{
13674 return PyUnicode_RPartition(self, sep);
13675}
13676
13677PyObject *
13678PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13679{
13680 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13681 return NULL;
13682
13683 return rsplit(s, sep, maxsplit);
13684}
13685
13686/*[clinic input]
13687str.rsplit as unicode_rsplit = str.split
13688
13689Return a list of the substrings in the string, using sep as the separator string.
13690
13691Splitting starts at the end of the string and works to the front.
13692[clinic start generated code]*/
13693
13694static PyObject *
13695unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13696/*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13697{
13698 if (sep == Py_None)
13699 return rsplit(self, NULL, maxsplit);
13700 if (PyUnicode_Check(sep))
13701 return rsplit(self, sep, maxsplit);
13702
13703 PyErr_Format(PyExc_TypeError,
13704 "must be str or None, not %.100s",
13705 Py_TYPE(sep)->tp_name);
13706 return NULL;
13707}
13708
13709/*[clinic input]
13710str.splitlines as unicode_splitlines
13711
13712 keepends: bool(accept={int}) = False
13713
13714Return a list of the lines in the string, breaking at line boundaries.
13715
13716Line breaks are not included in the resulting list unless keepends is given and
13717true.
13718[clinic start generated code]*/
13719
13720static PyObject *
13721unicode_splitlines_impl(PyObject *self, int keepends)
13722/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13723{
13724 return PyUnicode_Splitlines(self, keepends);
13725}
13726
13727static
13728PyObject *unicode_str(PyObject *self)
13729{
13730 return unicode_result_unchanged(self);
13731}
13732
13733/*[clinic input]
13734str.swapcase as unicode_swapcase
13735
13736Convert uppercase characters to lowercase and lowercase characters to uppercase.
13737[clinic start generated code]*/
13738
13739static PyObject *
13740unicode_swapcase_impl(PyObject *self)
13741/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13742{
13743 if (PyUnicode_READY(self) == -1)
13744 return NULL;
13745 return case_operation(self, do_swapcase);
13746}
13747
13748/*[clinic input]
13749
13750@staticmethod
13751str.maketrans as unicode_maketrans
13752
13753 x: object
13754
13755 y: unicode=NULL
13756
13757 z: unicode=NULL
13758
13759 /
13760
13761Return a translation table usable for str.translate().
13762
13763If there is only one argument, it must be a dictionary mapping Unicode
13764ordinals (integers) or characters to Unicode ordinals, strings or None.
13765Character keys will be then converted to ordinals.
13766If there are two arguments, they must be strings of equal length, and
13767in the resulting dictionary, each character in x will be mapped to the
13768character at the same position in y. If there is a third argument, it
13769must be a string, whose characters will be mapped to None in the result.
13770[clinic start generated code]*/
13771
13772static PyObject *
13773unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13774/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13775{
13776 PyObject *new = NULL, *key, *value;
13777 Py_ssize_t i = 0;
13778 int res;
13779
13780 new = PyDict_New();
13781 if (!new)
13782 return NULL;
13783 if (y != NULL) {
13784 int x_kind, y_kind, z_kind;
13785 const void *x_data, *y_data, *z_data;
13786
13787 /* x must be a string too, of equal length */
13788 if (!PyUnicode_Check(x)) {
13789 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13790 "be a string if there is a second argument");
13791 goto err;
13792 }
13793 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13794 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13795 "arguments must have equal length");
13796 goto err;
13797 }
13798 /* create entries for translating chars in x to those in y */
13799 x_kind = PyUnicode_KIND(x);
13800 y_kind = PyUnicode_KIND(y);
13801 x_data = PyUnicode_DATA(x);
13802 y_data = PyUnicode_DATA(y);
13803 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13804 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13805 if (!key)
13806 goto err;
13807 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13808 if (!value) {
13809 Py_DECREF(key);
13810 goto err;
13811 }
13812 res = PyDict_SetItem(new, key, value);
13813 Py_DECREF(key);
13814 Py_DECREF(value);
13815 if (res < 0)
13816 goto err;
13817 }
13818 /* create entries for deleting chars in z */
13819 if (z != NULL) {
13820 z_kind = PyUnicode_KIND(z);
13821 z_data = PyUnicode_DATA(z);
13822 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13823 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13824 if (!key)
13825 goto err;
13826 res = PyDict_SetItem(new, key, Py_None);
13827 Py_DECREF(key);
13828 if (res < 0)
13829 goto err;
13830 }
13831 }
13832 } else {
13833 int kind;
13834 const void *data;
13835
13836 /* x must be a dict */
13837 if (!PyDict_CheckExact(x)) {
13838 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13839 "to maketrans it must be a dict");
13840 goto err;
13841 }
13842 /* copy entries into the new dict, converting string keys to int keys */
13843 while (PyDict_Next(x, &i, &key, &value)) {
13844 if (PyUnicode_Check(key)) {
13845 /* convert string keys to integer keys */
13846 PyObject *newkey;
13847 if (PyUnicode_GET_LENGTH(key) != 1) {
13848 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13849 "table must be of length 1");
13850 goto err;
13851 }
13852 kind = PyUnicode_KIND(key);
13853 data = PyUnicode_DATA(key);
13854 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13855 if (!newkey)
13856 goto err;
13857 res = PyDict_SetItem(new, newkey, value);
13858 Py_DECREF(newkey);
13859 if (res < 0)
13860 goto err;
13861 } else if (PyLong_Check(key)) {
13862 /* just keep integer keys */
13863 if (PyDict_SetItem(new, key, value) < 0)
13864 goto err;
13865 } else {
13866 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13867 "be strings or integers");
13868 goto err;
13869 }
13870 }
13871 }
13872 return new;
13873 err:
13874 Py_DECREF(new);
13875 return NULL;
13876}
13877
13878/*[clinic input]
13879str.translate as unicode_translate
13880
13881 table: object
13882 Translation table, which must be a mapping of Unicode ordinals to
13883 Unicode ordinals, strings, or None.
13884 /
13885
13886Replace each character in the string using the given translation table.
13887
13888The table must implement lookup/indexing via __getitem__, for instance a
13889dictionary or list. If this operation raises LookupError, the character is
13890left untouched. Characters mapped to None are deleted.
13891[clinic start generated code]*/
13892
13893static PyObject *
13894unicode_translate(PyObject *self, PyObject *table)
13895/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13896{
13897 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13898}
13899
13900/*[clinic input]
13901str.upper as unicode_upper
13902
13903Return a copy of the string converted to uppercase.
13904[clinic start generated code]*/
13905
13906static PyObject *
13907unicode_upper_impl(PyObject *self)
13908/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13909{
13910 if (PyUnicode_READY(self) == -1)
13911 return NULL;
13912 if (PyUnicode_IS_ASCII(self))
13913 return ascii_upper_or_lower(self, 0);
13914 return case_operation(self, do_upper);
13915}
13916
13917/*[clinic input]
13918str.zfill as unicode_zfill
13919
13920 width: Py_ssize_t
13921 /
13922
13923Pad a numeric string with zeros on the left, to fill a field of the given width.
13924
13925The string is never truncated.
13926[clinic start generated code]*/
13927
13928static PyObject *
13929unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13930/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13931{
13932 Py_ssize_t fill;
13933 PyObject *u;
13934 int kind;
13935 const void *data;
13936 Py_UCS4 chr;
13937
13938 if (PyUnicode_READY(self) == -1)
13939 return NULL;
13940
13941 if (PyUnicode_GET_LENGTH(self) >= width)
13942 return unicode_result_unchanged(self);
13943
13944 fill = width - PyUnicode_GET_LENGTH(self);
13945
13946 u = pad(self, fill, 0, '0');
13947
13948 if (u == NULL)
13949 return NULL;
13950
13951 kind = PyUnicode_KIND(u);
13952 data = PyUnicode_DATA(u);
13953 chr = PyUnicode_READ(kind, data, fill);
13954
13955 if (chr == '+' || chr == '-') {
13956 /* move sign to beginning of string */
13957 PyUnicode_WRITE(kind, data, 0, chr);
13958 PyUnicode_WRITE(kind, data, fill, '0');
13959 }
13960
13961 assert(_PyUnicode_CheckConsistency(u, 1));
13962 return u;
13963}
13964
#if 0
/* Debugging-only method body; compiled out together with its (equally
   disabled) entry in the method table. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13972
13973PyDoc_STRVAR(startswith__doc__,
13974 "S.startswith(prefix[, start[, end]]) -> bool\n\
13975\n\
13976Return True if S starts with the specified prefix, False otherwise.\n\
13977With optional start, test S beginning at that position.\n\
13978With optional end, stop comparing S at that position.\n\
13979prefix can also be a tuple of strings to try.");
13980
13981static PyObject *
13982unicode_startswith(PyObject *self,
13983 PyObject *args)
13984{
13985 PyObject *subobj;
13986 PyObject *substring;
13987 Py_ssize_t start = 0;
13988 Py_ssize_t end = PY_SSIZE_T_MAX;
13989 int result;
13990
13991 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13992 return NULL;
13993 if (PyTuple_Check(subobj)) {
13994 Py_ssize_t i;
13995 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13996 substring = PyTuple_GET_ITEM(subobj, i);
13997 if (!PyUnicode_Check(substring)) {
13998 PyErr_Format(PyExc_TypeError,
13999 "tuple for startswith must only contain str, "
14000 "not %.100s",
14001 Py_TYPE(substring)->tp_name);
14002 return NULL;
14003 }
14004 result = tailmatch(self, substring, start, end, -1);
14005 if (result == -1)
14006 return NULL;
14007 if (result) {
14008 Py_RETURN_TRUE;
14009 }
14010 }
14011 /* nothing matched */
14012 Py_RETURN_FALSE;
14013 }
14014 if (!PyUnicode_Check(subobj)) {
14015 PyErr_Format(PyExc_TypeError,
14016 "startswith first arg must be str or "
14017 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14018 return NULL;
14019 }
14020 result = tailmatch(self, subobj, start, end, -1);
14021 if (result == -1)
14022 return NULL;
14023 return PyBool_FromLong(result);
14024}
14025
14026
14027PyDoc_STRVAR(endswith__doc__,
14028 "S.endswith(suffix[, start[, end]]) -> bool\n\
14029\n\
14030Return True if S ends with the specified suffix, False otherwise.\n\
14031With optional start, test S beginning at that position.\n\
14032With optional end, stop comparing S at that position.\n\
14033suffix can also be a tuple of strings to try.");
14034
14035static PyObject *
14036unicode_endswith(PyObject *self,
14037 PyObject *args)
14038{
14039 PyObject *subobj;
14040 PyObject *substring;
14041 Py_ssize_t start = 0;
14042 Py_ssize_t end = PY_SSIZE_T_MAX;
14043 int result;
14044
14045 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
14046 return NULL;
14047 if (PyTuple_Check(subobj)) {
14048 Py_ssize_t i;
14049 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
14050 substring = PyTuple_GET_ITEM(subobj, i);
14051 if (!PyUnicode_Check(substring)) {
14052 PyErr_Format(PyExc_TypeError,
14053 "tuple for endswith must only contain str, "
14054 "not %.100s",
14055 Py_TYPE(substring)->tp_name);
14056 return NULL;
14057 }
14058 result = tailmatch(self, substring, start, end, +1);
14059 if (result == -1)
14060 return NULL;
14061 if (result) {
14062 Py_RETURN_TRUE;
14063 }
14064 }
14065 Py_RETURN_FALSE;
14066 }
14067 if (!PyUnicode_Check(subobj)) {
14068 PyErr_Format(PyExc_TypeError,
14069 "endswith first arg must be str or "
14070 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14071 return NULL;
14072 }
14073 result = tailmatch(self, subobj, start, end, +1);
14074 if (result == -1)
14075 return NULL;
14076 return PyBool_FromLong(result);
14077}
14078
14079static inline void
14080_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
14081{
14082 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14083 writer->data = PyUnicode_DATA(writer->buffer);
14084
14085 if (!writer->readonly) {
14086 writer->kind = PyUnicode_KIND(writer->buffer);
14087 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
14088 }
14089 else {
14090 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14091 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14092 writer->kind = PyUnicode_WCHAR_KIND;
14093 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14094
14095 /* Copy-on-write mode: set buffer size to 0 so
14096 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14097 * next write. */
14098 writer->size = 0;
14099 }
14100}
14101
14102void
14103_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
14104{
14105 memset(writer, 0, sizeof(*writer));
14106
14107 /* ASCII is the bare minimum */
14108 writer->min_char = 127;
14109
14110 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14111 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14112 writer->kind = PyUnicode_WCHAR_KIND;
14113 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14114}
14115
14116// Initialize _PyUnicodeWriter with initial buffer
14117static inline void
14118_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14119{
14120 memset(writer, 0, sizeof(*writer));
14121 writer->buffer = buffer;
14122 _PyUnicodeWriter_Update(writer);
14123 writer->min_length = writer->size;
14124}
14125
14126int
14127_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14128 Py_ssize_t length, Py_UCS4 maxchar)
14129{
14130 Py_ssize_t newlen;
14131 PyObject *newbuffer;
14132
14133 assert(maxchar <= MAX_UNICODE);
14134
14135 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
14136 assert((maxchar > writer->maxchar && length >= 0)
14137 || length > 0);
14138
14139 if (length > PY_SSIZE_T_MAX - writer->pos) {
14140 PyErr_NoMemory();
14141 return -1;
14142 }
14143 newlen = writer->pos + length;
14144
14145 maxchar = Py_MAX(maxchar, writer->min_char);
14146
14147 if (writer->buffer == NULL) {
14148 assert(!writer->readonly);
14149 if (writer->overallocate
14150 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14151 /* overallocate to limit the number of realloc() */
14152 newlen += newlen / OVERALLOCATE_FACTOR;
14153 }
14154 if (newlen < writer->min_length)
14155 newlen = writer->min_length;
14156
14157 writer->buffer = PyUnicode_New(newlen, maxchar);
14158 if (writer->buffer == NULL)
14159 return -1;
14160 }
14161 else if (newlen > writer->size) {
14162 if (writer->overallocate
14163 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14164 /* overallocate to limit the number of realloc() */
14165 newlen += newlen / OVERALLOCATE_FACTOR;
14166 }
14167 if (newlen < writer->min_length)
14168 newlen = writer->min_length;
14169
14170 if (maxchar > writer->maxchar || writer->readonly) {
14171 /* resize + widen */
14172 maxchar = Py_MAX(maxchar, writer->maxchar);
14173 newbuffer = PyUnicode_New(newlen, maxchar);
14174 if (newbuffer == NULL)
14175 return -1;
14176 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14177 writer->buffer, 0, writer->pos);
14178 Py_DECREF(writer->buffer);
14179 writer->readonly = 0;
14180 }
14181 else {
14182 newbuffer = resize_compact(writer->buffer, newlen);
14183 if (newbuffer == NULL)
14184 return -1;
14185 }
14186 writer->buffer = newbuffer;
14187 }
14188 else if (maxchar > writer->maxchar) {
14189 assert(!writer->readonly);
14190 newbuffer = PyUnicode_New(writer->size, maxchar);
14191 if (newbuffer == NULL)
14192 return -1;
14193 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14194 writer->buffer, 0, writer->pos);
14195 Py_SETREF(writer->buffer, newbuffer);
14196 }
14197 _PyUnicodeWriter_Update(writer);
14198 return 0;
14199
14200#undef OVERALLOCATE_FACTOR
14201}
14202
14203int
14204_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14205 enum PyUnicode_Kind kind)
14206{
14207 Py_UCS4 maxchar;
14208
14209 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14210 assert(writer->kind < kind);
14211
14212 switch (kind)
14213 {
14214 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14215 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14216 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
14217 default:
14218 Py_UNREACHABLE();
14219 }
14220
14221 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14222}
14223
14224static inline int
14225_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
14226{
14227 assert(ch <= MAX_UNICODE);
14228 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14229 return -1;
14230 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14231 writer->pos++;
14232 return 0;
14233}
14234
14235int
14236_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14237{
14238 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14239}
14240
14241int
14242_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14243{
14244 Py_UCS4 maxchar;
14245 Py_ssize_t len;
14246
14247 if (PyUnicode_READY(str) == -1)
14248 return -1;
14249 len = PyUnicode_GET_LENGTH(str);
14250 if (len == 0)
14251 return 0;
14252 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14253 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
14254 if (writer->buffer == NULL && !writer->overallocate) {
14255 assert(_PyUnicode_CheckConsistency(str, 1));
14256 writer->readonly = 1;
14257 Py_INCREF(str);
14258 writer->buffer = str;
14259 _PyUnicodeWriter_Update(writer);
14260 writer->pos += len;
14261 return 0;
14262 }
14263 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14264 return -1;
14265 }
14266 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14267 str, 0, len);
14268 writer->pos += len;
14269 return 0;
14270}
14271
14272int
14273_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14274 Py_ssize_t start, Py_ssize_t end)
14275{
14276 Py_UCS4 maxchar;
14277 Py_ssize_t len;
14278
14279 if (PyUnicode_READY(str) == -1)
14280 return -1;
14281
14282 assert(0 <= start);
14283 assert(end <= PyUnicode_GET_LENGTH(str));
14284 assert(start <= end);
14285
14286 if (end == 0)
14287 return 0;
14288
14289 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14290 return _PyUnicodeWriter_WriteStr(writer, str);
14291
14292 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14293 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14294 else
14295 maxchar = writer->maxchar;
14296 len = end - start;
14297
14298 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14299 return -1;
14300
14301 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14302 str, start, len);
14303 writer->pos += len;
14304 return 0;
14305}
14306
14307int
14308_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14309 const char *ascii, Py_ssize_t len)
14310{
14311 if (len == -1)
14312 len = strlen(ascii);
14313
14314 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14315
14316 if (writer->buffer == NULL && !writer->overallocate) {
14317 PyObject *str;
14318
14319 str = _PyUnicode_FromASCII(ascii, len);
14320 if (str == NULL)
14321 return -1;
14322
14323 writer->readonly = 1;
14324 writer->buffer = str;
14325 _PyUnicodeWriter_Update(writer);
14326 writer->pos += len;
14327 return 0;
14328 }
14329
14330 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14331 return -1;
14332
14333 switch (writer->kind)
14334 {
14335 case PyUnicode_1BYTE_KIND:
14336 {
14337 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14338 Py_UCS1 *data = writer->data;
14339
14340 memcpy(data + writer->pos, str, len);
14341 break;
14342 }
14343 case PyUnicode_2BYTE_KIND:
14344 {
14345 _PyUnicode_CONVERT_BYTES(
14346 Py_UCS1, Py_UCS2,
14347 ascii, ascii + len,
14348 (Py_UCS2 *)writer->data + writer->pos);
14349 break;
14350 }
14351 case PyUnicode_4BYTE_KIND:
14352 {
14353 _PyUnicode_CONVERT_BYTES(
14354 Py_UCS1, Py_UCS4,
14355 ascii, ascii + len,
14356 (Py_UCS4 *)writer->data + writer->pos);
14357 break;
14358 }
14359 default:
14360 Py_UNREACHABLE();
14361 }
14362
14363 writer->pos += len;
14364 return 0;
14365}
14366
14367int
14368_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14369 const char *str, Py_ssize_t len)
14370{
14371 Py_UCS4 maxchar;
14372
14373 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14374 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14375 return -1;
14376 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14377 writer->pos += len;
14378 return 0;
14379}
14380
14381PyObject *
14382_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14383{
14384 PyObject *str;
14385
14386 if (writer->pos == 0) {
14387 Py_CLEAR(writer->buffer);
14388 _Py_RETURN_UNICODE_EMPTY();
14389 }
14390
14391 str = writer->buffer;
14392 writer->buffer = NULL;
14393
14394 if (writer->readonly) {
14395 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14396 return str;
14397 }
14398
14399 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14400 PyObject *str2;
14401 str2 = resize_compact(str, writer->pos);
14402 if (str2 == NULL) {
14403 Py_DECREF(str);
14404 return NULL;
14405 }
14406 str = str2;
14407 }
14408
14409 assert(_PyUnicode_CheckConsistency(str, 1));
14410 return unicode_result_ready(str);
14411}
14412
14413void
14414_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14415{
14416 Py_CLEAR(writer->buffer);
14417}
14418
14419#include "stringlib/unicode_format.h"
14420
14421PyDoc_STRVAR(format__doc__,
14422 "S.format(*args, **kwargs) -> str\n\
14423\n\
14424Return a formatted version of S, using substitutions from args and kwargs.\n\
14425The substitutions are identified by braces ('{' and '}').");
14426
14427PyDoc_STRVAR(format_map__doc__,
14428 "S.format_map(mapping) -> str\n\
14429\n\
14430Return a formatted version of S, using substitutions from mapping.\n\
14431The substitutions are identified by braces ('{' and '}').");
14432
14433/*[clinic input]
14434str.__format__ as unicode___format__
14435
14436 format_spec: unicode
14437 /
14438
14439Return a formatted version of the string as described by format_spec.
14440[clinic start generated code]*/
14441
14442static PyObject *
14443unicode___format___impl(PyObject *self, PyObject *format_spec)
14444/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14445{
14446 _PyUnicodeWriter writer;
14447 int ret;
14448
14449 if (PyUnicode_READY(self) == -1)
14450 return NULL;
14451 _PyUnicodeWriter_Init(&writer);
14452 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14453 self, format_spec, 0,
14454 PyUnicode_GET_LENGTH(format_spec));
14455 if (ret == -1) {
14456 _PyUnicodeWriter_Dealloc(&writer);
14457 return NULL;
14458 }
14459 return _PyUnicodeWriter_Finish(&writer);
14460}
14461
14462/*[clinic input]
14463str.__sizeof__ as unicode_sizeof
14464
14465Return the size of the string in memory, in bytes.
14466[clinic start generated code]*/
14467
14468static PyObject *
14469unicode_sizeof_impl(PyObject *self)
14470/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14471{
14472 Py_ssize_t size;
14473
14474 /* If it's a compact object, account for base structure +
14475 character data. */
14476 if (PyUnicode_IS_COMPACT_ASCII(self))
14477 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14478 else if (PyUnicode_IS_COMPACT(self))
14479 size = sizeof(PyCompactUnicodeObject) +
14480 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14481 else {
14482 /* If it is a two-block object, account for base object, and
14483 for character block if present. */
14484 size = sizeof(PyUnicodeObject);
14485 if (_PyUnicode_DATA_ANY(self))
14486 size += (PyUnicode_GET_LENGTH(self) + 1) *
14487 PyUnicode_KIND(self);
14488 }
14489 /* If the wstr pointer is present, account for it unless it is shared
14490 with the data pointer. Check if the data is not shared. */
14491 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14492 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14493 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14494 size += PyUnicode_UTF8_LENGTH(self) + 1;
14495
14496 return PyLong_FromSsize_t(size);
14497}
14498
14499static PyObject *
14500unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14501{
14502 PyObject *copy = _PyUnicode_Copy(v);
14503 if (!copy)
14504 return NULL;
14505 return Py_BuildValue("(N)", copy);
14506}
14507
14508static PyMethodDef unicode_methods[] = {
14509 UNICODE_ENCODE_METHODDEF
14510 UNICODE_REPLACE_METHODDEF
14511 UNICODE_SPLIT_METHODDEF
14512 UNICODE_RSPLIT_METHODDEF
14513 UNICODE_JOIN_METHODDEF
14514 UNICODE_CAPITALIZE_METHODDEF
14515 UNICODE_CASEFOLD_METHODDEF
14516 UNICODE_TITLE_METHODDEF
14517 UNICODE_CENTER_METHODDEF
14518 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14519 UNICODE_EXPANDTABS_METHODDEF
14520 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14521 UNICODE_PARTITION_METHODDEF
14522 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14523 UNICODE_LJUST_METHODDEF
14524 UNICODE_LOWER_METHODDEF
14525 UNICODE_LSTRIP_METHODDEF
14526 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14527 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14528 UNICODE_RJUST_METHODDEF
14529 UNICODE_RSTRIP_METHODDEF
14530 UNICODE_RPARTITION_METHODDEF
14531 UNICODE_SPLITLINES_METHODDEF
14532 UNICODE_STRIP_METHODDEF
14533 UNICODE_SWAPCASE_METHODDEF
14534 UNICODE_TRANSLATE_METHODDEF
14535 UNICODE_UPPER_METHODDEF
14536 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14537 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14538 UNICODE_REMOVEPREFIX_METHODDEF
14539 UNICODE_REMOVESUFFIX_METHODDEF
14540 UNICODE_ISASCII_METHODDEF
14541 UNICODE_ISLOWER_METHODDEF
14542 UNICODE_ISUPPER_METHODDEF
14543 UNICODE_ISTITLE_METHODDEF
14544 UNICODE_ISSPACE_METHODDEF
14545 UNICODE_ISDECIMAL_METHODDEF
14546 UNICODE_ISDIGIT_METHODDEF
14547 UNICODE_ISNUMERIC_METHODDEF
14548 UNICODE_ISALPHA_METHODDEF
14549 UNICODE_ISALNUM_METHODDEF
14550 UNICODE_ISIDENTIFIER_METHODDEF
14551 UNICODE_ISPRINTABLE_METHODDEF
14552 UNICODE_ZFILL_METHODDEF
14553 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
14554 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14555 UNICODE___FORMAT___METHODDEF
14556 UNICODE_MAKETRANS_METHODDEF
14557 UNICODE_SIZEOF_METHODDEF
14558#if 0
14559 /* These methods are just used for debugging the implementation. */
14560 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
14561#endif
14562
14563 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
14564 {NULL, NULL}
14565};
14566
14567static PyObject *
14568unicode_mod(PyObject *v, PyObject *w)
14569{
14570 if (!PyUnicode_Check(v))
14571 Py_RETURN_NOTIMPLEMENTED;
14572 return PyUnicode_Format(v, w);
14573}
14574
14575static PyNumberMethods unicode_as_number = {
14576 0, /*nb_add*/
14577 0, /*nb_subtract*/
14578 0, /*nb_multiply*/
14579 unicode_mod, /*nb_remainder*/
14580};
14581
14582static PySequenceMethods unicode_as_sequence = {
14583 (lenfunc) unicode_length, /* sq_length */
14584 PyUnicode_Concat, /* sq_concat */
14585 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14586 (ssizeargfunc) unicode_getitem, /* sq_item */
14587 0, /* sq_slice */
14588 0, /* sq_ass_item */
14589 0, /* sq_ass_slice */
14590 PyUnicode_Contains, /* sq_contains */
14591};
14592
14593static PyObject*
14594unicode_subscript(PyObject* self, PyObject* item)
14595{
14596 if (PyUnicode_READY(self) == -1)
14597 return NULL;
14598
14599 if (_PyIndex_Check(item)) {
14600 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14601 if (i == -1 && PyErr_Occurred())
14602 return NULL;
14603 if (i < 0)
14604 i += PyUnicode_GET_LENGTH(self);
14605 return unicode_getitem(self, i);
14606 } else if (PySlice_Check(item)) {
14607 Py_ssize_t start, stop, step, slicelength, i;
14608 size_t cur;
14609 PyObject *result;
14610 const void *src_data;
14611 void *dest_data;
14612 int src_kind, dest_kind;
14613 Py_UCS4 ch, max_char, kind_limit;
14614
14615 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14616 return NULL;
14617 }
14618 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14619 &start, &stop, step);
14620
14621 if (slicelength <= 0) {
14622 _Py_RETURN_UNICODE_EMPTY();
14623 } else if (start == 0 && step == 1 &&
14624 slicelength == PyUnicode_GET_LENGTH(self)) {
14625 return unicode_result_unchanged(self);
14626 } else if (step == 1) {
14627 return PyUnicode_Substring(self,
14628 start, start + slicelength);
14629 }
14630 /* General case */
14631 src_kind = PyUnicode_KIND(self);
14632 src_data = PyUnicode_DATA(self);
14633 if (!PyUnicode_IS_ASCII(self)) {
14634 kind_limit = kind_maxchar_limit(src_kind);
14635 max_char = 0;
14636 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14637 ch = PyUnicode_READ(src_kind, src_data, cur);
14638 if (ch > max_char) {
14639 max_char = ch;
14640 if (max_char >= kind_limit)
14641 break;
14642 }
14643 }
14644 }
14645 else
14646 max_char = 127;
14647 result = PyUnicode_New(slicelength, max_char);
14648 if (result == NULL)
14649 return NULL;
14650 dest_kind = PyUnicode_KIND(result);
14651 dest_data = PyUnicode_DATA(result);
14652
14653 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14654 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14655 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14656 }
14657 assert(_PyUnicode_CheckConsistency(result, 1));
14658 return result;
14659 } else {
14660 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14661 return NULL;
14662 }
14663}
14664
14665static PyMappingMethods unicode_as_mapping = {
14666 (lenfunc)unicode_length, /* mp_length */
14667 (binaryfunc)unicode_subscript, /* mp_subscript */
14668 (objobjargproc)0, /* mp_ass_subscript */
14669};
14670
14671
14672/* Helpers for PyUnicode_Format() */
14673
14674struct unicode_formatter_t {
14675 PyObject *args;
14676 int args_owned;
14677 Py_ssize_t arglen, argidx;
14678 PyObject *dict;
14679
14680 enum PyUnicode_Kind fmtkind;
14681 Py_ssize_t fmtcnt, fmtpos;
14682 const void *fmtdata;
14683 PyObject *fmtstr;
14684
14685 _PyUnicodeWriter writer;
14686};
14687
14688struct unicode_format_arg_t {
14689 Py_UCS4 ch;
14690 int flags;
14691 Py_ssize_t width;
14692 int prec;
14693 int sign;
14694};
14695
14696static PyObject *
14697unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14698{
14699 Py_ssize_t argidx = ctx->argidx;
14700
14701 if (argidx < ctx->arglen) {
14702 ctx->argidx++;
14703 if (ctx->arglen < 0)
14704 return ctx->args;
14705 else
14706 return PyTuple_GetItem(ctx->args, argidx);
14707 }
14708 PyErr_SetString(PyExc_TypeError,
14709 "not enough arguments for format string");
14710 return NULL;
14711}
14712
14713/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14714
14715/* Format a float into the writer if the writer is not NULL, or into *p_output
14716 otherwise.
14717
14718 Return 0 on success, raise an exception and return -1 on error. */
14719static int
14720formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14721 PyObject **p_output,
14722 _PyUnicodeWriter *writer)
14723{
14724 char *p;
14725 double x;
14726 Py_ssize_t len;
14727 int prec;
14728 int dtoa_flags;
14729
14730 x = PyFloat_AsDouble(v);
14731 if (x == -1.0 && PyErr_Occurred())
14732 return -1;
14733
14734 prec = arg->prec;
14735 if (prec < 0)
14736 prec = 6;
14737
14738 if (arg->flags & F_ALT)
14739 dtoa_flags = Py_DTSF_ALT;
14740 else
14741 dtoa_flags = 0;
14742 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14743 if (p == NULL)
14744 return -1;
14745 len = strlen(p);
14746 if (writer) {
14747 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14748 PyMem_Free(p);
14749 return -1;
14750 }
14751 }
14752 else
14753 *p_output = _PyUnicode_FromASCII(p, len);
14754 PyMem_Free(p);
14755 return 0;
14756}
14757
14758/* formatlong() emulates the format codes d, u, o, x and X, and
14759 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14760 * Python's regular ints.
14761 * Return value: a new PyUnicodeObject*, or NULL if error.
14762 * The output string is of the form
14763 * "-"? ("0x" | "0X")? digit+
14764 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14765 * set in flags. The case of hex digits will be correct,
14766 * There will be at least prec digits, zero-filled on the left if
14767 * necessary to get that many.
14768 * val object to be converted
14769 * flags bitmask of format flags; only F_ALT is looked at
14770 * prec minimum number of digits; 0-fill on left if needed
14771 * type a character in [duoxX]; u acts the same as d
14772 *
14773 * CAUTION: o, x and X conversions on regular ints can never
14774 * produce a '-' sign, but can for Python's unbounded ints.
14775 */
14776PyObject *
14777_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14778{
14779 PyObject *result = NULL;
14780 char *buf;
14781 Py_ssize_t i;
14782 int sign; /* 1 if '-', else 0 */
14783 int len; /* number of characters */
14784 Py_ssize_t llen;
14785 int numdigits; /* len == numnondigits + numdigits */
14786 int numnondigits = 0;
14787
14788 /* Avoid exceeding SSIZE_T_MAX */
14789 if (prec > INT_MAX-3) {
14790 PyErr_SetString(PyExc_OverflowError,
14791 "precision too large");
14792 return NULL;
14793 }
14794
14795 assert(PyLong_Check(val));
14796
14797 switch (type) {
14798 default:
14799 Py_UNREACHABLE();
14800 case 'd':
14801 case 'i':
14802 case 'u':
14803 /* int and int subclasses should print numerically when a numeric */
14804 /* format code is used (see issue18780) */
14805 result = PyNumber_ToBase(val, 10);
14806 break;
14807 case 'o':
14808 numnondigits = 2;
14809 result = PyNumber_ToBase(val, 8);
14810 break;
14811 case 'x':
14812 case 'X':
14813 numnondigits = 2;
14814 result = PyNumber_ToBase(val, 16);
14815 break;
14816 }
14817 if (!result)
14818 return NULL;
14819
14820 assert(unicode_modifiable(result));
14821 assert(PyUnicode_IS_READY(result));
14822 assert(PyUnicode_IS_ASCII(result));
14823
14824 /* To modify the string in-place, there can only be one reference. */
14825 if (Py_REFCNT(result) != 1) {
14826 Py_DECREF(result);
14827 PyErr_BadInternalCall();
14828 return NULL;
14829 }
14830 buf = PyUnicode_DATA(result);
14831 llen = PyUnicode_GET_LENGTH(result);
14832 if (llen > INT_MAX) {
14833 Py_DECREF(result);
14834 PyErr_SetString(PyExc_ValueError,
14835 "string too large in _PyUnicode_FormatLong");
14836 return NULL;
14837 }
14838 len = (int)llen;
14839 sign = buf[0] == '-';
14840 numnondigits += sign;
14841 numdigits = len - numnondigits;
14842 assert(numdigits > 0);
14843
14844 /* Get rid of base marker unless F_ALT */
14845 if (((alt) == 0 &&
14846 (type == 'o' || type == 'x' || type == 'X'))) {
14847 assert(buf[sign] == '0');
14848 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14849 buf[sign+1] == 'o');
14850 numnondigits -= 2;
14851 buf += 2;
14852 len -= 2;
14853 if (sign)
14854 buf[0] = '-';
14855 assert(len == numnondigits + numdigits);
14856 assert(numdigits > 0);
14857 }
14858
14859 /* Fill with leading zeroes to meet minimum width. */
14860 if (prec > numdigits) {
14861 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14862 numnondigits + prec);
14863 char *b1;
14864 if (!r1) {
14865 Py_DECREF(result);
14866 return NULL;
14867 }
14868 b1 = PyBytes_AS_STRING(r1);
14869 for (i = 0; i < numnondigits; ++i)
14870 *b1++ = *buf++;
14871 for (i = 0; i < prec - numdigits; i++)
14872 *b1++ = '0';
14873 for (i = 0; i < numdigits; i++)
14874 *b1++ = *buf++;
14875 *b1 = '\0';
14876 Py_DECREF(result);
14877 result = r1;
14878 buf = PyBytes_AS_STRING(result);
14879 len = numnondigits + prec;
14880 }
14881
14882 /* Fix up case for hex conversions. */
14883 if (type == 'X') {
14884 /* Need to convert all lower case letters to upper case.
14885 and need to convert 0x to 0X (and -0x to -0X). */
14886 for (i = 0; i < len; i++)
14887 if (buf[i] >= 'a' && buf[i] <= 'x')
14888 buf[i] -= 'a'-'A';
14889 }
14890 if (!PyUnicode_Check(result)
14891 || buf != PyUnicode_DATA(result)) {
14892 PyObject *unicode;
14893 unicode = _PyUnicode_FromASCII(buf, len);
14894 Py_DECREF(result);
14895 result = unicode;
14896 }
14897 else if (len != PyUnicode_GET_LENGTH(result)) {
14898 if (PyUnicode_Resize(&result, len) < 0)
14899 Py_CLEAR(result);
14900 }
14901 return result;
14902}
14903
14904/* Format an integer or a float as an integer.
14905 * Return 1 if the number has been formatted into the writer,
14906 * 0 if the number has been formatted into *p_output
14907 * -1 and raise an exception on error */
14908static int
14909mainformatlong(PyObject *v,
14910 struct unicode_format_arg_t *arg,
14911 PyObject **p_output,
14912 _PyUnicodeWriter *writer)
14913{
14914 PyObject *iobj, *res;
14915 char type = (char)arg->ch;
14916
14917 if (!PyNumber_Check(v))
14918 goto wrongtype;
14919
14920 /* make sure number is a type of integer for o, x, and X */
14921 if (!PyLong_Check(v)) {
14922 if (type == 'o' || type == 'x' || type == 'X') {
14923 iobj = _PyNumber_Index(v);
14924 }
14925 else {
14926 iobj = PyNumber_Long(v);
14927 }
14928 if (iobj == NULL ) {
14929 if (PyErr_ExceptionMatches(PyExc_TypeError))
14930 goto wrongtype;
14931 return -1;
14932 }
14933 assert(PyLong_Check(iobj));
14934 }
14935 else {
14936 iobj = v;
14937 Py_INCREF(iobj);
14938 }
14939
14940 if (PyLong_CheckExact(v)
14941 && arg->width == -1 && arg->prec == -1
14942 && !(arg->flags & (F_SIGN | F_BLANK))
14943 && type != 'X')
14944 {
14945 /* Fast path */
14946 int alternate = arg->flags & F_ALT;
14947 int base;
14948
14949 switch(type)
14950 {
14951 default:
14952 Py_UNREACHABLE();
14953 case 'd':
14954 case 'i':
14955 case 'u':
14956 base = 10;
14957 break;
14958 case 'o':
14959 base = 8;
14960 break;
14961 case 'x':
14962 case 'X':
14963 base = 16;
14964 break;
14965 }
14966
14967 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14968 Py_DECREF(iobj);
14969 return -1;
14970 }
14971 Py_DECREF(iobj);
14972 return 1;
14973 }
14974
14975 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14976 Py_DECREF(iobj);
14977 if (res == NULL)
14978 return -1;
14979 *p_output = res;
14980 return 0;
14981
14982wrongtype:
14983 switch(type)
14984 {
14985 case 'o':
14986 case 'x':
14987 case 'X':
14988 PyErr_Format(PyExc_TypeError,
14989 "%%%c format: an integer is required, "
14990 "not %.200s",
14991 type, Py_TYPE(v)->tp_name);
14992 break;
14993 default:
14994 PyErr_Format(PyExc_TypeError,
14995 "%%%c format: a real number is required, "
14996 "not %.200s",
14997 type, Py_TYPE(v)->tp_name);
14998 break;
14999 }
15000 return -1;
15001}
15002
15003static Py_UCS4
15004formatchar(PyObject *v)
15005{
15006 /* presume that the buffer is at least 3 characters long */
15007 if (PyUnicode_Check(v)) {
15008 if (PyUnicode_GET_LENGTH(v) == 1) {
15009 return PyUnicode_READ_CHAR(v, 0);
15010 }
15011 goto onError;
15012 }
15013 else {
15014 int overflow;
15015 long x = PyLong_AsLongAndOverflow(v, &overflow);
15016 if (x == -1 && PyErr_Occurred()) {
15017 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
15018 goto onError;
15019 }
15020 return (Py_UCS4) -1;
15021 }
15022
15023 if (x < 0 || x > MAX_UNICODE) {
15024 /* this includes an overflow in converting to C long */
15025 PyErr_SetString(PyExc_OverflowError,
15026 "%c arg not in range(0x110000)");
15027 return (Py_UCS4) -1;
15028 }
15029
15030 return (Py_UCS4) x;
15031 }
15032
15033 onError:
15034 PyErr_SetString(PyExc_TypeError,
15035 "%c requires int or char");
15036 return (Py_UCS4) -1;
15037}
15038
15039/* Parse options of an argument: flags, width, precision.
15040 Handle also "%(name)" syntax.
15041
15042 Return 0 if the argument has been formatted into arg->str.
15043 Return 1 if the argument has been written into ctx->writer,
15044 Raise an exception and return -1 on error. */
15045static int
15046unicode_format_arg_parse(struct unicode_formatter_t *ctx,
15047 struct unicode_format_arg_t *arg)
15048{
15049#define FORMAT_READ(ctx) \
15050 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
15051
15052 PyObject *v;
15053
15054 if (arg->ch == '(') {
15055 /* Get argument value from a dictionary. Example: "%(name)s". */
15056 Py_ssize_t keystart;
15057 Py_ssize_t keylen;
15058 PyObject *key;
15059 int pcount = 1;
15060
15061 if (ctx->dict == NULL) {
15062 PyErr_SetString(PyExc_TypeError,
15063 "format requires a mapping");
15064 return -1;
15065 }
15066 ++ctx->fmtpos;
15067 --ctx->fmtcnt;
15068 keystart = ctx->fmtpos;
15069 /* Skip over balanced parentheses */
15070 while (pcount > 0 && --ctx->fmtcnt >= 0) {
15071 arg->ch = FORMAT_READ(ctx);
15072 if (arg->ch == ')')
15073 --pcount;
15074 else if (arg->ch == '(')
15075 ++pcount;
15076 ctx->fmtpos++;
15077 }
15078 keylen = ctx->fmtpos - keystart - 1;
15079 if (ctx->fmtcnt < 0 || pcount > 0) {
15080 PyErr_SetString(PyExc_ValueError,
15081 "incomplete format key");
15082 return -1;
15083 }
15084 key = PyUnicode_Substring(ctx->fmtstr,
15085 keystart, keystart + keylen);
15086 if (key == NULL)
15087 return -1;
15088 if (ctx->args_owned) {
15089 ctx->args_owned = 0;
15090 Py_DECREF(ctx->args);
15091 }
15092 ctx->args = PyObject_GetItem(ctx->dict, key);
15093 Py_DECREF(key);
15094 if (ctx->args == NULL)
15095 return -1;
15096 ctx->args_owned = 1;
15097 ctx->arglen = -1;
15098 ctx->argidx = -2;
15099 }
15100
15101 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
15102 while (--ctx->fmtcnt >= 0) {
15103 arg->ch = FORMAT_READ(ctx);
15104 ctx->fmtpos++;
15105 switch (arg->ch) {
15106 case '-': arg->flags |= F_LJUST; continue;
15107 case '+': arg->flags |= F_SIGN; continue;
15108 case ' ': arg->flags |= F_BLANK; continue;
15109 case '#': arg->flags |= F_ALT; continue;
15110 case '0': arg->flags |= F_ZERO; continue;
15111 }
15112 break;
15113 }
15114
15115 /* Parse width. Example: "%10s" => width=10 */
15116 if (arg->ch == '*') {
15117 v = unicode_format_getnextarg(ctx);
15118 if (v == NULL)
15119 return -1;
15120 if (!PyLong_Check(v)) {
15121 PyErr_SetString(PyExc_TypeError,
15122 "* wants int");
15123 return -1;
15124 }
15125 arg->width = PyLong_AsSsize_t(v);
15126 if (arg->width == -1 && PyErr_Occurred())
15127 return -1;
15128 if (arg->width < 0) {
15129 arg->flags |= F_LJUST;
15130 arg->width = -arg->width;
15131 }
15132 if (--ctx->fmtcnt >= 0) {
15133 arg->ch = FORMAT_READ(ctx);
15134 ctx->fmtpos++;
15135 }
15136 }
15137 else if (arg->ch >= '0' && arg->ch <= '9') {
15138 arg->width = arg->ch - '0';
15139 while (--ctx->fmtcnt >= 0) {
15140 arg->ch = FORMAT_READ(ctx);
15141 ctx->fmtpos++;
15142 if (arg->ch < '0' || arg->ch > '9')
15143 break;
15144 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15145 mixing signed and unsigned comparison. Since arg->ch is between
15146 '0' and '9', casting to int is safe. */
15147 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15148 PyErr_SetString(PyExc_ValueError,
15149 "width too big");
15150 return -1;
15151 }
15152 arg->width = arg->width*10 + (arg->ch - '0');
15153 }
15154 }
15155
15156 /* Parse precision. Example: "%.3f" => prec=3 */
15157 if (arg->ch == '.') {
15158 arg->prec = 0;
15159 if (--ctx->fmtcnt >= 0) {
15160 arg->ch = FORMAT_READ(ctx);
15161 ctx->fmtpos++;
15162 }
15163 if (arg->ch == '*') {
15164 v = unicode_format_getnextarg(ctx);
15165 if (v == NULL)
15166 return -1;
15167 if (!PyLong_Check(v)) {
15168 PyErr_SetString(PyExc_TypeError,
15169 "* wants int");
15170 return -1;
15171 }
15172 arg->prec = _PyLong_AsInt(v);
15173 if (arg->prec == -1 && PyErr_Occurred())
15174 return -1;
15175 if (arg->prec < 0)
15176 arg->prec = 0;
15177 if (--ctx->fmtcnt >= 0) {
15178 arg->ch = FORMAT_READ(ctx);
15179 ctx->fmtpos++;
15180 }
15181 }
15182 else if (arg->ch >= '0' && arg->ch <= '9') {
15183 arg->prec = arg->ch - '0';
15184 while (--ctx->fmtcnt >= 0) {
15185 arg->ch = FORMAT_READ(ctx);
15186 ctx->fmtpos++;
15187 if (arg->ch < '0' || arg->ch > '9')
15188 break;
15189 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15190 PyErr_SetString(PyExc_ValueError,
15191 "precision too big");
15192 return -1;
15193 }
15194 arg->prec = arg->prec*10 + (arg->ch - '0');
15195 }
15196 }
15197 }
15198
15199 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15200 if (ctx->fmtcnt >= 0) {
15201 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15202 if (--ctx->fmtcnt >= 0) {
15203 arg->ch = FORMAT_READ(ctx);
15204 ctx->fmtpos++;
15205 }
15206 }
15207 }
15208 if (ctx->fmtcnt < 0) {
15209 PyErr_SetString(PyExc_ValueError,
15210 "incomplete format");
15211 return -1;
15212 }
15213 return 0;
15214
15215#undef FORMAT_READ
15216}
15217
15218/* Format one argument. Supported conversion specifiers:
15219
15220 - "s", "r", "a": any type
15221 - "i", "d", "u": int or float
15222 - "o", "x", "X": int
15223 - "e", "E", "f", "F", "g", "G": float
15224 - "c": int or str (1 character)
15225
15226 When possible, the output is written directly into the Unicode writer
15227 (ctx->writer). A string is created when padding is required.
15228
15229 Return 0 if the argument has been formatted into *p_str,
15230 1 if the argument has been written into ctx->writer,
15231 -1 on error. */
15232static int
15233unicode_format_arg_format(struct unicode_formatter_t *ctx,
15234 struct unicode_format_arg_t *arg,
15235 PyObject **p_str)
15236{
15237 PyObject *v;
15238 _PyUnicodeWriter *writer = &ctx->writer;
15239
15240 if (ctx->fmtcnt == 0)
15241 ctx->writer.overallocate = 0;
15242
15243 v = unicode_format_getnextarg(ctx);
15244 if (v == NULL)
15245 return -1;
15246
15247
15248 switch (arg->ch) {
15249 case 's':
15250 case 'r':
15251 case 'a':
15252 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15253 /* Fast path */
15254 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15255 return -1;
15256 return 1;
15257 }
15258
15259 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15260 *p_str = v;
15261 Py_INCREF(*p_str);
15262 }
15263 else {
15264 if (arg->ch == 's')
15265 *p_str = PyObject_Str(v);
15266 else if (arg->ch == 'r')
15267 *p_str = PyObject_Repr(v);
15268 else
15269 *p_str = PyObject_ASCII(v);
15270 }
15271 break;
15272
15273 case 'i':
15274 case 'd':
15275 case 'u':
15276 case 'o':
15277 case 'x':
15278 case 'X':
15279 {
15280 int ret = mainformatlong(v, arg, p_str, writer);
15281 if (ret != 0)
15282 return ret;
15283 arg->sign = 1;
15284 break;
15285 }
15286
15287 case 'e':
15288 case 'E':
15289 case 'f':
15290 case 'F':
15291 case 'g':
15292 case 'G':
15293 if (arg->width == -1 && arg->prec == -1
15294 && !(arg->flags & (F_SIGN | F_BLANK)))
15295 {
15296 /* Fast path */
15297 if (formatfloat(v, arg, NULL, writer) == -1)
15298 return -1;
15299 return 1;
15300 }
15301
15302 arg->sign = 1;
15303 if (formatfloat(v, arg, p_str, NULL) == -1)
15304 return -1;
15305 break;
15306
15307 case 'c':
15308 {
15309 Py_UCS4 ch = formatchar(v);
15310 if (ch == (Py_UCS4) -1)
15311 return -1;
15312 if (arg->width == -1 && arg->prec == -1) {
15313 /* Fast path */
15314 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15315 return -1;
15316 return 1;
15317 }
15318 *p_str = PyUnicode_FromOrdinal(ch);
15319 break;
15320 }
15321
15322 default:
15323 PyErr_Format(PyExc_ValueError,
15324 "unsupported format character '%c' (0x%x) "
15325 "at index %zd",
15326 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15327 (int)arg->ch,
15328 ctx->fmtpos - 1);
15329 return -1;
15330 }
15331 if (*p_str == NULL)
15332 return -1;
15333 assert (PyUnicode_Check(*p_str));
15334 return 0;
15335}
15336
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and padding options stored in arg.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error (failure of PyUnicode_READY(),
   _PyUnicodeWriter_WriteStr() or _PyUnicodeWriter_Prepare()). */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set together with the
       "0" flag; otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        /* No padding, no truncation and no sign to add: copy str as is. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: exclude it from the
               characters copied from str. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    /* The fill character only matters for maxchar when left padding will
       actually be written. */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Reserve one extra character for the sign when the field is exactly
       as wide as the characters copied from str. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           spaces it is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           digits, whether they were written above or are written later. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left to pad on the right. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* Right padding always uses spaces, whatever the fill character is. */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15491
15492/* Helper of PyUnicode_Format(): format one arg.
15493 Return 0 on success, raise an exception and return -1 on error. */
15494static int
15495unicode_format_arg(struct unicode_formatter_t *ctx)
15496{
15497 struct unicode_format_arg_t arg;
15498 PyObject *str;
15499 int ret;
15500
15501 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15502 if (arg.ch == '%') {
15503 ctx->fmtpos++;
15504 ctx->fmtcnt--;
15505 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15506 return -1;
15507 return 0;
15508 }
15509 arg.flags = 0;
15510 arg.width = -1;
15511 arg.prec = -1;
15512 arg.sign = 0;
15513 str = NULL;
15514
15515 ret = unicode_format_arg_parse(ctx, &arg);
15516 if (ret == -1)
15517 return -1;
15518
15519 ret = unicode_format_arg_format(ctx, &arg, &str);
15520 if (ret == -1)
15521 return -1;
15522
15523 if (ret != 1) {
15524 ret = unicode_format_arg_output(ctx, &arg, str);
15525 Py_DECREF(str);
15526 if (ret == -1)
15527 return -1;
15528 }
15529
15530 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15531 PyErr_SetString(PyExc_TypeError,
15532 "not all arguments converted during string formatting");
15533 return -1;
15534 }
15535 return 0;
15536}
15537
15538PyObject *
15539PyUnicode_Format(PyObject *format, PyObject *args)
15540{
15541 struct unicode_formatter_t ctx;
15542
15543 if (format == NULL || args == NULL) {
15544 PyErr_BadInternalCall();
15545 return NULL;
15546 }
15547
15548 if (ensure_unicode(format) < 0)
15549 return NULL;
15550
15551 ctx.fmtstr = format;
15552 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15553 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15554 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15555 ctx.fmtpos = 0;
15556
15557 _PyUnicodeWriter_Init(&ctx.writer);
15558 ctx.writer.min_length = ctx.fmtcnt + 100;
15559 ctx.writer.overallocate = 1;
15560
15561 if (PyTuple_Check(args)) {
15562 ctx.arglen = PyTuple_Size(args);
15563 ctx.argidx = 0;
15564 }
15565 else {
15566 ctx.arglen = -1;
15567 ctx.argidx = -2;
15568 }
15569 ctx.args_owned = 0;
15570 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15571 ctx.dict = args;
15572 else
15573 ctx.dict = NULL;
15574 ctx.args = args;
15575
15576 while (--ctx.fmtcnt >= 0) {
15577 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15578 Py_ssize_t nonfmtpos;
15579
15580 nonfmtpos = ctx.fmtpos++;
15581 while (ctx.fmtcnt >= 0 &&
15582 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15583 ctx.fmtpos++;
15584 ctx.fmtcnt--;
15585 }
15586 if (ctx.fmtcnt < 0) {
15587 ctx.fmtpos--;
15588 ctx.writer.overallocate = 0;
15589 }
15590
15591 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15592 nonfmtpos, ctx.fmtpos) < 0)
15593 goto onError;
15594 }
15595 else {
15596 ctx.fmtpos++;
15597 if (unicode_format_arg(&ctx) == -1)
15598 goto onError;
15599 }
15600 }
15601
15602 if (ctx.argidx < ctx.arglen && !ctx.dict) {
15603 PyErr_SetString(PyExc_TypeError,
15604 "not all arguments converted during string formatting");
15605 goto onError;
15606 }
15607
15608 if (ctx.args_owned) {
15609 Py_DECREF(ctx.args);
15610 }
15611 return _PyUnicodeWriter_Finish(&ctx.writer);
15612
15613 onError:
15614 _PyUnicodeWriter_Dealloc(&ctx.writer);
15615 if (ctx.args_owned) {
15616 Py_DECREF(ctx.args);
15617 }
15618 return NULL;
15619}
15620
15621static PyObject *
15622unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15623
15624/*[clinic input]
15625@classmethod
15626str.__new__ as unicode_new
15627
15628 object as x: object = NULL
15629 encoding: str = NULL
15630 errors: str = NULL
15631
15632[clinic start generated code]*/
15633
15634static PyObject *
15635unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15636 const char *errors)
15637/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15638{
15639 PyObject *unicode;
15640 if (x == NULL) {
15641 unicode = unicode_new_empty();
15642 }
15643 else if (encoding == NULL && errors == NULL) {
15644 unicode = PyObject_Str(x);
15645 }
15646 else {
15647 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15648 }
15649
15650 if (unicode != NULL && type != &PyUnicode_Type) {
15651 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15652 }
15653 return unicode;
15654}
15655
15656static PyObject *
15657unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15658{
15659 PyObject *self;
15660 Py_ssize_t length, char_size;
15661 int share_wstr, share_utf8;
15662 unsigned int kind;
15663 void *data;
15664
15665 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15666 assert(_PyUnicode_CHECK(unicode));
15667 if (PyUnicode_READY(unicode) == -1) {
15668 return NULL;
15669 }
15670
15671 self = type->tp_alloc(type, 0);
15672 if (self == NULL) {
15673 return NULL;
15674 }
15675 kind = PyUnicode_KIND(unicode);
15676 length = PyUnicode_GET_LENGTH(unicode);
15677
15678 _PyUnicode_LENGTH(self) = length;
15679#ifdef Py_DEBUG
15680 _PyUnicode_HASH(self) = -1;
15681#else
15682 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15683#endif
15684 _PyUnicode_STATE(self).interned = 0;
15685 _PyUnicode_STATE(self).kind = kind;
15686 _PyUnicode_STATE(self).compact = 0;
15687 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15688 _PyUnicode_STATE(self).ready = 1;
15689 _PyUnicode_WSTR(self) = NULL;
15690 _PyUnicode_UTF8_LENGTH(self) = 0;
15691 _PyUnicode_UTF8(self) = NULL;
15692 _PyUnicode_WSTR_LENGTH(self) = 0;
15693 _PyUnicode_DATA_ANY(self) = NULL;
15694
15695 share_utf8 = 0;
15696 share_wstr = 0;
15697 if (kind == PyUnicode_1BYTE_KIND) {
15698 char_size = 1;
15699 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15700 share_utf8 = 1;
15701 }
15702 else if (kind == PyUnicode_2BYTE_KIND) {
15703 char_size = 2;
15704 if (sizeof(wchar_t) == 2)
15705 share_wstr = 1;
15706 }
15707 else {
15708 assert(kind == PyUnicode_4BYTE_KIND);
15709 char_size = 4;
15710 if (sizeof(wchar_t) == 4)
15711 share_wstr = 1;
15712 }
15713
15714 /* Ensure we won't overflow the length. */
15715 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15716 PyErr_NoMemory();
15717 goto onError;
15718 }
15719 data = PyObject_Malloc((length + 1) * char_size);
15720 if (data == NULL) {
15721 PyErr_NoMemory();
15722 goto onError;
15723 }
15724
15725 _PyUnicode_DATA_ANY(self) = data;
15726 if (share_utf8) {
15727 _PyUnicode_UTF8_LENGTH(self) = length;
15728 _PyUnicode_UTF8(self) = data;
15729 }
15730 if (share_wstr) {
15731 _PyUnicode_WSTR_LENGTH(self) = length;
15732 _PyUnicode_WSTR(self) = (wchar_t *)data;
15733 }
15734
15735 memcpy(data, PyUnicode_DATA(unicode),
15736 kind * (length + 1));
15737 assert(_PyUnicode_CheckConsistency(self, 1));
15738#ifdef Py_DEBUG
15739 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15740#endif
15741 return self;
15742
15743onError:
15744 Py_DECREF(self);
15745 return NULL;
15746}
15747
/* Docstring of the str type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15759
/* Forward declaration: needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Instances can be subclassed (Py_TPFLAGS_BASETYPE);
   the type defines no tp_traverse/tp_clear and no tp_init, only tp_new. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS |
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15806
15807/* Initialize the Unicode implementation */
15808
15809PyStatus
15810_PyUnicode_Init(PyInterpreterState *interp)
15811{
15812 struct _Py_unicode_state *state = &interp->unicode;
15813 if (unicode_create_empty_string_singleton(state) < 0) {
15814 return _PyStatus_NO_MEMORY();
15815 }
15816
15817 if (_Py_IsMainInterpreter(interp)) {
15818 /* initialize the linebreak bloom filter */
15819 const Py_UCS2 linebreak[] = {
15820 0x000A, /* LINE FEED */
15821 0x000D, /* CARRIAGE RETURN */
15822 0x001C, /* FILE SEPARATOR */
15823 0x001D, /* GROUP SEPARATOR */
15824 0x001E, /* RECORD SEPARATOR */
15825 0x0085, /* NEXT LINE */
15826 0x2028, /* LINE SEPARATOR */
15827 0x2029, /* PARAGRAPH SEPARATOR */
15828 };
15829 bloom_linebreak = make_bloom_mask(
15830 PyUnicode_2BYTE_KIND, linebreak,
15831 Py_ARRAY_LENGTH(linebreak));
15832 }
15833
15834 return _PyStatus_OK();
15835}
15836
15837
15838PyStatus
15839_PyUnicode_InitTypes(void)
15840{
15841 if (PyType_Ready(&PyUnicode_Type) < 0) {
15842 return _PyStatus_ERR("Can't initialize unicode type");
15843 }
15844 if (PyType_Ready(&EncodingMapType) < 0) {
15845 return _PyStatus_ERR("Can't initialize encoding map type");
15846 }
15847 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15848 return _PyStatus_ERR("Can't initialize field name iterator type");
15849 }
15850 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15851 return _PyStatus_ERR("Can't initialize formatter iter type");
15852 }
15853 return _PyStatus_OK();
15854}
15855
15856
/* Intern the string *p in place.

   Nothing is done when *p is an instance of a str subclass or is already
   interned.  In release builds a NULL or non-str *p is silently ignored;
   debug builds assert instead.  The function never leaves an exception
   set: errors are cleared and *p is left unchanged.

   When INTERNED_STRINGS is defined, the string is looked up in the
   "interned" dict (created on first use).  If an equal string is already
   there, *p is replaced by a new reference to it and the reference to the
   original string is released.  Otherwise the string itself is stored and
   marked SSTATE_INTERNED_MORTAL.

   When INTERNED_STRINGS is not defined, only the hash of the string is
   computed. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s)) {
        return;
    }
#endif

    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s)) {
        return;
    }

    if (PyUnicode_CHECK_INTERNED(s)) {
        return;
    }

#ifdef INTERNED_STRINGS
    if (PyUnicode_READY(s) == -1) {
        PyErr_Clear();
        return;
    }

    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }

    /* Insert s as both key and value unless an equal key already exists;
       t is the value now stored for that key. */
    PyObject *t = PyDict_SetDefault(interned, s, s);
    if (t == NULL) {
        PyErr_Clear();
        return;
    }

    if (t != s) {
        /* An equal string was interned before: hand that one out. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }

    /* The two references in interned dict (key and value) are not counted by
       refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
       this. */
    Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
#else
    // PyDict expects that interned strings have their hash
    // (PyASCIIObject.hash) already computed.
    (void)unicode_hash(s);
#endif
}
15917
15918void
15919PyUnicode_InternImmortal(PyObject **p)
15920{
15921 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15922 "PyUnicode_InternImmortal() is deprecated; "
15923 "use PyUnicode_InternInPlace() instead", 1) < 0)
15924 {
15925 // The function has no return value, the exception cannot
15926 // be reported to the caller, so just log it.
15927 PyErr_WriteUnraisable(NULL);
15928 }
15929
15930 PyUnicode_InternInPlace(p);
15931 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15932 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15933 Py_INCREF(*p);
15934 }
15935}
15936
15937PyObject *
15938PyUnicode_InternFromString(const char *cp)
15939{
15940 PyObject *s = PyUnicode_FromString(cp);
15941 if (s == NULL)
15942 return NULL;
15943 PyUnicode_InternInPlace(&s);
15944 return s;
15945}
15946
15947
15948void
15949_PyUnicode_ClearInterned(PyInterpreterState *interp)
15950{
15951 if (!_Py_IsMainInterpreter(interp)) {
15952 // interned dict is shared by all interpreters
15953 return;
15954 }
15955
15956 if (interned == NULL) {
15957 return;
15958 }
15959 assert(PyDict_CheckExact(interned));
15960
15961 /* Interned unicode strings are not forcibly deallocated; rather, we give
15962 them their stolen references back, and then clear and DECREF the
15963 interned dict. */
15964
15965#ifdef INTERNED_STATS
15966 fprintf(stderr, "releasing %zd interned strings\n",
15967 PyDict_GET_SIZE(interned));
15968
15969 Py_ssize_t immortal_size = 0, mortal_size = 0;
15970#endif
15971 Py_ssize_t pos = 0;
15972 PyObject *s, *ignored_value;
15973 while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15974 assert(PyUnicode_IS_READY(s));
15975
15976 switch (PyUnicode_CHECK_INTERNED(s)) {
15977 case SSTATE_INTERNED_IMMORTAL:
15978 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15979#ifdef INTERNED_STATS
15980 immortal_size += PyUnicode_GET_LENGTH(s);
15981#endif
15982 break;
15983 case SSTATE_INTERNED_MORTAL:
15984 // Restore the two references (key and value) ignored
15985 // by PyUnicode_InternInPlace().
15986 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15987#ifdef INTERNED_STATS
15988 mortal_size += PyUnicode_GET_LENGTH(s);
15989#endif
15990 break;
15991 case SSTATE_NOT_INTERNED:
15992 /* fall through */
15993 default:
15994 Py_UNREACHABLE();
15995 }
15996 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15997 }
15998#ifdef INTERNED_STATS
15999 fprintf(stderr,
16000 "total size of all interned strings: %zd/%zd mortal/immortal\n",
16001 mortal_size, immortal_size);
16002#endif
16003
16004 PyDict_Clear(interned);
16005 Py_CLEAR(interned);
16006}
16007
16008
16009/********************* Unicode Iterator **************************/
16010
/* Iterator over the characters of a str (the "str_iterator" type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;  /* Index of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
16016
/* Deallocate the iterator: untrack it from the GC, release the reference to
   the string being iterated (if any) and free the object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator has been exhausted. */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
16024
/* GC traversal: the only object referenced by the iterator is it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
16031
16032static PyObject *
16033unicodeiter_next(unicodeiterobject *it)
16034{
16035 PyObject *seq, *item;
16036
16037 assert(it != NULL);
16038 seq = it->it_seq;
16039 if (seq == NULL)
16040 return NULL;
16041 assert(_PyUnicode_CHECK(seq));
16042
16043 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
16044 int kind = PyUnicode_KIND(seq);
16045 const void *data = PyUnicode_DATA(seq);
16046 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
16047 item = PyUnicode_FromOrdinal(chr);
16048 if (item != NULL)
16049 ++it->it_index;
16050 return item;
16051 }
16052
16053 it->it_seq = NULL;
16054 Py_DECREF(seq);
16055 return NULL;
16056}
16057
16058static PyObject *
16059unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16060{
16061 Py_ssize_t len = 0;
16062 if (it->it_seq)
16063 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
16064 return PyLong_FromSsize_t(len);
16065}
16066
16067PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16068
16069static PyObject *
16070unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16071{
16072 _Py_IDENTIFIER(iter);
16073 if (it->it_seq != NULL) {
16074 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
16075 it->it_seq, it->it_index);
16076 } else {
16077 PyObject *u = (PyObject *)_PyUnicode_New(0);
16078 if (u == NULL)
16079 return NULL;
16080 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
16081 }
16082}
16083
/* Docstring attached to "__reduce__" in unicodeiter_methods. */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16085
16086static PyObject *
16087unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
16088{
16089 Py_ssize_t index = PyLong_AsSsize_t(state);
16090 if (index == -1 && PyErr_Occurred())
16091 return NULL;
16092 if (it->it_seq != NULL) {
16093 if (index < 0)
16094 index = 0;
16095 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16096 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16097 it->it_index = index;
16098 }
16099 Py_RETURN_NONE;
16100}
16101
/* Docstring attached to "__setstate__" in unicodeiter_methods. */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16103
16104static PyMethodDef unicodeiter_methods[] = {
16105 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
16106 length_hint_doc},
16107 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16108 reduce_doc},
16109 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
16110 setstate_doc},
16111 {NULL, NULL} /* sentinel */
16112};
16113
16114PyTypeObject PyUnicodeIter_Type = {
16115 PyVarObject_HEAD_INIT(&PyType_Type, 0)
16116 "str_iterator", /* tp_name */
16117 sizeof(unicodeiterobject), /* tp_basicsize */
16118 0, /* tp_itemsize */
16119 /* methods */
16120 (destructor)unicodeiter_dealloc, /* tp_dealloc */
16121 0, /* tp_vectorcall_offset */
16122 0, /* tp_getattr */
16123 0, /* tp_setattr */
16124 0, /* tp_as_async */
16125 0, /* tp_repr */
16126 0, /* tp_as_number */
16127 0, /* tp_as_sequence */
16128 0, /* tp_as_mapping */
16129 0, /* tp_hash */
16130 0, /* tp_call */
16131 0, /* tp_str */
16132 PyObject_GenericGetAttr, /* tp_getattro */
16133 0, /* tp_setattro */
16134 0, /* tp_as_buffer */
16135 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16136 0, /* tp_doc */
16137 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16138 0, /* tp_clear */
16139 0, /* tp_richcompare */
16140 0, /* tp_weaklistoffset */
16141 PyObject_SelfIter, /* tp_iter */
16142 (iternextfunc)unicodeiter_next, /* tp_iternext */
16143 unicodeiter_methods, /* tp_methods */
16144 0,
16145};
16146
16147static PyObject *
16148unicode_iter(PyObject *seq)
16149{
16150 unicodeiterobject *it;
16151
16152 if (!PyUnicode_Check(seq)) {
16153 PyErr_BadInternalCall();
16154 return NULL;
16155 }
16156 if (PyUnicode_READY(seq) == -1)
16157 return NULL;
16158 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16159 if (it == NULL)
16160 return NULL;
16161 it->it_index = 0;
16162 Py_INCREF(seq);
16163 it->it_seq = seq;
16164 _PyObject_GC_TRACK(it);
16165 return (PyObject *)it;
16166}
16167
16168static int
16169encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16170{
16171 int res;
16172 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16173 if (res == -2) {
16174 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16175 return -1;
16176 }
16177 if (res < 0) {
16178 PyErr_NoMemory();
16179 return -1;
16180 }
16181 return 0;
16182}
16183
16184
16185static int
16186config_get_codec_name(wchar_t **config_encoding)
16187{
16188 char *encoding;
16189 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16190 return -1;
16191 }
16192
16193 PyObject *name_obj = NULL;
16194 PyObject *codec = _PyCodec_Lookup(encoding);
16195 PyMem_RawFree(encoding);
16196
16197 if (!codec)
16198 goto error;
16199
16200 name_obj = PyObject_GetAttrString(codec, "name");
16201 Py_CLEAR(codec);
16202 if (!name_obj) {
16203 goto error;
16204 }
16205
16206 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16207 Py_DECREF(name_obj);
16208 if (wname == NULL) {
16209 goto error;
16210 }
16211
16212 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16213 if (raw_wname == NULL) {
16214 PyMem_Free(wname);
16215 PyErr_NoMemory();
16216 goto error;
16217 }
16218
16219 PyMem_RawFree(*config_encoding);
16220 *config_encoding = raw_wname;
16221
16222 PyMem_Free(wname);
16223 return 0;
16224
16225error:
16226 Py_XDECREF(codec);
16227 Py_XDECREF(name_obj);
16228 return -1;
16229}
16230
16231
16232static PyStatus
16233init_stdio_encoding(PyInterpreterState *interp)
16234{
16235 /* Update the stdio encoding to the normalized Python codec name. */
16236 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16237 if (config_get_codec_name(&config->stdio_encoding) < 0) {
16238 return _PyStatus_ERR("failed to get the Python codec name "
16239 "of the stdio encoding");
16240 }
16241 return _PyStatus_OK();
16242}
16243
16244
16245static int
16246init_fs_codec(PyInterpreterState *interp)
16247{
16248 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16249
16250 _Py_error_handler error_handler;
16251 error_handler = get_error_handler_wide(config->filesystem_errors);
16252 if (error_handler == _Py_ERROR_UNKNOWN) {
16253 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16254 return -1;
16255 }
16256
16257 char *encoding, *errors;
16258 if (encode_wstr_utf8(config->filesystem_encoding,
16259 &encoding,
16260 "filesystem_encoding") < 0) {
16261 return -1;
16262 }
16263
16264 if (encode_wstr_utf8(config->filesystem_errors,
16265 &errors,
16266 "filesystem_errors") < 0) {
16267 PyMem_RawFree(encoding);
16268 return -1;
16269 }
16270
16271 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16272 PyMem_RawFree(fs_codec->encoding);
16273 fs_codec->encoding = encoding;
16274 /* encoding has been normalized by init_fs_encoding() */
16275 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16276 PyMem_RawFree(fs_codec->errors);
16277 fs_codec->errors = errors;
16278 fs_codec->error_handler = error_handler;
16279
16280#ifdef _Py_FORCE_UTF8_FS_ENCODING
16281 assert(fs_codec->utf8 == 1);
16282#endif
16283
16284 /* At this point, PyUnicode_EncodeFSDefault() and
16285 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16286 the C implementation of the filesystem encoding. */
16287
16288 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16289 global configuration variables. */
16290 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16291 fs_codec->errors) < 0) {
16292 PyErr_NoMemory();
16293 return -1;
16294 }
16295 return 0;
16296}
16297
16298
16299static PyStatus
16300init_fs_encoding(PyThreadState *tstate)
16301{
16302 PyInterpreterState *interp = tstate->interp;
16303
16304 /* Update the filesystem encoding to the normalized Python codec name.
16305 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16306 (Python codec name). */
16307 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16308 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16309 _Py_DumpPathConfig(tstate);
16310 return _PyStatus_ERR("failed to get the Python codec "
16311 "of the filesystem encoding");
16312 }
16313
16314 if (init_fs_codec(interp) < 0) {
16315 return _PyStatus_ERR("cannot initialize filesystem codec");
16316 }
16317 return _PyStatus_OK();
16318}
16319
16320
16321PyStatus
16322_PyUnicode_InitEncodings(PyThreadState *tstate)
16323{
16324 PyStatus status = init_fs_encoding(tstate);
16325 if (_PyStatus_EXCEPTION(status)) {
16326 return status;
16327 }
16328
16329 return init_stdio_encoding(tstate->interp);
16330}
16331
16332
16333static void
16334_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16335{
16336 PyMem_RawFree(fs_codec->encoding);
16337 fs_codec->encoding = NULL;
16338 fs_codec->utf8 = 0;
16339 PyMem_RawFree(fs_codec->errors);
16340 fs_codec->errors = NULL;
16341 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16342}
16343
16344
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec state.

   Returns the result of init_fs_codec(), or -1 with MemoryError set when
   the new configuration strings cannot be allocated; in that case the
   configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16370
16371
16372void
16373_PyUnicode_Fini(PyInterpreterState *interp)
16374{
16375 struct _Py_unicode_state *state = &interp->unicode;
16376
16377 if (_Py_IsMainInterpreter(interp)) {
16378 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16379 assert(interned == NULL);
16380 // bpo-47182: force a unicodedata CAPI capsule re-import on
16381 // subsequent initialization of main interpreter.
16382 ucnhash_capi = NULL;
16383 }
16384
16385 _PyUnicode_FiniEncodings(&state->fs_codec);
16386
16387 unicode_clear_identifiers(state);
16388
16389 for (Py_ssize_t i = 0; i < 256; i++) {
16390 Py_CLEAR(state->latin1[i]);
16391 }
16392 Py_CLEAR(state->empty_string);
16393}
16394
16395
16396/* A _string module, to export formatter_parser and formatter_field_name_split
16397 to the string.Formatter class implemented in Python. */
16398
16399static PyMethodDef _string_methods[] = {
16400 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16401 METH_O, PyDoc_STR("split the argument as a field name")},
16402 {"formatter_parser", (PyCFunction) formatter_parser,
16403 METH_O, PyDoc_STR("parse the argument as a format string")},
16404 {NULL, NULL}
16405};
16406
16407static struct PyModuleDef _string_module = {
16408 PyModuleDef_HEAD_INIT,
16409 .m_name = "_string",
16410 .m_doc = PyDoc_STR("string helper module"),
16411 .m_size = 0,
16412 .m_methods = _string_methods,
16413};
16414
16415PyMODINIT_FUNC
16416PyInit__string(void)
16417{
16418 return PyModuleDef_Init(&_string_module);
16419}
16420
16421
16422#ifdef __cplusplus
16423}
16424#endif
16425