unicodeobject.h source code [python/Include/unicodeobject.h]

1	#ifndef Py_UNICODEOBJECT_H
2	#define Py_UNICODEOBJECT_H
3
4	#include <stdarg.h>
5
6	/*
7
8	Unicode implementation based on original code by Fredrik Lundh,
9	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10	Unicode Integration Proposal. (See
11	http://www.egenix.com/files/python/unicode-proposal.txt).
12
13	Copyright (c) Corporation for National Research Initiatives.
14
15
16	Original header:
17	--------------------------------------------------------------------
18
19	* Yet another Unicode string type for Python. This type supports the
20	* 16-bit Basic Multilingual Plane (BMP) only.
21	*
22	* Written by Fredrik Lundh, January 1999.
23	*
24	* Copyright (c) 1999 by Secret Labs AB.
25	* Copyright (c) 1999 by Fredrik Lundh.
26	*
27	* fredrik@pythonware.com
28	* http://www.pythonware.com
29	*
30	* --------------------------------------------------------------------
31	* This Unicode String Type is
32	*
33	* Copyright (c) 1999 by Secret Labs AB
34	* Copyright (c) 1999 by Fredrik Lundh
35	*
36	* By obtaining, using, and/or copying this software and/or its
37	* associated documentation, you agree that you have read, understood,
38	* and will comply with the following terms and conditions:
39	*
40	* Permission to use, copy, modify, and distribute this software and its
41	* associated documentation for any purpose and without fee is hereby
42	* granted, provided that the above copyright notice appears in all
43	* copies, and that both that copyright notice and this permission notice
44	* appear in supporting documentation, and that the name of Secret Labs
45	* AB or the author not be used in advertising or publicity pertaining to
46	* distribution of the software without specific, written prior
47	* permission.
48	*
49	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56	* -------------------------------------------------------------------- */
57
58	#include <ctype.h>
59
60	/ === Internal API ======================================================= /
61
62	/ --- Internal Unicode Format -------------------------------------------- /
63
64	/ Python 3.x requires unicode /
65	#define Py_USING_UNICODE
66
67	#ifndef SIZEOF_WCHAR_T
68	#error Must define SIZEOF_WCHAR_T
69	#endif
70
71	#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
   Otherwise, Unicode strings are stored as UCS-2 (with limited support
   for UTF-16) */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
80
/* Set these flags if the platform has "wchar.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
#  include <wchar.h>
#endif
99
/* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
   unicode representations: fixed-width unsigned integers of 32, 16
   and 8 bits. */
typedef uint32_t Py_UCS4;
typedef uint16_t Py_UCS2;
typedef uint8_t Py_UCS1;
105
106	#ifdef __cplusplus
107	extern "C" {
108	#endif
109
110
111	PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112	PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113
114	#define PyUnicode_Check(op) \
115	PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
116	#define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
117
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126
127	/ === Public API ========================================================= /
128
129	/ Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes /
130	PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131	const char u, /* UTF-8 encoded string /
132	Py_ssize_t size / size of buffer /
133	);
134
135	/ Similar to PyUnicode_FromUnicode(), but u points to null-terminated*
136	UTF-8 encoded bytes. The size is determined with strlen(). /*
137	PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138	const char u /* UTF-8 encoded string /
139	);
140
141	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
142	PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143	PyObject *str,
144	Py_ssize_t start,
145	Py_ssize_t end);
146	#endif
147
148	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
149	/ Copy the string into a UCS4 buffer including the null character if copy_null*
150	is set. Return NULL and raise an exception on error. Raise a SystemError if
151	the buffer is smaller than the string. Return buffer on success.
152
153	buflen is the length of the buffer in (Py_UCS4) characters. /*
154	PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155	PyObject *unicode,
156	Py_UCS4* buffer,
157	Py_ssize_t buflen,
158	int copy_null);
159
160	/ Copy the string into a UCS4 buffer. A new buffer is allocated using*
161	* PyMem_Malloc; if this fails, NULL is returned with a memory error
162	exception set. /*
163	PyAPI_FUNC(Py_UCS4) PyUnicode_AsUCS4Copy(PyObject unicode);
164	#endif
165
166	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
167	/ Get the length of the Unicode object. /
168
169	PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170	PyObject *unicode
171	);
172	#endif
173
174	/ Get the number of Py_UNICODE units in the*
175	string representation. /*
176
177	Py_DEPRECATED(`3.3`) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178	PyObject unicode /* Unicode object /
179	);
180
181	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
182	/ Read a character from the string. /
183
184	PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185	PyObject *unicode,
186	Py_ssize_t index
187	);
188
189	/ Write a character to the string. The string must have been created through*
190	PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192	Return 0 on success, -1 on error. /*
193
194	PyAPI_FUNC(int) PyUnicode_WriteChar(
195	PyObject *unicode,
196	Py_ssize_t index,
197	Py_UCS4 character
198	);
199	#endif
200
201	/ Resize a Unicode object. The length is the number of characters, except*
202	if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203	is the number of Py_UNICODE characters.
204
205	*unicode is modified to point to the new (resized) object and 0
206	returned on success.
207
208	Try to resize the string in place (which is usually faster than allocating
209	a new string and copy characters), or create a new string.
210
211	Error handling is implemented as follows: an exception is set, -1
212	is returned and unicode left untouched.*
213
214	WARNING: The function doesn't check string content, the result may not be a
215	string in canonical representation. /*
216
217	PyAPI_FUNC(int) PyUnicode_Resize(
218	PyObject *unicode, /* Pointer to the Unicode object /
219	Py_ssize_t length / New length /
220	);
221
222	/ Decode obj to a Unicode object.*
223
224	bytes, bytearray and other bytes-like objects are decoded according to the
225	given encoding and error handler. The encoding and error handler can be
226	NULL to have the interface use UTF-8 and "strict".
227
228	All other objects (including Unicode objects) raise an exception.
229
230	The API returns NULL in case of an error. The caller is responsible
231	for decref'ing the returned objects.
232
233	*/
234
235	PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236	PyObject obj, /* Object /
237	const char encoding, /* encoding /
238	const char errors /* error handling /
239	);
240
241	/ Copy an instance of a Unicode subtype to a new true Unicode object if*
242	necessary. If obj is already a true Unicode object (not a subtype), return
243	the reference with incremented* refcount.*
244
245	The API returns NULL in case of an error. The caller is responsible
246	for decref'ing the returned objects.
247
248	*/
249
250	PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251	PyObject obj /* Object /
252	);
253
254	PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255	const char format, /* ASCII-encoded string /
256	va_list vargs
257	);
258	PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259	const char format, /* ASCII-encoded string /
260	...
261	);
262
263	PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264	PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
265	const char u /* UTF-8 encoded string /
266	);
267
268	// PyUnicode_InternImmortal() is deprecated since Python 3.10
269	// and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
270	Py_DEPRECATED(`3.10`) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
271
272	/ Use only if you know it's a string /
273	#define PyUnicode_CHECK_INTERNED(op) \
274	(((PyASCIIObject *)(op))->state.interned)
275
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyObject *unicode,          /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Convert the Unicode object to a wide character string. The output string
   always ends with a nul character. If size is not NULL, write the number of
   wide characters (excluding the null character) into *size.

   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
   on success. On error, returns NULL, *size is undefined and raises a
   MemoryError. */

PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* number of characters of the result */
    );

#endif
322
323	/ --- Unicode ordinals --------------------------------------------------- /
324
325	/ Create a Unicode Object from the given Unicode code point ordinal.*
326
327	The ordinal must be in range(0x110000). A ValueError is
328	raised in case it is not.
329
330	*/
331
332	PyAPI_FUNC(PyObject) PyUnicode_FromOrdinal(int* ordinal);
333
334	/ === Builtin Codecs =====================================================*
335
336	Many of these APIs take two arguments encoding and errors. These
337	parameters encoding and errors have the same semantics as the ones
338	of the builtin str() API.
339
340	Setting encoding to NULL causes the default encoding (UTF-8) to be used.
341
342	Error handling is set by errors which may also be set to NULL
343	meaning to use the default handling defined for the codec. Default
344	error handling for all builtin codecs is "strict" (ValueErrors are
345	raised).
346
347	The codecs all use a similar interface. Only deviation from the
348	generic ones are documented.
349
350	*/
351
352	/ --- Manage the default encoding ---------------------------------------- /
353
354	/ Returns "utf-8". /
355	PyAPI_FUNC(const char) PyUnicode_GetDefaultEncoding(void*);
356
357	/ --- Generic Codecs ----------------------------------------------------- /
358
359	/ Create a Unicode object by decoding the encoded string s of the*
360	given size. /*
361
362	PyAPI_FUNC(PyObject*) PyUnicode_Decode(
363	const char s, /* encoded string /
364	Py_ssize_t size, / size of buffer /
365	const char encoding, /* encoding /
366	const char errors /* error handling /
367	);
368
369	/ Decode a Unicode object unicode and return the result as Python*
370	object.
371
372	This API is DEPRECATED. The only supported standard encoding is rot13.
373	Use PyCodec_Decode() to decode with rot13 and non-standard codecs
374	that decode from str. /*
375
376	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
377	PyObject unicode, /* Unicode object /
378	const char encoding, /* encoding /
379	const char errors /* error handling /
380	);
381
382	/ Decode a Unicode object unicode and return the result as Unicode*
383	object.
384
385	This API is DEPRECATED. The only supported standard encoding is rot13.
386	Use PyCodec_Decode() to decode with rot13 and non-standard codecs
387	that decode from str to str. /*
388
389	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
390	PyObject unicode, /* Unicode object /
391	const char encoding, /* encoding /
392	const char errors /* error handling /
393	);
394
395	/ Encodes a Unicode object and returns the result as Python*
396	object.
397
398	This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
399	since all standard encodings (except rot13) encode str to bytes.
400	Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
401	that encode form str to non-bytes. /*
402
403	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
404	PyObject unicode, /* Unicode object /
405	const char encoding, /* encoding /
406	const char errors /* error handling /
407	);
408
409	/ Encodes a Unicode object and returns the result as Python string*
410	object. /*
411
412	PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
413	PyObject unicode, /* Unicode object /
414	const char encoding, /* encoding /
415	const char errors /* error handling /
416	);
417
418	/ Encodes a Unicode object and returns the result as Unicode*
419	object.
420
421	This API is DEPRECATED. The only supported standard encodings is rot13.
422	Use PyCodec_Encode() to encode with rot13 and non-standard codecs
423	that encode from str to str. /*
424
425	Py_DEPRECATED(`3.6`) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
426	PyObject unicode, /* Unicode object /
427	const char encoding, /* encoding /
428	const char errors /* error handling /
429	);
430
431	/ Build an encoding map. /
432
433	PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
434	PyObject* string / 256 character map /
435	);
436
437	/ --- UTF-7 Codecs ------------------------------------------------------- /
438
439	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
440	const char string, /* UTF-7 encoded string /
441	Py_ssize_t length, / size of string /
442	const char errors /* error handling /
443	);
444
445	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
446	const char string, /* UTF-7 encoded string /
447	Py_ssize_t length, / size of string /
448	const char errors, /* error handling /
449	Py_ssize_t consumed /* bytes consumed /
450	);
451
452	/ --- UTF-8 Codecs ------------------------------------------------------- /
453
454	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
455	const char string, /* UTF-8 encoded string /
456	Py_ssize_t length, / size of string /
457	const char errors /* error handling /
458	);
459
460	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
461	const char string, /* UTF-8 encoded string /
462	Py_ssize_t length, / size of string /
463	const char errors, /* error handling /
464	Py_ssize_t consumed /* bytes consumed /
465	);
466
467	PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
468	PyObject unicode /* Unicode object /
469	);
470
471	/ Returns a pointer to the default encoding (UTF-8) of the*
472	Unicode object unicode and the size of the encoded representation
473	in bytes stored in size.*
474
475	In case of an error, no size is set.*
476
477	This function caches the UTF-8 encoded string in the unicodeobject
478	and subsequent calls will return the same string. The memory is released
479	when the unicodeobject is deallocated.
480	*/
481
482	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x030A0000
483	PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
484	PyObject *unicode,
485	Py_ssize_t *size);
486	#endif
487
488	/ --- UTF-32 Codecs ------------------------------------------------------ /
489
490	/ Decodes length bytes from a UTF-32 encoded buffer string and returns*
491	the corresponding Unicode object.
492
493	errors (if non-NULL) defines the error handling. It defaults
494	to "strict".
495
496	If byteorder is non-NULL, the decoder starts decoding using the
497	given byte order:
498
499	*byteorder == -1: little endian
500	*byteorder == 0: native order
501	*byteorder == 1: big endian
502
503	In native mode, the first four bytes of the stream are checked for a
504	BOM mark. If found, the BOM mark is analysed, the byte order
505	adjusted and the BOM skipped. In the other modes, no BOM mark
506	interpretation is done. After completion, byteorder is set to the*
507	current byte order at the end of input data.
508
509	If byteorder is NULL, the codec starts in native order mode.
510
511	*/
512
513	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
514	const char string, /* UTF-32 encoded string /
515	Py_ssize_t length, / size of string /
516	const char errors, /* error handling /
517	int byteorder /* pointer to byteorder to use*
518	0=native;-1=LE,1=BE; updated on
519	exit /*
520	);
521
522	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
523	const char string, /* UTF-32 encoded string /
524	Py_ssize_t length, / size of string /
525	const char errors, /* error handling /
526	int byteorder, /* pointer to byteorder to use*
527	0=native;-1=LE,1=BE; updated on
528	exit /*
529	Py_ssize_t consumed /* bytes consumed /
530	);
531
532	/ Returns a Python string using the UTF-32 encoding in native byte*
533	order. The string always starts with a BOM mark. /*
534
535	PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
536	PyObject unicode /* Unicode object /
537	);
538
539	/ Returns a Python string object holding the UTF-32 encoded value of*
540	the Unicode data.
541
542	If byteorder is not 0, output is written according to the following
543	byte order:
544
545	byteorder == -1: little endian
546	byteorder == 0: native byte order (writes a BOM mark)
547	byteorder == 1: big endian
548
549	If byteorder is 0, the output string will always start with the
550	Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
551	prepended.
552
553	*/
554
555	/ --- UTF-16 Codecs ------------------------------------------------------ /
556
557	/ Decodes length bytes from a UTF-16 encoded buffer string and returns*
558	the corresponding Unicode object.
559
560	errors (if non-NULL) defines the error handling. It defaults
561	to "strict".
562
563	If byteorder is non-NULL, the decoder starts decoding using the
564	given byte order:
565
566	*byteorder == -1: little endian
567	*byteorder == 0: native order
568	*byteorder == 1: big endian
569
570	In native mode, the first two bytes of the stream are checked for a
571	BOM mark. If found, the BOM mark is analysed, the byte order
572	adjusted and the BOM skipped. In the other modes, no BOM mark
573	interpretation is done. After completion, byteorder is set to the*
574	current byte order at the end of input data.
575
576	If byteorder is NULL, the codec starts in native order mode.
577
578	*/
579
580	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
581	const char string, /* UTF-16 encoded string /
582	Py_ssize_t length, / size of string /
583	const char errors, /* error handling /
584	int byteorder /* pointer to byteorder to use*
585	0=native;-1=LE,1=BE; updated on
586	exit /*
587	);
588
589	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
590	const char string, /* UTF-16 encoded string /
591	Py_ssize_t length, / size of string /
592	const char errors, /* error handling /
593	int byteorder, /* pointer to byteorder to use*
594	0=native;-1=LE,1=BE; updated on
595	exit /*
596	Py_ssize_t consumed /* bytes consumed /
597	);
598
599	/ Returns a Python string using the UTF-16 encoding in native byte*
600	order. The string always starts with a BOM mark. /*
601
602	PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
603	PyObject unicode /* Unicode object /
604	);
605
606	/ --- Unicode-Escape Codecs ---------------------------------------------- /
607
608	PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
609	const char string, /* Unicode-Escape encoded string /
610	Py_ssize_t length, / size of string /
611	const char errors /* error handling /
612	);
613
614	PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
615	PyObject unicode /* Unicode object /
616	);
617
618	/ --- Raw-Unicode-Escape Codecs ------------------------------------------ /
619
620	PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
621	const char string, /* Raw-Unicode-Escape encoded string /
622	Py_ssize_t length, / size of string /
623	const char errors /* error handling /
624	);
625
626	PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
627	PyObject unicode /* Unicode object /
628	);
629
630	/ --- Latin-1 Codecs -----------------------------------------------------*
631
632	Note: Latin-1 corresponds to the first 256 Unicode ordinals. /*
633
634	PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
635	const char string, /* Latin-1 encoded string /
636	Py_ssize_t length, / size of string /
637	const char errors /* error handling /
638	);
639
640	PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
641	PyObject unicode /* Unicode object /
642	);
643
644	/ --- ASCII Codecs -------------------------------------------------------*
645
646	Only 7-bit ASCII data is excepted. All other codes generate errors.
647
648	*/
649
650	PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
651	const char string, /* ASCII encoded string /
652	Py_ssize_t length, / size of string /
653	const char errors /* error handling /
654	);
655
656	PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
657	PyObject unicode /* Unicode object /
658	);
659
660	/ --- Character Map Codecs -----------------------------------------------*
661
662	This codec uses mappings to encode and decode characters.
663
664	Decoding mappings must map byte ordinals (integers in the range from 0 to
665	255) to Unicode strings, integers (which are then interpreted as Unicode
666	ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
667	as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
668	mapping" and cause an error.
669
670	Encoding mappings must map Unicode ordinal integers to bytes objects,
671	integers in the range from 0 to 255 or None. Unmapped character
672	ordinals (ones which cause a LookupError) as well as mapped to
673	None are treated as "undefined mapping" and cause an error.
674
675	*/
676
677	PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
678	const char string, /* Encoded string /
679	Py_ssize_t length, / size of string /
680	PyObject mapping, /* decoding mapping /
681	const char errors /* error handling /
682	);
683
684	PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
685	PyObject unicode, /* Unicode object /
686	PyObject mapping /* encoding mapping /
687	);
688
/* --- MBCS codecs for Windows -------------------------------------------- */

#ifdef MS_WINDOWS
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
    );

PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
    const char *string,         /* MBCS encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
    int code_page,              /* code page number */
    const char *string,         /* encoded string */
    Py_ssize_t length,          /* size of string */
    const char *errors,         /* error handling */
    Py_ssize_t *consumed        /* bytes consumed */
    );
#endif

PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
    int code_page,              /* code page number */
    PyObject *unicode,          /* Unicode object */
    const char *errors          /* error handling */
    );
#endif

#endif /* MS_WINDOWS */
728
729	/ --- Locale encoding --------------------------------------------------- /
730
731	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
732	/ Decode a string from the current locale encoding. The decoder is strict if*
733	surrogateescape is equal to zero, otherwise it uses the 'surrogateescape'
734	error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
735	be decoded as a surrogate character and surrogateescape* is not equal to*
736	zero, the byte sequence is escaped using the 'surrogateescape' error handler
737	instead of being decoded. str* must end with a null character but cannot*
738	contain embedded null characters. /*
739
740	PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
741	const char *str,
742	Py_ssize_t len,
743	const char *errors);
744
745	/ Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string*
746	length using strlen(). /*
747
748	PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
749	const char *str,
750	const char *errors);
751
752	/ Encode a Unicode object to the current locale encoding. The encoder is*
753	strict is surrogateescape* is equal to zero, otherwise the*
754	"surrogateescape" error handler is used. Return a bytes object. The string
755	cannot contain embedded null characters. /*
756
757	PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
758	PyObject *unicode,
759	const char *errors
760	);
761	#endif
762
763	/ --- File system encoding ---------------------------------------------- /
764
765	/ ParseTuple converter: encode str objects to bytes using*
766	PyUnicode_EncodeFSDefault(); bytes objects are output as-is. /*
767
768	PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject, void**);
769
770	/ ParseTuple converter: decode bytes objects to unicode using*
771	PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. /*
772
773	PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject, void**);
774
775	/ Decode a null-terminated string using Py_FileSystemDefaultEncoding*
776	and the "surrogateescape" error handler.
777
778	If Py_FileSystemDefaultEncoding is not set, fall back to the locale
779	encoding.
780
781	Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
782	*/
783
784	PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
785	const char s /* encoded string /
786	);
787
788	/ Decode a string using Py_FileSystemDefaultEncoding*
789	and the "surrogateescape" error handler.
790
791	If Py_FileSystemDefaultEncoding is not set, fall back to the locale
792	encoding.
793	*/
794
795	PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
796	const char s, /* encoded string /
797	Py_ssize_t size / size /
798	);
799
800	/ Encode a Unicode object to Py_FileSystemDefaultEncoding with the*
801	"surrogateescape" error handler, and return bytes.
802
803	If Py_FileSystemDefaultEncoding is not set, fall back to the locale
804	encoding.
805	*/
806
807	PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
808	PyObject *unicode
809	);
810
811	/ --- Methods & Slots ----------------------------------------------------*
812
813	These are capable of handling Unicode objects and strings on input
814	(we refer to them as strings in the descriptions) and return
815	Unicode objects or integers as appropriate. /*
816
817	/ Concat two strings giving a new Unicode string. /
818
819	PyAPI_FUNC(PyObject*) PyUnicode_Concat(
820	PyObject left, /* Left string /
821	PyObject right /* Right string /
822	);
823
824	/ Concat two strings and put the result in pleft
825	(sets pleft to NULL on error) /
826
827	PyAPI_FUNC(void) PyUnicode_Append(
828	PyObject *pleft, /* Pointer to left string /
829	PyObject right /* Right string /
830	);
831
832	/ Concat two strings, put the result in pleft and drop the right object
833	(sets pleft to NULL on error) /
834
835	PyAPI_FUNC(void) PyUnicode_AppendAndDel(
836	PyObject *pleft, /* Pointer to left string /
837	PyObject right /* Right string /
838	);
839
840	/ Split a string giving a list of Unicode strings.*
841
842	If sep is NULL, splitting will be done at all whitespace
843	substrings. Otherwise, splits occur at the given separator.
844
845	At most maxsplit splits will be done. If negative, no limit is set.
846
847	Separators are not included in the resulting list.
848
849	*/
850
851	PyAPI_FUNC(PyObject*) PyUnicode_Split(
852	PyObject s, /* String to split /
853	PyObject sep, /* String separator /
854	Py_ssize_t maxsplit / Maxsplit count /
855	);
856
857	/ Dito, but split at line breaks.*
858
859	CRLF is considered to be one line break. Line breaks are not
860	included in the resulting list. /*
861
862	PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
863	PyObject s, /* String to split /
864	int keepends / If true, line end markers are included /
865	);
866
867	/ Partition a string using a given separator. /
868
869	PyAPI_FUNC(PyObject*) PyUnicode_Partition(
870	PyObject s, /* String to partition /
871	PyObject sep /* String separator /
872	);
873
874	/ Partition a string using a given separator, searching from the end of the*
875	string. /*
876
877	PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
878	PyObject s, /* String to partition /
879	PyObject sep /* String separator /
880	);
881
882	/ Split a string giving a list of Unicode strings.*
883
884	If sep is NULL, splitting will be done at all whitespace
885	substrings. Otherwise, splits occur at the given separator.
886
887	At most maxsplit splits will be done. But unlike PyUnicode_Split
888	PyUnicode_RSplit splits from the end of the string. If negative,
889	no limit is set.
890
891	Separators are not included in the resulting list.
892
893	*/
894
895	PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
896	PyObject s, /* String to split /
897	PyObject sep, /* String separator /
898	Py_ssize_t maxsplit / Maxsplit count /
899	);
900
901	/ Translate a string by applying a character mapping table to it and*
902	return the resulting Unicode object.
903
904	The mapping table must map Unicode ordinal integers to Unicode strings,
905	Unicode ordinal integers or None (causing deletion of the character).
906
907	Mapping tables may be dictionaries or sequences. Unmapped character
908	ordinals (ones which cause a LookupError) are left untouched and
909	are copied as-is.
910
911	*/
912
913	PyAPI_FUNC(PyObject *) PyUnicode_Translate(
914	PyObject str, /* String /
915	PyObject table, /* Translate table /
916	const char errors /* error handling /
917	);
918
919	/ Join a sequence of strings using the given separator and return*
920	the resulting Unicode string. /*
921
922	PyAPI_FUNC(PyObject*) PyUnicode_Join(
923	PyObject separator, /* Separator string /
924	PyObject seq /* Sequence object /
925	);
926
927	/ Return 1 if substr matches str[start:end] at the given tail end, 0*
928	otherwise. /*
929
930	PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
931	PyObject str, /* String /
932	PyObject substr, /* Prefix or Suffix string /
933	Py_ssize_t start, / Start index /
934	Py_ssize_t end, / Stop index /
935	int direction / Tail end: -1 prefix, +1 suffix /
936	);
937
938	/ Return the first position of substr in str[start:end] using the*
939	given search direction or -1 if not found. -2 is returned in case
940	an error occurred and an exception is set. /*
941
942	PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
943	PyObject str, /* String /
944	PyObject substr, /* Substring to find /
945	Py_ssize_t start, / Start index /
946	Py_ssize_t end, / Stop index /
947	int direction / Find direction: +1 forward, -1 backward /
948	);
949
950	#if !defined(Py_LIMITED_API) \|\| Py_LIMITED_API+0 >= 0x03030000
951	/ Like PyUnicode_Find, but search for single character only. /
952	PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
953	PyObject *str,
954	Py_UCS4 ch,
955	Py_ssize_t start,
956	Py_ssize_t end,
957	int direction
958	);
959	#endif
960
961	/ Count the number of occurrences of substr in str[start:end]. /
962
963	PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
964	PyObject str, /* String /
965	PyObject substr, /* Substring to count /
966	Py_ssize_t start, / Start index /
967	Py_ssize_t end / Stop index /
968	);
969
970	/ Replace at most maxcount occurrences of substr in str with replstr*
971	and return the resulting Unicode object. /*
972
973	PyAPI_FUNC(PyObject *) PyUnicode_Replace(
974	PyObject str, /* String /
975	PyObject substr, /* Substring to find /
976	PyObject replstr, /* Substring to replace /
977	Py_ssize_t maxcount / Max. number of replacements to apply;*
978	-1 = all /*
979	);
980
981	/ Compare two strings and return -1, 0, 1 for less than, equal,*
982	greater than resp.
983	Raise an exception and return -1 on error. /*
984
985	PyAPI_FUNC(int) PyUnicode_Compare(
986	PyObject left, /* Left string /
987	PyObject right /* Right string /
988	);
989
990	/ Compare a Unicode object with C string and return -1, 0, 1 for less than,*
991	equal, and greater than, respectively. It is best to pass only
992	ASCII-encoded strings, but the function interprets the input string as
993	ISO-8859-1 if it contains non-ASCII characters.
994	This function does not raise exceptions. /*
995
996	PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
997	PyObject *left,
998	const char right /* ASCII-encoded string /
999	);
1000
1001	/ Rich compare two strings and return one of the following:*
1002
1003	- NULL in case an exception was raised
1004	- Py_True or Py_False for successful comparisons
1005	- Py_NotImplemented in case the type combination is unknown
1006
1007	Possible values for op:
1008
1009	Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1010
1011	*/
1012
1013	PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1014	PyObject left, /* Left string /
1015	PyObject right, /* Right string /
1016	int op / Operation: Py_EQ, Py_NE, Py_GT, etc. /
1017	);
1018
1019	/ Apply an argument tuple or dictionary to a format string and return*
1020	the resulting Unicode string. /*
1021
1022	PyAPI_FUNC(PyObject *) PyUnicode_Format(
1023	PyObject format, /* Format string /
1024	PyObject args /* Argument tuple or dictionary /
1025	);
1026
1027	/ Checks whether element is contained in container and return 1/0*
1028	accordingly.
1029
1030	element has to coerce to a one element Unicode string. -1 is
1031	returned in case of an error. /*
1032
1033	PyAPI_FUNC(int) PyUnicode_Contains(
1034	PyObject container, /* Container string /
1035	PyObject element /* Element string /
1036	);
1037
1038	/ Checks whether argument is a valid identifier. /
1039
1040	PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1041
1042	/ === Characters Type APIs =============================================== /
1043
1044	#ifndef Py_LIMITED_API
1045	# define Py_CPYTHON_UNICODEOBJECT_H
1046	# include "cpython/unicodeobject.h"
1047	# undef Py_CPYTHON_UNICODEOBJECT_H
1048	#endif
1049
1050	#ifdef __cplusplus
1051	}
1052	#endif
1053	#endif /* !Py_UNICODEOBJECT_H */
1054

Browse the source code of python/Include/unicodeobject.h