unicodectype.c source code [python/Objects/unicodectype.c]

1	/*
2	Unicode character type helpers.
3
4	Written by Marc-Andre Lemburg ([email protected]).
5	Modified for Python 2.0 by Fredrik Lundh ([email protected])
6
7	Copyright (c) Corporation for National Research Initiatives.
8
9	*/
10
11	#include "Python.h"
12
/* Bit flags tested against the `flags` member of _PyUnicode_TypeRecord.
   Each mask occupies exactly one bit; bit 14 is the highest one in use. */
#define ALPHA_MASK          (1 << 0)
#define DECIMAL_MASK        (1 << 1)
#define DIGIT_MASK          (1 << 2)
#define LOWER_MASK          (1 << 3)
#define LINEBREAK_MASK      (1 << 4)
#define SPACE_MASK          (1 << 5)
#define TITLE_MASK          (1 << 6)
#define UPPER_MASK          (1 << 7)
#define XID_START_MASK      (1 << 8)
#define XID_CONTINUE_MASK   (1 << 9)
#define PRINTABLE_MASK      (1 << 10)
#define NUMERIC_MASK        (1 << 11)
#define CASE_IGNORABLE_MASK (1 << 12)
#define CASED_MASK          (1 << 13)
/* When set, upper/lower/title hold packed offsets into
   _PyUnicode_ExtendedCase instead of simple deltas. */
#define EXTENDED_CASE_MASK  (1 << 14)
28
/* One record of the character type database.  Records are looked up by
   code point through gettyperecord() and are read-only. */
typedef struct {
    /*
       These are either deltas to the character or offsets in
       _PyUnicode_ExtendedCase.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    const unsigned char decimal;    /* value returned when DECIMAL_MASK is set */
    const unsigned char digit;      /* value returned when DIGIT_MASK is set */
    const unsigned short flags;     /* bitwise OR of the *_MASK constants */
} _PyUnicode_TypeRecord;
42
43	#include "unicodetype_db.h"
44
45	static const _PyUnicode_TypeRecord *
46	gettyperecord(Py_UCS4 code)
47	{
48	int index;
49
50	if (code >= `0x110000`)
51	index = `0`;
52	else
53	{
54	index = index1[(code>>SHIFT)];
55	index = index2[(index<<SHIFT)+(code&((`1`<<SHIFT)-`1`))];
56	}
57
58	return &_PyUnicode_TypeRecords[index];
59	}
60
61	/ Returns the titlecase Unicode characters corresponding to ch or just*
62	ch if no titlecase mapping is known. /*
63
64	Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
65	{
66	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
67
68	if (ctype->flags & EXTENDED_CASE_MASK)
69	return _PyUnicode_ExtendedCase[ctype->title & `0xFFFF`];
70	return ch + ctype->title;
71	}
72
73	/ Returns 1 for Unicode characters having the category 'Lt', 0*
74	otherwise. /*
75
76	int _PyUnicode_IsTitlecase(Py_UCS4 ch)
77	{
78	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
79
80	return (ctype->flags & TITLE_MASK) != `0`;
81	}
82
83	/ Returns 1 for Unicode characters having the XID_Start property, 0*
84	otherwise. /*
85
86	int _PyUnicode_IsXidStart(Py_UCS4 ch)
87	{
88	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
89
90	return (ctype->flags & XID_START_MASK) != `0`;
91	}
92
93	/ Returns 1 for Unicode characters having the XID_Continue property,*
94	0 otherwise. /*
95
96	int _PyUnicode_IsXidContinue(Py_UCS4 ch)
97	{
98	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
99
100	return (ctype->flags & XID_CONTINUE_MASK) != `0`;
101	}
102
103	/ Returns the integer decimal (0-9) for Unicode characters having*
104	this property, -1 otherwise. /*
105
106	int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
107	{
108	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
109
110	return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -`1`;
111	}
112
113	int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
114	{
115	if (_PyUnicode_ToDecimalDigit(ch) < `0`)
116	return `0`;
117	return `1`;
118	}
119
120	/ Returns the integer digit (0-9) for Unicode characters having*
121	this property, -1 otherwise. /*
122
123	int _PyUnicode_ToDigit(Py_UCS4 ch)
124	{
125	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
126
127	return (ctype->flags & DIGIT_MASK) ? ctype->digit : -`1`;
128	}
129
130	int _PyUnicode_IsDigit(Py_UCS4 ch)
131	{
132	if (_PyUnicode_ToDigit(ch) < `0`)
133	return `0`;
134	return `1`;
135	}
136
137	/ Returns the numeric value as double for Unicode characters having*
138	this property, -1.0 otherwise. /*
139
140	int _PyUnicode_IsNumeric(Py_UCS4 ch)
141	{
142	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
143
144	return (ctype->flags & NUMERIC_MASK) != `0`;
145	}
146
147	/ Returns 1 for Unicode characters to be hex-escaped when repr()ed,*
148	0 otherwise.
149	All characters except those characters defined in the Unicode character
150	database as following categories are considered printable.
151	* Cc (Other, Control)
152	* Cf (Other, Format)
153	* Cs (Other, Surrogate)
154	* Co (Other, Private Use)
155	* Cn (Other, Not Assigned)
156	* Zl Separator, Line ('\u2028', LINE SEPARATOR)
157	* Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
158	* Zs (Separator, Space) other than ASCII space('\x20').
159	*/
160	int _PyUnicode_IsPrintable(Py_UCS4 ch)
161	{
162	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
163
164	return (ctype->flags & PRINTABLE_MASK) != `0`;
165	}
166
167	/ Returns 1 for Unicode characters having the category 'Ll', 0*
168	otherwise. /*
169
170	int _PyUnicode_IsLowercase(Py_UCS4 ch)
171	{
172	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
173
174	return (ctype->flags & LOWER_MASK) != `0`;
175	}
176
177	/ Returns 1 for Unicode characters having the category 'Lu', 0*
178	otherwise. /*
179
180	int _PyUnicode_IsUppercase(Py_UCS4 ch)
181	{
182	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
183
184	return (ctype->flags & UPPER_MASK) != `0`;
185	}
186
187	/ Returns the uppercase Unicode characters corresponding to ch or just*
188	ch if no uppercase mapping is known. /*
189
190	Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
191	{
192	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
193
194	if (ctype->flags & EXTENDED_CASE_MASK)
195	return _PyUnicode_ExtendedCase[ctype->upper & `0xFFFF`];
196	return ch + ctype->upper;
197	}
198
199	/ Returns the lowercase Unicode characters corresponding to ch or just*
200	ch if no lowercase mapping is known. /*
201
202	Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
203	{
204	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
205
206	if (ctype->flags & EXTENDED_CASE_MASK)
207	return _PyUnicode_ExtendedCase[ctype->lower & `0xFFFF`];
208	return ch + ctype->lower;
209	}
210
211	int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
212	{
213	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
214
215	if (ctype->flags & EXTENDED_CASE_MASK) {
216	int index = ctype->lower & `0xFFFF`;
217	int n = ctype->lower >> `24`;
218	int i;
219	for (i = `0`; i < n; i++)
220	res[i] = _PyUnicode_ExtendedCase[index + i];
221	return n;
222	}
223	res[`0`] = ch + ctype->lower;
224	return `1`;
225	}
226
227	int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
228	{
229	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
230
231	if (ctype->flags & EXTENDED_CASE_MASK) {
232	int index = ctype->title & `0xFFFF`;
233	int n = ctype->title >> `24`;
234	int i;
235	for (i = `0`; i < n; i++)
236	res[i] = _PyUnicode_ExtendedCase[index + i];
237	return n;
238	}
239	res[`0`] = ch + ctype->title;
240	return `1`;
241	}
242
243	int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
244	{
245	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
246
247	if (ctype->flags & EXTENDED_CASE_MASK) {
248	int index = ctype->upper & `0xFFFF`;
249	int n = ctype->upper >> `24`;
250	int i;
251	for (i = `0`; i < n; i++)
252	res[i] = _PyUnicode_ExtendedCase[index + i];
253	return n;
254	}
255	res[`0`] = ch + ctype->upper;
256	return `1`;
257	}
258
259	int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
260	{
261	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
262
263	if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> `20`) & `7`) {
264	int index = (ctype->lower & `0xFFFF`) + (ctype->lower >> `24`);
265	int n = (ctype->lower >> `20`) & `7`;
266	int i;
267	for (i = `0`; i < n; i++)
268	res[i] = _PyUnicode_ExtendedCase[index + i];
269	return n;
270	}
271	return _PyUnicode_ToLowerFull(ch, res);
272	}
273
274	int _PyUnicode_IsCased(Py_UCS4 ch)
275	{
276	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
277
278	return (ctype->flags & CASED_MASK) != `0`;
279	}
280
281	int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
282	{
283	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
284
285	return (ctype->flags & CASE_IGNORABLE_MASK) != `0`;
286	}
287
288	/ Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',*
289	'Lo' or 'Lm', 0 otherwise. /*
290
291	int _PyUnicode_IsAlpha(Py_UCS4 ch)
292	{
293	const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
294
295	return (ctype->flags & ALPHA_MASK) != `0`;
296	}
297
298

Browse the source code of python/Objects/unicodectype.c