1 | /* |
2 | Unicode character type helpers. |
3 | |
4 | Written by Marc-Andre Lemburg ([email protected]). |
5 | Modified for Python 2.0 by Fredrik Lundh ([email protected]) |
6 | |
7 | Copyright (c) Corporation for National Research Initiatives. |
8 | |
9 | */ |
10 | |
11 | #include "Python.h" |
12 | |
/* Bit flags stored in the `flags` member of _PyUnicode_TypeRecord.
   `flags` is an unsigned short, and the highest value below is 0x4000.
   The trailing notes name the function in this file that tests each flag. */
#define ALPHA_MASK 0x01 /* _PyUnicode_IsAlpha */
#define DECIMAL_MASK 0x02 /* _PyUnicode_ToDecimalDigit: `decimal` is valid */
#define DIGIT_MASK 0x04 /* _PyUnicode_ToDigit: `digit` is valid */
#define LOWER_MASK 0x08 /* _PyUnicode_IsLowercase */
#define LINEBREAK_MASK 0x10 /* not tested by any function in this section */
#define SPACE_MASK 0x20 /* not tested by any function in this section */
#define TITLE_MASK 0x40 /* _PyUnicode_IsTitlecase */
#define UPPER_MASK 0x80 /* _PyUnicode_IsUppercase */
#define XID_START_MASK 0x100 /* _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK 0x200 /* _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK 0x400 /* _PyUnicode_IsPrintable */
#define NUMERIC_MASK 0x800 /* _PyUnicode_IsNumeric */
#define CASE_IGNORABLE_MASK 0x1000 /* _PyUnicode_IsCaseIgnorable */
#define CASED_MASK 0x2000 /* _PyUnicode_IsCased */
#define EXTENDED_CASE_MASK 0x4000 /* upper/lower/title index _PyUnicode_ExtendedCase */
28 | |
/* One record of the character type database.  gettyperecord() returns a
   pointer to the record that applies to a given code point. */
typedef struct {
    /*
       These are either deltas to the character or offsets in
       _PyUnicode_ExtendedCase.

       Without EXTENDED_CASE_MASK in `flags`, the value is added to the
       code point to obtain the mapped character.

       With EXTENDED_CASE_MASK, the low 16 bits are an index into
       _PyUnicode_ExtendedCase and the bits from 24 upwards hold the number
       of code points in the full mapping.  For `lower` only, bits 20-22
       hold the length of the case-folded sequence (0 if there is none),
       which is read starting right after the lowercase sequence.
    */
    const int upper;
    const int lower;
    const int title;
    /* Note if more flag space is needed, decimal and digit could be unified. */
    /* Value reported by _PyUnicode_ToDecimalDigit when DECIMAL_MASK is set. */
    const unsigned char decimal;
    /* Value reported by _PyUnicode_ToDigit when DIGIT_MASK is set. */
    const unsigned char digit;
    /* Bitwise combination of the *_MASK flags. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
42 | |
43 | #include "unicodetype_db.h" |
44 | |
45 | static const _PyUnicode_TypeRecord * |
46 | gettyperecord(Py_UCS4 code) |
47 | { |
48 | int index; |
49 | |
50 | if (code >= 0x110000) |
51 | index = 0; |
52 | else |
53 | { |
54 | index = index1[(code>>SHIFT)]; |
55 | index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; |
56 | } |
57 | |
58 | return &_PyUnicode_TypeRecords[index]; |
59 | } |
60 | |
61 | /* Returns the titlecase Unicode characters corresponding to ch or just |
62 | ch if no titlecase mapping is known. */ |
63 | |
64 | Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch) |
65 | { |
66 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
67 | |
68 | if (ctype->flags & EXTENDED_CASE_MASK) |
69 | return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF]; |
70 | return ch + ctype->title; |
71 | } |
72 | |
73 | /* Returns 1 for Unicode characters having the category 'Lt', 0 |
74 | otherwise. */ |
75 | |
76 | int _PyUnicode_IsTitlecase(Py_UCS4 ch) |
77 | { |
78 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
79 | |
80 | return (ctype->flags & TITLE_MASK) != 0; |
81 | } |
82 | |
83 | /* Returns 1 for Unicode characters having the XID_Start property, 0 |
84 | otherwise. */ |
85 | |
86 | int _PyUnicode_IsXidStart(Py_UCS4 ch) |
87 | { |
88 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
89 | |
90 | return (ctype->flags & XID_START_MASK) != 0; |
91 | } |
92 | |
93 | /* Returns 1 for Unicode characters having the XID_Continue property, |
94 | 0 otherwise. */ |
95 | |
96 | int _PyUnicode_IsXidContinue(Py_UCS4 ch) |
97 | { |
98 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
99 | |
100 | return (ctype->flags & XID_CONTINUE_MASK) != 0; |
101 | } |
102 | |
103 | /* Returns the integer decimal (0-9) for Unicode characters having |
104 | this property, -1 otherwise. */ |
105 | |
106 | int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) |
107 | { |
108 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
109 | |
110 | return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; |
111 | } |
112 | |
113 | int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) |
114 | { |
115 | if (_PyUnicode_ToDecimalDigit(ch) < 0) |
116 | return 0; |
117 | return 1; |
118 | } |
119 | |
120 | /* Returns the integer digit (0-9) for Unicode characters having |
121 | this property, -1 otherwise. */ |
122 | |
123 | int _PyUnicode_ToDigit(Py_UCS4 ch) |
124 | { |
125 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
126 | |
127 | return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; |
128 | } |
129 | |
130 | int _PyUnicode_IsDigit(Py_UCS4 ch) |
131 | { |
132 | if (_PyUnicode_ToDigit(ch) < 0) |
133 | return 0; |
134 | return 1; |
135 | } |
136 | |
137 | /* Returns the numeric value as double for Unicode characters having |
138 | this property, -1.0 otherwise. */ |
139 | |
140 | int _PyUnicode_IsNumeric(Py_UCS4 ch) |
141 | { |
142 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
143 | |
144 | return (ctype->flags & NUMERIC_MASK) != 0; |
145 | } |
146 | |
147 | /* Returns 1 for Unicode characters to be hex-escaped when repr()ed, |
148 | 0 otherwise. |
149 | All characters except those characters defined in the Unicode character |
150 | database as following categories are considered printable. |
151 | * Cc (Other, Control) |
152 | * Cf (Other, Format) |
153 | * Cs (Other, Surrogate) |
154 | * Co (Other, Private Use) |
155 | * Cn (Other, Not Assigned) |
156 | * Zl Separator, Line ('\u2028', LINE SEPARATOR) |
157 | * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) |
158 | * Zs (Separator, Space) other than ASCII space('\x20'). |
159 | */ |
160 | int _PyUnicode_IsPrintable(Py_UCS4 ch) |
161 | { |
162 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
163 | |
164 | return (ctype->flags & PRINTABLE_MASK) != 0; |
165 | } |
166 | |
167 | /* Returns 1 for Unicode characters having the category 'Ll', 0 |
168 | otherwise. */ |
169 | |
170 | int _PyUnicode_IsLowercase(Py_UCS4 ch) |
171 | { |
172 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
173 | |
174 | return (ctype->flags & LOWER_MASK) != 0; |
175 | } |
176 | |
177 | /* Returns 1 for Unicode characters having the category 'Lu', 0 |
178 | otherwise. */ |
179 | |
180 | int _PyUnicode_IsUppercase(Py_UCS4 ch) |
181 | { |
182 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
183 | |
184 | return (ctype->flags & UPPER_MASK) != 0; |
185 | } |
186 | |
187 | /* Returns the uppercase Unicode characters corresponding to ch or just |
188 | ch if no uppercase mapping is known. */ |
189 | |
190 | Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) |
191 | { |
192 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
193 | |
194 | if (ctype->flags & EXTENDED_CASE_MASK) |
195 | return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF]; |
196 | return ch + ctype->upper; |
197 | } |
198 | |
199 | /* Returns the lowercase Unicode characters corresponding to ch or just |
200 | ch if no lowercase mapping is known. */ |
201 | |
202 | Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) |
203 | { |
204 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
205 | |
206 | if (ctype->flags & EXTENDED_CASE_MASK) |
207 | return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF]; |
208 | return ch + ctype->lower; |
209 | } |
210 | |
211 | int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) |
212 | { |
213 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
214 | |
215 | if (ctype->flags & EXTENDED_CASE_MASK) { |
216 | int index = ctype->lower & 0xFFFF; |
217 | int n = ctype->lower >> 24; |
218 | int i; |
219 | for (i = 0; i < n; i++) |
220 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
221 | return n; |
222 | } |
223 | res[0] = ch + ctype->lower; |
224 | return 1; |
225 | } |
226 | |
227 | int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) |
228 | { |
229 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
230 | |
231 | if (ctype->flags & EXTENDED_CASE_MASK) { |
232 | int index = ctype->title & 0xFFFF; |
233 | int n = ctype->title >> 24; |
234 | int i; |
235 | for (i = 0; i < n; i++) |
236 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
237 | return n; |
238 | } |
239 | res[0] = ch + ctype->title; |
240 | return 1; |
241 | } |
242 | |
243 | int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) |
244 | { |
245 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
246 | |
247 | if (ctype->flags & EXTENDED_CASE_MASK) { |
248 | int index = ctype->upper & 0xFFFF; |
249 | int n = ctype->upper >> 24; |
250 | int i; |
251 | for (i = 0; i < n; i++) |
252 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
253 | return n; |
254 | } |
255 | res[0] = ch + ctype->upper; |
256 | return 1; |
257 | } |
258 | |
259 | int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) |
260 | { |
261 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
262 | |
263 | if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { |
264 | int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); |
265 | int n = (ctype->lower >> 20) & 7; |
266 | int i; |
267 | for (i = 0; i < n; i++) |
268 | res[i] = _PyUnicode_ExtendedCase[index + i]; |
269 | return n; |
270 | } |
271 | return _PyUnicode_ToLowerFull(ch, res); |
272 | } |
273 | |
274 | int _PyUnicode_IsCased(Py_UCS4 ch) |
275 | { |
276 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
277 | |
278 | return (ctype->flags & CASED_MASK) != 0; |
279 | } |
280 | |
281 | int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) |
282 | { |
283 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
284 | |
285 | return (ctype->flags & CASE_IGNORABLE_MASK) != 0; |
286 | } |
287 | |
288 | /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |
289 | 'Lo' or 'Lm', 0 otherwise. */ |
290 | |
291 | int _PyUnicode_IsAlpha(Py_UCS4 ch) |
292 | { |
293 | const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); |
294 | |
295 | return (ctype->flags & ALPHA_MASK) != 0; |
296 | } |
297 | |
298 | |