1 | /* stringlib: codec implementations */ |
2 | |
3 | #if !STRINGLIB_IS_UNICODE |
4 | # error "codecs.h is specific to Unicode" |
5 | #endif |
6 | |
7 | #include "pycore_bitutils.h" // _Py_bswap32() |
8 | |
9 | /* Mask to quickly check whether a C 'size_t' contains a |
10 | non-ASCII, UTF8-encoded char. */ |
11 | #if (SIZEOF_SIZE_T == 8) |
12 | # define ASCII_CHAR_MASK 0x8080808080808080ULL |
13 | #elif (SIZEOF_SIZE_T == 4) |
14 | # define ASCII_CHAR_MASK 0x80808080U |
15 | #else |
16 | # error C 'size_t' size should be either 4 or 8! |
17 | #endif |
18 | |
19 | /* 10xxxxxx */ |
20 | #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) |
21 | |
22 | Py_LOCAL_INLINE(Py_UCS4) |
23 | STRINGLIB(utf8_decode)(const char **inptr, const char *end, |
24 | STRINGLIB_CHAR *dest, |
25 | Py_ssize_t *outpos) |
26 | { |
27 | Py_UCS4 ch; |
28 | const char *s = *inptr; |
29 | STRINGLIB_CHAR *p = dest + *outpos; |
30 | |
31 | while (s < end) { |
32 | ch = (unsigned char)*s; |
33 | |
34 | if (ch < 0x80) { |
35 | /* Fast path for runs of ASCII characters. Given that common UTF-8 |
36 | input will consist of an overwhelming majority of ASCII |
37 | characters, we try to optimize for this case by checking |
38 | as many characters as a C 'size_t' can contain. |
39 | First, check if we can do an aligned read, as most CPUs have |
40 | a penalty for unaligned reads. |
41 | */ |
42 | if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) { |
43 | /* Help register allocation */ |
44 | const char *_s = s; |
45 | STRINGLIB_CHAR *_p = p; |
46 | while (_s + SIZEOF_SIZE_T <= end) { |
47 | /* Read a whole size_t at a time (either 4 or 8 bytes), |
48 | and do a fast unrolled copy if it only contains ASCII |
49 | characters. */ |
50 | size_t value = *(const size_t *) _s; |
51 | if (value & ASCII_CHAR_MASK) |
52 | break; |
53 | #if PY_LITTLE_ENDIAN |
54 | _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); |
55 | _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
56 | _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
57 | _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
58 | # if SIZEOF_SIZE_T == 8 |
59 | _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
60 | _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
61 | _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
62 | _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
63 | # endif |
64 | #else |
65 | # if SIZEOF_SIZE_T == 8 |
66 | _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); |
67 | _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); |
68 | _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); |
69 | _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); |
70 | _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
71 | _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
72 | _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
73 | _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); |
74 | # else |
75 | _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); |
76 | _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); |
77 | _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); |
78 | _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); |
79 | # endif |
80 | #endif |
81 | _s += SIZEOF_SIZE_T; |
82 | _p += SIZEOF_SIZE_T; |
83 | } |
84 | s = _s; |
85 | p = _p; |
86 | if (s == end) |
87 | break; |
88 | ch = (unsigned char)*s; |
89 | } |
90 | if (ch < 0x80) { |
91 | s++; |
92 | *p++ = ch; |
93 | continue; |
94 | } |
95 | } |
96 | |
97 | if (ch < 0xE0) { |
98 | /* \xC2\x80-\xDF\xBF -- 0080-07FF */ |
99 | Py_UCS4 ch2; |
100 | if (ch < 0xC2) { |
101 | /* invalid sequence |
102 | \x80-\xBF -- continuation byte |
103 | \xC0-\xC1 -- fake 0000-007F */ |
104 | goto InvalidStart; |
105 | } |
106 | if (end - s < 2) { |
107 | /* unexpected end of data: the caller will decide whether |
108 | it's an error or not */ |
109 | break; |
110 | } |
111 | ch2 = (unsigned char)s[1]; |
112 | if (!IS_CONTINUATION_BYTE(ch2)) |
113 | /* invalid continuation byte */ |
114 | goto InvalidContinuation1; |
115 | ch = (ch << 6) + ch2 - |
116 | ((0xC0 << 6) + 0x80); |
117 | assert ((ch > 0x007F) && (ch <= 0x07FF)); |
118 | s += 2; |
119 | if (STRINGLIB_MAX_CHAR <= 0x007F || |
120 | (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) |
121 | /* Out-of-range */ |
122 | goto Return; |
123 | *p++ = ch; |
124 | continue; |
125 | } |
126 | |
127 | if (ch < 0xF0) { |
128 | /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ |
129 | Py_UCS4 ch2, ch3; |
130 | if (end - s < 3) { |
131 | /* unexpected end of data: the caller will decide whether |
132 | it's an error or not */ |
133 | if (end - s < 2) |
134 | break; |
135 | ch2 = (unsigned char)s[1]; |
136 | if (!IS_CONTINUATION_BYTE(ch2) || |
137 | (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) |
138 | /* for clarification see comments below */ |
139 | goto InvalidContinuation1; |
140 | break; |
141 | } |
142 | ch2 = (unsigned char)s[1]; |
143 | ch3 = (unsigned char)s[2]; |
144 | if (!IS_CONTINUATION_BYTE(ch2)) { |
145 | /* invalid continuation byte */ |
146 | goto InvalidContinuation1; |
147 | } |
148 | if (ch == 0xE0) { |
149 | if (ch2 < 0xA0) |
150 | /* invalid sequence |
151 | \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ |
152 | goto InvalidContinuation1; |
153 | } else if (ch == 0xED && ch2 >= 0xA0) { |
154 | /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF |
155 | will result in surrogates in range D800-DFFF. Surrogates are |
156 | not valid UTF-8 so they are rejected. |
157 | See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf |
158 | (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ |
159 | goto InvalidContinuation1; |
160 | } |
161 | if (!IS_CONTINUATION_BYTE(ch3)) { |
162 | /* invalid continuation byte */ |
163 | goto InvalidContinuation2; |
164 | } |
165 | ch = (ch << 12) + (ch2 << 6) + ch3 - |
166 | ((0xE0 << 12) + (0x80 << 6) + 0x80); |
167 | assert ((ch > 0x07FF) && (ch <= 0xFFFF)); |
168 | s += 3; |
169 | if (STRINGLIB_MAX_CHAR <= 0x07FF || |
170 | (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) |
171 | /* Out-of-range */ |
172 | goto Return; |
173 | *p++ = ch; |
174 | continue; |
175 | } |
176 | |
177 | if (ch < 0xF5) { |
178 | /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ |
179 | Py_UCS4 ch2, ch3, ch4; |
180 | if (end - s < 4) { |
181 | /* unexpected end of data: the caller will decide whether |
182 | it's an error or not */ |
183 | if (end - s < 2) |
184 | break; |
185 | ch2 = (unsigned char)s[1]; |
186 | if (!IS_CONTINUATION_BYTE(ch2) || |
187 | (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) |
188 | /* for clarification see comments below */ |
189 | goto InvalidContinuation1; |
190 | if (end - s < 3) |
191 | break; |
192 | ch3 = (unsigned char)s[2]; |
193 | if (!IS_CONTINUATION_BYTE(ch3)) |
194 | goto InvalidContinuation2; |
195 | break; |
196 | } |
197 | ch2 = (unsigned char)s[1]; |
198 | ch3 = (unsigned char)s[2]; |
199 | ch4 = (unsigned char)s[3]; |
200 | if (!IS_CONTINUATION_BYTE(ch2)) { |
201 | /* invalid continuation byte */ |
202 | goto InvalidContinuation1; |
203 | } |
204 | if (ch == 0xF0) { |
205 | if (ch2 < 0x90) |
206 | /* invalid sequence |
207 | \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ |
208 | goto InvalidContinuation1; |
209 | } else if (ch == 0xF4 && ch2 >= 0x90) { |
210 | /* invalid sequence |
211 | \xF4\x90\x80\x80- -- 110000- overflow */ |
212 | goto InvalidContinuation1; |
213 | } |
214 | if (!IS_CONTINUATION_BYTE(ch3)) { |
215 | /* invalid continuation byte */ |
216 | goto InvalidContinuation2; |
217 | } |
218 | if (!IS_CONTINUATION_BYTE(ch4)) { |
219 | /* invalid continuation byte */ |
220 | goto InvalidContinuation3; |
221 | } |
222 | ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - |
223 | ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); |
224 | assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); |
225 | s += 4; |
226 | if (STRINGLIB_MAX_CHAR <= 0xFFFF || |
227 | (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) |
228 | /* Out-of-range */ |
229 | goto Return; |
230 | *p++ = ch; |
231 | continue; |
232 | } |
233 | goto InvalidStart; |
234 | } |
235 | ch = 0; |
236 | Return: |
237 | *inptr = s; |
238 | *outpos = p - dest; |
239 | return ch; |
240 | InvalidStart: |
241 | ch = 1; |
242 | goto Return; |
243 | InvalidContinuation1: |
244 | ch = 2; |
245 | goto Return; |
246 | InvalidContinuation2: |
247 | ch = 3; |
248 | goto Return; |
249 | InvalidContinuation3: |
250 | ch = 4; |
251 | goto Return; |
252 | } |
253 | |
254 | #undef ASCII_CHAR_MASK |
255 | |
256 | |
257 | /* UTF-8 encoder specialized for a Unicode kind to avoid the slow |
258 | PyUnicode_READ() macro. Delete some parts of the code depending on the kind: |
259 | UCS-1 strings don't need to handle surrogates for example. */ |
260 | Py_LOCAL_INLINE(char *) |
261 | STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, |
262 | PyObject *unicode, |
263 | const STRINGLIB_CHAR *data, |
264 | Py_ssize_t size, |
265 | _Py_error_handler error_handler, |
266 | const char *errors) |
267 | { |
268 | Py_ssize_t i; /* index into data of next input character */ |
269 | char *p; /* next free byte in output buffer */ |
270 | #if STRINGLIB_SIZEOF_CHAR > 1 |
271 | PyObject *error_handler_obj = NULL; |
272 | PyObject *exc = NULL; |
273 | PyObject *rep = NULL; |
274 | #endif |
275 | #if STRINGLIB_SIZEOF_CHAR == 1 |
276 | const Py_ssize_t max_char_size = 2; |
277 | #elif STRINGLIB_SIZEOF_CHAR == 2 |
278 | const Py_ssize_t max_char_size = 3; |
279 | #else /* STRINGLIB_SIZEOF_CHAR == 4 */ |
280 | const Py_ssize_t max_char_size = 4; |
281 | #endif |
282 | |
283 | assert(size >= 0); |
284 | if (size > PY_SSIZE_T_MAX / max_char_size) { |
285 | /* integer overflow */ |
286 | PyErr_NoMemory(); |
287 | return NULL; |
288 | } |
289 | |
290 | _PyBytesWriter_Init(writer); |
291 | p = _PyBytesWriter_Alloc(writer, size * max_char_size); |
292 | if (p == NULL) |
293 | return NULL; |
294 | |
295 | for (i = 0; i < size;) { |
296 | Py_UCS4 ch = data[i++]; |
297 | |
298 | if (ch < 0x80) { |
299 | /* Encode ASCII */ |
300 | *p++ = (char) ch; |
301 | |
302 | } |
303 | else |
304 | #if STRINGLIB_SIZEOF_CHAR > 1 |
305 | if (ch < 0x0800) |
306 | #endif |
307 | { |
308 | /* Encode Latin-1 */ |
309 | *p++ = (char)(0xc0 | (ch >> 6)); |
310 | *p++ = (char)(0x80 | (ch & 0x3f)); |
311 | } |
312 | #if STRINGLIB_SIZEOF_CHAR > 1 |
313 | else if (Py_UNICODE_IS_SURROGATE(ch)) { |
314 | Py_ssize_t startpos, endpos, newpos; |
315 | Py_ssize_t k; |
316 | if (error_handler == _Py_ERROR_UNKNOWN) { |
317 | error_handler = _Py_GetErrorHandler(errors); |
318 | } |
319 | |
320 | startpos = i-1; |
321 | endpos = startpos+1; |
322 | |
323 | while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) |
324 | endpos++; |
325 | |
326 | /* Only overallocate the buffer if it's not the last write */ |
327 | writer->overallocate = (endpos < size); |
328 | |
329 | switch (error_handler) |
330 | { |
331 | case _Py_ERROR_REPLACE: |
332 | memset(p, '?', endpos - startpos); |
333 | p += (endpos - startpos); |
334 | /* fall through */ |
335 | case _Py_ERROR_IGNORE: |
336 | i += (endpos - startpos - 1); |
337 | break; |
338 | |
339 | case _Py_ERROR_SURROGATEPASS: |
340 | for (k=startpos; k<endpos; k++) { |
341 | ch = data[k]; |
342 | *p++ = (char)(0xe0 | (ch >> 12)); |
343 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
344 | *p++ = (char)(0x80 | (ch & 0x3f)); |
345 | } |
346 | i += (endpos - startpos - 1); |
347 | break; |
348 | |
349 | case _Py_ERROR_BACKSLASHREPLACE: |
350 | /* subtract preallocated bytes */ |
351 | writer->min_size -= max_char_size * (endpos - startpos); |
352 | p = backslashreplace(writer, p, |
353 | unicode, startpos, endpos); |
354 | if (p == NULL) |
355 | goto error; |
356 | i += (endpos - startpos - 1); |
357 | break; |
358 | |
359 | case _Py_ERROR_XMLCHARREFREPLACE: |
360 | /* subtract preallocated bytes */ |
361 | writer->min_size -= max_char_size * (endpos - startpos); |
362 | p = xmlcharrefreplace(writer, p, |
363 | unicode, startpos, endpos); |
364 | if (p == NULL) |
365 | goto error; |
366 | i += (endpos - startpos - 1); |
367 | break; |
368 | |
369 | case _Py_ERROR_SURROGATEESCAPE: |
370 | for (k=startpos; k<endpos; k++) { |
371 | ch = data[k]; |
372 | if (!(0xDC80 <= ch && ch <= 0xDCFF)) |
373 | break; |
374 | *p++ = (char)(ch & 0xff); |
375 | } |
376 | if (k >= endpos) { |
377 | i += (endpos - startpos - 1); |
378 | break; |
379 | } |
380 | startpos = k; |
381 | assert(startpos < endpos); |
382 | /* fall through */ |
383 | default: |
384 | rep = unicode_encode_call_errorhandler( |
385 | errors, &error_handler_obj, "utf-8" , "surrogates not allowed" , |
386 | unicode, &exc, startpos, endpos, &newpos); |
387 | if (!rep) |
388 | goto error; |
389 | |
390 | if (newpos < startpos) { |
391 | writer->overallocate = 1; |
392 | p = _PyBytesWriter_Prepare(writer, p, |
393 | max_char_size * (startpos - newpos)); |
394 | if (p == NULL) |
395 | goto error; |
396 | } |
397 | else { |
398 | /* subtract preallocated bytes */ |
399 | writer->min_size -= max_char_size * (newpos - startpos); |
400 | /* Only overallocate the buffer if it's not the last write */ |
401 | writer->overallocate = (newpos < size); |
402 | } |
403 | |
404 | if (PyBytes_Check(rep)) { |
405 | p = _PyBytesWriter_WriteBytes(writer, p, |
406 | PyBytes_AS_STRING(rep), |
407 | PyBytes_GET_SIZE(rep)); |
408 | } |
409 | else { |
410 | /* rep is unicode */ |
411 | if (PyUnicode_READY(rep) < 0) |
412 | goto error; |
413 | |
414 | if (!PyUnicode_IS_ASCII(rep)) { |
415 | raise_encode_exception(&exc, "utf-8" , unicode, |
416 | startpos, endpos, |
417 | "surrogates not allowed" ); |
418 | goto error; |
419 | } |
420 | |
421 | p = _PyBytesWriter_WriteBytes(writer, p, |
422 | PyUnicode_DATA(rep), |
423 | PyUnicode_GET_LENGTH(rep)); |
424 | } |
425 | |
426 | if (p == NULL) |
427 | goto error; |
428 | Py_CLEAR(rep); |
429 | |
430 | i = newpos; |
431 | } |
432 | |
433 | /* If overallocation was disabled, ensure that it was the last |
434 | write. Otherwise, we missed an optimization */ |
435 | assert(writer->overallocate || i == size); |
436 | } |
437 | else |
438 | #if STRINGLIB_SIZEOF_CHAR > 2 |
439 | if (ch < 0x10000) |
440 | #endif |
441 | { |
442 | *p++ = (char)(0xe0 | (ch >> 12)); |
443 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
444 | *p++ = (char)(0x80 | (ch & 0x3f)); |
445 | } |
446 | #if STRINGLIB_SIZEOF_CHAR > 2 |
447 | else /* ch >= 0x10000 */ |
448 | { |
449 | assert(ch <= MAX_UNICODE); |
450 | /* Encode UCS4 Unicode ordinals */ |
451 | *p++ = (char)(0xf0 | (ch >> 18)); |
452 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
453 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
454 | *p++ = (char)(0x80 | (ch & 0x3f)); |
455 | } |
456 | #endif /* STRINGLIB_SIZEOF_CHAR > 2 */ |
457 | #endif /* STRINGLIB_SIZEOF_CHAR > 1 */ |
458 | } |
459 | |
460 | #if STRINGLIB_SIZEOF_CHAR > 1 |
461 | Py_XDECREF(error_handler_obj); |
462 | Py_XDECREF(exc); |
463 | #endif |
464 | return p; |
465 | |
466 | #if STRINGLIB_SIZEOF_CHAR > 1 |
467 | error: |
468 | Py_XDECREF(rep); |
469 | Py_XDECREF(error_handler_obj); |
470 | Py_XDECREF(exc); |
471 | return NULL; |
472 | #endif |
473 | } |
474 | |
475 | /* The pattern for constructing UCS2-repeated masks. */ |
476 | #if SIZEOF_LONG == 8 |
477 | # define UCS2_REPEAT_MASK 0x0001000100010001ul |
478 | #elif SIZEOF_LONG == 4 |
479 | # define UCS2_REPEAT_MASK 0x00010001ul |
480 | #else |
481 | # error C 'long' size should be either 4 or 8! |
482 | #endif |
483 | |
484 | /* The mask for fast checking. */ |
485 | #if STRINGLIB_SIZEOF_CHAR == 1 |
486 | /* The mask for fast checking of whether a C 'long' contains a |
487 | non-ASCII or non-Latin1 UTF16-encoded characters. */ |
488 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) |
489 | #else |
490 | /* The mask for fast checking of whether a C 'long' may contain |
491 | UTF16-encoded surrogate characters. This is an efficient heuristic, |
492 | assuming that non-surrogate characters with a code point >= 0x8000 are |
493 | rare in most input. |
494 | */ |
495 | # define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) |
496 | #endif |
497 | /* The mask for fast byte-swapping. */ |
498 | #define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) |
499 | /* Swap bytes. */ |
500 | #define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ |
501 | (((value) & STRIPPED_MASK) << 8)) |
502 | |
503 | Py_LOCAL_INLINE(Py_UCS4) |
504 | STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, |
505 | STRINGLIB_CHAR *dest, Py_ssize_t *outpos, |
506 | int native_ordering) |
507 | { |
508 | Py_UCS4 ch; |
509 | const unsigned char *q = *inptr; |
510 | STRINGLIB_CHAR *p = dest + *outpos; |
511 | /* Offsets from q for retrieving byte pairs in the right order. */ |
512 | #if PY_LITTLE_ENDIAN |
513 | int ihi = !!native_ordering, ilo = !native_ordering; |
514 | #else |
515 | int ihi = !native_ordering, ilo = !!native_ordering; |
516 | #endif |
517 | --e; |
518 | |
519 | while (q < e) { |
520 | Py_UCS4 ch2; |
521 | /* First check for possible aligned read of a C 'long'. Unaligned |
522 | reads are more expensive, better to defer to another iteration. */ |
523 | if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) { |
524 | /* Fast path for runs of in-range non-surrogate chars. */ |
525 | const unsigned char *_q = q; |
526 | while (_q + SIZEOF_LONG <= e) { |
527 | unsigned long block = * (const unsigned long *) _q; |
528 | if (native_ordering) { |
529 | /* Can use buffer directly */ |
530 | if (block & FAST_CHAR_MASK) |
531 | break; |
532 | } |
533 | else { |
534 | /* Need to byte-swap */ |
535 | if (block & SWAB(FAST_CHAR_MASK)) |
536 | break; |
537 | #if STRINGLIB_SIZEOF_CHAR == 1 |
538 | block >>= 8; |
539 | #else |
540 | block = SWAB(block); |
541 | #endif |
542 | } |
543 | #if PY_LITTLE_ENDIAN |
544 | # if SIZEOF_LONG == 4 |
545 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
546 | p[1] = (STRINGLIB_CHAR)(block >> 16); |
547 | # elif SIZEOF_LONG == 8 |
548 | p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
549 | p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
550 | p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
551 | p[3] = (STRINGLIB_CHAR)(block >> 48); |
552 | # endif |
553 | #else |
554 | # if SIZEOF_LONG == 4 |
555 | p[0] = (STRINGLIB_CHAR)(block >> 16); |
556 | p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
557 | # elif SIZEOF_LONG == 8 |
558 | p[0] = (STRINGLIB_CHAR)(block >> 48); |
559 | p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); |
560 | p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); |
561 | p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); |
562 | # endif |
563 | #endif |
564 | _q += SIZEOF_LONG; |
565 | p += SIZEOF_LONG / 2; |
566 | } |
567 | q = _q; |
568 | if (q >= e) |
569 | break; |
570 | } |
571 | |
572 | ch = (q[ihi] << 8) | q[ilo]; |
573 | q += 2; |
574 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
575 | #if STRINGLIB_SIZEOF_CHAR < 2 |
576 | if (ch > STRINGLIB_MAX_CHAR) |
577 | /* Out-of-range */ |
578 | goto Return; |
579 | #endif |
580 | *p++ = (STRINGLIB_CHAR)ch; |
581 | continue; |
582 | } |
583 | |
584 | /* UTF-16 code pair: */ |
585 | if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) |
586 | goto IllegalEncoding; |
587 | if (q >= e) |
588 | goto UnexpectedEnd; |
589 | ch2 = (q[ihi] << 8) | q[ilo]; |
590 | q += 2; |
591 | if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) |
592 | goto IllegalSurrogate; |
593 | ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); |
594 | #if STRINGLIB_SIZEOF_CHAR < 4 |
595 | /* Out-of-range */ |
596 | goto Return; |
597 | #else |
598 | *p++ = (STRINGLIB_CHAR)ch; |
599 | #endif |
600 | } |
601 | ch = 0; |
602 | Return: |
603 | *inptr = q; |
604 | *outpos = p - dest; |
605 | return ch; |
606 | UnexpectedEnd: |
607 | ch = 1; |
608 | goto Return; |
609 | IllegalEncoding: |
610 | ch = 2; |
611 | goto Return; |
612 | IllegalSurrogate: |
613 | ch = 3; |
614 | goto Return; |
615 | } |
616 | #undef UCS2_REPEAT_MASK |
617 | #undef FAST_CHAR_MASK |
618 | #undef STRIPPED_MASK |
619 | #undef SWAB |
620 | |
621 | |
622 | #if STRINGLIB_MAX_CHAR >= 0x80 |
623 | Py_LOCAL_INLINE(Py_ssize_t) |
624 | STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, |
625 | Py_ssize_t len, |
626 | unsigned short **outptr, |
627 | int native_ordering) |
628 | { |
629 | unsigned short *out = *outptr; |
630 | const STRINGLIB_CHAR *end = in + len; |
631 | #if STRINGLIB_SIZEOF_CHAR == 1 |
632 | if (native_ordering) { |
633 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
634 | while (in < unrolled_end) { |
635 | out[0] = in[0]; |
636 | out[1] = in[1]; |
637 | out[2] = in[2]; |
638 | out[3] = in[3]; |
639 | in += 4; out += 4; |
640 | } |
641 | while (in < end) { |
642 | *out++ = *in++; |
643 | } |
644 | } else { |
645 | # define SWAB2(CH) ((CH) << 8) /* high byte is zero */ |
646 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
647 | while (in < unrolled_end) { |
648 | out[0] = SWAB2(in[0]); |
649 | out[1] = SWAB2(in[1]); |
650 | out[2] = SWAB2(in[2]); |
651 | out[3] = SWAB2(in[3]); |
652 | in += 4; out += 4; |
653 | } |
654 | while (in < end) { |
655 | Py_UCS4 ch = *in++; |
656 | *out++ = SWAB2((Py_UCS2)ch); |
657 | } |
658 | #undef SWAB2 |
659 | } |
660 | *outptr = out; |
661 | return len; |
662 | #else |
663 | if (native_ordering) { |
664 | #if STRINGLIB_MAX_CHAR < 0x10000 |
665 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
666 | while (in < unrolled_end) { |
667 | /* check if any character is a surrogate character */ |
668 | if (((in[0] ^ 0xd800) & |
669 | (in[1] ^ 0xd800) & |
670 | (in[2] ^ 0xd800) & |
671 | (in[3] ^ 0xd800) & 0xf800) == 0) |
672 | break; |
673 | out[0] = in[0]; |
674 | out[1] = in[1]; |
675 | out[2] = in[2]; |
676 | out[3] = in[3]; |
677 | in += 4; out += 4; |
678 | } |
679 | #endif |
680 | while (in < end) { |
681 | Py_UCS4 ch; |
682 | ch = *in++; |
683 | if (ch < 0xd800) |
684 | *out++ = ch; |
685 | else if (ch < 0xe000) |
686 | /* reject surrogate characters (U+D800-U+DFFF) */ |
687 | goto fail; |
688 | #if STRINGLIB_MAX_CHAR >= 0x10000 |
689 | else if (ch >= 0x10000) { |
690 | out[0] = Py_UNICODE_HIGH_SURROGATE(ch); |
691 | out[1] = Py_UNICODE_LOW_SURROGATE(ch); |
692 | out += 2; |
693 | } |
694 | #endif |
695 | else |
696 | *out++ = ch; |
697 | } |
698 | } else { |
699 | #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) |
700 | #if STRINGLIB_MAX_CHAR < 0x10000 |
701 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
702 | while (in < unrolled_end) { |
703 | /* check if any character is a surrogate character */ |
704 | if (((in[0] ^ 0xd800) & |
705 | (in[1] ^ 0xd800) & |
706 | (in[2] ^ 0xd800) & |
707 | (in[3] ^ 0xd800) & 0xf800) == 0) |
708 | break; |
709 | out[0] = SWAB2(in[0]); |
710 | out[1] = SWAB2(in[1]); |
711 | out[2] = SWAB2(in[2]); |
712 | out[3] = SWAB2(in[3]); |
713 | in += 4; out += 4; |
714 | } |
715 | #endif |
716 | while (in < end) { |
717 | Py_UCS4 ch = *in++; |
718 | if (ch < 0xd800) |
719 | *out++ = SWAB2((Py_UCS2)ch); |
720 | else if (ch < 0xe000) |
721 | /* reject surrogate characters (U+D800-U+DFFF) */ |
722 | goto fail; |
723 | #if STRINGLIB_MAX_CHAR >= 0x10000 |
724 | else if (ch >= 0x10000) { |
725 | Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); |
726 | Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); |
727 | out[0] = SWAB2(ch1); |
728 | out[1] = SWAB2(ch2); |
729 | out += 2; |
730 | } |
731 | #endif |
732 | else |
733 | *out++ = SWAB2((Py_UCS2)ch); |
734 | } |
735 | #undef SWAB2 |
736 | } |
737 | *outptr = out; |
738 | return len; |
739 | fail: |
740 | *outptr = out; |
741 | return len - (end - in + 1); |
742 | #endif |
743 | } |
744 | |
745 | static inline uint32_t |
746 | STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) |
747 | { |
748 | uint32_t word = ch; |
749 | #if STRINGLIB_SIZEOF_CHAR == 1 |
750 | /* high bytes are zero */ |
751 | return (word << 24); |
752 | #elif STRINGLIB_SIZEOF_CHAR == 2 |
753 | /* high bytes are zero */ |
754 | return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); |
755 | #else |
756 | return _Py_bswap32(word); |
757 | #endif |
758 | } |
759 | |
760 | Py_LOCAL_INLINE(Py_ssize_t) |
761 | STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, |
762 | Py_ssize_t len, |
763 | uint32_t **outptr, |
764 | int native_ordering) |
765 | { |
766 | uint32_t *out = *outptr; |
767 | const STRINGLIB_CHAR *end = in + len; |
768 | if (native_ordering) { |
769 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
770 | while (in < unrolled_end) { |
771 | #if STRINGLIB_SIZEOF_CHAR > 1 |
772 | /* check if any character is a surrogate character */ |
773 | if (((in[0] ^ 0xd800) & |
774 | (in[1] ^ 0xd800) & |
775 | (in[2] ^ 0xd800) & |
776 | (in[3] ^ 0xd800) & 0xf800) == 0) |
777 | break; |
778 | #endif |
779 | out[0] = in[0]; |
780 | out[1] = in[1]; |
781 | out[2] = in[2]; |
782 | out[3] = in[3]; |
783 | in += 4; out += 4; |
784 | } |
785 | while (in < end) { |
786 | Py_UCS4 ch; |
787 | ch = *in++; |
788 | #if STRINGLIB_SIZEOF_CHAR > 1 |
789 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
790 | /* reject surrogate characters (U+D800-U+DFFF) */ |
791 | goto fail; |
792 | } |
793 | #endif |
794 | *out++ = ch; |
795 | } |
796 | } else { |
797 | const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); |
798 | while (in < unrolled_end) { |
799 | #if STRINGLIB_SIZEOF_CHAR > 1 |
800 | /* check if any character is a surrogate character */ |
801 | if (((in[0] ^ 0xd800) & |
802 | (in[1] ^ 0xd800) & |
803 | (in[2] ^ 0xd800) & |
804 | (in[3] ^ 0xd800) & 0xf800) == 0) |
805 | break; |
806 | #endif |
807 | out[0] = STRINGLIB(SWAB4)(in[0]); |
808 | out[1] = STRINGLIB(SWAB4)(in[1]); |
809 | out[2] = STRINGLIB(SWAB4)(in[2]); |
810 | out[3] = STRINGLIB(SWAB4)(in[3]); |
811 | in += 4; out += 4; |
812 | } |
813 | while (in < end) { |
814 | Py_UCS4 ch = *in++; |
815 | #if STRINGLIB_SIZEOF_CHAR > 1 |
816 | if (Py_UNICODE_IS_SURROGATE(ch)) { |
817 | /* reject surrogate characters (U+D800-U+DFFF) */ |
818 | goto fail; |
819 | } |
820 | #endif |
821 | *out++ = STRINGLIB(SWAB4)(ch); |
822 | } |
823 | } |
824 | *outptr = out; |
825 | return len; |
826 | #if STRINGLIB_SIZEOF_CHAR > 1 |
827 | fail: |
828 | *outptr = out; |
829 | return len - (end - in + 1); |
830 | #endif |
831 | } |
832 | |
833 | #endif |
834 | |