1 | /* ------------------------------------------------------------------------ |
2 | |
3 | Python Codec Registry and support functions |
4 | |
5 | Written by Marc-Andre Lemburg ([email protected]). |
6 | |
7 | Copyright (c) Corporation for National Research Initiatives. |
8 | |
9 | ------------------------------------------------------------------------ */ |
10 | |
11 | #include "Python.h" |
12 | #include "pycore_interp.h" // PyInterpreterState.codec_search_path |
13 | #include "pycore_pystate.h" // _PyInterpreterState_GET() |
14 | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
15 | #include <ctype.h> |
16 | |
/* Lowercase hexadecimal digit characters, indexed by nibble value (0..15).
   The error handlers in this file use it as Py_hexdigits[x & 0xf]. */
const char *Py_hexdigits = "0123456789abcdef";
18 | |
19 | /* --- Codec Registry ----------------------------------------------------- */ |
20 | |
21 | /* Import the standard encodings package which will register the first |
22 | codec search function. |
23 | |
24 | This is done in a lazy way so that the Unicode implementation does |
25 | not downgrade startup time of scripts not needing it. |
26 | |
27 | ImportErrors are silently ignored by this function. Only one try is |
28 | made. |
29 | |
30 | */ |
31 | |
32 | static int _PyCodecRegistry_Init(void); /* Forward */ |
33 | |
34 | int PyCodec_Register(PyObject *search_function) |
35 | { |
36 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
37 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
38 | goto onError; |
39 | if (search_function == NULL) { |
40 | PyErr_BadArgument(); |
41 | goto onError; |
42 | } |
43 | if (!PyCallable_Check(search_function)) { |
44 | PyErr_SetString(PyExc_TypeError, "argument must be callable" ); |
45 | goto onError; |
46 | } |
47 | return PyList_Append(interp->codec_search_path, search_function); |
48 | |
49 | onError: |
50 | return -1; |
51 | } |
52 | |
53 | int |
54 | PyCodec_Unregister(PyObject *search_function) |
55 | { |
56 | PyInterpreterState *interp = PyInterpreterState_Get(); |
57 | PyObject *codec_search_path = interp->codec_search_path; |
58 | /* Do nothing if codec_search_path is not created yet or was cleared. */ |
59 | if (codec_search_path == NULL) { |
60 | return 0; |
61 | } |
62 | |
63 | assert(PyList_CheckExact(codec_search_path)); |
64 | Py_ssize_t n = PyList_GET_SIZE(codec_search_path); |
65 | for (Py_ssize_t i = 0; i < n; i++) { |
66 | PyObject *item = PyList_GET_ITEM(codec_search_path, i); |
67 | if (item == search_function) { |
68 | if (interp->codec_search_cache != NULL) { |
69 | assert(PyDict_CheckExact(interp->codec_search_cache)); |
70 | PyDict_Clear(interp->codec_search_cache); |
71 | } |
72 | return PyList_SetSlice(codec_search_path, i, i+1, NULL); |
73 | } |
74 | } |
75 | return 0; |
76 | } |
77 | |
78 | extern int _Py_normalize_encoding(const char *, char *, size_t); |
79 | |
80 | /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are |
81 | converted to lower case, spaces and hyphens are replaced with underscores. */ |
82 | |
83 | static |
84 | PyObject *normalizestring(const char *string) |
85 | { |
86 | size_t len = strlen(string); |
87 | char *encoding; |
88 | PyObject *v; |
89 | |
90 | if (len > PY_SSIZE_T_MAX) { |
91 | PyErr_SetString(PyExc_OverflowError, "string is too large" ); |
92 | return NULL; |
93 | } |
94 | |
95 | encoding = PyMem_Malloc(len + 1); |
96 | if (encoding == NULL) |
97 | return PyErr_NoMemory(); |
98 | |
99 | if (!_Py_normalize_encoding(string, encoding, len + 1)) |
100 | { |
101 | PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed" ); |
102 | PyMem_Free(encoding); |
103 | return NULL; |
104 | } |
105 | |
106 | v = PyUnicode_FromString(encoding); |
107 | PyMem_Free(encoding); |
108 | return v; |
109 | } |
110 | |
111 | /* Lookup the given encoding and return a tuple providing the codec |
112 | facilities. |
113 | |
114 | The encoding string is looked up converted to all lower-case |
115 | characters. This makes encodings looked up through this mechanism |
116 | effectively case-insensitive. |
117 | |
118 | If no codec is found, a LookupError is set and NULL returned. |
119 | |
120 | As side effect, this tries to load the encodings package, if not |
121 | yet done. This is part of the lazy load strategy for the encodings |
122 | package. |
123 | |
124 | */ |
125 | |
126 | PyObject *_PyCodec_Lookup(const char *encoding) |
127 | { |
128 | if (encoding == NULL) { |
129 | PyErr_BadArgument(); |
130 | return NULL; |
131 | } |
132 | |
133 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
134 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) { |
135 | return NULL; |
136 | } |
137 | |
138 | /* Convert the encoding to a normalized Python string: all |
139 | characters are converted to lower case, spaces and hyphens are |
140 | replaced with underscores. */ |
141 | PyObject *v = normalizestring(encoding); |
142 | if (v == NULL) { |
143 | return NULL; |
144 | } |
145 | PyUnicode_InternInPlace(&v); |
146 | |
147 | /* First, try to lookup the name in the registry dictionary */ |
148 | PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v); |
149 | if (result != NULL) { |
150 | Py_INCREF(result); |
151 | Py_DECREF(v); |
152 | return result; |
153 | } |
154 | else if (PyErr_Occurred()) { |
155 | goto onError; |
156 | } |
157 | |
158 | /* Next, scan the search functions in order of registration */ |
159 | const Py_ssize_t len = PyList_Size(interp->codec_search_path); |
160 | if (len < 0) |
161 | goto onError; |
162 | if (len == 0) { |
163 | PyErr_SetString(PyExc_LookupError, |
164 | "no codec search functions registered: " |
165 | "can't find encoding" ); |
166 | goto onError; |
167 | } |
168 | |
169 | Py_ssize_t i; |
170 | for (i = 0; i < len; i++) { |
171 | PyObject *func; |
172 | |
173 | func = PyList_GetItem(interp->codec_search_path, i); |
174 | if (func == NULL) |
175 | goto onError; |
176 | result = PyObject_CallOneArg(func, v); |
177 | if (result == NULL) |
178 | goto onError; |
179 | if (result == Py_None) { |
180 | Py_DECREF(result); |
181 | continue; |
182 | } |
183 | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { |
184 | PyErr_SetString(PyExc_TypeError, |
185 | "codec search functions must return 4-tuples" ); |
186 | Py_DECREF(result); |
187 | goto onError; |
188 | } |
189 | break; |
190 | } |
191 | if (i == len) { |
192 | /* XXX Perhaps we should cache misses too ? */ |
193 | PyErr_Format(PyExc_LookupError, |
194 | "unknown encoding: %s" , encoding); |
195 | goto onError; |
196 | } |
197 | |
198 | /* Cache and return the result */ |
199 | if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { |
200 | Py_DECREF(result); |
201 | goto onError; |
202 | } |
203 | Py_DECREF(v); |
204 | return result; |
205 | |
206 | onError: |
207 | Py_DECREF(v); |
208 | return NULL; |
209 | } |
210 | |
211 | /* Codec registry encoding check API. */ |
212 | |
213 | int PyCodec_KnownEncoding(const char *encoding) |
214 | { |
215 | PyObject *codecs; |
216 | |
217 | codecs = _PyCodec_Lookup(encoding); |
218 | if (!codecs) { |
219 | PyErr_Clear(); |
220 | return 0; |
221 | } |
222 | else { |
223 | Py_DECREF(codecs); |
224 | return 1; |
225 | } |
226 | } |
227 | |
228 | static |
229 | PyObject *args_tuple(PyObject *object, |
230 | const char *errors) |
231 | { |
232 | PyObject *args; |
233 | |
234 | args = PyTuple_New(1 + (errors != NULL)); |
235 | if (args == NULL) |
236 | return NULL; |
237 | Py_INCREF(object); |
238 | PyTuple_SET_ITEM(args,0,object); |
239 | if (errors) { |
240 | PyObject *v; |
241 | |
242 | v = PyUnicode_FromString(errors); |
243 | if (v == NULL) { |
244 | Py_DECREF(args); |
245 | return NULL; |
246 | } |
247 | PyTuple_SET_ITEM(args, 1, v); |
248 | } |
249 | return args; |
250 | } |
251 | |
252 | /* Helper function to get a codec item */ |
253 | |
254 | static |
255 | PyObject *codec_getitem(const char *encoding, int index) |
256 | { |
257 | PyObject *codecs; |
258 | PyObject *v; |
259 | |
260 | codecs = _PyCodec_Lookup(encoding); |
261 | if (codecs == NULL) |
262 | return NULL; |
263 | v = PyTuple_GET_ITEM(codecs, index); |
264 | Py_DECREF(codecs); |
265 | Py_INCREF(v); |
266 | return v; |
267 | } |
268 | |
269 | /* Helper functions to create an incremental codec. */ |
270 | static |
271 | PyObject *codec_makeincrementalcodec(PyObject *codec_info, |
272 | const char *errors, |
273 | const char *attrname) |
274 | { |
275 | PyObject *ret, *inccodec; |
276 | |
277 | inccodec = PyObject_GetAttrString(codec_info, attrname); |
278 | if (inccodec == NULL) |
279 | return NULL; |
280 | if (errors) |
281 | ret = PyObject_CallFunction(inccodec, "s" , errors); |
282 | else |
283 | ret = _PyObject_CallNoArg(inccodec); |
284 | Py_DECREF(inccodec); |
285 | return ret; |
286 | } |
287 | |
288 | static |
289 | PyObject *codec_getincrementalcodec(const char *encoding, |
290 | const char *errors, |
291 | const char *attrname) |
292 | { |
293 | PyObject *codec_info, *ret; |
294 | |
295 | codec_info = _PyCodec_Lookup(encoding); |
296 | if (codec_info == NULL) |
297 | return NULL; |
298 | ret = codec_makeincrementalcodec(codec_info, errors, attrname); |
299 | Py_DECREF(codec_info); |
300 | return ret; |
301 | } |
302 | |
303 | /* Helper function to create a stream codec. */ |
304 | |
305 | static |
306 | PyObject *codec_getstreamcodec(const char *encoding, |
307 | PyObject *stream, |
308 | const char *errors, |
309 | const int index) |
310 | { |
311 | PyObject *codecs, *streamcodec, *codeccls; |
312 | |
313 | codecs = _PyCodec_Lookup(encoding); |
314 | if (codecs == NULL) |
315 | return NULL; |
316 | |
317 | codeccls = PyTuple_GET_ITEM(codecs, index); |
318 | if (errors != NULL) |
319 | streamcodec = PyObject_CallFunction(codeccls, "Os" , stream, errors); |
320 | else |
321 | streamcodec = PyObject_CallOneArg(codeccls, stream); |
322 | Py_DECREF(codecs); |
323 | return streamcodec; |
324 | } |
325 | |
326 | /* Helpers to work with the result of _PyCodec_Lookup |
327 | |
328 | */ |
329 | PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, |
330 | const char *errors) |
331 | { |
332 | return codec_makeincrementalcodec(codec_info, errors, |
333 | "incrementaldecoder" ); |
334 | } |
335 | |
336 | PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, |
337 | const char *errors) |
338 | { |
339 | return codec_makeincrementalcodec(codec_info, errors, |
340 | "incrementalencoder" ); |
341 | } |
342 | |
343 | |
344 | /* Convenience APIs to query the Codec registry. |
345 | |
346 | All APIs return a codec object with incremented refcount. |
347 | |
348 | */ |
349 | |
350 | PyObject *PyCodec_Encoder(const char *encoding) |
351 | { |
352 | return codec_getitem(encoding, 0); |
353 | } |
354 | |
355 | PyObject *PyCodec_Decoder(const char *encoding) |
356 | { |
357 | return codec_getitem(encoding, 1); |
358 | } |
359 | |
360 | PyObject *PyCodec_IncrementalEncoder(const char *encoding, |
361 | const char *errors) |
362 | { |
363 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder" ); |
364 | } |
365 | |
366 | PyObject *PyCodec_IncrementalDecoder(const char *encoding, |
367 | const char *errors) |
368 | { |
369 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder" ); |
370 | } |
371 | |
372 | PyObject *PyCodec_StreamReader(const char *encoding, |
373 | PyObject *stream, |
374 | const char *errors) |
375 | { |
376 | return codec_getstreamcodec(encoding, stream, errors, 2); |
377 | } |
378 | |
379 | PyObject *PyCodec_StreamWriter(const char *encoding, |
380 | PyObject *stream, |
381 | const char *errors) |
382 | { |
383 | return codec_getstreamcodec(encoding, stream, errors, 3); |
384 | } |
385 | |
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of the
 * exception raised.
 *
 * `operation` and `encoding` are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* TrySetFromCause replaces the active exception with a suitably
     * updated clone if it can; otherwise the original exception is left
     * alone.  Its return value is not needed here.
     */
    (void)_PyErr_TrySetFromCause("%s with '%s' codec failed",
                                 operation, encoding);
}
401 | |
402 | /* Encode an object (e.g. a Unicode object) using the given encoding |
403 | and return the resulting encoded object (usually a Python string). |
404 | |
405 | errors is passed to the encoder factory as argument if non-NULL. */ |
406 | |
407 | static PyObject * |
408 | _PyCodec_EncodeInternal(PyObject *object, |
409 | PyObject *encoder, |
410 | const char *encoding, |
411 | const char *errors) |
412 | { |
413 | PyObject *args = NULL, *result = NULL; |
414 | PyObject *v = NULL; |
415 | |
416 | args = args_tuple(object, errors); |
417 | if (args == NULL) |
418 | goto onError; |
419 | |
420 | result = PyObject_Call(encoder, args, NULL); |
421 | if (result == NULL) { |
422 | wrap_codec_error("encoding" , encoding); |
423 | goto onError; |
424 | } |
425 | |
426 | if (!PyTuple_Check(result) || |
427 | PyTuple_GET_SIZE(result) != 2) { |
428 | PyErr_SetString(PyExc_TypeError, |
429 | "encoder must return a tuple (object, integer)" ); |
430 | goto onError; |
431 | } |
432 | v = PyTuple_GET_ITEM(result,0); |
433 | Py_INCREF(v); |
434 | /* We don't check or use the second (integer) entry. */ |
435 | |
436 | Py_DECREF(args); |
437 | Py_DECREF(encoder); |
438 | Py_DECREF(result); |
439 | return v; |
440 | |
441 | onError: |
442 | Py_XDECREF(result); |
443 | Py_XDECREF(args); |
444 | Py_XDECREF(encoder); |
445 | return NULL; |
446 | } |
447 | |
448 | /* Decode an object (usually a Python string) using the given encoding |
449 | and return an equivalent object (e.g. a Unicode object). |
450 | |
451 | errors is passed to the decoder factory as argument if non-NULL. */ |
452 | |
453 | static PyObject * |
454 | _PyCodec_DecodeInternal(PyObject *object, |
455 | PyObject *decoder, |
456 | const char *encoding, |
457 | const char *errors) |
458 | { |
459 | PyObject *args = NULL, *result = NULL; |
460 | PyObject *v; |
461 | |
462 | args = args_tuple(object, errors); |
463 | if (args == NULL) |
464 | goto onError; |
465 | |
466 | result = PyObject_Call(decoder, args, NULL); |
467 | if (result == NULL) { |
468 | wrap_codec_error("decoding" , encoding); |
469 | goto onError; |
470 | } |
471 | if (!PyTuple_Check(result) || |
472 | PyTuple_GET_SIZE(result) != 2) { |
473 | PyErr_SetString(PyExc_TypeError, |
474 | "decoder must return a tuple (object,integer)" ); |
475 | goto onError; |
476 | } |
477 | v = PyTuple_GET_ITEM(result,0); |
478 | Py_INCREF(v); |
479 | /* We don't check or use the second (integer) entry. */ |
480 | |
481 | Py_DECREF(args); |
482 | Py_DECREF(decoder); |
483 | Py_DECREF(result); |
484 | return v; |
485 | |
486 | onError: |
487 | Py_XDECREF(args); |
488 | Py_XDECREF(decoder); |
489 | Py_XDECREF(result); |
490 | return NULL; |
491 | } |
492 | |
493 | /* Generic encoding/decoding API */ |
494 | PyObject *PyCodec_Encode(PyObject *object, |
495 | const char *encoding, |
496 | const char *errors) |
497 | { |
498 | PyObject *encoder; |
499 | |
500 | encoder = PyCodec_Encoder(encoding); |
501 | if (encoder == NULL) |
502 | return NULL; |
503 | |
504 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
505 | } |
506 | |
507 | PyObject *PyCodec_Decode(PyObject *object, |
508 | const char *encoding, |
509 | const char *errors) |
510 | { |
511 | PyObject *decoder; |
512 | |
513 | decoder = PyCodec_Decoder(encoding); |
514 | if (decoder == NULL) |
515 | return NULL; |
516 | |
517 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
518 | } |
519 | |
520 | /* Text encoding/decoding API */ |
521 | PyObject * _PyCodec_LookupTextEncoding(const char *encoding, |
522 | const char *alternate_command) |
523 | { |
524 | _Py_IDENTIFIER(_is_text_encoding); |
525 | PyObject *codec; |
526 | PyObject *attr; |
527 | int is_text_codec; |
528 | |
529 | codec = _PyCodec_Lookup(encoding); |
530 | if (codec == NULL) |
531 | return NULL; |
532 | |
533 | /* Backwards compatibility: assume any raw tuple describes a text |
534 | * encoding, and the same for anything lacking the private |
535 | * attribute. |
536 | */ |
537 | if (!PyTuple_CheckExact(codec)) { |
538 | if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) { |
539 | Py_DECREF(codec); |
540 | return NULL; |
541 | } |
542 | if (attr != NULL) { |
543 | is_text_codec = PyObject_IsTrue(attr); |
544 | Py_DECREF(attr); |
545 | if (is_text_codec <= 0) { |
546 | Py_DECREF(codec); |
547 | if (!is_text_codec) |
548 | PyErr_Format(PyExc_LookupError, |
549 | "'%.400s' is not a text encoding; " |
550 | "use %s to handle arbitrary codecs" , |
551 | encoding, alternate_command); |
552 | return NULL; |
553 | } |
554 | } |
555 | } |
556 | |
557 | /* This appears to be a valid text encoding */ |
558 | return codec; |
559 | } |
560 | |
561 | |
562 | static |
563 | PyObject *codec_getitem_checked(const char *encoding, |
564 | const char *alternate_command, |
565 | int index) |
566 | { |
567 | PyObject *codec; |
568 | PyObject *v; |
569 | |
570 | codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); |
571 | if (codec == NULL) |
572 | return NULL; |
573 | |
574 | v = PyTuple_GET_ITEM(codec, index); |
575 | Py_INCREF(v); |
576 | Py_DECREF(codec); |
577 | return v; |
578 | } |
579 | |
580 | static PyObject * _PyCodec_TextEncoder(const char *encoding) |
581 | { |
582 | return codec_getitem_checked(encoding, "codecs.encode()" , 0); |
583 | } |
584 | |
585 | static PyObject * _PyCodec_TextDecoder(const char *encoding) |
586 | { |
587 | return codec_getitem_checked(encoding, "codecs.decode()" , 1); |
588 | } |
589 | |
590 | PyObject *_PyCodec_EncodeText(PyObject *object, |
591 | const char *encoding, |
592 | const char *errors) |
593 | { |
594 | PyObject *encoder; |
595 | |
596 | encoder = _PyCodec_TextEncoder(encoding); |
597 | if (encoder == NULL) |
598 | return NULL; |
599 | |
600 | return _PyCodec_EncodeInternal(object, encoder, encoding, errors); |
601 | } |
602 | |
603 | PyObject *_PyCodec_DecodeText(PyObject *object, |
604 | const char *encoding, |
605 | const char *errors) |
606 | { |
607 | PyObject *decoder; |
608 | |
609 | decoder = _PyCodec_TextDecoder(encoding); |
610 | if (decoder == NULL) |
611 | return NULL; |
612 | |
613 | return _PyCodec_DecodeInternal(object, decoder, encoding, errors); |
614 | } |
615 | |
616 | /* Register the error handling callback function error under the name |
617 | name. This function will be called by the codec when it encounters |
618 | an unencodable characters/undecodable bytes and doesn't know the |
619 | callback name, when name is specified as the error parameter |
620 | in the call to the encode/decode function. |
621 | Return 0 on success, -1 on error */ |
622 | int PyCodec_RegisterError(const char *name, PyObject *error) |
623 | { |
624 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
625 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
626 | return -1; |
627 | if (!PyCallable_Check(error)) { |
628 | PyErr_SetString(PyExc_TypeError, "handler must be callable" ); |
629 | return -1; |
630 | } |
631 | return PyDict_SetItemString(interp->codec_error_registry, |
632 | name, error); |
633 | } |
634 | |
635 | /* Lookup the error handling callback function registered under the |
636 | name error. As a special case NULL can be passed, in which case |
637 | the error handling callback for strict encoding will be returned. */ |
638 | PyObject *PyCodec_LookupError(const char *name) |
639 | { |
640 | PyObject *handler = NULL; |
641 | |
642 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
643 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) |
644 | return NULL; |
645 | |
646 | if (name==NULL) |
647 | name = "strict" ; |
648 | handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name); |
649 | if (handler) { |
650 | Py_INCREF(handler); |
651 | } |
652 | else if (!PyErr_Occurred()) { |
653 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'" , name); |
654 | } |
655 | return handler; |
656 | } |
657 | |
658 | static void wrong_exception_type(PyObject *exc) |
659 | { |
660 | PyErr_Format(PyExc_TypeError, |
661 | "don't know how to handle %.200s in error callback" , |
662 | Py_TYPE(exc)->tp_name); |
663 | } |
664 | |
665 | PyObject *PyCodec_StrictErrors(PyObject *exc) |
666 | { |
667 | if (PyExceptionInstance_Check(exc)) |
668 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
669 | else |
670 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance" ); |
671 | return NULL; |
672 | } |
673 | |
674 | |
675 | PyObject *PyCodec_IgnoreErrors(PyObject *exc) |
676 | { |
677 | Py_ssize_t end; |
678 | |
679 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
680 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
681 | return NULL; |
682 | } |
683 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
684 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
685 | return NULL; |
686 | } |
687 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
688 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) |
689 | return NULL; |
690 | } |
691 | else { |
692 | wrong_exception_type(exc); |
693 | return NULL; |
694 | } |
695 | return Py_BuildValue("(Nn)" , PyUnicode_New(0, 0), end); |
696 | } |
697 | |
698 | |
699 | PyObject *PyCodec_ReplaceErrors(PyObject *exc) |
700 | { |
701 | Py_ssize_t start, end, i, len; |
702 | |
703 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
704 | PyObject *res; |
705 | Py_UCS1 *outp; |
706 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
707 | return NULL; |
708 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
709 | return NULL; |
710 | len = end - start; |
711 | res = PyUnicode_New(len, '?'); |
712 | if (res == NULL) |
713 | return NULL; |
714 | assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND); |
715 | outp = PyUnicode_1BYTE_DATA(res); |
716 | for (i = 0; i < len; ++i) |
717 | outp[i] = '?'; |
718 | assert(_PyUnicode_CheckConsistency(res, 1)); |
719 | return Py_BuildValue("(Nn)" , res, end); |
720 | } |
721 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
722 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
723 | return NULL; |
724 | return Py_BuildValue("(Cn)" , |
725 | (int)Py_UNICODE_REPLACEMENT_CHARACTER, |
726 | end); |
727 | } |
728 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
729 | PyObject *res; |
730 | Py_UCS2 *outp; |
731 | if (PyUnicodeTranslateError_GetStart(exc, &start)) |
732 | return NULL; |
733 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) |
734 | return NULL; |
735 | len = end - start; |
736 | res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); |
737 | if (res == NULL) |
738 | return NULL; |
739 | assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND); |
740 | outp = PyUnicode_2BYTE_DATA(res); |
741 | for (i = 0; i < len; i++) |
742 | outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER; |
743 | assert(_PyUnicode_CheckConsistency(res, 1)); |
744 | return Py_BuildValue("(Nn)" , res, end); |
745 | } |
746 | else { |
747 | wrong_exception_type(exc); |
748 | return NULL; |
749 | } |
750 | } |
751 | |
752 | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) |
753 | { |
754 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
755 | PyObject *restuple; |
756 | PyObject *object; |
757 | Py_ssize_t i; |
758 | Py_ssize_t start; |
759 | Py_ssize_t end; |
760 | PyObject *res; |
761 | Py_UCS1 *outp; |
762 | Py_ssize_t ressize; |
763 | Py_UCS4 ch; |
764 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
765 | return NULL; |
766 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
767 | return NULL; |
768 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
769 | return NULL; |
770 | if (end - start > PY_SSIZE_T_MAX / (2+7+1)) |
771 | end = start + PY_SSIZE_T_MAX / (2+7+1); |
772 | for (i = start, ressize = 0; i < end; ++i) { |
773 | /* object is guaranteed to be "ready" */ |
774 | ch = PyUnicode_READ_CHAR(object, i); |
775 | if (ch<10) |
776 | ressize += 2+1+1; |
777 | else if (ch<100) |
778 | ressize += 2+2+1; |
779 | else if (ch<1000) |
780 | ressize += 2+3+1; |
781 | else if (ch<10000) |
782 | ressize += 2+4+1; |
783 | else if (ch<100000) |
784 | ressize += 2+5+1; |
785 | else if (ch<1000000) |
786 | ressize += 2+6+1; |
787 | else |
788 | ressize += 2+7+1; |
789 | } |
790 | /* allocate replacement */ |
791 | res = PyUnicode_New(ressize, 127); |
792 | if (res == NULL) { |
793 | Py_DECREF(object); |
794 | return NULL; |
795 | } |
796 | outp = PyUnicode_1BYTE_DATA(res); |
797 | /* generate replacement */ |
798 | for (i = start; i < end; ++i) { |
799 | int digits; |
800 | int base; |
801 | ch = PyUnicode_READ_CHAR(object, i); |
802 | *outp++ = '&'; |
803 | *outp++ = '#'; |
804 | if (ch<10) { |
805 | digits = 1; |
806 | base = 1; |
807 | } |
808 | else if (ch<100) { |
809 | digits = 2; |
810 | base = 10; |
811 | } |
812 | else if (ch<1000) { |
813 | digits = 3; |
814 | base = 100; |
815 | } |
816 | else if (ch<10000) { |
817 | digits = 4; |
818 | base = 1000; |
819 | } |
820 | else if (ch<100000) { |
821 | digits = 5; |
822 | base = 10000; |
823 | } |
824 | else if (ch<1000000) { |
825 | digits = 6; |
826 | base = 100000; |
827 | } |
828 | else { |
829 | digits = 7; |
830 | base = 1000000; |
831 | } |
832 | while (digits-->0) { |
833 | *outp++ = '0' + ch/base; |
834 | ch %= base; |
835 | base /= 10; |
836 | } |
837 | *outp++ = ';'; |
838 | } |
839 | assert(_PyUnicode_CheckConsistency(res, 1)); |
840 | restuple = Py_BuildValue("(Nn)" , res, end); |
841 | Py_DECREF(object); |
842 | return restuple; |
843 | } |
844 | else { |
845 | wrong_exception_type(exc); |
846 | return NULL; |
847 | } |
848 | } |
849 | |
850 | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) |
851 | { |
852 | PyObject *object; |
853 | Py_ssize_t i; |
854 | Py_ssize_t start; |
855 | Py_ssize_t end; |
856 | PyObject *res; |
857 | Py_UCS1 *outp; |
858 | int ressize; |
859 | Py_UCS4 c; |
860 | |
861 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
862 | const unsigned char *p; |
863 | if (PyUnicodeDecodeError_GetStart(exc, &start)) |
864 | return NULL; |
865 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
866 | return NULL; |
867 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) |
868 | return NULL; |
869 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
870 | res = PyUnicode_New(4 * (end - start), 127); |
871 | if (res == NULL) { |
872 | Py_DECREF(object); |
873 | return NULL; |
874 | } |
875 | outp = PyUnicode_1BYTE_DATA(res); |
876 | for (i = start; i < end; i++, outp += 4) { |
877 | unsigned char c = p[i]; |
878 | outp[0] = '\\'; |
879 | outp[1] = 'x'; |
880 | outp[2] = Py_hexdigits[(c>>4)&0xf]; |
881 | outp[3] = Py_hexdigits[c&0xf]; |
882 | } |
883 | |
884 | assert(_PyUnicode_CheckConsistency(res, 1)); |
885 | Py_DECREF(object); |
886 | return Py_BuildValue("(Nn)" , res, end); |
887 | } |
888 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
889 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
890 | return NULL; |
891 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
892 | return NULL; |
893 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
894 | return NULL; |
895 | } |
896 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { |
897 | if (PyUnicodeTranslateError_GetStart(exc, &start)) |
898 | return NULL; |
899 | if (PyUnicodeTranslateError_GetEnd(exc, &end)) |
900 | return NULL; |
901 | if (!(object = PyUnicodeTranslateError_GetObject(exc))) |
902 | return NULL; |
903 | } |
904 | else { |
905 | wrong_exception_type(exc); |
906 | return NULL; |
907 | } |
908 | |
909 | if (end - start > PY_SSIZE_T_MAX / (1+1+8)) |
910 | end = start + PY_SSIZE_T_MAX / (1+1+8); |
911 | for (i = start, ressize = 0; i < end; ++i) { |
912 | /* object is guaranteed to be "ready" */ |
913 | c = PyUnicode_READ_CHAR(object, i); |
914 | if (c >= 0x10000) { |
915 | ressize += 1+1+8; |
916 | } |
917 | else if (c >= 0x100) { |
918 | ressize += 1+1+4; |
919 | } |
920 | else |
921 | ressize += 1+1+2; |
922 | } |
923 | res = PyUnicode_New(ressize, 127); |
924 | if (res == NULL) { |
925 | Py_DECREF(object); |
926 | return NULL; |
927 | } |
928 | outp = PyUnicode_1BYTE_DATA(res); |
929 | for (i = start; i < end; ++i) { |
930 | c = PyUnicode_READ_CHAR(object, i); |
931 | *outp++ = '\\'; |
932 | if (c >= 0x00010000) { |
933 | *outp++ = 'U'; |
934 | *outp++ = Py_hexdigits[(c>>28)&0xf]; |
935 | *outp++ = Py_hexdigits[(c>>24)&0xf]; |
936 | *outp++ = Py_hexdigits[(c>>20)&0xf]; |
937 | *outp++ = Py_hexdigits[(c>>16)&0xf]; |
938 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
939 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
940 | } |
941 | else if (c >= 0x100) { |
942 | *outp++ = 'u'; |
943 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
944 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
945 | } |
946 | else |
947 | *outp++ = 'x'; |
948 | *outp++ = Py_hexdigits[(c>>4)&0xf]; |
949 | *outp++ = Py_hexdigits[c&0xf]; |
950 | } |
951 | |
952 | assert(_PyUnicode_CheckConsistency(res, 1)); |
953 | Py_DECREF(object); |
954 | return Py_BuildValue("(Nn)" , res, end); |
955 | } |
956 | |
957 | static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; |
958 | |
959 | PyObject *PyCodec_NameReplaceErrors(PyObject *exc) |
960 | { |
961 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
962 | PyObject *restuple; |
963 | PyObject *object; |
964 | Py_ssize_t i; |
965 | Py_ssize_t start; |
966 | Py_ssize_t end; |
967 | PyObject *res; |
968 | Py_UCS1 *outp; |
969 | Py_ssize_t ressize; |
970 | int replsize; |
971 | Py_UCS4 c; |
972 | char buffer[256]; /* NAME_MAXLEN */ |
973 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
974 | return NULL; |
975 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
976 | return NULL; |
977 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
978 | return NULL; |
979 | if (!ucnhash_capi) { |
980 | /* load the unicode data module */ |
981 | ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import( |
982 | PyUnicodeData_CAPSULE_NAME, 1); |
983 | if (!ucnhash_capi) { |
984 | return NULL; |
985 | } |
986 | } |
987 | for (i = start, ressize = 0; i < end; ++i) { |
988 | /* object is guaranteed to be "ready" */ |
989 | c = PyUnicode_READ_CHAR(object, i); |
990 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
991 | replsize = 1+1+1+(int)strlen(buffer)+1; |
992 | } |
993 | else if (c >= 0x10000) { |
994 | replsize = 1+1+8; |
995 | } |
996 | else if (c >= 0x100) { |
997 | replsize = 1+1+4; |
998 | } |
999 | else |
1000 | replsize = 1+1+2; |
1001 | if (ressize > PY_SSIZE_T_MAX - replsize) |
1002 | break; |
1003 | ressize += replsize; |
1004 | } |
1005 | end = i; |
1006 | res = PyUnicode_New(ressize, 127); |
1007 | if (res==NULL) |
1008 | return NULL; |
1009 | for (i = start, outp = PyUnicode_1BYTE_DATA(res); |
1010 | i < end; ++i) { |
1011 | c = PyUnicode_READ_CHAR(object, i); |
1012 | *outp++ = '\\'; |
1013 | if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) { |
1014 | *outp++ = 'N'; |
1015 | *outp++ = '{'; |
1016 | strcpy((char *)outp, buffer); |
1017 | outp += strlen(buffer); |
1018 | *outp++ = '}'; |
1019 | continue; |
1020 | } |
1021 | if (c >= 0x00010000) { |
1022 | *outp++ = 'U'; |
1023 | *outp++ = Py_hexdigits[(c>>28)&0xf]; |
1024 | *outp++ = Py_hexdigits[(c>>24)&0xf]; |
1025 | *outp++ = Py_hexdigits[(c>>20)&0xf]; |
1026 | *outp++ = Py_hexdigits[(c>>16)&0xf]; |
1027 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
1028 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
1029 | } |
1030 | else if (c >= 0x100) { |
1031 | *outp++ = 'u'; |
1032 | *outp++ = Py_hexdigits[(c>>12)&0xf]; |
1033 | *outp++ = Py_hexdigits[(c>>8)&0xf]; |
1034 | } |
1035 | else |
1036 | *outp++ = 'x'; |
1037 | *outp++ = Py_hexdigits[(c>>4)&0xf]; |
1038 | *outp++ = Py_hexdigits[c&0xf]; |
1039 | } |
1040 | |
1041 | assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); |
1042 | assert(_PyUnicode_CheckConsistency(res, 1)); |
1043 | restuple = Py_BuildValue("(Nn)" , res, end); |
1044 | Py_DECREF(object); |
1045 | return restuple; |
1046 | } |
1047 | else { |
1048 | wrong_exception_type(exc); |
1049 | return NULL; |
1050 | } |
1051 | } |
1052 | |
/* Codes returned by get_standard_encoding().  ENC_UNKNOWN is the "no match"
   result; the remaining values identify UTF-8 and the two byte orders of
   UTF-16 and UTF-32. */
#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
#define ENC_UTF16LE 2
#define ENC_UTF32BE 3
#define ENC_UTF32LE 4
1059 | |
1060 | static int |
1061 | get_standard_encoding(const char *encoding, int *bytelength) |
1062 | { |
1063 | if (Py_TOLOWER(encoding[0]) == 'u' && |
1064 | Py_TOLOWER(encoding[1]) == 't' && |
1065 | Py_TOLOWER(encoding[2]) == 'f') { |
1066 | encoding += 3; |
1067 | if (*encoding == '-' || *encoding == '_' ) |
1068 | encoding++; |
1069 | if (encoding[0] == '8' && encoding[1] == '\0') { |
1070 | *bytelength = 3; |
1071 | return ENC_UTF8; |
1072 | } |
1073 | else if (encoding[0] == '1' && encoding[1] == '6') { |
1074 | encoding += 2; |
1075 | *bytelength = 2; |
1076 | if (*encoding == '\0') { |
1077 | #ifdef WORDS_BIGENDIAN |
1078 | return ENC_UTF16BE; |
1079 | #else |
1080 | return ENC_UTF16LE; |
1081 | #endif |
1082 | } |
1083 | if (*encoding == '-' || *encoding == '_' ) |
1084 | encoding++; |
1085 | if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { |
1086 | if (Py_TOLOWER(encoding[0]) == 'b') |
1087 | return ENC_UTF16BE; |
1088 | if (Py_TOLOWER(encoding[0]) == 'l') |
1089 | return ENC_UTF16LE; |
1090 | } |
1091 | } |
1092 | else if (encoding[0] == '3' && encoding[1] == '2') { |
1093 | encoding += 2; |
1094 | *bytelength = 4; |
1095 | if (*encoding == '\0') { |
1096 | #ifdef WORDS_BIGENDIAN |
1097 | return ENC_UTF32BE; |
1098 | #else |
1099 | return ENC_UTF32LE; |
1100 | #endif |
1101 | } |
1102 | if (*encoding == '-' || *encoding == '_' ) |
1103 | encoding++; |
1104 | if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { |
1105 | if (Py_TOLOWER(encoding[0]) == 'b') |
1106 | return ENC_UTF32BE; |
1107 | if (Py_TOLOWER(encoding[0]) == 'l') |
1108 | return ENC_UTF32LE; |
1109 | } |
1110 | } |
1111 | } |
1112 | else if (strcmp(encoding, "CP_UTF8" ) == 0) { |
1113 | *bytelength = 3; |
1114 | return ENC_UTF8; |
1115 | } |
1116 | return ENC_UNKNOWN; |
1117 | } |
1118 | |
1119 | /* This handler is declared static until someone demonstrates |
1120 | a need to call it directly. */ |
1121 | static PyObject * |
1122 | PyCodec_SurrogatePassErrors(PyObject *exc) |
1123 | { |
1124 | PyObject *restuple; |
1125 | PyObject *object; |
1126 | PyObject *encode; |
1127 | const char *encoding; |
1128 | int code; |
1129 | int bytelength; |
1130 | Py_ssize_t i; |
1131 | Py_ssize_t start; |
1132 | Py_ssize_t end; |
1133 | PyObject *res; |
1134 | |
1135 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
1136 | unsigned char *outp; |
1137 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
1138 | return NULL; |
1139 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
1140 | return NULL; |
1141 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
1142 | return NULL; |
1143 | if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { |
1144 | Py_DECREF(object); |
1145 | return NULL; |
1146 | } |
1147 | if (!(encoding = PyUnicode_AsUTF8(encode))) { |
1148 | Py_DECREF(object); |
1149 | Py_DECREF(encode); |
1150 | return NULL; |
1151 | } |
1152 | code = get_standard_encoding(encoding, &bytelength); |
1153 | Py_DECREF(encode); |
1154 | if (code == ENC_UNKNOWN) { |
1155 | /* Not supported, fail with original exception */ |
1156 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1157 | Py_DECREF(object); |
1158 | return NULL; |
1159 | } |
1160 | |
1161 | if (end - start > PY_SSIZE_T_MAX / bytelength) |
1162 | end = start + PY_SSIZE_T_MAX / bytelength; |
1163 | res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); |
1164 | if (!res) { |
1165 | Py_DECREF(object); |
1166 | return NULL; |
1167 | } |
1168 | outp = (unsigned char*)PyBytes_AsString(res); |
1169 | for (i = start; i < end; i++) { |
1170 | /* object is guaranteed to be "ready" */ |
1171 | Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); |
1172 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1173 | /* Not a surrogate, fail with original exception */ |
1174 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1175 | Py_DECREF(res); |
1176 | Py_DECREF(object); |
1177 | return NULL; |
1178 | } |
1179 | switch (code) { |
1180 | case ENC_UTF8: |
1181 | *outp++ = (unsigned char)(0xe0 | (ch >> 12)); |
1182 | *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); |
1183 | *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); |
1184 | break; |
1185 | case ENC_UTF16LE: |
1186 | *outp++ = (unsigned char) ch; |
1187 | *outp++ = (unsigned char)(ch >> 8); |
1188 | break; |
1189 | case ENC_UTF16BE: |
1190 | *outp++ = (unsigned char)(ch >> 8); |
1191 | *outp++ = (unsigned char) ch; |
1192 | break; |
1193 | case ENC_UTF32LE: |
1194 | *outp++ = (unsigned char) ch; |
1195 | *outp++ = (unsigned char)(ch >> 8); |
1196 | *outp++ = (unsigned char)(ch >> 16); |
1197 | *outp++ = (unsigned char)(ch >> 24); |
1198 | break; |
1199 | case ENC_UTF32BE: |
1200 | *outp++ = (unsigned char)(ch >> 24); |
1201 | *outp++ = (unsigned char)(ch >> 16); |
1202 | *outp++ = (unsigned char)(ch >> 8); |
1203 | *outp++ = (unsigned char) ch; |
1204 | break; |
1205 | } |
1206 | } |
1207 | restuple = Py_BuildValue("(On)" , res, end); |
1208 | Py_DECREF(res); |
1209 | Py_DECREF(object); |
1210 | return restuple; |
1211 | } |
1212 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
1213 | const unsigned char *p; |
1214 | Py_UCS4 ch = 0; |
1215 | if (PyUnicodeDecodeError_GetStart(exc, &start)) |
1216 | return NULL; |
1217 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
1218 | return NULL; |
1219 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) |
1220 | return NULL; |
1221 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
1222 | if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { |
1223 | Py_DECREF(object); |
1224 | return NULL; |
1225 | } |
1226 | if (!(encoding = PyUnicode_AsUTF8(encode))) { |
1227 | Py_DECREF(object); |
1228 | Py_DECREF(encode); |
1229 | return NULL; |
1230 | } |
1231 | code = get_standard_encoding(encoding, &bytelength); |
1232 | Py_DECREF(encode); |
1233 | if (code == ENC_UNKNOWN) { |
1234 | /* Not supported, fail with original exception */ |
1235 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1236 | Py_DECREF(object); |
1237 | return NULL; |
1238 | } |
1239 | |
1240 | /* Try decoding a single surrogate character. If |
1241 | there are more, let the codec call us again. */ |
1242 | p += start; |
1243 | if (PyBytes_GET_SIZE(object) - start >= bytelength) { |
1244 | switch (code) { |
1245 | case ENC_UTF8: |
1246 | if ((p[0] & 0xf0) == 0xe0 && |
1247 | (p[1] & 0xc0) == 0x80 && |
1248 | (p[2] & 0xc0) == 0x80) { |
1249 | /* it's a three-byte code */ |
1250 | ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); |
1251 | } |
1252 | break; |
1253 | case ENC_UTF16LE: |
1254 | ch = p[1] << 8 | p[0]; |
1255 | break; |
1256 | case ENC_UTF16BE: |
1257 | ch = p[0] << 8 | p[1]; |
1258 | break; |
1259 | case ENC_UTF32LE: |
1260 | ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; |
1261 | break; |
1262 | case ENC_UTF32BE: |
1263 | ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; |
1264 | break; |
1265 | } |
1266 | } |
1267 | |
1268 | Py_DECREF(object); |
1269 | if (!Py_UNICODE_IS_SURROGATE(ch)) { |
1270 | /* it's not a surrogate - fail */ |
1271 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1272 | return NULL; |
1273 | } |
1274 | res = PyUnicode_FromOrdinal(ch); |
1275 | if (res == NULL) |
1276 | return NULL; |
1277 | return Py_BuildValue("(Nn)" , res, start + bytelength); |
1278 | } |
1279 | else { |
1280 | wrong_exception_type(exc); |
1281 | return NULL; |
1282 | } |
1283 | } |
1284 | |
1285 | static PyObject * |
1286 | PyCodec_SurrogateEscapeErrors(PyObject *exc) |
1287 | { |
1288 | PyObject *restuple; |
1289 | PyObject *object; |
1290 | Py_ssize_t i; |
1291 | Py_ssize_t start; |
1292 | Py_ssize_t end; |
1293 | PyObject *res; |
1294 | |
1295 | if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { |
1296 | char *outp; |
1297 | if (PyUnicodeEncodeError_GetStart(exc, &start)) |
1298 | return NULL; |
1299 | if (PyUnicodeEncodeError_GetEnd(exc, &end)) |
1300 | return NULL; |
1301 | if (!(object = PyUnicodeEncodeError_GetObject(exc))) |
1302 | return NULL; |
1303 | res = PyBytes_FromStringAndSize(NULL, end-start); |
1304 | if (!res) { |
1305 | Py_DECREF(object); |
1306 | return NULL; |
1307 | } |
1308 | outp = PyBytes_AsString(res); |
1309 | for (i = start; i < end; i++) { |
1310 | /* object is guaranteed to be "ready" */ |
1311 | Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); |
1312 | if (ch < 0xdc80 || ch > 0xdcff) { |
1313 | /* Not a UTF-8b surrogate, fail with original exception */ |
1314 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1315 | Py_DECREF(res); |
1316 | Py_DECREF(object); |
1317 | return NULL; |
1318 | } |
1319 | *outp++ = ch - 0xdc00; |
1320 | } |
1321 | restuple = Py_BuildValue("(On)" , res, end); |
1322 | Py_DECREF(res); |
1323 | Py_DECREF(object); |
1324 | return restuple; |
1325 | } |
1326 | else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { |
1327 | PyObject *str; |
1328 | const unsigned char *p; |
1329 | Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ |
1330 | int consumed = 0; |
1331 | if (PyUnicodeDecodeError_GetStart(exc, &start)) |
1332 | return NULL; |
1333 | if (PyUnicodeDecodeError_GetEnd(exc, &end)) |
1334 | return NULL; |
1335 | if (!(object = PyUnicodeDecodeError_GetObject(exc))) |
1336 | return NULL; |
1337 | p = (const unsigned char*)PyBytes_AS_STRING(object); |
1338 | while (consumed < 4 && consumed < end-start) { |
1339 | /* Refuse to escape ASCII bytes. */ |
1340 | if (p[start+consumed] < 128) |
1341 | break; |
1342 | ch[consumed] = 0xdc00 + p[start+consumed]; |
1343 | consumed++; |
1344 | } |
1345 | Py_DECREF(object); |
1346 | if (!consumed) { |
1347 | /* codec complained about ASCII byte. */ |
1348 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc); |
1349 | return NULL; |
1350 | } |
1351 | str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); |
1352 | if (str == NULL) |
1353 | return NULL; |
1354 | return Py_BuildValue("(Nn)" , str, start+consumed); |
1355 | } |
1356 | else { |
1357 | wrong_exception_type(exc); |
1358 | return NULL; |
1359 | } |
1360 | } |
1361 | |
1362 | |
1363 | static PyObject *strict_errors(PyObject *self, PyObject *exc) |
1364 | { |
1365 | return PyCodec_StrictErrors(exc); |
1366 | } |
1367 | |
1368 | |
1369 | static PyObject *ignore_errors(PyObject *self, PyObject *exc) |
1370 | { |
1371 | return PyCodec_IgnoreErrors(exc); |
1372 | } |
1373 | |
1374 | |
1375 | static PyObject *replace_errors(PyObject *self, PyObject *exc) |
1376 | { |
1377 | return PyCodec_ReplaceErrors(exc); |
1378 | } |
1379 | |
1380 | |
1381 | static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) |
1382 | { |
1383 | return PyCodec_XMLCharRefReplaceErrors(exc); |
1384 | } |
1385 | |
1386 | |
1387 | static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) |
1388 | { |
1389 | return PyCodec_BackslashReplaceErrors(exc); |
1390 | } |
1391 | |
1392 | static PyObject *namereplace_errors(PyObject *self, PyObject *exc) |
1393 | { |
1394 | return PyCodec_NameReplaceErrors(exc); |
1395 | } |
1396 | |
1397 | static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) |
1398 | { |
1399 | return PyCodec_SurrogatePassErrors(exc); |
1400 | } |
1401 | |
1402 | static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) |
1403 | { |
1404 | return PyCodec_SurrogateEscapeErrors(exc); |
1405 | } |
1406 | |
1407 | static int _PyCodecRegistry_Init(void) |
1408 | { |
1409 | static struct { |
1410 | const char *name; |
1411 | PyMethodDef def; |
1412 | } methods[] = |
1413 | { |
1414 | { |
1415 | "strict" , |
1416 | { |
1417 | "strict_errors" , |
1418 | strict_errors, |
1419 | METH_O, |
1420 | PyDoc_STR("Implements the 'strict' error handling, which " |
1421 | "raises a UnicodeError on coding errors." ) |
1422 | } |
1423 | }, |
1424 | { |
1425 | "ignore" , |
1426 | { |
1427 | "ignore_errors" , |
1428 | ignore_errors, |
1429 | METH_O, |
1430 | PyDoc_STR("Implements the 'ignore' error handling, which " |
1431 | "ignores malformed data and continues." ) |
1432 | } |
1433 | }, |
1434 | { |
1435 | "replace" , |
1436 | { |
1437 | "replace_errors" , |
1438 | replace_errors, |
1439 | METH_O, |
1440 | PyDoc_STR("Implements the 'replace' error handling, which " |
1441 | "replaces malformed data with a replacement marker." ) |
1442 | } |
1443 | }, |
1444 | { |
1445 | "xmlcharrefreplace" , |
1446 | { |
1447 | "xmlcharrefreplace_errors" , |
1448 | xmlcharrefreplace_errors, |
1449 | METH_O, |
1450 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " |
1451 | "which replaces an unencodable character with the " |
1452 | "appropriate XML character reference." ) |
1453 | } |
1454 | }, |
1455 | { |
1456 | "backslashreplace" , |
1457 | { |
1458 | "backslashreplace_errors" , |
1459 | backslashreplace_errors, |
1460 | METH_O, |
1461 | PyDoc_STR("Implements the 'backslashreplace' error handling, " |
1462 | "which replaces malformed data with a backslashed " |
1463 | "escape sequence." ) |
1464 | } |
1465 | }, |
1466 | { |
1467 | "namereplace" , |
1468 | { |
1469 | "namereplace_errors" , |
1470 | namereplace_errors, |
1471 | METH_O, |
1472 | PyDoc_STR("Implements the 'namereplace' error handling, " |
1473 | "which replaces an unencodable character with a " |
1474 | "\\N{...} escape sequence." ) |
1475 | } |
1476 | }, |
1477 | { |
1478 | "surrogatepass" , |
1479 | { |
1480 | "surrogatepass" , |
1481 | surrogatepass_errors, |
1482 | METH_O |
1483 | } |
1484 | }, |
1485 | { |
1486 | "surrogateescape" , |
1487 | { |
1488 | "surrogateescape" , |
1489 | surrogateescape_errors, |
1490 | METH_O |
1491 | } |
1492 | } |
1493 | }; |
1494 | |
1495 | PyInterpreterState *interp = _PyInterpreterState_GET(); |
1496 | PyObject *mod; |
1497 | |
1498 | if (interp->codec_search_path != NULL) |
1499 | return 0; |
1500 | |
1501 | interp->codec_search_path = PyList_New(0); |
1502 | if (interp->codec_search_path == NULL) { |
1503 | return -1; |
1504 | } |
1505 | |
1506 | interp->codec_search_cache = PyDict_New(); |
1507 | if (interp->codec_search_cache == NULL) { |
1508 | return -1; |
1509 | } |
1510 | |
1511 | interp->codec_error_registry = PyDict_New(); |
1512 | if (interp->codec_error_registry == NULL) { |
1513 | return -1; |
1514 | } |
1515 | |
1516 | for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { |
1517 | PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); |
1518 | if (!func) { |
1519 | return -1; |
1520 | } |
1521 | |
1522 | int res = PyCodec_RegisterError(methods[i].name, func); |
1523 | Py_DECREF(func); |
1524 | if (res) { |
1525 | return -1; |
1526 | } |
1527 | } |
1528 | |
1529 | mod = PyImport_ImportModuleNoBlock("encodings" ); |
1530 | if (mod == NULL) { |
1531 | return -1; |
1532 | } |
1533 | Py_DECREF(mod); |
1534 | interp->codecs_initialized = 1; |
1535 | return 0; |
1536 | } |
1537 | |