1 | /* |
2 | * cjkcodecs.h: common header for cjkcodecs |
3 | * |
4 | * Written by Hye-Shik Chang <[email protected]> |
5 | */ |
6 | |
7 | #ifndef _CJKCODECS_H_ |
8 | #define _CJKCODECS_H_ |
9 | |
10 | #define PY_SSIZE_T_CLEAN |
11 | #include "Python.h" |
12 | #include "multibytecodec.h" |
13 | |
14 | |
/* Marker stored in decode tables for a slot that has no Unicode mapping. */
#define UNIINV  0xFFFE

/*
 * DBCS values reserved for the codecs' own bookkeeping; no supported
 * charset assigns them to a real character.
 */
#define NOCHAR  0xFFFF
#define MULTIC  0xFFFE
#define DBCINV  0xFFFD

/* Single-letter aliases that keep the mapping table sources compact. */
#define U  UNIINV
#define N  NOCHAR
#define M  MULTIC
#define D  DBCINV
28 | |
29 | struct dbcs_index { |
30 | const ucs2_t *map; |
31 | unsigned char bottom, top; |
32 | }; |
33 | typedef struct dbcs_index decode_map; |
34 | |
35 | struct widedbcs_index { |
36 | const Py_UCS4 *map; |
37 | unsigned char bottom, top; |
38 | }; |
39 | typedef struct widedbcs_index widedecode_map; |
40 | |
41 | struct unim_index { |
42 | const DBCHAR *map; |
43 | unsigned char bottom, top; |
44 | }; |
45 | typedef struct unim_index encode_map; |
46 | |
47 | struct unim_index_bytebased { |
48 | const unsigned char *map; |
49 | unsigned char bottom, top; |
50 | }; |
51 | |
52 | struct dbcs_map { |
53 | const char *charset; |
54 | const struct unim_index *encmap; |
55 | const struct dbcs_index *decmap; |
56 | }; |
57 | |
58 | struct pair_encodemap { |
59 | Py_UCS4 uniseq; |
60 | DBCHAR code; |
61 | }; |
62 | |
63 | static const MultibyteCodec *codec_list; |
64 | static const struct dbcs_map *mapping_list; |
65 | |
/*
 * Signature helpers.  Each macro expands to the declarator of one static
 * codec entry point named <encoding>_<role>; the function body follows the
 * macro invocation.  The parameter names introduced here (state, config,
 * kind, data, inpos, inlen, inbuf, inleft, outbuf, outleft, flags, writer)
 * are the ones the buffer macros in this header rely on.
 */

/* One-time codec initialisation. */
#define CODEC_INIT(encoding)                                            \
    static int encoding##_codec_init(const void *config)

/* Encoder entry points. */
#define ENCODER_INIT(encoding)                                          \
    static int encoding##_encode_init(                                  \
        MultibyteCodec_State *state, const void *config)
#define ENCODER(encoding)                                               \
    static Py_ssize_t encoding##_encode(                                \
        MultibyteCodec_State *state, const void *config,                \
        int kind, const void *data,                                     \
        Py_ssize_t *inpos, Py_ssize_t inlen,                            \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)
#define ENCODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_encode_reset(                          \
        MultibyteCodec_State *state, const void *config,                \
        unsigned char **outbuf, Py_ssize_t outleft)

/* Decoder entry points. */
#define DECODER_INIT(encoding)                                          \
    static int encoding##_decode_init(                                  \
        MultibyteCodec_State *state, const void *config)
#define DECODER(encoding)                                               \
    static Py_ssize_t encoding##_decode(                                \
        MultibyteCodec_State *state, const void *config,                \
        const unsigned char **inbuf, Py_ssize_t inleft,                 \
        _PyUnicodeWriter *writer)
#define DECODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_decode_reset(                          \
        MultibyteCodec_State *state, const void *config)
94 | |
/*
 * Cursor helpers.  They operate on the `inbuf`/`inleft`, `inpos` and
 * `outbuf`/`outleft` variables of the enclosing codec function.
 */

/* Decoder side: consume `i` input bytes. */
#define NEXT_IN(i)                                                      \
    do {                                                                \
        (*inbuf) += (i);                                                \
        (inleft) -= (i);                                                \
    } while (0)

/* Encoder side: step the input position forward by `i` characters. */
#define NEXT_INCHAR(i)                                                  \
    do {                                                                \
        (*inpos) += (i);                                                \
    } while (0)

/* Encoder side: account for `o` bytes written to the output buffer. */
#define NEXT_OUT(o)                                                     \
    do {                                                                \
        (*outbuf) += (o);                                               \
        (outleft) -= (o);                                               \
    } while (0)

/* Encoder side: advance input by `i` characters and output by `o` bytes. */
#define NEXT(i, o)                                                      \
    do {                                                                \
        (*inpos) += (i);                                                \
        (*outbuf) += (o);                                               \
        (outleft) -= (o);                                               \
    } while (0)
114 | |
/*
 * Return MBERR_TOOFEW from the enclosing function when fewer than `n`
 * input bytes remain.
 */
#define REQUIRE_INBUF(n)                                                \
    do {                                                                \
        if ((n) > inleft)                                               \
            return MBERR_TOOFEW;                                        \
    } while (0)

/*
 * Return MBERR_TOOSMALL from the enclosing function when fewer than `n`
 * output bytes are available.
 */
#define REQUIRE_OUTBUF(n)                                               \
    do {                                                                \
        if ((n) > outleft)                                              \
            return MBERR_TOOSMALL;                                      \
    } while (0)
126 | |
/* Decoder side: the next four bytes at the current input cursor. */
#define INBYTE1     ((*inbuf)[0])
#define INBYTE2     ((*inbuf)[1])
#define INBYTE3     ((*inbuf)[2])
#define INBYTE4     ((*inbuf)[3])

/* Encoder side: the character at the input position and the one after it. */
#define INCHAR1     (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2     (PyUnicode_READ(kind, data, *inpos + 1))
134 | |
/*
 * Append one code point to `writer`; returns MBERR_EXCEPTION from the
 * enclosing function if the writer reports an error.
 */
#define OUTCHAR(c)                                                      \
    do {                                                                \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0)                \
            return MBERR_EXCEPTION;                                     \
    } while (0)

/*
 * Append two code points to `writer` in one step; returns MBERR_EXCEPTION
 * from the enclosing function if room for them cannot be reserved.
 *
 * Both arguments are captured in temporaries first, so each one is
 * evaluated exactly once.  The maximum passed to the Prepare call must
 * use those temporaries, not the raw macro arguments.
 */
#define OUTCHAR2(c1, c2)                                                \
    do {                                                                \
        Py_UCS4 _c1 = (c1);                                             \
        Py_UCS4 _c2 = (c2);                                             \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)  \
            return MBERR_EXCEPTION;                                     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);  \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                               \
    } while (0)
151 | |
/*
 * Store byte `c` at offset `i` from the output cursor.  The assert checks
 * that `c` fits in one byte.  No space check is made and the cursor is not
 * moved.
 */
#define OUTBYTEI(c, i)                                                  \
    do {                                                                \
        assert((unsigned char)(c) == (c));                              \
        ((*outbuf)[i]) = (c);                                           \
    } while (0)

#define OUTBYTE1(c)     OUTBYTEI(c, 0)
#define OUTBYTE2(c)     OUTBYTEI(c, 1)
#define OUTBYTE3(c)     OUTBYTEI(c, 2)
#define OUTBYTE4(c)     OUTBYTEI(c, 3)

/*
 * WRITEBYTEn: check that n output bytes are available (REQUIRE_OUTBUF
 * returns from the enclosing function if not), then store them at the
 * cursor.  The cursor itself is left where it was.
 */
#define WRITEBYTE1(c1)                                                  \
    do {                                                                \
        REQUIRE_OUTBUF(1);                                              \
        OUTBYTE1(c1);                                                   \
    } while (0)

#define WRITEBYTE2(c1, c2)                                              \
    do {                                                                \
        REQUIRE_OUTBUF(2);                                              \
        OUTBYTE1(c1);                                                   \
        OUTBYTE2(c2);                                                   \
    } while (0)

#define WRITEBYTE3(c1, c2, c3)                                          \
    do {                                                                \
        REQUIRE_OUTBUF(3);                                              \
        OUTBYTE1(c1);                                                   \
        OUTBYTE2(c2);                                                   \
        OUTBYTE3(c3);                                                   \
    } while (0)

#define WRITEBYTE4(c1, c2, c3, c4)                                      \
    do {                                                                \
        REQUIRE_OUTBUF(4);                                              \
        OUTBYTE1(c1);                                                   \
        OUTBYTE2(c2);                                                   \
        OUTBYTE3(c3);                                                   \
        OUTBYTE4(c4);                                                   \
    } while (0)
189 | |
/*
 * Encode-table lookup.  True only when row `m` has a table, `val` lies
 * within bottom..top, and the entry is not NOCHAR.  `assi` receives the
 * entry as soon as the range checks pass, so it may be overwritten with
 * NOCHAR even when the expression is false.
 */
#define _TRYMAP_ENC(m, assi, val)                                       \
    ((m)->map != NULL &&                                                \
     (val) >= (m)->bottom &&                                            \
     (val) <= (m)->top &&                                               \
     ((assi) = (m)->map[(val) - (m)->bottom]) != NOCHAR)

/* Look `uni` up in <charset>_encmap: row = uni >> 8, column = low byte. */
#define TRYMAP_ENC(charset, assi, uni)                                  \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)

/*
 * Decode-table lookup; same shape as _TRYMAP_ENC, with UNIINV as the
 * "no mapping" marker.
 */
#define _TRYMAP_DEC(m, assi, val)                                       \
    ((m)->map != NULL &&                                                \
     (val) >= (m)->bottom &&                                            \
     (val) <= (m)->top &&                                               \
     ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)

/* Look the byte pair (c1, c2) up in <charset>_decmap: row c1, column c2. */
#define TRYMAP_DEC(charset, assi, c1, c2)                               \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
204 | |
/*
 * Mapping list builder.  Usage:
 *
 *     BEGIN_MAPPINGS_LIST
 *       MAPPING_ENCDEC(name)  /  MAPPING_ENCONLY(name)  /  MAPPING_DECONLY(name)
 *     END_MAPPINGS_LIST
 *
 * Each entry refers to <name>_encmap and/or <name>_decmap.  The list ends
 * with an entry whose charset is the empty string, and `mapping_list` is
 * pointed at the finished array.
 */
#define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = {
#define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL},
#define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap},
#define MAPPING_ENCDEC(enc) {#enc, (void*)enc##_encmap, (void*)enc##_decmap},
#define END_MAPPINGS_LIST                                               \
    {"", NULL, NULL} };                                                 \
    static const struct dbcs_map *mapping_list =                        \
        (const struct dbcs_map *)_mapping_list;

/*
 * Codec list builder, used the same way with BEGIN_CODECS_LIST,
 * CODEC_STATEFUL / CODEC_STATELESS / CODEC_STATELESS_WINIT and
 * END_CODECS_LIST.  The list ends with an entry whose encoding name is the
 * empty string, and `codec_list` is pointed at the finished array.
 */
#define BEGIN_CODECS_LIST static const MultibyteCodec _codec_list[] = {

/* All six encoder/decoder entry points of a codec that keeps state. */
#define _STATEFUL_METHODS(enc)          \
    enc##_encode,                       \
    enc##_encode_init,                  \
    enc##_encode_reset,                 \
    enc##_decode,                       \
    enc##_decode_init,                  \
    enc##_decode_reset,

/* Only encode/decode; the init and reset slots are NULL. */
#define _STATELESS_METHODS(enc)         \
    enc##_encode, NULL, NULL,           \
    enc##_decode, NULL, NULL,

#define CODEC_STATEFUL(enc) {           \
    #enc, NULL, NULL,                   \
    _STATEFUL_METHODS(enc)              \
},
#define CODEC_STATELESS(enc) {          \
    #enc, NULL, NULL,                   \
    _STATELESS_METHODS(enc)             \
},
/* Stateless codec that also registers <enc>_codec_init. */
#define CODEC_STATELESS_WINIT(enc) {    \
    #enc, NULL,                         \
    enc##_codec_init,                   \
    _STATELESS_METHODS(enc)             \
},
#define END_CODECS_LIST                                                 \
    {"", NULL,} };                                                      \
    static const MultibyteCodec *codec_list =                           \
        (const MultibyteCodec *)_codec_list;
242 | |
243 | |
244 | |
245 | static PyObject * |
246 | getmultibytecodec(void) |
247 | { |
248 | PyObject *mod = PyImport_ImportModuleNoBlock("_multibytecodec" ); |
249 | if (mod == NULL) { |
250 | return NULL; |
251 | } |
252 | |
253 | PyObject *cofunc = PyObject_GetAttrString(mod, "__create_codec" ); |
254 | Py_DECREF(mod); |
255 | return cofunc; |
256 | } |
257 | |
258 | static PyObject * |
259 | getcodec(PyObject *self, PyObject *encoding) |
260 | { |
261 | PyObject *codecobj, *r, *cofunc; |
262 | const MultibyteCodec *codec; |
263 | const char *enc; |
264 | |
265 | if (!PyUnicode_Check(encoding)) { |
266 | PyErr_SetString(PyExc_TypeError, |
267 | "encoding name must be a string." ); |
268 | return NULL; |
269 | } |
270 | enc = PyUnicode_AsUTF8(encoding); |
271 | if (enc == NULL) |
272 | return NULL; |
273 | |
274 | cofunc = getmultibytecodec(); |
275 | if (cofunc == NULL) |
276 | return NULL; |
277 | |
278 | for (codec = codec_list; codec->encoding[0]; codec++) |
279 | if (strcmp(codec->encoding, enc) == 0) |
280 | break; |
281 | |
282 | if (codec->encoding[0] == '\0') { |
283 | PyErr_SetString(PyExc_LookupError, |
284 | "no such codec is supported." ); |
285 | return NULL; |
286 | } |
287 | |
288 | codecobj = PyCapsule_New((void *)codec, PyMultibyteCodec_CAPSULE_NAME, NULL); |
289 | if (codecobj == NULL) |
290 | return NULL; |
291 | |
292 | r = PyObject_CallOneArg(cofunc, codecobj); |
293 | Py_DECREF(codecobj); |
294 | Py_DECREF(cofunc); |
295 | |
296 | return r; |
297 | } |
298 | |
299 | |
300 | static int |
301 | register_maps(PyObject *module) |
302 | { |
303 | const struct dbcs_map *h; |
304 | |
305 | for (h = mapping_list; h->charset[0] != '\0'; h++) { |
306 | char mhname[256] = "__map_" ; |
307 | strcpy(mhname + sizeof("__map_" ) - 1, h->charset); |
308 | |
309 | PyObject *capsule = PyCapsule_New((void *)h, |
310 | PyMultibyteCodec_CAPSULE_NAME, NULL); |
311 | if (capsule == NULL) { |
312 | return -1; |
313 | } |
314 | if (PyModule_AddObject(module, mhname, capsule) < 0) { |
315 | Py_DECREF(capsule); |
316 | return -1; |
317 | } |
318 | } |
319 | return 0; |
320 | } |
321 | |
#ifdef USING_BINARY_PAIR_SEARCH
/*
 * Binary-search `haystack` (haystacksize entries) for the key
 * (body << 16) | modifier and return the matching entry's code, or DBCINV
 * when no entry has that key.
 *
 * The search only finds entries if `haystack` is ordered by ascending
 * `uniseq`.
 */
static DBCHAR
find_pairencmap(ucs2_t body, ucs2_t modifier,
                const struct pair_encodemap *haystack, int haystacksize)
{
    int pos, min, max;
    /*
     * Widen before shifting: a narrow ucs2_t would be promoted to signed
     * int, and shifting a value with bit 15 set into the sign bit is
     * undefined behaviour.
     */
    Py_UCS4 value = ((Py_UCS4)body << 16) | (Py_UCS4)modifier;

    /* An empty table has no haystack[0] to compare against. */
    if (haystacksize <= 0) {
        return DBCINV;
    }

    min = 0;
    max = haystacksize;

    for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
        if (value < haystack[pos].uniseq) {
            if (max != pos) {
                max = pos;
                continue;
            }
        }
        else if (value > haystack[pos].uniseq) {
            if (min != pos) {
                min = pos;
                continue;
            }
        }
        /* Either an exact hit or the interval can no longer shrink. */
        break;
    }

    if (value == haystack[pos].uniseq) {
        return haystack[pos].code;
    }
    return DBCINV;
}
#endif
355 | |
#ifdef USING_IMPORTED_MAPS
/*
 * Fetch the tables that module _codecs_<locale> exports as
 * __map_<charset>.  `encmap` and `decmap` are addresses of table pointers
 * to fill in; either may be NULL.
 */
#define IMPORT_MAP(locale, charset, encmap, decmap)                     \
    importmap("_codecs_" #locale, "__map_" #charset,                    \
              (const void**)(encmap), (const void**)(decmap))

/*
 * Import `modname`, read its `symbol` attribute, which must be a capsule
 * named PyMultibyteCodec_CAPSULE_NAME wrapping a struct dbcs_map, and copy
 * the map's encode/decode table pointers into *encmap / *decmap (each is
 * skipped when the corresponding argument is NULL).
 *
 * Returns 0 on success, -1 with an exception set on failure.
 */
static int
importmap(const char *modname, const char *symbol,
          const void **encmap, const void **decmap)
{
    PyObject *mod;
    PyObject *o = NULL;
    int rc = -1;

    mod = PyImport_ImportModule(modname);
    if (mod == NULL) {
        return -1;
    }

    o = PyObject_GetAttrString(mod, symbol);
    if (o == NULL) {
        goto done;
    }
    if (!PyCapsule_IsValid(o, PyMultibyteCodec_CAPSULE_NAME)) {
        PyErr_SetString(PyExc_ValueError,
                        "map data must be a Capsule.");
        goto done;
    }

    const struct dbcs_map *map =
        PyCapsule_GetPointer(o, PyMultibyteCodec_CAPSULE_NAME);
    if (encmap != NULL) {
        *encmap = map->encmap;
    }
    if (decmap != NULL) {
        *decmap = map->decmap;
    }
    rc = 0;

done:
    /* `o` is an owned reference and must be released on every path. */
    Py_XDECREF(o);
    Py_DECREF(mod);
    return rc;
}
#endif
397 | |
398 | static int |
399 | _cjk_exec(PyObject *module) |
400 | { |
401 | return register_maps(module); |
402 | } |
403 | |
404 | |
405 | static struct PyMethodDef _cjk_methods[] = { |
406 | {"getcodec" , (PyCFunction)getcodec, METH_O, "" }, |
407 | {NULL, NULL}, |
408 | }; |
409 | |
410 | static PyModuleDef_Slot _cjk_slots[] = { |
411 | {Py_mod_exec, _cjk_exec}, |
412 | {0, NULL} |
413 | }; |
414 | |
/*
 * Define the module "_codecs_<loc>": a PyModuleDef with no per-module
 * state that uses _cjk_methods and _cjk_slots, plus the PyInit__codecs_<loc>
 * entry point, which returns the definition through PyModuleDef_Init().
 */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef _cjk_module = {                           \
        PyModuleDef_HEAD_INIT,                                          \
        .m_name = "_codecs_" #loc,                                      \
        .m_size = 0,                                                    \
        .m_methods = _cjk_methods,                                      \
        .m_slots = _cjk_slots,                                          \
    };                                                                  \
                                                                        \
    PyMODINIT_FUNC                                                      \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        return PyModuleDef_Init(&_cjk_module);                          \
    }
429 | |
430 | #endif |
431 | |