1 | /* |
2 | * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings. |
3 | * |
4 | * Written by Hye-Shik Chang <[email protected]> |
5 | */ |
6 | |
7 | #define USING_IMPORTED_MAPS |
8 | #define USING_BINARY_PAIR_SEARCH |
9 | #define EXTERN_JISX0213_PAIR |
10 | #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE |
11 | #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE |
12 | |
13 | #include "cjkcodecs.h" |
14 | #include "alg_jisx0201.h" |
15 | #include "emu_jisx0213_2000.h" |
16 | #include "mappings_jisx0213_pair.h" |
17 | |
/* STATE

   state->c[0-3]

    00000000
    ||^^^^^|
    |+-----+----  G0-3 Character Set
    +-----------  Is G0-3 double byte?

   state->c[4]

    00000000
          ||
          |+----  Locked-Shift?
          +-----  ESC Throughout
*/
34 | |
/* Control bytes tested by the decoder and written by the encoder. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Upper bound (exclusive) on the index scanned by iso2022processesc()
 * while looking for the final byte of an escape sequence. */
#define MAX_ESCSEQLEN           16
41 | |
/* Single-byte character set marks.  The value is the final byte of the
 * designating escape sequence (see ESCMARK and the encoder). */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character set marks: final byte with CHARSET_DBCS set. */
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)

/* High bit of a mark flags a double-byte set; ESCMARK strips it again to
 * recover the byte that goes into the escape sequence. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)
62 | |
/* True when c can terminate an escape sequence: 'A'..'Z' or '@'. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
/* True when c2 (the byte following ESC) starts an escape sequence that
 * iso2022processesc() handles. */
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')
    /* this is not a complete list of ISO-2022 escape sequence headers.
     * but, it's enough to implement CJK instances of iso-2022. */

/* Sentinel results returned by the per-charset encoder/decoder callbacks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */
72 | |
/* Bits stored in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the charset designated to G<dn>, stored in state->c[dn].
 * All of these expect a variable named `state` in the enclosing scope. */
#define STATE_SETG(dn, v)       do { ((state)->c[dn]) = (v); } while (0)
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Set, test, clear one flag, or clear all flags in state->c[4]. */
#define STATE_SETFLAG(f)        do { ((state)->c[4]) |= (f); } while (0)
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { ((state)->c[4]) = 0; } while (0)
92 | |
/* View the opaque `config` pointer in scope as a struct iso2022_config and
 * read its flags / designation table. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01  /* decoder passes SI/SO through */
#define USE_G2                  0x02  /* decoder accepts ESC . and ESC N */
#define USE_JISX0208_EXT        0x04  /* decoder accepts the ESC & @ prefix */
101 | |
102 | /*-*- internal data structures -*-*/ |
103 | |
/* Per-charset callbacks stored in struct iso2022_designation.
 *  - init:   returns 0 on success, non-zero on failure (see CODEC_INIT).
 *  - decode: maps the bytes at `data` to a code point, or MAP_UNMAPPABLE.
 *  - encode: maps the character(s) at `data` to a DBCHAR, MAP_UNMAPPABLE or
 *            MAP_MULTIPLE_AVAIL; `*length` is read and may be rewritten. */
typedef int (*iso2022_init_func)(void);
typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length);
107 | |
/* One character set usable by an ISO-2022 variant.  Tables of these are
 * terminated by an entry whose mark is 0. */
struct iso2022_designation {
    unsigned char mark;             /* CHARSET_* value */
    unsigned char plane;            /* 0..3: which of G0-G3 it is designated to */
    unsigned char width;            /* bytes per character: 1 or 2 */
    iso2022_init_func initializer;  /* may be NULL */
    iso2022_decode_func decoder;
    iso2022_encode_func encoder;
};
116 | |
/* Per-codec configuration reached through the ISO2022_CONFIG macro. */
struct iso2022_config {
    int flags;      /* OR of NO_SHIFT, USE_G2, USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
121 | |
122 | /*-*- iso-2022 codec implementation -*-*/ |
123 | |
124 | CODEC_INIT(iso2022) |
125 | { |
126 | const struct iso2022_designation *desig; |
127 | for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) |
128 | if (desig->initializer != NULL && desig->initializer() != 0) |
129 | return -1; |
130 | return 0; |
131 | } |
132 | |
133 | ENCODER_INIT(iso2022) |
134 | { |
135 | STATE_CLEARFLAGS(); |
136 | STATE_SETG0(CHARSET_ASCII); |
137 | STATE_SETG1(CHARSET_ASCII); |
138 | return 0; |
139 | } |
140 | |
141 | ENCODER_RESET(iso2022) |
142 | { |
143 | if (STATE_GETFLAG(F_SHIFTED)) { |
144 | WRITEBYTE1(SI); |
145 | NEXT_OUT(1); |
146 | STATE_CLEARFLAG(F_SHIFTED); |
147 | } |
148 | if (STATE_G0 != CHARSET_ASCII) { |
149 | WRITEBYTE3(ESC, '(', 'B'); |
150 | NEXT_OUT(3); |
151 | STATE_SETG0(CHARSET_ASCII); |
152 | } |
153 | return 0; |
154 | } |
155 | |
156 | ENCODER(iso2022) |
157 | { |
158 | while (*inpos < inlen) { |
159 | const struct iso2022_designation *dsg; |
160 | DBCHAR encoded; |
161 | Py_UCS4 c = INCHAR1; |
162 | Py_ssize_t insize; |
163 | |
164 | if (c < 0x80) { |
165 | if (STATE_G0 != CHARSET_ASCII) { |
166 | WRITEBYTE3(ESC, '(', 'B'); |
167 | STATE_SETG0(CHARSET_ASCII); |
168 | NEXT_OUT(3); |
169 | } |
170 | if (STATE_GETFLAG(F_SHIFTED)) { |
171 | WRITEBYTE1(SI); |
172 | STATE_CLEARFLAG(F_SHIFTED); |
173 | NEXT_OUT(1); |
174 | } |
175 | WRITEBYTE1((unsigned char)c); |
176 | NEXT(1, 1); |
177 | continue; |
178 | } |
179 | |
180 | insize = 1; |
181 | |
182 | encoded = MAP_UNMAPPABLE; |
183 | for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { |
184 | Py_ssize_t length = 1; |
185 | encoded = dsg->encoder(&c, &length); |
186 | if (encoded == MAP_MULTIPLE_AVAIL) { |
187 | /* this implementation won't work for pair |
188 | * of non-bmp characters. */ |
189 | if (inlen - *inpos < 2) { |
190 | if (!(flags & MBENC_FLUSH)) |
191 | return MBERR_TOOFEW; |
192 | length = -1; |
193 | } |
194 | else |
195 | length = 2; |
196 | encoded = dsg->encoder(&c, &length); |
197 | if (encoded != MAP_UNMAPPABLE) { |
198 | insize = length; |
199 | break; |
200 | } |
201 | } |
202 | else if (encoded != MAP_UNMAPPABLE) |
203 | break; |
204 | } |
205 | |
206 | if (!dsg->mark) |
207 | return 1; |
208 | assert(dsg->width == 1 || dsg->width == 2); |
209 | |
210 | switch (dsg->plane) { |
211 | case 0: /* G0 */ |
212 | if (STATE_GETFLAG(F_SHIFTED)) { |
213 | WRITEBYTE1(SI); |
214 | STATE_CLEARFLAG(F_SHIFTED); |
215 | NEXT_OUT(1); |
216 | } |
217 | if (STATE_G0 != dsg->mark) { |
218 | if (dsg->width == 1) { |
219 | WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark)); |
220 | STATE_SETG0(dsg->mark); |
221 | NEXT_OUT(3); |
222 | } |
223 | else if (dsg->mark == CHARSET_JISX0208) { |
224 | WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark)); |
225 | STATE_SETG0(dsg->mark); |
226 | NEXT_OUT(3); |
227 | } |
228 | else { |
229 | WRITEBYTE4(ESC, '$', '(', |
230 | ESCMARK(dsg->mark)); |
231 | STATE_SETG0(dsg->mark); |
232 | NEXT_OUT(4); |
233 | } |
234 | } |
235 | break; |
236 | case 1: /* G1 */ |
237 | if (STATE_G1 != dsg->mark) { |
238 | if (dsg->width == 1) { |
239 | WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark)); |
240 | STATE_SETG1(dsg->mark); |
241 | NEXT_OUT(3); |
242 | } |
243 | else { |
244 | WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark)); |
245 | STATE_SETG1(dsg->mark); |
246 | NEXT_OUT(4); |
247 | } |
248 | } |
249 | if (!STATE_GETFLAG(F_SHIFTED)) { |
250 | WRITEBYTE1(SO); |
251 | STATE_SETFLAG(F_SHIFTED); |
252 | NEXT_OUT(1); |
253 | } |
254 | break; |
255 | default: /* G2 and G3 is not supported: no encoding in |
256 | * CJKCodecs are using them yet */ |
257 | return MBERR_INTERNAL; |
258 | } |
259 | |
260 | if (dsg->width == 1) { |
261 | WRITEBYTE1((unsigned char)encoded); |
262 | NEXT_OUT(1); |
263 | } |
264 | else { |
265 | WRITEBYTE2(encoded >> 8, encoded & 0xff); |
266 | NEXT_OUT(2); |
267 | } |
268 | NEXT_INCHAR(insize); |
269 | } |
270 | |
271 | return 0; |
272 | } |
273 | |
274 | DECODER_INIT(iso2022) |
275 | { |
276 | STATE_CLEARFLAGS(); |
277 | STATE_SETG0(CHARSET_ASCII); |
278 | STATE_SETG1(CHARSET_ASCII); |
279 | STATE_SETG2(CHARSET_ASCII); |
280 | return 0; |
281 | } |
282 | |
283 | DECODER_RESET(iso2022) |
284 | { |
285 | STATE_SETG0(CHARSET_ASCII); |
286 | STATE_CLEARFLAG(F_SHIFTED); |
287 | return 0; |
288 | } |
289 | |
290 | static Py_ssize_t |
291 | iso2022processesc(const void *config, MultibyteCodec_State *state, |
292 | const unsigned char **inbuf, Py_ssize_t *inleft) |
293 | { |
294 | unsigned char charset, designation; |
295 | Py_ssize_t i, esclen = 0; |
296 | |
297 | for (i = 1;i < MAX_ESCSEQLEN;i++) { |
298 | if (i >= *inleft) |
299 | return MBERR_TOOFEW; |
300 | if (IS_ESCEND((*inbuf)[i])) { |
301 | esclen = i + 1; |
302 | break; |
303 | } |
304 | else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft && |
305 | (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') { |
306 | i += 2; |
307 | } |
308 | } |
309 | |
310 | switch (esclen) { |
311 | case 0: |
312 | return 1; /* unterminated escape sequence */ |
313 | case 3: |
314 | if (INBYTE2 == '$') { |
315 | charset = INBYTE3 | CHARSET_DBCS; |
316 | designation = 0; |
317 | } |
318 | else { |
319 | charset = INBYTE3; |
320 | if (INBYTE2 == '(') |
321 | designation = 0; |
322 | else if (INBYTE2 == ')') |
323 | designation = 1; |
324 | else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.') |
325 | designation = 2; |
326 | else |
327 | return 3; |
328 | } |
329 | break; |
330 | case 4: |
331 | if (INBYTE2 != '$') |
332 | return 4; |
333 | |
334 | charset = INBYTE4 | CHARSET_DBCS; |
335 | if (INBYTE3 == '(') |
336 | designation = 0; |
337 | else if (INBYTE3 == ')') |
338 | designation = 1; |
339 | else |
340 | return 4; |
341 | break; |
342 | case 6: /* designation with prefix */ |
343 | if (CONFIG_ISSET(USE_JISX0208_EXT) && |
344 | (*inbuf)[3] == ESC && (*inbuf)[4] == '$' && |
345 | (*inbuf)[5] == 'B') { |
346 | charset = 'B' | CHARSET_DBCS; |
347 | designation = 0; |
348 | } |
349 | else |
350 | return 6; |
351 | break; |
352 | default: |
353 | return esclen; |
354 | } |
355 | |
356 | /* raise error when the charset is not designated for this encoding */ |
357 | if (charset != CHARSET_ASCII) { |
358 | const struct iso2022_designation *dsg; |
359 | |
360 | for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { |
361 | if (dsg->mark == charset) |
362 | break; |
363 | } |
364 | if (!dsg->mark) |
365 | return esclen; |
366 | } |
367 | |
368 | STATE_SETG(designation, charset); |
369 | *inleft -= esclen; |
370 | (*inbuf) += esclen; |
371 | return 0; |
372 | } |
373 | |
/*
 * Emit, through OUTCHAR, the Unicode code point for the ISO-8859-7 byte
 * value `c`.
 *
 * The macro expands to an if / else-if chain without a final else: when
 * `c` has no mapping none of the branches run, so the caller supplies its
 * own trailing `else`.  Values below 0xa0, and those 0xa0-0xbf values
 * selected by the first bitmap, map to themselves; values selected by the
 * second bitmap (and everything in 0xd4-0xfe) map to 0x02d0 + c; 0xa1,
 * 0xa2 and 0xaf have dedicated code points.
 *
 * The bitmap tests use unsigned long operands so that testing bit 31 does
 * not shift into the sign bit of a 32-bit long.  `writer` is not used by
 * the expansion itself.
 */
#define ISO8859_7_DECODE(c, writer)                                 \
    if ((c) < 0xa0) {                                               \
        OUTCHAR(c);                                                 \
    } else if ((c) < 0xc0 &&                                        \
               (0x288f3bc9UL & (1UL << ((c)-0xa0)))) {              \
        OUTCHAR(c);                                                 \
    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||        \
               (0xbffffd77UL & (1UL << ((c)-0xb4))))) {             \
        OUTCHAR(0x02d0 + (c));                                      \
    } else if ((c) == 0xa1) {                                       \
        OUTCHAR(0x2018);                                            \
    } else if ((c) == 0xa2) {                                       \
        OUTCHAR(0x2019);                                            \
    } else if ((c) == 0xaf) {                                       \
        OUTCHAR(0x2015);                                            \
    }
389 | |
390 | static Py_ssize_t |
391 | iso2022processg2(const void *config, MultibyteCodec_State *state, |
392 | const unsigned char **inbuf, Py_ssize_t *inleft, |
393 | _PyUnicodeWriter *writer) |
394 | { |
395 | /* not written to use encoder, decoder functions because only few |
396 | * encodings use G2 designations in CJKCodecs */ |
397 | if (STATE_G2 == CHARSET_ISO8859_1) { |
398 | if (INBYTE3 < 0x80) |
399 | OUTCHAR(INBYTE3 + 0x80); |
400 | else |
401 | return 3; |
402 | } |
403 | else if (STATE_G2 == CHARSET_ISO8859_7) { |
404 | ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer) |
405 | else |
406 | return 3; |
407 | } |
408 | else if (STATE_G2 == CHARSET_ASCII) { |
409 | if (INBYTE3 & 0x80) |
410 | return 3; |
411 | else |
412 | OUTCHAR(INBYTE3); |
413 | } |
414 | else |
415 | return MBERR_INTERNAL; |
416 | |
417 | (*inbuf) += 3; |
418 | *inleft -= 3; |
419 | return 0; |
420 | } |
421 | |
/*
 * Decode ISO-2022 bytes into characters.
 *
 * Escape sequences update the designation state, SI/SO toggle the shifted
 * flag (unless NO_SHIFT is configured), LF clears the shifted flag, and
 * every other byte below 0x80 is decoded with the charset currently in G0
 * (or G1 when shifted).  Bytes of 0x80 and above are rejected with a
 * return value of 1.  A non-zero return from one of the helpers, or the
 * width of an undecodable character, is returned to the caller; 0 means
 * all input was consumed.
 */
DECODER(iso2022)
{
    /* last designation entry looked up, to skip the table scan while the
     * active charset stays the same */
    const struct iso2022_designation *dsgcache = NULL;

    while (inleft > 0) {
        unsigned char c = INBYTE1;
        Py_ssize_t err;

        if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
            /* ESC throughout mode:
             * for non-iso2022 escape sequences */
            OUTCHAR(c); /* assume as ISO-8859-1 */
            NEXT_IN(1);
            if (IS_ESCEND(c)) {
                STATE_CLEARFLAG(F_ESCTHROUGHOUT);
            }
            continue;
        }

        switch (c) {
        case ESC:
            REQUIRE_INBUF(2);
            if (IS_ISO2022ESC(INBYTE2)) {
                /* designation sequence: the helper consumes it */
                err = iso2022processesc(config, state,
                                        inbuf, &inleft);
                if (err != 0)
                    return err;
            }
            else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
                REQUIRE_INBUF(3);
                err = iso2022processg2(config, state,
                                       inbuf, &inleft, writer);
                if (err != 0)
                    return err;
            }
            else {
                /* unknown escape: copy it through byte by byte until a
                 * final byte is seen (handled at the top of the loop) */
                OUTCHAR(ESC);
                STATE_SETFLAG(F_ESCTHROUGHOUT);
                NEXT_IN(1);
            }
            break;
        case SI:
            if (CONFIG_ISSET(NO_SHIFT))
                goto bypass;
            STATE_CLEARFLAG(F_SHIFTED);
            NEXT_IN(1);
            break;
        case SO:
            if (CONFIG_ISSET(NO_SHIFT))
                goto bypass;
            STATE_SETFLAG(F_SHIFTED);
            NEXT_IN(1);
            break;
        case LF:
            STATE_CLEARFLAG(F_SHIFTED);
            OUTCHAR(LF);
            NEXT_IN(1);
            break;
        default:
            if (c < 0x20) /* C0 */
                goto bypass;
            else if (c >= 0x80)
                return 1;
            else {
                const struct iso2022_designation *dsg;
                unsigned char charset;
                Py_UCS4 decoded;

                if (STATE_GETFLAG(F_SHIFTED))
                    charset = STATE_G1;
                else
                    charset = STATE_G0;

                if (charset == CHARSET_ASCII) {
                    /* also the jump target for bytes that are copied
                     * to the output unchanged */
bypass:
                    OUTCHAR(c);
                    NEXT_IN(1);
                    break;
                }

                if (dsgcache != NULL &&
                    dsgcache->mark == charset)
                        dsg = dsgcache;
                else {
                    /* the sentinel test is compiled in only for
                     * Py_DEBUG builds; otherwise the scan relies on
                     * charset being present in the table */
                    for (dsg = CONFIG_DESIGNATIONS;
                         dsg->mark != charset
#ifdef Py_DEBUG
                            && dsg->mark != '\0'
#endif
                         ; dsg++)
                    {
                        /* noop */
                    }
                    assert(dsg->mark != '\0');
                    dsgcache = dsg;
                }

                REQUIRE_INBUF(dsg->width);
                decoded = dsg->decoder(*inbuf);
                if (decoded == MAP_UNMAPPABLE)
                    return dsg->width;

                if (decoded < 0x10000) {
                    OUTCHAR(decoded);
                }
                else if (decoded < 0x30000) {
                    OUTCHAR(decoded);
                }
                else { /* JIS X 0213 pairs */
                    /* two code points packed as high/low 16 bits */
                    OUTCHAR2(decoded >> 16, decoded & 0xffff);
                }
                NEXT_IN(dsg->width);
            }
            break;
        }
    }
    return 0;
}
540 | |
541 | /*-*- mapping table holders -*-*/ |
542 | |
/* Declare a file-local, initially NULL pointer named <enc>_encmap or
 * <enc>_decmap.  The *_init() functions below pass the addresses of these
 * pointers to IMPORT_MAP. */
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;

/* kr */
ENCMAP(cp949)
DECMAP(ksx1001)

/* jp */
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)
564 | |
565 | /* tw */ |
566 | |
567 | /*-*- mapping access functions -*-*/ |
568 | |
569 | static int |
570 | ksx1001_init(void) |
571 | { |
572 | static int initialized = 0; |
573 | |
574 | if (!initialized && ( |
575 | IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) || |
576 | IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap))) |
577 | return -1; |
578 | initialized = 1; |
579 | return 0; |
580 | } |
581 | |
582 | static Py_UCS4 |
583 | ksx1001_decoder(const unsigned char *data) |
584 | { |
585 | Py_UCS4 u; |
586 | if (TRYMAP_DEC(ksx1001, u, data[0], data[1])) |
587 | return u; |
588 | else |
589 | return MAP_UNMAPPABLE; |
590 | } |
591 | |
592 | static DBCHAR |
593 | ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
594 | { |
595 | DBCHAR coded; |
596 | assert(*length == 1); |
597 | if (*data < 0x10000) { |
598 | if (TRYMAP_ENC(cp949, coded, *data)) { |
599 | if (!(coded & 0x8000)) |
600 | return coded; |
601 | } |
602 | } |
603 | return MAP_UNMAPPABLE; |
604 | } |
605 | |
606 | static int |
607 | jisx0208_init(void) |
608 | { |
609 | static int initialized = 0; |
610 | |
611 | if (!initialized && ( |
612 | IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) || |
613 | IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap))) |
614 | return -1; |
615 | initialized = 1; |
616 | return 0; |
617 | } |
618 | |
619 | static Py_UCS4 |
620 | jisx0208_decoder(const unsigned char *data) |
621 | { |
622 | Py_UCS4 u; |
623 | if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ |
624 | return 0xff3c; |
625 | else if (TRYMAP_DEC(jisx0208, u, data[0], data[1])) |
626 | return u; |
627 | else |
628 | return MAP_UNMAPPABLE; |
629 | } |
630 | |
631 | static DBCHAR |
632 | jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
633 | { |
634 | DBCHAR coded; |
635 | assert(*length == 1); |
636 | if (*data < 0x10000) { |
637 | if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */ |
638 | return 0x2140; |
639 | else if (TRYMAP_ENC(jisxcommon, coded, *data)) { |
640 | if (!(coded & 0x8000)) |
641 | return coded; |
642 | } |
643 | } |
644 | return MAP_UNMAPPABLE; |
645 | } |
646 | |
647 | static int |
648 | jisx0212_init(void) |
649 | { |
650 | static int initialized = 0; |
651 | |
652 | if (!initialized && ( |
653 | IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) || |
654 | IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap))) |
655 | return -1; |
656 | initialized = 1; |
657 | return 0; |
658 | } |
659 | |
660 | static Py_UCS4 |
661 | jisx0212_decoder(const unsigned char *data) |
662 | { |
663 | Py_UCS4 u; |
664 | if (TRYMAP_DEC(jisx0212, u, data[0], data[1])) |
665 | return u; |
666 | else |
667 | return MAP_UNMAPPABLE; |
668 | } |
669 | |
670 | static DBCHAR |
671 | jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
672 | { |
673 | DBCHAR coded; |
674 | assert(*length == 1); |
675 | if (*data < 0x10000) { |
676 | if (TRYMAP_ENC(jisxcommon, coded, *data)) { |
677 | if (coded & 0x8000) |
678 | return coded & 0x7fff; |
679 | } |
680 | } |
681 | return MAP_UNMAPPABLE; |
682 | } |
683 | |
684 | static int |
685 | jisx0213_init(void) |
686 | { |
687 | static int initialized = 0; |
688 | |
689 | if (!initialized && ( |
690 | jisx0208_init() || |
691 | IMPORT_MAP(jp, jisx0213_bmp, |
692 | &jisx0213_bmp_encmap, NULL) || |
693 | IMPORT_MAP(jp, jisx0213_1_bmp, |
694 | NULL, &jisx0213_1_bmp_decmap) || |
695 | IMPORT_MAP(jp, jisx0213_2_bmp, |
696 | NULL, &jisx0213_2_bmp_decmap) || |
697 | IMPORT_MAP(jp, jisx0213_emp, |
698 | &jisx0213_emp_encmap, NULL) || |
699 | IMPORT_MAP(jp, jisx0213_1_emp, |
700 | NULL, &jisx0213_1_emp_decmap) || |
701 | IMPORT_MAP(jp, jisx0213_2_emp, |
702 | NULL, &jisx0213_2_emp_decmap) || |
703 | IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap, |
704 | &jisx0213_pair_decmap))) |
705 | return -1; |
706 | initialized = 1; |
707 | return 0; |
708 | } |
709 | |
/* The two decoders below are compiled with `config` defined as
 * (void *)2000.  NOTE(review): presumably the EMULATE_JISX0213_2000_*
 * macros read `config` to select the 2000 edition -- confirm in
 * emu_jisx0213_2000.h. */
#define config ((void *)2000)
/*
 * Decode two bytes as JIS X 0213:2000 plane 1.  After the emulation macro
 * the lookups run in this order: the 0x21 0x40 special case, jisx0208,
 * jisx0213_1_bmp, jisx0213_1_emp (result OR-ed with 0x20000) and
 * jisx0213_pair.  Returns MAP_UNMAPPABLE when none of them matches.
 */
static Py_UCS4
jisx0213_2000_1_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    /* the macro forms the head of the if / else-if chain */
    EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
    else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
        return 0xff3c;
    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
        u |= 0x20000;
    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
        ;
    else
        return MAP_UNMAPPABLE;
    return u;
}

/*
 * Decode two bytes as JIS X 0213:2000 plane 2: the emulation macro, then
 * jisx0213_2_bmp, then jisx0213_2_emp (result OR-ed with 0x20000).
 * Returns MAP_UNMAPPABLE when neither map matches.
 */
static Py_UCS4
jisx0213_2000_2_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    /* NOTE(review): unlike the plane-1 decoder, the macro is followed by a
     * plain `if`, not `else if`; its expansion is not visible here. */
    EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
    if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
        u |= 0x20000;
    else
        return MAP_UNMAPPABLE;
    return u;
}
#undef config
745 | |
746 | static Py_UCS4 |
747 | jisx0213_2004_1_decoder(const unsigned char *data) |
748 | { |
749 | Py_UCS4 u; |
750 | if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ |
751 | return 0xff3c; |
752 | else if (TRYMAP_DEC(jisx0208, u, data[0], data[1])) |
753 | ; |
754 | else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1])) |
755 | ; |
756 | else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])) |
757 | u |= 0x20000; |
758 | else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1])) |
759 | ; |
760 | else |
761 | return MAP_UNMAPPABLE; |
762 | return u; |
763 | } |
764 | |
765 | static Py_UCS4 |
766 | jisx0213_2004_2_decoder(const unsigned char *data) |
767 | { |
768 | Py_UCS4 u; |
769 | if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1])) |
770 | ; |
771 | else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])) |
772 | u |= 0x20000; |
773 | else |
774 | return MAP_UNMAPPABLE; |
775 | return u; |
776 | } |
777 | |
/*
 * Shared JIS X 0213 encoder used by the edition/plane specific wrappers.
 *
 * `*length` selects the operation:
 *    1  encode data[0] alone; returns MAP_MULTIPLE_AVAIL when the BMP map
 *       yields MULTIC, i.e. the caller should retry with 2 or -1;
 *    2  try data[0], data[1] as a pair; when the pair is unknown, fall
 *       back to data[0] alone and set *length to 1;
 *   -1  flush: encode data[0] alone through the pair map, *length = 1.
 * `config` is not referenced directly in this function.
 * NOTE(review): presumably the EMULATE_JISX0213_2000_* macros read it --
 * confirm.  The wrappers pass (void *)2000 or NULL.
 * Returns the code, or MAP_UNMAPPABLE.
 */
static DBCHAR
jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
{
    DBCHAR coded;

    switch (*length) {
    case 1: /* first character */
        if (*data >= 0x10000) {
            /* only code points 0x20000-0x2ffff are looked up */
            if ((*data) >> 16 == 0x20000 >> 16) {
                EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
                else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff))
                    return coded;
            }
            return MAP_UNMAPPABLE;
        }

        EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
        else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) {
            if (coded == MULTIC)
                return MAP_MULTIPLE_AVAIL;
        }
        else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
            if (coded & 0x8000)
                return MAP_UNMAPPABLE;
        }
        else
            return MAP_UNMAPPABLE;
        return coded;

    case 2: /* second character of unicode pair */
        /* reads data[1]: the caller must supply two elements */
        coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
        if (coded != DBCINV)
            return coded;
        /* fall through */

    case -1: /* flush unterminated */
        *length = 1;
        coded = find_pairencmap((ucs2_t)data[0], 0,
                                jisx0213_pair_encmap, JISX0213_ENCPAIRS);
        if (coded == DBCINV)
            return MAP_UNMAPPABLE;
        else
            return coded;
        break;

    default:
        return MAP_UNMAPPABLE;
    }
}
828 | |
829 | static DBCHAR |
830 | jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
831 | { |
832 | DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); |
833 | if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) |
834 | return coded; |
835 | else if (coded & 0x8000) |
836 | return MAP_UNMAPPABLE; |
837 | else |
838 | return coded; |
839 | } |
840 | |
841 | static DBCHAR |
842 | jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length) |
843 | { |
844 | DBCHAR coded; |
845 | Py_ssize_t ilength = *length; |
846 | |
847 | coded = jisx0213_encoder(data, length, (void *)2000); |
848 | switch (ilength) { |
849 | case 1: |
850 | if (coded == MAP_MULTIPLE_AVAIL) |
851 | return MAP_MULTIPLE_AVAIL; |
852 | else |
853 | return MAP_UNMAPPABLE; |
854 | case 2: |
855 | if (*length != 2) |
856 | return MAP_UNMAPPABLE; |
857 | else |
858 | return coded; |
859 | default: |
860 | return MAP_UNMAPPABLE; |
861 | } |
862 | } |
863 | |
864 | static DBCHAR |
865 | jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
866 | { |
867 | DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); |
868 | if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) |
869 | return coded; |
870 | else if (coded & 0x8000) |
871 | return coded & 0x7fff; |
872 | else |
873 | return MAP_UNMAPPABLE; |
874 | } |
875 | |
876 | static DBCHAR |
877 | jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
878 | { |
879 | DBCHAR coded = jisx0213_encoder(data, length, NULL); |
880 | if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) |
881 | return coded; |
882 | else if (coded & 0x8000) |
883 | return MAP_UNMAPPABLE; |
884 | else |
885 | return coded; |
886 | } |
887 | |
888 | static DBCHAR |
889 | jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length) |
890 | { |
891 | DBCHAR coded; |
892 | Py_ssize_t ilength = *length; |
893 | |
894 | coded = jisx0213_encoder(data, length, NULL); |
895 | switch (ilength) { |
896 | case 1: |
897 | if (coded == MAP_MULTIPLE_AVAIL) |
898 | return MAP_MULTIPLE_AVAIL; |
899 | else |
900 | return MAP_UNMAPPABLE; |
901 | case 2: |
902 | if (*length != 2) |
903 | return MAP_UNMAPPABLE; |
904 | else |
905 | return coded; |
906 | default: |
907 | return MAP_UNMAPPABLE; |
908 | } |
909 | } |
910 | |
911 | static DBCHAR |
912 | jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
913 | { |
914 | DBCHAR coded = jisx0213_encoder(data, length, NULL); |
915 | if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) |
916 | return coded; |
917 | else if (coded & 0x8000) |
918 | return coded & 0x7fff; |
919 | else |
920 | return MAP_UNMAPPABLE; |
921 | } |
922 | |
/*
 * Decode the single byte at `data` with JISX0201_R_DECODE_CHAR.
 * The macro (defined outside this file) is used as the head of an
 * if/else chain: when it does not match, MAP_UNMAPPABLE is returned.
 */
static Py_UCS4
jisx0201_r_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    JISX0201_R_DECODE_CHAR(*data, u)
    else
        return MAP_UNMAPPABLE;
    return u;
}
932 | |
/*
 * Encode the character at `data` with JISX0201_R_ENCODE.
 * The macro (defined outside this file) is used as the head of an
 * if/else chain: when it does not match, MAP_UNMAPPABLE is returned.
 * `length` is not used.
 */
static DBCHAR
jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    JISX0201_R_ENCODE(*data, coded)
    else
        return MAP_UNMAPPABLE;
    return coded;
}
942 | |
/*
 * Decode the single byte at `data` with JISX0201_K_DECODE_CHAR.
 * The byte is XOR-ed with 0x80 before it is handed to the macro (defined
 * outside this file); when the macro does not match, MAP_UNMAPPABLE is
 * returned.
 */
static Py_UCS4
jisx0201_k_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
    else
        return MAP_UNMAPPABLE;
    return u;
}
952 | |
/*
 * Encode the character at `data` with JISX0201_K_ENCODE and return the
 * result minus 0x80 (the counterpart of the XOR in jisx0201_k_decoder).
 * When the macro (defined outside this file) does not match,
 * MAP_UNMAPPABLE is returned.  `length` is not used.
 */
static DBCHAR
jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
{
    DBCHAR coded;
    JISX0201_K_ENCODE(*data, coded)
    else
        return MAP_UNMAPPABLE;
    return coded - 0x80;
}
962 | |
963 | static int |
964 | gb2312_init(void) |
965 | { |
966 | static int initialized = 0; |
967 | |
968 | if (!initialized && ( |
969 | IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) || |
970 | IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap))) |
971 | return -1; |
972 | initialized = 1; |
973 | return 0; |
974 | } |
975 | |
976 | static Py_UCS4 |
977 | gb2312_decoder(const unsigned char *data) |
978 | { |
979 | Py_UCS4 u; |
980 | if (TRYMAP_DEC(gb2312, u, data[0], data[1])) |
981 | return u; |
982 | else |
983 | return MAP_UNMAPPABLE; |
984 | } |
985 | |
986 | static DBCHAR |
987 | gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
988 | { |
989 | DBCHAR coded; |
990 | assert(*length == 1); |
991 | if (*data < 0x10000) { |
992 | if (TRYMAP_ENC(gbcommon, coded, *data)) { |
993 | if (!(coded & 0x8000)) |
994 | return coded; |
995 | } |
996 | } |
997 | return MAP_UNMAPPABLE; |
998 | } |
999 | |
1000 | |
1001 | static Py_UCS4 |
1002 | dummy_decoder(const unsigned char *data) |
1003 | { |
1004 | return MAP_UNMAPPABLE; |
1005 | } |
1006 | |
1007 | static DBCHAR |
1008 | dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length) |
1009 | { |
1010 | return MAP_UNMAPPABLE; |
1011 | } |
1012 | |
1013 | /*-*- registry tables -*-*/ |
1014 | |
/* Initializers for struct iso2022_designation, in field order:
 *   { mark, plane, width, initializer, decoder, encoder }
 * The *_PAIRONLY variants differ from their plain counterparts only in the
 * encoder callback. */
#define REGISTRY_KSX1001_G0     { CHARSET_KSX1001, 0, 2,                \
                  ksx1001_init,                                         \
                  ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1     { CHARSET_KSX1001, 1, 2,                \
                  ksx1001_init,                                         \
                  ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R     { CHARSET_JISX0201_R, 0, 1,             \
                  NULL,                                                 \
                  jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K     { CHARSET_JISX0201_K, 0, 1,             \
                  NULL,                                                 \
                  jisx0201_k_decoder, jisx0201_k_encoder }
#define REGISTRY_JISX0208       { CHARSET_JISX0208, 0, 2,               \
                  jisx0208_init,                                        \
                  jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O     { CHARSET_JISX0208_O, 0, 2,             \
                  jisx0208_init,                                        \
                  jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212       { CHARSET_JISX0212, 0, 2,               \
                  jisx0212_init,                                        \
                  jisx0212_decoder, jisx0212_encoder }
#define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2,       \
                  jisx0213_init,                                        \
                  jisx0213_2000_1_decoder,                              \
                  jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
                  jisx0213_init,                                        \
                  jisx0213_2000_1_decoder,                              \
                  jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2,            \
                  jisx0213_init,                                        \
                  jisx0213_2000_2_decoder,                              \
                  jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2,       \
                  jisx0213_init,                                        \
                  jisx0213_2004_1_decoder,                              \
                  jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
                  jisx0213_init,                                        \
                  jisx0213_2004_1_decoder,                              \
                  jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2,            \
                  jisx0213_init,                                        \
                  jisx0213_2004_2_decoder,                              \
                  jisx0213_2004_2_encoder }
#define REGISTRY_GB2312         { CHARSET_GB2312, 0, 2,                 \
                  gb2312_init,                                          \
                  gb2312_decoder, gb2312_encoder }
/* NOTE(review): the two CNS 11643 entries name cns11643_* functions that
 * are not defined in the visible part of this file; no designation table
 * below uses these macros. */
#define REGISTRY_CNS11643_1     { CHARSET_CNS11643_1, 1, 2,             \
                  cns11643_init,                                        \
                  cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2     { CHARSET_CNS11643_2, 2, 2,             \
                  cns11643_init,                                        \
                  cns11643_2_decoder, cns11643_2_encoder }
/* G2-only sets: decoded by iso2022processg2(), hence the dummy callbacks */
#define REGISTRY_ISO8859_1      { CHARSET_ISO8859_1, 2, 1,              \
                  NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7      { CHARSET_ISO8859_7, 2, 1,              \
                  NULL, dummy_decoder, dummy_encoder }
/* Table terminator: mark == 0 ends every designation loop. */
#define REGISTRY_SENTINEL       { 0, }
/* Define iso2022_<var>_config from flags `attrs` and the matching
 * iso2022_<var>_designations table. */
#define CONFIGDEF(var, attrs)                                           \
    static const struct iso2022_config iso2022_##var##_config = {       \
        attrs, iso2022_##var##_designations                             \
    };
1078 | |
/* iso2022_kr: KS X 1001 on G1, no config flags. */
static const struct iso2022_designation iso2022_kr_designations[] = {
    REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
};
CONFIGDEF(kr, 0)
1083 | |
/* iso2022_jp: the encoder tries these entries in the order listed. */
static const struct iso2022_designation iso2022_jp_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    REGISTRY_SENTINEL
};
CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1089 | |
/* iso2022_jp_1: the iso2022_jp sets plus JIS X 0212. */
static const struct iso2022_designation iso2022_jp_1_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1095 | |
/* iso2022_jp_2: adds KS X 1001 and GB 2312 on G0 and the two ISO-8859
 * sets on G2; the only configuration with USE_G2 set. */
static const struct iso2022_designation iso2022_jp_2_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
    REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1102 | |
/* iso2022_jp_2004: the pair-only JIS X 0213:2004 entry comes first, so
 * the encoder offers a character to it before JIS X 0208 and the plain
 * JIS X 0213:2004 plane 1/2 entries. */
static const struct iso2022_designation iso2022_jp_2004_designations[] = {
    REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
    REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1108 | |
/* iso2022_jp_3: same layout as iso2022_jp_2004, using the JIS X 0213:2000
 * entries. */
static const struct iso2022_designation iso2022_jp_3_designations[] = {
    REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
    REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1114 | |
/* iso2022_jp_ext: the iso2022_jp_1 sets plus JIS X 0201 Katakana. */
static const struct iso2022_designation iso2022_jp_ext_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1120 | |
1121 | |
/* This module defines no mapping tables of its own; the *_init()
 * functions above obtain theirs through IMPORT_MAP. */
BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST
1125 | |
/* Expand to one codec list entry named "iso2022_<variation>" that uses
 * iso2022_<variation>_config, iso2022_codec_init and the stateful iso2022
 * methods. */
#define ISO2022_CODEC(variation) {              \
    "iso2022_" #variation,                      \
    &iso2022_##variation##_config,              \
    iso2022_codec_init,                         \
    _STATEFUL_METHODS(iso2022)                  \
},
1132 | |
/* Codecs provided by this module, one per configuration defined above. */
BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST

/* Module boilerplate; the macro is defined outside this file. */
I_AM_A_MODULE_FOR(iso2022)
1144 | |