1 | /* |
2 | * _codecs_cn.c: Codecs collection for Mainland Chinese encodings |
3 | * |
4 | * Written by Hye-Shik Chang <[email protected]> |
5 | */ |
6 | |
7 | #include "cjkcodecs.h" |
8 | #include "mappings_cn.h" |
9 | |
/**
 * hz is predefined as 100 on AIX, so we undefine it to avoid a
 * conflict with the hz codec's identifiers.
 */
14 | #ifdef _AIX |
15 | #undef hz |
16 | #endif |
17 | |
/* GBK and GB2312 map a few code points differently, as listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
25 | |
/*
 * GBK_DECODE(dc1, dc2, writer)
 *
 * Decode the two-byte sequence (dc1, dc2) as GBK and emit the result with
 * OUTCHAR.  The three code points where GBK differs from GB2312 are handled
 * first; everything else is looked up in the gb2312 table (with the high bit
 * of both bytes cleared) and then in the gbkext table.
 *
 * The expansion is deliberately an *open* if / else-if chain: the caller
 * must follow the macro with its own `else` branch to handle bytes that do
 * not decode.  A variable named `decoded` must be in scope at the call site.
 * `writer` is not referenced by the expansion itself; it is kept so existing
 * call sites stay valid.
 *
 * Every use of dc1/dc2 is parenthesized so that arguments containing
 * operators of lower precedence than `^` are evaluated as a unit.
 */
#define GBK_DECODE(dc1, dc2, writer)                                    \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) {                               \
        OUTCHAR(0x2014);                                                \
    }                                                                   \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) {                          \
        OUTCHAR(0x2015);                                                \
    }                                                                   \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                          \
        OUTCHAR(0x00b7);                                                \
    }                                                                   \
    else if (TRYMAP_DEC(gb2312, decoded,                                \
                        (dc1) ^ 0x80, (dc2) ^ 0x80)) {                  \
        OUTCHAR(decoded);                                               \
    }                                                                   \
    else if (TRYMAP_DEC(gbkext, decoded, (dc1), (dc2))) {               \
        OUTCHAR(decoded);                                               \
    }
42 | |
/*
 * GBK_ENCODE(uni, dbcs)
 *
 * Map the code point `uni` to its GBK double-byte value and store it in
 * `dbcs`.  U+2014, U+2015 and U+00B7 are assigned fixed values; U+30FB is
 * explicitly excluded from the table lookup; anything else is looked up in
 * the gbcommon encode map.
 *
 * Like GBK_DECODE, the expansion is an open if / else-if chain: the caller
 * supplies the trailing `else` for code points that did not match.
 */
#define GBK_ENCODE(uni, dbcs)                                           \
    if ((uni) == 0x2014) {                                              \
        (dbcs) = 0xa1aa;                                                \
    }                                                                   \
    else if ((uni) == 0x2015) {                                         \
        (dbcs) = 0xa844;                                                \
    }                                                                   \
    else if ((uni) == 0x00b7) {                                         \
        (dbcs) = 0xa1a4;                                                \
    }                                                                   \
    else if ((uni) != 0x30fb && TRYMAP_ENC(gbcommon, dbcs, uni)) {      \
        /* dbcs was filled in by the lookup */ ;                        \
    }
53 | |
/*
 * Index into MultibyteCodec_State.c[8] used by the codecs in this file.
 * Only this first byte is used, and it holds a 0-or-1 state value
 * (the HZ codec below stores its current mode there).
 */
#define CN_STATE_OFFSET 0
59 | |
60 | /* |
61 | * GB2312 codec |
62 | */ |
63 | |
64 | ENCODER(gb2312) |
65 | { |
66 | while (*inpos < inlen) { |
67 | Py_UCS4 c = INCHAR1; |
68 | DBCHAR code; |
69 | |
70 | if (c < 0x80) { |
71 | WRITEBYTE1((unsigned char)c); |
72 | NEXT(1, 1); |
73 | continue; |
74 | } |
75 | |
76 | if (c > 0xFFFF) |
77 | return 1; |
78 | |
79 | REQUIRE_OUTBUF(2); |
80 | if (TRYMAP_ENC(gbcommon, code, c)) |
81 | ; |
82 | else |
83 | return 1; |
84 | |
85 | if (code & 0x8000) /* MSB set: GBK */ |
86 | return 1; |
87 | |
88 | OUTBYTE1((code >> 8) | 0x80); |
89 | OUTBYTE2((code & 0xFF) | 0x80); |
90 | NEXT(1, 2); |
91 | } |
92 | |
93 | return 0; |
94 | } |
95 | |
96 | DECODER(gb2312) |
97 | { |
98 | while (inleft > 0) { |
99 | unsigned char c = **inbuf; |
100 | Py_UCS4 decoded; |
101 | |
102 | if (c < 0x80) { |
103 | OUTCHAR(c); |
104 | NEXT_IN(1); |
105 | continue; |
106 | } |
107 | |
108 | REQUIRE_INBUF(2); |
109 | if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) { |
110 | OUTCHAR(decoded); |
111 | NEXT_IN(2); |
112 | } |
113 | else |
114 | return 1; |
115 | } |
116 | |
117 | return 0; |
118 | } |
119 | |
120 | |
121 | /* |
122 | * GBK codec |
123 | */ |
124 | |
125 | ENCODER(gbk) |
126 | { |
127 | while (*inpos < inlen) { |
128 | Py_UCS4 c = INCHAR1; |
129 | DBCHAR code; |
130 | |
131 | if (c < 0x80) { |
132 | WRITEBYTE1((unsigned char)c); |
133 | NEXT(1, 1); |
134 | continue; |
135 | } |
136 | |
137 | if (c > 0xFFFF) |
138 | return 1; |
139 | |
140 | REQUIRE_OUTBUF(2); |
141 | |
142 | GBK_ENCODE(c, code) |
143 | else |
144 | return 1; |
145 | |
146 | OUTBYTE1((code >> 8) | 0x80); |
147 | if (code & 0x8000) |
148 | OUTBYTE2((code & 0xFF)); /* MSB set: GBK */ |
149 | else |
150 | OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */ |
151 | NEXT(1, 2); |
152 | } |
153 | |
154 | return 0; |
155 | } |
156 | |
157 | DECODER(gbk) |
158 | { |
159 | while (inleft > 0) { |
160 | unsigned char c = INBYTE1; |
161 | Py_UCS4 decoded; |
162 | |
163 | if (c < 0x80) { |
164 | OUTCHAR(c); |
165 | NEXT_IN(1); |
166 | continue; |
167 | } |
168 | |
169 | REQUIRE_INBUF(2); |
170 | |
171 | GBK_DECODE(c, INBYTE2, writer) |
172 | else |
173 | return 1; |
174 | |
175 | NEXT_IN(2); |
176 | } |
177 | |
178 | return 0; |
179 | } |
180 | |
181 | |
182 | /* |
183 | * GB18030 codec |
184 | */ |
185 | |
186 | ENCODER(gb18030) |
187 | { |
188 | while (*inpos < inlen) { |
189 | Py_UCS4 c = INCHAR1; |
190 | DBCHAR code; |
191 | |
192 | if (c < 0x80) { |
193 | WRITEBYTE1(c); |
194 | NEXT(1, 1); |
195 | continue; |
196 | } |
197 | |
198 | if (c >= 0x10000) { |
199 | Py_UCS4 tc = c - 0x10000; |
200 | assert (c <= 0x10FFFF); |
201 | |
202 | REQUIRE_OUTBUF(4); |
203 | |
204 | OUTBYTE4((unsigned char)(tc % 10) + 0x30); |
205 | tc /= 10; |
206 | OUTBYTE3((unsigned char)(tc % 126) + 0x81); |
207 | tc /= 126; |
208 | OUTBYTE2((unsigned char)(tc % 10) + 0x30); |
209 | tc /= 10; |
210 | OUTBYTE1((unsigned char)(tc + 0x90)); |
211 | |
212 | NEXT(1, 4); |
213 | continue; |
214 | } |
215 | |
216 | REQUIRE_OUTBUF(2); |
217 | |
218 | GBK_ENCODE(c, code) |
219 | else if (TRYMAP_ENC(gb18030ext, code, c)) |
220 | ; |
221 | else { |
222 | const struct _gb18030_to_unibmp_ranges *utrrange; |
223 | |
224 | REQUIRE_OUTBUF(4); |
225 | |
226 | for (utrrange = gb18030_to_unibmp_ranges; |
227 | utrrange->first != 0; |
228 | utrrange++) |
229 | if (utrrange->first <= c && |
230 | c <= utrrange->last) { |
231 | Py_UCS4 tc; |
232 | |
233 | tc = c - utrrange->first + |
234 | utrrange->base; |
235 | |
236 | OUTBYTE4((unsigned char)(tc % 10) + 0x30); |
237 | tc /= 10; |
238 | OUTBYTE3((unsigned char)(tc % 126) + 0x81); |
239 | tc /= 126; |
240 | OUTBYTE2((unsigned char)(tc % 10) + 0x30); |
241 | tc /= 10; |
242 | OUTBYTE1((unsigned char)tc + 0x81); |
243 | |
244 | NEXT(1, 4); |
245 | break; |
246 | } |
247 | |
248 | if (utrrange->first == 0) |
249 | return 1; |
250 | continue; |
251 | } |
252 | |
253 | OUTBYTE1((code >> 8) | 0x80); |
254 | if (code & 0x8000) |
255 | OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */ |
256 | else |
257 | OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */ |
258 | |
259 | NEXT(1, 2); |
260 | } |
261 | |
262 | return 0; |
263 | } |
264 | |
265 | DECODER(gb18030) |
266 | { |
267 | while (inleft > 0) { |
268 | unsigned char c = INBYTE1, c2; |
269 | Py_UCS4 decoded; |
270 | |
271 | if (c < 0x80) { |
272 | OUTCHAR(c); |
273 | NEXT_IN(1); |
274 | continue; |
275 | } |
276 | |
277 | REQUIRE_INBUF(2); |
278 | |
279 | c2 = INBYTE2; |
280 | if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ |
281 | const struct _gb18030_to_unibmp_ranges *utr; |
282 | unsigned char c3, c4; |
283 | Py_UCS4 lseq; |
284 | |
285 | REQUIRE_INBUF(4); |
286 | c3 = INBYTE3; |
287 | c4 = INBYTE4; |
288 | if (c < 0x81 || c > 0xFE || |
289 | c3 < 0x81 || c3 > 0xFE || |
290 | c4 < 0x30 || c4 > 0x39) |
291 | return 1; |
292 | c -= 0x81; c2 -= 0x30; |
293 | c3 -= 0x81; c4 -= 0x30; |
294 | |
295 | if (c < 4) { /* U+0080 - U+FFFF */ |
296 | lseq = ((Py_UCS4)c * 10 + c2) * 1260 + |
297 | (Py_UCS4)c3 * 10 + c4; |
298 | if (lseq < 39420) { |
299 | for (utr = gb18030_to_unibmp_ranges; |
300 | lseq >= (utr + 1)->base; |
301 | utr++) ; |
302 | OUTCHAR(utr->first - utr->base + lseq); |
303 | NEXT_IN(4); |
304 | continue; |
305 | } |
306 | } |
307 | else if (c >= 15) { /* U+10000 - U+10FFFF */ |
308 | lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2) |
309 | * 1260 + (Py_UCS4)c3 * 10 + c4; |
310 | if (lseq <= 0x10FFFF) { |
311 | OUTCHAR(lseq); |
312 | NEXT_IN(4); |
313 | continue; |
314 | } |
315 | } |
316 | return 1; |
317 | } |
318 | |
319 | GBK_DECODE(c, c2, writer) |
320 | else if (TRYMAP_DEC(gb18030ext, decoded, c, c2)) |
321 | OUTCHAR(decoded); |
322 | else |
323 | return 1; |
324 | |
325 | NEXT_IN(2); |
326 | } |
327 | |
328 | return 0; |
329 | } |
330 | |
331 | |
332 | /* |
333 | * HZ codec |
334 | */ |
335 | |
336 | ENCODER_INIT(hz) |
337 | { |
338 | state->c[CN_STATE_OFFSET] = 0; |
339 | return 0; |
340 | } |
341 | |
342 | ENCODER_RESET(hz) |
343 | { |
344 | if (state->c[CN_STATE_OFFSET] != 0) { |
345 | WRITEBYTE2('~', '}'); |
346 | state->c[CN_STATE_OFFSET] = 0; |
347 | NEXT_OUT(2); |
348 | } |
349 | return 0; |
350 | } |
351 | |
352 | ENCODER(hz) |
353 | { |
354 | while (*inpos < inlen) { |
355 | Py_UCS4 c = INCHAR1; |
356 | DBCHAR code; |
357 | |
358 | if (c < 0x80) { |
359 | if (state->c[CN_STATE_OFFSET]) { |
360 | WRITEBYTE2('~', '}'); |
361 | NEXT_OUT(2); |
362 | state->c[CN_STATE_OFFSET] = 0; |
363 | } |
364 | WRITEBYTE1((unsigned char)c); |
365 | NEXT(1, 1); |
366 | if (c == '~') { |
367 | WRITEBYTE1('~'); |
368 | NEXT_OUT(1); |
369 | } |
370 | continue; |
371 | } |
372 | |
373 | if (c > 0xFFFF) |
374 | return 1; |
375 | |
376 | if (TRYMAP_ENC(gbcommon, code, c)) |
377 | ; |
378 | else |
379 | return 1; |
380 | |
381 | if (code & 0x8000) /* MSB set: GBK */ |
382 | return 1; |
383 | |
384 | if (state->c[CN_STATE_OFFSET] == 0) { |
385 | WRITEBYTE4('~', '{', code >> 8, code & 0xff); |
386 | NEXT(1, 4); |
387 | state->c[CN_STATE_OFFSET] = 1; |
388 | } |
389 | else { |
390 | WRITEBYTE2(code >> 8, code & 0xff); |
391 | NEXT(1, 2); |
392 | } |
393 | } |
394 | |
395 | return 0; |
396 | } |
397 | |
398 | DECODER_INIT(hz) |
399 | { |
400 | state->c[CN_STATE_OFFSET] = 0; |
401 | return 0; |
402 | } |
403 | |
404 | DECODER_RESET(hz) |
405 | { |
406 | state->c[CN_STATE_OFFSET] = 0; |
407 | return 0; |
408 | } |
409 | |
410 | DECODER(hz) |
411 | { |
412 | while (inleft > 0) { |
413 | unsigned char c = INBYTE1; |
414 | Py_UCS4 decoded; |
415 | |
416 | if (c == '~') { |
417 | unsigned char c2 = INBYTE2; |
418 | |
419 | REQUIRE_INBUF(2); |
420 | if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0) |
421 | OUTCHAR('~'); |
422 | else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0) |
423 | state->c[CN_STATE_OFFSET] = 1; /* set GB */ |
424 | else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0) |
425 | ; /* line-continuation */ |
426 | else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1) |
427 | state->c[CN_STATE_OFFSET] = 0; /* set ASCII */ |
428 | else |
429 | return 1; |
430 | NEXT_IN(2); |
431 | continue; |
432 | } |
433 | |
434 | if (c & 0x80) |
435 | return 1; |
436 | |
437 | if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */ |
438 | OUTCHAR(c); |
439 | NEXT_IN(1); |
440 | } |
441 | else { /* GB mode */ |
442 | REQUIRE_INBUF(2); |
443 | if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) { |
444 | OUTCHAR(decoded); |
445 | NEXT_IN(2); |
446 | } |
447 | else |
448 | return 1; |
449 | } |
450 | } |
451 | |
452 | return 0; |
453 | } |
454 | |
455 | |
456 | BEGIN_MAPPINGS_LIST |
457 | MAPPING_DECONLY(gb2312) |
458 | MAPPING_DECONLY(gbkext) |
459 | MAPPING_ENCONLY(gbcommon) |
460 | MAPPING_ENCDEC(gb18030ext) |
461 | END_MAPPINGS_LIST |
462 | |
463 | BEGIN_CODECS_LIST |
464 | CODEC_STATELESS(gb2312) |
465 | CODEC_STATELESS(gbk) |
466 | CODEC_STATELESS(gb18030) |
467 | CODEC_STATEFUL(hz) |
468 | END_CODECS_LIST |
469 | |
470 | I_AM_A_MODULE_FOR(cn) |
471 | |