1 | /* |
2 | * _codecs_hk.c: Codecs collection for encodings from Hong Kong |
3 | * |
4 | * Written by Hye-Shik Chang <[email protected]> |
5 | */ |
6 | |
7 | #define USING_IMPORTED_MAPS |
8 | |
9 | #include "cjkcodecs.h" |
10 | #include "mappings_hk.h" |
11 | |
12 | /* |
13 | * BIG5HKSCS codec |
14 | */ |
15 | |
16 | static const encode_map *big5_encmap = NULL; |
17 | static const decode_map *big5_decmap = NULL; |
18 | |
19 | CODEC_INIT(big5hkscs) |
20 | { |
21 | static int initialized = 0; |
22 | |
23 | if (!initialized && IMPORT_MAP(tw, big5, &big5_encmap, &big5_decmap)) |
24 | return -1; |
25 | initialized = 1; |
26 | return 0; |
27 | } |
28 | |
29 | /* |
 * HKSCS 2004 defines four Unicode pair -> big5hkscs mappings:
31 | * U+00CA U+0304 -> 8862 (U+00CA alone is mapped to 8866) |
32 | * U+00CA U+030C -> 8864 |
33 | * U+00EA U+0304 -> 88a3 (U+00EA alone is mapped to 88a7) |
34 | * U+00EA U+030C -> 88a5 |
 * These are handled not by mapping tables but by hand-written code.
36 | */ |
37 | static const DBCHAR big5hkscs_pairenc_table[4] = {0x8862, 0x8864, 0x88a3, 0x88a5}; |
38 | |
39 | ENCODER(big5hkscs) |
40 | { |
41 | while (*inpos < inlen) { |
42 | Py_UCS4 c = INCHAR1; |
43 | DBCHAR code; |
44 | Py_ssize_t insize; |
45 | |
46 | if (c < 0x80) { |
47 | REQUIRE_OUTBUF(1); |
48 | **outbuf = (unsigned char)c; |
49 | NEXT(1, 1); |
50 | continue; |
51 | } |
52 | |
53 | insize = 1; |
54 | REQUIRE_OUTBUF(2); |
55 | |
56 | if (c < 0x10000) { |
57 | if (TRYMAP_ENC(big5hkscs_bmp, code, c)) { |
58 | if (code == MULTIC) { |
59 | Py_UCS4 c2; |
60 | if (inlen - *inpos >= 2) |
61 | c2 = INCHAR2; |
62 | else |
63 | c2 = 0; |
64 | |
65 | if (inlen - *inpos >= 2 && |
66 | ((c & 0xffdf) == 0x00ca) && |
67 | ((c2 & 0xfff7) == 0x0304)) { |
68 | code = big5hkscs_pairenc_table[ |
69 | ((c >> 4) | |
70 | (c2 >> 3)) & 3]; |
71 | insize = 2; |
72 | } |
73 | else if (inlen - *inpos < 2 && |
74 | !(flags & MBENC_FLUSH)) |
75 | return MBERR_TOOFEW; |
76 | else { |
77 | if (c == 0xca) |
78 | code = 0x8866; |
79 | else /* c == 0xea */ |
80 | code = 0x88a7; |
81 | } |
82 | } |
83 | } |
84 | else if (TRYMAP_ENC(big5, code, c)) |
85 | ; |
86 | else |
87 | return 1; |
88 | } |
89 | else if (c < 0x20000) |
90 | return insize; |
91 | else if (c < 0x30000) { |
92 | if (TRYMAP_ENC(big5hkscs_nonbmp, code, c & 0xffff)) |
93 | ; |
94 | else |
95 | return insize; |
96 | } |
97 | else |
98 | return insize; |
99 | |
100 | OUTBYTE1(code >> 8); |
101 | OUTBYTE2(code & 0xFF); |
102 | NEXT(insize, 2); |
103 | } |
104 | |
105 | return 0; |
106 | } |
107 | |
/*
 * BH2S(c1, c2): linear index of the two-byte code (c1, c2), counting rows
 * from lead byte 0x87 with one column per trail byte in 0x40..0xfe.
 * BH2S(0x87, 0x40) is 0.
 */
#define BH2S(c1, c2) \
    ((((c1) - 0x87) * (0xfe - 0x40 + 1)) + ((c2) - 0x40))
109 | |
110 | DECODER(big5hkscs) |
111 | { |
112 | while (inleft > 0) { |
113 | unsigned char c = INBYTE1; |
114 | Py_UCS4 decoded; |
115 | |
116 | if (c < 0x80) { |
117 | OUTCHAR(c); |
118 | NEXT_IN(1); |
119 | continue; |
120 | } |
121 | |
122 | REQUIRE_INBUF(2); |
123 | |
124 | if (0xc6 > c || c > 0xc8 || (c < 0xc7 && INBYTE2 < 0xa1)) { |
125 | if (TRYMAP_DEC(big5, decoded, c, INBYTE2)) { |
126 | OUTCHAR(decoded); |
127 | NEXT_IN(2); |
128 | continue; |
129 | } |
130 | } |
131 | |
132 | if (TRYMAP_DEC(big5hkscs, decoded, c, INBYTE2)) |
133 | { |
134 | int s = BH2S(c, INBYTE2); |
135 | const unsigned char *hintbase; |
136 | |
137 | assert(0x87 <= c && c <= 0xfe); |
138 | assert(0x40 <= INBYTE2 && INBYTE2 <= 0xfe); |
139 | |
140 | if (BH2S(0x87, 0x40) <= s && s <= BH2S(0xa0, 0xfe)) { |
141 | hintbase = big5hkscs_phint_0; |
142 | s -= BH2S(0x87, 0x40); |
143 | } |
144 | else if (BH2S(0xc6,0xa1) <= s && s <= BH2S(0xc8,0xfe)){ |
145 | hintbase = big5hkscs_phint_12130; |
146 | s -= BH2S(0xc6, 0xa1); |
147 | } |
148 | else if (BH2S(0xf9,0xd6) <= s && s <= BH2S(0xfe,0xfe)){ |
149 | hintbase = big5hkscs_phint_21924; |
150 | s -= BH2S(0xf9, 0xd6); |
151 | } |
152 | else |
153 | return MBERR_INTERNAL; |
154 | |
155 | if (hintbase[s >> 3] & (1 << (s & 7))) { |
156 | OUTCHAR(decoded | 0x20000); |
157 | NEXT_IN(2); |
158 | } |
159 | else { |
160 | OUTCHAR(decoded); |
161 | NEXT_IN(2); |
162 | } |
163 | continue; |
164 | } |
165 | |
166 | switch ((c << 8) | INBYTE2) { |
167 | case 0x8862: OUTCHAR2(0x00ca, 0x0304); break; |
168 | case 0x8864: OUTCHAR2(0x00ca, 0x030c); break; |
169 | case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break; |
170 | case 0x88a5: OUTCHAR2(0x00ea, 0x030c); break; |
171 | default: return 1; |
172 | } |
173 | |
174 | NEXT_IN(2); /* all decoded code points are pairs, above. */ |
175 | } |
176 | |
177 | return 0; |
178 | } |
179 | |
180 | |
/*
 * Mapping tables listed for this module.  Within this file `big5hkscs` is
 * only consulted through TRYMAP_DEC, while `big5hkscs_bmp` and
 * `big5hkscs_nonbmp` are only consulted through TRYMAP_ENC, matching the
 * DECONLY / ENCONLY entries below.
 * NOTE(review): presumably this list is what other codec modules can import
 * from -- confirm against cjkcodecs.h.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(big5hkscs)
  MAPPING_ENCONLY(big5hkscs_bmp)
  MAPPING_ENCONLY(big5hkscs_nonbmp)
END_MAPPINGS_LIST
186 | |
/*
 * Codecs listed for this module: big5hkscs only.
 * NOTE(review): CODEC_STATELESS_WINIT presumably registers a stateless codec
 * together with its CODEC_INIT hook -- confirm against cjkcodecs.h.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS_WINIT(big5hkscs)
END_CODECS_LIST
190 | |
/*
 * NOTE(review): presumably expands to the extension-module boilerplate for
 * the "hk" codec collection -- confirm against cjkcodecs.h.
 */
I_AM_A_MODULE_FOR(hk)
192 | |