/*
 * _codecs_kr.c: Codecs collection for Korean encodings
 *
 * Written by Hye-Shik Chang <[email protected]>
 */

#include "cjkcodecs.h"
#include "mappings_kr.h"

/*
 * EUC-KR codec
 */

/* First byte of every two-byte unit in an EUC-KR make-up sequence, as
 * written by the euc_kr encoder and checked by the euc_kr decoder. */
#define EUCKR_JAMO_FIRSTBYTE    0xA4
/* Second byte of the unit that opens a make-up sequence; it is also the
 * value stored at index 0 of u2cgk_jongseong. */
#define EUCKR_JAMO_FILLER       0xD4

/* Second byte written by the euc_kr encoder for the first component of a
 * syllable; indexed by (c - 0xac00) / 588. */
static const unsigned char u2cgk_choseong[19] = {
    0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
    0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
    0xbc, 0xbd, 0xbe
};
/* Second byte written for the middle component; indexed by
 * ((c - 0xac00) / 28) % 21.  The values are the contiguous run 0xbf..0xd3. */
static const unsigned char u2cgk_jungseong[21] = {
    0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
    0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
    0xcf, 0xd0, 0xd1, 0xd2, 0xd3
};
/* Second byte written for the last component; indexed by (c - 0xac00) % 28.
 * Index 0 yields 0xd4, i.e. EUCKR_JAMO_FILLER. */
static const unsigned char u2cgk_jongseong[28] = {
    0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
    0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
    0xbb, 0xbc, 0xbd, 0xbe
};

34ENCODER(euc_kr)
35{
36 while (*inpos < inlen) {
37 Py_UCS4 c = INCHAR1;
38 DBCHAR code;
39
40 if (c < 0x80) {
41 WRITEBYTE1((unsigned char)c);
42 NEXT(1, 1);
43 continue;
44 }
45
46 if (c > 0xFFFF)
47 return 1;
48
49 REQUIRE_OUTBUF(2);
50 if (TRYMAP_ENC(cp949, code, c))
51 ;
52 else
53 return 1;
54
55 if ((code & 0x8000) == 0) {
56 /* KS X 1001 coded character */
57 OUTBYTE1((code >> 8) | 0x80);
58 OUTBYTE2((code & 0xFF) | 0x80);
59 NEXT(1, 2);
60 }
61 else {
62 /* Mapping is found in CP949 extension,
63 but we encode it in KS X 1001:1998 Annex 3,
64 make-up sequence for EUC-KR. */
65
66 REQUIRE_OUTBUF(8);
67
68 /* syllable composition precedence */
69 OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
70 OUTBYTE2(EUCKR_JAMO_FILLER);
71
72 /* All code points in CP949 extension are in unicode
73 * Hangul Syllable area. */
74 assert(0xac00 <= c && c <= 0xd7a3);
75 c -= 0xac00;
76
77 OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
78 OUTBYTE4(u2cgk_choseong[c / 588]);
79 NEXT_OUT(4);
80
81 OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
82 OUTBYTE2(u2cgk_jungseong[(c / 28) % 21]);
83 OUTBYTE3(EUCKR_JAMO_FIRSTBYTE);
84 OUTBYTE4(u2cgk_jongseong[c % 28]);
85 NEXT(1, 4);
86 }
87 }
88
89 return 0;
90}
91
/* Marker stored in the cgk2u_* tables for bytes with no component index;
 * the euc_kr decoder rejects input that resolves to it. */
#define NONE    127

/* Inverse of u2cgk_choseong: maps (byte - 0xa1), for bytes in [0xa1, 0xbe],
 * to the first-component index 0..18, or NONE. */
static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
       0,    1, NONE,    2, NONE, NONE,    3,    4,
       5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
       6,    7,    8, NONE,    9,   10,   11,   12,
      13,   14,   15,   16,   17,   18
};
/* Inverse of u2cgk_jongseong for bytes in [0xa1, 0xbe]: yields the
 * last-component index 1..27, or NONE.  Index 0 is not in this table; the
 * decoder handles the filler byte separately. */
static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
       1,    2,    3,    4,    5,    6,    7, NONE,
       8,    9,   10,   11,   12,   13,   14,   15,
      16,   17, NONE,   18,   19,   20,   21,   22,
    NONE,   23,   24,   25,   26,   27
};

107DECODER(euc_kr)
108{
109 while (inleft > 0) {
110 unsigned char c = INBYTE1;
111 Py_UCS4 decoded;
112
113 if (c < 0x80) {
114 OUTCHAR(c);
115 NEXT_IN(1);
116 continue;
117 }
118
119 REQUIRE_INBUF(2);
120
121 if (c == EUCKR_JAMO_FIRSTBYTE &&
122 INBYTE2 == EUCKR_JAMO_FILLER) {
123 /* KS X 1001:1998 Annex 3 make-up sequence */
124 DBCHAR cho, jung, jong;
125
126 REQUIRE_INBUF(8);
127 if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
128 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
129 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
130 return 1;
131
132 c = (*inbuf)[3];
133 if (0xa1 <= c && c <= 0xbe)
134 cho = cgk2u_choseong[c - 0xa1];
135 else
136 cho = NONE;
137
138 c = (*inbuf)[5];
139 jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
140
141 c = (*inbuf)[7];
142 if (c == EUCKR_JAMO_FILLER)
143 jong = 0;
144 else if (0xa1 <= c && c <= 0xbe)
145 jong = cgk2u_jongseong[c - 0xa1];
146 else
147 jong = NONE;
148
149 if (cho == NONE || jung == NONE || jong == NONE)
150 return 1;
151
152 OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
153 NEXT_IN(8);
154 }
155 else if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
156 OUTCHAR(decoded);
157 NEXT_IN(2);
158 }
159 else
160 return 1;
161 }
162
163 return 0;
164}
165#undef NONE
166
167
/*
 * CP949 codec
 */

172ENCODER(cp949)
173{
174 while (*inpos < inlen) {
175 Py_UCS4 c = INCHAR1;
176 DBCHAR code;
177
178 if (c < 0x80) {
179 WRITEBYTE1((unsigned char)c);
180 NEXT(1, 1);
181 continue;
182 }
183
184 if (c > 0xFFFF)
185 return 1;
186
187 REQUIRE_OUTBUF(2);
188 if (TRYMAP_ENC(cp949, code, c))
189 ;
190 else
191 return 1;
192
193 OUTBYTE1((code >> 8) | 0x80);
194 if (code & 0x8000)
195 OUTBYTE2(code & 0xFF); /* MSB set: CP949 */
196 else
197 OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: ks x 1001 */
198 NEXT(1, 2);
199 }
200
201 return 0;
202}
203
204DECODER(cp949)
205{
206 while (inleft > 0) {
207 unsigned char c = INBYTE1;
208 Py_UCS4 decoded;
209
210 if (c < 0x80) {
211 OUTCHAR(c);
212 NEXT_IN(1);
213 continue;
214 }
215
216 REQUIRE_INBUF(2);
217 if (TRYMAP_DEC(ksx1001, decoded, c ^ 0x80, INBYTE2 ^ 0x80))
218 OUTCHAR(decoded);
219 else if (TRYMAP_DEC(cp949ext, decoded, c, INBYTE2))
220 OUTCHAR(decoded);
221 else
222 return 1;
223
224 NEXT_IN(2);
225 }
226
227 return 0;
228}
229
230
/*
 * JOHAB codec
 */

/* 5-bit field values used by the johab encoder to build a syllable code.
 * Each table is declared with 32 slots but only the leading entries are
 * initialised (19, 21 and 28 respectively); the rest are zero. */

/* Indexed by (c - 0xac00) / 588; shifted left by 10 in the code. */
static const unsigned char u2johabidx_choseong[32] = {
                0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14,
};
/* Indexed by ((c - 0xac00) / 28) % 21; shifted left by 5 in the code.
 * The values skip 0x08-0x09, 0x10-0x11 and 0x18-0x19. */
static const unsigned char u2johabidx_jungseong[32] = {
                      0x03, 0x04, 0x05, 0x06, 0x07,
                0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
                0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
                0x1a, 0x1b, 0x1c, 0x1d,
};
/* Indexed by (c - 0xac00) % 28; occupies the low 5 bits of the code.
 * The values skip 0x12. */
static const unsigned char u2johabidx_jongseong[32] = {
          0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11,       0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
};
252static const DBCHAR u2johabjamo[] = {
253 0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
254 0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
255 0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
256 0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
257 0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
258 0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
259 0x8741, 0x8761, 0x8781, 0x87a1,
260};
261
262ENCODER(johab)
263{
264 while (*inpos < inlen) {
265 Py_UCS4 c = INCHAR1;
266 DBCHAR code;
267
268 if (c < 0x80) {
269 WRITEBYTE1((unsigned char)c);
270 NEXT(1, 1);
271 continue;
272 }
273
274 if (c > 0xFFFF)
275 return 1;
276
277 REQUIRE_OUTBUF(2);
278
279 if (c >= 0xac00 && c <= 0xd7a3) {
280 c -= 0xac00;
281 code = 0x8000 |
282 (u2johabidx_choseong[c / 588] << 10) |
283 (u2johabidx_jungseong[(c / 28) % 21] << 5) |
284 u2johabidx_jongseong[c % 28];
285 }
286 else if (c >= 0x3131 && c <= 0x3163)
287 code = u2johabjamo[c - 0x3131];
288 else if (TRYMAP_ENC(cp949, code, c)) {
289 unsigned char c1, c2, t2;
290 unsigned short t1;
291
292 assert((code & 0x8000) == 0);
293 c1 = code >> 8;
294 c2 = code & 0xff;
295 if (((c1 >= 0x21 && c1 <= 0x2c) ||
296 (c1 >= 0x4a && c1 <= 0x7d)) &&
297 (c2 >= 0x21 && c2 <= 0x7e)) {
298 t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
299 (c1 - 0x21 + 0x197));
300 t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
301 OUTBYTE1(t1 >> 1);
302 OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43);
303 NEXT(1, 2);
304 continue;
305 }
306 else
307 return 1;
308 }
309 else
310 return 1;
311
312 OUTBYTE1(code >> 8);
313 OUTBYTE2(code & 0xff);
314 NEXT(1, 2);
315 }
316
317 return 0;
318}
319
/* Markers used in the johab decode tables below.  FILL marks the field
 * value the decoder treats as "component absent"; NONE marks field values
 * the decoder rejects. */
#define FILL 0xfd
#define NONE 0xff

/* johabidx_*: map a 5-bit field of a johab code to the component index
 * used in 0xac00 + cho*588 + jung*28 + jong, or to FILL / NONE. */
static const unsigned char johabidx_choseong[32] = {
    NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    0x0e, 0x0f, 0x10, 0x11, 0x12, NONE, NONE, NONE,
    NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
static const unsigned char johabidx_jungseong[32] = {
    NONE, NONE, FILL, 0x00, 0x01, 0x02, 0x03, 0x04,
    NONE, NONE, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
    NONE, NONE, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    NONE, NONE, 0x11, 0x12, 0x13, 0x14, NONE, NONE,
};
static const unsigned char johabidx_jongseong[32] = {
    NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, NONE, 0x11, 0x12, 0x13, 0x14, 0x15,
    0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, NONE, NONE,
};

/* johabjamo_*: map the same 5-bit fields to the low byte that the decoder
 * ORs with 0x3100 when a code carries only a single component. */
static const unsigned char johabjamo_choseong[32] = {
    NONE, FILL, 0x31, 0x32, 0x34, 0x37, 0x38, 0x39,
    0x41, 0x42, 0x43, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE, NONE,
    NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
static const unsigned char johabjamo_jungseong[32] = {
    NONE, NONE, FILL, 0x4f, 0x50, 0x51, 0x52, 0x53,
    NONE, NONE, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    NONE, NONE, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    NONE, NONE, 0x60, 0x61, 0x62, 0x63, NONE, NONE,
};
static const unsigned char johabjamo_jongseong[32] = {
    NONE, FILL, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
    0x37, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, NONE, 0x42, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, NONE, NONE,
};

361DECODER(johab)
362{
363 while (inleft > 0) {
364 unsigned char c = INBYTE1, c2;
365 Py_UCS4 decoded;
366
367 if (c < 0x80) {
368 OUTCHAR(c);
369 NEXT_IN(1);
370 continue;
371 }
372
373 REQUIRE_INBUF(2);
374 c2 = INBYTE2;
375
376 if (c < 0xd8) {
377 /* johab hangul */
378 unsigned char c_cho, c_jung, c_jong;
379 unsigned char i_cho, i_jung, i_jong;
380
381 c_cho = (c >> 2) & 0x1f;
382 c_jung = ((c << 3) | c2 >> 5) & 0x1f;
383 c_jong = c2 & 0x1f;
384
385 i_cho = johabidx_choseong[c_cho];
386 i_jung = johabidx_jungseong[c_jung];
387 i_jong = johabidx_jongseong[c_jong];
388
389 if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
390 return 1;
391
392 /* we don't use U+1100 hangul jamo yet. */
393 if (i_cho == FILL) {
394 if (i_jung == FILL) {
395 if (i_jong == FILL)
396 OUTCHAR(0x3000);
397 else
398 OUTCHAR(0x3100 |
399 johabjamo_jongseong[c_jong]);
400 }
401 else {
402 if (i_jong == FILL)
403 OUTCHAR(0x3100 |
404 johabjamo_jungseong[c_jung]);
405 else
406 return 1;
407 }
408 } else {
409 if (i_jung == FILL) {
410 if (i_jong == FILL)
411 OUTCHAR(0x3100 |
412 johabjamo_choseong[c_cho]);
413 else
414 return 1;
415 }
416 else
417 OUTCHAR(0xac00 +
418 i_cho * 588 +
419 i_jung * 28 +
420 (i_jong == FILL ? 0 : i_jong));
421 }
422 NEXT_IN(2);
423 } else {
424 /* KS X 1001 except hangul jamos and syllables */
425 if (c == 0xdf || c > 0xf9 ||
426 c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
427 (c2 & 0x7f) == 0x7f ||
428 (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
429 return 1;
430 else {
431 unsigned char t1, t2;
432
433 t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
434 2 * c - 0x197);
435 t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
436 t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
437 t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
438
439 if (TRYMAP_DEC(ksx1001, decoded, t1, t2)) {
440 OUTCHAR(decoded);
441 NEXT_IN(2);
442 }
443 else {
444 return 1;
445 }
446 }
447 }
448 }
449
450 return 0;
451}
452#undef NONE
453#undef FILL
454
455
456BEGIN_MAPPINGS_LIST
457 MAPPING_DECONLY(ksx1001)
458 MAPPING_ENCONLY(cp949)
459 MAPPING_DECONLY(cp949ext)
460END_MAPPINGS_LIST
461
462BEGIN_CODECS_LIST
463 CODEC_STATELESS(euc_kr)
464 CODEC_STATELESS(cp949)
465 CODEC_STATELESS(johab)
466END_CODECS_LIST
467
468I_AM_A_MODULE_FOR(kr)
469