1 | /* ------------------------------------------------------------------------ |
2 | |
3 | unicodedata -- Provides access to the Unicode database. |
4 | |
5 | Data was extracted from the UnicodeData.txt file. |
6 | The current version number is reported in the unidata_version constant. |
7 | |
8 | Written by Marc-Andre Lemburg ([email protected]). |
9 | Modified for Python 2.0 by Fredrik Lundh ([email protected]) |
10 | Modified by Martin v. Löwis ([email protected]) |
11 | |
12 | Copyright (c) Corporation for National Research Initiatives. |
13 | |
14 | ------------------------------------------------------------------------ */ |
15 | |
16 | #define PY_SSIZE_T_CLEAN |
17 | |
18 | #include "Python.h" |
19 | #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI |
20 | #include "structmember.h" // PyMemberDef |
21 | |
22 | #include <stdbool.h> |
23 | |
24 | _Py_IDENTIFIER(NFC); |
25 | _Py_IDENTIFIER(NFD); |
26 | _Py_IDENTIFIER(NFKC); |
27 | _Py_IDENTIFIER(NFKD); |
28 | |
29 | /*[clinic input] |
30 | module unicodedata |
31 | class unicodedata.UCD 'PreviousDBVersion *' '<not used>' |
32 | [clinic start generated code]*/ |
33 | /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/ |
34 | |
35 | /* character properties */ |
36 | |
/* Per-code-point property record.  One byte per field keeps the generated
   table (_PyUnicode_Database_Records in unicodedata_db.h) compact; string
   properties are stored as indexes into name tables. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
48 | |
/* Differences between the current Unicode database and an older snapshot
   (e.g. UCD 3.2.0).  Each *_changed field holds the OLD value for code
   points whose property differs; the sentinel 0xFF (or -1.0 for
   numeric_changed) means "unchanged".  category_changed == 0 means the
   code point was unassigned in the old version.  Records are generated by
   Tools/unicode/makeunicodedata.py (merge_old_version). */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;            /* old bidir index, or 0xFF */
    const unsigned char category_changed;         /* old category index; 0 == unassigned */
    const unsigned char decimal_changed;          /* old decimal value, or 0xFF */
    const unsigned char mirrored_changed;         /* old mirrored flag, or 0xFF */
    const unsigned char east_asian_width_changed; /* old width index, or 0xFF */
    const double numeric_changed;                 /* old numeric value, or -1.0 */
} change_record;
58 | |
59 | /* data file generated by Tools/unicode/makeunicodedata.py */ |
60 | #include "unicodedata_db.h" |
61 | |
62 | static const _PyUnicode_DatabaseRecord* |
63 | _getrecord_ex(Py_UCS4 code) |
64 | { |
65 | int index; |
66 | if (code >= 0x110000) |
67 | index = 0; |
68 | else { |
69 | index = index1[(code>>SHIFT)]; |
70 | index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; |
71 | } |
72 | |
73 | return &_PyUnicode_Database_Records[index]; |
74 | } |
75 | |
76 | /* ------------- Previous-version API ------------------------------------- */ |
/* A UCD object exposing an older snapshot of the Unicode database.
   `getrecord` yields the change_record (diffs vs. the current database)
   for a code point; `normalization` yields a replacement code point used
   during normalization, with 0 meaning "no override" (see nfd_nfkd). */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                            /* version string, e.g. "3.2.0" */
    const change_record* (*getrecord)(Py_UCS4);  /* per-code-point diffs */
    Py_UCS4 (*normalization)(Py_UCS4);           /* normalization overrides */
} PreviousDBVersion;
83 | |
84 | #include "clinic/unicodedata.c.h" |
85 | |
/* Fetch the change_record for code point v from an old-version UCD object
   (self must satisfy UCD_Check). */
#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))

/* Attribute table for UCD objects: exposes the snapshot's version string
   as the read-only `unidata_version` attribute. */
static PyMemberDef DB_members[] = {
    {"unidata_version" , T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

// Check if self is an unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See unicodedata_functions comment to the rationale of this macro.
#define UCD_Check(self) (self != NULL && !PyModule_Check(self))
98 | |
99 | static PyObject* |
100 | new_previous_version(PyTypeObject *ucd_type, |
101 | const char*name, const change_record* (*getrecord)(Py_UCS4), |
102 | Py_UCS4 (*normalization)(Py_UCS4)) |
103 | { |
104 | PreviousDBVersion *self; |
105 | self = PyObject_GC_New(PreviousDBVersion, ucd_type); |
106 | if (self == NULL) |
107 | return NULL; |
108 | self->name = name; |
109 | self->getrecord = getrecord; |
110 | self->normalization = normalization; |
111 | PyObject_GC_Track(self); |
112 | return (PyObject*)self; |
113 | } |
114 | |
115 | |
116 | /* --- Module API --------------------------------------------------------- */ |
117 | |
118 | /*[clinic input] |
119 | unicodedata.UCD.decimal |
120 | |
121 | self: self |
122 | chr: int(accept={str}) |
123 | default: object=NULL |
124 | / |
125 | |
126 | Converts a Unicode character into its equivalent decimal value. |
127 | |
128 | Returns the decimal value assigned to the character chr as integer. |
129 | If no such value is defined, default is returned, or, if not given, |
130 | ValueError is raised. |
131 | [clinic start generated code]*/ |
132 | |
133 | static PyObject * |
134 | unicodedata_UCD_decimal_impl(PyObject *self, int chr, |
135 | PyObject *default_value) |
136 | /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/ |
137 | { |
138 | int have_old = 0; |
139 | long rc; |
140 | Py_UCS4 c = (Py_UCS4)chr; |
141 | |
142 | if (UCD_Check(self)) { |
143 | const change_record *old = get_old_record(self, c); |
144 | if (old->category_changed == 0) { |
145 | /* unassigned */ |
146 | have_old = 1; |
147 | rc = -1; |
148 | } |
149 | else if (old->decimal_changed != 0xFF) { |
150 | have_old = 1; |
151 | rc = old->decimal_changed; |
152 | } |
153 | } |
154 | |
155 | if (!have_old) |
156 | rc = Py_UNICODE_TODECIMAL(c); |
157 | if (rc < 0) { |
158 | if (default_value == NULL) { |
159 | PyErr_SetString(PyExc_ValueError, |
160 | "not a decimal" ); |
161 | return NULL; |
162 | } |
163 | else { |
164 | Py_INCREF(default_value); |
165 | return default_value; |
166 | } |
167 | } |
168 | return PyLong_FromLong(rc); |
169 | } |
170 | |
171 | /*[clinic input] |
172 | unicodedata.UCD.digit |
173 | |
174 | self: self |
175 | chr: int(accept={str}) |
176 | default: object=NULL |
177 | / |
178 | |
179 | Converts a Unicode character into its equivalent digit value. |
180 | |
181 | Returns the digit value assigned to the character chr as integer. |
182 | If no such value is defined, default is returned, or, if not given, |
183 | ValueError is raised. |
184 | [clinic start generated code]*/ |
185 | |
186 | static PyObject * |
187 | unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value) |
188 | /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/ |
189 | { |
190 | long rc; |
191 | Py_UCS4 c = (Py_UCS4)chr; |
192 | rc = Py_UNICODE_TODIGIT(c); |
193 | if (rc < 0) { |
194 | if (default_value == NULL) { |
195 | PyErr_SetString(PyExc_ValueError, "not a digit" ); |
196 | return NULL; |
197 | } |
198 | else { |
199 | Py_INCREF(default_value); |
200 | return default_value; |
201 | } |
202 | } |
203 | return PyLong_FromLong(rc); |
204 | } |
205 | |
206 | /*[clinic input] |
207 | unicodedata.UCD.numeric |
208 | |
209 | self: self |
210 | chr: int(accept={str}) |
211 | default: object=NULL |
212 | / |
213 | |
214 | Converts a Unicode character into its equivalent numeric value. |
215 | |
216 | Returns the numeric value assigned to the character chr as float. |
217 | If no such value is defined, default is returned, or, if not given, |
218 | ValueError is raised. |
219 | [clinic start generated code]*/ |
220 | |
221 | static PyObject * |
222 | unicodedata_UCD_numeric_impl(PyObject *self, int chr, |
223 | PyObject *default_value) |
224 | /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/ |
225 | { |
226 | int have_old = 0; |
227 | double rc; |
228 | Py_UCS4 c = (Py_UCS4)chr; |
229 | |
230 | if (UCD_Check(self)) { |
231 | const change_record *old = get_old_record(self, c); |
232 | if (old->category_changed == 0) { |
233 | /* unassigned */ |
234 | have_old = 1; |
235 | rc = -1.0; |
236 | } |
237 | else if (old->decimal_changed != 0xFF) { |
238 | have_old = 1; |
239 | rc = old->decimal_changed; |
240 | } |
241 | } |
242 | |
243 | if (!have_old) |
244 | rc = Py_UNICODE_TONUMERIC(c); |
245 | if (rc == -1.0) { |
246 | if (default_value == NULL) { |
247 | PyErr_SetString(PyExc_ValueError, "not a numeric character" ); |
248 | return NULL; |
249 | } |
250 | else { |
251 | Py_INCREF(default_value); |
252 | return default_value; |
253 | } |
254 | } |
255 | return PyFloat_FromDouble(rc); |
256 | } |
257 | |
258 | /*[clinic input] |
259 | unicodedata.UCD.category |
260 | |
261 | self: self |
262 | chr: int(accept={str}) |
263 | / |
264 | |
265 | Returns the general category assigned to the character chr as string. |
266 | [clinic start generated code]*/ |
267 | |
268 | static PyObject * |
269 | unicodedata_UCD_category_impl(PyObject *self, int chr) |
270 | /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/ |
271 | { |
272 | int index; |
273 | Py_UCS4 c = (Py_UCS4)chr; |
274 | index = (int) _getrecord_ex(c)->category; |
275 | if (UCD_Check(self)) { |
276 | const change_record *old = get_old_record(self, c); |
277 | if (old->category_changed != 0xFF) |
278 | index = old->category_changed; |
279 | } |
280 | return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); |
281 | } |
282 | |
283 | /*[clinic input] |
284 | unicodedata.UCD.bidirectional |
285 | |
286 | self: self |
287 | chr: int(accept={str}) |
288 | / |
289 | |
290 | Returns the bidirectional class assigned to the character chr as string. |
291 | |
292 | If no such value is defined, an empty string is returned. |
293 | [clinic start generated code]*/ |
294 | |
295 | static PyObject * |
296 | unicodedata_UCD_bidirectional_impl(PyObject *self, int chr) |
297 | /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/ |
298 | { |
299 | int index; |
300 | Py_UCS4 c = (Py_UCS4)chr; |
301 | index = (int) _getrecord_ex(c)->bidirectional; |
302 | if (UCD_Check(self)) { |
303 | const change_record *old = get_old_record(self, c); |
304 | if (old->category_changed == 0) |
305 | index = 0; /* unassigned */ |
306 | else if (old->bidir_changed != 0xFF) |
307 | index = old->bidir_changed; |
308 | } |
309 | return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); |
310 | } |
311 | |
312 | /*[clinic input] |
313 | unicodedata.UCD.combining -> int |
314 | |
315 | self: self |
316 | chr: int(accept={str}) |
317 | / |
318 | |
319 | Returns the canonical combining class assigned to the character chr as integer. |
320 | |
321 | Returns 0 if no combining class is defined. |
322 | [clinic start generated code]*/ |
323 | |
324 | static int |
325 | unicodedata_UCD_combining_impl(PyObject *self, int chr) |
326 | /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/ |
327 | { |
328 | int index; |
329 | Py_UCS4 c = (Py_UCS4)chr; |
330 | index = (int) _getrecord_ex(c)->combining; |
331 | if (UCD_Check(self)) { |
332 | const change_record *old = get_old_record(self, c); |
333 | if (old->category_changed == 0) |
334 | index = 0; /* unassigned */ |
335 | } |
336 | return index; |
337 | } |
338 | |
339 | /*[clinic input] |
340 | unicodedata.UCD.mirrored -> int |
341 | |
342 | self: self |
343 | chr: int(accept={str}) |
344 | / |
345 | |
346 | Returns the mirrored property assigned to the character chr as integer. |
347 | |
348 | Returns 1 if the character has been identified as a "mirrored" |
349 | character in bidirectional text, 0 otherwise. |
350 | [clinic start generated code]*/ |
351 | |
352 | static int |
353 | unicodedata_UCD_mirrored_impl(PyObject *self, int chr) |
354 | /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/ |
355 | { |
356 | int index; |
357 | Py_UCS4 c = (Py_UCS4)chr; |
358 | index = (int) _getrecord_ex(c)->mirrored; |
359 | if (UCD_Check(self)) { |
360 | const change_record *old = get_old_record(self, c); |
361 | if (old->category_changed == 0) |
362 | index = 0; /* unassigned */ |
363 | else if (old->mirrored_changed != 0xFF) |
364 | index = old->mirrored_changed; |
365 | } |
366 | return index; |
367 | } |
368 | |
369 | /*[clinic input] |
370 | unicodedata.UCD.east_asian_width |
371 | |
372 | self: self |
373 | chr: int(accept={str}) |
374 | / |
375 | |
376 | Returns the east asian width assigned to the character chr as string. |
377 | [clinic start generated code]*/ |
378 | |
379 | static PyObject * |
380 | unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr) |
381 | /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/ |
382 | { |
383 | int index; |
384 | Py_UCS4 c = (Py_UCS4)chr; |
385 | index = (int) _getrecord_ex(c)->east_asian_width; |
386 | if (UCD_Check(self)) { |
387 | const change_record *old = get_old_record(self, c); |
388 | if (old->category_changed == 0) |
389 | index = 0; /* unassigned */ |
390 | else if (old->east_asian_width_changed != 0xFF) |
391 | index = old->east_asian_width_changed; |
392 | } |
393 | return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); |
394 | } |
395 | |
396 | /*[clinic input] |
397 | unicodedata.UCD.decomposition |
398 | |
399 | self: self |
400 | chr: int(accept={str}) |
401 | / |
402 | |
403 | Returns the character decomposition mapping assigned to the character chr as string. |
404 | |
405 | An empty string is returned in case no such mapping is defined. |
406 | [clinic start generated code]*/ |
407 | |
static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
{
    /* Buffer for the textual result, e.g. "<compat> 0020 0308".
       256 bytes is enough: the longest prefix plus at most "count"
       5-char hex groups (see size comment below). */
    char decomp[256];
    int code, index, count;
    size_t i;
    unsigned int prefix_index;
    Py_UCS4 c = (Py_UCS4)chr;

    code = (int)c;

    /* Old UCD snapshot: code points unassigned in that version have no
       decomposition. */
    if (UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyUnicode_FromString("" ); /* unassigned */
    }

    /* Two-level trie lookup into the generated decomposition data;
       out-of-range code points map to entry 0 (no decomposition). */
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex words (usually one or two), low byte
       is an index into decomp_prefix (the "<compat>"-style tag, or "") */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));

    /* copy prefix (no NUL terminator; we track the length in i) */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    /* Append each decomposition code point as space-separated 4+ digit
       uppercase hex; decomp_data entries follow the header word. */
    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X" ,
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
}
461 | |
462 | static void |
463 | get_decomp_record(PyObject *self, Py_UCS4 code, |
464 | int *index, int *prefix, int *count) |
465 | { |
466 | if (code >= 0x110000) { |
467 | *index = 0; |
468 | } |
469 | else if (UCD_Check(self) |
470 | && get_old_record(self, code)->category_changed==0) { |
471 | /* unassigned in old version */ |
472 | *index = 0; |
473 | } |
474 | else { |
475 | *index = decomp_index1[(code>>DECOMP_SHIFT)]; |
476 | *index = decomp_index2[(*index<<DECOMP_SHIFT)+ |
477 | (code&((1<<DECOMP_SHIFT)-1))]; |
478 | } |
479 | |
480 | /* high byte is number of hex bytes (usually one or two), low byte |
481 | is prefix code (from*/ |
482 | *count = decomp_data[*index] >> 8; |
483 | *prefix = decomp_data[*index] & 255; |
484 | |
485 | (*index)++; |
486 | } |
487 | |
/* Hangul syllable constants for algorithmic (de)composition; see the
   Unicode Standard, "Conjoining Jamo Behavior". */
#define SBase 0xAC00    /* first precomposed Hangul syllable */
#define LBase 0x1100    /* first leading consonant (choseong) */
#define VBase 0x1161    /* first vowel (jungseong) */
#define TBase 0x11A7    /* one before the first trailing consonant (jongseong) */
#define LCount 19       /* number of leading consonants */
#define VCount 21       /* number of vowels */
#define TCount 28       /* number of trailing consonants, incl. "none" */
#define NCount (VCount*TCount)   /* syllables per leading consonant */
#define SCount (LCount*NCount)   /* total precomposed syllables */
497 | |
/* Decompose `input` to NFD (k == 0) or NFKD (k != 0) and return a new
   string object (always UCS4 kind).  If `self` is an old UCD snapshot,
   its normalization overrides and assignment status are honoured.
   Returns NULL with an exception set on memory failure. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    const void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    /* Phase 1: full decomposition.  Each input character is pushed on a
       small work stack; partial decompositions are pushed back until only
       undecomposable characters come off, which go to `output`. */
    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != TBase) {
                    /* TBase itself encodes "no trailing consonant". */
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes (old UCD snapshot overrides; a
               non-zero value replaces `code` and is reprocessed) */
            if (UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Phase 2: canonical ordering.  Bubble combining marks with non-zero
       class into non-decreasing order; starters (class 0) act as
       barriers.  This mutates `result` in place, which is safe because
       we just created it and hold the only reference. */
    /* Sort canonically. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
630 | |
631 | static int |
632 | find_nfc_index(const struct reindex* nfc, Py_UCS4 code) |
633 | { |
634 | unsigned int index; |
635 | for (index = 0; nfc[index].start; index++) { |
636 | unsigned int start = nfc[index].start; |
637 | if (code < start) |
638 | return -1; |
639 | if (code <= start + nfc[index].count) { |
640 | unsigned int delta = code - start; |
641 | return nfc[index].index + delta; |
642 | } |
643 | } |
644 | return -1; |
645 | } |
646 | |
/* Compose `input` to NFC (k == 0) or NFKC (k != 0): first decompose via
   nfd_nfkd(), then canonically recombine.  Returns a new reference, or
   NULL with an exception set. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    const void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions of characters already combined into an earlier base and
       therefore to be dropped when reached. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        /* Skip positions consumed by an earlier composition. */
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(nfc_first, code);
        if (f == -1) {
            /* Not a possible composition base: copy through. */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* Look up the composition of (f, l) in the generated
               two-level comp_data table; 0 means no composition. */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* The combined character may itself be a base for further
               combination. */
            f = find_nfc_index(nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
785 | |
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
// The values are the 2-bit codes packed into
// _PyUnicode_DatabaseRecord.normalization_quick_check.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
789 | |
790 | /* Run the Unicode normalization "quickcheck" algorithm. |
791 | * |
792 | * Return YES or NO if quickcheck determines the input is certainly |
793 | * normalized or certainly not, and MAYBE if quickcheck is unable to |
794 | * tell. |
795 | * |
796 | * If `yes_only` is true, then return MAYBE as soon as we determine |
797 | * the answer is not YES. |
798 | * |
799 | * For background and details on the algorithm, see UAX #15: |
800 | * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms |
801 | */ |
802 | static QuickcheckResult |
803 | is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k, |
804 | bool yes_only) |
805 | { |
806 | /* UCD 3.2.0 is requested, quickchecks must be disabled. */ |
807 | if (UCD_Check(self)) { |
808 | return NO; |
809 | } |
810 | |
811 | Py_ssize_t i, len; |
812 | int kind; |
813 | const void *data; |
814 | unsigned char prev_combining = 0; |
815 | |
816 | /* The two quickcheck bits at this shift have type QuickcheckResult. */ |
817 | int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); |
818 | |
819 | QuickcheckResult result = YES; /* certainly normalized, unless we find something */ |
820 | |
821 | i = 0; |
822 | kind = PyUnicode_KIND(input); |
823 | data = PyUnicode_DATA(input); |
824 | len = PyUnicode_GET_LENGTH(input); |
825 | while (i < len) { |
826 | Py_UCS4 ch = PyUnicode_READ(kind, data, i++); |
827 | const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); |
828 | |
829 | unsigned char combining = record->combining; |
830 | if (combining && prev_combining > combining) |
831 | return NO; /* non-canonical sort order, not normalized */ |
832 | prev_combining = combining; |
833 | |
834 | unsigned char quickcheck_whole = record->normalization_quick_check; |
835 | if (yes_only) { |
836 | if (quickcheck_whole & (3 << quickcheck_shift)) |
837 | return MAYBE; |
838 | } else { |
839 | switch ((quickcheck_whole >> quickcheck_shift) & 3) { |
840 | case NO: |
841 | return NO; |
842 | case MAYBE: |
843 | result = MAYBE; /* this string might need normalization */ |
844 | } |
845 | } |
846 | } |
847 | return result; |
848 | } |
849 | |
850 | /*[clinic input] |
851 | unicodedata.UCD.is_normalized |
852 | |
853 | self: self |
854 | form: unicode |
855 | unistr as input: unicode |
856 | / |
857 | |
858 | Return whether the Unicode string unistr is in the normal form 'form'. |
859 | |
860 | Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. |
861 | [clinic start generated code]*/ |
862 | |
863 | static PyObject * |
864 | unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, |
865 | PyObject *input) |
866 | /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ |
867 | { |
868 | if (PyUnicode_READY(input) == -1) { |
869 | return NULL; |
870 | } |
871 | |
872 | if (PyUnicode_GET_LENGTH(input) == 0) { |
873 | /* special case empty input strings. */ |
874 | Py_RETURN_TRUE; |
875 | } |
876 | |
877 | PyObject *result; |
878 | bool nfc = false; |
879 | bool k = false; |
880 | QuickcheckResult m; |
881 | |
882 | PyObject *cmp; |
883 | int match = 0; |
884 | |
885 | if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { |
886 | nfc = true; |
887 | } |
888 | else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { |
889 | nfc = true; |
890 | k = true; |
891 | } |
892 | else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { |
893 | /* matches default values for `nfc` and `k` */ |
894 | } |
895 | else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { |
896 | k = true; |
897 | } |
898 | else { |
899 | PyErr_SetString(PyExc_ValueError, "invalid normalization form" ); |
900 | return NULL; |
901 | } |
902 | |
903 | m = is_normalized_quickcheck(self, input, nfc, k, false); |
904 | |
905 | if (m == MAYBE) { |
906 | cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); |
907 | if (cmp == NULL) { |
908 | return NULL; |
909 | } |
910 | match = PyUnicode_Compare(input, cmp); |
911 | Py_DECREF(cmp); |
912 | result = (match == 0) ? Py_True : Py_False; |
913 | } |
914 | else { |
915 | result = (m == YES) ? Py_True : Py_False; |
916 | } |
917 | |
918 | Py_INCREF(result); |
919 | return result; |
920 | } |
921 | |
922 | |
923 | /*[clinic input] |
924 | unicodedata.UCD.normalize |
925 | |
926 | self: self |
927 | form: unicode |
928 | unistr as input: unicode |
929 | / |
930 | |
931 | Return the normal form 'form' for the Unicode string unistr. |
932 | |
933 | Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. |
934 | [clinic start generated code]*/ |
935 | |
936 | static PyObject * |
937 | unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, |
938 | PyObject *input) |
939 | /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ |
940 | { |
941 | if (PyUnicode_GET_LENGTH(input) == 0) { |
942 | /* Special case empty input strings, since resizing |
943 | them later would cause internal errors. */ |
944 | Py_INCREF(input); |
945 | return input; |
946 | } |
947 | |
948 | if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { |
949 | if (is_normalized_quickcheck(self, input, |
950 | true, false, true) == YES) { |
951 | Py_INCREF(input); |
952 | return input; |
953 | } |
954 | return nfc_nfkc(self, input, 0); |
955 | } |
956 | if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { |
957 | if (is_normalized_quickcheck(self, input, |
958 | true, true, true) == YES) { |
959 | Py_INCREF(input); |
960 | return input; |
961 | } |
962 | return nfc_nfkc(self, input, 1); |
963 | } |
964 | if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { |
965 | if (is_normalized_quickcheck(self, input, |
966 | false, false, true) == YES) { |
967 | Py_INCREF(input); |
968 | return input; |
969 | } |
970 | return nfd_nfkd(self, input, 0); |
971 | } |
972 | if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { |
973 | if (is_normalized_quickcheck(self, input, |
974 | false, true, true) == YES) { |
975 | Py_INCREF(input); |
976 | return input; |
977 | } |
978 | return nfd_nfkd(self, input, 1); |
979 | } |
980 | PyErr_SetString(PyExc_ValueError, "invalid normalization form" ); |
981 | return NULL; |
982 | } |
983 | |
984 | /* -------------------------------------------------------------------- */ |
985 | /* unicode character name tables */ |
986 | |
987 | /* data file generated by Tools/unicode/makeunicodedata.py */ |
988 | #include "unicodename_db.h" |
989 | |
990 | /* -------------------------------------------------------------------- */ |
991 | /* database code (cut and pasted from the unidb package) */ |
992 | |
/* Case-insensitive hash of the first `len` bytes of `s`, folded into 24
   bits so it can index the generated name-hash tables; `scale` is the
   multiplier chosen by makeunicodedata.py. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long h = 0;
    for (int i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
        /* Fold overflow past 24 bits back into the low bits. */
        unsigned long carry = h & 0xff000000;
        if (carry) {
            h = (h ^ ((carry >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return h;
}
1007 | |
/* Jamo short names used to compose/parse Hangul syllable names
 * algorithmically ("HANGUL SYLLABLE <L><V><T>"):
 *   column 0: leading consonants (19 entries, indexed by L);
 *   column 1: vowels (21 entries, indexed by V);
 *   column 2: trailing consonants (28 entries, indexed by T; the first
 *             entry is empty for syllables with no trailing consonant).
 * Unused cells are null pointers.  find_syllable() below scans a single
 * column; _getucname() indexes rows directly. */
static const char * const hangul_syllables[][3] = {
    { "G" , "A" , "" },
    { "GG" , "AE" , "G" },
    { "N" , "YA" , "GG" },
    { "D" , "YAE" , "GS" },
    { "DD" , "EO" , "N" , },
    { "R" , "E" , "NJ" },
    { "M" , "YEO" , "NH" },
    { "B" , "YE" , "D" },
    { "BB" , "O" , "L" },
    { "S" , "WA" , "LG" },
    { "SS" , "WAE" , "LM" },
    { "" , "OE" , "LB" },
    { "J" , "YO" , "LS" },
    { "JJ" , "U" , "LT" },
    { "C" , "WEO" , "LP" },
    { "K" , "WE" , "LH" },
    { "T" , "WI" , "M" },
    { "P" , "YU" , "B" },
    { "H" , "EU" , "BS" },
    { 0, "YI" , "S" },
    { 0, "I" , "SS" },
    { 0, 0, "NG" },
    { 0, 0, "J" },
    { 0, 0, "C" },
    { 0, 0, "K" },
    { 0, 0, "T" },
    { 0, 0, "P" },
    { 0, 0, "H" }
};
1038 | |
1039 | /* These ranges need to match makeunicodedata.py:cjk_ranges. */ |
1040 | static int |
1041 | is_unified_ideograph(Py_UCS4 code) |
1042 | { |
1043 | return |
1044 | (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ |
1045 | (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */ |
1046 | (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */ |
1047 | (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */ |
1048 | (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ |
1049 | (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */ |
1050 | (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ |
1051 | (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */ |
1052 | } |
1053 | |
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is fully parenthesized so that expressions such as
 * IS_ALIAS(c + 1) expand correctly. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1059 | |
static int
_getucname(PyObject *self,
           Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
{
    /* Find the name associated with the given code point.
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences.
     *
     * On success, writes the NUL-terminated name into `buffer` and
     * returns 1; returns 0 on failure (unnamed/excluded code point, or
     * buffer too small).  NOTE(review): the in-file callers pass a buffer
     * of buflen+1 bytes (see _cmpname / unicodedata_UCD_name_impl); the
     * word-separator path below relies on that — confirm for external
     * (capsule) callers. */
    int offset;
    int i;
    int word;
    const unsigned char* w;

    /* reject anything outside the Unicode code space */
    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the code points in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (UCD_Check(self)) {
        /* self is the 3.2.0 snapshot: in 3.2.0 there are no aliases and
           named sequences, and code points unassigned in 3.2.0 have no
           name. */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable: the name is generated algorithmically from
           the leading-consonant (L), vowel (V) and trailing-consonant
           (T) jamo short names. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE " );
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        /* CJK unified ideographs are named after their hex code point. */
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X" , code);
        return 1;
    }

    /* get offset into phrasebook: a two-level index table keyed by the
       high and low bits of the code point (offset 0 means "no name") */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index: phrasebook bytes >= phrasebook_short start a
           two-byte (big-endian) word index, smaller bytes are a one-byte
           index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* insert a space between words */
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of string: 0x80 & 127 == 0, so the line above
                      also wrote the terminating NUL */
    }

    return 1;
}
1161 | |
1162 | static int |
1163 | capi_getucname(Py_UCS4 code, |
1164 | char* buffer, int buflen, |
1165 | int with_alias_and_seq) |
1166 | { |
1167 | return _getucname(NULL, code, buffer, buflen, with_alias_and_seq); |
1168 | |
1169 | } |
1170 | |
1171 | static int |
1172 | _cmpname(PyObject *self, int code, const char* name, int namelen) |
1173 | { |
1174 | /* check if code corresponds to the given name */ |
1175 | int i; |
1176 | char buffer[NAME_MAXLEN+1]; |
1177 | if (!_getucname(self, code, buffer, NAME_MAXLEN, 1)) |
1178 | return 0; |
1179 | for (i = 0; i < namelen; i++) { |
1180 | if (Py_TOUPPER(name[i]) != buffer[i]) |
1181 | return 0; |
1182 | } |
1183 | return buffer[namelen] == '\0'; |
1184 | } |
1185 | |
1186 | static void |
1187 | find_syllable(const char *str, int *len, int *pos, int count, int column) |
1188 | { |
1189 | int i, len1; |
1190 | *len = -1; |
1191 | for (i = 0; i < count; i++) { |
1192 | const char *s = hangul_syllables[i][column]; |
1193 | len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int); |
1194 | if (len1 <= *len) |
1195 | continue; |
1196 | if (strncmp(str, s, len1) == 0) { |
1197 | *len = len1; |
1198 | *pos = i; |
1199 | } |
1200 | } |
1201 | if (*len == -1) { |
1202 | *len = 0; |
1203 | } |
1204 | } |
1205 | |
1206 | static int |
1207 | _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) |
1208 | { |
1209 | /* check if named sequences are allowed */ |
1210 | if (!with_named_seq && IS_NAMED_SEQ(cp)) |
1211 | return 0; |
1212 | /* if the code point is in the PUA range that we use for aliases, |
1213 | * convert it to obtain the right code point */ |
1214 | if (IS_ALIAS(cp)) |
1215 | *code = name_aliases[cp-aliases_start]; |
1216 | else |
1217 | *code = cp; |
1218 | return 1; |
1219 | } |
1220 | |
static int
_getcode(PyObject* self,
         const char* name, int namelen, Py_UCS4* code, int with_named_seq)
{
    /* Return the code point associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
     * using for the named sequence, and the caller must then convert it.
     * Returns 1 on success (result in *code), 0 if the name is unknown. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables: their names are parsed algorithmically
       rather than looked up. */
    if (strncmp(name, "HANGUL SYLLABLE " , 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        /* greedily match leading consonant, vowel, trailing consonant
           (columns 0..2 of hangul_syllables) */
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* all three parts must match and consume the whole name */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs, named after their hex code point. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-" , 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;   /* lowercase and other characters rejected */
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;   /* empty slot: name not in the table */
    if (_cmpname(self, v, name, namelen)) {
        return _check_alias_and_seq(v, code, with_named_seq);
    }
    /* collision: open addressing; the probe increment is derived from
       the hash and kept non-zero */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            return _check_alias_and_seq(v, code, with_named_seq);
        }
        /* advance the increment through a LFSR-style recurrence
           (code_poly keeps it inside the table) */
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
1303 | |
1304 | static int |
1305 | capi_getcode(const char* name, int namelen, Py_UCS4* code, |
1306 | int with_named_seq) |
1307 | { |
1308 | return _getcode(NULL, name, namelen, code, with_named_seq); |
1309 | |
1310 | } |
1311 | |
1312 | static void |
1313 | unicodedata_destroy_capi(PyObject *capsule) |
1314 | { |
1315 | void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME); |
1316 | PyMem_Free(capi); |
1317 | } |
1318 | |
1319 | static PyObject * |
1320 | unicodedata_create_capi(void) |
1321 | { |
1322 | _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI)); |
1323 | if (capi == NULL) { |
1324 | PyErr_NoMemory(); |
1325 | return NULL; |
1326 | } |
1327 | capi->getname = capi_getucname; |
1328 | capi->getcode = capi_getcode; |
1329 | |
1330 | PyObject *capsule = PyCapsule_New(capi, |
1331 | PyUnicodeData_CAPSULE_NAME, |
1332 | unicodedata_destroy_capi); |
1333 | if (capsule == NULL) { |
1334 | PyMem_Free(capi); |
1335 | } |
1336 | return capsule; |
1337 | }; |
1338 | |
1339 | |
1340 | /* -------------------------------------------------------------------- */ |
1341 | /* Python bindings */ |
1342 | |
1343 | /*[clinic input] |
1344 | unicodedata.UCD.name |
1345 | |
1346 | self: self |
1347 | chr: int(accept={str}) |
1348 | default: object=NULL |
1349 | / |
1350 | |
1351 | Returns the name assigned to the character chr as a string. |
1352 | |
1353 | If no name is defined, default is returned, or, if not given, |
1354 | ValueError is raised. |
1355 | [clinic start generated code]*/ |
1356 | |
1357 | static PyObject * |
1358 | unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) |
1359 | /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/ |
1360 | { |
1361 | char name[NAME_MAXLEN+1]; |
1362 | Py_UCS4 c = (Py_UCS4)chr; |
1363 | |
1364 | if (!_getucname(self, c, name, NAME_MAXLEN, 0)) { |
1365 | if (default_value == NULL) { |
1366 | PyErr_SetString(PyExc_ValueError, "no such name" ); |
1367 | return NULL; |
1368 | } |
1369 | else { |
1370 | Py_INCREF(default_value); |
1371 | return default_value; |
1372 | } |
1373 | } |
1374 | |
1375 | return PyUnicode_FromString(name); |
1376 | } |
1377 | |
1378 | /*[clinic input] |
1379 | unicodedata.UCD.lookup |
1380 | |
1381 | self: self |
1382 | name: str(accept={str, robuffer}, zeroes=True) |
1383 | / |
1384 | |
1385 | Look up character by name. |
1386 | |
1387 | If a character with the given name is found, return the |
1388 | corresponding character. If not found, KeyError is raised. |
1389 | [clinic start generated code]*/ |
1390 | |
1391 | static PyObject * |
1392 | unicodedata_UCD_lookup_impl(PyObject *self, const char *name, |
1393 | Py_ssize_clean_t name_length) |
1394 | /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/ |
1395 | { |
1396 | Py_UCS4 code; |
1397 | unsigned int index; |
1398 | if (name_length > NAME_MAXLEN) { |
1399 | PyErr_SetString(PyExc_KeyError, "name too long" ); |
1400 | return NULL; |
1401 | } |
1402 | |
1403 | if (!_getcode(self, name, (int)name_length, &code, 1)) { |
1404 | PyErr_Format(PyExc_KeyError, "undefined character name '%s'" , name); |
1405 | return NULL; |
1406 | } |
1407 | /* check if code is in the PUA range that we use for named sequences |
1408 | and convert it */ |
1409 | if (IS_NAMED_SEQ(code)) { |
1410 | index = code-named_sequences_start; |
1411 | return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, |
1412 | named_sequences[index].seq, |
1413 | named_sequences[index].seqlen); |
1414 | } |
1415 | return PyUnicode_FromOrdinal(code); |
1416 | } |
1417 | |
// List of functions used to define module functions *AND* unicodedata.UCD
// methods. For module functions, self is the module. For UCD methods, self
// is an UCD instance. The UCD_Check() macro is used to check if self is
// an UCD instance.
// The *_METHODDEF macros are generated by Argument Clinic and expand to
// PyMethodDef initializers (so there are no commas between them here).
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}          /* sentinel */
};
1438 | |
/* GC traverse for UCD instances: they hold no PyObject members, so the
   only reference to visit is the heap type keeping the instance alive. */
static int
ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    return 0;
}
1445 | |
/* Deallocator for UCD instances.  The order matters: fetch the type
   first, untrack from the GC before freeing the memory, and drop the
   heap-type reference last, after `self` is gone. */
static void
ucd_dealloc(PreviousDBVersion *self)
{
    PyTypeObject *tp = Py_TYPE(self);
    PyObject_GC_UnTrack(self);
    PyObject_GC_Del(self);
    Py_DECREF(tp);
}
1454 | |
/* Slot table for the heap-allocated unicodedata.UCD type. */
static PyType_Slot ucd_type_slots[] = {
    {Py_tp_dealloc, ucd_dealloc},
    {Py_tp_traverse, ucd_traverse},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_methods, unicodedata_functions},
    {Py_tp_members, DB_members},
    {0, 0}          /* sentinel */
};
1463 | |
/* Spec for unicodedata.UCD.  DISALLOW_INSTANTIATION: instances are only
   created internally (see new_previous_version); IMMUTABLETYPE: the type
   cannot be mutated from Python code. */
static PyType_Spec ucd_type_spec = {
    .name = "unicodedata.UCD" ,
    .basicsize = sizeof(PreviousDBVersion),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
              Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = ucd_type_slots
};
1471 | |
/* Module docstring; UNIDATA_VERSION is spliced in twice via string
   literal concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION "." );
1480 | |
/* Py_mod_exec slot: populate the module with the version constant, the
 * UCD type, the 3.2.0 database snapshot, and the C API capsule.
 * Returns 0 on success, -1 with an exception set on failure. */
static int
unicodedata_exec(PyObject *module)
{
    if (PyModule_AddStringConstant(module, "unidata_version" , UNIDATA_VERSION) < 0) {
        return -1;
    }

    PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
    if (ucd_type == NULL) {
        return -1;
    }

    /* PyModule_AddType does not steal the reference; we keep ours until
       the 3.2.0 instance has been created below. */
    if (PyModule_AddType(module, ucd_type) < 0) {
        Py_DECREF(ucd_type);
        return -1;
    }

    // Unicode database version 3.2.0 used by the IDNA encoding
    PyObject *v;
    v = new_previous_version(ucd_type, "3.2.0" ,
                             get_change_3_2_0, normalization_3_2_0);
    Py_DECREF(ucd_type);
    if (v == NULL) {
        return -1;
    }
    /* PyModule_AddObject steals the reference to v only on success,
       hence the DECREF in the failure branch only. */
    if (PyModule_AddObject(module, "ucd_3_2_0" , v) < 0) {
        Py_DECREF(v);
        return -1;
    }

    /* Export C API */
    PyObject *capsule = unicodedata_create_capi();
    if (capsule == NULL) {
        return -1;
    }
    /* PyModule_AddObjectRef does not steal, so drop our reference
       unconditionally. */
    int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI" , capsule);
    Py_DECREF(capsule);
    if (rc < 0) {
        return -1;
    }
    return 0;
}
1523 | |
/* Multi-phase initialization (PEP 489) slots. */
static PyModuleDef_Slot unicodedata_slots[] = {
    {Py_mod_exec, unicodedata_exec},
    {0, NULL}          /* sentinel */
};
1528 | |
/* Module definition; m_size == 0 means the module keeps no per-module
   state. */
static struct PyModuleDef unicodedata_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "unicodedata" ,
    .m_doc = unicodedata_docstring,
    .m_size = 0,
    .m_methods = unicodedata_functions,
    .m_slots = unicodedata_slots,
};
1537 | |
/* Module entry point.  With multi-phase init this only registers the
   definition; the real work happens in unicodedata_exec(). */
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    return PyModuleDef_Init(&unicodedata_module);
}
1543 | |
1544 | |
1545 | /* |
1546 | Local variables: |
1547 | c-basic-offset: 4 |
1548 | indent-tabs-mode: nil |
1549 | End: |
1550 | */ |
1551 | |