1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_UNICODE_CASEFOLD_H_ |
6 | #define RE2_UNICODE_CASEFOLD_H_ |
7 | |
8 | // Unicode case folding tables. |
9 | |
10 | // The Unicode case folding tables encode the mapping from one Unicode point |
11 | // to the next largest Unicode point with equivalent folding. The largest |
12 | // point wraps back to the first. For example, the tables map: |
13 | // |
14 | // 'A' -> 'a' |
15 | // 'a' -> 'A' |
16 | // |
17 | // 'K' -> 'k' |
18 | // 'k' -> 'K' (Kelvin symbol) |
19 | // 'K' -> 'K' |
20 | // |
21 | // Like everything Unicode, these tables are big. If we represent the table |
22 | // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. |
23 | // Most table entries look like the ones around them: |
24 | // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. |
25 | // Instead of listing all the pairs explicitly, we make a list of ranges |
26 | // and deltas, so that the table entries for 'A' through 'Z' can be represented |
27 | // as a single entry { 'A', 'Z', +32 }. |
28 | // |
29 | // In addition to blocks that map to each other (A-Z mapping to a-z) |
30 | // there are blocks of pairs that individually map to each other |
31 | // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). |
32 | // For those, the special delta value EvenOdd marks even/odd pairs |
33 | // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. |
34 | // |
35 | // In this form, the table has 274 entries, about 3kB. If we were to split |
36 | // the table into one for 16-bit codes and an overflow table for larger ones, |
37 | // we could get it down to about 1.5kB, but that's not worth the complexity. |
38 | // |
39 | // The grouped form also allows for efficient fold range calculations |
40 | // rather than looping one character at a time. |
41 | |
42 | #include <stdint.h> |
43 | |
44 | #include "util/utf.h" |
45 | |
46 | namespace re2 { |
47 | |
48 | enum { |
49 | EvenOdd = 1, |
50 | OddEven = -1, |
51 | EvenOddSkip = 1<<30, |
52 | OddEvenSkip, |
53 | }; |
54 | |
55 | struct CaseFold { |
56 | Rune lo; |
57 | Rune hi; |
58 | int32_t delta; |
59 | }; |
60 | |
61 | extern const CaseFold unicode_casefold[]; |
62 | extern const int num_unicode_casefold; |
63 | |
64 | extern const CaseFold unicode_tolower[]; |
65 | extern const int num_unicode_tolower; |
66 | |
67 | // Returns the CaseFold* in the tables that contains rune. |
68 | // If rune is not in the tables, returns the first CaseFold* after rune. |
69 | // If rune is larger than any value in the tables, returns NULL. |
70 | extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); |
71 | |
72 | // Returns the result of applying the fold f to the rune r. |
73 | extern Rune ApplyFold(const CaseFold *f, Rune r); |
74 | |
75 | } // namespace re2 |
76 | |
77 | #endif // RE2_UNICODE_CASEFOLD_H_ |
78 | |