1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include <folly/Unicode.h> |
18 | #include <folly/Conv.h> |
19 | |
20 | namespace folly { |
21 | |
22 | ////////////////////////////////////////////////////////////////////// |
23 | |
// Encodes a single code point as UTF-8.
//
// Returns a string of 1 to 4 bytes for cp <= 0x10FFFF and an empty string for
// anything larger. No check is made for the surrogate range; such values are
// encoded as ordinary 3-byte sequences.
std::string codePointToUtf8(char32_t cp) {
  // Based on description from http://en.wikipedia.org/wiki/UTF-8.

  // Lead byte: the length marker combined with the topmost payload bits.
  const auto lead = [cp](unsigned marker, unsigned shift) {
    return static_cast<char>(marker | (cp >> shift));
  };
  // Continuation byte (10xxxxxx) holding the six payload bits found at `shift`.
  const auto trail = [cp](unsigned shift) {
    return static_cast<char>(0x80 | (0x3f & (cp >> shift)));
  };

  if (cp <= 0x7f) {
    return std::string(1, static_cast<char>(cp));
  }
  if (cp <= 0x7FF) {
    return std::string{lead(0xC0, 6), trail(0)};
  }
  if (cp <= 0xFFFF) {
    return std::string{lead(0xE0, 12), trail(6), trail(0)};
  }
  if (cp <= 0x10FFFF) {
    return std::string{lead(0xF0, 18), trail(12), trail(6), trail(0)};
  }

  // Not representable: the caller gets an empty string.
  return std::string();
}
51 | |
/**
 * Decodes one UTF-8 encoded code point starting at `p`, never reading at or
 * beyond `e`.
 *
 * On success `p` is advanced past the bytes that were consumed and the
 * decoded code point is returned.
 *
 * The input is rejected when:
 *   - [p, e) is empty,
 *   - the first byte is a stray continuation byte (10xxxxxx),
 *   - a byte after the first is not a continuation byte,
 *   - the sequence is cut short by `e`, or its lead byte announces more than
 *     4 bytes,
 *   - the value is overlong (it would fit into a shorter encoding),
 *   - the value lies in the surrogate range U+D800..U+DFFF, or
 *   - the value is greater than U+10FFFF.
 *
 * On rejection, if `skipOnError` is true, `p` is advanced by exactly one byte
 * and U+FFFD is returned (note: this also happens when `p` is already at or
 * past `e`); otherwise std::runtime_error is thrown and `p` is left untouched.
 */
char32_t utf8ToCodePoint(
    const unsigned char*& p,
    const unsigned char* const e,
    bool skipOnError) {
  /* The following encodings are valid, except for the 5 and 6 byte
   * combinations:
   * 0xxxxxxx
   * 110xxxxx 10xxxxxx
   * 1110xxxx 10xxxxxx 10xxxxxx
   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   */

  // Error recovery: step over a single byte and yield the replacement char.
  const auto skip = [&] {
    ++p;
    return U'\ufffd';
  };

  if (p >= e) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
  }

  unsigned char fst = *p;
  if (!(fst & 0x80)) {
    // trivial case
    return *p++;
  }

  // bitMask[n - 1] keeps the payload bits of an n-byte sequence.
  static constexpr uint32_t bitMask[] = {
      (1 << 7) - 1,
      (1 << 11) - 1,
      (1 << 16) - 1,
      (1 << 21) - 1,
  };

  // upper control bits are masked out later
  uint32_t d = fst;

  // A lead byte of a multi-byte sequence must start with 11; 10xxxxxx here is
  // a continuation byte without a preceding lead byte.
  if ((fst & 0xC0) != 0xC0) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error(
        "folly::utf8ToCodePoint i=0 d=" + std::to_string(d));
  }

  fst <<= 1;

  // Bytes that may be inspected: whatever is available, but at most the four
  // of the longest valid sequence. Computed from the distance e - p so that no
  // pointer beyond `e` is ever formed.
  const unsigned int limit =
      (e - p < 4) ? static_cast<unsigned int>(e - p) : 4u;

  for (unsigned int i = 1; i < limit; ++i) {
    const unsigned char tmp = p[i];

    if ((tmp & 0xC0) != 0x80) {
      if (skipOnError) {
        return skip();
      }
      throw std::runtime_error(
          "folly::utf8ToCodePoint i=" + std::to_string(i) +
          " tmp=" + std::to_string(static_cast<uint32_t>(tmp)));
    }

    d = (d << 6) | (tmp & 0x3F);
    // Each shift consumes one length bit of the lead byte; once the top bit is
    // clear, the sequence announced by the lead byte is complete.
    fst <<= 1;

    if (!(fst & 0x80)) {
      d &= bitMask[i];

      // overlong, could have been encoded with i bytes
      const bool overlong = (d & ~bitMask[i - 1]) == 0;
      // Surrogates can only come out of 3-byte sequences, but the test is
      // harmless (always false) for the other lengths.
      const bool surrogate = d >= 0xD800 && d <= 0xDFFF;
      // 4-byte sequences can carry up to 0x1FFFFF; only values up to 0x10FFFF
      // are valid code points.
      const bool outOfRange = d > 0x10FFFF;

      if (overlong || surrogate || outOfRange) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      p += i + 1;
      return static_cast<char32_t>(d);
    }
  }

  // Either the input ended before the sequence was complete, or the lead byte
  // announced a 5/6-byte sequence.
  if (skipOnError) {
    return skip();
  }
  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
}
151 | |
152 | ////////////////////////////////////////////////////////////////////// |
153 | |
154 | } // namespace folly |
155 | |