1/* Copyright (c) 2011, 2018 Ben Noordhuis <[email protected]>
2 *
3 * Permission to use, copy, modify, and/or distribute this software for any
4 * purpose with or without fee is hereby granted, provided that the above
5 * copyright notice and this permission notice appear in all copies.
6 *
7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 */
15
16/* Derived from https://github.com/bnoordhuis/punycode
17 * but updated to support IDNA 2008.
18 */
19
20#include "uv.h"
21#include "idna.h"
22#include <string.h>
23
/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte (>= 0x80), already consumed by the caller, so |*p|
 * points at the first continuation byte. |pe| is one past the last readable
 * byte.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * malformed: invalid lead byte, truncated input, bad continuation byte,
 * overlong encoding, value above U+10FFFF or a UTF-16 surrogate.
 *
 * No byte at or beyond |pe| is ever read. When the sequence is truncated
 * |*p| is left untouched; otherwise |*p| is advanced past every continuation
 * byte the lead byte calls for, even if one of them turns out to be invalid.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned min;
  unsigned ok;
  int need;

  if (a > 0xF7)
    return -1;  /* Not a valid lead byte. */

  /* The lead byte fixes the number of continuation bytes, the payload bits
   * it contributes and the smallest code point that may use this length.
   */
  if (a > 0xEF) {
    need = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    need = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    need = 1;
    min = 0x80;
    a &= 31;
  } else {
    return -1;  /* Stray continuation byte. */
  }

  /* Also covers the case where the caller already stepped past |pe|. */
  if (pe - *p < need)
    return -1;  /* Truncated sequence. */

  /* Every continuation byte must look like 10xxxxxx on its own; checking
   * them in combination lets mismatched bytes cancel each other out.
   */
  ok = 1;
  while (need-- > 0) {
    b = (unsigned char) *(*p)++;

    if ((b & 0xC0) != 0x80)
      ok = 0;

    a = (a << 6) | (b & 63);
  }

  if (!ok)
    return -1;  /* Invalid continuation byte. */

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
87
/* Decode one UTF-8 code point starting at |*p| and advance |*p| past it.
 *
 * The first byte is read unconditionally, so the caller must make sure that
 * |*p| points at a readable byte (i.e. |*p| < |pe|). ASCII bytes are returned
 * directly; anything else is handed to uv__utf8_decode1_slow() together with
 * |pe|, and that function's result is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  const unsigned lead = (unsigned char) **p;

  *p += 1;

  /* Multi-byte (or malformed) sequences take the slow path. */
  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* ASCII, common case. */
}
98
/* Iterate over the UTF-8 code points in [*p, pe). |p| is a `const char**`.
 * Each pass decodes the next code point into |c| with uv__utf8_decode1(),
 * which advances |*p|, and then runs the loop body.
 *
 * The decoder is only invoked while |*p| < |pe|, so it is never asked to
 * start reading at |pe|, and the body runs for every value the decoder
 * returns, which lets callers inspect |c| for decode errors.
 */
#define foreach_codepoint(c, p, pe)                                           \
  for (; *(p) < (pe) && ((c) = uv__utf8_decode1((p), (pe)), 1); /* empty */)
101
102static int uv__idna_toascii_label(const char* s, const char* se,
103 char** d, char* de) {
104 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
105 const char* ss;
106 unsigned c;
107 unsigned h;
108 unsigned k;
109 unsigned n;
110 unsigned m;
111 unsigned q;
112 unsigned t;
113 unsigned x;
114 unsigned y;
115 unsigned bias;
116 unsigned delta;
117 unsigned todo;
118 int first;
119
120 h = 0;
121 ss = s;
122 todo = 0;
123
124 foreach_codepoint(c, &s, se) {
125 if (c < 128)
126 h++;
127 else if (c == (unsigned) -1)
128 return UV_EINVAL;
129 else
130 todo++;
131 }
132
133 if (todo > 0) {
134 if (*d < de) *(*d)++ = 'x';
135 if (*d < de) *(*d)++ = 'n';
136 if (*d < de) *(*d)++ = '-';
137 if (*d < de) *(*d)++ = '-';
138 }
139
140 x = 0;
141 s = ss;
142 foreach_codepoint(c, &s, se) {
143 if (c > 127)
144 continue;
145
146 if (*d < de)
147 *(*d)++ = c;
148
149 if (++x == h)
150 break; /* Visited all ASCII characters. */
151 }
152
153 if (todo == 0)
154 return h;
155
156 /* Only write separator when we've written ASCII characters first. */
157 if (h > 0)
158 if (*d < de)
159 *(*d)++ = '-';
160
161 n = 128;
162 bias = 72;
163 delta = 0;
164 first = 1;
165
166 while (todo > 0) {
167 m = -1;
168 s = ss;
169 foreach_codepoint(c, &s, se)
170 if (c >= n)
171 if (c < m)
172 m = c;
173
174 x = m - n;
175 y = h + 1;
176
177 if (x > ~delta / y)
178 return UV_E2BIG; /* Overflow. */
179
180 delta += x * y;
181 n = m;
182
183 s = ss;
184 foreach_codepoint(c, &s, se) {
185 if (c < n)
186 if (++delta == 0)
187 return UV_E2BIG; /* Overflow. */
188
189 if (c != n)
190 continue;
191
192 for (k = 36, q = delta; /* empty */; k += 36) {
193 t = 1;
194
195 if (k > bias)
196 t = k - bias;
197
198 if (t > 26)
199 t = 26;
200
201 if (q < t)
202 break;
203
204 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
205 * 10 <= y <= 35, we can optimize the long division
206 * into a table-based reciprocal multiplication.
207 */
208 x = q - t;
209 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */
210 q = x / y;
211 t = t + x % y; /* 1 <= t <= 35 because of y. */
212
213 if (*d < de)
214 *(*d)++ = alphabet[t];
215 }
216
217 if (*d < de)
218 *(*d)++ = alphabet[q];
219
220 delta /= 2;
221
222 if (first) {
223 delta /= 350;
224 first = 0;
225 }
226
227 /* No overflow check is needed because |delta| was just
228 * divided by 2 and |delta+delta >= delta + delta/h|.
229 */
230 h++;
231 delta += delta / h;
232
233 for (bias = 0; delta > 35 * 26 / 2; bias += 36)
234 delta /= 35;
235
236 bias += 36 * delta / (delta + 38);
237 delta = 0;
238 todo--;
239 }
240
241 delta++;
242 n++;
243 }
244
245 return 0;
246}
247
248#undef foreach_codepoint
249
250long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
251 const char* si;
252 const char* st;
253 unsigned c;
254 char* ds;
255 int rc;
256
257 ds = d;
258
259 for (si = s; si < se; /* empty */) {
260 st = si;
261 c = uv__utf8_decode1(&si, se);
262
263 if (c != '.')
264 if (c != 0x3002) /* 。 */
265 if (c != 0xFF0E) /* . */
266 if (c != 0xFF61) /* 。 */
267 continue;
268
269 rc = uv__idna_toascii_label(s, st, &d, de);
270
271 if (rc < 0)
272 return rc;
273
274 if (d < de)
275 *d++ = '.';
276
277 s = si;
278 }
279
280 if (s < se) {
281 rc = uv__idna_toascii_label(s, se, &d, de);
282
283 if (rc < 0)
284 return rc;
285 }
286
287 if (d < de)
288 *d++ = '\0';
289
290 return d - ds; /* Number of bytes written. */
291}
292