1 | /* $OpenBSD$ */ |
2 | |
3 | /* |
4 | * Copyright (c) 2008 Nicholas Marriott <[email protected]> |
5 | * |
6 | * Permission to use, copy, modify, and distribute this software for any |
7 | * purpose with or without fee is hereby granted, provided that the above |
8 | * copyright notice and this permission notice appear in all copies. |
9 | * |
10 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
11 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
12 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
13 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
14 | * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER |
15 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
16 | * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
17 | */ |
18 | |
19 | #include <sys/types.h> |
20 | |
21 | #include <ctype.h> |
22 | #include <errno.h> |
23 | #include <stdlib.h> |
24 | #include <string.h> |
25 | #include <wchar.h> |
26 | |
27 | #include "tmux.h" |
28 | |
29 | static int utf8_width(wchar_t); |
30 | |
31 | /* Set a single character. */ |
32 | void |
33 | utf8_set(struct utf8_data *ud, u_char ch) |
34 | { |
35 | static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; |
36 | |
37 | memcpy(ud, &empty, sizeof *ud); |
38 | *ud->data = ch; |
39 | } |
40 | |
41 | /* Copy UTF-8 character. */ |
42 | void |
43 | utf8_copy(struct utf8_data *to, const struct utf8_data *from) |
44 | { |
45 | u_int i; |
46 | |
47 | memcpy(to, from, sizeof *to); |
48 | |
49 | for (i = to->size; i < sizeof to->data; i++) |
50 | to->data[i] = '\0'; |
51 | } |
52 | |
53 | /* |
54 | * Open UTF-8 sequence. |
55 | * |
56 | * 11000010-11011111 C2-DF start of 2-byte sequence |
57 | * 11100000-11101111 E0-EF start of 3-byte sequence |
58 | * 11110000-11110100 F0-F4 start of 4-byte sequence |
59 | */ |
60 | enum utf8_state |
61 | utf8_open(struct utf8_data *ud, u_char ch) |
62 | { |
63 | memset(ud, 0, sizeof *ud); |
64 | if (ch >= 0xc2 && ch <= 0xdf) |
65 | ud->size = 2; |
66 | else if (ch >= 0xe0 && ch <= 0xef) |
67 | ud->size = 3; |
68 | else if (ch >= 0xf0 && ch <= 0xf4) |
69 | ud->size = 4; |
70 | else |
71 | return (UTF8_ERROR); |
72 | utf8_append(ud, ch); |
73 | return (UTF8_MORE); |
74 | } |
75 | |
76 | /* Append character to UTF-8, closing if finished. */ |
77 | enum utf8_state |
78 | utf8_append(struct utf8_data *ud, u_char ch) |
79 | { |
80 | wchar_t wc; |
81 | int width; |
82 | |
83 | if (ud->have >= ud->size) |
84 | fatalx("UTF-8 character overflow" ); |
85 | if (ud->size > sizeof ud->data) |
86 | fatalx("UTF-8 character size too large" ); |
87 | |
88 | if (ud->have != 0 && (ch & 0xc0) != 0x80) |
89 | ud->width = 0xff; |
90 | |
91 | ud->data[ud->have++] = ch; |
92 | if (ud->have != ud->size) |
93 | return (UTF8_MORE); |
94 | |
95 | if (ud->width == 0xff) |
96 | return (UTF8_ERROR); |
97 | |
98 | if (utf8_combine(ud, &wc) != UTF8_DONE) |
99 | return (UTF8_ERROR); |
100 | if ((width = utf8_width(wc)) < 0) |
101 | return (UTF8_ERROR); |
102 | ud->width = width; |
103 | |
104 | return (UTF8_DONE); |
105 | } |
106 | |
/*
 * Return the display width of wc as reported by wcwidth() (or by
 * utf8proc_wcwidth() when built with HAVE_UTF8PROC).
 *
 * A result outside 0-255 is logged. Everywhere except OpenBSD a negative
 * result is then treated as width 1; in every other out-of-range case -1 is
 * returned.
 */
static int
utf8_width(wchar_t wc)
{
	int	cols;

#ifdef HAVE_UTF8PROC
	cols = utf8proc_wcwidth(wc);
#else
	cols = wcwidth(wc);
#endif
	if (cols >= 0 && cols <= 0xff)
		return (cols);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cols);

#ifndef __OpenBSD__
	/*
	 * Plenty of platforms (OS X being the usual culprit) lack a width for
	 * fairly common characters and have wcwidth() return -1. Guess at a
	 * width of 1 for those. That is wrong for characters which really
	 * are nonprintable, but those ought to be rare; some things that
	 * would ideally be blocked may get through, which is no worse than
	 * sending them to the terminal without tmux in between.
	 */
	if (cols < 0)
		return (1);
#endif
	return (-1);
}
138 | |
139 | /* Combine UTF-8 into Unicode. */ |
140 | enum utf8_state |
141 | utf8_combine(const struct utf8_data *ud, wchar_t *wc) |
142 | { |
143 | #ifdef HAVE_UTF8PROC |
144 | switch (utf8proc_mbtowc(wc, ud->data, ud->size)) { |
145 | #else |
146 | switch (mbtowc(wc, ud->data, ud->size)) { |
147 | #endif |
148 | case -1: |
149 | log_debug("UTF-8 %.*s, mbtowc() %d" , (int)ud->size, ud->data, |
150 | errno); |
151 | mbtowc(NULL, NULL, MB_CUR_MAX); |
152 | return (UTF8_ERROR); |
153 | case 0: |
154 | return (UTF8_ERROR); |
155 | default: |
156 | return (UTF8_DONE); |
157 | } |
158 | } |
159 | |
160 | /* Split Unicode into UTF-8. */ |
161 | enum utf8_state |
162 | utf8_split(wchar_t wc, struct utf8_data *ud) |
163 | { |
164 | char s[MB_LEN_MAX]; |
165 | int slen; |
166 | |
167 | #ifdef HAVE_UTF8PROC |
168 | slen = utf8proc_wctomb(s, wc); |
169 | #else |
170 | slen = wctomb(s, wc); |
171 | #endif |
172 | if (slen <= 0 || slen > (int)sizeof ud->data) |
173 | return (UTF8_ERROR); |
174 | |
175 | memcpy(ud->data, s, slen); |
176 | ud->size = slen; |
177 | |
178 | ud->width = utf8_width(wc); |
179 | return (UTF8_DONE); |
180 | } |
181 | |
182 | /* |
183 | * Encode len characters from src into dst, which is guaranteed to have four |
184 | * bytes available for each character from src (for \abc or UTF-8) plus space |
185 | * for \0. |
186 | */ |
187 | int |
188 | utf8_strvis(char *dst, const char *src, size_t len, int flag) |
189 | { |
190 | struct utf8_data ud; |
191 | const char *start, *end; |
192 | enum utf8_state more; |
193 | size_t i; |
194 | |
195 | start = dst; |
196 | end = src + len; |
197 | |
198 | while (src < end) { |
199 | if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
200 | while (++src < end && more == UTF8_MORE) |
201 | more = utf8_append(&ud, *src); |
202 | if (more == UTF8_DONE) { |
203 | /* UTF-8 character finished. */ |
204 | for (i = 0; i < ud.size; i++) |
205 | *dst++ = ud.data[i]; |
206 | continue; |
207 | } |
208 | /* Not a complete, valid UTF-8 character. */ |
209 | src -= ud.have; |
210 | } |
211 | if (src[0] == '$' && src < end - 1) { |
212 | if (isalpha((u_char)src[1]) || |
213 | src[1] == '_' || |
214 | src[1] == '{') |
215 | *dst++ = '\\'; |
216 | *dst++ = '$'; |
217 | } else if (src < end - 1) |
218 | dst = vis(dst, src[0], flag, src[1]); |
219 | else if (src < end) |
220 | dst = vis(dst, src[0], flag, '\0'); |
221 | src++; |
222 | } |
223 | |
224 | *dst = '\0'; |
225 | return (dst - start); |
226 | } |
227 | |
/*
 * As utf8_strvis() for a \0-terminated src, but the output buffer is
 * allocated here: four bytes per input byte (and for the terminator) are
 * reserved, then the buffer is trimmed to the encoded length plus one. The
 * buffer is stored in *dst and the encoded length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 used;

	out = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, used + 1);
	return (used);
}
241 | |
242 | /* Does this string contain anything that isn't valid UTF-8? */ |
243 | int |
244 | utf8_isvalid(const char *s) |
245 | { |
246 | struct utf8_data ud; |
247 | const char *end; |
248 | enum utf8_state more; |
249 | |
250 | end = s + strlen(s); |
251 | while (s < end) { |
252 | if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { |
253 | while (++s < end && more == UTF8_MORE) |
254 | more = utf8_append(&ud, *s); |
255 | if (more == UTF8_DONE) |
256 | continue; |
257 | return (0); |
258 | } |
259 | if (*s < 0x20 || *s > 0x7e) |
260 | return (0); |
261 | s++; |
262 | } |
263 | return (1); |
264 | } |
265 | |
266 | /* |
267 | * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free |
268 | * the returned string. Anything not valid printable ASCII or UTF-8 is |
269 | * stripped. |
270 | */ |
271 | char * |
272 | utf8_sanitize(const char *src) |
273 | { |
274 | char *dst; |
275 | size_t n; |
276 | enum utf8_state more; |
277 | struct utf8_data ud; |
278 | u_int i; |
279 | |
280 | dst = NULL; |
281 | |
282 | n = 0; |
283 | while (*src != '\0') { |
284 | dst = xreallocarray(dst, n + 1, sizeof *dst); |
285 | if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
286 | while (*++src != '\0' && more == UTF8_MORE) |
287 | more = utf8_append(&ud, *src); |
288 | if (more == UTF8_DONE) { |
289 | dst = xreallocarray(dst, n + ud.width, |
290 | sizeof *dst); |
291 | for (i = 0; i < ud.width; i++) |
292 | dst[n++] = '_'; |
293 | continue; |
294 | } |
295 | src -= ud.have; |
296 | } |
297 | if (*src > 0x1f && *src < 0x7f) |
298 | dst[n++] = *src; |
299 | else |
300 | dst[n++] = '_'; |
301 | src++; |
302 | } |
303 | |
304 | dst = xreallocarray(dst, n + 1, sizeof *dst); |
305 | dst[n] = '\0'; |
306 | return (dst); |
307 | } |
308 | |
309 | /* Get UTF-8 buffer length. */ |
310 | size_t |
311 | utf8_strlen(const struct utf8_data *s) |
312 | { |
313 | size_t i; |
314 | |
315 | for (i = 0; s[i].size != 0; i++) |
316 | /* nothing */; |
317 | return (i); |
318 | } |
319 | |
320 | /* Get UTF-8 string width. */ |
321 | u_int |
322 | utf8_strwidth(const struct utf8_data *s, ssize_t n) |
323 | { |
324 | ssize_t i; |
325 | u_int width; |
326 | |
327 | width = 0; |
328 | for (i = 0; s[i].size != 0; i++) { |
329 | if (n != -1 && n == i) |
330 | break; |
331 | width += s[i].width; |
332 | } |
333 | return (width); |
334 | } |
335 | |
336 | /* |
337 | * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. |
338 | * Caller frees. |
339 | */ |
340 | struct utf8_data * |
341 | utf8_fromcstr(const char *src) |
342 | { |
343 | struct utf8_data *dst; |
344 | size_t n; |
345 | enum utf8_state more; |
346 | |
347 | dst = NULL; |
348 | |
349 | n = 0; |
350 | while (*src != '\0') { |
351 | dst = xreallocarray(dst, n + 1, sizeof *dst); |
352 | if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { |
353 | while (*++src != '\0' && more == UTF8_MORE) |
354 | more = utf8_append(&dst[n], *src); |
355 | if (more == UTF8_DONE) { |
356 | n++; |
357 | continue; |
358 | } |
359 | src -= dst[n].have; |
360 | } |
361 | utf8_set(&dst[n], *src); |
362 | n++; |
363 | src++; |
364 | } |
365 | |
366 | dst = xreallocarray(dst, n + 1, sizeof *dst); |
367 | dst[n].size = 0; |
368 | return (dst); |
369 | } |
370 | |
371 | /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ |
372 | char * |
373 | utf8_tocstr(struct utf8_data *src) |
374 | { |
375 | char *dst; |
376 | size_t n; |
377 | |
378 | dst = NULL; |
379 | |
380 | n = 0; |
381 | for(; src->size != 0; src++) { |
382 | dst = xreallocarray(dst, n + src->size, 1); |
383 | memcpy(dst + n, src->data, src->size); |
384 | n += src->size; |
385 | } |
386 | |
387 | dst = xreallocarray(dst, n + 1, 1); |
388 | dst[n] = '\0'; |
389 | return (dst); |
390 | } |
391 | |
392 | /* Get width of UTF-8 string. */ |
393 | u_int |
394 | utf8_cstrwidth(const char *s) |
395 | { |
396 | struct utf8_data tmp; |
397 | u_int width; |
398 | enum utf8_state more; |
399 | |
400 | width = 0; |
401 | while (*s != '\0') { |
402 | if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { |
403 | while (*++s != '\0' && more == UTF8_MORE) |
404 | more = utf8_append(&tmp, *s); |
405 | if (more == UTF8_DONE) { |
406 | width += tmp.width; |
407 | continue; |
408 | } |
409 | s -= tmp.have; |
410 | } |
411 | if (*s > 0x1f && *s != 0x7f) |
412 | width++; |
413 | s++; |
414 | } |
415 | return (width); |
416 | } |
417 | |
418 | /* Pad UTF-8 string to width on the left. Caller frees. */ |
419 | char * |
420 | utf8_padcstr(const char *s, u_int width) |
421 | { |
422 | size_t slen; |
423 | char *out; |
424 | u_int n, i; |
425 | |
426 | n = utf8_cstrwidth(s); |
427 | if (n >= width) |
428 | return (xstrdup(s)); |
429 | |
430 | slen = strlen(s); |
431 | out = xmalloc(slen + 1 + (width - n)); |
432 | memcpy(out, s, slen); |
433 | for (i = n; i < width; i++) |
434 | out[slen++] = ' '; |
435 | out[slen] = '\0'; |
436 | return (out); |
437 | } |
438 | |
439 | /* Pad UTF-8 string to width on the right. Caller frees. */ |
440 | char * |
441 | utf8_rpadcstr(const char *s, u_int width) |
442 | { |
443 | size_t slen; |
444 | char *out; |
445 | u_int n, i; |
446 | |
447 | n = utf8_cstrwidth(s); |
448 | if (n >= width) |
449 | return (xstrdup(s)); |
450 | |
451 | slen = strlen(s); |
452 | out = xmalloc(slen + 1 + (width - n)); |
453 | for (i = 0; i < width - n; i++) |
454 | out[i] = ' '; |
455 | memcpy(out + i, s, slen); |
456 | out[i + slen] = '\0'; |
457 | return (out); |
458 | } |
459 | |
460 | int |
461 | utf8_cstrhas(const char *s, const struct utf8_data *ud) |
462 | { |
463 | struct utf8_data *copy, *loop; |
464 | int found = 0; |
465 | |
466 | copy = utf8_fromcstr(s); |
467 | for (loop = copy; loop->size != 0; loop++) { |
468 | if (loop->size != ud->size) |
469 | continue; |
470 | if (memcmp(loop->data, ud->data, loop->size) == 0) { |
471 | found = 1; |
472 | break; |
473 | } |
474 | } |
475 | free(copy); |
476 | |
477 | return (found); |
478 | } |
479 | |