1/* $OpenBSD$ */
2
3/*
4 * Copyright (c) 2008 Nicholas Marriott <[email protected]>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include <sys/types.h>
20
21#include <ctype.h>
22#include <errno.h>
23#include <stdlib.h>
24#include <string.h>
25#include <wchar.h>
26
27#include "tmux.h"
28
29static int utf8_width(wchar_t);
30
31/* Set a single character. */
32void
33utf8_set(struct utf8_data *ud, u_char ch)
34{
35 static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
36
37 memcpy(ud, &empty, sizeof *ud);
38 *ud->data = ch;
39}
40
41/* Copy UTF-8 character. */
42void
43utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44{
45 u_int i;
46
47 memcpy(to, from, sizeof *to);
48
49 for (i = to->size; i < sizeof to->data; i++)
50 to->data[i] = '\0';
51}
52
53/*
54 * Open UTF-8 sequence.
55 *
56 * 11000010-11011111 C2-DF start of 2-byte sequence
57 * 11100000-11101111 E0-EF start of 3-byte sequence
58 * 11110000-11110100 F0-F4 start of 4-byte sequence
59 */
60enum utf8_state
61utf8_open(struct utf8_data *ud, u_char ch)
62{
63 memset(ud, 0, sizeof *ud);
64 if (ch >= 0xc2 && ch <= 0xdf)
65 ud->size = 2;
66 else if (ch >= 0xe0 && ch <= 0xef)
67 ud->size = 3;
68 else if (ch >= 0xf0 && ch <= 0xf4)
69 ud->size = 4;
70 else
71 return (UTF8_ERROR);
72 utf8_append(ud, ch);
73 return (UTF8_MORE);
74}
75
76/* Append character to UTF-8, closing if finished. */
77enum utf8_state
78utf8_append(struct utf8_data *ud, u_char ch)
79{
80 wchar_t wc;
81 int width;
82
83 if (ud->have >= ud->size)
84 fatalx("UTF-8 character overflow");
85 if (ud->size > sizeof ud->data)
86 fatalx("UTF-8 character size too large");
87
88 if (ud->have != 0 && (ch & 0xc0) != 0x80)
89 ud->width = 0xff;
90
91 ud->data[ud->have++] = ch;
92 if (ud->have != ud->size)
93 return (UTF8_MORE);
94
95 if (ud->width == 0xff)
96 return (UTF8_ERROR);
97
98 if (utf8_combine(ud, &wc) != UTF8_DONE)
99 return (UTF8_ERROR);
100 if ((width = utf8_width(wc)) < 0)
101 return (UTF8_ERROR);
102 ud->width = width;
103
104 return (UTF8_DONE);
105}
106
/*
 * Look up the display width of a Unicode character. Returns the width
 * (0 to 0xff) or -1 if no usable width is available.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

#ifdef HAVE_UTF8PROC
	width = utf8proc_wcwidth(wc);
#else
	width = wcwidth(wc);
#endif
	/* The common case: a width that can be stored as-is. */
	if (width >= 0 && width <= 0xff)
		return (width);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width);

#ifndef __OpenBSD__
	/*
	 * Many platforms (particularly and inevitably OS X) have no width for
	 * relatively common characters (wcwidth() returns -1); assume width 1
	 * in this case. This will be wrong for genuinely nonprintable
	 * characters, but they should be rare. We may pass through stuff that
	 * ideally we would block, but this is no worse than sending the same
	 * to the terminal without tmux.
	 */
	if (width < 0)
		return (1);
#endif
	return (-1);
}
138
139/* Combine UTF-8 into Unicode. */
140enum utf8_state
141utf8_combine(const struct utf8_data *ud, wchar_t *wc)
142{
143#ifdef HAVE_UTF8PROC
144 switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
145#else
146 switch (mbtowc(wc, ud->data, ud->size)) {
147#endif
148 case -1:
149 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
150 errno);
151 mbtowc(NULL, NULL, MB_CUR_MAX);
152 return (UTF8_ERROR);
153 case 0:
154 return (UTF8_ERROR);
155 default:
156 return (UTF8_DONE);
157 }
158}
159
160/* Split Unicode into UTF-8. */
161enum utf8_state
162utf8_split(wchar_t wc, struct utf8_data *ud)
163{
164 char s[MB_LEN_MAX];
165 int slen;
166
167#ifdef HAVE_UTF8PROC
168 slen = utf8proc_wctomb(s, wc);
169#else
170 slen = wctomb(s, wc);
171#endif
172 if (slen <= 0 || slen > (int)sizeof ud->data)
173 return (UTF8_ERROR);
174
175 memcpy(ud->data, s, slen);
176 ud->size = slen;
177
178 ud->width = utf8_width(wc);
179 return (UTF8_DONE);
180}
181
182/*
183 * Encode len characters from src into dst, which is guaranteed to have four
184 * bytes available for each character from src (for \abc or UTF-8) plus space
185 * for \0.
186 */
187int
188utf8_strvis(char *dst, const char *src, size_t len, int flag)
189{
190 struct utf8_data ud;
191 const char *start, *end;
192 enum utf8_state more;
193 size_t i;
194
195 start = dst;
196 end = src + len;
197
198 while (src < end) {
199 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
200 while (++src < end && more == UTF8_MORE)
201 more = utf8_append(&ud, *src);
202 if (more == UTF8_DONE) {
203 /* UTF-8 character finished. */
204 for (i = 0; i < ud.size; i++)
205 *dst++ = ud.data[i];
206 continue;
207 }
208 /* Not a complete, valid UTF-8 character. */
209 src -= ud.have;
210 }
211 if (src[0] == '$' && src < end - 1) {
212 if (isalpha((u_char)src[1]) ||
213 src[1] == '_' ||
214 src[1] == '{')
215 *dst++ = '\\';
216 *dst++ = '$';
217 } else if (src < end - 1)
218 dst = vis(dst, src[0], flag, src[1]);
219 else if (src < end)
220 dst = vis(dst, src[0], flag, '\0');
221 src++;
222 }
223
224 *dst = '\0';
225 return (dst - start);
226}
227
/*
 * Same as utf8_strvis but allocate the buffer. The result is stored in *dst,
 * trimmed to the encoded length plus one, and the encoded length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	/* Worst case is four output bytes per input byte, plus the \0. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
241
242/* Does this string contain anything that isn't valid UTF-8? */
243int
244utf8_isvalid(const char *s)
245{
246 struct utf8_data ud;
247 const char *end;
248 enum utf8_state more;
249
250 end = s + strlen(s);
251 while (s < end) {
252 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
253 while (++s < end && more == UTF8_MORE)
254 more = utf8_append(&ud, *s);
255 if (more == UTF8_DONE)
256 continue;
257 return (0);
258 }
259 if (*s < 0x20 || *s > 0x7e)
260 return (0);
261 s++;
262 }
263 return (1);
264}
265
266/*
267 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
268 * the returned string. Anything not valid printable ASCII or UTF-8 is
269 * stripped.
270 */
271char *
272utf8_sanitize(const char *src)
273{
274 char *dst;
275 size_t n;
276 enum utf8_state more;
277 struct utf8_data ud;
278 u_int i;
279
280 dst = NULL;
281
282 n = 0;
283 while (*src != '\0') {
284 dst = xreallocarray(dst, n + 1, sizeof *dst);
285 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
286 while (*++src != '\0' && more == UTF8_MORE)
287 more = utf8_append(&ud, *src);
288 if (more == UTF8_DONE) {
289 dst = xreallocarray(dst, n + ud.width,
290 sizeof *dst);
291 for (i = 0; i < ud.width; i++)
292 dst[n++] = '_';
293 continue;
294 }
295 src -= ud.have;
296 }
297 if (*src > 0x1f && *src < 0x7f)
298 dst[n++] = *src;
299 else
300 dst[n++] = '_';
301 src++;
302 }
303
304 dst = xreallocarray(dst, n + 1, sizeof *dst);
305 dst[n] = '\0';
306 return (dst);
307}
308
309/* Get UTF-8 buffer length. */
310size_t
311utf8_strlen(const struct utf8_data *s)
312{
313 size_t i;
314
315 for (i = 0; s[i].size != 0; i++)
316 /* nothing */;
317 return (i);
318}
319
320/* Get UTF-8 string width. */
321u_int
322utf8_strwidth(const struct utf8_data *s, ssize_t n)
323{
324 ssize_t i;
325 u_int width;
326
327 width = 0;
328 for (i = 0; s[i].size != 0; i++) {
329 if (n != -1 && n == i)
330 break;
331 width += s[i].width;
332 }
333 return (width);
334}
335
336/*
337 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
338 * Caller frees.
339 */
340struct utf8_data *
341utf8_fromcstr(const char *src)
342{
343 struct utf8_data *dst;
344 size_t n;
345 enum utf8_state more;
346
347 dst = NULL;
348
349 n = 0;
350 while (*src != '\0') {
351 dst = xreallocarray(dst, n + 1, sizeof *dst);
352 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
353 while (*++src != '\0' && more == UTF8_MORE)
354 more = utf8_append(&dst[n], *src);
355 if (more == UTF8_DONE) {
356 n++;
357 continue;
358 }
359 src -= dst[n].have;
360 }
361 utf8_set(&dst[n], *src);
362 n++;
363 src++;
364 }
365
366 dst = xreallocarray(dst, n + 1, sizeof *dst);
367 dst[n].size = 0;
368 return (dst);
369}
370
371/* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
372char *
373utf8_tocstr(struct utf8_data *src)
374{
375 char *dst;
376 size_t n;
377
378 dst = NULL;
379
380 n = 0;
381 for(; src->size != 0; src++) {
382 dst = xreallocarray(dst, n + src->size, 1);
383 memcpy(dst + n, src->data, src->size);
384 n += src->size;
385 }
386
387 dst = xreallocarray(dst, n + 1, 1);
388 dst[n] = '\0';
389 return (dst);
390}
391
392/* Get width of UTF-8 string. */
393u_int
394utf8_cstrwidth(const char *s)
395{
396 struct utf8_data tmp;
397 u_int width;
398 enum utf8_state more;
399
400 width = 0;
401 while (*s != '\0') {
402 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
403 while (*++s != '\0' && more == UTF8_MORE)
404 more = utf8_append(&tmp, *s);
405 if (more == UTF8_DONE) {
406 width += tmp.width;
407 continue;
408 }
409 s -= tmp.have;
410 }
411 if (*s > 0x1f && *s != 0x7f)
412 width++;
413 s++;
414 }
415 return (width);
416}
417
418/* Pad UTF-8 string to width on the left. Caller frees. */
419char *
420utf8_padcstr(const char *s, u_int width)
421{
422 size_t slen;
423 char *out;
424 u_int n, i;
425
426 n = utf8_cstrwidth(s);
427 if (n >= width)
428 return (xstrdup(s));
429
430 slen = strlen(s);
431 out = xmalloc(slen + 1 + (width - n));
432 memcpy(out, s, slen);
433 for (i = n; i < width; i++)
434 out[slen++] = ' ';
435 out[slen] = '\0';
436 return (out);
437}
438
439/* Pad UTF-8 string to width on the right. Caller frees. */
440char *
441utf8_rpadcstr(const char *s, u_int width)
442{
443 size_t slen;
444 char *out;
445 u_int n, i;
446
447 n = utf8_cstrwidth(s);
448 if (n >= width)
449 return (xstrdup(s));
450
451 slen = strlen(s);
452 out = xmalloc(slen + 1 + (width - n));
453 for (i = 0; i < width - n; i++)
454 out[i] = ' ';
455 memcpy(out + i, s, slen);
456 out[i + slen] = '\0';
457 return (out);
458}
459
460int
461utf8_cstrhas(const char *s, const struct utf8_data *ud)
462{
463 struct utf8_data *copy, *loop;
464 int found = 0;
465
466 copy = utf8_fromcstr(s);
467 for (loop = copy; loop->size != 0; loop++) {
468 if (loop->size != ud->size)
469 continue;
470 if (memcmp(loop->data, ud->data, loop->size) == 0) {
471 found = 1;
472 break;
473 }
474 }
475 free(copy);
476
477 return (found);
478}
479