utf8.c source code [tmux/utf8.c]

1	/* $OpenBSD$ */
2
3	/*
4	* Copyright (c) 2008 Nicholas Marriott <[email protected]>
5	*
6	* Permission to use, copy, modify, and distribute this software for any
7	* purpose with or without fee is hereby granted, provided that the above
8	* copyright notice and this permission notice appear in all copies.
9	*
10	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13	* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14	* WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15	* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16	* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17	*/
18
19	#include <sys/types.h>
20
21	#include <ctype.h>
22	#include <errno.h>
23	#include <stdlib.h>
24	#include <string.h>
25	#include <wchar.h>
26
27	#include "tmux.h"
28
29	static int utf8_width(wchar_t);
30
31	/ Set a single character. /
32	void
33	utf8_set(struct utf8_data *ud, u_char ch)
34	{
35	static const struct utf8_data empty = { { `0` }, `1`, `1`, `1` };
36
37	memcpy(ud, &empty, sizeof *ud);
38	*ud->data = ch;
39	}
40
41	/ Copy UTF-8 character. /
42	void
43	utf8_copy(struct utf8_data to, const* struct utf8_data *from)
44	{
45	u_int i;
46
47	memcpy(to, from, sizeof *to);
48
49	for (i = to->size; i < sizeof to->data; i++)
50	to->data[i] = `'\0'`;
51	}
52
53	/*
54	* Open UTF-8 sequence.
55	*
56	* 11000010-11011111 C2-DF start of 2-byte sequence
57	* 11100000-11101111 E0-EF start of 3-byte sequence
58	* 11110000-11110100 F0-F4 start of 4-byte sequence
59	*/
60	enum utf8_state
61	utf8_open(struct utf8_data *ud, u_char ch)
62	{
63	memset(ud, `0`, sizeof *ud);
64	if (ch >= `0xc2` && ch <= `0xdf`)
65	ud->size = `2`;
66	else if (ch >= `0xe0` && ch <= `0xef`)
67	ud->size = `3`;
68	else if (ch >= `0xf0` && ch <= `0xf4`)
69	ud->size = `4`;
70	else
71	return (UTF8_ERROR);
72	utf8_append(ud, ch);
73	return (UTF8_MORE);
74	}
75
76	/ Append character to UTF-8, closing if finished. /
77	enum utf8_state
78	utf8_append(struct utf8_data *ud, u_char ch)
79	{
80	wchar_t wc;
81	int width;
82
83	if (ud->have >= ud->size)
84	fatalx("UTF-8 character overflow");
85	if (ud->size > sizeof ud->data)
86	fatalx("UTF-8 character size too large");
87
88	if (ud->have != `0` && (ch & `0xc0`) != `0x80`)
89	ud->width = `0xff`;
90
91	ud->data[ud->have++] = ch;
92	if (ud->have != ud->size)
93	return (UTF8_MORE);
94
95	if (ud->width == `0xff`)
96	return (UTF8_ERROR);
97
98	if (utf8_combine(ud, &wc) != UTF8_DONE)
99	return (UTF8_ERROR);
100	if ((width = utf8_width(wc)) < `0`)
101	return (UTF8_ERROR);
102	ud->width = width;
103
104	return (UTF8_DONE);
105	}
106
/*
 * Get width of Unicode character. Returns the width when it lies in the range
 * 0-0xff. Otherwise the value is logged and -1 is returned, except that on
 * platforms other than OpenBSD a negative width is reported as 1.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

#ifdef HAVE_UTF8PROC
	width = utf8proc_wcwidth(wc);
#else
	width = wcwidth(wc);
#endif
	if (width >= 0 && width <= 0xff)
		return (width);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width);

#ifndef __OpenBSD__
	/*
	 * Many platforms (particularly and inevitably OS X) have no width for
	 * relatively common characters (wcwidth() returns -1); assume width 1
	 * in this case. This will be wrong for genuinely nonprintable
	 * characters, but they should be rare. We may pass through stuff that
	 * ideally we would block, but this is no worse than sending the same
	 * to the terminal without tmux.
	 */
	if (width < 0)
		return (1);
#endif
	return (-1);
}
138
139	/ Combine UTF-8 into Unicode. /
140	enum utf8_state
141	utf8_combine(const struct utf8_data ud, wchar_t wc)
142	{
143	#ifdef HAVE_UTF8PROC
144	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
145	#else
146	switch (mbtowc(wc, ud->data, ud->size)) {
147	#endif
148	case -`1`:
149	log_debug("UTF-8 %.s, mbtowc() %d", (int*)ud->size, ud->data,
150	errno);
151	mbtowc(NULL, NULL, MB_CUR_MAX);
152	return (UTF8_ERROR);
153	case `0`:
154	return (UTF8_ERROR);
155	default:
156	return (UTF8_DONE);
157	}
158	}
159
160	/ Split Unicode into UTF-8. /
161	enum utf8_state
162	utf8_split(wchar_t wc, struct utf8_data *ud)
163	{
164	char s[MB_LEN_MAX];
165	int slen;
166
167	#ifdef HAVE_UTF8PROC
168	slen = utf8proc_wctomb(s, wc);
169	#else
170	slen = wctomb(s, wc);
171	#endif
172	if (slen <= `0` \|\| slen > (int)sizeof ud->data)
173	return (UTF8_ERROR);
174
175	memcpy(ud->data, s, slen);
176	ud->size = slen;
177
178	ud->width = utf8_width(wc);
179	return (UTF8_DONE);
180	}
181
182	/*
183	* Encode len characters from src into dst, which is guaranteed to have four
184	* bytes available for each character from src (for \abc or UTF-8) plus space
185	* for \0.
186	*/
187	int
188	utf8_strvis(char dst, const* char src, size_t len, int* flag)
189	{
190	struct utf8_data ud;
191	const char start, end;
192	enum utf8_state more;
193	size_t i;
194
195	start = dst;
196	end = src + len;
197
198	while (src < end) {
199	if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
200	while (++src < end && more == UTF8_MORE)
201	more = utf8_append(&ud, *src);
202	if (more == UTF8_DONE) {
203	/ UTF-8 character finished. /
204	for (i = `0`; i < ud.size; i++)
205	*dst++ = ud.data[i];
206	continue;
207	}
208	/ Not a complete, valid UTF-8 character. /
209	src -= ud.have;
210	}
211	if (src[`0`] == `'$'` && src < end - `1`) {
212	if (isalpha((u_char)src[`1`]) \|\|
213	src[`1`] == `'_'` \|\|
214	src[`1`] == `'{'`)
215	*dst++ = `'\\'`;
216	*dst++ = `'$'`;
217	} else if (src < end - `1`)
218	dst = vis(dst, src[`0`], flag, src[`1`]);
219	else if (src < end)
220	dst = vis(dst, src[`0`], flag, `'\0'`);
221	src++;
222	}
223
224	*dst = `'\0'`;
225	return (dst - start);
226	}
227
/*
 * Same as utf8_strvis but allocate the buffer. The encoded string is stored
 * in *dst, trimmed to its final length, and that length (without the
 * terminating \0) is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	char	*buf;
	size_t	 srclen;
	int	 len;

	/* Worst case is four output bytes per input byte, plus the \0. */
	srclen = strlen(src);
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
241
242	/ Does this string contain anything that isn't valid UTF-8? /
243	int
244	utf8_isvalid(const char *s)
245	{
246	struct utf8_data ud;
247	const char *end;
248	enum utf8_state more;
249
250	end = s + strlen(s);
251	while (s < end) {
252	if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
253	while (++s < end && more == UTF8_MORE)
254	more = utf8_append(&ud, *s);
255	if (more == UTF8_DONE)
256	continue;
257	return (`0`);
258	}
259	if (s < `0x20` \|\| s > `0x7e`)
260	return (`0`);
261	s++;
262	}
263	return (`1`);
264	}
265
266	/*
267	* Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
268	* the returned string. Anything not valid printable ASCII or UTF-8 is
269	* stripped.
270	*/
271	char *
272	utf8_sanitize(const char *src)
273	{
274	char *dst;
275	size_t n;
276	enum utf8_state more;
277	struct utf8_data ud;
278	u_int i;
279
280	dst = NULL;
281
282	n = `0`;
283	while (*src != `'\0'`) {
284	dst = xreallocarray(dst, n + `1`, sizeof *dst);
285	if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
286	while (*++src != `'\0'` && more == UTF8_MORE)
287	more = utf8_append(&ud, *src);
288	if (more == UTF8_DONE) {
289	dst = xreallocarray(dst, n + ud.width,
290	sizeof *dst);
291	for (i = `0`; i < ud.width; i++)
292	dst[n++] = `'_'`;
293	continue;
294	}
295	src -= ud.have;
296	}
297	if (src > `0x1f` && src < `0x7f`)
298	dst[n++] = *src;
299	else
300	dst[n++] = `'_'`;
301	src++;
302	}
303
304	dst = xreallocarray(dst, n + `1`, sizeof *dst);
305	dst[n] = `'\0'`;
306	return (dst);
307	}
308
309	/ Get UTF-8 buffer length. /
310	size_t
311	utf8_strlen(const struct utf8_data *s)
312	{
313	size_t i;
314
315	for (i = `0`; s[i].size != `0`; i++)
316	/ nothing /;
317	return (i);
318	}
319
320	/ Get UTF-8 string width. /
321	u_int
322	utf8_strwidth(const struct utf8_data *s, ssize_t n)
323	{
324	ssize_t i;
325	u_int width;
326
327	width = `0`;
328	for (i = `0`; s[i].size != `0`; i++) {
329	if (n != -`1` && n == i)
330	break;
331	width += s[i].width;
332	}
333	return (width);
334	}
335
336	/*
337	* Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
338	* Caller frees.
339	*/
340	struct utf8_data *
341	utf8_fromcstr(const char *src)
342	{
343	struct utf8_data *dst;
344	size_t n;
345	enum utf8_state more;
346
347	dst = NULL;
348
349	n = `0`;
350	while (*src != `'\0'`) {
351	dst = xreallocarray(dst, n + `1`, sizeof *dst);
352	if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
353	while (*++src != `'\0'` && more == UTF8_MORE)
354	more = utf8_append(&dst[n], *src);
355	if (more == UTF8_DONE) {
356	n++;
357	continue;
358	}
359	src -= dst[n].have;
360	}
361	utf8_set(&dst[n], *src);
362	n++;
363	src++;
364	}
365
366	dst = xreallocarray(dst, n + `1`, sizeof *dst);
367	dst[n].size = `0`;
368	return (dst);
369	}
370
371	/ Convert from a buffer of UTF-8 characters into a string. Caller frees. /
372	char *
373	utf8_tocstr(struct utf8_data *src)
374	{
375	char *dst;
376	size_t n;
377
378	dst = NULL;
379
380	n = `0`;
381	for(; src->size != `0`; src++) {
382	dst = xreallocarray(dst, n + src->size, `1`);
383	memcpy(dst + n, src->data, src->size);
384	n += src->size;
385	}
386
387	dst = xreallocarray(dst, n + `1`, `1`);
388	dst[n] = `'\0'`;
389	return (dst);
390	}
391
392	/ Get width of UTF-8 string. /
393	u_int
394	utf8_cstrwidth(const char *s)
395	{
396	struct utf8_data tmp;
397	u_int width;
398	enum utf8_state more;
399
400	width = `0`;
401	while (*s != `'\0'`) {
402	if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
403	while (*++s != `'\0'` && more == UTF8_MORE)
404	more = utf8_append(&tmp, *s);
405	if (more == UTF8_DONE) {
406	width += tmp.width;
407	continue;
408	}
409	s -= tmp.have;
410	}
411	if (s > `0x1f` && s != `0x7f`)
412	width++;
413	s++;
414	}
415	return (width);
416	}
417
418	/ Pad UTF-8 string to width on the left. Caller frees. /
419	char *
420	utf8_padcstr(const char *s, u_int width)
421	{
422	size_t slen;
423	char *out;
424	u_int n, i;
425
426	n = utf8_cstrwidth(s);
427	if (n >= width)
428	return (xstrdup(s));
429
430	slen = strlen(s);
431	out = xmalloc(slen + `1` + (width - n));
432	memcpy(out, s, slen);
433	for (i = n; i < width; i++)
434	out[slen++] = `' '`;
435	out[slen] = `'\0'`;
436	return (out);
437	}
438
439	/ Pad UTF-8 string to width on the right. Caller frees. /
440	char *
441	utf8_rpadcstr(const char *s, u_int width)
442	{
443	size_t slen;
444	char *out;
445	u_int n, i;
446
447	n = utf8_cstrwidth(s);
448	if (n >= width)
449	return (xstrdup(s));
450
451	slen = strlen(s);
452	out = xmalloc(slen + `1` + (width - n));
453	for (i = `0`; i < width - n; i++)
454	out[i] = `' '`;
455	memcpy(out + i, s, slen);
456	out[i + slen] = `'\0'`;
457	return (out);
458	}
459
460	int
461	utf8_cstrhas(const char s, const* struct utf8_data *ud)
462	{
463	struct utf8_data copy, loop;
464	int found = `0`;
465
466	copy = utf8_fromcstr(s);
467	for (loop = copy; loop->size != `0`; loop++) {
468	if (loop->size != ud->size)
469	continue;
470	if (memcmp(loop->data, ud->data, loop->size) == `0`) {
471	found = `1`;
472	break;
473	}
474	}
475	free(copy);
476
477	return (found);
478	}
479

Browse the source code of tmux/utf8.c