1 | /* ----------------------------------------------------------------------- * |
2 | * |
3 | * Copyright 1996-2009 The NASM Authors - All Rights Reserved |
4 | * See the file AUTHORS included with the NASM distribution for |
5 | * the specific copyright holders. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following |
9 | * conditions are met: |
10 | * |
11 | * * Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. |
13 | * * Redistributions in binary form must reproduce the above |
14 | * copyright notice, this list of conditions and the following |
15 | * disclaimer in the documentation and/or other materials provided |
16 | * with the distribution. |
17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
19 | * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, |
20 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
21 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
23 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
25 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
26 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
28 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
29 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
30 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
31 | * |
32 | * ----------------------------------------------------------------------- */ |
33 | |
34 | /* |
35 | * strfunc.c |
36 | * |
37 | * String transformation functions |
38 | */ |
39 | |
40 | #include "nasmlib.h" |
41 | #include "nasm.h" |
42 | |
/*
 * Convert a string in UTF-8 format to UTF-16LE
 *
 *   str  input bytes
 *   len  number of input bytes
 *   op   output cursor, or NULL to validate and measure the input
 *        without storing anything
 *
 * Every 16-bit code unit is handed to WRITESHORT(); code points above
 * 0xffff are split into a surrogate pair, high surrogate first.
 *
 * Returns the output size in bytes (two per code unit), or (size_t)-1
 * when the input is rejected: a continuation byte where a lead byte is
 * required, a lead byte of 0xfe/0xff, a non-continuation byte inside a
 * sequence, an overlong encoding, a surrogate code point
 * (0xd800..0xdfff), a value above 0x10ffff, or a sequence cut short by
 * the end of the input.
 */
static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
{
#define PUT_UNIT(u)                             \
    do {                                        \
        if (op) {                               \
            WRITESHORT(op, u);                  \
        }                                       \
        units++;                                \
    } while (0)

    size_t units = 0;           /* 16-bit code units produced so far */
    size_t pos;
    int pending = 0;            /* continuation bytes still owed */
    uint32_t cp = 0;            /* code point being assembled */
    uint32_t lowest = 0;        /* smallest value legal for this length */

    for (pos = 0; pos < len; pos++) {
        const uint8_t byte = str[pos];

        if (!pending) {
            /* First byte of a character */
            if (byte < 0x80) {
                PUT_UNIT(byte);
                continue;
            }
            if (byte < 0xc0 || byte >= 0xfe)
                return (size_t)-1;      /* Invalid UTF-8 */

            if (byte < 0xe0) {
                pending = 1;
                cp = byte & 0x1f;
                lowest = 0x80;
            } else if (byte < 0xf0) {
                pending = 2;
                cp = byte & 0x0f;
                lowest = 0x800;
            } else if (byte < 0xf8) {
                pending = 3;
                cp = byte & 0x07;
                lowest = 0x10000;
            } else if (byte < 0xfc) {
                pending = 4;
                cp = byte & 0x03;
                lowest = 0x200000;
            } else {
                pending = 5;
                cp = byte & 0x01;
                lowest = 0x4000000;
            }
            continue;
        }

        /* Inside a multi-byte sequence: only 10xxxxxx is acceptable */
        if ((byte & 0xc0) != 0x80)
            return (size_t)-1;

        cp = (cp << 6) | (byte & 0x3f);
        if (--pending)
            continue;           /* more continuation bytes to come */

        /* Sequence complete: reject overlong, out-of-range, surrogates */
        if (cp < lowest || cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
            return (size_t)-1;

        if (cp > 0xffff) {
            cp -= 0x10000;
            PUT_UNIT(0xd800 | (cp >> 10));
            PUT_UNIT(0xdc00 | (cp & 0x3ff));
        } else {
            PUT_UNIT(cp);
        }
    }

    /* Input that stops in the middle of a sequence is an error */
    if (pending)
        return (size_t)-1;

    return units * 2;

#undef PUT_UNIT
}
112 | |
/*
 * Convert a string in UTF-8 format to UTF-16BE
 *
 *   str  input bytes
 *   len  number of input bytes
 *   op   output cursor, or NULL to validate and measure the input
 *        without storing anything
 *
 * Each 16-bit code unit is written with two WRITECHAR() calls, high
 * byte first.  Code points above 0xffff become a surrogate pair.
 *
 * Returns the output size in bytes (two per code unit), or (size_t)-1
 * when the input is rejected: a continuation byte where a lead byte is
 * required, a lead byte of 0xfe/0xff, a non-continuation byte inside a
 * sequence, an overlong encoding, a surrogate code point
 * (0xd800..0xdfff), a value above 0x10ffff, or a sequence cut short by
 * the end of the input.
 */
static size_t utf8_to_16be(uint8_t *str, size_t len, char *op)
{
#define EMIT(x)                                 \
    do {                                        \
        uint16_t _y = (x);                      \
        if (op) {                               \
            WRITECHAR(op, _y >> 8);             \
            WRITECHAR(op, _y);                  \
        }                                       \
        outlen++;                               \
    } while (0)

    size_t outlen = 0;          /* 16-bit code units produced so far */
    int expect = 0;             /* continuation bytes still owed */
    uint8_t c;
    uint32_t v = 0;             /* code point being assembled */
    uint32_t vmin = 0;          /* smallest value legal for this length */

    while (len--) {
        c = *str++;

        if (expect) {
            /* Inside a multi-byte sequence: only 10xxxxxx is acceptable */
            if ((c & 0xc0) != 0x80)
                return (size_t)-1;

            v = (v << 6) | (c & 0x3f);
            if (!--expect) {
                if (v < vmin || v > 0x10ffff ||
                    (v >= 0xd800 && v <= 0xdfff)) {
                    /* Overlong, out of range, or a surrogate */
                    return (size_t)-1;
                } else if (v > 0xffff) {
                    /*
                     * The high (leading) surrogate always precedes the
                     * low (trailing) one; big- versus little-endian only
                     * changes the byte order inside each code unit.
                     */
                    v -= 0x10000;
                    EMIT(0xd800 | (v >> 10));
                    EMIT(0xdc00 | (v & 0x3ff));
                } else {
                    EMIT(v);
                }
            }
            continue;
        }

        if (c < 0x80) {
            EMIT(c);
        } else if (c < 0xc0 || c >= 0xfe) {
            /* Invalid UTF-8 */
            return (size_t)-1;
        } else if (c < 0xe0) {
            v = c & 0x1f;
            expect = 1;
            vmin = 0x80;
        } else if (c < 0xf0) {
            v = c & 0x0f;
            expect = 2;
            vmin = 0x800;
        } else if (c < 0xf8) {
            v = c & 0x07;
            expect = 3;
            vmin = 0x10000;
        } else if (c < 0xfc) {
            v = c & 0x03;
            expect = 4;
            vmin = 0x200000;
        } else {
            v = c & 0x01;
            expect = 5;
            vmin = 0x4000000;
        }
    }

    /* Input that stops in the middle of a sequence is an error */
    return expect ? (size_t)-1 : outlen << 1;

#undef EMIT
}
190 | |
/*
 * Convert a string in UTF-8 format to UTF-32LE
 *
 *   str  input bytes
 *   len  number of input bytes
 *   op   output cursor, or NULL to validate and measure the input
 *        without storing anything
 *
 * Every decoded value is handed to WRITELONG() as one 32-bit unit.
 *
 * Returns the output size in bytes (four per character), or
 * (size_t)-1 when the input is rejected: a continuation byte where a
 * lead byte is required, a lead byte of 0xfe/0xff, a non-continuation
 * byte inside a sequence, an overlong encoding, a surrogate code point
 * (0xd800..0xdfff), or a sequence cut short by the end of the input.
 * No upper bound is applied to the decoded value, so the 5- and 6-byte
 * forms pass through as long as they are not overlong.
 */
static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
{
#define PUT_UNIT(u)                             \
    do {                                        \
        if (op) {                               \
            WRITELONG(op, u);                   \
        }                                       \
        units++;                                \
    } while (0)

    size_t units = 0;           /* 32-bit units produced so far */
    size_t pos;
    int pending = 0;            /* continuation bytes still owed */
    uint32_t cp = 0;            /* value being assembled */
    uint32_t lowest = 0;        /* smallest value legal for this length */

    for (pos = 0; pos < len; pos++) {
        const uint8_t byte = str[pos];

        if (!pending) {
            /* First byte of a character */
            if (byte < 0x80) {
                PUT_UNIT(byte);
                continue;
            }
            if (byte < 0xc0 || byte >= 0xfe)
                return (size_t)-1;      /* Invalid UTF-8 */

            if (byte < 0xe0) {
                pending = 1;
                cp = byte & 0x1f;
                lowest = 0x80;
            } else if (byte < 0xf0) {
                pending = 2;
                cp = byte & 0x0f;
                lowest = 0x800;
            } else if (byte < 0xf8) {
                pending = 3;
                cp = byte & 0x07;
                lowest = 0x10000;
            } else if (byte < 0xfc) {
                pending = 4;
                cp = byte & 0x03;
                lowest = 0x200000;
            } else {
                pending = 5;
                cp = byte & 0x01;
                lowest = 0x4000000;
            }
            continue;
        }

        /* Inside a multi-byte sequence: only 10xxxxxx is acceptable */
        if ((byte & 0xc0) != 0x80)
            return (size_t)-1;

        cp = (cp << 6) | (byte & 0x3f);
        if (--pending)
            continue;           /* more continuation bytes to come */

        /* Sequence complete: reject overlong encodings and surrogates */
        if (cp < lowest || (cp >= 0xd800 && cp <= 0xdfff))
            return (size_t)-1;

        PUT_UNIT(cp);
    }

    /* Input that stops in the middle of a sequence is an error */
    if (pending)
        return (size_t)-1;

    return units * 4;

#undef PUT_UNIT
}
254 | |
/*
 * Convert a string in UTF-8 format to UTF-32BE
 *
 *   str  input bytes
 *   len  number of input bytes
 *   op   output cursor, or NULL to validate and measure the input
 *        without storing anything
 *
 * Every decoded value is written with four WRITECHAR() calls, most
 * significant byte first.
 *
 * Returns the output size in bytes (four per character), or
 * (size_t)-1 when the input is rejected: a continuation byte where a
 * lead byte is required, a lead byte of 0xfe/0xff, a non-continuation
 * byte inside a sequence, an overlong encoding, a surrogate code point
 * (0xd800..0xdfff), or a sequence cut short by the end of the input.
 * No upper bound is applied to the decoded value, so the 5- and 6-byte
 * forms pass through as long as they are not overlong.
 */
static size_t utf8_to_32be(uint8_t *str, size_t len, char *op)
{
#define PUT_UNIT(u)                             \
    do {                                        \
        uint32_t unit_ = (u);                   \
        if (op) {                               \
            WRITECHAR(op, unit_ >> 24);         \
            WRITECHAR(op, unit_ >> 16);         \
            WRITECHAR(op, unit_ >> 8);          \
            WRITECHAR(op, unit_);               \
        }                                       \
        units++;                                \
    } while (0)

    size_t units = 0;           /* 32-bit units produced so far */
    size_t pos;
    int pending = 0;            /* continuation bytes still owed */
    uint32_t cp = 0;            /* value being assembled */
    uint32_t lowest = 0;        /* smallest value legal for this length */

    for (pos = 0; pos < len; pos++) {
        const uint8_t byte = str[pos];

        if (!pending) {
            /* First byte of a character */
            if (byte < 0x80) {
                PUT_UNIT(byte);
                continue;
            }
            if (byte < 0xc0 || byte >= 0xfe)
                return (size_t)-1;      /* Invalid UTF-8 */

            if (byte < 0xe0) {
                pending = 1;
                cp = byte & 0x1f;
                lowest = 0x80;
            } else if (byte < 0xf0) {
                pending = 2;
                cp = byte & 0x0f;
                lowest = 0x800;
            } else if (byte < 0xf8) {
                pending = 3;
                cp = byte & 0x07;
                lowest = 0x10000;
            } else if (byte < 0xfc) {
                pending = 4;
                cp = byte & 0x03;
                lowest = 0x200000;
            } else {
                pending = 5;
                cp = byte & 0x01;
                lowest = 0x4000000;
            }
            continue;
        }

        /* Inside a multi-byte sequence: only 10xxxxxx is acceptable */
        if ((byte & 0xc0) != 0x80)
            return (size_t)-1;

        cp = (cp << 6) | (byte & 0x3f);
        if (--pending)
            continue;           /* more continuation bytes to come */

        /* Sequence complete: reject overlong encodings and surrogates */
        if (cp < lowest || (cp >= 0xd800 && cp <= 0xdfff))
            return (size_t)-1;

        PUT_UNIT(cp);
    }

    /* Input that stops in the middle of a sequence is an error */
    if (pending)
        return (size_t)-1;

    return units * 4;

#undef PUT_UNIT
}
328 | |
329 | typedef size_t (*transform_func)(uint8_t *, size_t, char *); |
330 | |
331 | /* |
332 | * Apply a specific string transform and return it in a nasm_malloc'd |
333 | * buffer, returning the length. On error, returns (size_t)-1 and no |
334 | * buffer is allocated. |
335 | */ |
336 | size_t string_transform(char *str, size_t len, char **out, enum strfunc func) |
337 | { |
338 | /* This should match enum strfunc in nasm.h */ |
339 | static const transform_func str_transforms[] = { |
340 | utf8_to_16le, |
341 | utf8_to_16le, |
342 | utf8_to_16be, |
343 | utf8_to_32le, |
344 | utf8_to_32le, |
345 | utf8_to_32be, |
346 | }; |
347 | transform_func transform = str_transforms[func]; |
348 | size_t outlen; |
349 | uint8_t *s = (uint8_t *)str; |
350 | char *buf; |
351 | |
352 | outlen = transform(s, len, NULL); |
353 | if (outlen == (size_t)-1) |
354 | return -1; |
355 | |
356 | *out = buf = nasm_malloc(outlen+1); |
357 | buf[outlen] = '\0'; /* Forcibly null-terminate the buffer */ |
358 | return transform(s, len, buf); |
359 | } |
360 | |