1 | /* A fuzz test for CPython. |
2 | |
3 | The only exposed function is LLVMFuzzerTestOneInput, which is called by |
4 | fuzzers and by the _fuzz module for smoke tests. |
5 | |
6 | To build exactly one fuzz test, as when running in oss-fuzz etc., |
7 | build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build |
8 | LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with |
9 | -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float. |
10 | |
11 | See the source code for LLVMFuzzerTestOneInput for details. */ |
12 | |
#include <Python.h>
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
16 | |
17 | /* Fuzz PyFloat_FromString as a proxy for float(str). */ |
18 | static int fuzz_builtin_float(const char* data, size_t size) { |
19 | PyObject* s = PyBytes_FromStringAndSize(data, size); |
20 | if (s == NULL) return 0; |
21 | PyObject* f = PyFloat_FromString(s); |
22 | if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) { |
23 | PyErr_Clear(); |
24 | } |
25 | |
26 | Py_XDECREF(f); |
27 | Py_DECREF(s); |
28 | return 0; |
29 | } |
30 | |
31 | #define MAX_INT_TEST_SIZE 0x10000 |
32 | |
33 | /* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */ |
34 | static int fuzz_builtin_int(const char* data, size_t size) { |
35 | /* Ignore test cases with very long ints to avoid timeouts |
36 | int("9" * 1000000) is not a very interesting test caase */ |
37 | if (size > MAX_INT_TEST_SIZE) { |
38 | return 0; |
39 | } |
40 | /* Pick a random valid base. (When the fuzzed function takes extra |
41 | parameters, it's somewhat normal to hash the input to generate those |
42 | parameters. We want to exercise all code paths, so we do so here.) */ |
43 | int base = _Py_HashBytes(data, size) % 37; |
44 | if (base == 1) { |
45 | // 1 is the only number between 0 and 36 that is not a valid base. |
46 | base = 0; |
47 | } |
48 | if (base == -1) { |
49 | return 0; // An error occurred, bail early. |
50 | } |
51 | if (base < 0) { |
52 | base = -base; |
53 | } |
54 | |
55 | PyObject* s = PyUnicode_FromStringAndSize(data, size); |
56 | if (s == NULL) { |
57 | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
58 | PyErr_Clear(); |
59 | } |
60 | return 0; |
61 | } |
62 | PyObject* l = PyLong_FromUnicodeObject(s, base); |
63 | if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { |
64 | PyErr_Clear(); |
65 | } |
66 | PyErr_Clear(); |
67 | Py_XDECREF(l); |
68 | Py_DECREF(s); |
69 | return 0; |
70 | } |
71 | |
72 | /* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */ |
73 | static int fuzz_builtin_unicode(const char* data, size_t size) { |
74 | PyObject* s = PyUnicode_FromStringAndSize(data, size); |
75 | if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
76 | PyErr_Clear(); |
77 | } |
78 | Py_XDECREF(s); |
79 | return 0; |
80 | } |
81 | |
82 | |
83 | PyObject* struct_unpack_method = NULL; |
84 | PyObject* struct_error = NULL; |
85 | /* Called by LLVMFuzzerTestOneInput for initialization */ |
86 | static int init_struct_unpack(void) { |
87 | /* Import struct.unpack */ |
88 | PyObject* struct_module = PyImport_ImportModule("struct" ); |
89 | if (struct_module == NULL) { |
90 | return 0; |
91 | } |
92 | struct_error = PyObject_GetAttrString(struct_module, "error" ); |
93 | if (struct_error == NULL) { |
94 | return 0; |
95 | } |
96 | struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack" ); |
97 | return struct_unpack_method != NULL; |
98 | } |
99 | /* Fuzz struct.unpack(x, y) */ |
100 | static int fuzz_struct_unpack(const char* data, size_t size) { |
101 | /* Everything up to the first null byte is considered the |
102 | format. Everything after is the buffer */ |
103 | const char* first_null = memchr(data, '\0', size); |
104 | if (first_null == NULL) { |
105 | return 0; |
106 | } |
107 | |
108 | size_t format_length = first_null - data; |
109 | size_t buffer_length = size - format_length - 1; |
110 | |
111 | PyObject* pattern = PyBytes_FromStringAndSize(data, format_length); |
112 | if (pattern == NULL) { |
113 | return 0; |
114 | } |
115 | PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length); |
116 | if (buffer == NULL) { |
117 | Py_DECREF(pattern); |
118 | return 0; |
119 | } |
120 | |
121 | PyObject* unpacked = PyObject_CallFunctionObjArgs( |
122 | struct_unpack_method, pattern, buffer, NULL); |
123 | /* Ignore any overflow errors, these are easily triggered accidentally */ |
124 | if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) { |
125 | PyErr_Clear(); |
126 | } |
127 | /* The pascal format string will throw a negative size when passing 0 |
128 | like: struct.unpack('0p', b'') */ |
129 | if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) { |
130 | PyErr_Clear(); |
131 | } |
132 | /* Ignore any struct.error exceptions, these can be caused by invalid |
133 | formats or incomplete buffers both of which are common. */ |
134 | if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) { |
135 | PyErr_Clear(); |
136 | } |
137 | |
138 | Py_XDECREF(unpacked); |
139 | Py_DECREF(pattern); |
140 | Py_DECREF(buffer); |
141 | return 0; |
142 | } |
143 | |
144 | |
145 | #define MAX_JSON_TEST_SIZE 0x10000 |
146 | |
147 | PyObject* json_loads_method = NULL; |
148 | /* Called by LLVMFuzzerTestOneInput for initialization */ |
149 | static int init_json_loads(void) { |
150 | /* Import json.loads */ |
151 | PyObject* json_module = PyImport_ImportModule("json" ); |
152 | if (json_module == NULL) { |
153 | return 0; |
154 | } |
155 | json_loads_method = PyObject_GetAttrString(json_module, "loads" ); |
156 | return json_loads_method != NULL; |
157 | } |
158 | /* Fuzz json.loads(x) */ |
159 | static int fuzz_json_loads(const char* data, size_t size) { |
160 | /* Since python supports arbitrarily large ints in JSON, |
161 | long inputs can lead to timeouts on boring inputs like |
162 | `json.loads("9" * 100000)` */ |
163 | if (size > MAX_JSON_TEST_SIZE) { |
164 | return 0; |
165 | } |
166 | PyObject* input_bytes = PyBytes_FromStringAndSize(data, size); |
167 | if (input_bytes == NULL) { |
168 | return 0; |
169 | } |
170 | PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes); |
171 | if (parsed == NULL) { |
172 | /* Ignore ValueError as the fuzzer will more than likely |
173 | generate some invalid json and values */ |
174 | if (PyErr_ExceptionMatches(PyExc_ValueError) || |
175 | /* Ignore RecursionError as the fuzzer generates long sequences of |
176 | arrays such as `[[[...` */ |
177 | PyErr_ExceptionMatches(PyExc_RecursionError) || |
178 | /* Ignore unicode errors, invalid byte sequences are common */ |
179 | PyErr_ExceptionMatches(PyExc_UnicodeDecodeError) |
180 | ) { |
181 | PyErr_Clear(); |
182 | } |
183 | } |
184 | Py_DECREF(input_bytes); |
185 | Py_XDECREF(parsed); |
186 | return 0; |
187 | } |
188 | |
189 | #define MAX_RE_TEST_SIZE 0x10000 |
190 | |
191 | PyObject* sre_compile_method = NULL; |
192 | PyObject* sre_error_exception = NULL; |
193 | int SRE_FLAG_DEBUG = 0; |
194 | /* Called by LLVMFuzzerTestOneInput for initialization */ |
195 | static int init_sre_compile(void) { |
196 | /* Import sre_compile.compile and sre.error */ |
197 | PyObject* sre_compile_module = PyImport_ImportModule("sre_compile" ); |
198 | if (sre_compile_module == NULL) { |
199 | return 0; |
200 | } |
201 | sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile" ); |
202 | if (sre_compile_method == NULL) { |
203 | return 0; |
204 | } |
205 | |
206 | PyObject* sre_constants = PyImport_ImportModule("sre_constants" ); |
207 | if (sre_constants == NULL) { |
208 | return 0; |
209 | } |
210 | sre_error_exception = PyObject_GetAttrString(sre_constants, "error" ); |
211 | if (sre_error_exception == NULL) { |
212 | return 0; |
213 | } |
214 | PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG" ); |
215 | if (debug_flag == NULL) { |
216 | return 0; |
217 | } |
218 | SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag); |
219 | return 1; |
220 | } |
221 | /* Fuzz _sre.compile(x) */ |
222 | static int fuzz_sre_compile(const char* data, size_t size) { |
223 | /* Ignore really long regex patterns that will timeout the fuzzer */ |
224 | if (size > MAX_RE_TEST_SIZE) { |
225 | return 0; |
226 | } |
227 | /* We treat the first 2 bytes of the input as a number for the flags */ |
228 | if (size < 2) { |
229 | return 0; |
230 | } |
231 | uint16_t flags = ((uint16_t*) data)[0]; |
232 | /* We remove the SRE_FLAG_DEBUG if present. This is because it |
233 | prints to stdout which greatly decreases fuzzing speed */ |
234 | flags &= ~SRE_FLAG_DEBUG; |
235 | |
236 | /* Pull the pattern from the remaining bytes */ |
237 | PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2); |
238 | if (pattern_bytes == NULL) { |
239 | return 0; |
240 | } |
241 | PyObject* flags_obj = PyLong_FromUnsignedLong(flags); |
242 | if (flags_obj == NULL) { |
243 | Py_DECREF(pattern_bytes); |
244 | return 0; |
245 | } |
246 | |
247 | /* compiled = _sre.compile(data[2:], data[0:2] */ |
248 | PyObject* compiled = PyObject_CallFunctionObjArgs( |
249 | sre_compile_method, pattern_bytes, flags_obj, NULL); |
250 | /* Ignore ValueError as the fuzzer will more than likely |
251 | generate some invalid combination of flags */ |
252 | if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) { |
253 | PyErr_Clear(); |
254 | } |
255 | /* Ignore some common errors thrown by sre_parse: |
256 | Overflow, Assertion, Recursion and Index */ |
257 | if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) || |
258 | PyErr_ExceptionMatches(PyExc_AssertionError) || |
259 | PyErr_ExceptionMatches(PyExc_RecursionError) || |
260 | PyErr_ExceptionMatches(PyExc_IndexError)) |
261 | ) { |
262 | PyErr_Clear(); |
263 | } |
264 | /* Ignore re.error */ |
265 | if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) { |
266 | PyErr_Clear(); |
267 | } |
268 | |
269 | Py_DECREF(pattern_bytes); |
270 | Py_DECREF(flags_obj); |
271 | Py_XDECREF(compiled); |
272 | return 0; |
273 | } |
274 | |
/* Fixed set of patterns used to test re.match.  Entries are addressed by
   index, so their order is significant.
   Be careful not to add catastrophically slow regexes here, we want to
   exercise the matching code without causing timeouts. */
static const char* regex_patterns[] = {
    ".",
    "^",
    "abc",
    "abc|def",
    "^xxx$",
    "\\b",
    "()",
    "[a-zA-Z0-9]",
    "abc+",
    "[^A-Z]",
    "[x]",
    "(?=)",
    "a{z}",
    "a+b",
    "a*?",
    "a??",
    "a+?",
    "{}",
    "a{,}",
    "{",
    "}",
    "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
    "(?:a*)*",
    "a{1,2}?"
};
/* Number of entries in regex_patterns. */
const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(*regex_patterns);
285 | PyObject** compiled_patterns = NULL; |
286 | /* Called by LLVMFuzzerTestOneInput for initialization */ |
287 | static int init_sre_match(void) { |
288 | PyObject* re_module = PyImport_ImportModule("re" ); |
289 | if (re_module == NULL) { |
290 | return 0; |
291 | } |
292 | compiled_patterns = (PyObject**) PyMem_RawMalloc( |
293 | sizeof(PyObject*) * NUM_PATTERNS); |
294 | if (compiled_patterns == NULL) { |
295 | PyErr_NoMemory(); |
296 | return 0; |
297 | } |
298 | |
299 | /* Precompile all the regex patterns on the first run for faster fuzzing */ |
300 | for (size_t i = 0; i < NUM_PATTERNS; i++) { |
301 | PyObject* compiled = PyObject_CallMethod( |
302 | re_module, "compile" , "y" , regex_patterns[i]); |
303 | /* Bail if any of the patterns fail to compile */ |
304 | if (compiled == NULL) { |
305 | return 0; |
306 | } |
307 | compiled_patterns[i] = compiled; |
308 | } |
309 | return 1; |
310 | } |
311 | /* Fuzz re.match(x) */ |
312 | static int fuzz_sre_match(const char* data, size_t size) { |
313 | if (size < 1 || size > MAX_RE_TEST_SIZE) { |
314 | return 0; |
315 | } |
316 | /* Use the first byte as a uint8_t specifying the index of the |
317 | regex to use */ |
318 | unsigned char idx = (unsigned char) data[0]; |
319 | idx = idx % NUM_PATTERNS; |
320 | |
321 | /* Pull the string to match from the remaining bytes */ |
322 | PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1); |
323 | if (to_match == NULL) { |
324 | return 0; |
325 | } |
326 | |
327 | PyObject* pattern = compiled_patterns[idx]; |
328 | PyObject* match_callable = PyObject_GetAttrString(pattern, "match" ); |
329 | |
330 | PyObject* matches = PyObject_CallOneArg(match_callable, to_match); |
331 | |
332 | Py_XDECREF(matches); |
333 | Py_DECREF(match_callable); |
334 | Py_DECREF(to_match); |
335 | return 0; |
336 | } |
337 | |
338 | #define MAX_CSV_TEST_SIZE 0x10000 |
339 | PyObject* csv_module = NULL; |
340 | PyObject* csv_error = NULL; |
341 | /* Called by LLVMFuzzerTestOneInput for initialization */ |
342 | static int init_csv_reader(void) { |
343 | /* Import csv and csv.Error */ |
344 | csv_module = PyImport_ImportModule("csv" ); |
345 | if (csv_module == NULL) { |
346 | return 0; |
347 | } |
348 | csv_error = PyObject_GetAttrString(csv_module, "Error" ); |
349 | return csv_error != NULL; |
350 | } |
351 | /* Fuzz csv.reader([x]) */ |
352 | static int fuzz_csv_reader(const char* data, size_t size) { |
353 | if (size < 1 || size > MAX_CSV_TEST_SIZE) { |
354 | return 0; |
355 | } |
356 | /* Ignore non null-terminated strings since _csv can't handle |
357 | embedded nulls */ |
358 | if (memchr(data, '\0', size) == NULL) { |
359 | return 0; |
360 | } |
361 | |
362 | PyObject* s = PyUnicode_FromString(data); |
363 | /* Ignore exceptions until we have a valid string */ |
364 | if (s == NULL) { |
365 | PyErr_Clear(); |
366 | return 0; |
367 | } |
368 | |
369 | /* Split on \n so we can test multiple lines */ |
370 | PyObject* lines = PyObject_CallMethod(s, "split" , "s" , "\n" ); |
371 | if (lines == NULL) { |
372 | Py_DECREF(s); |
373 | return 0; |
374 | } |
375 | |
376 | PyObject* reader = PyObject_CallMethod(csv_module, "reader" , "N" , lines); |
377 | if (reader) { |
378 | /* Consume all of the reader as an iterator */ |
379 | PyObject* parsed_line; |
380 | while ((parsed_line = PyIter_Next(reader))) { |
381 | Py_DECREF(parsed_line); |
382 | } |
383 | } |
384 | |
385 | /* Ignore csv.Error because we're probably going to generate |
386 | some bad files (embedded new-lines, unterminated quotes etc) */ |
387 | if (PyErr_ExceptionMatches(csv_error)) { |
388 | PyErr_Clear(); |
389 | } |
390 | |
391 | Py_XDECREF(reader); |
392 | Py_DECREF(s); |
393 | return 0; |
394 | } |
395 | |
/* Run one fuzz target on the given input and abort on failure.

   data/size are forwarded to `fuzzer` (data reinterpreted as const char*).
   If a Python exception is still pending afterwards, it is printed and the
   process is aborted.  Otherwise the target's return value is returned. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    const char* input = (const char*) data;
    int result = fuzzer(input, size);

    /* Fuzz tests should handle expected errors for themselves.
       This is a last-ditch check in case they didn't. */
    if (PyErr_Occurred() != NULL) {
        PyErr_Print();
        abort();
    }

    /* Someday the return value might mean something, propagate it. */
    return result;
}
408 | |
/* Unconditionally returns 1.  Original rationale: CPython generates a lot
   of leak warnings for whatever reason.
   NOTE(review): presumably the LeakSanitizer opt-out hook -- confirm against
   the sanitizer documentation. */
int __lsan_is_turned_off(void) {
    const int leak_checking_off = 1;
    return leak_checking_off;
}
411 | |
412 | |
/* Fuzzer start-up hook: registers the program name with CPython.

   The first element of the argument vector is decoded with Py_DecodeLocale
   and passed to Py_SetProgramName.  argc is not used.  Always returns 0.
   NOTE(review): the decoded string is never freed here; presumably
   Py_SetProgramName keeps using the buffer -- confirm before adding a free. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
    char* program_path = (*argv)[0];
    wchar_t* wide_program_name = Py_DecodeLocale(program_path, NULL);
    Py_SetProgramName(wide_program_name);
    return 0;
}
418 | |
419 | /* Fuzz test interface. |
420 | This returns the bitwise or of all fuzz test's return values. |
421 | |
422 | All fuzz tests must return 0, as all nonzero return codes are reserved for |
423 | future use -- we propagate the return values for that future case. |
424 | (And we bitwise or when running multiple tests to verify that normally we |
425 | only return 0.) */ |
426 | int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { |
427 | if (!Py_IsInitialized()) { |
428 | /* LLVMFuzzerTestOneInput is called repeatedly from the same process, |
429 | with no separate initialization phase, sadly, so we need to |
430 | initialize CPython ourselves on the first run. */ |
431 | Py_InitializeEx(0); |
432 | } |
433 | |
434 | int rv = 0; |
435 | |
436 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float) |
437 | rv |= _run_fuzz(data, size, fuzz_builtin_float); |
438 | #endif |
439 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int) |
440 | rv |= _run_fuzz(data, size, fuzz_builtin_int); |
441 | #endif |
442 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode) |
443 | rv |= _run_fuzz(data, size, fuzz_builtin_unicode); |
444 | #endif |
445 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack) |
446 | static int STRUCT_UNPACK_INITIALIZED = 0; |
447 | if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) { |
448 | PyErr_Print(); |
449 | abort(); |
450 | } else { |
451 | STRUCT_UNPACK_INITIALIZED = 1; |
452 | } |
453 | rv |= _run_fuzz(data, size, fuzz_struct_unpack); |
454 | #endif |
455 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads) |
456 | static int JSON_LOADS_INITIALIZED = 0; |
457 | if (!JSON_LOADS_INITIALIZED && !init_json_loads()) { |
458 | PyErr_Print(); |
459 | abort(); |
460 | } else { |
461 | JSON_LOADS_INITIALIZED = 1; |
462 | } |
463 | |
464 | rv |= _run_fuzz(data, size, fuzz_json_loads); |
465 | #endif |
466 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile) |
467 | static int SRE_COMPILE_INITIALIZED = 0; |
468 | if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) { |
469 | PyErr_Print(); |
470 | abort(); |
471 | } else { |
472 | SRE_COMPILE_INITIALIZED = 1; |
473 | } |
474 | |
475 | rv |= _run_fuzz(data, size, fuzz_sre_compile); |
476 | #endif |
477 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match) |
478 | static int SRE_MATCH_INITIALIZED = 0; |
479 | if (!SRE_MATCH_INITIALIZED && !init_sre_match()) { |
480 | PyErr_Print(); |
481 | abort(); |
482 | } else { |
483 | SRE_MATCH_INITIALIZED = 1; |
484 | } |
485 | |
486 | rv |= _run_fuzz(data, size, fuzz_sre_match); |
487 | #endif |
488 | #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader) |
489 | static int CSV_READER_INITIALIZED = 0; |
490 | if (!CSV_READER_INITIALIZED && !init_csv_reader()) { |
491 | PyErr_Print(); |
492 | abort(); |
493 | } else { |
494 | CSV_READER_INITIALIZED = 1; |
495 | } |
496 | |
497 | rv |= _run_fuzz(data, size, fuzz_csv_reader); |
498 | #endif |
499 | return rv; |
500 | } |
501 | |