1/* A fuzz test for CPython.
2
3 The only exposed function is LLVMFuzzerTestOneInput, which is called by
4 fuzzers and by the _fuzz module for smoke tests.
5
6 To build exactly one fuzz test, as when running in oss-fuzz etc.,
7 build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
8 LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
9 -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.
10
11 See the source code for LLVMFuzzerTestOneInput for details. */
12
#include <Python.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
16
17/* Fuzz PyFloat_FromString as a proxy for float(str). */
18static int fuzz_builtin_float(const char* data, size_t size) {
19 PyObject* s = PyBytes_FromStringAndSize(data, size);
20 if (s == NULL) return 0;
21 PyObject* f = PyFloat_FromString(s);
22 if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23 PyErr_Clear();
24 }
25
26 Py_XDECREF(f);
27 Py_DECREF(s);
28 return 0;
29}
30
31#define MAX_INT_TEST_SIZE 0x10000
32
33/* Fuzz PyLong_FromUnicodeObject as a proxy for int(str). */
34static int fuzz_builtin_int(const char* data, size_t size) {
35 /* Ignore test cases with very long ints to avoid timeouts
36 int("9" * 1000000) is not a very interesting test caase */
37 if (size > MAX_INT_TEST_SIZE) {
38 return 0;
39 }
40 /* Pick a random valid base. (When the fuzzed function takes extra
41 parameters, it's somewhat normal to hash the input to generate those
42 parameters. We want to exercise all code paths, so we do so here.) */
43 int base = _Py_HashBytes(data, size) % 37;
44 if (base == 1) {
45 // 1 is the only number between 0 and 36 that is not a valid base.
46 base = 0;
47 }
48 if (base == -1) {
49 return 0; // An error occurred, bail early.
50 }
51 if (base < 0) {
52 base = -base;
53 }
54
55 PyObject* s = PyUnicode_FromStringAndSize(data, size);
56 if (s == NULL) {
57 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58 PyErr_Clear();
59 }
60 return 0;
61 }
62 PyObject* l = PyLong_FromUnicodeObject(s, base);
63 if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64 PyErr_Clear();
65 }
66 PyErr_Clear();
67 Py_XDECREF(l);
68 Py_DECREF(s);
69 return 0;
70}
71
72/* Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). */
73static int fuzz_builtin_unicode(const char* data, size_t size) {
74 PyObject* s = PyUnicode_FromStringAndSize(data, size);
75 if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76 PyErr_Clear();
77 }
78 Py_XDECREF(s);
79 return 0;
80}
81
82
83PyObject* struct_unpack_method = NULL;
84PyObject* struct_error = NULL;
85/* Called by LLVMFuzzerTestOneInput for initialization */
86static int init_struct_unpack(void) {
87 /* Import struct.unpack */
88 PyObject* struct_module = PyImport_ImportModule("struct");
89 if (struct_module == NULL) {
90 return 0;
91 }
92 struct_error = PyObject_GetAttrString(struct_module, "error");
93 if (struct_error == NULL) {
94 return 0;
95 }
96 struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack");
97 return struct_unpack_method != NULL;
98}
99/* Fuzz struct.unpack(x, y) */
100static int fuzz_struct_unpack(const char* data, size_t size) {
101 /* Everything up to the first null byte is considered the
102 format. Everything after is the buffer */
103 const char* first_null = memchr(data, '\0', size);
104 if (first_null == NULL) {
105 return 0;
106 }
107
108 size_t format_length = first_null - data;
109 size_t buffer_length = size - format_length - 1;
110
111 PyObject* pattern = PyBytes_FromStringAndSize(data, format_length);
112 if (pattern == NULL) {
113 return 0;
114 }
115 PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length);
116 if (buffer == NULL) {
117 Py_DECREF(pattern);
118 return 0;
119 }
120
121 PyObject* unpacked = PyObject_CallFunctionObjArgs(
122 struct_unpack_method, pattern, buffer, NULL);
123 /* Ignore any overflow errors, these are easily triggered accidentally */
124 if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) {
125 PyErr_Clear();
126 }
127 /* The pascal format string will throw a negative size when passing 0
128 like: struct.unpack('0p', b'') */
129 if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) {
130 PyErr_Clear();
131 }
132 /* Ignore any struct.error exceptions, these can be caused by invalid
133 formats or incomplete buffers both of which are common. */
134 if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {
135 PyErr_Clear();
136 }
137
138 Py_XDECREF(unpacked);
139 Py_DECREF(pattern);
140 Py_DECREF(buffer);
141 return 0;
142}
143
144
145#define MAX_JSON_TEST_SIZE 0x10000
146
147PyObject* json_loads_method = NULL;
148/* Called by LLVMFuzzerTestOneInput for initialization */
149static int init_json_loads(void) {
150 /* Import json.loads */
151 PyObject* json_module = PyImport_ImportModule("json");
152 if (json_module == NULL) {
153 return 0;
154 }
155 json_loads_method = PyObject_GetAttrString(json_module, "loads");
156 return json_loads_method != NULL;
157}
158/* Fuzz json.loads(x) */
159static int fuzz_json_loads(const char* data, size_t size) {
160 /* Since python supports arbitrarily large ints in JSON,
161 long inputs can lead to timeouts on boring inputs like
162 `json.loads("9" * 100000)` */
163 if (size > MAX_JSON_TEST_SIZE) {
164 return 0;
165 }
166 PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
167 if (input_bytes == NULL) {
168 return 0;
169 }
170 PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes);
171 if (parsed == NULL) {
172 /* Ignore ValueError as the fuzzer will more than likely
173 generate some invalid json and values */
174 if (PyErr_ExceptionMatches(PyExc_ValueError) ||
175 /* Ignore RecursionError as the fuzzer generates long sequences of
176 arrays such as `[[[...` */
177 PyErr_ExceptionMatches(PyExc_RecursionError) ||
178 /* Ignore unicode errors, invalid byte sequences are common */
179 PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
180 ) {
181 PyErr_Clear();
182 }
183 }
184 Py_DECREF(input_bytes);
185 Py_XDECREF(parsed);
186 return 0;
187}
188
189#define MAX_RE_TEST_SIZE 0x10000
190
191PyObject* sre_compile_method = NULL;
192PyObject* sre_error_exception = NULL;
193int SRE_FLAG_DEBUG = 0;
194/* Called by LLVMFuzzerTestOneInput for initialization */
195static int init_sre_compile(void) {
196 /* Import sre_compile.compile and sre.error */
197 PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
198 if (sre_compile_module == NULL) {
199 return 0;
200 }
201 sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
202 if (sre_compile_method == NULL) {
203 return 0;
204 }
205
206 PyObject* sre_constants = PyImport_ImportModule("sre_constants");
207 if (sre_constants == NULL) {
208 return 0;
209 }
210 sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
211 if (sre_error_exception == NULL) {
212 return 0;
213 }
214 PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
215 if (debug_flag == NULL) {
216 return 0;
217 }
218 SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
219 return 1;
220}
221/* Fuzz _sre.compile(x) */
222static int fuzz_sre_compile(const char* data, size_t size) {
223 /* Ignore really long regex patterns that will timeout the fuzzer */
224 if (size > MAX_RE_TEST_SIZE) {
225 return 0;
226 }
227 /* We treat the first 2 bytes of the input as a number for the flags */
228 if (size < 2) {
229 return 0;
230 }
231 uint16_t flags = ((uint16_t*) data)[0];
232 /* We remove the SRE_FLAG_DEBUG if present. This is because it
233 prints to stdout which greatly decreases fuzzing speed */
234 flags &= ~SRE_FLAG_DEBUG;
235
236 /* Pull the pattern from the remaining bytes */
237 PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + 2, size - 2);
238 if (pattern_bytes == NULL) {
239 return 0;
240 }
241 PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
242 if (flags_obj == NULL) {
243 Py_DECREF(pattern_bytes);
244 return 0;
245 }
246
247 /* compiled = _sre.compile(data[2:], data[0:2] */
248 PyObject* compiled = PyObject_CallFunctionObjArgs(
249 sre_compile_method, pattern_bytes, flags_obj, NULL);
250 /* Ignore ValueError as the fuzzer will more than likely
251 generate some invalid combination of flags */
252 if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
253 PyErr_Clear();
254 }
255 /* Ignore some common errors thrown by sre_parse:
256 Overflow, Assertion, Recursion and Index */
257 if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
258 PyErr_ExceptionMatches(PyExc_AssertionError) ||
259 PyErr_ExceptionMatches(PyExc_RecursionError) ||
260 PyErr_ExceptionMatches(PyExc_IndexError))
261 ) {
262 PyErr_Clear();
263 }
264 /* Ignore re.error */
265 if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) {
266 PyErr_Clear();
267 }
268
269 Py_DECREF(pattern_bytes);
270 Py_DECREF(flags_obj);
271 Py_XDECREF(compiled);
272 return 0;
273}
274
275/* Some random patterns used to test re.match.
276 Be careful not to add catostraphically slow regexes here, we want to
277 exercise the matching code without causing timeouts.*/
278static const char* regex_patterns[] = {
279 ".", "^", "abc", "abc|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
280 "abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
281 "{}", "a{,}", "{", "}", "^\\(*\\d{3}\\)*( |-)*\\d{3}( |-)*\\d{4}$",
282 "(?:a*)*", "a{1,2}?"
283};
284const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[0]);
285PyObject** compiled_patterns = NULL;
286/* Called by LLVMFuzzerTestOneInput for initialization */
287static int init_sre_match(void) {
288 PyObject* re_module = PyImport_ImportModule("re");
289 if (re_module == NULL) {
290 return 0;
291 }
292 compiled_patterns = (PyObject**) PyMem_RawMalloc(
293 sizeof(PyObject*) * NUM_PATTERNS);
294 if (compiled_patterns == NULL) {
295 PyErr_NoMemory();
296 return 0;
297 }
298
299 /* Precompile all the regex patterns on the first run for faster fuzzing */
300 for (size_t i = 0; i < NUM_PATTERNS; i++) {
301 PyObject* compiled = PyObject_CallMethod(
302 re_module, "compile", "y", regex_patterns[i]);
303 /* Bail if any of the patterns fail to compile */
304 if (compiled == NULL) {
305 return 0;
306 }
307 compiled_patterns[i] = compiled;
308 }
309 return 1;
310}
311/* Fuzz re.match(x) */
312static int fuzz_sre_match(const char* data, size_t size) {
313 if (size < 1 || size > MAX_RE_TEST_SIZE) {
314 return 0;
315 }
316 /* Use the first byte as a uint8_t specifying the index of the
317 regex to use */
318 unsigned char idx = (unsigned char) data[0];
319 idx = idx % NUM_PATTERNS;
320
321 /* Pull the string to match from the remaining bytes */
322 PyObject* to_match = PyBytes_FromStringAndSize(data + 1, size - 1);
323 if (to_match == NULL) {
324 return 0;
325 }
326
327 PyObject* pattern = compiled_patterns[idx];
328 PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
329
330 PyObject* matches = PyObject_CallOneArg(match_callable, to_match);
331
332 Py_XDECREF(matches);
333 Py_DECREF(match_callable);
334 Py_DECREF(to_match);
335 return 0;
336}
337
338#define MAX_CSV_TEST_SIZE 0x10000
339PyObject* csv_module = NULL;
340PyObject* csv_error = NULL;
341/* Called by LLVMFuzzerTestOneInput for initialization */
342static int init_csv_reader(void) {
343 /* Import csv and csv.Error */
344 csv_module = PyImport_ImportModule("csv");
345 if (csv_module == NULL) {
346 return 0;
347 }
348 csv_error = PyObject_GetAttrString(csv_module, "Error");
349 return csv_error != NULL;
350}
351/* Fuzz csv.reader([x]) */
352static int fuzz_csv_reader(const char* data, size_t size) {
353 if (size < 1 || size > MAX_CSV_TEST_SIZE) {
354 return 0;
355 }
356 /* Ignore non null-terminated strings since _csv can't handle
357 embedded nulls */
358 if (memchr(data, '\0', size) == NULL) {
359 return 0;
360 }
361
362 PyObject* s = PyUnicode_FromString(data);
363 /* Ignore exceptions until we have a valid string */
364 if (s == NULL) {
365 PyErr_Clear();
366 return 0;
367 }
368
369 /* Split on \n so we can test multiple lines */
370 PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
371 if (lines == NULL) {
372 Py_DECREF(s);
373 return 0;
374 }
375
376 PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
377 if (reader) {
378 /* Consume all of the reader as an iterator */
379 PyObject* parsed_line;
380 while ((parsed_line = PyIter_Next(reader))) {
381 Py_DECREF(parsed_line);
382 }
383 }
384
385 /* Ignore csv.Error because we're probably going to generate
386 some bad files (embedded new-lines, unterminated quotes etc) */
387 if (PyErr_ExceptionMatches(csv_error)) {
388 PyErr_Clear();
389 }
390
391 Py_XDECREF(reader);
392 Py_DECREF(s);
393 return 0;
394}
395
/* Invoke one fuzz target on the input and abort the process if it returns
   with a Python exception still pending.

   The target's return value is passed through unchanged. */
static int _run_fuzz(const uint8_t *data, size_t size, int(*fuzzer)(const char* , size_t)) {
    const char* bytes = (const char*) data;
    int result = fuzzer(bytes, size);

    /* Targets are expected to clear the errors they anticipate themselves;
       anything still pending here is treated as a failure. */
    if (PyErr_Occurred() != NULL) {
        PyErr_Print();
        abort();
    }

    /* Someday the return value might mean something, propagate it. */
    return result;
}
408
/* CPython generates a lot of leak warnings for whatever reason, so this
   LeakSanitizer hook always reports leak checking as turned off. */
int __lsan_is_turned_off(void) {
    return 1;
}
411
412
/* Fuzzer start-up hook: registers the first command-line argument as the
   Python program name.  Always returns 0.

   The decoded wide string is intentionally never freed here, since it is
   handed to Py_SetProgramName. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
    (void) argc;  /* unused */

    char* program_path = (*argv)[0];
    wchar_t* wide_program_name = Py_DecodeLocale(program_path, NULL);
    Py_SetProgramName(wide_program_name);
    return 0;
}
418
419/* Fuzz test interface.
420 This returns the bitwise or of all fuzz test's return values.
421
422 All fuzz tests must return 0, as all nonzero return codes are reserved for
423 future use -- we propagate the return values for that future case.
424 (And we bitwise or when running multiple tests to verify that normally we
425 only return 0.) */
426int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
427 if (!Py_IsInitialized()) {
428 /* LLVMFuzzerTestOneInput is called repeatedly from the same process,
429 with no separate initialization phase, sadly, so we need to
430 initialize CPython ourselves on the first run. */
431 Py_InitializeEx(0);
432 }
433
434 int rv = 0;
435
436#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_float)
437 rv |= _run_fuzz(data, size, fuzz_builtin_float);
438#endif
439#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_int)
440 rv |= _run_fuzz(data, size, fuzz_builtin_int);
441#endif
442#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
443 rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
444#endif
445#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack)
446 static int STRUCT_UNPACK_INITIALIZED = 0;
447 if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) {
448 PyErr_Print();
449 abort();
450 } else {
451 STRUCT_UNPACK_INITIALIZED = 1;
452 }
453 rv |= _run_fuzz(data, size, fuzz_struct_unpack);
454#endif
455#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
456 static int JSON_LOADS_INITIALIZED = 0;
457 if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
458 PyErr_Print();
459 abort();
460 } else {
461 JSON_LOADS_INITIALIZED = 1;
462 }
463
464 rv |= _run_fuzz(data, size, fuzz_json_loads);
465#endif
466#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_compile)
467 static int SRE_COMPILE_INITIALIZED = 0;
468 if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
469 PyErr_Print();
470 abort();
471 } else {
472 SRE_COMPILE_INITIALIZED = 1;
473 }
474
475 rv |= _run_fuzz(data, size, fuzz_sre_compile);
476#endif
477#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_sre_match)
478 static int SRE_MATCH_INITIALIZED = 0;
479 if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
480 PyErr_Print();
481 abort();
482 } else {
483 SRE_MATCH_INITIALIZED = 1;
484 }
485
486 rv |= _run_fuzz(data, size, fuzz_sre_match);
487#endif
488#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_csv_reader)
489 static int CSV_READER_INITIALIZED = 0;
490 if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
491 PyErr_Print();
492 abort();
493 } else {
494 CSV_READER_INITIALIZED = 1;
495 }
496
497 rv |= _run_fuzz(data, size, fuzz_csv_reader);
498#endif
499 return rv;
500}
501