fuzzer.c source code [python/Modules/_xxtestfuzz/fuzzer.c]

/* A fuzz test for CPython.

  The only exposed function is LLVMFuzzerTestOneInput, which is called by
  fuzzers and by the _fuzz module for smoke tests.

  To build exactly one fuzz test, as when running in oss-fuzz etc.,
  build with -D _Py_FUZZ_ONE and -D _Py_FUZZ_<test_name>. e.g. to build
  LLVMFuzzerTestOneInput to only run "fuzz_builtin_float", build this file with
      -D _Py_FUZZ_ONE -D _Py_FUZZ_fuzz_builtin_float.

  See the source code for LLVMFuzzerTestOneInput for details. */
12
13	#include <Python.h>
14	#include <stdlib.h>
15	#include <inttypes.h>
16
17	/ Fuzz PyFloat_FromString as a proxy for float(str). /
18	static int fuzz_builtin_float(const char* data, size_t size) {
19	PyObject* s = PyBytes_FromStringAndSize(data, size);
20	if (s == NULL) return `0`;
21	PyObject* f = PyFloat_FromString(s);
22	if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_ValueError)) {
23	PyErr_Clear();
24	}
25
26	Py_XDECREF(f);
27	Py_DECREF(s);
28	return `0`;
29	}
30
31	#define MAX_INT_TEST_SIZE 0x10000
32
33	/ Fuzz PyLong_FromUnicodeObject as a proxy for int(str). /
34	static int fuzz_builtin_int(const char* data, size_t size) {
35	/ Ignore test cases with very long ints to avoid timeouts*
36	int("9" 1000000) is not a very interesting test caase /
37	if (size > MAX_INT_TEST_SIZE) {
38	return `0`;
39	}
40	/ Pick a random valid base. (When the fuzzed function takes extra*
41	parameters, it's somewhat normal to hash the input to generate those
42	parameters. We want to exercise all code paths, so we do so here.) /*
43	int base = _Py_HashBytes(data, size) % `37`;
44	if (base == `1`) {
45	// 1 is the only number between 0 and 36 that is not a valid base.
46	base = `0`;
47	}
48	if (base == -`1`) {
49	return `0`; // An error occurred, bail early.
50	}
51	if (base < `0`) {
52	base = -base;
53	}
54
55	PyObject* s = PyUnicode_FromStringAndSize(data, size);
56	if (s == NULL) {
57	if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
58	PyErr_Clear();
59	}
60	return `0`;
61	}
62	PyObject* l = PyLong_FromUnicodeObject(s, base);
63	if (l == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
64	PyErr_Clear();
65	}
66	PyErr_Clear();
67	Py_XDECREF(l);
68	Py_DECREF(s);
69	return `0`;
70	}
71
72	/ Fuzz PyUnicode_FromStringAndSize as a proxy for unicode(str). /
73	static int fuzz_builtin_unicode(const char* data, size_t size) {
74	PyObject* s = PyUnicode_FromStringAndSize(data, size);
75	if (s == NULL && PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
76	PyErr_Clear();
77	}
78	Py_XDECREF(s);
79	return `0`;
80	}
81
82
83	PyObject* struct_unpack_method = NULL;
84	PyObject* struct_error = NULL;
85	/ Called by LLVMFuzzerTestOneInput for initialization /
86	static int init_struct_unpack(void) {
87	/ Import struct.unpack /
88	PyObject* struct_module = PyImport_ImportModule("struct");
89	if (struct_module == NULL) {
90	return `0`;
91	}
92	struct_error = PyObject_GetAttrString(struct_module, "error");
93	if (struct_error == NULL) {
94	return `0`;
95	}
96	struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack");
97	return struct_unpack_method != NULL;
98	}
99	/ Fuzz struct.unpack(x, y) /
100	static int fuzz_struct_unpack(const char* data, size_t size) {
101	/ Everything up to the first null byte is considered the*
102	format. Everything after is the buffer /*
103	const char* first_null = memchr(data, `'\0'`, size);
104	if (first_null == NULL) {
105	return `0`;
106	}
107
108	size_t format_length = first_null - data;
109	size_t buffer_length = size - format_length - `1`;
110
111	PyObject* pattern = PyBytes_FromStringAndSize(data, format_length);
112	if (pattern == NULL) {
113	return `0`;
114	}
115	PyObject* buffer = PyBytes_FromStringAndSize(first_null + `1`, buffer_length);
116	if (buffer == NULL) {
117	Py_DECREF(pattern);
118	return `0`;
119	}
120
121	PyObject* unpacked = PyObject_CallFunctionObjArgs(
122	struct_unpack_method, pattern, buffer, NULL);
123	/ Ignore any overflow errors, these are easily triggered accidentally /
124	if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) {
125	PyErr_Clear();
126	}
127	/ The pascal format string will throw a negative size when passing 0*
128	like: struct.unpack('0p', b'') /*
129	if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) {
130	PyErr_Clear();
131	}
132	/ Ignore any struct.error exceptions, these can be caused by invalid*
133	formats or incomplete buffers both of which are common. /*
134	if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {
135	PyErr_Clear();
136	}
137
138	Py_XDECREF(unpacked);
139	Py_DECREF(pattern);
140	Py_DECREF(buffer);
141	return `0`;
142	}
143
144
145	#define MAX_JSON_TEST_SIZE 0x10000
146
147	PyObject* json_loads_method = NULL;
148	/ Called by LLVMFuzzerTestOneInput for initialization /
149	static int init_json_loads(void) {
150	/ Import json.loads /
151	PyObject* json_module = PyImport_ImportModule("json");
152	if (json_module == NULL) {
153	return `0`;
154	}
155	json_loads_method = PyObject_GetAttrString(json_module, "loads");
156	return json_loads_method != NULL;
157	}
158	/ Fuzz json.loads(x) /
159	static int fuzz_json_loads(const char* data, size_t size) {
160	/ Since python supports arbitrarily large ints in JSON,*
161	long inputs can lead to timeouts on boring inputs like
162	`json.loads("9" 100000)` /
163	if (size > MAX_JSON_TEST_SIZE) {
164	return `0`;
165	}
166	PyObject* input_bytes = PyBytes_FromStringAndSize(data, size);
167	if (input_bytes == NULL) {
168	return `0`;
169	}
170	PyObject* parsed = PyObject_CallOneArg(json_loads_method, input_bytes);
171	if (parsed == NULL) {
172	/ Ignore ValueError as the fuzzer will more than likely*
173	generate some invalid json and values /*
174	if (PyErr_ExceptionMatches(PyExc_ValueError) \|\|
175	/ Ignore RecursionError as the fuzzer generates long sequences of*
176	arrays such as `[[[...` /*
177	PyErr_ExceptionMatches(PyExc_RecursionError) \|\|
178	/ Ignore unicode errors, invalid byte sequences are common /
179	PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)
180	) {
181	PyErr_Clear();
182	}
183	}
184	Py_DECREF(input_bytes);
185	Py_XDECREF(parsed);
186	return `0`;
187	}
188
189	#define MAX_RE_TEST_SIZE 0x10000
190
191	PyObject* sre_compile_method = NULL;
192	PyObject* sre_error_exception = NULL;
193	int SRE_FLAG_DEBUG = `0`;
194	/ Called by LLVMFuzzerTestOneInput for initialization /
195	static int init_sre_compile(void) {
196	/ Import sre_compile.compile and sre.error /
197	PyObject* sre_compile_module = PyImport_ImportModule("sre_compile");
198	if (sre_compile_module == NULL) {
199	return `0`;
200	}
201	sre_compile_method = PyObject_GetAttrString(sre_compile_module, "compile");
202	if (sre_compile_method == NULL) {
203	return `0`;
204	}
205
206	PyObject* sre_constants = PyImport_ImportModule("sre_constants");
207	if (sre_constants == NULL) {
208	return `0`;
209	}
210	sre_error_exception = PyObject_GetAttrString(sre_constants, "error");
211	if (sre_error_exception == NULL) {
212	return `0`;
213	}
214	PyObject* debug_flag = PyObject_GetAttrString(sre_constants, "SRE_FLAG_DEBUG");
215	if (debug_flag == NULL) {
216	return `0`;
217	}
218	SRE_FLAG_DEBUG = PyLong_AsLong(debug_flag);
219	return `1`;
220	}
221	/ Fuzz _sre.compile(x) /
222	static int fuzz_sre_compile(const char* data, size_t size) {
223	/ Ignore really long regex patterns that will timeout the fuzzer /
224	if (size > MAX_RE_TEST_SIZE) {
225	return `0`;
226	}
227	/ We treat the first 2 bytes of the input as a number for the flags /
228	if (size < `2`) {
229	return `0`;
230	}
231	uint16_t flags = ((uint16_t*) data)[`0`];
232	/ We remove the SRE_FLAG_DEBUG if present. This is because it*
233	prints to stdout which greatly decreases fuzzing speed /*
234	flags &= ~SRE_FLAG_DEBUG;
235
236	/ Pull the pattern from the remaining bytes /
237	PyObject* pattern_bytes = PyBytes_FromStringAndSize(data + `2`, size - `2`);
238	if (pattern_bytes == NULL) {
239	return `0`;
240	}
241	PyObject* flags_obj = PyLong_FromUnsignedLong(flags);
242	if (flags_obj == NULL) {
243	Py_DECREF(pattern_bytes);
244	return `0`;
245	}
246
247	/ compiled = _sre.compile(data[2:], data[0:2] /
248	PyObject* compiled = PyObject_CallFunctionObjArgs(
249	sre_compile_method, pattern_bytes, flags_obj, NULL);
250	/ Ignore ValueError as the fuzzer will more than likely*
251	generate some invalid combination of flags /*
252	if (compiled == NULL && PyErr_ExceptionMatches(PyExc_ValueError)) {
253	PyErr_Clear();
254	}
255	/ Ignore some common errors thrown by sre_parse:*
256	Overflow, Assertion, Recursion and Index /*
257	if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) \|\|
258	PyErr_ExceptionMatches(PyExc_AssertionError) \|\|
259	PyErr_ExceptionMatches(PyExc_RecursionError) \|\|
260	PyErr_ExceptionMatches(PyExc_IndexError))
261	) {
262	PyErr_Clear();
263	}
264	/ Ignore re.error /
265	if (compiled == NULL && PyErr_ExceptionMatches(sre_error_exception)) {
266	PyErr_Clear();
267	}
268
269	Py_DECREF(pattern_bytes);
270	Py_DECREF(flags_obj);
271	Py_XDECREF(compiled);
272	return `0`;
273	}
274
275	/ Some random patterns used to test re.match.*
276	Be careful not to add catostraphically slow regexes here, we want to
277	exercise the matching code without causing timeouts./*
278	static const char* regex_patterns[] = {
279	".", "^", "abc", "abc\|def", "^xxx$", "\\b", "()", "[a-zA-Z0-9]",
280	"abc+", "[^A-Z]", "[x]", "(?=)", "a{z}", "a+b", "a*?", "a??", "a+?",
281	"{}", "a{,}", "{", "}", "^\$\\d{3}\$( \|-)\\d{3}( \|-)\\d{4}$",
282	"(?:a)", "a{1,2}?"
283	};
284	const size_t NUM_PATTERNS = sizeof(regex_patterns) / sizeof(regex_patterns[`0`]);
285	PyObject** compiled_patterns = NULL;
286	/ Called by LLVMFuzzerTestOneInput for initialization /
287	static int init_sre_match(void) {
288	PyObject* re_module = PyImport_ImportModule("re");
289	if (re_module == NULL) {
290	return `0`;
291	}
292	compiled_patterns = (PyObject**) PyMem_RawMalloc(
293	sizeof(PyObject) NUM_PATTERNS);
294	if (compiled_patterns == NULL) {
295	PyErr_NoMemory();
296	return `0`;
297	}
298
299	/ Precompile all the regex patterns on the first run for faster fuzzing /
300	for (size_t i = `0`; i < NUM_PATTERNS; i++) {
301	PyObject* compiled = PyObject_CallMethod(
302	re_module, "compile", "y", regex_patterns[i]);
303	/ Bail if any of the patterns fail to compile /
304	if (compiled == NULL) {
305	return `0`;
306	}
307	compiled_patterns[i] = compiled;
308	}
309	return `1`;
310	}
311	/ Fuzz re.match(x) /
312	static int fuzz_sre_match(const char* data, size_t size) {
313	if (size < `1` \|\| size > MAX_RE_TEST_SIZE) {
314	return `0`;
315	}
316	/ Use the first byte as a uint8_t specifying the index of the*
317	regex to use /*
318	unsigned char idx = (unsigned char) data[`0`];
319	idx = idx % NUM_PATTERNS;
320
321	/ Pull the string to match from the remaining bytes /
322	PyObject* to_match = PyBytes_FromStringAndSize(data + `1`, size - `1`);
323	if (to_match == NULL) {
324	return `0`;
325	}
326
327	PyObject* pattern = compiled_patterns[idx];
328	PyObject* match_callable = PyObject_GetAttrString(pattern, "match");
329
330	PyObject* matches = PyObject_CallOneArg(match_callable, to_match);
331
332	Py_XDECREF(matches);
333	Py_DECREF(match_callable);
334	Py_DECREF(to_match);
335	return `0`;
336	}
337
338	#define MAX_CSV_TEST_SIZE 0x10000
339	PyObject* csv_module = NULL;
340	PyObject* csv_error = NULL;
341	/ Called by LLVMFuzzerTestOneInput for initialization /
342	static int init_csv_reader(void) {
343	/ Import csv and csv.Error /
344	csv_module = PyImport_ImportModule("csv");
345	if (csv_module == NULL) {
346	return `0`;
347	}
348	csv_error = PyObject_GetAttrString(csv_module, "Error");
349	return csv_error != NULL;
350	}
351	/ Fuzz csv.reader([x]) /
352	static int fuzz_csv_reader(const char* data, size_t size) {
353	if (size < `1` \|\| size > MAX_CSV_TEST_SIZE) {
354	return `0`;
355	}
356	/ Ignore non null-terminated strings since _csv can't handle*
357	embedded nulls /*
358	if (memchr(data, `'\0'`, size) == NULL) {
359	return `0`;
360	}
361
362	PyObject* s = PyUnicode_FromString(data);
363	/ Ignore exceptions until we have a valid string /
364	if (s == NULL) {
365	PyErr_Clear();
366	return `0`;
367	}
368
369	/ Split on \n so we can test multiple lines /
370	PyObject* lines = PyObject_CallMethod(s, "split", "s", "\n");
371	if (lines == NULL) {
372	Py_DECREF(s);
373	return `0`;
374	}
375
376	PyObject* reader = PyObject_CallMethod(csv_module, "reader", "N", lines);
377	if (reader) {
378	/ Consume all of the reader as an iterator /
379	PyObject* parsed_line;
380	while ((parsed_line = PyIter_Next(reader))) {
381	Py_DECREF(parsed_line);
382	}
383	}
384
385	/ Ignore csv.Error because we're probably going to generate*
386	some bad files (embedded new-lines, unterminated quotes etc) /*
387	if (PyErr_ExceptionMatches(csv_error)) {
388	PyErr_Clear();
389	}
390
391	Py_XDECREF(reader);
392	Py_DECREF(s);
393	return `0`;
394	}
395
/* Run fuzzer and abort on failure.

   Calls `fuzzer` on the input reinterpreted as a char buffer. If an
   exception is still pending when it returns, the exception is printed and
   the process is aborted; otherwise the fuzzer's return value is passed
   through. */
static int _run_fuzz(const uint8_t *data, size_t size, int (*fuzzer)(const char* , size_t)) {
    const int status = (*fuzzer)((const char *) data, size);

    if (PyErr_Occurred() == NULL) {
        /* Someday the return value might mean something, propagate it. */
        return status;
    }

    /* Fuzz tests should handle expected errors for themselves.
       This is last-ditch check in case they didn't. */
    PyErr_Print();
    abort();
}
408
/* CPython generates a lot of leak warnings for whatever reason.
   Unconditionally returns 1. */
int __lsan_is_turned_off(void)
{
    return 1;
}
411
412
/* Sets the interpreter's program name from the first entry of the argument
   vector that `argv` points to. `argc` is not used. Always returns 0.

   The decoded string is intentionally not freed here: it is handed to
   Py_SetProgramName and nothing in this function releases it. */
int LLVMFuzzerInitialize(int *argc, char ***argv) {
    char *program_path = (*argv)[0];
    wchar_t *wide_program_name = Py_DecodeLocale(program_path, NULL);

    Py_SetProgramName(wide_program_name);
    return 0;
}
418
419	/ Fuzz test interface.*
420	This returns the bitwise or of all fuzz test's return values.
421
422	All fuzz tests must return 0, as all nonzero return codes are reserved for
423	future use -- we propagate the return values for that future case.
424	(And we bitwise or when running multiple tests to verify that normally we
425	only return 0.) /*
426	int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
427	if (!Py_IsInitialized()) {
428	/ LLVMFuzzerTestOneInput is called repeatedly from the same process,*
429	with no separate initialization phase, sadly, so we need to
430	initialize CPython ourselves on the first run. /*
431	Py_InitializeEx(`0`);
432	}
433
434	int rv = `0`;
435
436	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_builtin_float)
437	rv \|= _run_fuzz(data, size, fuzz_builtin_float);
438	#endif
439	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_builtin_int)
440	rv \|= _run_fuzz(data, size, fuzz_builtin_int);
441	#endif
442	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_builtin_unicode)
443	rv \|= _run_fuzz(data, size, fuzz_builtin_unicode);
444	#endif
445	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_struct_unpack)
446	static int STRUCT_UNPACK_INITIALIZED = `0`;
447	if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) {
448	PyErr_Print();
449	abort();
450	} else {
451	STRUCT_UNPACK_INITIALIZED = `1`;
452	}
453	rv \|= _run_fuzz(data, size, fuzz_struct_unpack);
454	#endif
455	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_json_loads)
456	static int JSON_LOADS_INITIALIZED = `0`;
457	if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
458	PyErr_Print();
459	abort();
460	} else {
461	JSON_LOADS_INITIALIZED = `1`;
462	}
463
464	rv \|= _run_fuzz(data, size, fuzz_json_loads);
465	#endif
466	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_sre_compile)
467	static int SRE_COMPILE_INITIALIZED = `0`;
468	if (!SRE_COMPILE_INITIALIZED && !init_sre_compile()) {
469	PyErr_Print();
470	abort();
471	} else {
472	SRE_COMPILE_INITIALIZED = `1`;
473	}
474
475	rv \|= _run_fuzz(data, size, fuzz_sre_compile);
476	#endif
477	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_sre_match)
478	static int SRE_MATCH_INITIALIZED = `0`;
479	if (!SRE_MATCH_INITIALIZED && !init_sre_match()) {
480	PyErr_Print();
481	abort();
482	} else {
483	SRE_MATCH_INITIALIZED = `1`;
484	}
485
486	rv \|= _run_fuzz(data, size, fuzz_sre_match);
487	#endif
488	#if !defined(_Py_FUZZ_ONE) \|\| defined(_Py_FUZZ_fuzz_csv_reader)
489	static int CSV_READER_INITIALIZED = `0`;
490	if (!CSV_READER_INITIALIZED && !init_csv_reader()) {
491	PyErr_Print();
492	abort();
493	} else {
494	CSV_READER_INITIALIZED = `1`;
495	}
496
497	rv \|= _run_fuzz(data, size, fuzz_csv_reader);
498	#endif
499	return rv;
500	}
501

Browse the source code of python/Modules/_xxtestfuzz/fuzzer.c