1 | /* csv module */ |
2 | |
3 | /* |
4 | |
5 | This module provides the low-level underpinnings of a CSV reading/writing |
6 | module. Users should not use this module directly, but import the csv.py |
7 | module instead. |
8 | |
9 | */ |
10 | |
11 | #define MODULE_VERSION "1.0" |
12 | |
13 | #include "Python.h" |
14 | #include "structmember.h" // PyMemberDef |
15 | #include <stdbool.h> |
16 | |
17 | |
/* Per-module state of the _csv extension; retrieved with
 * PyModule_GetState() and released by _csv_clear(). */
typedef struct {
    PyObject *error_obj;   /* CSV exception */
    PyObject *dialects;   /* Dialect registry (looked up as a dict keyed by name) */
    PyTypeObject *dialect_type;   /* type used to build/recognise Dialect objects */
    PyTypeObject *reader_type;   /* type instantiated by csv_reader() */
    PyTypeObject *writer_type;   /* writer type; its use is outside this part of the file */
    long field_limit;          /* max parsed field size */
} _csvstate;
26 | |
27 | static struct PyModuleDef _csvmodule; |
28 | |
29 | static inline _csvstate* |
30 | get_csv_state(PyObject *module) |
31 | { |
32 | void *state = PyModule_GetState(module); |
33 | assert(state != NULL); |
34 | return (_csvstate *)state; |
35 | } |
36 | |
37 | static int |
38 | _csv_clear(PyObject *module) |
39 | { |
40 | _csvstate *module_state = PyModule_GetState(module); |
41 | Py_CLEAR(module_state->error_obj); |
42 | Py_CLEAR(module_state->dialects); |
43 | Py_CLEAR(module_state->dialect_type); |
44 | Py_CLEAR(module_state->reader_type); |
45 | Py_CLEAR(module_state->writer_type); |
46 | return 0; |
47 | } |
48 | |
49 | static int |
50 | _csv_traverse(PyObject *module, visitproc visit, void *arg) |
51 | { |
52 | _csvstate *module_state = PyModule_GetState(module); |
53 | Py_VISIT(module_state->error_obj); |
54 | Py_VISIT(module_state->dialects); |
55 | Py_VISIT(module_state->dialect_type); |
56 | Py_VISIT(module_state->reader_type); |
57 | Py_VISIT(module_state->writer_type); |
58 | return 0; |
59 | } |
60 | |
61 | static void |
62 | _csv_free(void *module) |
63 | { |
64 | _csv_clear((PyObject *)module); |
65 | } |
66 | |
/* States of the reader's character-driven parser (see parse_process_char). */
typedef enum {
    START_RECORD,   /* at the start of a record */
    START_FIELD,   /* expecting the start of a field */
    ESCAPED_CHAR,   /* escapechar seen outside a quoted field */
    IN_FIELD,   /* inside an unquoted field */
    IN_QUOTED_FIELD,   /* inside a quoted field */
    ESCAPE_IN_QUOTED_FIELD,   /* escapechar seen inside a quoted field */
    QUOTE_IN_QUOTED_FIELD,   /* quotechar seen inside a quoted field */
    EAT_CRNL,   /* line ending seen; only '\n', '\r' or end of line may follow */
    AFTER_ESCAPED_CRNL   /* an escaped '\n' or '\r' was just stored */
} ParserState;
72 | |
/* Quoting styles accepted for a dialect's "quoting" option
 * (validated by dialect_check_quoting). */
typedef enum {
    QUOTE_MINIMAL,   /* default chosen by dialect_new() when "quoting" is not given */
    QUOTE_ALL,
    QUOTE_NONNUMERIC,   /* reader converts unquoted fields with PyNumber_Float() */
    QUOTE_NONE   /* reader does not treat quotechar specially */
} QuoteStyle;
76 | |
/* Pairs a QuoteStyle value with its symbolic name. */
typedef struct {
    QuoteStyle style;   /* enum value */
    const char *name;   /* name string; NULL marks the end of a table */
} StyleDesc;
81 | |
82 | static const StyleDesc quote_styles[] = { |
83 | { QUOTE_MINIMAL, "QUOTE_MINIMAL" }, |
84 | { QUOTE_ALL, "QUOTE_ALL" }, |
85 | { QUOTE_NONNUMERIC, "QUOTE_NONNUMERIC" }, |
86 | { QUOTE_NONE, "QUOTE_NONE" }, |
87 | { 0 } |
88 | }; |
89 | |
/* Instance layout of the Dialect type: the parsing/formatting options
 * converted to C values by dialect_new(). */
typedef struct {
    PyObject_HEAD

    char doublequote;           /* is " represented by ""? */
    char skipinitialspace;      /* ignore spaces following delimiter? */
    char strict;                /* raise exception on bad CSV */
    int quoting;                /* style of quoting to write */
    Py_UCS4 delimiter;          /* field separator */
    Py_UCS4 quotechar;          /* quote character ('\0' when unset) */
    Py_UCS4 escapechar;         /* escape character ('\0' when unset) */
    PyObject *lineterminator;   /* string to write between records */

} DialectObj;
103 | |
/* Instance layout of the reader type: input iterator, dialect and the
 * parser's working state. */
typedef struct {
    PyObject_HEAD

    PyObject *input_iter;   /* iterate over this for input lines */

    DialectObj *dialect;    /* parsing dialect */

    PyObject *fields;       /* field list for current record */
    ParserState state;      /* current CSV parse state */
    Py_UCS4 *field;         /* temporary buffer (grown by parse_grow_buff) */
    Py_ssize_t field_size;  /* size of allocated buffer, in Py_UCS4 units */
    Py_ssize_t field_len;   /* length of current field */
    int numeric_field;      /* treat field as numeric */
    unsigned long line_num; /* Source-file line number; incremented per line read */
} ReaderObj;
119 | |
/* Instance layout of the writer type: output callable, dialect and the
 * record buffer being assembled. */
typedef struct {
    PyObject_HEAD

    PyObject *write;        /* write output lines to this file */

    DialectObj *dialect;    /* parsing dialect */

    Py_UCS4 *rec;           /* buffer for parser.join */
    Py_ssize_t rec_size;    /* size of allocated record */
    Py_ssize_t rec_len;     /* length of record */
    int num_fields;         /* number of fields in record */

    PyObject *error_obj;    /* cached error object */
} WriterObj;
134 | |
135 | /* |
136 | * DIALECT class |
137 | */ |
138 | |
139 | static PyObject * |
140 | get_dialect_from_registry(PyObject *name_obj, _csvstate *module_state) |
141 | { |
142 | PyObject *dialect_obj; |
143 | |
144 | dialect_obj = PyDict_GetItemWithError(module_state->dialects, name_obj); |
145 | if (dialect_obj == NULL) { |
146 | if (!PyErr_Occurred()) |
147 | PyErr_Format(module_state->error_obj, "unknown dialect" ); |
148 | } |
149 | else |
150 | Py_INCREF(dialect_obj); |
151 | |
152 | return dialect_obj; |
153 | } |
154 | |
155 | static PyObject * |
156 | get_nullchar_as_None(Py_UCS4 c) |
157 | { |
158 | if (c == '\0') { |
159 | Py_RETURN_NONE; |
160 | } |
161 | else |
162 | return PyUnicode_FromOrdinal(c); |
163 | } |
164 | |
165 | static PyObject * |
166 | Dialect_get_lineterminator(DialectObj *self, void *Py_UNUSED(ignored)) |
167 | { |
168 | Py_XINCREF(self->lineterminator); |
169 | return self->lineterminator; |
170 | } |
171 | |
172 | static PyObject * |
173 | Dialect_get_delimiter(DialectObj *self, void *Py_UNUSED(ignored)) |
174 | { |
175 | return get_nullchar_as_None(self->delimiter); |
176 | } |
177 | |
178 | static PyObject * |
179 | Dialect_get_escapechar(DialectObj *self, void *Py_UNUSED(ignored)) |
180 | { |
181 | return get_nullchar_as_None(self->escapechar); |
182 | } |
183 | |
184 | static PyObject * |
185 | Dialect_get_quotechar(DialectObj *self, void *Py_UNUSED(ignored)) |
186 | { |
187 | return get_nullchar_as_None(self->quotechar); |
188 | } |
189 | |
190 | static PyObject * |
191 | Dialect_get_quoting(DialectObj *self, void *Py_UNUSED(ignored)) |
192 | { |
193 | return PyLong_FromLong(self->quoting); |
194 | } |
195 | |
196 | static int |
197 | _set_bool(const char *name, char *target, PyObject *src, bool dflt) |
198 | { |
199 | if (src == NULL) |
200 | *target = dflt; |
201 | else { |
202 | int b = PyObject_IsTrue(src); |
203 | if (b < 0) |
204 | return -1; |
205 | *target = (char)b; |
206 | } |
207 | return 0; |
208 | } |
209 | |
210 | static int |
211 | _set_int(const char *name, int *target, PyObject *src, int dflt) |
212 | { |
213 | if (src == NULL) |
214 | *target = dflt; |
215 | else { |
216 | int value; |
217 | if (!PyLong_CheckExact(src)) { |
218 | PyErr_Format(PyExc_TypeError, |
219 | "\"%s\" must be an integer" , name); |
220 | return -1; |
221 | } |
222 | value = _PyLong_AsInt(src); |
223 | if (value == -1 && PyErr_Occurred()) { |
224 | return -1; |
225 | } |
226 | *target = value; |
227 | } |
228 | return 0; |
229 | } |
230 | |
231 | static int |
232 | _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt) |
233 | { |
234 | if (src == NULL) { |
235 | *target = dflt; |
236 | } |
237 | else { |
238 | *target = '\0'; |
239 | if (src != Py_None) { |
240 | if (!PyUnicode_Check(src)) { |
241 | PyErr_Format(PyExc_TypeError, |
242 | "\"%s\" must be string or None, not %.200s" , name, |
243 | Py_TYPE(src)->tp_name); |
244 | return -1; |
245 | } |
246 | Py_ssize_t len = PyUnicode_GetLength(src); |
247 | if (len < 0) { |
248 | return -1; |
249 | } |
250 | if (len > 1) { |
251 | PyErr_Format(PyExc_TypeError, |
252 | "\"%s\" must be a 1-character string" , |
253 | name); |
254 | return -1; |
255 | } |
256 | /* PyUnicode_READY() is called in PyUnicode_GetLength() */ |
257 | else { |
258 | *target = PyUnicode_READ_CHAR(src, 0); |
259 | } |
260 | } |
261 | } |
262 | return 0; |
263 | } |
264 | |
265 | static int |
266 | _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt) |
267 | { |
268 | if (src == NULL) { |
269 | *target = dflt; |
270 | } |
271 | else { |
272 | *target = '\0'; |
273 | if (!PyUnicode_Check(src)) { |
274 | PyErr_Format(PyExc_TypeError, |
275 | "\"%s\" must be string, not %.200s" , name, |
276 | Py_TYPE(src)->tp_name); |
277 | return -1; |
278 | } |
279 | Py_ssize_t len = PyUnicode_GetLength(src); |
280 | if (len < 0) { |
281 | return -1; |
282 | } |
283 | if (len > 1) { |
284 | PyErr_Format(PyExc_TypeError, |
285 | "\"%s\" must be a 1-character string" , |
286 | name); |
287 | return -1; |
288 | } |
289 | /* PyUnicode_READY() is called in PyUnicode_GetLength() */ |
290 | else { |
291 | *target = PyUnicode_READ_CHAR(src, 0); |
292 | } |
293 | } |
294 | return 0; |
295 | } |
296 | |
297 | static int |
298 | _set_str(const char *name, PyObject **target, PyObject *src, const char *dflt) |
299 | { |
300 | if (src == NULL) |
301 | *target = PyUnicode_DecodeASCII(dflt, strlen(dflt), NULL); |
302 | else { |
303 | if (src == Py_None) |
304 | *target = NULL; |
305 | else if (!PyUnicode_Check(src)) { |
306 | PyErr_Format(PyExc_TypeError, |
307 | "\"%s\" must be a string" , name); |
308 | return -1; |
309 | } |
310 | else { |
311 | if (PyUnicode_READY(src) == -1) |
312 | return -1; |
313 | Py_INCREF(src); |
314 | Py_XSETREF(*target, src); |
315 | } |
316 | } |
317 | return 0; |
318 | } |
319 | |
320 | static int |
321 | dialect_check_quoting(int quoting) |
322 | { |
323 | const StyleDesc *qs; |
324 | |
325 | for (qs = quote_styles; qs->name; qs++) { |
326 | if ((int)qs->style == quoting) |
327 | return 0; |
328 | } |
329 | PyErr_Format(PyExc_TypeError, "bad \"quoting\" value" ); |
330 | return -1; |
331 | } |
332 | |
333 | #define D_OFF(x) offsetof(DialectObj, x) |
334 | |
335 | static struct PyMemberDef Dialect_memberlist[] = { |
336 | { "skipinitialspace" , T_BOOL, D_OFF(skipinitialspace), READONLY }, |
337 | { "doublequote" , T_BOOL, D_OFF(doublequote), READONLY }, |
338 | { "strict" , T_BOOL, D_OFF(strict), READONLY }, |
339 | { NULL } |
340 | }; |
341 | |
342 | static PyGetSetDef Dialect_getsetlist[] = { |
343 | { "delimiter" , (getter)Dialect_get_delimiter}, |
344 | { "escapechar" , (getter)Dialect_get_escapechar}, |
345 | { "lineterminator" , (getter)Dialect_get_lineterminator}, |
346 | { "quotechar" , (getter)Dialect_get_quotechar}, |
347 | { "quoting" , (getter)Dialect_get_quoting}, |
348 | {NULL}, |
349 | }; |
350 | |
351 | static void |
352 | Dialect_dealloc(DialectObj *self) |
353 | { |
354 | PyTypeObject *tp = Py_TYPE(self); |
355 | PyObject_GC_UnTrack(self); |
356 | tp->tp_clear((PyObject *)self); |
357 | PyObject_GC_Del(self); |
358 | Py_DECREF(tp); |
359 | } |
360 | |
/* Keyword names accepted by dialect_new(), NULL-terminated.  The order must
 * match the order of the output pointers passed to
 * PyArg_ParseTupleAndKeywords() there. */
static char *dialect_kws[] = {
    "dialect",
    "delimiter",
    "doublequote",
    "escapechar",
    "lineterminator",
    "quotechar",
    "quoting",
    "skipinitialspace",
    "strict",
    NULL
};
373 | |
374 | static _csvstate * |
375 | _csv_state_from_type(PyTypeObject *type, const char *name) |
376 | { |
377 | PyObject *module = _PyType_GetModuleByDef(type, &_csvmodule); |
378 | if (module == NULL) { |
379 | return NULL; |
380 | } |
381 | _csvstate *module_state = PyModule_GetState(module); |
382 | if (module_state == NULL) { |
383 | PyErr_Format(PyExc_SystemError, |
384 | "%s: No _csv module state found" , name); |
385 | return NULL; |
386 | } |
387 | return module_state; |
388 | } |
389 | |
390 | static PyObject * |
391 | dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) |
392 | { |
393 | DialectObj *self; |
394 | PyObject *ret = NULL; |
395 | PyObject *dialect = NULL; |
396 | PyObject *delimiter = NULL; |
397 | PyObject *doublequote = NULL; |
398 | PyObject *escapechar = NULL; |
399 | PyObject *lineterminator = NULL; |
400 | PyObject *quotechar = NULL; |
401 | PyObject *quoting = NULL; |
402 | PyObject *skipinitialspace = NULL; |
403 | PyObject *strict = NULL; |
404 | |
405 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, |
406 | "|OOOOOOOOO" , dialect_kws, |
407 | &dialect, |
408 | &delimiter, |
409 | &doublequote, |
410 | &escapechar, |
411 | &lineterminator, |
412 | "echar, |
413 | "ing, |
414 | &skipinitialspace, |
415 | &strict)) |
416 | return NULL; |
417 | |
418 | _csvstate *module_state = _csv_state_from_type(type, "dialect_new" ); |
419 | if (module_state == NULL) { |
420 | return NULL; |
421 | } |
422 | |
423 | if (dialect != NULL) { |
424 | if (PyUnicode_Check(dialect)) { |
425 | dialect = get_dialect_from_registry(dialect, module_state); |
426 | if (dialect == NULL) |
427 | return NULL; |
428 | } |
429 | else |
430 | Py_INCREF(dialect); |
431 | /* Can we reuse this instance? */ |
432 | if (PyObject_TypeCheck(dialect, module_state->dialect_type) && |
433 | delimiter == NULL && |
434 | doublequote == NULL && |
435 | escapechar == NULL && |
436 | lineterminator == NULL && |
437 | quotechar == NULL && |
438 | quoting == NULL && |
439 | skipinitialspace == NULL && |
440 | strict == NULL) |
441 | return dialect; |
442 | } |
443 | |
444 | self = (DialectObj *)type->tp_alloc(type, 0); |
445 | if (self == NULL) { |
446 | Py_CLEAR(dialect); |
447 | return NULL; |
448 | } |
449 | self->lineterminator = NULL; |
450 | |
451 | Py_XINCREF(delimiter); |
452 | Py_XINCREF(doublequote); |
453 | Py_XINCREF(escapechar); |
454 | Py_XINCREF(lineterminator); |
455 | Py_XINCREF(quotechar); |
456 | Py_XINCREF(quoting); |
457 | Py_XINCREF(skipinitialspace); |
458 | Py_XINCREF(strict); |
459 | if (dialect != NULL) { |
460 | #define DIALECT_GETATTR(v, n) \ |
461 | do { \ |
462 | if (v == NULL) { \ |
463 | v = PyObject_GetAttrString(dialect, n); \ |
464 | if (v == NULL) \ |
465 | PyErr_Clear(); \ |
466 | } \ |
467 | } while (0) |
468 | DIALECT_GETATTR(delimiter, "delimiter" ); |
469 | DIALECT_GETATTR(doublequote, "doublequote" ); |
470 | DIALECT_GETATTR(escapechar, "escapechar" ); |
471 | DIALECT_GETATTR(lineterminator, "lineterminator" ); |
472 | DIALECT_GETATTR(quotechar, "quotechar" ); |
473 | DIALECT_GETATTR(quoting, "quoting" ); |
474 | DIALECT_GETATTR(skipinitialspace, "skipinitialspace" ); |
475 | DIALECT_GETATTR(strict, "strict" ); |
476 | } |
477 | |
478 | /* check types and convert to C values */ |
479 | #define DIASET(meth, name, target, src, dflt) \ |
480 | if (meth(name, target, src, dflt)) \ |
481 | goto err |
482 | DIASET(_set_char, "delimiter" , &self->delimiter, delimiter, ','); |
483 | DIASET(_set_bool, "doublequote" , &self->doublequote, doublequote, true); |
484 | DIASET(_set_char_or_none, "escapechar" , &self->escapechar, escapechar, 0); |
485 | DIASET(_set_str, "lineterminator" , &self->lineterminator, lineterminator, "\r\n" ); |
486 | DIASET(_set_char_or_none, "quotechar" , &self->quotechar, quotechar, '"'); |
487 | DIASET(_set_int, "quoting" , &self->quoting, quoting, QUOTE_MINIMAL); |
488 | DIASET(_set_bool, "skipinitialspace" , &self->skipinitialspace, skipinitialspace, false); |
489 | DIASET(_set_bool, "strict" , &self->strict, strict, false); |
490 | |
491 | /* validate options */ |
492 | if (dialect_check_quoting(self->quoting)) |
493 | goto err; |
494 | if (self->delimiter == 0) { |
495 | PyErr_SetString(PyExc_TypeError, |
496 | "\"delimiter\" must be a 1-character string" ); |
497 | goto err; |
498 | } |
499 | if (quotechar == Py_None && quoting == NULL) |
500 | self->quoting = QUOTE_NONE; |
501 | if (self->quoting != QUOTE_NONE && self->quotechar == 0) { |
502 | PyErr_SetString(PyExc_TypeError, |
503 | "quotechar must be set if quoting enabled" ); |
504 | goto err; |
505 | } |
506 | if (self->lineterminator == 0) { |
507 | PyErr_SetString(PyExc_TypeError, "lineterminator must be set" ); |
508 | goto err; |
509 | } |
510 | |
511 | ret = (PyObject *)self; |
512 | Py_INCREF(self); |
513 | err: |
514 | Py_CLEAR(self); |
515 | Py_CLEAR(dialect); |
516 | Py_CLEAR(delimiter); |
517 | Py_CLEAR(doublequote); |
518 | Py_CLEAR(escapechar); |
519 | Py_CLEAR(lineterminator); |
520 | Py_CLEAR(quotechar); |
521 | Py_CLEAR(quoting); |
522 | Py_CLEAR(skipinitialspace); |
523 | Py_CLEAR(strict); |
524 | return ret; |
525 | } |
526 | |
527 | /* Since dialect is now a heap type, it inherits pickling method for |
528 | * protocol 0 and 1 from object, therefore it needs to be overridden */ |
529 | |
530 | PyDoc_STRVAR(dialect_reduce_doc, "raises an exception to avoid pickling" ); |
531 | |
532 | static PyObject * |
533 | Dialect_reduce(PyObject *self, PyObject *args) { |
534 | PyErr_Format(PyExc_TypeError, |
535 | "cannot pickle '%.100s' instances" , _PyType_Name(Py_TYPE(self))); |
536 | return NULL; |
537 | } |
538 | |
539 | static struct PyMethodDef dialect_methods[] = { |
540 | {"__reduce__" , Dialect_reduce, METH_VARARGS, dialect_reduce_doc}, |
541 | {"__reduce_ex__" , Dialect_reduce, METH_VARARGS, dialect_reduce_doc}, |
542 | {NULL, NULL} |
543 | }; |
544 | |
545 | PyDoc_STRVAR(Dialect_Type_doc, |
546 | "CSV dialect\n" |
547 | "\n" |
548 | "The Dialect type records CSV parsing and generation options.\n" ); |
549 | |
/* tp_clear for Dialect: release the only object reference the instance
 * owns.  Always returns 0. */
static int
Dialect_clear(DialectObj *self)
{
    Py_CLEAR(self->lineterminator);
    return 0;
}
556 | |
/* tp_traverse for Dialect: report the instance's references to the GC. */
static int
Dialect_traverse(DialectObj *self, visitproc visit, void *arg)
{
    Py_VISIT(self->lineterminator);
    /* The instance also references its type (released in Dialect_dealloc). */
    Py_VISIT(Py_TYPE(self));
    return 0;
}
564 | |
565 | static PyType_Slot Dialect_Type_slots[] = { |
566 | {Py_tp_doc, (char*)Dialect_Type_doc}, |
567 | {Py_tp_members, Dialect_memberlist}, |
568 | {Py_tp_getset, Dialect_getsetlist}, |
569 | {Py_tp_new, dialect_new}, |
570 | {Py_tp_methods, dialect_methods}, |
571 | {Py_tp_dealloc, Dialect_dealloc}, |
572 | {Py_tp_clear, Dialect_clear}, |
573 | {Py_tp_traverse, Dialect_traverse}, |
574 | {0, NULL} |
575 | }; |
576 | |
577 | PyType_Spec Dialect_Type_spec = { |
578 | .name = "_csv.Dialect" , |
579 | .basicsize = sizeof(DialectObj), |
580 | .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC | |
581 | Py_TPFLAGS_IMMUTABLETYPE), |
582 | .slots = Dialect_Type_slots, |
583 | }; |
584 | |
585 | |
586 | /* |
587 | * Return an instance of the dialect type, given a Python instance or kwarg |
588 | * description of the dialect |
589 | */ |
590 | static PyObject * |
591 | _call_dialect(_csvstate *module_state, PyObject *dialect_inst, PyObject *kwargs) |
592 | { |
593 | PyObject *type = (PyObject *)module_state->dialect_type; |
594 | if (dialect_inst) { |
595 | return PyObject_VectorcallDict(type, &dialect_inst, 1, kwargs); |
596 | } |
597 | else { |
598 | return PyObject_VectorcallDict(type, NULL, 0, kwargs); |
599 | } |
600 | } |
601 | |
602 | /* |
603 | * READER |
604 | */ |
605 | static int |
606 | parse_save_field(ReaderObj *self) |
607 | { |
608 | PyObject *field; |
609 | |
610 | field = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, |
611 | (void *) self->field, self->field_len); |
612 | if (field == NULL) |
613 | return -1; |
614 | self->field_len = 0; |
615 | if (self->numeric_field) { |
616 | PyObject *tmp; |
617 | |
618 | self->numeric_field = 0; |
619 | tmp = PyNumber_Float(field); |
620 | Py_DECREF(field); |
621 | if (tmp == NULL) |
622 | return -1; |
623 | field = tmp; |
624 | } |
625 | if (PyList_Append(self->fields, field) < 0) { |
626 | Py_DECREF(field); |
627 | return -1; |
628 | } |
629 | Py_DECREF(field); |
630 | return 0; |
631 | } |
632 | |
633 | static int |
634 | parse_grow_buff(ReaderObj *self) |
635 | { |
636 | assert((size_t)self->field_size <= PY_SSIZE_T_MAX / sizeof(Py_UCS4)); |
637 | |
638 | Py_ssize_t field_size_new = self->field_size ? 2 * self->field_size : 4096; |
639 | Py_UCS4 *field_new = self->field; |
640 | PyMem_Resize(field_new, Py_UCS4, field_size_new); |
641 | if (field_new == NULL) { |
642 | PyErr_NoMemory(); |
643 | return 0; |
644 | } |
645 | self->field = field_new; |
646 | self->field_size = field_size_new; |
647 | return 1; |
648 | } |
649 | |
650 | static int |
651 | parse_add_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c) |
652 | { |
653 | if (self->field_len >= module_state->field_limit) { |
654 | PyErr_Format(module_state->error_obj, |
655 | "field larger than field limit (%ld)" , |
656 | module_state->field_limit); |
657 | return -1; |
658 | } |
659 | if (self->field_len == self->field_size && !parse_grow_buff(self)) |
660 | return -1; |
661 | self->field[self->field_len++] = c; |
662 | return 0; |
663 | } |
664 | |
/*
 * Advance the reader's state machine by one input character.
 *
 * `c` is the next character of the current line; Reader_iternext passes
 * c == '\0' once after the last character of each line to signal end of
 * line.  Depending on self->state the character is stored in the current
 * field, ends the field (parse_save_field), or changes state.  A record is
 * complete when the state returns to START_RECORD.
 *
 * Returns 0 on success, -1 with an exception set on error.
 */
static int
parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
{
    DialectObj *dialect = self->dialect;

    switch (self->state) {
    case START_RECORD:
        /* start of record */
        if (c == '\0')
            /* empty line - return [] */
            break;
        else if (c == '\n' || c == '\r') {
            self->state = EAT_CRNL;
            break;
        }
        /* normal character - handle as START_FIELD */
        self->state = START_FIELD;
        /* fallthru */
    case START_FIELD:
        /* expecting field */
        if (c == '\n' || c == '\r' || c == '\0') {
            /* save empty field - return [fields] */
            if (parse_save_field(self) < 0)
                return -1;
            self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
        }
        else if (c == dialect->quotechar &&
                 dialect->quoting != QUOTE_NONE) {
            /* start quoted field */
            self->state = IN_QUOTED_FIELD;
        }
        else if (c == dialect->escapechar) {
            /* possible escaped character */
            self->state = ESCAPED_CHAR;
        }
        else if (c == ' ' && dialect->skipinitialspace)
            /* ignore space at start of field */
            ;
        else if (c == dialect->delimiter) {
            /* save empty field */
            if (parse_save_field(self) < 0)
                return -1;
        }
        else {
            /* begin new unquoted field */
            /* under QUOTE_NONNUMERIC an unquoted field is flagged so that
             * parse_save_field converts it to a float */
            if (dialect->quoting == QUOTE_NONNUMERIC)
                self->numeric_field = 1;
            if (parse_add_char(self, module_state, c) < 0)
                return -1;
            self->state = IN_FIELD;
        }
        break;

    case ESCAPED_CHAR:
        /* character following an escapechar outside a quoted field */
        if (c == '\n' || c=='\r') {
            /* escaped line ending: keep it as part of the field */
            if (parse_add_char(self, module_state, c) < 0)
                return -1;
            self->state = AFTER_ESCAPED_CRNL;
            break;
        }
        /* escapechar was the last character of the line: store '\n' */
        if (c == '\0')
            c = '\n';
        if (parse_add_char(self, module_state, c) < 0)
            return -1;
        self->state = IN_FIELD;
        break;

    case AFTER_ESCAPED_CRNL:
        /* end of line right after an escaped line ending: stay in this
         * state so the field continues on the next line */
        if (c == '\0')
            break;
        /*fallthru*/

    case IN_FIELD:
        /* in unquoted field */
        if (c == '\n' || c == '\r' || c == '\0') {
            /* end of line - return [fields] */
            if (parse_save_field(self) < 0)
                return -1;
            self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
        }
        else if (c == dialect->escapechar) {
            /* possible escaped character */
            self->state = ESCAPED_CHAR;
        }
        else if (c == dialect->delimiter) {
            /* save field - wait for new field */
            if (parse_save_field(self) < 0)
                return -1;
            self->state = START_FIELD;
        }
        else {
            /* normal character - save in field */
            if (parse_add_char(self, module_state, c) < 0)
                return -1;
        }
        break;

    case IN_QUOTED_FIELD:
        /* in quoted field */
        if (c == '\0')
            /* end of line inside quotes: the field continues on the next
             * line, so nothing is stored and the state is kept */
            ;
        else if (c == dialect->escapechar) {
            /* Possible escape character */
            self->state = ESCAPE_IN_QUOTED_FIELD;
        }
        else if (c == dialect->quotechar &&
                 dialect->quoting != QUOTE_NONE) {
            if (dialect->doublequote) {
                /* doublequote; " represented by "" */
                self->state = QUOTE_IN_QUOTED_FIELD;
            }
            else {
                /* end of quote part of field */
                self->state = IN_FIELD;
            }
        }
        else {
            /* normal character - save in field */
            if (parse_add_char(self, module_state, c) < 0)
                return -1;
        }
        break;

    case ESCAPE_IN_QUOTED_FIELD:
        /* character following an escapechar inside a quoted field;
         * an escapechar at end of line stores '\n' */
        if (c == '\0')
            c = '\n';
        if (parse_add_char(self, module_state, c) < 0)
            return -1;
        self->state = IN_QUOTED_FIELD;
        break;

    case QUOTE_IN_QUOTED_FIELD:
        /* doublequote - seen a quote in a quoted field */
        if (dialect->quoting != QUOTE_NONE &&
            c == dialect->quotechar) {
            /* save "" as " */
            if (parse_add_char(self, module_state, c) < 0)
                return -1;
            self->state = IN_QUOTED_FIELD;
        }
        else if (c == dialect->delimiter) {
            /* save field - wait for new field */
            if (parse_save_field(self) < 0)
                return -1;
            self->state = START_FIELD;
        }
        else if (c == '\n' || c == '\r' || c == '\0') {
            /* end of line - return [fields] */
            if (parse_save_field(self) < 0)
                return -1;
            self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
        }
        else if (!dialect->strict) {
            /* non-strict: tolerate text after the closing quote and
             * continue as an unquoted field */
            if (parse_add_char(self, module_state, c) < 0)
                return -1;
            self->state = IN_FIELD;
        }
        else {
            /* illegal */
            /* NOTE(review): delimiter and quotechar are Py_UCS4 values
             * formatted with %c - confirm this renders non-ASCII
             * characters as intended */
            PyErr_Format(module_state->error_obj, "'%c' expected after '%c'",
                            dialect->delimiter,
                            dialect->quotechar);
            return -1;
        }
        break;

    case EAT_CRNL:
        /* after a line ending: further '\n'/'\r' are skipped, end of line
         * completes the record, anything else is an error */
        if (c == '\n' || c == '\r')
            ;
        else if (c == '\0')
            self->state = START_RECORD;
        else {
            PyErr_Format(module_state->error_obj,
                         "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
            return -1;
        }
        break;

    }
    return 0;
}
846 | |
847 | static int |
848 | parse_reset(ReaderObj *self) |
849 | { |
850 | Py_XSETREF(self->fields, PyList_New(0)); |
851 | if (self->fields == NULL) |
852 | return -1; |
853 | self->field_len = 0; |
854 | self->state = START_RECORD; |
855 | self->numeric_field = 0; |
856 | return 0; |
857 | } |
858 | |
859 | static PyObject * |
860 | Reader_iternext(ReaderObj *self) |
861 | { |
862 | PyObject *fields = NULL; |
863 | Py_UCS4 c; |
864 | Py_ssize_t pos, linelen; |
865 | unsigned int kind; |
866 | const void *data; |
867 | PyObject *lineobj; |
868 | |
869 | _csvstate *module_state = _csv_state_from_type(Py_TYPE(self), |
870 | "Reader.__next__" ); |
871 | if (module_state == NULL) { |
872 | return NULL; |
873 | } |
874 | |
875 | if (parse_reset(self) < 0) |
876 | return NULL; |
877 | do { |
878 | lineobj = PyIter_Next(self->input_iter); |
879 | if (lineobj == NULL) { |
880 | /* End of input OR exception */ |
881 | if (!PyErr_Occurred() && (self->field_len != 0 || |
882 | self->state == IN_QUOTED_FIELD)) { |
883 | if (self->dialect->strict) |
884 | PyErr_SetString(module_state->error_obj, |
885 | "unexpected end of data" ); |
886 | else if (parse_save_field(self) >= 0) |
887 | break; |
888 | } |
889 | return NULL; |
890 | } |
891 | if (!PyUnicode_Check(lineobj)) { |
892 | PyErr_Format(module_state->error_obj, |
893 | "iterator should return strings, " |
894 | "not %.200s " |
895 | "(the file should be opened in text mode)" , |
896 | Py_TYPE(lineobj)->tp_name |
897 | ); |
898 | Py_DECREF(lineobj); |
899 | return NULL; |
900 | } |
901 | if (PyUnicode_READY(lineobj) == -1) { |
902 | Py_DECREF(lineobj); |
903 | return NULL; |
904 | } |
905 | ++self->line_num; |
906 | kind = PyUnicode_KIND(lineobj); |
907 | data = PyUnicode_DATA(lineobj); |
908 | pos = 0; |
909 | linelen = PyUnicode_GET_LENGTH(lineobj); |
910 | while (linelen--) { |
911 | c = PyUnicode_READ(kind, data, pos); |
912 | if (c == '\0') { |
913 | Py_DECREF(lineobj); |
914 | PyErr_Format(module_state->error_obj, |
915 | "line contains NUL" ); |
916 | goto err; |
917 | } |
918 | if (parse_process_char(self, module_state, c) < 0) { |
919 | Py_DECREF(lineobj); |
920 | goto err; |
921 | } |
922 | pos++; |
923 | } |
924 | Py_DECREF(lineobj); |
925 | if (parse_process_char(self, module_state, 0) < 0) |
926 | goto err; |
927 | } while (self->state != START_RECORD); |
928 | |
929 | fields = self->fields; |
930 | self->fields = NULL; |
931 | err: |
932 | return fields; |
933 | } |
934 | |
935 | static void |
936 | Reader_dealloc(ReaderObj *self) |
937 | { |
938 | PyTypeObject *tp = Py_TYPE(self); |
939 | PyObject_GC_UnTrack(self); |
940 | tp->tp_clear((PyObject *)self); |
941 | if (self->field != NULL) { |
942 | PyMem_Free(self->field); |
943 | self->field = NULL; |
944 | } |
945 | PyObject_GC_Del(self); |
946 | Py_DECREF(tp); |
947 | } |
948 | |
/* tp_traverse for the reader: report the instance's references to the GC. */
static int
Reader_traverse(ReaderObj *self, visitproc visit, void *arg)
{
    Py_VISIT(self->dialect);
    Py_VISIT(self->input_iter);
    Py_VISIT(self->fields);
    /* The instance also references its type (released in Reader_dealloc). */
    Py_VISIT(Py_TYPE(self));
    return 0;
}
958 | |
/* tp_clear for the reader: release every object reference the instance
 * owns.  The raw field buffer is freed separately in Reader_dealloc.
 * Always returns 0. */
static int
Reader_clear(ReaderObj *self)
{
    Py_CLEAR(self->dialect);
    Py_CLEAR(self->input_iter);
    Py_CLEAR(self->fields);
    return 0;
}
967 | |
968 | PyDoc_STRVAR(Reader_Type_doc, |
969 | "CSV reader\n" |
970 | "\n" |
971 | "Reader objects are responsible for reading and parsing tabular data\n" |
972 | "in CSV format.\n" |
973 | ); |
974 | |
975 | static struct PyMethodDef Reader_methods[] = { |
976 | { NULL, NULL } |
977 | }; |
978 | #define R_OFF(x) offsetof(ReaderObj, x) |
979 | |
980 | static struct PyMemberDef Reader_memberlist[] = { |
981 | { "dialect" , T_OBJECT, R_OFF(dialect), READONLY }, |
982 | { "line_num" , T_ULONG, R_OFF(line_num), READONLY }, |
983 | { NULL } |
984 | }; |
985 | |
986 | |
987 | static PyType_Slot Reader_Type_slots[] = { |
988 | {Py_tp_doc, (char*)Reader_Type_doc}, |
989 | {Py_tp_traverse, Reader_traverse}, |
990 | {Py_tp_iter, PyObject_SelfIter}, |
991 | {Py_tp_iternext, Reader_iternext}, |
992 | {Py_tp_methods, Reader_methods}, |
993 | {Py_tp_members, Reader_memberlist}, |
994 | {Py_tp_clear, Reader_clear}, |
995 | {Py_tp_dealloc, Reader_dealloc}, |
996 | {0, NULL} |
997 | }; |
998 | |
999 | PyType_Spec Reader_Type_spec = { |
1000 | .name = "_csv.reader" , |
1001 | .basicsize = sizeof(ReaderObj), |
1002 | .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC | |
1003 | Py_TPFLAGS_IMMUTABLETYPE), |
1004 | .slots = Reader_Type_slots |
1005 | }; |
1006 | |
1007 | |
1008 | static PyObject * |
1009 | csv_reader(PyObject *module, PyObject *args, PyObject *keyword_args) |
1010 | { |
1011 | PyObject * iterator, * dialect = NULL; |
1012 | _csvstate *module_state = get_csv_state(module); |
1013 | ReaderObj * self = PyObject_GC_New( |
1014 | ReaderObj, |
1015 | module_state->reader_type); |
1016 | |
1017 | if (!self) |
1018 | return NULL; |
1019 | |
1020 | self->dialect = NULL; |
1021 | self->fields = NULL; |
1022 | self->input_iter = NULL; |
1023 | self->field = NULL; |
1024 | self->field_size = 0; |
1025 | self->line_num = 0; |
1026 | |
1027 | if (parse_reset(self) < 0) { |
1028 | Py_DECREF(self); |
1029 | return NULL; |
1030 | } |
1031 | |
1032 | if (!PyArg_UnpackTuple(args, "" , 1, 2, &iterator, &dialect)) { |
1033 | Py_DECREF(self); |
1034 | return NULL; |
1035 | } |
1036 | self->input_iter = PyObject_GetIter(iterator); |
1037 | if (self->input_iter == NULL) { |
1038 | Py_DECREF(self); |
1039 | return NULL; |
1040 | } |
1041 | self->dialect = (DialectObj *)_call_dialect(module_state, dialect, |
1042 | keyword_args); |
1043 | if (self->dialect == NULL) { |
1044 | Py_DECREF(self); |
1045 | return NULL; |
1046 | } |
1047 | |
1048 | PyObject_GC_Track(self); |
1049 | return (PyObject *)self; |
1050 | } |
1051 | |
1052 | /* |
1053 | * WRITER |
1054 | */ |
1055 | /* ---------------------------------------------------------------- */ |
1056 | static void |
1057 | join_reset(WriterObj *self) |
1058 | { |
1059 | self->rec_len = 0; |
1060 | self->num_fields = 0; |
1061 | } |
1062 | |
1063 | #define MEM_INCR 32768 |
1064 | |
1065 | /* Calculate new record length or append field to record. Return new |
1066 | * record length. |
1067 | */ |
1068 | static Py_ssize_t |
1069 | join_append_data(WriterObj *self, unsigned int field_kind, const void *field_data, |
1070 | Py_ssize_t field_len, int *quoted, |
1071 | int copy_phase) |
1072 | { |
1073 | DialectObj *dialect = self->dialect; |
1074 | int i; |
1075 | Py_ssize_t rec_len; |
1076 | |
1077 | #define INCLEN \ |
1078 | do {\ |
1079 | if (!copy_phase && rec_len == PY_SSIZE_T_MAX) { \ |
1080 | goto overflow; \ |
1081 | } \ |
1082 | rec_len++; \ |
1083 | } while(0) |
1084 | |
1085 | #define ADDCH(c) \ |
1086 | do {\ |
1087 | if (copy_phase) \ |
1088 | self->rec[rec_len] = c;\ |
1089 | INCLEN;\ |
1090 | } while(0) |
1091 | |
1092 | rec_len = self->rec_len; |
1093 | |
1094 | /* If this is not the first field we need a field separator */ |
1095 | if (self->num_fields > 0) |
1096 | ADDCH(dialect->delimiter); |
1097 | |
1098 | /* Handle preceding quote */ |
1099 | if (copy_phase && *quoted) |
1100 | ADDCH(dialect->quotechar); |
1101 | |
1102 | /* Copy/count field data */ |
1103 | /* If field is null just pass over */ |
1104 | for (i = 0; field_data && (i < field_len); i++) { |
1105 | Py_UCS4 c = PyUnicode_READ(field_kind, field_data, i); |
1106 | int want_escape = 0; |
1107 | |
1108 | if (c == dialect->delimiter || |
1109 | c == dialect->escapechar || |
1110 | c == dialect->quotechar || |
1111 | PyUnicode_FindChar( |
1112 | dialect->lineterminator, c, 0, |
1113 | PyUnicode_GET_LENGTH(dialect->lineterminator), 1) >= 0) { |
1114 | if (dialect->quoting == QUOTE_NONE) |
1115 | want_escape = 1; |
1116 | else { |
1117 | if (c == dialect->quotechar) { |
1118 | if (dialect->doublequote) |
1119 | ADDCH(dialect->quotechar); |
1120 | else |
1121 | want_escape = 1; |
1122 | } |
1123 | else if (c == dialect->escapechar) { |
1124 | want_escape = 1; |
1125 | } |
1126 | if (!want_escape) |
1127 | *quoted = 1; |
1128 | } |
1129 | if (want_escape) { |
1130 | if (!dialect->escapechar) { |
1131 | PyErr_Format(self->error_obj, |
1132 | "need to escape, but no escapechar set" ); |
1133 | return -1; |
1134 | } |
1135 | ADDCH(dialect->escapechar); |
1136 | } |
1137 | } |
1138 | /* Copy field character into record buffer. |
1139 | */ |
1140 | ADDCH(c); |
1141 | } |
1142 | |
1143 | if (*quoted) { |
1144 | if (copy_phase) |
1145 | ADDCH(dialect->quotechar); |
1146 | else { |
1147 | INCLEN; /* starting quote */ |
1148 | INCLEN; /* ending quote */ |
1149 | } |
1150 | } |
1151 | return rec_len; |
1152 | |
1153 | overflow: |
1154 | PyErr_NoMemory(); |
1155 | return -1; |
1156 | #undef ADDCH |
1157 | #undef INCLEN |
1158 | } |
1159 | |
1160 | static int |
1161 | join_check_rec_size(WriterObj *self, Py_ssize_t rec_len) |
1162 | { |
1163 | assert(rec_len >= 0); |
1164 | |
1165 | if (rec_len > self->rec_size) { |
1166 | size_t rec_size_new = (size_t)(rec_len / MEM_INCR + 1) * MEM_INCR; |
1167 | Py_UCS4 *rec_new = self->rec; |
1168 | PyMem_Resize(rec_new, Py_UCS4, rec_size_new); |
1169 | if (rec_new == NULL) { |
1170 | PyErr_NoMemory(); |
1171 | return 0; |
1172 | } |
1173 | self->rec = rec_new; |
1174 | self->rec_size = (Py_ssize_t)rec_size_new; |
1175 | } |
1176 | return 1; |
1177 | } |
1178 | |
1179 | static int |
1180 | join_append(WriterObj *self, PyObject *field, int quoted) |
1181 | { |
1182 | unsigned int field_kind = -1; |
1183 | const void *field_data = NULL; |
1184 | Py_ssize_t field_len = 0; |
1185 | Py_ssize_t rec_len; |
1186 | |
1187 | if (field != NULL) { |
1188 | if (PyUnicode_READY(field) == -1) |
1189 | return 0; |
1190 | field_kind = PyUnicode_KIND(field); |
1191 | field_data = PyUnicode_DATA(field); |
1192 | field_len = PyUnicode_GET_LENGTH(field); |
1193 | } |
1194 | rec_len = join_append_data(self, field_kind, field_data, field_len, |
1195 | "ed, 0); |
1196 | if (rec_len < 0) |
1197 | return 0; |
1198 | |
1199 | /* grow record buffer if necessary */ |
1200 | if (!join_check_rec_size(self, rec_len)) |
1201 | return 0; |
1202 | |
1203 | self->rec_len = join_append_data(self, field_kind, field_data, field_len, |
1204 | "ed, 1); |
1205 | self->num_fields++; |
1206 | |
1207 | return 1; |
1208 | } |
1209 | |
1210 | static int |
1211 | join_append_lineterminator(WriterObj *self) |
1212 | { |
1213 | Py_ssize_t terminator_len, i; |
1214 | unsigned int term_kind; |
1215 | const void *term_data; |
1216 | |
1217 | terminator_len = PyUnicode_GET_LENGTH(self->dialect->lineterminator); |
1218 | if (terminator_len == -1) |
1219 | return 0; |
1220 | |
1221 | /* grow record buffer if necessary */ |
1222 | if (!join_check_rec_size(self, self->rec_len + terminator_len)) |
1223 | return 0; |
1224 | |
1225 | term_kind = PyUnicode_KIND(self->dialect->lineterminator); |
1226 | term_data = PyUnicode_DATA(self->dialect->lineterminator); |
1227 | for (i = 0; i < terminator_len; i++) |
1228 | self->rec[self->rec_len + i] = PyUnicode_READ(term_kind, term_data, i); |
1229 | self->rec_len += terminator_len; |
1230 | |
1231 | return 1; |
1232 | } |
1233 | |
/* Docstring attached to the writer's "writerow" method in Writer_methods. */
PyDoc_STRVAR(csv_writerow_doc,
"writerow(iterable)\n"
"\n"
"Construct and write a CSV record from an iterable of fields. Non-string\n"
"elements will be converted to string.");
1239 | |
1240 | static PyObject * |
1241 | csv_writerow(WriterObj *self, PyObject *seq) |
1242 | { |
1243 | DialectObj *dialect = self->dialect; |
1244 | PyObject *iter, *field, *line, *result; |
1245 | |
1246 | iter = PyObject_GetIter(seq); |
1247 | if (iter == NULL) { |
1248 | if (PyErr_ExceptionMatches(PyExc_TypeError)) { |
1249 | PyErr_Format(self->error_obj, |
1250 | "iterable expected, not %.200s" , |
1251 | Py_TYPE(seq)->tp_name); |
1252 | } |
1253 | return NULL; |
1254 | } |
1255 | |
1256 | /* Join all fields in internal buffer. |
1257 | */ |
1258 | join_reset(self); |
1259 | while ((field = PyIter_Next(iter))) { |
1260 | int append_ok; |
1261 | int quoted; |
1262 | |
1263 | switch (dialect->quoting) { |
1264 | case QUOTE_NONNUMERIC: |
1265 | quoted = !PyNumber_Check(field); |
1266 | break; |
1267 | case QUOTE_ALL: |
1268 | quoted = 1; |
1269 | break; |
1270 | default: |
1271 | quoted = 0; |
1272 | break; |
1273 | } |
1274 | |
1275 | if (PyUnicode_Check(field)) { |
1276 | append_ok = join_append(self, field, quoted); |
1277 | Py_DECREF(field); |
1278 | } |
1279 | else if (field == Py_None) { |
1280 | append_ok = join_append(self, NULL, quoted); |
1281 | Py_DECREF(field); |
1282 | } |
1283 | else { |
1284 | PyObject *str; |
1285 | |
1286 | str = PyObject_Str(field); |
1287 | Py_DECREF(field); |
1288 | if (str == NULL) { |
1289 | Py_DECREF(iter); |
1290 | return NULL; |
1291 | } |
1292 | append_ok = join_append(self, str, quoted); |
1293 | Py_DECREF(str); |
1294 | } |
1295 | if (!append_ok) { |
1296 | Py_DECREF(iter); |
1297 | return NULL; |
1298 | } |
1299 | } |
1300 | Py_DECREF(iter); |
1301 | if (PyErr_Occurred()) |
1302 | return NULL; |
1303 | |
1304 | if (self->num_fields > 0 && self->rec_len == 0) { |
1305 | if (dialect->quoting == QUOTE_NONE) { |
1306 | PyErr_Format(self->error_obj, |
1307 | "single empty field record must be quoted" ); |
1308 | return NULL; |
1309 | } |
1310 | self->num_fields--; |
1311 | if (!join_append(self, NULL, 1)) |
1312 | return NULL; |
1313 | } |
1314 | |
1315 | /* Add line terminator. |
1316 | */ |
1317 | if (!join_append_lineterminator(self)) { |
1318 | return NULL; |
1319 | } |
1320 | |
1321 | line = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, |
1322 | (void *) self->rec, self->rec_len); |
1323 | if (line == NULL) { |
1324 | return NULL; |
1325 | } |
1326 | result = PyObject_CallOneArg(self->write, line); |
1327 | Py_DECREF(line); |
1328 | return result; |
1329 | } |
1330 | |
/* Docstring attached to the writer's "writerows" method in Writer_methods. */
PyDoc_STRVAR(csv_writerows_doc,
"writerows(iterable of iterables)\n"
"\n"
"Construct and write a series of iterables to a csv file. Non-string\n"
"elements will be converted to string.");
1336 | |
1337 | static PyObject * |
1338 | csv_writerows(WriterObj *self, PyObject *seqseq) |
1339 | { |
1340 | PyObject *row_iter, *row_obj, *result; |
1341 | |
1342 | row_iter = PyObject_GetIter(seqseq); |
1343 | if (row_iter == NULL) { |
1344 | return NULL; |
1345 | } |
1346 | while ((row_obj = PyIter_Next(row_iter))) { |
1347 | result = csv_writerow(self, row_obj); |
1348 | Py_DECREF(row_obj); |
1349 | if (!result) { |
1350 | Py_DECREF(row_iter); |
1351 | return NULL; |
1352 | } |
1353 | else |
1354 | Py_DECREF(result); |
1355 | } |
1356 | Py_DECREF(row_iter); |
1357 | if (PyErr_Occurred()) |
1358 | return NULL; |
1359 | Py_RETURN_NONE; |
1360 | } |
1361 | |
1362 | static struct PyMethodDef Writer_methods[] = { |
1363 | { "writerow" , (PyCFunction)csv_writerow, METH_O, csv_writerow_doc}, |
1364 | { "writerows" , (PyCFunction)csv_writerows, METH_O, csv_writerows_doc}, |
1365 | { NULL, NULL } |
1366 | }; |
1367 | |
1368 | #define W_OFF(x) offsetof(WriterObj, x) |
1369 | |
1370 | static struct PyMemberDef Writer_memberlist[] = { |
1371 | { "dialect" , T_OBJECT, W_OFF(dialect), READONLY }, |
1372 | { NULL } |
1373 | }; |
1374 | |
1375 | static int |
1376 | Writer_traverse(WriterObj *self, visitproc visit, void *arg) |
1377 | { |
1378 | Py_VISIT(self->dialect); |
1379 | Py_VISIT(self->write); |
1380 | Py_VISIT(self->error_obj); |
1381 | Py_VISIT(Py_TYPE(self)); |
1382 | return 0; |
1383 | } |
1384 | |
1385 | static int |
1386 | Writer_clear(WriterObj *self) |
1387 | { |
1388 | Py_CLEAR(self->dialect); |
1389 | Py_CLEAR(self->write); |
1390 | Py_CLEAR(self->error_obj); |
1391 | return 0; |
1392 | } |
1393 | |
1394 | static void |
1395 | Writer_dealloc(WriterObj *self) |
1396 | { |
1397 | PyTypeObject *tp = Py_TYPE(self); |
1398 | PyObject_GC_UnTrack(self); |
1399 | tp->tp_clear((PyObject *)self); |
1400 | if (self->rec != NULL) { |
1401 | PyMem_Free(self->rec); |
1402 | } |
1403 | PyObject_GC_Del(self); |
1404 | Py_DECREF(tp); |
1405 | } |
1406 | |
/* Docstring of the writer type, installed through Py_tp_doc in
 * Writer_Type_slots. */
PyDoc_STRVAR(Writer_Type_doc,
"CSV writer\n"
"\n"
"Writer objects are responsible for generating tabular data\n"
"in CSV format from sequence input.\n"
);
1413 | |
1414 | static PyType_Slot Writer_Type_slots[] = { |
1415 | {Py_tp_doc, (char*)Writer_Type_doc}, |
1416 | {Py_tp_traverse, Writer_traverse}, |
1417 | {Py_tp_clear, Writer_clear}, |
1418 | {Py_tp_dealloc, Writer_dealloc}, |
1419 | {Py_tp_methods, Writer_methods}, |
1420 | {Py_tp_members, Writer_memberlist}, |
1421 | {0, NULL} |
1422 | }; |
1423 | |
1424 | PyType_Spec Writer_Type_spec = { |
1425 | .name = "_csv.writer" , |
1426 | .basicsize = sizeof(WriterObj), |
1427 | .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC | |
1428 | Py_TPFLAGS_IMMUTABLETYPE), |
1429 | .slots = Writer_Type_slots, |
1430 | }; |
1431 | |
1432 | |
1433 | static PyObject * |
1434 | csv_writer(PyObject *module, PyObject *args, PyObject *keyword_args) |
1435 | { |
1436 | PyObject * output_file, * dialect = NULL; |
1437 | _csvstate *module_state = get_csv_state(module); |
1438 | WriterObj * self = PyObject_GC_New(WriterObj, module_state->writer_type); |
1439 | _Py_IDENTIFIER(write); |
1440 | |
1441 | if (!self) |
1442 | return NULL; |
1443 | |
1444 | self->dialect = NULL; |
1445 | self->write = NULL; |
1446 | |
1447 | self->rec = NULL; |
1448 | self->rec_size = 0; |
1449 | self->rec_len = 0; |
1450 | self->num_fields = 0; |
1451 | |
1452 | self->error_obj = Py_NewRef(module_state->error_obj); |
1453 | |
1454 | if (!PyArg_UnpackTuple(args, "" , 1, 2, &output_file, &dialect)) { |
1455 | Py_DECREF(self); |
1456 | return NULL; |
1457 | } |
1458 | if (_PyObject_LookupAttrId(output_file, &PyId_write, &self->write) < 0) { |
1459 | Py_DECREF(self); |
1460 | return NULL; |
1461 | } |
1462 | if (self->write == NULL || !PyCallable_Check(self->write)) { |
1463 | PyErr_SetString(PyExc_TypeError, |
1464 | "argument 1 must have a \"write\" method" ); |
1465 | Py_DECREF(self); |
1466 | return NULL; |
1467 | } |
1468 | self->dialect = (DialectObj *)_call_dialect(module_state, dialect, |
1469 | keyword_args); |
1470 | if (self->dialect == NULL) { |
1471 | Py_DECREF(self); |
1472 | return NULL; |
1473 | } |
1474 | PyObject_GC_Track(self); |
1475 | return (PyObject *)self; |
1476 | } |
1477 | |
1478 | /* |
1479 | * DIALECT REGISTRY |
1480 | */ |
1481 | static PyObject * |
1482 | csv_list_dialects(PyObject *module, PyObject *args) |
1483 | { |
1484 | return PyDict_Keys(get_csv_state(module)->dialects); |
1485 | } |
1486 | |
1487 | static PyObject * |
1488 | csv_register_dialect(PyObject *module, PyObject *args, PyObject *kwargs) |
1489 | { |
1490 | PyObject *name_obj, *dialect_obj = NULL; |
1491 | _csvstate *module_state = get_csv_state(module); |
1492 | PyObject *dialect; |
1493 | |
1494 | if (!PyArg_UnpackTuple(args, "" , 1, 2, &name_obj, &dialect_obj)) |
1495 | return NULL; |
1496 | if (!PyUnicode_Check(name_obj)) { |
1497 | PyErr_SetString(PyExc_TypeError, |
1498 | "dialect name must be a string" ); |
1499 | return NULL; |
1500 | } |
1501 | if (PyUnicode_READY(name_obj) == -1) |
1502 | return NULL; |
1503 | dialect = _call_dialect(module_state, dialect_obj, kwargs); |
1504 | if (dialect == NULL) |
1505 | return NULL; |
1506 | if (PyDict_SetItem(module_state->dialects, name_obj, dialect) < 0) { |
1507 | Py_DECREF(dialect); |
1508 | return NULL; |
1509 | } |
1510 | Py_DECREF(dialect); |
1511 | Py_RETURN_NONE; |
1512 | } |
1513 | |
1514 | static PyObject * |
1515 | csv_unregister_dialect(PyObject *module, PyObject *name_obj) |
1516 | { |
1517 | _csvstate *module_state = get_csv_state(module); |
1518 | if (PyDict_DelItem(module_state->dialects, name_obj) < 0) { |
1519 | if (PyErr_ExceptionMatches(PyExc_KeyError)) { |
1520 | PyErr_Format(module_state->error_obj, "unknown dialect" ); |
1521 | } |
1522 | return NULL; |
1523 | } |
1524 | Py_RETURN_NONE; |
1525 | } |
1526 | |
1527 | static PyObject * |
1528 | csv_get_dialect(PyObject *module, PyObject *name_obj) |
1529 | { |
1530 | return get_dialect_from_registry(name_obj, get_csv_state(module)); |
1531 | } |
1532 | |
1533 | static PyObject * |
1534 | csv_field_size_limit(PyObject *module, PyObject *args) |
1535 | { |
1536 | PyObject *new_limit = NULL; |
1537 | _csvstate *module_state = get_csv_state(module); |
1538 | long old_limit = module_state->field_limit; |
1539 | |
1540 | if (!PyArg_UnpackTuple(args, "field_size_limit" , 0, 1, &new_limit)) |
1541 | return NULL; |
1542 | if (new_limit != NULL) { |
1543 | if (!PyLong_CheckExact(new_limit)) { |
1544 | PyErr_Format(PyExc_TypeError, |
1545 | "limit must be an integer" ); |
1546 | return NULL; |
1547 | } |
1548 | module_state->field_limit = PyLong_AsLong(new_limit); |
1549 | if (module_state->field_limit == -1 && PyErr_Occurred()) { |
1550 | module_state->field_limit = old_limit; |
1551 | return NULL; |
1552 | } |
1553 | } |
1554 | return PyLong_FromLong(old_limit); |
1555 | } |
1556 | |
1557 | static PyType_Slot error_slots[] = { |
1558 | {0, NULL}, |
1559 | }; |
1560 | |
1561 | PyType_Spec error_spec = { |
1562 | .name = "_csv.Error" , |
1563 | .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, |
1564 | .slots = error_slots, |
1565 | }; |
1566 | |
1567 | /* |
1568 | * MODULE |
1569 | */ |
1570 | |
/* Module docstring, installed through the _csvmodule definition.  It gives
 * an overview of the module, of dialect registration and of the individual
 * dialect settings. */
PyDoc_STRVAR(csv_module_doc,
"CSV parsing and writing.\n"
"\n"
"This module provides classes that assist in the reading and writing\n"
"of Comma Separated Value (CSV) files, and implements the interface\n"
"described by PEP 305. Although many CSV files are simple to parse,\n"
"the format is not formally defined by a stable specification and\n"
"is subtle enough that parsing lines of a CSV file with something\n"
"like line.split(\",\") is bound to fail. The module supports three\n"
"basic APIs: reading, writing, and registration of dialects.\n"
"\n"
"\n"
"DIALECT REGISTRATION:\n"
"\n"
"Readers and writers support a dialect argument, which is a convenient\n"
"handle on a group of settings. When the dialect argument is a string,\n"
"it identifies one of the dialects previously registered with the module.\n"
"If it is a class or instance, the attributes of the argument are used as\n"
"the settings for the reader or writer:\n"
"\n"
" class excel:\n"
" delimiter = ','\n"
" quotechar = '\"'\n"
" escapechar = None\n"
" doublequote = True\n"
" skipinitialspace = False\n"
" lineterminator = '\\r\\n'\n"
" quoting = QUOTE_MINIMAL\n"
"\n"
"SETTINGS:\n"
"\n"
" * quotechar - specifies a one-character string to use as the\n"
" quoting character. It defaults to '\"'.\n"
" * delimiter - specifies a one-character string to use as the\n"
" field separator. It defaults to ','.\n"
" * skipinitialspace - specifies how to interpret whitespace which\n"
" immediately follows a delimiter. It defaults to False, which\n"
" means that whitespace immediately following a delimiter is part\n"
" of the following field.\n"
" * lineterminator - specifies the character sequence which should\n"
" terminate rows.\n"
" * quoting - controls when quotes should be generated by the writer.\n"
" It can take on any of the following module constants:\n"
"\n"
" csv.QUOTE_MINIMAL means only when required, for example, when a\n"
" field contains either the quotechar or the delimiter\n"
" csv.QUOTE_ALL means that quotes are always placed around fields.\n"
" csv.QUOTE_NONNUMERIC means that quotes are always placed around\n"
" fields which do not parse as integers or floating point\n"
" numbers.\n"
" csv.QUOTE_NONE means that quotes are never placed around fields.\n"
" * escapechar - specifies a one-character string used to escape\n"
" the delimiter when quoting is set to QUOTE_NONE.\n"
" * doublequote - controls the handling of quotes inside fields. When\n"
" True, two consecutive quotes are interpreted as one during read,\n"
" and when writing, each quote character embedded in the data is\n"
" written as two quotes\n");
1628 | |
/* Docstring of the module-level "reader" function (see csv_methods). */
PyDoc_STRVAR(csv_reader_doc,
" csv_reader = reader(iterable [, dialect='excel']\n"
" [optional keyword args])\n"
" for row in csv_reader:\n"
" process(row)\n"
"\n"
"The \"iterable\" argument can be any object that returns a line\n"
"of input for each iteration, such as a file object or a list. The\n"
"optional \"dialect\" parameter is discussed below. The function\n"
"also accepts optional keyword arguments which override settings\n"
"provided by the dialect.\n"
"\n"
"The returned object is an iterator. Each iteration returns a row\n"
"of the CSV file (which can span multiple input lines).\n");
1643 | |
/* Docstring of the module-level "writer" function (see csv_methods). */
PyDoc_STRVAR(csv_writer_doc,
" csv_writer = csv.writer(fileobj [, dialect='excel']\n"
" [optional keyword args])\n"
" for row in sequence:\n"
" csv_writer.writerow(row)\n"
"\n"
" [or]\n"
"\n"
" csv_writer = csv.writer(fileobj [, dialect='excel']\n"
" [optional keyword args])\n"
" csv_writer.writerows(rows)\n"
"\n"
"The \"fileobj\" argument can be any object that supports the file API.\n");
1657 | |
1658 | PyDoc_STRVAR(csv_list_dialects_doc, |
1659 | "Return a list of all know dialect names.\n" |
1660 | " names = csv.list_dialects()" ); |
1661 | |
/* Docstring of the module-level "get_dialect" function. */
PyDoc_STRVAR(csv_get_dialect_doc,
"Return the dialect instance associated with name.\n"
" dialect = csv.get_dialect(name)");
1665 | |
1666 | PyDoc_STRVAR(csv_register_dialect_doc, |
1667 | "Create a mapping from a string name to a dialect class.\n" |
1668 | " dialect = csv.register_dialect(name[, dialect[, **fmtparams]])" ); |
1669 | |
/* Docstring of the module-level "unregister_dialect" function. */
PyDoc_STRVAR(csv_unregister_dialect_doc,
"Delete the name/dialect mapping associated with a string name.\n"
" csv.unregister_dialect(name)");
1673 | |
/* Docstring of the module-level "field_size_limit" function. */
PyDoc_STRVAR(csv_field_size_limit_doc,
"Sets an upper limit on parsed fields.\n"
" csv.field_size_limit([limit])\n"
"\n"
"Returns old limit. If limit is not given, no new limit is set and\n"
"the old limit is returned");
1680 | |
1681 | static struct PyMethodDef csv_methods[] = { |
1682 | { "reader" , (PyCFunction)(void(*)(void))csv_reader, |
1683 | METH_VARARGS | METH_KEYWORDS, csv_reader_doc}, |
1684 | { "writer" , (PyCFunction)(void(*)(void))csv_writer, |
1685 | METH_VARARGS | METH_KEYWORDS, csv_writer_doc}, |
1686 | { "list_dialects" , (PyCFunction)csv_list_dialects, |
1687 | METH_NOARGS, csv_list_dialects_doc}, |
1688 | { "register_dialect" , (PyCFunction)(void(*)(void))csv_register_dialect, |
1689 | METH_VARARGS | METH_KEYWORDS, csv_register_dialect_doc}, |
1690 | { "unregister_dialect" , (PyCFunction)csv_unregister_dialect, |
1691 | METH_O, csv_unregister_dialect_doc}, |
1692 | { "get_dialect" , (PyCFunction)csv_get_dialect, |
1693 | METH_O, csv_get_dialect_doc}, |
1694 | { "field_size_limit" , (PyCFunction)csv_field_size_limit, |
1695 | METH_VARARGS, csv_field_size_limit_doc}, |
1696 | { NULL, NULL } |
1697 | }; |
1698 | |
1699 | static int |
1700 | csv_exec(PyObject *module) { |
1701 | const StyleDesc *style; |
1702 | PyObject *temp; |
1703 | _csvstate *module_state = get_csv_state(module); |
1704 | |
1705 | temp = PyType_FromModuleAndSpec(module, &Dialect_Type_spec, NULL); |
1706 | module_state->dialect_type = (PyTypeObject *)temp; |
1707 | if (PyModule_AddObjectRef(module, "Dialect" , temp) < 0) { |
1708 | return -1; |
1709 | } |
1710 | |
1711 | temp = PyType_FromModuleAndSpec(module, &Reader_Type_spec, NULL); |
1712 | module_state->reader_type = (PyTypeObject *)temp; |
1713 | if (PyModule_AddObjectRef(module, "Reader" , temp) < 0) { |
1714 | return -1; |
1715 | } |
1716 | |
1717 | temp = PyType_FromModuleAndSpec(module, &Writer_Type_spec, NULL); |
1718 | module_state->writer_type = (PyTypeObject *)temp; |
1719 | if (PyModule_AddObjectRef(module, "Writer" , temp) < 0) { |
1720 | return -1; |
1721 | } |
1722 | |
1723 | /* Add version to the module. */ |
1724 | if (PyModule_AddStringConstant(module, "__version__" , |
1725 | MODULE_VERSION) == -1) { |
1726 | return -1; |
1727 | } |
1728 | |
1729 | /* Set the field limit */ |
1730 | module_state->field_limit = 128 * 1024; |
1731 | |
1732 | /* Add _dialects dictionary */ |
1733 | module_state->dialects = PyDict_New(); |
1734 | if (PyModule_AddObjectRef(module, "_dialects" , module_state->dialects) < 0) { |
1735 | return -1; |
1736 | } |
1737 | |
1738 | /* Add quote styles into dictionary */ |
1739 | for (style = quote_styles; style->name; style++) { |
1740 | if (PyModule_AddIntConstant(module, style->name, |
1741 | style->style) == -1) |
1742 | return -1; |
1743 | } |
1744 | |
1745 | /* Add the CSV exception object to the module. */ |
1746 | PyObject *bases = PyTuple_Pack(1, PyExc_Exception); |
1747 | if (bases == NULL) { |
1748 | return -1; |
1749 | } |
1750 | module_state->error_obj = PyType_FromModuleAndSpec(module, &error_spec, |
1751 | bases); |
1752 | Py_DECREF(bases); |
1753 | if (module_state->error_obj == NULL) { |
1754 | return -1; |
1755 | } |
1756 | if (PyModule_AddType(module, (PyTypeObject *)module_state->error_obj) != 0) { |
1757 | return -1; |
1758 | } |
1759 | |
1760 | return 0; |
1761 | } |
1762 | |
1763 | static PyModuleDef_Slot csv_slots[] = { |
1764 | {Py_mod_exec, csv_exec}, |
1765 | {0, NULL} |
1766 | }; |
1767 | |
1768 | static struct PyModuleDef _csvmodule = { |
1769 | PyModuleDef_HEAD_INIT, |
1770 | "_csv" , |
1771 | csv_module_doc, |
1772 | sizeof(_csvstate), |
1773 | csv_methods, |
1774 | csv_slots, |
1775 | _csv_traverse, |
1776 | _csv_clear, |
1777 | _csv_free |
1778 | }; |
1779 | |
1780 | PyMODINIT_FUNC |
1781 | PyInit__csv(void) |
1782 | { |
1783 | return PyModuleDef_Init(&_csvmodule); |
1784 | } |
1785 | |