1 | /* |
2 | unicode_format.h -- implementation of str.format(). |
3 | */ |
4 | |
5 | /************************************************************************/ |
6 | /*********** Global data structures and forward declarations *********/ |
7 | /************************************************************************/ |
8 | |
/*
   A SubString identifies a slice of a unicode object: the characters of
   'str' at indices start (inclusive) up to end (exclusive).  'str' may be
   NULL; the SubString_new_object* helpers check for that case.
*/
typedef struct {
    PyObject *str; /* borrowed reference */
    Py_ssize_t start, end; /* slice bounds, as passed to PyUnicode_Substring */
} SubString;
17 | |
18 | |
/* Keep track if we're auto-numbering fields.  ANS_INIT means no numeric
   or empty field name has been seen yet; after that the state is either
   automatic ("{}") or manual ("{0}") numbering. */
typedef enum {
    ANS_INIT = 0,
    ANS_AUTO = 1,
    ANS_MANUAL = 2
} AutoNumberState;
24 | |
/* Keeps track of our auto-numbering state, and which number field we're on */
typedef struct {
    AutoNumberState an_state; /* ANS_INIT until the first numeric or empty field name */
    int an_field_number;      /* index handed out to the next empty field name */
} AutoNumber;
30 | |
31 | |
/* forward declaration for recursion: output_markup() calls build_string()
   to expand a nested format_spec, but build_string() is defined further
   down in this file */
static PyObject *
build_string(SubString *input, PyObject *args, PyObject *kwargs,
             int recursion_depth, AutoNumber *auto_number);
36 | |
37 | |
38 | |
39 | /************************************************************************/ |
40 | /************************** Utility functions ************************/ |
41 | /************************************************************************/ |
42 | |
43 | static void |
44 | AutoNumber_Init(AutoNumber *auto_number) |
45 | { |
46 | auto_number->an_state = ANS_INIT; |
47 | auto_number->an_field_number = 0; |
48 | } |
49 | |
50 | /* fill in a SubString from a pointer and length */ |
51 | Py_LOCAL_INLINE(void) |
52 | SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end) |
53 | { |
54 | str->str = s; |
55 | str->start = start; |
56 | str->end = end; |
57 | } |
58 | |
59 | /* return a new string. if str->str is NULL, return None */ |
60 | Py_LOCAL_INLINE(PyObject *) |
61 | SubString_new_object(SubString *str) |
62 | { |
63 | if (str->str == NULL) |
64 | Py_RETURN_NONE; |
65 | return PyUnicode_Substring(str->str, str->start, str->end); |
66 | } |
67 | |
68 | /* return a new string. if str->str is NULL, return a new empty string */ |
69 | Py_LOCAL_INLINE(PyObject *) |
70 | SubString_new_object_or_empty(SubString *str) |
71 | { |
72 | if (str->str == NULL) { |
73 | return PyUnicode_New(0, 0); |
74 | } |
75 | return SubString_new_object(str); |
76 | } |
77 | |
78 | /* Return 1 if an error has been detected switching between automatic |
79 | field numbering and manual field specification, else return 0. Set |
80 | ValueError on error. */ |
81 | static int |
82 | autonumber_state_error(AutoNumberState state, int field_name_is_empty) |
83 | { |
84 | if (state == ANS_MANUAL) { |
85 | if (field_name_is_empty) { |
86 | PyErr_SetString(PyExc_ValueError, "cannot switch from " |
87 | "manual field specification to " |
88 | "automatic field numbering" ); |
89 | return 1; |
90 | } |
91 | } |
92 | else { |
93 | if (!field_name_is_empty) { |
94 | PyErr_SetString(PyExc_ValueError, "cannot switch from " |
95 | "automatic field numbering to " |
96 | "manual field specification" ); |
97 | return 1; |
98 | } |
99 | } |
100 | return 0; |
101 | } |
102 | |
103 | |
104 | /************************************************************************/ |
105 | /*********** Format string parsing -- integers and identifiers *********/ |
106 | /************************************************************************/ |
107 | |
108 | static Py_ssize_t |
109 | get_integer(const SubString *str) |
110 | { |
111 | Py_ssize_t accumulator = 0; |
112 | Py_ssize_t digitval; |
113 | Py_ssize_t i; |
114 | |
115 | /* empty string is an error */ |
116 | if (str->start >= str->end) |
117 | return -1; |
118 | |
119 | for (i = str->start; i < str->end; i++) { |
120 | digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i)); |
121 | if (digitval < 0) |
122 | return -1; |
123 | /* |
124 | Detect possible overflow before it happens: |
125 | |
126 | accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if |
127 | accumulator > (PY_SSIZE_T_MAX - digitval) / 10. |
128 | */ |
129 | if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) { |
130 | PyErr_Format(PyExc_ValueError, |
131 | "Too many decimal digits in format string" ); |
132 | return -1; |
133 | } |
134 | accumulator = accumulator * 10 + digitval; |
135 | } |
136 | return accumulator; |
137 | } |
138 | |
139 | /************************************************************************/ |
140 | /******** Functions to get field objects and specification strings ******/ |
141 | /************************************************************************/ |
142 | |
143 | /* do the equivalent of obj.name */ |
144 | static PyObject * |
145 | getattr(PyObject *obj, SubString *name) |
146 | { |
147 | PyObject *newobj; |
148 | PyObject *str = SubString_new_object(name); |
149 | if (str == NULL) |
150 | return NULL; |
151 | newobj = PyObject_GetAttr(obj, str); |
152 | Py_DECREF(str); |
153 | return newobj; |
154 | } |
155 | |
156 | /* do the equivalent of obj[idx], where obj is a sequence */ |
157 | static PyObject * |
158 | getitem_sequence(PyObject *obj, Py_ssize_t idx) |
159 | { |
160 | return PySequence_GetItem(obj, idx); |
161 | } |
162 | |
163 | /* do the equivalent of obj[idx], where obj is not a sequence */ |
164 | static PyObject * |
165 | getitem_idx(PyObject *obj, Py_ssize_t idx) |
166 | { |
167 | PyObject *newobj; |
168 | PyObject *idx_obj = PyLong_FromSsize_t(idx); |
169 | if (idx_obj == NULL) |
170 | return NULL; |
171 | newobj = PyObject_GetItem(obj, idx_obj); |
172 | Py_DECREF(idx_obj); |
173 | return newobj; |
174 | } |
175 | |
176 | /* do the equivalent of obj[name] */ |
177 | static PyObject * |
178 | getitem_str(PyObject *obj, SubString *name) |
179 | { |
180 | PyObject *newobj; |
181 | PyObject *str = SubString_new_object(name); |
182 | if (str == NULL) |
183 | return NULL; |
184 | newobj = PyObject_GetItem(obj, str); |
185 | Py_DECREF(str); |
186 | return newobj; |
187 | } |
188 | |
/* Iteration state for walking the ".attr" and "[item]" parts of a field
   name, one part per call to FieldNameIterator_next(). */
typedef struct {
    /* the entire string we're parsing.  we assume that someone else
       is managing its lifetime, and that it will exist for the
       lifetime of the iterator.  can be empty */
    SubString str;

    /* index to where we are inside field_name; starts at str.start and
       advances up to str.end as parts are consumed */
    Py_ssize_t index;
} FieldNameIterator;
198 | |
199 | |
200 | static int |
201 | FieldNameIterator_init(FieldNameIterator *self, PyObject *s, |
202 | Py_ssize_t start, Py_ssize_t end) |
203 | { |
204 | SubString_init(&self->str, s, start, end); |
205 | self->index = start; |
206 | return 1; |
207 | } |
208 | |
209 | static int |
210 | _FieldNameIterator_attr(FieldNameIterator *self, SubString *name) |
211 | { |
212 | Py_UCS4 c; |
213 | |
214 | name->str = self->str.str; |
215 | name->start = self->index; |
216 | |
217 | /* return everything until '.' or '[' */ |
218 | while (self->index < self->str.end) { |
219 | c = PyUnicode_READ_CHAR(self->str.str, self->index++); |
220 | switch (c) { |
221 | case '[': |
222 | case '.': |
223 | /* backup so that we this character will be seen next time */ |
224 | self->index--; |
225 | break; |
226 | default: |
227 | continue; |
228 | } |
229 | break; |
230 | } |
231 | /* end of string is okay */ |
232 | name->end = self->index; |
233 | return 1; |
234 | } |
235 | |
236 | static int |
237 | _FieldNameIterator_item(FieldNameIterator *self, SubString *name) |
238 | { |
239 | int bracket_seen = 0; |
240 | Py_UCS4 c; |
241 | |
242 | name->str = self->str.str; |
243 | name->start = self->index; |
244 | |
245 | /* return everything until ']' */ |
246 | while (self->index < self->str.end) { |
247 | c = PyUnicode_READ_CHAR(self->str.str, self->index++); |
248 | switch (c) { |
249 | case ']': |
250 | bracket_seen = 1; |
251 | break; |
252 | default: |
253 | continue; |
254 | } |
255 | break; |
256 | } |
257 | /* make sure we ended with a ']' */ |
258 | if (!bracket_seen) { |
259 | PyErr_SetString(PyExc_ValueError, "Missing ']' in format string" ); |
260 | return 0; |
261 | } |
262 | |
263 | /* end of string is okay */ |
264 | /* don't include the ']' */ |
265 | name->end = self->index-1; |
266 | return 1; |
267 | } |
268 | |
269 | /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */ |
270 | static int |
271 | FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, |
272 | Py_ssize_t *name_idx, SubString *name) |
273 | { |
274 | /* check at end of input */ |
275 | if (self->index >= self->str.end) |
276 | return 1; |
277 | |
278 | switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) { |
279 | case '.': |
280 | *is_attribute = 1; |
281 | if (_FieldNameIterator_attr(self, name) == 0) |
282 | return 0; |
283 | *name_idx = -1; |
284 | break; |
285 | case '[': |
286 | *is_attribute = 0; |
287 | if (_FieldNameIterator_item(self, name) == 0) |
288 | return 0; |
289 | *name_idx = get_integer(name); |
290 | if (*name_idx == -1 && PyErr_Occurred()) |
291 | return 0; |
292 | break; |
293 | default: |
294 | /* Invalid character follows ']' */ |
295 | PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may " |
296 | "follow ']' in format field specifier" ); |
297 | return 0; |
298 | } |
299 | |
300 | /* empty string is an error */ |
301 | if (name->start == name->end) { |
302 | PyErr_SetString(PyExc_ValueError, "Empty attribute in format string" ); |
303 | return 0; |
304 | } |
305 | |
306 | return 2; |
307 | } |
308 | |
309 | |
310 | /* input: field_name |
311 | output: 'first' points to the part before the first '[' or '.' |
312 | 'first_idx' is -1 if 'first' is not an integer, otherwise |
313 | it's the value of first converted to an integer |
314 | 'rest' is an iterator to return the rest |
315 | */ |
316 | static int |
317 | field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first, |
318 | Py_ssize_t *first_idx, FieldNameIterator *rest, |
319 | AutoNumber *auto_number) |
320 | { |
321 | Py_UCS4 c; |
322 | Py_ssize_t i = start; |
323 | int field_name_is_empty; |
324 | int using_numeric_index; |
325 | |
326 | /* find the part up until the first '.' or '[' */ |
327 | while (i < end) { |
328 | switch (c = PyUnicode_READ_CHAR(str, i++)) { |
329 | case '[': |
330 | case '.': |
331 | /* backup so that we this character is available to the |
332 | "rest" iterator */ |
333 | i--; |
334 | break; |
335 | default: |
336 | continue; |
337 | } |
338 | break; |
339 | } |
340 | |
341 | /* set up the return values */ |
342 | SubString_init(first, str, start, i); |
343 | FieldNameIterator_init(rest, str, i, end); |
344 | |
345 | /* see if "first" is an integer, in which case it's used as an index */ |
346 | *first_idx = get_integer(first); |
347 | if (*first_idx == -1 && PyErr_Occurred()) |
348 | return 0; |
349 | |
350 | field_name_is_empty = first->start >= first->end; |
351 | |
352 | /* If the field name is omitted or if we have a numeric index |
353 | specified, then we're doing numeric indexing into args. */ |
354 | using_numeric_index = field_name_is_empty || *first_idx != -1; |
355 | |
356 | /* We always get here exactly one time for each field we're |
357 | processing. And we get here in field order (counting by left |
358 | braces). So this is the perfect place to handle automatic field |
359 | numbering if the field name is omitted. */ |
360 | |
361 | /* Check if we need to do the auto-numbering. It's not needed if |
362 | we're called from string.Format routines, because it's handled |
363 | in that class by itself. */ |
364 | if (auto_number) { |
365 | /* Initialize our auto numbering state if this is the first |
366 | time we're either auto-numbering or manually numbering. */ |
367 | if (auto_number->an_state == ANS_INIT && using_numeric_index) |
368 | auto_number->an_state = field_name_is_empty ? |
369 | ANS_AUTO : ANS_MANUAL; |
370 | |
371 | /* Make sure our state is consistent with what we're doing |
372 | this time through. Only check if we're using a numeric |
373 | index. */ |
374 | if (using_numeric_index) |
375 | if (autonumber_state_error(auto_number->an_state, |
376 | field_name_is_empty)) |
377 | return 0; |
378 | /* Zero length field means we want to do auto-numbering of the |
379 | fields. */ |
380 | if (field_name_is_empty) |
381 | *first_idx = (auto_number->an_field_number)++; |
382 | } |
383 | |
384 | return 1; |
385 | } |
386 | |
387 | |
388 | /* |
389 | get_field_object returns the object inside {}, before the |
390 | format_spec. It handles getindex and getattr lookups and consumes |
391 | the entire input string. |
392 | */ |
393 | static PyObject * |
394 | get_field_object(SubString *input, PyObject *args, PyObject *kwargs, |
395 | AutoNumber *auto_number) |
396 | { |
397 | PyObject *obj = NULL; |
398 | int ok; |
399 | int is_attribute; |
400 | SubString name; |
401 | SubString first; |
402 | Py_ssize_t index; |
403 | FieldNameIterator rest; |
404 | |
405 | if (!field_name_split(input->str, input->start, input->end, &first, |
406 | &index, &rest, auto_number)) { |
407 | goto error; |
408 | } |
409 | |
410 | if (index == -1) { |
411 | /* look up in kwargs */ |
412 | PyObject *key = SubString_new_object(&first); |
413 | if (key == NULL) { |
414 | goto error; |
415 | } |
416 | if (kwargs == NULL) { |
417 | PyErr_SetObject(PyExc_KeyError, key); |
418 | Py_DECREF(key); |
419 | goto error; |
420 | } |
421 | /* Use PyObject_GetItem instead of PyDict_GetItem because this |
422 | code is no longer just used with kwargs. It might be passed |
423 | a non-dict when called through format_map. */ |
424 | obj = PyObject_GetItem(kwargs, key); |
425 | Py_DECREF(key); |
426 | if (obj == NULL) { |
427 | goto error; |
428 | } |
429 | } |
430 | else { |
431 | /* If args is NULL, we have a format string with a positional field |
432 | with only kwargs to retrieve it from. This can only happen when |
433 | used with format_map(), where positional arguments are not |
434 | allowed. */ |
435 | if (args == NULL) { |
436 | PyErr_SetString(PyExc_ValueError, "Format string contains " |
437 | "positional fields" ); |
438 | goto error; |
439 | } |
440 | |
441 | /* look up in args */ |
442 | obj = PySequence_GetItem(args, index); |
443 | if (obj == NULL) { |
444 | PyErr_Format(PyExc_IndexError, |
445 | "Replacement index %zd out of range for positional " |
446 | "args tuple" , |
447 | index); |
448 | goto error; |
449 | } |
450 | } |
451 | |
452 | /* iterate over the rest of the field_name */ |
453 | while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index, |
454 | &name)) == 2) { |
455 | PyObject *tmp; |
456 | |
457 | if (is_attribute) |
458 | /* getattr lookup "." */ |
459 | tmp = getattr(obj, &name); |
460 | else |
461 | /* getitem lookup "[]" */ |
462 | if (index == -1) |
463 | tmp = getitem_str(obj, &name); |
464 | else |
465 | if (PySequence_Check(obj)) |
466 | tmp = getitem_sequence(obj, index); |
467 | else |
468 | /* not a sequence */ |
469 | tmp = getitem_idx(obj, index); |
470 | if (tmp == NULL) |
471 | goto error; |
472 | |
473 | /* assign to obj */ |
474 | Py_DECREF(obj); |
475 | obj = tmp; |
476 | } |
477 | /* end of iterator, this is the non-error case */ |
478 | if (ok == 1) |
479 | return obj; |
480 | error: |
481 | Py_XDECREF(obj); |
482 | return NULL; |
483 | } |
484 | |
485 | /************************************************************************/ |
486 | /***************** Field rendering functions **************************/ |
487 | /************************************************************************/ |
488 | |
489 | /* |
490 | render_field() is the main function in this section. It takes the |
491 | field object and field specification string generated by |
492 | get_field_and_spec, and renders the field into the output string. |
493 | |
494 | render_field calls fieldobj.__format__(format_spec) method, and |
495 | appends to the output. |
496 | */ |
497 | static int |
498 | render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer) |
499 | { |
500 | int ok = 0; |
501 | PyObject *result = NULL; |
502 | PyObject *format_spec_object = NULL; |
503 | int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL; |
504 | int err; |
505 | |
506 | /* If we know the type exactly, skip the lookup of __format__ and just |
507 | call the formatter directly. */ |
508 | if (PyUnicode_CheckExact(fieldobj)) |
509 | formatter = _PyUnicode_FormatAdvancedWriter; |
510 | else if (PyLong_CheckExact(fieldobj)) |
511 | formatter = _PyLong_FormatAdvancedWriter; |
512 | else if (PyFloat_CheckExact(fieldobj)) |
513 | formatter = _PyFloat_FormatAdvancedWriter; |
514 | else if (PyComplex_CheckExact(fieldobj)) |
515 | formatter = _PyComplex_FormatAdvancedWriter; |
516 | |
517 | if (formatter) { |
518 | /* we know exactly which formatter will be called when __format__ is |
519 | looked up, so call it directly, instead. */ |
520 | err = formatter(writer, fieldobj, format_spec->str, |
521 | format_spec->start, format_spec->end); |
522 | return (err == 0); |
523 | } |
524 | else { |
525 | /* We need to create an object out of the pointers we have, because |
526 | __format__ takes a string/unicode object for format_spec. */ |
527 | if (format_spec->str) |
528 | format_spec_object = PyUnicode_Substring(format_spec->str, |
529 | format_spec->start, |
530 | format_spec->end); |
531 | else |
532 | format_spec_object = PyUnicode_New(0, 0); |
533 | if (format_spec_object == NULL) |
534 | goto done; |
535 | |
536 | result = PyObject_Format(fieldobj, format_spec_object); |
537 | } |
538 | if (result == NULL) |
539 | goto done; |
540 | |
541 | if (_PyUnicodeWriter_WriteStr(writer, result) == -1) |
542 | goto done; |
543 | ok = 1; |
544 | |
545 | done: |
546 | Py_XDECREF(format_spec_object); |
547 | Py_XDECREF(result); |
548 | return ok; |
549 | } |
550 | |
/* Parse one replacement field out of 'str'.

   MarkupIterator_next() calls this after it has consumed the opening
   '{', so str->start is expected to sit on the first character of the
   field.  str->start is advanced as characters are consumed; on success
   it ends up just past the '}' that closes the field.

   Outputs:
     field_name   the text before the first '}', ':' or '!' that is not
                  inside [...]
     conversion   the character following '!', or '\0' if there is none
     format_spec  the text after ':' up to the matching '}'; left with a
                  NULL str if there is no ':'
     *format_spec_needs_expanding
                  set to 1 if the format spec contains a '{'; it is never
                  cleared here

   Returns 1 on success, 0 on error with ValueError set. */
static int
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
{
    /* Note this function works if the field name is zero length,
       which is good.  Zero length field names are handled later, in
       field_name_split. */

    Py_UCS4 c = 0;

    /* initialize these, as they may be empty */
    *conversion = '\0';
    SubString_init(format_spec, NULL, 0, 0);

    /* Search for the field name.  it's terminated by the end of
       the string, or a ':' or '!' */
    field_name->str = str->str;
    field_name->start = str->start;
    while (str->start < str->end) {
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
        case '{':
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
            return 0;
        case '[':
            /* skip ahead to the ']' (or the end of the string) so that
               '}', ':' and '!' inside the brackets do not end the
               field name */
            for (; str->start < str->end; str->start++)
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
                    break;
            continue;
        case '}':
        case ':':
        case '!':
            break;
        default:
            continue;
        }
        /* only reached when a terminator was seen: leave the while loop */
        break;
    }

    /* str->start is one past the terminator, so back up by one to
       exclude it from the field name */
    field_name->end = str->start - 1;
    if (c == '!' || c == ':') {
        Py_ssize_t count;
        /* we have a format specifier and/or a conversion */

        /* see if there's a conversion specifier */
        if (c == '!') {
            /* there must be another character present */
            if (str->start >= str->end) {
                PyErr_SetString(PyExc_ValueError,
                                "end of string while looking for conversion "
                                "specifier");
                return 0;
            }
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);

            if (str->start < str->end) {
                c = PyUnicode_READ_CHAR(str->str, str->start++);
                /* "{name!c}": the field ends here, with no format spec */
                if (c == '}')
                    return 1;
                if (c != ':') {
                    PyErr_SetString(PyExc_ValueError,
                                    "expected ':' after conversion specifier");
                    return 0;
                }
            }
            /* if the string ended right after the conversion character,
               the scan below finds nothing and reports the error */
        }
        format_spec->str = str->str;
        format_spec->start = str->start;
        /* count tracks brace depth; it starts at 1 for the field's own
           opening '{' */
        count = 1;
        while (str->start < str->end) {
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
            case '{':
                *format_spec_needs_expanding = 1;
                count++;
                break;
            case '}':
                count--;
                if (count == 0) {
                    /* don't include the closing '}' in the format spec */
                    format_spec->end = str->start - 1;
                    return 1;
                }
                break;
            default:
                break;
            }
        }

        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
        return 0;
    }
    else if (c != '}') {
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
        return 0;
    }

    return 1;
}
648 | |
649 | /************************************************************************/ |
650 | /******* Output string allocation and escape-to-markup processing ******/ |
651 | /************************************************************************/ |
652 | |
/* MarkupIterator breaks the string into pieces of either literal
   text, or things inside {} that need to be marked up.  It is
   designed to make it easy to wrap a Python iterator around it, for
   use with the Formatter class. */

typedef struct {
    /* the input that has not been consumed yet; MarkupIterator_next()
       advances str.start as it goes */
    SubString str;
} MarkupIterator;
661 | |
662 | static int |
663 | MarkupIterator_init(MarkupIterator *self, PyObject *str, |
664 | Py_ssize_t start, Py_ssize_t end) |
665 | { |
666 | SubString_init(&self->str, str, start, end); |
667 | return 1; |
668 | } |
669 | |
/* Produce the next piece of the format string.

   Each call yields a run of literal text (possibly empty), optionally
   followed by one replacement field:
     literal        the literal text
     *field_present 1 if a replacement field follows the literal
     field_name, format_spec, conversion, format_spec_needs_expanding
                    filled in by parse_field() when a field is present

   returns 0 on error (ValueError set), 1 on non-error termination, and
   2 if it got a string (or something to be expanded) */
static int
MarkupIterator_next(MarkupIterator *self, SubString *literal,
                    int *field_present, SubString *field_name,
                    SubString *format_spec, Py_UCS4 *conversion,
                    int *format_spec_needs_expanding)
{
    int at_end;
    Py_UCS4 c = 0;
    Py_ssize_t start;
    Py_ssize_t len;
    int markup_follows = 0;

    /* initialize all of the output variables */
    SubString_init(literal, NULL, 0, 0);
    SubString_init(field_name, NULL, 0, 0);
    SubString_init(format_spec, NULL, 0, 0);
    *conversion = '\0';
    *format_spec_needs_expanding = 0;
    *field_present = 0;

    /* No more input, end of iterator.  This is the normal exit
       path. */
    if (self->str.start >= self->str.end)
        return 1;

    start = self->str.start;

    /* First read any literal text.  Read until the end of string, an
       escaped '{' or '}', or an unescaped '{'.  In order to never
       allocate memory and so I can just pass pointers around, if
       there's an escaped '{' or '}' then we'll return the literal
       including the brace, but no format object.  The next time
       through, we'll return the rest of the literal, skipping past
       the second consecutive brace. */
    while (self->str.start < self->str.end) {
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
        case '{':
        case '}':
            markup_follows = 1;
            break;
        default:
            continue;
        }
        /* only reached when a brace was seen: leave the while loop */
        break;
    }

    /* at this point c is the last character read and self->str.start is
       one past it; len counts everything read, including a brace */
    at_end = self->str.start >= self->str.end;
    len = self->str.start - start;

    /* a '}' is only valid here as the first half of "}}" */
    if ((c == '}') && (at_end ||
                       (c != PyUnicode_READ_CHAR(self->str.str,
                                                 self->str.start)))) {
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
                        "in format string");
        return 0;
    }
    if (at_end && c == '{') {
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
                        "in format string");
        return 0;
    }
    /* the scan loop only stops before the end of input on a brace, so
       when !at_end, c is '{' or '}' */
    if (!at_end) {
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
            /* escaped } or {, skip it in the input.  there is no
               markup object following us, just this literal text */
            self->str.start++;
            markup_follows = 0;
        }
        else
            /* an unescaped '{' starts a field: drop it from the literal */
            len--;
    }

    /* record the literal text */
    literal->str = self->str.str;
    literal->start = start;
    literal->end = start + len;

    if (!markup_follows)
        return 2;

    /* this is markup; parse the field */
    *field_present = 1;
    if (!parse_field(&self->str, field_name, format_spec,
                     format_spec_needs_expanding, conversion))
        return 0;
    return 2;
}
759 | |
760 | |
761 | /* do the !r or !s conversion on obj */ |
762 | static PyObject * |
763 | do_conversion(PyObject *obj, Py_UCS4 conversion) |
764 | { |
765 | /* XXX in pre-3.0, do we need to convert this to unicode, since it |
766 | might have returned a string? */ |
767 | switch (conversion) { |
768 | case 'r': |
769 | return PyObject_Repr(obj); |
770 | case 's': |
771 | return PyObject_Str(obj); |
772 | case 'a': |
773 | return PyObject_ASCII(obj); |
774 | default: |
775 | if (conversion > 32 && conversion < 127) { |
776 | /* It's the ASCII subrange; casting to char is safe |
777 | (assuming the execution character set is an ASCII |
778 | superset). */ |
779 | PyErr_Format(PyExc_ValueError, |
780 | "Unknown conversion specifier %c" , |
781 | (char)conversion); |
782 | } else |
783 | PyErr_Format(PyExc_ValueError, |
784 | "Unknown conversion specifier \\x%x" , |
785 | (unsigned int)conversion); |
786 | return NULL; |
787 | } |
788 | } |
789 | |
790 | /* given: |
791 | |
792 | {field_name!conversion:format_spec} |
793 | |
794 | compute the result and write it to output. |
795 | format_spec_needs_expanding is an optimization. if it's false, |
796 | just output the string directly, otherwise recursively expand the |
797 | format_spec string. |
798 | |
799 | field_name is allowed to be zero length, in which case we |
800 | are doing auto field numbering. |
801 | */ |
802 | |
803 | static int |
804 | output_markup(SubString *field_name, SubString *format_spec, |
805 | int format_spec_needs_expanding, Py_UCS4 conversion, |
806 | _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs, |
807 | int recursion_depth, AutoNumber *auto_number) |
808 | { |
809 | PyObject *tmp = NULL; |
810 | PyObject *fieldobj = NULL; |
811 | SubString expanded_format_spec; |
812 | SubString *actual_format_spec; |
813 | int result = 0; |
814 | |
815 | /* convert field_name to an object */ |
816 | fieldobj = get_field_object(field_name, args, kwargs, auto_number); |
817 | if (fieldobj == NULL) |
818 | goto done; |
819 | |
820 | if (conversion != '\0') { |
821 | tmp = do_conversion(fieldobj, conversion); |
822 | if (tmp == NULL || PyUnicode_READY(tmp) == -1) |
823 | goto done; |
824 | |
825 | /* do the assignment, transferring ownership: fieldobj = tmp */ |
826 | Py_DECREF(fieldobj); |
827 | fieldobj = tmp; |
828 | tmp = NULL; |
829 | } |
830 | |
831 | /* if needed, recursively compute the format_spec */ |
832 | if (format_spec_needs_expanding) { |
833 | tmp = build_string(format_spec, args, kwargs, recursion_depth-1, |
834 | auto_number); |
835 | if (tmp == NULL || PyUnicode_READY(tmp) == -1) |
836 | goto done; |
837 | |
838 | /* note that in the case we're expanding the format string, |
839 | tmp must be kept around until after the call to |
840 | render_field. */ |
841 | SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp)); |
842 | actual_format_spec = &expanded_format_spec; |
843 | } |
844 | else |
845 | actual_format_spec = format_spec; |
846 | |
847 | if (render_field(fieldobj, actual_format_spec, writer) == 0) |
848 | goto done; |
849 | |
850 | result = 1; |
851 | |
852 | done: |
853 | Py_XDECREF(fieldobj); |
854 | Py_XDECREF(tmp); |
855 | |
856 | return result; |
857 | } |
858 | |
859 | /* |
860 | do_markup is the top-level loop for the format() method. It |
861 | searches through the format string for escapes to markup codes, and |
862 | calls other functions to move non-markup text to the output, |
863 | and to perform the markup to the output. |
864 | */ |
865 | static int |
866 | do_markup(SubString *input, PyObject *args, PyObject *kwargs, |
867 | _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number) |
868 | { |
869 | MarkupIterator iter; |
870 | int format_spec_needs_expanding; |
871 | int result; |
872 | int field_present; |
873 | SubString literal; |
874 | SubString field_name; |
875 | SubString format_spec; |
876 | Py_UCS4 conversion; |
877 | |
878 | MarkupIterator_init(&iter, input->str, input->start, input->end); |
879 | while ((result = MarkupIterator_next(&iter, &literal, &field_present, |
880 | &field_name, &format_spec, |
881 | &conversion, |
882 | &format_spec_needs_expanding)) == 2) { |
883 | if (literal.end != literal.start) { |
884 | if (!field_present && iter.str.start == iter.str.end) |
885 | writer->overallocate = 0; |
886 | if (_PyUnicodeWriter_WriteSubstring(writer, literal.str, |
887 | literal.start, literal.end) < 0) |
888 | return 0; |
889 | } |
890 | |
891 | if (field_present) { |
892 | if (iter.str.start == iter.str.end) |
893 | writer->overallocate = 0; |
894 | if (!output_markup(&field_name, &format_spec, |
895 | format_spec_needs_expanding, conversion, writer, |
896 | args, kwargs, recursion_depth, auto_number)) |
897 | return 0; |
898 | } |
899 | } |
900 | return result; |
901 | } |
902 | |
903 | |
904 | /* |
905 | build_string allocates the output string and then |
906 | calls do_markup to do the heavy lifting. |
907 | */ |
908 | static PyObject * |
909 | build_string(SubString *input, PyObject *args, PyObject *kwargs, |
910 | int recursion_depth, AutoNumber *auto_number) |
911 | { |
912 | _PyUnicodeWriter writer; |
913 | |
914 | /* check the recursion level */ |
915 | if (recursion_depth <= 0) { |
916 | PyErr_SetString(PyExc_ValueError, |
917 | "Max string recursion exceeded" ); |
918 | return NULL; |
919 | } |
920 | |
921 | _PyUnicodeWriter_Init(&writer); |
922 | writer.overallocate = 1; |
923 | writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; |
924 | |
925 | if (!do_markup(input, args, kwargs, &writer, recursion_depth, |
926 | auto_number)) { |
927 | _PyUnicodeWriter_Dealloc(&writer); |
928 | return NULL; |
929 | } |
930 | |
931 | return _PyUnicodeWriter_Finish(&writer); |
932 | } |
933 | |
934 | /************************************************************************/ |
935 | /*********** main routine ***********************************************/ |
936 | /************************************************************************/ |
937 | |
938 | /* this is the main entry point */ |
939 | static PyObject * |
940 | do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) |
941 | { |
942 | SubString input; |
943 | |
944 | /* PEP 3101 says only 2 levels, so that |
945 | "{0:{1}}".format('abc', 's') # works |
946 | "{0:{1:{2}}}".format('abc', 's', '') # fails |
947 | */ |
948 | int recursion_depth = 2; |
949 | |
950 | AutoNumber auto_number; |
951 | |
952 | if (PyUnicode_READY(self) == -1) |
953 | return NULL; |
954 | |
955 | AutoNumber_Init(&auto_number); |
956 | SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self)); |
957 | return build_string(&input, args, kwargs, recursion_depth, &auto_number); |
958 | } |
959 | |
960 | static PyObject * |
961 | do_string_format_map(PyObject *self, PyObject *obj) |
962 | { |
963 | return do_string_format(self, NULL, obj); |
964 | } |
965 | |
966 | |
967 | /************************************************************************/ |
968 | /*********** formatteriterator ******************************************/ |
969 | /************************************************************************/ |
970 | |
/* This is used to implement string.Formatter.parse().  It exists so
   Formatter can share code with the built-in unicode.format() method.
   It's really just a wrapper around MarkupIterator that is callable
   from Python. */
975 | |
/* Python object that pairs a MarkupIterator with the string it walks. */
typedef struct {
    PyObject_HEAD
    PyObject *str;             /* reference is dropped in formatteriter_dealloc() */
    MarkupIterator it_markup;  /* iteration state, advanced by formatteriter_next() */
} formatteriterobject;
981 | |
982 | static void |
983 | formatteriter_dealloc(formatteriterobject *it) |
984 | { |
985 | Py_XDECREF(it->str); |
986 | PyObject_Free(it); |
987 | } |
988 | |
989 | /* returns a tuple: |
990 | (literal, field_name, format_spec, conversion) |
991 | |
992 | literal is any literal text to output. might be zero length |
993 | field_name is the string before the ':'. might be None |
994 | format_spec is the string after the ':'. mibht be None |
995 | conversion is either None, or the string after the '!' |
996 | */ |
997 | static PyObject * |
998 | formatteriter_next(formatteriterobject *it) |
999 | { |
1000 | SubString literal; |
1001 | SubString field_name; |
1002 | SubString format_spec; |
1003 | Py_UCS4 conversion; |
1004 | int format_spec_needs_expanding; |
1005 | int field_present; |
1006 | int result = MarkupIterator_next(&it->it_markup, &literal, &field_present, |
1007 | &field_name, &format_spec, &conversion, |
1008 | &format_spec_needs_expanding); |
1009 | |
1010 | /* all of the SubString objects point into it->str, so no |
1011 | memory management needs to be done on them */ |
1012 | assert(0 <= result && result <= 2); |
1013 | if (result == 0 || result == 1) |
1014 | /* if 0, error has already been set, if 1, iterator is empty */ |
1015 | return NULL; |
1016 | else { |
1017 | PyObject *literal_str = NULL; |
1018 | PyObject *field_name_str = NULL; |
1019 | PyObject *format_spec_str = NULL; |
1020 | PyObject *conversion_str = NULL; |
1021 | PyObject *tuple = NULL; |
1022 | |
1023 | literal_str = SubString_new_object(&literal); |
1024 | if (literal_str == NULL) |
1025 | goto done; |
1026 | |
1027 | field_name_str = SubString_new_object(&field_name); |
1028 | if (field_name_str == NULL) |
1029 | goto done; |
1030 | |
1031 | /* if field_name is non-zero length, return a string for |
1032 | format_spec (even if zero length), else return None */ |
1033 | format_spec_str = (field_present ? |
1034 | SubString_new_object_or_empty : |
1035 | SubString_new_object)(&format_spec); |
1036 | if (format_spec_str == NULL) |
1037 | goto done; |
1038 | |
1039 | /* if the conversion is not specified, return a None, |
1040 | otherwise create a one length string with the conversion |
1041 | character */ |
1042 | if (conversion == '\0') { |
1043 | conversion_str = Py_None; |
1044 | Py_INCREF(conversion_str); |
1045 | } |
1046 | else |
1047 | conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, |
1048 | &conversion, 1); |
1049 | if (conversion_str == NULL) |
1050 | goto done; |
1051 | |
1052 | tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str, |
1053 | conversion_str); |
1054 | done: |
1055 | Py_XDECREF(literal_str); |
1056 | Py_XDECREF(field_name_str); |
1057 | Py_XDECREF(format_spec_str); |
1058 | Py_XDECREF(conversion_str); |
1059 | return tuple; |
1060 | } |
1061 | } |
1062 | |
/* Method table for PyFormatterIter_Type: it holds nothing but the
   terminating sentinel entry. */
static PyMethodDef formatteriter_methods[] = {
    {NULL, NULL}  /* sentinel */
};
1066 | |
/* Type object for formatteriterobject.  Iterating an instance yields the
   instance itself (PyObject_SelfIter), each step is computed by
   formatteriter_next(), and instances are released by
   formatteriter_dealloc(). */
static PyTypeObject PyFormatterIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "formatteriterator",                /* tp_name */
    sizeof(formatteriterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)formatteriter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_vectorcall_offset */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_as_async */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)formatteriter_next,   /* tp_iternext */
    formatteriter_methods,              /* tp_methods */
    0,
};
1099 | |
1100 | /* unicode_formatter_parser is used to implement |
1101 | string.Formatter.vformat. it parses a string and returns tuples |
1102 | describing the parsed elements. It's a wrapper around |
1103 | stringlib/string_format.h's MarkupIterator */ |
1104 | static PyObject * |
1105 | formatter_parser(PyObject *ignored, PyObject *self) |
1106 | { |
1107 | formatteriterobject *it; |
1108 | |
1109 | if (!PyUnicode_Check(self)) { |
1110 | PyErr_Format(PyExc_TypeError, "expected str, got %s" , Py_TYPE(self)->tp_name); |
1111 | return NULL; |
1112 | } |
1113 | |
1114 | if (PyUnicode_READY(self) == -1) |
1115 | return NULL; |
1116 | |
1117 | it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); |
1118 | if (it == NULL) |
1119 | return NULL; |
1120 | |
1121 | /* take ownership, give the object to the iterator */ |
1122 | Py_INCREF(self); |
1123 | it->str = self; |
1124 | |
1125 | /* initialize the contained MarkupIterator */ |
1126 | MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self)); |
1127 | return (PyObject *)it; |
1128 | } |
1129 | |
1130 | |
1131 | /************************************************************************/ |
1132 | /*********** fieldnameiterator ******************************************/ |
1133 | /************************************************************************/ |
1134 | |
1135 | |
1136 | /* This is used to implement string.Formatter.vparse(). It parses the |
1137 | field name into attribute and item values. It's a Python-callable |
1138 | wrapper around FieldNameIterator */ |
1139 | |
/* Python object that pairs a FieldNameIterator with the field-name
   string it walks. */
typedef struct {
    PyObject_HEAD
    PyObject *str;                /* reference is dropped in fieldnameiter_dealloc() */
    FieldNameIterator it_field;   /* iteration state, advanced by fieldnameiter_next() */
} fieldnameiterobject;
1145 | |
1146 | static void |
1147 | fieldnameiter_dealloc(fieldnameiterobject *it) |
1148 | { |
1149 | Py_XDECREF(it->str); |
1150 | PyObject_Free(it); |
1151 | } |
1152 | |
1153 | /* returns a tuple: |
1154 | (is_attr, value) |
1155 | is_attr is true if we used attribute syntax (e.g., '.foo') |
1156 | false if we used index syntax (e.g., '[foo]') |
1157 | value is an integer or string |
1158 | */ |
1159 | static PyObject * |
1160 | fieldnameiter_next(fieldnameiterobject *it) |
1161 | { |
1162 | int result; |
1163 | int is_attr; |
1164 | Py_ssize_t idx; |
1165 | SubString name; |
1166 | |
1167 | result = FieldNameIterator_next(&it->it_field, &is_attr, |
1168 | &idx, &name); |
1169 | if (result == 0 || result == 1) |
1170 | /* if 0, error has already been set, if 1, iterator is empty */ |
1171 | return NULL; |
1172 | else { |
1173 | PyObject* result = NULL; |
1174 | PyObject* is_attr_obj = NULL; |
1175 | PyObject* obj = NULL; |
1176 | |
1177 | is_attr_obj = PyBool_FromLong(is_attr); |
1178 | if (is_attr_obj == NULL) |
1179 | goto done; |
1180 | |
1181 | /* either an integer or a string */ |
1182 | if (idx != -1) |
1183 | obj = PyLong_FromSsize_t(idx); |
1184 | else |
1185 | obj = SubString_new_object(&name); |
1186 | if (obj == NULL) |
1187 | goto done; |
1188 | |
1189 | /* return a tuple of values */ |
1190 | result = PyTuple_Pack(2, is_attr_obj, obj); |
1191 | |
1192 | done: |
1193 | Py_XDECREF(is_attr_obj); |
1194 | Py_XDECREF(obj); |
1195 | return result; |
1196 | } |
1197 | } |
1198 | |
/* Method table for PyFieldNameIter_Type: it holds nothing but the
   terminating sentinel entry. */
static PyMethodDef fieldnameiter_methods[] = {
    {NULL, NULL}  /* sentinel */
};
1202 | |
/* Type object for fieldnameiterobject.  Iterating an instance yields the
   instance itself (PyObject_SelfIter), each step is computed by
   fieldnameiter_next(), and instances are released by
   fieldnameiter_dealloc(). */
static PyTypeObject PyFieldNameIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "fieldnameiterator",                /* tp_name */
    sizeof(fieldnameiterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_vectorcall_offset */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_as_async */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
    fieldnameiter_methods,              /* tp_methods */
    0};
1234 | |
1235 | /* unicode_formatter_field_name_split is used to implement |
1236 | string.Formatter.vformat. it takes a PEP 3101 "field name", and |
1237 | returns a tuple of (first, rest): "first", the part before the |
1238 | first '.' or '['; and "rest", an iterator for the rest of the field |
1239 | name. it's a wrapper around stringlib/string_format.h's |
1240 | field_name_split. The iterator it returns is a |
1241 | FieldNameIterator */ |
1242 | static PyObject * |
1243 | formatter_field_name_split(PyObject *ignored, PyObject *self) |
1244 | { |
1245 | SubString first; |
1246 | Py_ssize_t first_idx; |
1247 | fieldnameiterobject *it; |
1248 | |
1249 | PyObject *first_obj = NULL; |
1250 | PyObject *result = NULL; |
1251 | |
1252 | if (!PyUnicode_Check(self)) { |
1253 | PyErr_Format(PyExc_TypeError, "expected str, got %s" , Py_TYPE(self)->tp_name); |
1254 | return NULL; |
1255 | } |
1256 | |
1257 | if (PyUnicode_READY(self) == -1) |
1258 | return NULL; |
1259 | |
1260 | it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); |
1261 | if (it == NULL) |
1262 | return NULL; |
1263 | |
1264 | /* take ownership, give the object to the iterator. this is |
1265 | just to keep the field_name alive */ |
1266 | Py_INCREF(self); |
1267 | it->str = self; |
1268 | |
1269 | /* Pass in auto_number = NULL. We'll return an empty string for |
1270 | first_obj in that case. */ |
1271 | if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self), |
1272 | &first, &first_idx, &it->it_field, NULL)) |
1273 | goto done; |
1274 | |
1275 | /* first becomes an integer, if possible; else a string */ |
1276 | if (first_idx != -1) |
1277 | first_obj = PyLong_FromSsize_t(first_idx); |
1278 | else |
1279 | /* convert "first" into a string object */ |
1280 | first_obj = SubString_new_object(&first); |
1281 | if (first_obj == NULL) |
1282 | goto done; |
1283 | |
1284 | /* return a tuple of values */ |
1285 | result = PyTuple_Pack(2, first_obj, it); |
1286 | |
1287 | done: |
1288 | Py_XDECREF(it); |
1289 | Py_XDECREF(first_obj); |
1290 | return result; |
1291 | } |
1292 | |