1#include <stdbool.h>
2
3#include <Python.h>
4
5#include "tokenizer.h"
6#include "pegen.h"
7#include "string_parser.h"
8
9//// STRING HANDLING FUNCTIONS ////
10
11static int
12warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
13{
14 PyObject *msg =
15 PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
16 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
20 t->lineno, NULL, NULL) < 0) {
21 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
25
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
30 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
31 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37}
38
39static PyObject *
40decode_utf8(const char **sPtr, const char *end)
41{
42 const char *s;
43 const char *t;
44 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50}
51
52static PyObject *
53decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
54{
55 PyObject *v;
56 PyObject *u;
57 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
72 if (p == NULL) {
73 return NULL;
74 }
75 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 const void *data;
91 Py_ssize_t w_len;
92 Py_ssize_t i;
93 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131}
132
133static PyObject *
134decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135{
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149}
150
151/* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156int
157_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
159{
160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
175 quote =(unsigned char)*++s;
176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
179 quote = (unsigned char)*++s;
180 }
181 else if (quote == 'r' || quote == 'R') {
182 quote = (unsigned char)*++s;
183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
186 quote = (unsigned char)*++s;
187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
253 "literal characters");
254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
261 *result = decode_bytes_with_escapes(p, s, len, t);
262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
269 *result = decode_unicode_with_escapes(p, s, len, t);
270 }
271 }
272 return *result == NULL ? -1 : 0;
273}
274
275
276
277// FSTRING STUFF
278
279/* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
282 `expr_start` is the starting position of the expression (pointing to the open brace).
283 `n` is the node which locations are going to be fixed relative to parent.
284 `expr_str` is the child node's string representation, including braces.
285*/
286static bool
287fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
288{
289 *p_lines = 0;
290 *p_cols = 0;
291 assert(expr_start != NULL && *expr_start == '{');
292 if (parent && parent->bytes) {
293 const char *parent_str = PyBytes_AsString(parent->bytes);
294 if (!parent_str) {
295 return false;
296 }
297 // The following is needed, in order to correctly shift the column
298 // offset, in the case that (disregarding any whitespace) a newline
299 // immediately follows the opening curly brace of the fstring expression.
300 bool newline_after_brace = 1;
301 const char *start = expr_start + 1;
302 while (start && *start != '}' && *start != '\n') {
303 if (*start != ' ' && *start != '\t' && *start != '\f') {
304 newline_after_brace = 0;
305 break;
306 }
307 start++;
308 }
309
310 // Account for the characters from the last newline character to our
311 // left until the beginning of expr_start.
312 if (!newline_after_brace) {
313 start = expr_start;
314 while (start > parent_str && *start != '\n') {
315 start--;
316 }
317 *p_cols += (int)(expr_start - start);
318 }
319 /* adjust the start based on the number of newlines encountered
320 before the f-string expression */
321 for (const char *p = parent_str; p < expr_start; p++) {
322 if (*p == '\n') {
323 (*p_lines)++;
324 }
325 }
326 }
327 return true;
328}
329
330
331/* Compile this expression in to an expr_ty. Add parens around the
332 expression, in order to allow leading spaces in the expression. */
333static expr_ty
334fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335 Token *t)
336{
337 expr_ty expr = NULL;
338 char *str;
339 Py_ssize_t len;
340 const char *s;
341 expr_ty result = NULL;
342
343 assert(expr_end >= expr_start);
344 assert(*(expr_start-1) == '{');
345 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346 *expr_end == '=');
347
348 /* If the substring is all whitespace, it's an error. We need to catch this
349 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350 because turning the expression '' in to '()' would go from being invalid
351 to valid. */
352 for (s = expr_start; s != expr_end; s++) {
353 char c = *s;
354 /* The Python parser ignores only the following whitespace
355 characters (\r already is converted to \n). */
356 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357 break;
358 }
359 }
360 if (s == expr_end) {
361 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362 return NULL;
363 }
364
365 len = expr_end - expr_start;
366 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
367 str = PyMem_Calloc(len + 3, sizeof(char));
368 if (str == NULL) {
369 PyErr_NoMemory();
370 return NULL;
371 }
372
373 // The call to fstring_find_expr_location is responsible for finding the column offset
374 // the generated AST nodes need to be shifted to the right, which is equal to the number
375 // of the f-string characters before the expression starts.
376 memcpy(str+1, expr_start, len);
377 int lines, cols;
378 if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
379 PyMem_Free(str);
380 return NULL;
381 }
382
383 // The parentheses are needed in order to allow for leading whitespace within
384 // the f-string expression. This consequently gets parsed as a group (see the
385 // group rule in python.gram).
386 str[0] = '(';
387 str[len+1] = ')';
388
389 struct tok_state* tok = PyTokenizer_FromString(str, 1);
390 if (tok == NULL) {
391 PyMem_Free(str);
392 return NULL;
393 }
394 Py_INCREF(p->tok->filename);
395
396 tok->filename = p->tok->filename;
397 tok->lineno = t->lineno + lines - 1;
398
399 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
400 NULL, p->arena);
401
402 p2->starting_lineno = t->lineno + lines;
403 p2->starting_col_offset = t->col_offset + cols;
404
405 expr = _PyPegen_run_parser(p2);
406
407 if (expr == NULL) {
408 goto exit;
409 }
410 result = expr;
411
412exit:
413 PyMem_Free(str);
414 _PyPegen_Parser_Free(p2);
415 PyTokenizer_Free(tok);
416 return result;
417}
418
419/* Return -1 on error.
420
421 Return 0 if we reached the end of the literal.
422
423 Return 1 if we haven't reached the end of the literal, but we want
424 the caller to process the literal up to this point. Used for
425 doubled braces.
426*/
427static int
428fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
429 PyObject **literal, int recurse_lvl, Token *t)
430{
431 /* Get any literal string. It ends when we hit an un-doubled left
432 brace (which isn't part of a unicode name escape such as
433 "\N{EULER CONSTANT}"), or the end of the string. */
434
435 const char *s = *str;
436 const char *literal_start = s;
437 int result = 0;
438
439 assert(*literal == NULL);
440 while (s < end) {
441 char ch = *s++;
442 if (!raw && ch == '\\' && s < end) {
443 ch = *s++;
444 if (ch == 'N') {
445 /* We need to look at and skip matching braces for "\N{name}"
446 sequences because otherwise we'll think the opening '{'
447 starts an expression, which is not the case with "\N".
448 Keep looking for either a matched '{' '}' pair, or the end
449 of the string. */
450
451 if (s < end && *s++ == '{') {
452 while (s < end && *s++ != '}') {
453 }
454 continue;
455 }
456
457 /* This is an invalid "\N" sequence, since it's a "\N" not
458 followed by a "{". Just keep parsing this literal. This
459 error will be caught later by
460 decode_unicode_with_escapes(). */
461 continue;
462 }
463 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
464 return -1;
465 }
466 }
467 if (ch == '{' || ch == '}') {
468 /* Check for doubled braces, but only at the top level. If
469 we checked at every level, then f'{0:{3}}' would fail
470 with the two closing braces. */
471 if (recurse_lvl == 0) {
472 if (s < end && *s == ch) {
473 /* We're going to tell the caller that the literal ends
474 here, but that they should continue scanning. But also
475 skip over the second brace when we resume scanning. */
476 *str = s + 1;
477 result = 1;
478 goto done;
479 }
480
481 /* Where a single '{' is the start of a new expression, a
482 single '}' is not allowed. */
483 if (ch == '}') {
484 *str = s - 1;
485 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
486 return -1;
487 }
488 }
489 /* We're either at a '{', which means we're starting another
490 expression; or a '}', which means we're at the end of this
491 f-string (for a nested format_spec). */
492 s--;
493 break;
494 }
495 }
496 *str = s;
497 assert(s <= end);
498 assert(s == end || *s == '{' || *s == '}');
499done:
500 if (literal_start != s) {
501 if (raw) {
502 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
503 s - literal_start,
504 NULL, NULL);
505 }
506 else {
507 *literal = decode_unicode_with_escapes(p, literal_start,
508 s - literal_start, t);
509 }
510 if (!*literal) {
511 return -1;
512 }
513 }
514 return result;
515}
516
/* Forward declaration because parsing is recursive: fstring_find_expr()
   below calls fstring_parse() to parse a nested format spec. */
static expr_ty
fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
              Token *first_token, Token* t, Token *last_token);
521
522/* Parse the f-string at *str, ending at end. We know *str starts an
523 expression (so it must be a '{'). Returns the FormattedValue node, which
524 includes the expression, conversion character, format_spec expression, and
525 optionally the text of the expression (if = is used).
526
527 Note that I don't do a perfect job here: I don't make sure that a
528 closing brace doesn't match an opening paren, for example. It
529 doesn't need to error on all invalid expressions, just correctly
530 find the end of all valid ones. Any errors inside the expression
531 will be caught when we parse it later.
532
533 *expression is set to the expression. For an '=' "debug" expression,
534 *expr_text is set to the debug text (the original text of the expression,
535 including the '=' and any whitespace around it, as a string object). If
536 not a debug expression, *expr_text set to NULL. */
537static int
538fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
539 PyObject **expr_text, expr_ty *expression, Token *first_token,
540 Token *t, Token *last_token)
541{
542 /* Return -1 on error, else 0. */
543
544 const char *expr_start;
545 const char *expr_end;
546 expr_ty simple_expression;
547 expr_ty format_spec = NULL; /* Optional format specifier. */
548 int conversion = -1; /* The conversion char. Use default if not
549 specified, or !r if using = and no format
550 spec. */
551
552 /* 0 if we're not in a string, else the quote char we're trying to
553 match (single or double quote). */
554 char quote_char = 0;
555
556 /* If we're inside a string, 1=normal, 3=triple-quoted. */
557 int string_type = 0;
558
559 /* Keep track of nesting level for braces/parens/brackets in
560 expressions. */
561 Py_ssize_t nested_depth = 0;
562 char parenstack[MAXLEVEL];
563
564 *expr_text = NULL;
565
566 /* Can only nest one level deep. */
567 if (recurse_lvl >= 2) {
568 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
569 goto error;
570 }
571
572 /* The first char must be a left brace, or we wouldn't have gotten
573 here. Skip over it. */
574 assert(**str == '{');
575 *str += 1;
576
577 expr_start = *str;
578 for (; *str < end; (*str)++) {
579 char ch;
580
581 /* Loop invariants. */
582 assert(nested_depth >= 0);
583 assert(*str >= expr_start && *str < end);
584 if (quote_char) {
585 assert(string_type == 1 || string_type == 3);
586 } else {
587 assert(string_type == 0);
588 }
589
590 ch = **str;
591 /* Nowhere inside an expression is a backslash allowed. */
592 if (ch == '\\') {
593 /* Error: can't include a backslash character, inside
594 parens or strings or not. */
595 RAISE_SYNTAX_ERROR(
596 "f-string expression part "
597 "cannot include a backslash");
598 goto error;
599 }
600 if (quote_char) {
601 /* We're inside a string. See if we're at the end. */
602 /* This code needs to implement the same non-error logic
603 as tok_get from tokenizer.c, at the letter_quote
604 label. To actually share that code would be a
605 nightmare. But, it's unlikely to change and is small,
606 so duplicate it here. Note we don't need to catch all
607 of the errors, since they'll be caught when parsing the
608 expression. We just need to match the non-error
609 cases. Thus we can ignore \n in single-quoted strings,
610 for example. Or non-terminated strings. */
611 if (ch == quote_char) {
612 /* Does this match the string_type (single or triple
613 quoted)? */
614 if (string_type == 3) {
615 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
616 /* We're at the end of a triple quoted string. */
617 *str += 2;
618 string_type = 0;
619 quote_char = 0;
620 continue;
621 }
622 } else {
623 /* We're at the end of a normal string. */
624 quote_char = 0;
625 string_type = 0;
626 continue;
627 }
628 }
629 } else if (ch == '\'' || ch == '"') {
630 /* Is this a triple quoted string? */
631 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
632 string_type = 3;
633 *str += 2;
634 } else {
635 /* Start of a normal string. */
636 string_type = 1;
637 }
638 /* Start looking for the end of the string. */
639 quote_char = ch;
640 } else if (ch == '[' || ch == '{' || ch == '(') {
641 if (nested_depth >= MAXLEVEL) {
642 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
643 goto error;
644 }
645 parenstack[nested_depth] = ch;
646 nested_depth++;
647 } else if (ch == '#') {
648 /* Error: can't include a comment character, inside parens
649 or not. */
650 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
651 goto error;
652 } else if (nested_depth == 0 &&
653 (ch == '!' || ch == ':' || ch == '}' ||
654 ch == '=' || ch == '>' || ch == '<')) {
655 /* See if there's a next character. */
656 if (*str+1 < end) {
657 char next = *(*str+1);
658
659 /* For "!=". since '=' is not an allowed conversion character,
660 nothing is lost in this test. */
661 if ((ch == '!' && next == '=') || /* != */
662 (ch == '=' && next == '=') || /* == */
663 (ch == '<' && next == '=') || /* <= */
664 (ch == '>' && next == '=') /* >= */
665 ) {
666 *str += 1;
667 continue;
668 }
669 }
670 /* Don't get out of the loop for these, if they're single
671 chars (not part of 2-char tokens). If by themselves, they
672 don't end an expression (unlike say '!'). */
673 if (ch == '>' || ch == '<') {
674 continue;
675 }
676
677 /* Normal way out of this loop. */
678 break;
679 } else if (ch == ']' || ch == '}' || ch == ')') {
680 if (!nested_depth) {
681 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
682 goto error;
683 }
684 nested_depth--;
685 int opening = (unsigned char)parenstack[nested_depth];
686 if (!((opening == '(' && ch == ')') ||
687 (opening == '[' && ch == ']') ||
688 (opening == '{' && ch == '}')))
689 {
690 RAISE_SYNTAX_ERROR(
691 "f-string: closing parenthesis '%c' "
692 "does not match opening parenthesis '%c'",
693 ch, opening);
694 goto error;
695 }
696 } else {
697 /* Just consume this char and loop around. */
698 }
699 }
700 expr_end = *str;
701 /* If we leave the above loop in a string or with mismatched parens, we
702 don't really care. We'll get a syntax error when compiling the
703 expression. But, we can produce a better error message, so let's just
704 do that.*/
705 if (quote_char) {
706 RAISE_SYNTAX_ERROR("f-string: unterminated string");
707 goto error;
708 }
709 if (nested_depth) {
710 int opening = (unsigned char)parenstack[nested_depth - 1];
711 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
712 goto error;
713 }
714
715 if (*str >= end) {
716 goto unexpected_end_of_string;
717 }
718
719 /* Compile the expression as soon as possible, so we show errors
720 related to the expression before errors related to the
721 conversion or format_spec. */
722 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
723 if (!simple_expression) {
724 goto error;
725 }
726
727 /* Check for =, which puts the text value of the expression in
728 expr_text. */
729 if (**str == '=') {
730 if (p->feature_version < 8) {
731 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
732 "only supported in Python 3.8 and greater");
733 goto error;
734 }
735 *str += 1;
736
737 /* Skip over ASCII whitespace. No need to test for end of string
738 here, since we know there's at least a trailing quote somewhere
739 ahead. */
740 while (Py_ISSPACE(**str)) {
741 *str += 1;
742 }
743 if (*str >= end) {
744 goto unexpected_end_of_string;
745 }
746 /* Set *expr_text to the text of the expression. */
747 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
748 if (!*expr_text) {
749 goto error;
750 }
751 }
752
753 /* Check for a conversion char, if present. */
754 if (**str == '!') {
755 *str += 1;
756 if (*str >= end) {
757 goto unexpected_end_of_string;
758 }
759
760 conversion = (unsigned char)**str;
761 *str += 1;
762
763 /* Validate the conversion. */
764 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
765 RAISE_SYNTAX_ERROR(
766 "f-string: invalid conversion character: "
767 "expected 's', 'r', or 'a'");
768 goto error;
769 }
770
771 }
772
773 /* Check for the format spec, if present. */
774 if (*str >= end) {
775 goto unexpected_end_of_string;
776 }
777 if (**str == ':') {
778 *str += 1;
779 if (*str >= end) {
780 goto unexpected_end_of_string;
781 }
782
783 /* Parse the format spec. */
784 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
785 first_token, t, last_token);
786 if (!format_spec) {
787 goto error;
788 }
789 }
790
791 if (*str >= end || **str != '}') {
792 goto unexpected_end_of_string;
793 }
794
795 /* We're at a right brace. Consume it. */
796 assert(*str < end);
797 assert(**str == '}');
798 *str += 1;
799
800 /* If we're in = mode (detected by non-NULL expr_text), and have no format
801 spec and no explicit conversion, set the conversion to 'r'. */
802 if (*expr_text && format_spec == NULL && conversion == -1) {
803 conversion = 'r';
804 }
805
806 /* And now create the FormattedValue node that represents this
807 entire expression with the conversion and format spec. */
808 //TODO: Fix this
809 *expression = _PyAST_FormattedValue(simple_expression, conversion,
810 format_spec, first_token->lineno,
811 first_token->col_offset,
812 last_token->end_lineno,
813 last_token->end_col_offset, p->arena);
814 if (!*expression) {
815 goto error;
816 }
817
818 return 0;
819
820unexpected_end_of_string:
821 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
822 /* Falls through to error. */
823
824error:
825 Py_XDECREF(*expr_text);
826 return -1;
827
828}
829
830/* Return -1 on error.
831
832 Return 0 if we have a literal (possible zero length) and an
833 expression (zero length if at the end of the string.
834
835 Return 1 if we have a literal, but no expression, and we want the
836 caller to call us again. This is used to deal with doubled
837 braces.
838
839 When called multiple times on the string 'a{{b{0}c', this function
840 will return:
841
842 1. the literal 'a{' with no expression, and a return value
843 of 1. Despite the fact that there's no expression, the return
844 value of 1 means we're not finished yet.
845
846 2. the literal 'b' and the expression '0', with a return value of
847 0. The fact that there's an expression means we're not finished.
848
849 3. literal 'c' with no expression and a return value of 0. The
850 combination of the return value of 0 with no expression means
851 we're finished.
852*/
853static int
854fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
855 int recurse_lvl, PyObject **literal,
856 PyObject **expr_text, expr_ty *expression,
857 Token *first_token, Token *t, Token *last_token)
858{
859 int result;
860
861 assert(*literal == NULL && *expression == NULL);
862
863 /* Get any literal string. */
864 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
865 if (result < 0) {
866 goto error;
867 }
868
869 assert(result == 0 || result == 1);
870
871 if (result == 1) {
872 /* We have a literal, but don't look at the expression. */
873 return 1;
874 }
875
876 if (*str >= end || **str == '}') {
877 /* We're at the end of the string or the end of a nested
878 f-string: no expression. The top-level error case where we
879 expect to be at the end of the string but we're at a '}' is
880 handled later. */
881 return 0;
882 }
883
884 /* We must now be the start of an expression, on a '{'. */
885 assert(**str == '{');
886
887 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
888 expression, first_token, t, last_token) < 0) {
889 goto error;
890 }
891
892 return 0;
893
894error:
895 Py_CLEAR(*literal);
896 return -1;
897}
898
899#ifdef NDEBUG
900#define ExprList_check_invariants(l)
901#else
902static void
903ExprList_check_invariants(ExprList *l)
904{
905 /* Check our invariants. Make sure this object is "live", and
906 hasn't been deallocated. */
907 assert(l->size >= 0);
908 assert(l->p != NULL);
909 if (l->size <= EXPRLIST_N_CACHED) {
910 assert(l->data == l->p);
911 }
912}
913#endif
914
915static void
916ExprList_Init(ExprList *l)
917{
918 l->allocated = EXPRLIST_N_CACHED;
919 l->size = 0;
920
921 /* Until we start allocating dynamically, p points to data. */
922 l->p = l->data;
923
924 ExprList_check_invariants(l);
925}
926
927static int
928ExprList_Append(ExprList *l, expr_ty exp)
929{
930 ExprList_check_invariants(l);
931 if (l->size >= l->allocated) {
932 /* We need to alloc (or realloc) the memory. */
933 Py_ssize_t new_size = l->allocated * 2;
934
935 /* See if we've ever allocated anything dynamically. */
936 if (l->p == l->data) {
937 Py_ssize_t i;
938 /* We're still using the cached data. Switch to
939 alloc-ing. */
940 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
941 if (!l->p) {
942 return -1;
943 }
944 /* Copy the cached data into the new buffer. */
945 for (i = 0; i < l->size; i++) {
946 l->p[i] = l->data[i];
947 }
948 } else {
949 /* Just realloc. */
950 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
951 if (!tmp) {
952 PyMem_Free(l->p);
953 l->p = NULL;
954 return -1;
955 }
956 l->p = tmp;
957 }
958
959 l->allocated = new_size;
960 assert(l->allocated == 2 * l->size);
961 }
962
963 l->p[l->size++] = exp;
964
965 ExprList_check_invariants(l);
966 return 0;
967}
968
969static void
970ExprList_Dealloc(ExprList *l)
971{
972 ExprList_check_invariants(l);
973
974 /* If there's been an error, or we've never dynamically allocated,
975 do nothing. */
976 if (!l->p || l->p == l->data) {
977 /* Do nothing. */
978 } else {
979 /* We have dynamically allocated. Free the memory. */
980 PyMem_Free(l->p);
981 }
982 l->p = NULL;
983 l->size = -1;
984}
985
986static asdl_expr_seq *
987ExprList_Finish(ExprList *l, PyArena *arena)
988{
989 asdl_expr_seq *seq;
990
991 ExprList_check_invariants(l);
992
993 /* Allocate the asdl_seq and copy the expressions in to it. */
994 seq = _Py_asdl_expr_seq_new(l->size, arena);
995 if (seq) {
996 Py_ssize_t i;
997 for (i = 0; i < l->size; i++) {
998 asdl_seq_SET(seq, i, l->p[i]);
999 }
1000 }
1001 ExprList_Dealloc(l);
1002 return seq;
1003}
1004
1005#ifdef NDEBUG
1006#define FstringParser_check_invariants(state)
1007#else
1008static void
1009FstringParser_check_invariants(FstringParser *state)
1010{
1011 if (state->last_str) {
1012 assert(PyUnicode_CheckExact(state->last_str));
1013 }
1014 ExprList_check_invariants(&state->expr_list);
1015}
1016#endif
1017
1018void
1019_PyPegen_FstringParser_Init(FstringParser *state)
1020{
1021 state->last_str = NULL;
1022 state->fmode = 0;
1023 ExprList_Init(&state->expr_list);
1024 FstringParser_check_invariants(state);
1025}
1026
1027void
1028_PyPegen_FstringParser_Dealloc(FstringParser *state)
1029{
1030 FstringParser_check_invariants(state);
1031
1032 Py_XDECREF(state->last_str);
1033 ExprList_Dealloc(&state->expr_list);
1034}
1035
1036/* Make a Constant node, but decref the PyUnicode object being added. */
1037static expr_ty
1038make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1039{
1040 PyObject *s = *str;
1041 PyObject *kind = NULL;
1042 *str = NULL;
1043 assert(PyUnicode_CheckExact(s));
1044 if (_PyArena_AddPyObject(p->arena, s) < 0) {
1045 Py_DECREF(s);
1046 return NULL;
1047 }
1048 const char* the_str = PyBytes_AsString(first_token->bytes);
1049 if (the_str && the_str[0] == 'u') {
1050 kind = _PyPegen_new_identifier(p, "u");
1051 }
1052
1053 if (kind == NULL && PyErr_Occurred()) {
1054 return NULL;
1055 }
1056
1057 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1058 last_token->end_lineno, last_token->end_col_offset,
1059 p->arena);
1060
1061}
1062
1063
1064/* Add a non-f-string (that is, a regular literal string). str is
1065 decref'd. */
1066int
1067_PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1068{
1069 FstringParser_check_invariants(state);
1070
1071 assert(PyUnicode_CheckExact(str));
1072
1073 if (PyUnicode_GET_LENGTH(str) == 0) {
1074 Py_DECREF(str);
1075 return 0;
1076 }
1077
1078 if (!state->last_str) {
1079 /* We didn't have a string before, so just remember this one. */
1080 state->last_str = str;
1081 } else {
1082 /* Concatenate this with the previous string. */
1083 PyUnicode_AppendAndDel(&state->last_str, str);
1084 if (!state->last_str) {
1085 return -1;
1086 }
1087 }
1088 FstringParser_check_invariants(state);
1089 return 0;
1090}
1091
1092/* Parse an f-string. The f-string is in *str to end, with no
1093 'f' or quotes. */
1094int
1095_PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1096 const char *end, int raw, int recurse_lvl,
1097 Token *first_token, Token* t, Token *last_token)
1098{
1099 FstringParser_check_invariants(state);
1100 state->fmode = 1;
1101
1102 /* Parse the f-string. */
1103 while (1) {
1104 PyObject *literal = NULL;
1105 PyObject *expr_text = NULL;
1106 expr_ty expression = NULL;
1107
1108 /* If there's a zero length literal in front of the
1109 expression, literal will be NULL. If we're at the end of
1110 the f-string, expression will be NULL (unless result == 1,
1111 see below). */
1112 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1113 &literal, &expr_text,
1114 &expression, first_token, t, last_token);
1115 if (result < 0) {
1116 return -1;
1117 }
1118
1119 /* Add the literal, if any. */
1120 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1121 Py_XDECREF(expr_text);
1122 return -1;
1123 }
1124 /* Add the expr_text, if any. */
1125 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1126 return -1;
1127 }
1128
1129 /* We've dealt with the literal and expr_text, their ownership has
1130 been transferred to the state object. Don't look at them again. */
1131
1132 /* See if we should just loop around to get the next literal
1133 and expression, while ignoring the expression this
1134 time. This is used for un-doubling braces, as an
1135 optimization. */
1136 if (result == 1) {
1137 continue;
1138 }
1139
1140 if (!expression) {
1141 /* We're done with this f-string. */
1142 break;
1143 }
1144
1145 /* We know we have an expression. Convert any existing string
1146 to a Constant node. */
1147 if (!state->last_str) {
1148 /* Do nothing. No previous literal. */
1149 } else {
1150 /* Convert the existing last_str literal to a Constant node. */
1151 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1152 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1153 return -1;
1154 }
1155 }
1156
1157 if (ExprList_Append(&state->expr_list, expression) < 0) {
1158 return -1;
1159 }
1160 }
1161
1162 /* If recurse_lvl is zero, then we must be at the end of the
1163 string. Otherwise, we must be at a right brace. */
1164
1165 if (recurse_lvl == 0 && *str < end-1) {
1166 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1167 return -1;
1168 }
1169 if (recurse_lvl != 0 && **str != '}') {
1170 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1171 return -1;
1172 }
1173
1174 FstringParser_check_invariants(state);
1175 return 0;
1176}
1177
1178/* Convert the partial state reflected in last_str and expr_list to an
1179 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1180expr_ty
1181_PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1182 Token *last_token)
1183{
1184 asdl_expr_seq *seq;
1185
1186 FstringParser_check_invariants(state);
1187
1188 /* If we're just a constant string with no expressions, return
1189 that. */
1190 if (!state->fmode) {
1191 assert(!state->expr_list.size);
1192 if (!state->last_str) {
1193 /* Create a zero length string. */
1194 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1195 if (!state->last_str) {
1196 goto error;
1197 }
1198 }
1199 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1200 }
1201
1202 /* Create a Constant node out of last_str, if needed. It will be the
1203 last node in our expression list. */
1204 if (state->last_str) {
1205 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1206 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1207 goto error;
1208 }
1209 }
1210 /* This has already been freed. */
1211 assert(state->last_str == NULL);
1212
1213 seq = ExprList_Finish(&state->expr_list, p->arena);
1214 if (!seq) {
1215 goto error;
1216 }
1217
1218 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1219 last_token->end_lineno, last_token->end_col_offset,
1220 p->arena);
1221
1222error:
1223 _PyPegen_FstringParser_Dealloc(state);
1224 return NULL;
1225}
1226
1227/* Given an f-string (with no 'f' or quotes) that's in *str and ends
1228 at end, parse it into an expr_ty. Return NULL on error. Adjust
1229 str to point past the parsed portion. */
1230static expr_ty
1231fstring_parse(Parser *p, const char **str, const char *end, int raw,
1232 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1233{
1234 FstringParser state;
1235
1236 _PyPegen_FstringParser_Init(&state);
1237 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1238 first_token, t, last_token) < 0) {
1239 _PyPegen_FstringParser_Dealloc(&state);
1240 return NULL;
1241 }
1242
1243 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1244}
1245