1
2/* Tokenizer implementation */
3
4#define PY_SSIZE_T_CLEAN
5#include "Python.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#include "unicodeobject.h"
14#include "bytesobject.h"
15#include "fileobject.h"
16#include "abstract.h"
17
18/* Alternate tab spacing */
19#define ALTTABSIZE 1
20
/* True iff C can start an identifier: an ASCII letter, an underscore, or
   any byte >= 128 (the lead byte of a UTF-8 encoded non-ASCII character;
   full validation happens later in verify_identifier). */
#define is_potential_identifier_start(c) (\
              (c >= 'a' && c <= 'z')\
           || (c >= 'A' && c <= 'Z')\
           || c == '_'\
           || (c >= 128))

/* True iff C can continue an identifier: same as above plus ASCII digits. */
#define is_potential_identifier_char(c) (\
              (c >= 'a' && c <= 'z')\
           || (c >= 'A' && c <= 'Z')\
           || (c >= '0' && c <= '9')\
           || c == '_'\
           || (c >= 128))
33
34
35/* Don't ever change this -- it would break the portability of Python code */
36#define TABSIZE 8
37
38/* Forward */
39static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
42static int syntaxerror(struct tok_state *tok, const char *format, ...);
43
44/* Spaces in this constant are treated as "zero or more spaces or tabs" when
45 tokenizing. */
46static const char* type_comment_prefix = "# type: ";
47
48/* Create and initialize a new tok_state structure */
49
50static struct tok_state *
51tok_new(void)
52{
53 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
54 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
57 tok->buf = tok->cur = tok->inp = NULL;
58 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
61 tok->start = NULL;
62 tok->end = NULL;
63 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
74 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
77 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
80 tok->filename = NULL;
81 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
83 tok->type_comments = 0;
84 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
88 tok->interactive_underflow = IUNDERFLOW_NORMAL;
89 tok->str = NULL;
90 return tok;
91}
92
93static char *
94new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
95{
96 char* result = (char *)PyMem_Malloc(len + 1);
97 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
100 }
101 memcpy(result, s, len);
102 result[len] = '\0';
103 return result;
104}
105
106static char *
107error_ret(struct tok_state *tok) /* XXX */
108{
109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
111 PyMem_Free(tok->buf);
112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
115 tok->done = E_DECODE;
116 return NULL; /* as if it were EOF */
117}
118
119
/* Normalize an encoding name: return "utf-8" for any spelling of UTF-8,
   "iso-8859-1" for any spelling of Latin-1, and S itself otherwise.
   Underscores are mapped to hyphens and the name is lowercased; only the
   first 12 characters of S are examined. */
static const char *
get_normal_name(const char *s) /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast to unsigned char: passing a negative char (a non-ASCII
               byte where char is signed) to tolower() is undefined
               behavior (CERT STR37-C). */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148
/* Find a PEP 263 coding spec in the line S (of length SIZE).  On return,
   *spec is a newly allocated copy of the canonicalized encoding name, or
   NULL if no spec was found.  Returns 0 only on allocation failure
   (tok->done is set by new_string); 1 otherwise. */

static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            /* Non-blank, non-comment content: no coding spec possible. */
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            /* "coding" must be followed by ':' or '=' (PEP 263). */
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            /* The encoding name: alphanumerics plus '-', '_', '.'. */
            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;   /* OOM; tok->done already set */
                /* Canonicalize known spellings of utf-8 / latin-1. */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
199
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* utf-8 needs no readline wrapper; any other declared encoding
           must be recoded via set_readline. */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;     /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declaration must agree. */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
254
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM (EF BB BF).  Any partial match is pushed
           back so normal tokenization sees the bytes. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        /* Not a BOM at all: push the byte back. */
        unget_char(ch1, tok);
        return 1;
    }
    /* A UTF-8 BOM was consumed: record utf-8 as the encoding. */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
320
321static int
322tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
323 assert(tok->fp_interactive);
324
325 if (!line) {
326 return 0;
327 }
328
329 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330 Py_ssize_t line_size = strlen(line);
331 char* new_str = tok->interactive_src_start;
332
333 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334 if (!new_str) {
335 if (tok->interactive_src_start) {
336 PyMem_Free(tok->interactive_src_start);
337 }
338 tok->interactive_src_start = NULL;
339 tok->interactive_src_end = NULL;
340 tok->done = E_NOMEM;
341 return -1;
342 }
343 strcpy(new_str + current_size, line);
344
345 tok->interactive_src_start = new_str;
346 tok->interactive_src_end = new_str + current_size + line_size;
347 return 0;
348}
349
350
351/* Read a line of text from TOK into S, using the stream in TOK.
352 Return NULL on failure, else S.
353
354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357 stored the result in tok->decoding_buffer
358 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
361 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
362 until the buffer ends with a '\n' (or until the end of the file is
363 reached): see tok_nextc and its calls to tok_reserve_buf.
364*/
365
/* Ensure the token buffer has room for at least SIZE more bytes past
   tok->inp, growing it geometrically (at least +50%) when needed.  All
   pointers into the buffer (cur/inp/start/line_start/multi_line_start)
   are re-based onto the reallocated block.  Returns 1 on success, 0 on
   OOM (tok->done set to E_NOMEM). */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Save offsets: the raw pointers are invalid after realloc. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}
392
/* Read one decoded line via tok->decoding_readline and append its UTF-8
   bytes to the token buffer.  See the comment above for the meaning of
   tok->decoding_buffer.  Returns 1 on success, 0 on error. */
static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    /* Either finish the line left over from a previous call, or read a
       fresh one from the codec's readline. */
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    /* Append the UTF-8 bytes plus a NUL terminator. */
    if (!tok_reserve_buf(tok, buflen + 1)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    /* In interactive mode, also record the line for later retrieval. */
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
430
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    /* Equivalent to io.open(fd, "r", -1, enc, None, None, False):
       text mode, declared encoding, closefd=False. */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Consume the partial line we stepped back into so the next
           readline() starts where the caller expects. */
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
488
489/* Fetch the next byte from TOK. */
490
491static int fp_getc(struct tok_state *tok) {
492 return getc(tok->fp);
493}
494
/* Unfetch the last byte back into TOK's stdio stream.  Only one byte of
   pushback is guaranteed by ungetc(). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
500
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not. */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int length;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    length = expected + 1;
    /* Check the continuation bytes in ascending order so that a
       truncated sequence at the end of a NUL-terminated buffer stops at
       the terminator instead of reading past it (the previous code
       scanned backwards, touching s[expected] before s[1]). */
    for (int i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return length;
}
528
529static int
530ensure_utf8(char *line, struct tok_state *tok)
531{
532 int badchar = 0;
533 unsigned char *c;
534 int length;
535 for (c = (unsigned char *)line; *c; c += length) {
536 if (!(length = valid_utf8(c))) {
537 badchar = *c;
538 break;
539 }
540 }
541 if (badchar) {
542 /* Need to add 1 to the line number, since this line
543 has not been counted, yet. */
544 PyErr_Format(PyExc_SyntaxError,
545 "Non-UTF-8 code starting with '\\x%.2x' "
546 "in file %U on line %i, "
547 "but no encoding declared; "
548 "see https://python.org/dev/peps/pep-0263/ for details",
549 badchar, tok->filename, tok->lineno + 1);
550 return 0;
551 }
552 return 1;
553}
554
/* Fetch a byte from TOK, using the string buffer: return the byte at
   tok->str and advance the pointer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
561
/* Unfetch a byte from TOK, using the string buffer: step tok->str back
   one byte and check it matches the byte being pushed back. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
569
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; decode_str reads
   it back from tok->enc. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
578
579/* Return a UTF-8 encoding Python string object from the
580 C byte string STR, which is encoded with ENC. */
581
582static PyObject *
583translate_into_utf8(const char* str, const char* enc) {
584 PyObject *utf8;
585 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586 if (buf == NULL)
587 return NULL;
588 utf8 = PyUnicode_AsUTF8String(buf);
589 Py_DECREF(buf);
590 return utf8;
591}
592
593
594static char *
595translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
596 int skip_next_lf = 0;
597 size_t needed_length = strlen(s) + 2, final_length;
598 char *buf, *current;
599 char c = '\0';
600 buf = PyMem_Malloc(needed_length);
601 if (buf == NULL) {
602 tok->done = E_NOMEM;
603 return NULL;
604 }
605 for (current = buf; *s; s++, current++) {
606 c = *s;
607 if (skip_next_lf) {
608 skip_next_lf = 0;
609 if (c == '\n') {
610 c = *++s;
611 if (!c)
612 break;
613 }
614 }
615 if (c == '\r') {
616 skip_next_lf = 1;
617 c = '\n';
618 }
619 *current = c;
620 }
621 /* If this is exec input, add a newline to the end of the string if
622 there isn't one already. */
623 if (exec_input && c != '\n') {
624 *current = '\n';
625 current++;
626 }
627 *current = '\0';
628 final_length = current - buf + 1;
629 if (final_length < needed_length && final_length) {
630 /* should never fail */
631 char* result = PyMem_Realloc(buf, final_length);
632 if (result == NULL) {
633 PyMem_Free(buf);
634 }
635 buf = result;
636 }
637 return buf;
638}
639
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};   /* ends of the first two lines */
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* A BOM selected an encoding: recode the string to UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Find the ends of the first two lines: a PEP 263 coding
       declaration may only appear there. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        /* The second line only counts while we are still seeking
           (i.e. the first line was blank or a comment). */
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        /* A declared encoding was found: recode the string to UTF-8. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION: keeps str's storage alive */
    return str;
}
700
701/* Set up tokenizer for string */
702
703struct tok_state *
704PyTokenizer_FromString(const char *str, int exec_input)
705{
706 struct tok_state *tok = tok_new();
707 char *decoded;
708
709 if (tok == NULL)
710 return NULL;
711 decoded = decode_str(str, exec_input, tok);
712 if (decoded == NULL) {
713 PyTokenizer_Free(tok);
714 return NULL;
715 }
716
717 tok->buf = tok->cur = tok->inp = decoded;
718 tok->end = decoded;
719 return tok;
720}
721
722/* Set up tokenizer for UTF-8 string */
723
724struct tok_state *
725PyTokenizer_FromUTF8(const char *str, int exec_input)
726{
727 struct tok_state *tok = tok_new();
728 char *translated;
729 if (tok == NULL)
730 return NULL;
731 tok->input = translated = translate_newlines(str, exec_input, tok);
732 if (translated == NULL) {
733 PyTokenizer_Free(tok);
734 return NULL;
735 }
736 tok->decoding_state = STATE_NORMAL;
737 tok->enc = NULL;
738 tok->str = translated;
739 tok->encoding = new_string("utf-8", 5, tok);
740 if (!tok->encoding) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
744
745 tok->buf = tok->cur = tok->inp = translated;
746 tok->end = translated;
747 return tok;
748}
749
750/* Set up tokenizer for file */
751
752struct tok_state *
753PyTokenizer_FromFile(FILE *fp, const char* enc,
754 const char *ps1, const char *ps2)
755{
756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
759 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 if (enc != NULL) {
769 /* Must copy encoding declaration since it
770 gets copied into the parse tree. */
771 tok->encoding = new_string(enc, strlen(enc), tok);
772 if (!tok->encoding) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
776 tok->decoding_state = STATE_NORMAL;
777 }
778 return tok;
779}
780
781/* Free a tok_state structure */
782
783void
784PyTokenizer_Free(struct tok_state *tok)
785{
786 if (tok->encoding != NULL) {
787 PyMem_Free(tok->encoding);
788 }
789 Py_XDECREF(tok->decoding_readline);
790 Py_XDECREF(tok->decoding_buffer);
791 Py_XDECREF(tok->filename);
792 if (tok->fp != NULL && tok->buf != NULL) {
793 PyMem_Free(tok->buf);
794 }
795 if (tok->input) {
796 PyMem_Free(tok->input);
797 }
798 if (tok->interactive_src_start != NULL) {
799 PyMem_Free(tok->interactive_src_start);
800 }
801 PyMem_Free(tok);
802}
803
/* Read one full line (through its '\n') from tok->fp into the token
   buffer, growing the buffer as needed.  Returns 1 on success or at
   EOF, 0 on error; the caller detects EOF via an empty read. */
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        char *line = Py_UniversalNewlineFgets(tok->inp,
                                              (int)(tok->end - tok->inp),
                                              tok->fp, NULL);
        if (line == NULL) {
            /* Nothing more to read: report success and let the caller
               notice that the buffer did not advance. */
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        /* Advance inp to the end of what was just read. */
        tok->inp = strchr(tok->inp, '\0');
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');   /* loop until a complete line */
    return 1;
}
828
/* Refill for string-based input: advance tok->inp past the next line of
   tok's in-memory source.  Returns 1 if a line is available, 0 at end
   of input (tok->done set to E_EOF). */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;   /* include the newline in the line */
    }
    else {
        /* Final line without a trailing newline. */
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        /* No token in progress: move the logical buffer origin forward
           so error offsets are computed from the current line. */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    tok->lineno++;
    tok->inp = end;
    return 1;
}
850
/* Refill in interactive (prompted) mode: read one line with
   PyOS_Readline, recode it to UTF-8 when an encoding is set, then
   either append it to the buffer (token in progress) or replace the
   buffer with it.  Returns 1 on success, 0 on interrupt/EOF/error
   (tok->done says which). */
static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* Normalize \r\n / \r to \n. */
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* Subsequent reads use the continuation prompt. */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;      /* readline was interrupted */
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* A token is in progress across the line break: append the new
           line to the existing buffer and re-base multi_line_start. */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* No token in progress: the new line replaces the buffer. */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
934
/* Refill from a file: handle BOM/coding-spec detection on the first two
   lines, read one line (raw or through the decoding readline), and
   verify it is valid UTF-8 when no other encoding was declared.
   Returns 1 on success, 0 on EOF or error (tok->done says which). */
static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL) {
        /* No token in progress: recycle the buffer from the start. */
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        /* Nothing was read: end of file. */
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            /* A coding spec may only occur on the first two lines. */
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
993
994#if defined(Py_DEBUG)
995static void
996print_escape(FILE *f, const char *s, Py_ssize_t size)
997{
998 if (s == NULL) {
999 fputs("NULL", f);
1000 return;
1001 }
1002 putc('"', f);
1003 while (size-- > 0) {
1004 unsigned char c = *s++;
1005 switch (c) {
1006 case '\n': fputs("\\n", f); break;
1007 case '\r': fputs("\\r", f); break;
1008 case '\t': fputs("\\t", f); break;
1009 case '\f': fputs("\\f", f); break;
1010 case '\'': fputs("\\'", f); break;
1011 case '"': fputs("\\\"", f); break;
1012 default:
1013 if (0x20 <= c && c <= 0x7f)
1014 putc(c, f);
1015 else
1016 fprintf(f, "\\x%02x", c);
1017 }
1018 }
1019 putc('"', f);
1020}
1021#endif
1022
/* Get next char, updating state; error code goes into tok->done.
   Returns the next byte as an unsigned value, or EOF when the input is
   exhausted or an error occurred. */

static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        /* Buffer exhausted: refill from the appropriate source. */
        if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
        }
        else {
            rc = tok_underflow_file(tok);
        }
#if defined(Py_DEBUG)
        if (Py_DebugFlag) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            /* Refill failed: exhaust the buffer and report EOF. */
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;
    }
    Py_UNREACHABLE();
}
1060
1061/* Back-up one character */
1062
1063static void
1064tok_backup(struct tok_state *tok, int c)
1065{
1066 if (c != EOF) {
1067 if (--tok->cur < tok->buf) {
1068 Py_FatalError("tokenizer beginning of buffer");
1069 }
1070 if ((int)(unsigned char)*tok->cur != c) {
1071 Py_FatalError("tok_backup: wrong character");
1072 }
1073 }
1074}
1075
/* Build and set a SyntaxError whose message is FORMAT formatted with
   VARGS, located on the current line between COL_OFFSET and
   END_COL_OFFSET (-1 means "end of the text scanned so far").
   Always sets tok->done to E_ERROR and returns ERRORTOKEN. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    /* Text of the current line up to the error position. */
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* If the error position is mid-line, re-decode the whole line so
       the traceback can display it in full. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1122
/* Set a SyntaxError at the current position (no explicit column range)
   and return ERRORTOKEN; tok->done is set to E_ERROR. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}
1136
/* Like syntaxerror(), but the caller supplies the column range
   [col_offset, end_col_offset] of the error. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}
1152
1153
1154
1155static int
1156indenterror(struct tok_state *tok)
1157{
1158 tok->done = E_TABSPACE;
1159 tok->cur = tok->inp;
1160 return ERRORTOKEN;
1161}
1162
/* Emit a DeprecationWarning with the formatted message.  If warnings
   are configured to raise, replace the DeprecationWarning with a
   SyntaxError for a more accurate report.  Returns 0 on success, -1 on
   failure (tok->done set to E_ERROR). */
static int
parser_warn(struct tok_state *tok, const char *format, ...)
{
    PyObject *errmsg;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
1197
/* Return 1 if the input following the current position spells TEST and
   is not immediately followed by an identifier character; 0 otherwise.
   The input position is always restored via tok_backup before
   returning, so this is a pure peek. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            /* Whole of TEST matched; it must end the word here. */
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        /* Push back the probe character and everything matched so far,
           in reverse order. */
        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}
1220
/* Check the character C that immediately follows a numeric literal.

   Emit a deprecation warning only if the numeric literal is immediately
   followed by one of the keywords which can occur after a numeric literal
   in valid code: "and", "else", "for", "if", "in", "is" and "or".
   This allows gradually deprecating existing valid code without adding
   a warning before the error in most cases of invalid numeric literal
   (which would be confusing and break existing tests).
   Raise a syntax error with a slightly better message than plain
   "invalid syntax" if the numeric literal is immediately followed by
   another keyword or identifier.

   Returns 1 if the literal ends acceptably, 0 after reporting an error. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        /* "if", "in", "is": peek at the single following character. */
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot");
    }
    if (r) {
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);    /* re-consume C after warning */
    }
    else /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
1272
/* Verify that the identifier [tok->start, tok->cur) follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.
   Returns 1 if the identifier is valid; returns 0 after setting tok->done
   and/or raising a SyntaxError pointing at the first invalid character.
 */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    /* A prior decoding error makes the buffer contents unreliable. */
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* Index of the first character that is not valid in an identifier,
       or the string length if the whole identifier is valid. */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input: re-encode the
               prefix up to and including the bad character and use its
               byte length to position tok->cur for error reporting. */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
1328
/* Consume the remaining digits of a decimal run, allowing single
   underscores between digit groups.  Returns the first character after
   the run, or 0 after reporting a syntax error (an underscore not
   followed by a digit). */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c = tok_nextc(tok);
    for (;;) {
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        /* An underscore must be followed by at least one digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1350
1351/* Get next token, after space stripping etc. */
1352
1353static inline int
1354tok_continuation_line(struct tok_state *tok) {
1355 int c = tok_nextc(tok);
1356 if (c != '\n') {
1357 tok->done = E_LINECONT;
1358 return -1;
1359 }
1360 c = tok_nextc(tok);
1361 if (c == EOF) {
1362 tok->done = E_EOF;
1363 tok->cur = tok->inp;
1364 return -1;
1365 } else {
1366 tok_backup(tok, c);
1367 }
1368 return c;
1369}
1370
/* Get the next token from `tok`, after space stripping etc.
   On return, *p_start and *p_end delimit the token text inside the
   tokenizer's buffer (they stay NULL for tokens that carry no text, such
   as INDENT/DEDENT).  Returns the token type; on failure returns
   ERRORTOKEN with tok->done describing the error. */
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        /* col counts tabs as tok->tabsize columns; altcol counts tabs as
           ALTTABSIZE.  Comparing both detects indentation that is
           ambiguous under different tab sizes (see indenterror calls). */
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        int cont_line_col = 0;  /* indent fixed by a backslash continuation (0 = none) */
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return ERRORTOKEN;
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }

        if (tok->type_comments) {
            p = tok->start;
            prefix = type_comment_prefix;
            /* A space in the prefix matches zero or more spaces/tabs. */
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                    }
                } else if (*prefix == *p) {
                    p++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                const char *ignore_end = p + 6;
                tok_backup(tok, c); /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    *p_start = ignore_end;
                    *p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return TYPE_IGNORE;
                } else {
                    *p_start = type_start;  /* after type_comment_prefix */
                    *p_end = tok->cur;
                    return TYPE_COMMENT;
                }
            }
        }
    }

    if (tok->done == E_INTERACT_STOP) {
        return ENDMARKER;
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        if (tok->level) {
            return ERRORTOKEN;
        }
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers are validated against PEP 3131. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }

        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token.  For Python 3.7 or
               later we recognize them unconditionally.  For Python
               3.5 or 3.6 we recognize 'async' in front of 'def', and
               either one inside of 'async def'.  (Technically we
               shouldn't recognize these at all for 3.4 or earlier,
               but there's no *valid* Python 3.4 code that would be
               rejected, and async functions will be rejected in a
               later phase.) */
            if (!tok->async_hacks || tok->async_def) {
                /* Always recognize the keywords. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token to see if that is 'def'. */

                struct tok_state ahead_tok;
                const char *ahead_tok_start = NULL;
                const char *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* The lookahead runs on a throwaway copy of the tokenizer
                   state, so the real position is not advanced. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning a plain NAME token, return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok_backup(tok, c);
                        return syntaxerror(tok, "invalid hexadecimal literal");
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in octal literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid octal literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in octal literal", c);
                }
                if (!verify_end_of_number(tok, c, "octal")) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in binary literal", c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid binary literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in binary literal", c);
                }
                if (!verify_end_of_number(tok, c, "binary")) {
                    return ERRORTOKEN;
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                char* zeros_end = tok->cur;  /* end of the run of leading zeros, for the error range */
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return syntaxerror_known_range(
                        tok, (int)(tok->start + 1 - tok->line_start),
                        (int)(zeros_end - tok->line_start),
                        "leading zeros in decimal integer "
                        "literals are not permitted; "
                        "use an 0o prefix for octal integers");
                }
                if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
  fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all: push back and treat the
                           'e'/'E' as the character following the number. */
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal")) {
                            return ERRORTOKEN;
                        }
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
  imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary")) {
                        return ERRORTOKEN;
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal")) {
                    return ERRORTOKEN;
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;         /* current run of closing quote chars */

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;
                if (quote_size == 3) {
                    syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return ERRORTOKEN;
                }
                else {
                    syntaxerror(tok, "unterminated string literal (detected at"
                                     " line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return ERRORTOKEN;
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);  /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses");
        }
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        if (!tok->level) {
            return syntaxerror(tok, "unmatched '%c'", c);
        }
        tok->level--;
        int opening = tok->parenstack[tok->level];
        if (!((opening == '(' && c == ')') ||
              (opening == '[' && c == ']') ||
              (opening == '{' && c == '}')))
        {
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c' on line %d",
                        c, opening, tok->parenlinenostack[tok->level]);
            }
            else {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c'",
                        c, opening);
            }
        }
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
2078
2079int
2080PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
2081{
2082 int result = tok_get(tok, p_start, p_end);
2083 if (tok->decoding_erred) {
2084 result = ERRORTOKEN;
2085 tok->done = E_DECODE;
2086 }
2087 return result;
2088}
2089
2090/* Get the encoding of a Python file. Check for the coding cookie and check if
2091 the file starts with a BOM.
2092
2093 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2094 encoding in the first or second line of the file (in which case the encoding
2095 should be assumed to be UTF-8).
2096
2097 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2098 by the caller. */
2099
2100char *
2101PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2102{
2103 struct tok_state *tok;
2104 FILE *fp;
2105 const char *p_start = NULL;
2106 const char *p_end = NULL;
2107 char *encoding = NULL;
2108
2109 fd = _Py_dup(fd);
2110 if (fd < 0) {
2111 return NULL;
2112 }
2113
2114 fp = fdopen(fd, "r");
2115 if (fp == NULL) {
2116 return NULL;
2117 }
2118 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2119 if (tok == NULL) {
2120 fclose(fp);
2121 return NULL;
2122 }
2123 if (filename != NULL) {
2124 Py_INCREF(filename);
2125 tok->filename = filename;
2126 }
2127 else {
2128 tok->filename = PyUnicode_FromString("<string>");
2129 if (tok->filename == NULL) {
2130 fclose(fp);
2131 PyTokenizer_Free(tok);
2132 return encoding;
2133 }
2134 }
2135 while (tok->lineno < 2 && tok->done == E_OK) {
2136 PyTokenizer_Get(tok, &p_start, &p_end);
2137 }
2138 fclose(fp);
2139 if (tok->encoding) {
2140 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2141 if (encoding) {
2142 strcpy(encoding, tok->encoding);
2143 }
2144 }
2145 PyTokenizer_Free(tok);
2146 return encoding;
2147}
2148
2149char *
2150PyTokenizer_FindEncoding(int fd)
2151{
2152 return PyTokenizer_FindEncodingFilename(fd, NULL);
2153}
2154
2155#ifdef Py_DEBUG
2156
2157void
2158tok_dump(int type, char *start, char *end)
2159{
2160 fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2161 if (type == NAME || type == NUMBER || type == STRING || type == OP)
2162 fprintf(stderr, "(%.*s)", (int)(end - start), start);
2163}
2164
2165#endif
2166