1 | |
2 | /* Tokenizer implementation */ |
3 | |
4 | #define PY_SSIZE_T_CLEAN |
5 | #include "Python.h" |
6 | |
7 | #include <ctype.h> |
8 | #include <assert.h> |
9 | |
10 | #include "tokenizer.h" |
11 | #include "errcode.h" |
12 | |
13 | #include "unicodeobject.h" |
14 | #include "bytesobject.h" |
15 | #include "fileobject.h" |
16 | #include "abstract.h" |
17 | |
18 | /* Alternate tab spacing */ |
19 | #define ALTTABSIZE 1 |
20 | |
21 | #define is_potential_identifier_start(c) (\ |
22 | (c >= 'a' && c <= 'z')\ |
23 | || (c >= 'A' && c <= 'Z')\ |
24 | || c == '_'\ |
25 | || (c >= 128)) |
26 | |
27 | #define is_potential_identifier_char(c) (\ |
28 | (c >= 'a' && c <= 'z')\ |
29 | || (c >= 'A' && c <= 'Z')\ |
30 | || (c >= '0' && c <= '9')\ |
31 | || c == '_'\ |
32 | || (c >= 128)) |
33 | |
34 | |
35 | /* Don't ever change this -- it would break the portability of Python code */ |
36 | #define TABSIZE 8 |
37 | |
38 | /* Forward */ |
39 | static struct tok_state *tok_new(void); |
40 | static int tok_nextc(struct tok_state *tok); |
41 | static void tok_backup(struct tok_state *tok, int c); |
42 | static int syntaxerror(struct tok_state *tok, const char *format, ...); |
43 | |
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
47 | |
48 | /* Create and initialize a new tok_state structure */ |
49 | |
static struct tok_state *
tok_new(void)
{
    /* Allocate and initialize a fresh tok_state with no input attached.
       The PyTokenizer_From* constructors fill in the input source
       afterwards.  Returns NULL on allocation failure. */
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->inp = NULL;    /* no input buffer yet */
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;                         /* no error recorded */
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;                          /* indentation stack is empty */
    tok->indstack[0] = 0;
    tok->atbol = 1;                           /* we begin at a line start */
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;                           /* parenthesis nesting depth */
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;         /* encoding not determined yet */
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->type_comments = 0;
    tok->async_hacks = 0;
    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->str = NULL;
    return tok;
}
92 | |
93 | static char * |
94 | new_string(const char *s, Py_ssize_t len, struct tok_state *tok) |
95 | { |
96 | char* result = (char *)PyMem_Malloc(len + 1); |
97 | if (!result) { |
98 | tok->done = E_NOMEM; |
99 | return NULL; |
100 | } |
101 | memcpy(result, s, len); |
102 | result[len] = '\0'; |
103 | return result; |
104 | } |
105 | |
106 | static char * |
107 | error_ret(struct tok_state *tok) /* XXX */ |
108 | { |
109 | tok->decoding_erred = 1; |
110 | if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ |
111 | PyMem_Free(tok->buf); |
112 | tok->buf = tok->cur = tok->inp = NULL; |
113 | tok->start = NULL; |
114 | tok->end = NULL; |
115 | tok->done = E_DECODE; |
116 | return NULL; /* as if it were EOF */ |
117 | } |
118 | |
119 | |
/* Normalize an encoding name: lower-case it, map '_' to '-', and fold
   the common aliases of UTF-8 and Latin-1 onto their canonical names.
   Only the first 12 characters are examined.  Unrecognized encodings
   are returned unchanged. */
static const char *
get_normal_name(const char *s) /* for utf-8 and latin-1 */
{
    char lowered[13];
    int n = 0;
    while (n < 12 && s[n] != '\0') {
        lowered[n] = (s[n] == '_') ? '-' : tolower(s[n]);
        n++;
    }
    lowered[n] = '\0';
    if (strcmp(lowered, "utf-8") == 0 ||
        strncmp(lowered, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    if (strcmp(lowered, "latin-1") == 0 ||
        strcmp(lowered, "iso-8859-1") == 0 ||
        strcmp(lowered, "iso-latin-1") == 0 ||
        strncmp(lowered, "latin-1-", 8) == 0 ||
        strncmp(lowered, "iso-8859-1-", 11) == 0 ||
        strncmp(lowered, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    return s;
}
148 | |
149 | /* Return the coding spec in S, or NULL if none is found. */ |
150 | |
static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    /* Look for a PEP 263 coding declaration in the line S (SIZE bytes).
       On success return 1 and store a freshly allocated, canonicalized
       encoding name in *spec (left NULL when no declaration is present).
       Return 0 only on memory failure (new_string sets tok->done). */
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            /* "coding" must be followed by ':' or '=' */
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            /* encoding names consist of alphanumerics plus '-', '_', '.' */
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* canonicalize aliases of utf-8 / iso-8859-1 */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
199 | |
200 | /* Check whether the line contains a coding spec. If it does, |
201 | invoke the set_readline function for the new encoding. |
202 | This function receives the tok_state and the new encoding. |
203 | Return 1 on success, 0 on failure. */ |
204 | |
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    /* Check LINE for a coding declaration; see the comment above for the
       contract.  On success the encoding (if any) is recorded in
       tok->encoding and, when recoding is needed, SET_READLINE installs
       a codec-based reader. */
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        /* utf-8 input needs no recoding; anything else must install a
           codec-based readline */
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;     /* ownership of cs transfers to tok */
    } else {                /* then, compare cs with BOM */
        /* a BOM already fixed the encoding; the declaration must agree */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
254 | |
255 | /* See whether the file starts with a BOM. If it does, |
256 | invoke the set_readline function with the new encoding. |
257 | Return 1 on success, 0 on failure. */ |
258 | |
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    /* Look for a byte order mark at the start of the input.  A UTF-8 BOM
       (EF BB BF) is consumed and fixes tok->encoding to "utf-8"; any
       other prefix is pushed back unchanged.  Returns 1 on success,
       0 on failure. */
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            /* not a BOM after all: undo both reads */
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    /* a UTF-8 BOM was consumed: replace any previously set encoding */
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
320 | |
321 | static int |
322 | tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { |
323 | assert(tok->fp_interactive); |
324 | |
325 | if (!line) { |
326 | return 0; |
327 | } |
328 | |
329 | Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; |
330 | Py_ssize_t line_size = strlen(line); |
331 | char* new_str = tok->interactive_src_start; |
332 | |
333 | new_str = PyMem_Realloc(new_str, current_size + line_size + 1); |
334 | if (!new_str) { |
335 | if (tok->interactive_src_start) { |
336 | PyMem_Free(tok->interactive_src_start); |
337 | } |
338 | tok->interactive_src_start = NULL; |
339 | tok->interactive_src_end = NULL; |
340 | tok->done = E_NOMEM; |
341 | return -1; |
342 | } |
343 | strcpy(new_str + current_size, line); |
344 | |
345 | tok->interactive_src_start = new_str; |
346 | tok->interactive_src_end = new_str + current_size + line_size; |
347 | return 0; |
348 | } |
349 | |
350 | |
351 | /* Read a line of text from TOK into S, using the stream in TOK. |
352 | Return NULL on failure, else S. |
353 | |
354 | On entry, tok->decoding_buffer will be one of: |
355 | 1) NULL: need to call tok->decoding_readline to get a new line |
356 | 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and |
357 | stored the result in tok->decoding_buffer |
358 | 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room |
359 | (in the s buffer) to copy entire contents of the line read |
360 | by tok->decoding_readline. tok->decoding_buffer has the overflow. |
361 | In this case, tok_readline_recode is called in a loop (with an expanded buffer) |
362 | until the buffer ends with a '\n' (or until the end of the file is |
363 | reached): see tok_nextc and its calls to tok_reserve_buf. |
364 | */ |
365 | |
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    /* Ensure at least SIZE free bytes remain after tok->inp, growing the
       token buffer geometrically when needed.  Every pointer into the
       buffer (cur/inp/start/line_start/multi_line_start) is re-based onto
       the reallocated block.  Returns 1 on success, 0 on memory failure
       (tok->done = E_NOMEM). */
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    /* grow by at least 50% so reallocations stay amortized */
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* record offsets (-1 == pointer was NULL) before realloc moves us */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}
392 | |
static int
tok_readline_recode(struct tok_state *tok) {
    /* Read one line via tok->decoding_readline (a codec readline),
       append its UTF-8 form to the token buffer, and record it for
       interactive sources.  See the large comment above for the
       tok->decoding_buffer states.  Returns 1 on success, 0 on error. */
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        /* a line was already buffered: consume it */
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    if (!tok_reserve_buf(tok, buflen + 1)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}
430 | |
431 | /* Set the readline function for TOK to a StreamReader's |
432 | readline function. The StreamReader is named ENC. |
433 | |
434 | This function is called from check_bom and check_coding_spec. |
435 | |
436 | ENC is usually identical to the future value of tok->encoding, |
437 | except for the (currently unsupported) case of UTF-16. |
438 | |
439 | Return 1 on success, 0 on failure. */ |
440 | |
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    /* Re-open tok->fp's file descriptor through io.open() with encoding
       ENC and store the resulting readline in tok->decoding_readline.
       Returns 1 on success, 0 on failure (with a Python error set). */
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd.  Instead we step back one byte and read to
     * the end of line. */
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    /* io.open(fd, "r", -1, enc, None, None, closefd=False) */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* drain the partially-read line so the codec starts cleanly */
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
488 | |
489 | /* Fetch the next byte from TOK. */ |
490 | |
static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);  /* returns EOF at end of stream */
}
494 | |
495 | /* Unfetch the last byte back into TOK. */ |
496 | |
static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);  /* push C back onto the stream for the next read */
}
500 | |
501 | /* Check whether the characters at s start a valid |
502 | UTF-8 sequence. Return the number of characters forming |
503 | the sequence if yes, 0 if not. */ |
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the sequence length (1-4) if so, 0 otherwise.  Only the lead
   byte and the continuation-byte pattern are validated here. */
static int valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int trailing;
    if (lead < 0x80) {
        return 1;               /* single-byte (ASCII) code */
    }
    if (lead < 0xC0) {
        return 0;               /* stray continuation byte */
    }
    if (lead < 0xE0) {
        trailing = 1;
    }
    else if (lead < 0xF0) {
        trailing = 2;
    }
    else if (lead < 0xF8) {
        trailing = 3;
    }
    else {
        return 0;               /* invalid lead byte */
    }
    for (int i = trailing; i > 0; i--) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;           /* not a continuation byte */
        }
    }
    return trailing + 1;
}
528 | |
529 | static int |
530 | ensure_utf8(char *line, struct tok_state *tok) |
531 | { |
532 | int badchar = 0; |
533 | unsigned char *c; |
534 | int length; |
535 | for (c = (unsigned char *)line; *c; c += length) { |
536 | if (!(length = valid_utf8(c))) { |
537 | badchar = *c; |
538 | break; |
539 | } |
540 | } |
541 | if (badchar) { |
542 | /* Need to add 1 to the line number, since this line |
543 | has not been counted, yet. */ |
544 | PyErr_Format(PyExc_SyntaxError, |
545 | "Non-UTF-8 code starting with '\\x%.2x' " |
546 | "in file %U on line %i, " |
547 | "but no encoding declared; " |
548 | "see https://python.org/dev/peps/pep-0263/ for details" , |
549 | badchar, tok->filename, tok->lineno + 1); |
550 | return 0; |
551 | } |
552 | return 1; |
553 | } |
554 | |
555 | /* Fetch a byte from TOK, using the string buffer. */ |
556 | |
557 | static int |
558 | buf_getc(struct tok_state *tok) { |
559 | return Py_CHARMASK(*tok->str++); |
560 | } |
561 | |
562 | /* Unfetch a byte from TOK, using the string buffer. */ |
563 | |
564 | static void |
565 | buf_ungetc(int c, struct tok_state *tok) { |
566 | tok->str--; |
567 | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ |
568 | } |
569 | |
570 | /* Set the readline function for TOK to ENC. For the string-based |
571 | tokenizer, this means to just record the encoding. */ |
572 | |
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    /* String input has no stream to reopen: just remember ENC so that
       decode_str() can recode the whole buffer later.  Always succeeds. */
    tok->enc = enc;
    return 1;
}
578 | |
579 | /* Return a UTF-8 encoding Python string object from the |
580 | C byte string STR, which is encoded with ENC. */ |
581 | |
582 | static PyObject * |
583 | translate_into_utf8(const char* str, const char* enc) { |
584 | PyObject *utf8; |
585 | PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); |
586 | if (buf == NULL) |
587 | return NULL; |
588 | utf8 = PyUnicode_AsUTF8String(buf); |
589 | Py_DECREF(buf); |
590 | return utf8; |
591 | } |
592 | |
593 | |
594 | static char * |
595 | translate_newlines(const char *s, int exec_input, struct tok_state *tok) { |
596 | int skip_next_lf = 0; |
597 | size_t needed_length = strlen(s) + 2, final_length; |
598 | char *buf, *current; |
599 | char c = '\0'; |
600 | buf = PyMem_Malloc(needed_length); |
601 | if (buf == NULL) { |
602 | tok->done = E_NOMEM; |
603 | return NULL; |
604 | } |
605 | for (current = buf; *s; s++, current++) { |
606 | c = *s; |
607 | if (skip_next_lf) { |
608 | skip_next_lf = 0; |
609 | if (c == '\n') { |
610 | c = *++s; |
611 | if (!c) |
612 | break; |
613 | } |
614 | } |
615 | if (c == '\r') { |
616 | skip_next_lf = 1; |
617 | c = '\n'; |
618 | } |
619 | *current = c; |
620 | } |
621 | /* If this is exec input, add a newline to the end of the string if |
622 | there isn't one already. */ |
623 | if (exec_input && c != '\n') { |
624 | *current = '\n'; |
625 | current++; |
626 | } |
627 | *current = '\0'; |
628 | final_length = current - buf + 1; |
629 | if (final_length < needed_length && final_length) { |
630 | /* should never fail */ |
631 | char* result = PyMem_Realloc(buf, final_length); |
632 | if (result == NULL) { |
633 | PyMem_Free(buf); |
634 | } |
635 | buf = result; |
636 | } |
637 | return buf; |
638 | } |
639 | |
640 | /* Decode a byte string STR for use as the buffer of TOK. |
641 | Look for encoding declarations inside STR, and record them |
642 | inside TOK. */ |
643 | |
static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    /* Normalize newlines in INPUT, handle a BOM and any PEP 263 coding
       declaration on the first two lines, and return the buffer recoded
       to UTF-8.  Returns NULL on error (tok->done set, or error_ret()
       already invoked). */
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};   /* ends of the first two lines */
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* BOM-selected encoding: recode the whole buffer to UTF-8 */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* locate the ends of the first two lines */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        /* declaration-selected encoding: recode to UTF-8 */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
700 | |
701 | /* Set up tokenizer for string */ |
702 | |
703 | struct tok_state * |
704 | PyTokenizer_FromString(const char *str, int exec_input) |
705 | { |
706 | struct tok_state *tok = tok_new(); |
707 | char *decoded; |
708 | |
709 | if (tok == NULL) |
710 | return NULL; |
711 | decoded = decode_str(str, exec_input, tok); |
712 | if (decoded == NULL) { |
713 | PyTokenizer_Free(tok); |
714 | return NULL; |
715 | } |
716 | |
717 | tok->buf = tok->cur = tok->inp = decoded; |
718 | tok->end = decoded; |
719 | return tok; |
720 | } |
721 | |
722 | /* Set up tokenizer for UTF-8 string */ |
723 | |
724 | struct tok_state * |
725 | PyTokenizer_FromUTF8(const char *str, int exec_input) |
726 | { |
727 | struct tok_state *tok = tok_new(); |
728 | char *translated; |
729 | if (tok == NULL) |
730 | return NULL; |
731 | tok->input = translated = translate_newlines(str, exec_input, tok); |
732 | if (translated == NULL) { |
733 | PyTokenizer_Free(tok); |
734 | return NULL; |
735 | } |
736 | tok->decoding_state = STATE_NORMAL; |
737 | tok->enc = NULL; |
738 | tok->str = translated; |
739 | tok->encoding = new_string("utf-8" , 5, tok); |
740 | if (!tok->encoding) { |
741 | PyTokenizer_Free(tok); |
742 | return NULL; |
743 | } |
744 | |
745 | tok->buf = tok->cur = tok->inp = translated; |
746 | tok->end = translated; |
747 | return tok; |
748 | } |
749 | |
750 | /* Set up tokenizer for file */ |
751 | |
752 | struct tok_state * |
753 | PyTokenizer_FromFile(FILE *fp, const char* enc, |
754 | const char *ps1, const char *ps2) |
755 | { |
756 | struct tok_state *tok = tok_new(); |
757 | if (tok == NULL) |
758 | return NULL; |
759 | if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { |
760 | PyTokenizer_Free(tok); |
761 | return NULL; |
762 | } |
763 | tok->cur = tok->inp = tok->buf; |
764 | tok->end = tok->buf + BUFSIZ; |
765 | tok->fp = fp; |
766 | tok->prompt = ps1; |
767 | tok->nextprompt = ps2; |
768 | if (enc != NULL) { |
769 | /* Must copy encoding declaration since it |
770 | gets copied into the parse tree. */ |
771 | tok->encoding = new_string(enc, strlen(enc), tok); |
772 | if (!tok->encoding) { |
773 | PyTokenizer_Free(tok); |
774 | return NULL; |
775 | } |
776 | tok->decoding_state = STATE_NORMAL; |
777 | } |
778 | return tok; |
779 | } |
780 | |
781 | /* Free a tok_state structure */ |
782 | |
783 | void |
784 | PyTokenizer_Free(struct tok_state *tok) |
785 | { |
786 | if (tok->encoding != NULL) { |
787 | PyMem_Free(tok->encoding); |
788 | } |
789 | Py_XDECREF(tok->decoding_readline); |
790 | Py_XDECREF(tok->decoding_buffer); |
791 | Py_XDECREF(tok->filename); |
792 | if (tok->fp != NULL && tok->buf != NULL) { |
793 | PyMem_Free(tok->buf); |
794 | } |
795 | if (tok->input) { |
796 | PyMem_Free(tok->input); |
797 | } |
798 | if (tok->interactive_src_start != NULL) { |
799 | PyMem_Free(tok->interactive_src_start); |
800 | } |
801 | PyMem_Free(tok); |
802 | } |
803 | |
static int
tok_readline_raw(struct tok_state *tok)
{
    /* Read straight from tok->fp (no codec) into the token buffer until
       a complete '\n'-terminated line or EOF has been stored.  Returns
       1 on success or EOF, 0 on error. */
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        char *line = Py_UniversalNewlineFgets(tok->inp,
                                              (int)(tok->end - tok->inp),
                                              tok->fp, NULL);
        if (line == NULL) {
            /* EOF: the caller detects it via tok->inp == tok->cur */
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp = strchr(tok->inp, '\0');
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');
    return 1;
}
828 | |
static int
tok_underflow_string(struct tok_state *tok) {
    /* Advance the buffer window to the next line of the in-memory input.
       Returns 1 when a new line is available, 0 at end of input
       (tok->done = E_EOF). */
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;                        /* include the newline */
    }
    else {
        /* last line has no trailing newline */
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        /* no token in progress: drop consumed text from the window */
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    tok->lineno++;
    tok->inp = end;
    return 1;
}
850 | |
static int
tok_underflow_interactive(struct tok_state *tok) {
    /* Fetch the next line from the interactive prompt (PyOS_Readline),
       recode it to UTF-8 when an encoding is set, and splice it into
       the token buffer.  Returns 1 on success, 0 on error, EOF or
       interrupt (tok->done records which). */
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        /* normalize \r / \r\n to \n */
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        /* after the first line, switch from ps1 to ps2 */
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        /* readline returned NULL: treat as interrupt */
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        /* empty string signals end of input */
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        /* a token is in progress (e.g. a multi-line string): append the
           new line to the existing buffer, preserving multi_line_start */
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        /* no token in progress: the new line replaces the buffer */
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}
934 | |
static int
tok_underflow_file(struct tok_state *tok) {
    /* Read the next line from tok->fp into the token buffer, determining
       the source encoding from a BOM and/or a coding declaration on the
       first two lines.  Returns 1 on success, 0 on error or EOF. */
    if (tok->start == NULL) {
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        /* nothing was read: end of file */
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            /* coding declarations may only appear on lines 1 and 2 */
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
993 | |
994 | #if defined(Py_DEBUG) |
995 | static void |
996 | print_escape(FILE *f, const char *s, Py_ssize_t size) |
997 | { |
998 | if (s == NULL) { |
999 | fputs("NULL" , f); |
1000 | return; |
1001 | } |
1002 | putc('"', f); |
1003 | while (size-- > 0) { |
1004 | unsigned char c = *s++; |
1005 | switch (c) { |
1006 | case '\n': fputs("\\n" , f); break; |
1007 | case '\r': fputs("\\r" , f); break; |
1008 | case '\t': fputs("\\t" , f); break; |
1009 | case '\f': fputs("\\f" , f); break; |
1010 | case '\'': fputs("\\'" , f); break; |
1011 | case '"': fputs("\\\"" , f); break; |
1012 | default: |
1013 | if (0x20 <= c && c <= 0x7f) |
1014 | putc(c, f); |
1015 | else |
1016 | fprintf(f, "\\x%02x" , c); |
1017 | } |
1018 | } |
1019 | putc('"', f); |
1020 | } |
1021 | #endif |
1022 | |
1023 | /* Get next char, updating state; error code goes into tok->done */ |
1024 | |
static int
tok_nextc(struct tok_state *tok)
{
    /* Return the next input character (as an unsigned byte value), or
       EOF on end of input or error (tok->done records which). */
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        /* buffer exhausted: refill from the appropriate source */
        if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
        }
        else {
            rc = tok_underflow_file(tok);
        }
#if defined(Py_DEBUG)
        if (Py_DebugFlag) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            /* underflow failed: consume everything and report EOF */
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;
    }
    Py_UNREACHABLE();
}
1060 | |
1061 | /* Back-up one character */ |
1062 | |
1063 | static void |
1064 | tok_backup(struct tok_state *tok, int c) |
1065 | { |
1066 | if (c != EOF) { |
1067 | if (--tok->cur < tok->buf) { |
1068 | Py_FatalError("tokenizer beginning of buffer" ); |
1069 | } |
1070 | if ((int)(unsigned char)*tok->cur != c) { |
1071 | Py_FatalError("tok_backup: wrong character" ); |
1072 | } |
1073 | } |
1074 | } |
1075 | |
/* Build and set a SyntaxError for the current position in `tok`.

   `col_offset` / `end_col_offset` give the column range of the error
   within the current line; -1 means "use the current position".
   Always sets tok->done to E_ERROR and returns ERRORTOKEN, so callers
   can `return _syntaxerror_range(...)` directly. */
static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    /* Decode the text from the start of the line up to the cursor. */
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace" );
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    /* If the cursor is not exactly at the end of the physical line,
       re-decode the whole line (up to the newline) for display. */
    Py_ssize_t line_len = strcspn(tok->line_start, "\n" );
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace" );
    }
    if (!errtext) {
        goto error;
    }

    /* The "N" code steals the errtext reference; errmsg ("O") is
       released below in all cases. */
    args = Py_BuildValue("(O(OiiNii))" , errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1122 | |
1123 | static int |
1124 | syntaxerror(struct tok_state *tok, const char *format, ...) |
1125 | { |
1126 | va_list vargs; |
1127 | #ifdef HAVE_STDARG_PROTOTYPES |
1128 | va_start(vargs, format); |
1129 | #else |
1130 | va_start(vargs); |
1131 | #endif |
1132 | int ret = _syntaxerror_range(tok, format, -1, -1, vargs); |
1133 | va_end(vargs); |
1134 | return ret; |
1135 | } |
1136 | |
1137 | static int |
1138 | syntaxerror_known_range(struct tok_state *tok, |
1139 | int col_offset, int end_col_offset, |
1140 | const char *format, ...) |
1141 | { |
1142 | va_list vargs; |
1143 | #ifdef HAVE_STDARG_PROTOTYPES |
1144 | va_start(vargs, format); |
1145 | #else |
1146 | va_start(vargs); |
1147 | #endif |
1148 | int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs); |
1149 | va_end(vargs); |
1150 | return ret; |
1151 | } |
1152 | |
1153 | |
1154 | |
1155 | static int |
1156 | indenterror(struct tok_state *tok) |
1157 | { |
1158 | tok->done = E_TABSPACE; |
1159 | tok->cur = tok->inp; |
1160 | return ERRORTOKEN; |
1161 | } |
1162 | |
/* Emit a DeprecationWarning with the formatted message.

   If warnings are configured to raise (e.g. -W error), the resulting
   DeprecationWarning is replaced with a SyntaxError for a more accurate
   report.  Returns 0 on success, -1 on error (tok->done set to E_ERROR). */
static int
parser_warn(struct tok_state *tok, const char *format, ...)
{
    PyObject *errmsg;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    /* Negative return means the warning machinery raised. */
    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U" , errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}
1197 | |
static int
lookahead(struct tok_state *tok, const char *test)
{
    /* Return 1 iff the upcoming input spells out `test` and is not
       followed by another identifier character.  All characters read
       are pushed back, so the stream position is left unchanged. */
    const char *p = test;
    for (;;) {
        int c = tok_nextc(tok);
        if (*p != '\0' && c == *p) {
            p++;
            continue;
        }

        int matched = (*p == '\0') && !is_potential_identifier_char(c);
        tok_backup(tok, c);
        while (p != test) {
            tok_backup(tok, *--p);
        }
        return matched;
    }
}
1220 | |
/* Check that character `c`, which terminates a numeric literal of the
   given `kind` ("decimal", "hexadecimal", ...), may legally follow it.
   Returns 1 if the literal ends validly, 0 after reporting an error. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * It allows to gradually deprecate existing valid code without adding
     * warning before error in most cases of invalid numeric literal (which
     * would be confusing and break existing tests).
     * Raise a syntax error with slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * other keyword or identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd" );
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse" );
    }
    else if (c == 'f') {
        r = lookahead(tok, "or" );
    }
    else if (c == 'i') {
        /* "if", "in", "is": one more character decides. */
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r" );
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot" );
    }
    if (r) {
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal" , kind)) {
            return 0;
        }
        /* Re-consume `c`: the literal remains valid for now. */
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal" , kind);
        return 0;
    }
    return 1;
}
1272 | |
1273 | /* Verify that the identifier follows PEP 3131. |
1274 | All identifier strings are guaranteed to be "ready" unicode objects. |
1275 | */ |
1276 | static int |
1277 | verify_identifier(struct tok_state *tok) |
1278 | { |
1279 | PyObject *s; |
1280 | if (tok->decoding_erred) |
1281 | return 0; |
1282 | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
1283 | if (s == NULL) { |
1284 | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
1285 | tok->done = E_DECODE; |
1286 | } |
1287 | else { |
1288 | tok->done = E_ERROR; |
1289 | } |
1290 | return 0; |
1291 | } |
1292 | Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); |
1293 | if (invalid < 0) { |
1294 | Py_DECREF(s); |
1295 | tok->done = E_ERROR; |
1296 | return 0; |
1297 | } |
1298 | assert(PyUnicode_GET_LENGTH(s) > 0); |
1299 | if (invalid < PyUnicode_GET_LENGTH(s)) { |
1300 | Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); |
1301 | if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { |
1302 | /* Determine the offset in UTF-8 encoded input */ |
1303 | Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); |
1304 | if (s != NULL) { |
1305 | Py_SETREF(s, PyUnicode_AsUTF8String(s)); |
1306 | } |
1307 | if (s == NULL) { |
1308 | tok->done = E_ERROR; |
1309 | return 0; |
1310 | } |
1311 | tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); |
1312 | } |
1313 | Py_DECREF(s); |
1314 | // PyUnicode_FromFormatV() does not support %X |
1315 | char hex[9]; |
1316 | (void)PyOS_snprintf(hex, sizeof(hex), "%04X" , ch); |
1317 | if (Py_UNICODE_ISPRINTABLE(ch)) { |
1318 | syntaxerror(tok, "invalid character '%c' (U+%s)" , ch, hex); |
1319 | } |
1320 | else { |
1321 | syntaxerror(tok, "invalid non-printable character U+%s" , hex); |
1322 | } |
1323 | return 0; |
1324 | } |
1325 | Py_DECREF(s); |
1326 | return 1; |
1327 | } |
1328 | |
static int
tok_decimal_tail(struct tok_state *tok)
{
    /* Consume the remainder of a decimal digit run: digits optionally
       grouped by single underscores.  Returns the first character past
       the run, or 0 after reporting "invalid decimal literal" for an
       underscore that is not followed by a digit. */
    int c = tok_nextc(tok);
    for (;;) {
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1350 | |
1351 | /* Get next token, after space stripping etc. */ |
1352 | |
1353 | static inline int |
1354 | tok_continuation_line(struct tok_state *tok) { |
1355 | int c = tok_nextc(tok); |
1356 | if (c != '\n') { |
1357 | tok->done = E_LINECONT; |
1358 | return -1; |
1359 | } |
1360 | c = tok_nextc(tok); |
1361 | if (c == EOF) { |
1362 | tok->done = E_EOF; |
1363 | tok->cur = tok->inp; |
1364 | return -1; |
1365 | } else { |
1366 | tok_backup(tok, c); |
1367 | } |
1368 | return c; |
1369 | } |
1370 | |
/* Scan and return the next token from `tok`.

   Returns the token type (NAME, NUMBER, STRING, OP, NEWLINE, INDENT,
   DEDENT, ENDMARKER, ERRORTOKEN, ...) and sets *p_start / *p_end to the
   token's text within the input buffer.  On error, ERRORTOKEN is
   returned with tok->done describing the failure. */
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        int cont_line_col = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014')  {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return ERRORTOKEN;
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }

        if (tok->type_comments) {
            /* Match against type_comment_prefix, where a space in the
               prefix matches zero or more spaces/tabs in the comment. */
            p = tok->start;
            prefix = type_comment_prefix;
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                    }
                } else if (*prefix == *p) {
                    p++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                const char *ignore_end = p + 6;
                tok_backup(tok, c);  /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore" , 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    *p_start = ignore_end;
                    *p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return TYPE_IGNORE;
                } else {
                    *p_start = type_start;  /* after type_comment_prefix */
                    *p_end = tok->cur;
                    return TYPE_COMMENT;
                }
            }
        }
    }

    /* Interactive input was explicitly ended. */
    if (tok->done == E_INTERACT_STOP) {
        return ENDMARKER;
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        if (tok->level) {
            return ERRORTOKEN;
        }
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }

        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token.  For Python 3.7 or
               later we recognize them unconditionally.  For Python
               3.5 or 3.6 we recognize 'async' in front of 'def', and
               either one inside of 'async def'.  (Technically we
               shouldn't recognize these at all for 3.4 or earlier,
               but there's no *valid* Python 3.4 code that would be
               rejected, and async functions will be rejected in a
               later phase.) */
            if (!tok->async_hacks || tok->async_def) {
                /* Always recognize the keywords. */
                if (memcmp(tok->start, "async" , 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await" , 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async" , 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token to see if that is 'def'. */

                struct tok_state ahead_tok;
                const char *ahead_tok_start = NULL;
                const char *ahead_tok_end = NULL;
                int ahead_tok_kind;

                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def" , 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning a plain NAME token, return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok_backup(tok, c);
                        return syntaxerror(tok, "invalid hexadecimal literal" );
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal" )) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in octal literal" , c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid octal literal" );
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in octal literal" , c);
                }
                if (!verify_end_of_number(tok, c, "octal" )) {
                    return ERRORTOKEN;
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in binary literal" , c);
                        }
                        else {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid binary literal" );
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in binary literal" , c);
                }
                if (!verify_end_of_number(tok, c, "binary" )) {
                    return ERRORTOKEN;
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal" );
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                char* zeros_end = tok->cur;
                if (isdigit(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return syntaxerror_known_range(
                            tok, (int)(tok->start + 1 - tok->line_start),
                            (int)(zeros_end - tok->line_start),
                            "leading zeros in decimal integer "
                            "literals are not permitted; "
                            "use an 0o prefix for octal integers" );
                }
                if (!verify_end_of_number(tok, c, "decimal" )) {
                    return ERRORTOKEN;
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal" );
                        }
                    } else if (!isdigit(c)) {
                        /* Not an exponent after all: the 'e' starts the
                           next token. */
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal" )) {
                            return ERRORTOKEN;
                        }
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary" )) {
                        return ERRORTOKEN;
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal" )) {
                    return ERRORTOKEN;
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;
                if (quote_size == 3) {
                    syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)" , start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return ERRORTOKEN;
                }
                else {
                    syntaxerror(tok, "unterminated string literal (detected at"
                                     " line %d)" , start);
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return ERRORTOKEN;
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);  /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses" );
        }
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        if (!tok->level) {
            return syntaxerror(tok, "unmatched '%c'" , c);
        }
        tok->level--;
        int opening = tok->parenstack[tok->level];
        if (!((opening == '(' && c == ')') ||
              (opening == '[' && c == ']') ||
              (opening == '{' && c == '}')))
        {
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c' on line %d" ,
                        c, opening, tok->parenlinenostack[tok->level]);
            }
            else {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c'" ,
                        c, opening);
            }
        }
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
2078 | |
2079 | int |
2080 | PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end) |
2081 | { |
2082 | int result = tok_get(tok, p_start, p_end); |
2083 | if (tok->decoding_erred) { |
2084 | result = ERRORTOKEN; |
2085 | tok->done = E_DECODE; |
2086 | } |
2087 | return result; |
2088 | } |
2089 | |
2090 | /* Get the encoding of a Python file. Check for the coding cookie and check if |
2091 | the file starts with a BOM. |
2092 | |
2093 | PyTokenizer_FindEncodingFilename() returns NULL when it can't find the |
2094 | encoding in the first or second line of the file (in which case the encoding |
2095 | should be assumed to be UTF-8). |
2096 | |
2097 | The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed |
2098 | by the caller. */ |
2099 | |
2100 | char * |
2101 | PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) |
2102 | { |
2103 | struct tok_state *tok; |
2104 | FILE *fp; |
2105 | const char *p_start = NULL; |
2106 | const char *p_end = NULL; |
2107 | char *encoding = NULL; |
2108 | |
2109 | fd = _Py_dup(fd); |
2110 | if (fd < 0) { |
2111 | return NULL; |
2112 | } |
2113 | |
2114 | fp = fdopen(fd, "r" ); |
2115 | if (fp == NULL) { |
2116 | return NULL; |
2117 | } |
2118 | tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL); |
2119 | if (tok == NULL) { |
2120 | fclose(fp); |
2121 | return NULL; |
2122 | } |
2123 | if (filename != NULL) { |
2124 | Py_INCREF(filename); |
2125 | tok->filename = filename; |
2126 | } |
2127 | else { |
2128 | tok->filename = PyUnicode_FromString("<string>" ); |
2129 | if (tok->filename == NULL) { |
2130 | fclose(fp); |
2131 | PyTokenizer_Free(tok); |
2132 | return encoding; |
2133 | } |
2134 | } |
2135 | while (tok->lineno < 2 && tok->done == E_OK) { |
2136 | PyTokenizer_Get(tok, &p_start, &p_end); |
2137 | } |
2138 | fclose(fp); |
2139 | if (tok->encoding) { |
2140 | encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1); |
2141 | if (encoding) { |
2142 | strcpy(encoding, tok->encoding); |
2143 | } |
2144 | } |
2145 | PyTokenizer_Free(tok); |
2146 | return encoding; |
2147 | } |
2148 | |
/* Convenience wrapper around PyTokenizer_FindEncodingFilename() with no
   filename attached (errors will report "<string>"). */
char *
PyTokenizer_FindEncoding(int fd)
{
    return PyTokenizer_FindEncodingFilename(fd, NULL);
}
2154 | |
2155 | #ifdef Py_DEBUG |
2156 | |
2157 | void |
2158 | tok_dump(int type, char *start, char *end) |
2159 | { |
2160 | fprintf(stderr, "%s" , _PyParser_TokenNames[type]); |
2161 | if (type == NAME || type == NUMBER || type == STRING || type == OP) |
2162 | fprintf(stderr, "(%.*s)" , (int)(end - start), start); |
2163 | } |
2164 | |
2165 | #endif |
2166 | |