1 | /* |
2 | * Secret Labs' Regular Expression Engine |
3 | * |
4 | * regular expression matching engine |
5 | * |
6 | * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. |
7 | * |
8 | * See the _sre.c file for information on usage and redistribution. |
9 | */ |
10 | |
11 | /* String matching engine */ |
12 | |
13 | /* This file is included three times, with different character settings */ |
14 | |
15 | LOCAL(int) |
16 | SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) |
17 | { |
18 | /* check if pointer is at given position */ |
19 | |
20 | Py_ssize_t thisp, thatp; |
21 | |
22 | switch (at) { |
23 | |
24 | case SRE_AT_BEGINNING: |
25 | case SRE_AT_BEGINNING_STRING: |
26 | return ((void*) ptr == state->beginning); |
27 | |
28 | case SRE_AT_BEGINNING_LINE: |
29 | return ((void*) ptr == state->beginning || |
30 | SRE_IS_LINEBREAK((int) ptr[-1])); |
31 | |
32 | case SRE_AT_END: |
33 | return (((SRE_CHAR *)state->end - ptr == 1 && |
34 | SRE_IS_LINEBREAK((int) ptr[0])) || |
35 | ((void*) ptr == state->end)); |
36 | |
37 | case SRE_AT_END_LINE: |
38 | return ((void*) ptr == state->end || |
39 | SRE_IS_LINEBREAK((int) ptr[0])); |
40 | |
41 | case SRE_AT_END_STRING: |
42 | return ((void*) ptr == state->end); |
43 | |
44 | case SRE_AT_BOUNDARY: |
45 | if (state->beginning == state->end) |
46 | return 0; |
47 | thatp = ((void*) ptr > state->beginning) ? |
48 | SRE_IS_WORD((int) ptr[-1]) : 0; |
49 | thisp = ((void*) ptr < state->end) ? |
50 | SRE_IS_WORD((int) ptr[0]) : 0; |
51 | return thisp != thatp; |
52 | |
53 | case SRE_AT_NON_BOUNDARY: |
54 | if (state->beginning == state->end) |
55 | return 0; |
56 | thatp = ((void*) ptr > state->beginning) ? |
57 | SRE_IS_WORD((int) ptr[-1]) : 0; |
58 | thisp = ((void*) ptr < state->end) ? |
59 | SRE_IS_WORD((int) ptr[0]) : 0; |
60 | return thisp == thatp; |
61 | |
62 | case SRE_AT_LOC_BOUNDARY: |
63 | if (state->beginning == state->end) |
64 | return 0; |
65 | thatp = ((void*) ptr > state->beginning) ? |
66 | SRE_LOC_IS_WORD((int) ptr[-1]) : 0; |
67 | thisp = ((void*) ptr < state->end) ? |
68 | SRE_LOC_IS_WORD((int) ptr[0]) : 0; |
69 | return thisp != thatp; |
70 | |
71 | case SRE_AT_LOC_NON_BOUNDARY: |
72 | if (state->beginning == state->end) |
73 | return 0; |
74 | thatp = ((void*) ptr > state->beginning) ? |
75 | SRE_LOC_IS_WORD((int) ptr[-1]) : 0; |
76 | thisp = ((void*) ptr < state->end) ? |
77 | SRE_LOC_IS_WORD((int) ptr[0]) : 0; |
78 | return thisp == thatp; |
79 | |
80 | case SRE_AT_UNI_BOUNDARY: |
81 | if (state->beginning == state->end) |
82 | return 0; |
83 | thatp = ((void*) ptr > state->beginning) ? |
84 | SRE_UNI_IS_WORD((int) ptr[-1]) : 0; |
85 | thisp = ((void*) ptr < state->end) ? |
86 | SRE_UNI_IS_WORD((int) ptr[0]) : 0; |
87 | return thisp != thatp; |
88 | |
89 | case SRE_AT_UNI_NON_BOUNDARY: |
90 | if (state->beginning == state->end) |
91 | return 0; |
92 | thatp = ((void*) ptr > state->beginning) ? |
93 | SRE_UNI_IS_WORD((int) ptr[-1]) : 0; |
94 | thisp = ((void*) ptr < state->end) ? |
95 | SRE_UNI_IS_WORD((int) ptr[0]) : 0; |
96 | return thisp == thatp; |
97 | |
98 | } |
99 | |
100 | return 0; |
101 | } |
102 | |
103 | LOCAL(int) |
104 | SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch) |
105 | { |
106 | /* check if character is a member of the given set */ |
107 | |
108 | int ok = 1; |
109 | |
110 | for (;;) { |
111 | switch (*set++) { |
112 | |
113 | case SRE_OP_FAILURE: |
114 | return !ok; |
115 | |
116 | case SRE_OP_LITERAL: |
117 | /* <LITERAL> <code> */ |
118 | if (ch == set[0]) |
119 | return ok; |
120 | set++; |
121 | break; |
122 | |
123 | case SRE_OP_CATEGORY: |
124 | /* <CATEGORY> <code> */ |
125 | if (sre_category(set[0], (int) ch)) |
126 | return ok; |
127 | set++; |
128 | break; |
129 | |
130 | case SRE_OP_CHARSET: |
131 | /* <CHARSET> <bitmap> */ |
132 | if (ch < 256 && |
133 | (set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1))))) |
134 | return ok; |
135 | set += 256/SRE_CODE_BITS; |
136 | break; |
137 | |
138 | case SRE_OP_RANGE: |
139 | /* <RANGE> <lower> <upper> */ |
140 | if (set[0] <= ch && ch <= set[1]) |
141 | return ok; |
142 | set += 2; |
143 | break; |
144 | |
145 | case SRE_OP_RANGE_UNI_IGNORE: |
146 | /* <RANGE_UNI_IGNORE> <lower> <upper> */ |
147 | { |
148 | SRE_CODE uch; |
149 | /* ch is already lower cased */ |
150 | if (set[0] <= ch && ch <= set[1]) |
151 | return ok; |
152 | uch = sre_upper_unicode(ch); |
153 | if (set[0] <= uch && uch <= set[1]) |
154 | return ok; |
155 | set += 2; |
156 | break; |
157 | } |
158 | |
159 | case SRE_OP_NEGATE: |
160 | ok = !ok; |
161 | break; |
162 | |
163 | case SRE_OP_BIGCHARSET: |
164 | /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */ |
165 | { |
166 | Py_ssize_t count, block; |
167 | count = *(set++); |
168 | |
169 | if (ch < 0x10000u) |
170 | block = ((unsigned char*)set)[ch >> 8]; |
171 | else |
172 | block = -1; |
173 | set += 256/sizeof(SRE_CODE); |
174 | if (block >=0 && |
175 | (set[(block * 256 + (ch & 255))/SRE_CODE_BITS] & |
176 | (1u << (ch & (SRE_CODE_BITS-1))))) |
177 | return ok; |
178 | set += count * (256/SRE_CODE_BITS); |
179 | break; |
180 | } |
181 | |
182 | default: |
183 | /* internal error -- there's not much we can do about it |
184 | here, so let's just pretend it didn't match... */ |
185 | return 0; |
186 | } |
187 | } |
188 | } |
189 | |
190 | LOCAL(int) |
191 | SRE(charset_loc_ignore)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch) |
192 | { |
193 | SRE_CODE lo, up; |
194 | lo = sre_lower_locale(ch); |
195 | if (SRE(charset)(state, set, lo)) |
196 | return 1; |
197 | |
198 | up = sre_upper_locale(ch); |
199 | return up != lo && SRE(charset)(state, set, up); |
200 | } |
201 | |
202 | LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel); |
203 | |
204 | LOCAL(Py_ssize_t) |
205 | SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount) |
206 | { |
207 | SRE_CODE chr; |
208 | SRE_CHAR c; |
209 | const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr; |
210 | const SRE_CHAR* end = (const SRE_CHAR *)state->end; |
211 | Py_ssize_t i; |
212 | |
213 | /* adjust end */ |
214 | if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT) |
215 | end = ptr + maxcount; |
216 | |
217 | switch (pattern[0]) { |
218 | |
219 | case SRE_OP_IN: |
220 | /* repeated set */ |
221 | TRACE(("|%p|%p|COUNT IN\n" , pattern, ptr)); |
222 | while (ptr < end && SRE(charset)(state, pattern + 2, *ptr)) |
223 | ptr++; |
224 | break; |
225 | |
226 | case SRE_OP_ANY: |
227 | /* repeated dot wildcard. */ |
228 | TRACE(("|%p|%p|COUNT ANY\n" , pattern, ptr)); |
229 | while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) |
230 | ptr++; |
231 | break; |
232 | |
233 | case SRE_OP_ANY_ALL: |
234 | /* repeated dot wildcard. skip to the end of the target |
235 | string, and backtrack from there */ |
236 | TRACE(("|%p|%p|COUNT ANY_ALL\n" , pattern, ptr)); |
237 | ptr = end; |
238 | break; |
239 | |
240 | case SRE_OP_LITERAL: |
241 | /* repeated literal */ |
242 | chr = pattern[1]; |
243 | TRACE(("|%p|%p|COUNT LITERAL %d\n" , pattern, ptr, chr)); |
244 | c = (SRE_CHAR) chr; |
245 | #if SIZEOF_SRE_CHAR < 4 |
246 | if ((SRE_CODE) c != chr) |
247 | ; /* literal can't match: doesn't fit in char width */ |
248 | else |
249 | #endif |
250 | while (ptr < end && *ptr == c) |
251 | ptr++; |
252 | break; |
253 | |
254 | case SRE_OP_LITERAL_IGNORE: |
255 | /* repeated literal */ |
256 | chr = pattern[1]; |
257 | TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n" , pattern, ptr, chr)); |
258 | while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr) |
259 | ptr++; |
260 | break; |
261 | |
262 | case SRE_OP_LITERAL_UNI_IGNORE: |
263 | /* repeated literal */ |
264 | chr = pattern[1]; |
265 | TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n" , pattern, ptr, chr)); |
266 | while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr) |
267 | ptr++; |
268 | break; |
269 | |
270 | case SRE_OP_LITERAL_LOC_IGNORE: |
271 | /* repeated literal */ |
272 | chr = pattern[1]; |
273 | TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n" , pattern, ptr, chr)); |
274 | while (ptr < end && char_loc_ignore(chr, *ptr)) |
275 | ptr++; |
276 | break; |
277 | |
278 | case SRE_OP_NOT_LITERAL: |
279 | /* repeated non-literal */ |
280 | chr = pattern[1]; |
281 | TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n" , pattern, ptr, chr)); |
282 | c = (SRE_CHAR) chr; |
283 | #if SIZEOF_SRE_CHAR < 4 |
284 | if ((SRE_CODE) c != chr) |
285 | ptr = end; /* literal can't match: doesn't fit in char width */ |
286 | else |
287 | #endif |
288 | while (ptr < end && *ptr != c) |
289 | ptr++; |
290 | break; |
291 | |
292 | case SRE_OP_NOT_LITERAL_IGNORE: |
293 | /* repeated non-literal */ |
294 | chr = pattern[1]; |
295 | TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n" , pattern, ptr, chr)); |
296 | while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr) |
297 | ptr++; |
298 | break; |
299 | |
300 | case SRE_OP_NOT_LITERAL_UNI_IGNORE: |
301 | /* repeated non-literal */ |
302 | chr = pattern[1]; |
303 | TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n" , pattern, ptr, chr)); |
304 | while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr) |
305 | ptr++; |
306 | break; |
307 | |
308 | case SRE_OP_NOT_LITERAL_LOC_IGNORE: |
309 | /* repeated non-literal */ |
310 | chr = pattern[1]; |
311 | TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n" , pattern, ptr, chr)); |
312 | while (ptr < end && !char_loc_ignore(chr, *ptr)) |
313 | ptr++; |
314 | break; |
315 | |
316 | default: |
317 | /* repeated single character pattern */ |
318 | TRACE(("|%p|%p|COUNT SUBPATTERN\n" , pattern, ptr)); |
319 | while ((SRE_CHAR*) state->ptr < end) { |
320 | i = SRE(match)(state, pattern, 0); |
321 | if (i < 0) |
322 | return i; |
323 | if (!i) |
324 | break; |
325 | } |
326 | TRACE(("|%p|%p|COUNT %zd\n" , pattern, ptr, |
327 | (SRE_CHAR*) state->ptr - ptr)); |
328 | return (SRE_CHAR*) state->ptr - ptr; |
329 | } |
330 | |
331 | TRACE(("|%p|%p|COUNT %zd\n" , pattern, ptr, |
332 | ptr - (SRE_CHAR*) state->ptr)); |
333 | return ptr - (SRE_CHAR*) state->ptr; |
334 | } |
335 | |
336 | #if 0 /* not used in this release */ |
337 | LOCAL(int) |
338 | SRE(info)(SRE_STATE* state, const SRE_CODE* pattern) |
339 | { |
340 | /* check if an SRE_OP_INFO block matches at the current position. |
341 | returns the number of SRE_CODE objects to skip if successful, 0 |
342 | if no match */ |
343 | |
344 | const SRE_CHAR* end = (const SRE_CHAR*) state->end; |
345 | const SRE_CHAR* ptr = (const SRE_CHAR*) state->ptr; |
346 | Py_ssize_t i; |
347 | |
348 | /* check minimal length */ |
349 | if (pattern[3] && end - ptr < pattern[3]) |
350 | return 0; |
351 | |
352 | /* check known prefix */ |
353 | if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { |
354 | /* <length> <skip> <prefix data> <overlap data> */ |
355 | for (i = 0; i < pattern[5]; i++) |
356 | if ((SRE_CODE) ptr[i] != pattern[7 + i]) |
357 | return 0; |
358 | return pattern[0] + 2 * pattern[6]; |
359 | } |
360 | return pattern[0]; |
361 | } |
362 | #endif |
363 | |
364 | /* The macros below should be used to protect recursive SRE(match)() |
365 | * calls that *failed* and do *not* return immediately (IOW, those |
366 | * that will backtrack). Explaining: |
367 | * |
368 | * - Recursive SRE(match)() returned true: that's usually a success |
369 | * (besides atypical cases like ASSERT_NOT), therefore there's no |
370 | * reason to restore lastmark; |
371 | * |
372 | * - Recursive SRE(match)() returned false but the current SRE(match)() |
373 | * is returning to the caller: If the current SRE(match)() is the |
374 | * top function of the recursion, returning false will be a matching |
375 | * failure, and it doesn't matter where lastmark is pointing to. |
376 | * If it's *not* the top function, it will be a recursive SRE(match)() |
377 | * failure by itself, and the calling SRE(match)() will have to deal |
378 | * with the failure by the same rules explained here (it will restore |
379 | * lastmark by itself if necessary); |
380 | * |
381 | * - Recursive SRE(match)() returned false, and will continue the |
382 | * outside 'for' loop: must be protected when breaking, since the next |
383 | * OP could potentially depend on lastmark; |
384 | * |
385 | * - Recursive SRE(match)() returned false, and will be called again |
386 | * inside a local for/while loop: must be protected between each |
387 | * loop iteration, since the recursive SRE(match)() could do anything, |
388 | * and could potentially depend on lastmark. |
389 | * |
390 | * For more information, check the discussion at SF patch #712900. |
391 | */ |
392 | #define LASTMARK_SAVE() \ |
393 | do { \ |
394 | ctx->lastmark = state->lastmark; \ |
395 | ctx->lastindex = state->lastindex; \ |
396 | } while (0) |
397 | #define LASTMARK_RESTORE() \ |
398 | do { \ |
399 | state->lastmark = ctx->lastmark; \ |
400 | state->lastindex = ctx->lastindex; \ |
401 | } while (0) |
402 | |
403 | #define RETURN_ERROR(i) do { return i; } while(0) |
404 | #define RETURN_FAILURE do { ret = 0; goto exit; } while(0) |
405 | #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0) |
406 | |
407 | #define RETURN_ON_ERROR(i) \ |
408 | do { if (i < 0) RETURN_ERROR(i); } while (0) |
409 | #define RETURN_ON_SUCCESS(i) \ |
410 | do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0) |
411 | #define RETURN_ON_FAILURE(i) \ |
412 | do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0) |
413 | |
414 | #define DATA_STACK_ALLOC(state, type, ptr) \ |
415 | do { \ |
416 | alloc_pos = state->data_stack_base; \ |
417 | TRACE(("allocating %s in %zd (%zd)\n", \ |
418 | Py_STRINGIFY(type), alloc_pos, sizeof(type))); \ |
419 | if (sizeof(type) > state->data_stack_size - alloc_pos) { \ |
420 | int j = data_stack_grow(state, sizeof(type)); \ |
421 | if (j < 0) return j; \ |
422 | if (ctx_pos != -1) \ |
423 | DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \ |
424 | } \ |
425 | ptr = (type*)(state->data_stack+alloc_pos); \ |
426 | state->data_stack_base += sizeof(type); \ |
427 | } while (0) |
428 | |
429 | #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \ |
430 | do { \ |
431 | TRACE(("looking up %s at %zd\n", Py_STRINGIFY(type), pos)); \ |
432 | ptr = (type*)(state->data_stack+pos); \ |
433 | } while (0) |
434 | |
435 | #define DATA_STACK_PUSH(state, data, size) \ |
436 | do { \ |
437 | TRACE(("copy data in %p to %zd (%zd)\n", \ |
438 | data, state->data_stack_base, size)); \ |
439 | if (size > state->data_stack_size - state->data_stack_base) { \ |
440 | int j = data_stack_grow(state, size); \ |
441 | if (j < 0) return j; \ |
442 | if (ctx_pos != -1) \ |
443 | DATA_STACK_LOOKUP_AT(state, SRE(match_context), ctx, ctx_pos); \ |
444 | } \ |
445 | memcpy(state->data_stack+state->data_stack_base, data, size); \ |
446 | state->data_stack_base += size; \ |
447 | } while (0) |
448 | |
449 | /* We add an explicit cast to memcpy here because MSVC has a bug when |
450 | compiling C code where it believes that `const void**` cannot be |
451 | safely casted to `void*`, see bpo-39943 for details. */ |
452 | #define DATA_STACK_POP(state, data, size, discard) \ |
453 | do { \ |
454 | TRACE(("copy data to %p from %zd (%zd)\n", \ |
455 | data, state->data_stack_base-size, size)); \ |
456 | memcpy((void*) data, state->data_stack+state->data_stack_base-size, size); \ |
457 | if (discard) \ |
458 | state->data_stack_base -= size; \ |
459 | } while (0) |
460 | |
461 | #define DATA_STACK_POP_DISCARD(state, size) \ |
462 | do { \ |
463 | TRACE(("discard data from %zd (%zd)\n", \ |
464 | state->data_stack_base-size, size)); \ |
465 | state->data_stack_base -= size; \ |
466 | } while(0) |
467 | |
468 | #define DATA_PUSH(x) \ |
469 | DATA_STACK_PUSH(state, (x), sizeof(*(x))) |
470 | #define DATA_POP(x) \ |
471 | DATA_STACK_POP(state, (x), sizeof(*(x)), 1) |
472 | #define DATA_POP_DISCARD(x) \ |
473 | DATA_STACK_POP_DISCARD(state, sizeof(*(x))) |
474 | #define DATA_ALLOC(t,p) \ |
475 | DATA_STACK_ALLOC(state, t, p) |
476 | #define DATA_LOOKUP_AT(t,p,pos) \ |
477 | DATA_STACK_LOOKUP_AT(state,t,p,pos) |
478 | |
479 | #define MARK_PUSH(lastmark) \ |
480 | do if (lastmark > 0) { \ |
481 | i = lastmark; /* ctx->lastmark may change if reallocated */ \ |
482 | DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ |
483 | } while (0) |
484 | #define MARK_POP(lastmark) \ |
485 | do if (lastmark > 0) { \ |
486 | DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ |
487 | } while (0) |
488 | #define MARK_POP_KEEP(lastmark) \ |
489 | do if (lastmark > 0) { \ |
490 | DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ |
491 | } while (0) |
492 | #define MARK_POP_DISCARD(lastmark) \ |
493 | do if (lastmark > 0) { \ |
494 | DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ |
495 | } while (0) |
496 | |
497 | #define JUMP_NONE 0 |
498 | #define JUMP_MAX_UNTIL_1 1 |
499 | #define JUMP_MAX_UNTIL_2 2 |
500 | #define JUMP_MAX_UNTIL_3 3 |
501 | #define JUMP_MIN_UNTIL_1 4 |
502 | #define JUMP_MIN_UNTIL_2 5 |
503 | #define JUMP_MIN_UNTIL_3 6 |
504 | #define JUMP_REPEAT 7 |
505 | #define JUMP_REPEAT_ONE_1 8 |
506 | #define JUMP_REPEAT_ONE_2 9 |
507 | #define JUMP_MIN_REPEAT_ONE 10 |
508 | #define JUMP_BRANCH 11 |
509 | #define JUMP_ASSERT 12 |
510 | #define JUMP_ASSERT_NOT 13 |
511 | |
512 | #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ |
513 | DATA_ALLOC(SRE(match_context), nextctx); \ |
514 | nextctx->last_ctx_pos = ctx_pos; \ |
515 | nextctx->jump = jumpvalue; \ |
516 | nextctx->pattern = nextpattern; \ |
517 | nextctx->toplevel = toplevel_; \ |
518 | ctx_pos = alloc_pos; \ |
519 | ctx = nextctx; \ |
520 | goto entrance; \ |
521 | jumplabel: \ |
522 | while (0) /* gcc doesn't like labels at end of scopes */ \ |
523 | |
524 | #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \ |
525 | DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel) |
526 | |
527 | #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \ |
528 | DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) |
529 | |
530 | typedef struct { |
531 | Py_ssize_t last_ctx_pos; |
532 | Py_ssize_t jump; |
533 | const SRE_CHAR* ptr; |
534 | const SRE_CODE* pattern; |
535 | Py_ssize_t count; |
536 | Py_ssize_t lastmark; |
537 | Py_ssize_t lastindex; |
538 | union { |
539 | SRE_CODE chr; |
540 | SRE_REPEAT* rep; |
541 | } u; |
542 | int toplevel; |
543 | } SRE(match_context); |
544 | |
545 | /* check if string matches the given pattern. returns <0 for |
546 | error, 0 for failure, and 1 for success */ |
547 | LOCAL(Py_ssize_t) |
548 | SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) |
549 | { |
550 | const SRE_CHAR* end = (const SRE_CHAR *)state->end; |
551 | Py_ssize_t alloc_pos, ctx_pos = -1; |
552 | Py_ssize_t i, ret = 0; |
553 | Py_ssize_t jump; |
554 | unsigned int sigcount=0; |
555 | |
556 | SRE(match_context)* ctx; |
557 | SRE(match_context)* nextctx; |
558 | |
559 | TRACE(("|%p|%p|ENTER\n" , pattern, state->ptr)); |
560 | |
561 | DATA_ALLOC(SRE(match_context), ctx); |
562 | ctx->last_ctx_pos = -1; |
563 | ctx->jump = JUMP_NONE; |
564 | ctx->pattern = pattern; |
565 | ctx->toplevel = toplevel; |
566 | ctx_pos = alloc_pos; |
567 | |
568 | entrance: |
569 | |
570 | ctx->ptr = (SRE_CHAR *)state->ptr; |
571 | |
572 | if (ctx->pattern[0] == SRE_OP_INFO) { |
573 | /* optimization info block */ |
574 | /* <INFO> <1=skip> <2=flags> <3=min> ... */ |
575 | if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) { |
576 | TRACE(("reject (got %zd chars, need %zd)\n" , |
577 | end - ctx->ptr, (Py_ssize_t) ctx->pattern[3])); |
578 | RETURN_FAILURE; |
579 | } |
580 | ctx->pattern += ctx->pattern[1] + 1; |
581 | } |
582 | |
583 | for (;;) { |
584 | ++sigcount; |
585 | if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals()) |
586 | RETURN_ERROR(SRE_ERROR_INTERRUPTED); |
587 | |
588 | switch (*ctx->pattern++) { |
589 | |
590 | case SRE_OP_MARK: |
591 | /* set mark */ |
592 | /* <MARK> <gid> */ |
593 | TRACE(("|%p|%p|MARK %d\n" , ctx->pattern, |
594 | ctx->ptr, ctx->pattern[0])); |
595 | i = ctx->pattern[0]; |
596 | if (i & 1) |
597 | state->lastindex = i/2 + 1; |
598 | if (i > state->lastmark) { |
599 | /* state->lastmark is the highest valid index in the |
600 | state->mark array. If it is increased by more than 1, |
601 | the intervening marks must be set to NULL to signal |
602 | that these marks have not been encountered. */ |
603 | Py_ssize_t j = state->lastmark + 1; |
604 | while (j < i) |
605 | state->mark[j++] = NULL; |
606 | state->lastmark = i; |
607 | } |
608 | state->mark[i] = ctx->ptr; |
609 | ctx->pattern++; |
610 | break; |
611 | |
612 | case SRE_OP_LITERAL: |
613 | /* match literal string */ |
614 | /* <LITERAL> <code> */ |
615 | TRACE(("|%p|%p|LITERAL %d\n" , ctx->pattern, |
616 | ctx->ptr, *ctx->pattern)); |
617 | if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) |
618 | RETURN_FAILURE; |
619 | ctx->pattern++; |
620 | ctx->ptr++; |
621 | break; |
622 | |
623 | case SRE_OP_NOT_LITERAL: |
624 | /* match anything that is not literal character */ |
625 | /* <NOT_LITERAL> <code> */ |
626 | TRACE(("|%p|%p|NOT_LITERAL %d\n" , ctx->pattern, |
627 | ctx->ptr, *ctx->pattern)); |
628 | if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) |
629 | RETURN_FAILURE; |
630 | ctx->pattern++; |
631 | ctx->ptr++; |
632 | break; |
633 | |
634 | case SRE_OP_SUCCESS: |
635 | /* end of pattern */ |
636 | TRACE(("|%p|%p|SUCCESS\n" , ctx->pattern, ctx->ptr)); |
637 | if (ctx->toplevel && |
638 | ((state->match_all && ctx->ptr != state->end) || |
639 | (state->must_advance && ctx->ptr == state->start))) |
640 | { |
641 | RETURN_FAILURE; |
642 | } |
643 | state->ptr = ctx->ptr; |
644 | RETURN_SUCCESS; |
645 | |
646 | case SRE_OP_AT: |
647 | /* match at given position */ |
648 | /* <AT> <code> */ |
649 | TRACE(("|%p|%p|AT %d\n" , ctx->pattern, ctx->ptr, *ctx->pattern)); |
650 | if (!SRE(at)(state, ctx->ptr, *ctx->pattern)) |
651 | RETURN_FAILURE; |
652 | ctx->pattern++; |
653 | break; |
654 | |
655 | case SRE_OP_CATEGORY: |
656 | /* match at given category */ |
657 | /* <CATEGORY> <code> */ |
658 | TRACE(("|%p|%p|CATEGORY %d\n" , ctx->pattern, |
659 | ctx->ptr, *ctx->pattern)); |
660 | if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) |
661 | RETURN_FAILURE; |
662 | ctx->pattern++; |
663 | ctx->ptr++; |
664 | break; |
665 | |
666 | case SRE_OP_ANY: |
667 | /* match anything (except a newline) */ |
668 | /* <ANY> */ |
669 | TRACE(("|%p|%p|ANY\n" , ctx->pattern, ctx->ptr)); |
670 | if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) |
671 | RETURN_FAILURE; |
672 | ctx->ptr++; |
673 | break; |
674 | |
675 | case SRE_OP_ANY_ALL: |
676 | /* match anything */ |
677 | /* <ANY_ALL> */ |
678 | TRACE(("|%p|%p|ANY_ALL\n" , ctx->pattern, ctx->ptr)); |
679 | if (ctx->ptr >= end) |
680 | RETURN_FAILURE; |
681 | ctx->ptr++; |
682 | break; |
683 | |
684 | case SRE_OP_IN: |
685 | /* match set member (or non_member) */ |
686 | /* <IN> <skip> <set> */ |
687 | TRACE(("|%p|%p|IN\n" , ctx->pattern, ctx->ptr)); |
688 | if (ctx->ptr >= end || |
689 | !SRE(charset)(state, ctx->pattern + 1, *ctx->ptr)) |
690 | RETURN_FAILURE; |
691 | ctx->pattern += ctx->pattern[0]; |
692 | ctx->ptr++; |
693 | break; |
694 | |
695 | case SRE_OP_LITERAL_IGNORE: |
696 | TRACE(("|%p|%p|LITERAL_IGNORE %d\n" , |
697 | ctx->pattern, ctx->ptr, ctx->pattern[0])); |
698 | if (ctx->ptr >= end || |
699 | sre_lower_ascii(*ctx->ptr) != *ctx->pattern) |
700 | RETURN_FAILURE; |
701 | ctx->pattern++; |
702 | ctx->ptr++; |
703 | break; |
704 | |
705 | case SRE_OP_LITERAL_UNI_IGNORE: |
706 | TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n" , |
707 | ctx->pattern, ctx->ptr, ctx->pattern[0])); |
708 | if (ctx->ptr >= end || |
709 | sre_lower_unicode(*ctx->ptr) != *ctx->pattern) |
710 | RETURN_FAILURE; |
711 | ctx->pattern++; |
712 | ctx->ptr++; |
713 | break; |
714 | |
715 | case SRE_OP_LITERAL_LOC_IGNORE: |
716 | TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n" , |
717 | ctx->pattern, ctx->ptr, ctx->pattern[0])); |
718 | if (ctx->ptr >= end |
719 | || !char_loc_ignore(*ctx->pattern, *ctx->ptr)) |
720 | RETURN_FAILURE; |
721 | ctx->pattern++; |
722 | ctx->ptr++; |
723 | break; |
724 | |
725 | case SRE_OP_NOT_LITERAL_IGNORE: |
726 | TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n" , |
727 | ctx->pattern, ctx->ptr, *ctx->pattern)); |
728 | if (ctx->ptr >= end || |
729 | sre_lower_ascii(*ctx->ptr) == *ctx->pattern) |
730 | RETURN_FAILURE; |
731 | ctx->pattern++; |
732 | ctx->ptr++; |
733 | break; |
734 | |
735 | case SRE_OP_NOT_LITERAL_UNI_IGNORE: |
736 | TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n" , |
737 | ctx->pattern, ctx->ptr, *ctx->pattern)); |
738 | if (ctx->ptr >= end || |
739 | sre_lower_unicode(*ctx->ptr) == *ctx->pattern) |
740 | RETURN_FAILURE; |
741 | ctx->pattern++; |
742 | ctx->ptr++; |
743 | break; |
744 | |
745 | case SRE_OP_NOT_LITERAL_LOC_IGNORE: |
746 | TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n" , |
747 | ctx->pattern, ctx->ptr, *ctx->pattern)); |
748 | if (ctx->ptr >= end |
749 | || char_loc_ignore(*ctx->pattern, *ctx->ptr)) |
750 | RETURN_FAILURE; |
751 | ctx->pattern++; |
752 | ctx->ptr++; |
753 | break; |
754 | |
755 | case SRE_OP_IN_IGNORE: |
756 | TRACE(("|%p|%p|IN_IGNORE\n" , ctx->pattern, ctx->ptr)); |
757 | if (ctx->ptr >= end |
758 | || !SRE(charset)(state, ctx->pattern+1, |
759 | (SRE_CODE)sre_lower_ascii(*ctx->ptr))) |
760 | RETURN_FAILURE; |
761 | ctx->pattern += ctx->pattern[0]; |
762 | ctx->ptr++; |
763 | break; |
764 | |
765 | case SRE_OP_IN_UNI_IGNORE: |
766 | TRACE(("|%p|%p|IN_UNI_IGNORE\n" , ctx->pattern, ctx->ptr)); |
767 | if (ctx->ptr >= end |
768 | || !SRE(charset)(state, ctx->pattern+1, |
769 | (SRE_CODE)sre_lower_unicode(*ctx->ptr))) |
770 | RETURN_FAILURE; |
771 | ctx->pattern += ctx->pattern[0]; |
772 | ctx->ptr++; |
773 | break; |
774 | |
775 | case SRE_OP_IN_LOC_IGNORE: |
776 | TRACE(("|%p|%p|IN_LOC_IGNORE\n" , ctx->pattern, ctx->ptr)); |
777 | if (ctx->ptr >= end |
778 | || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr)) |
779 | RETURN_FAILURE; |
780 | ctx->pattern += ctx->pattern[0]; |
781 | ctx->ptr++; |
782 | break; |
783 | |
784 | case SRE_OP_JUMP: |
785 | case SRE_OP_INFO: |
786 | /* jump forward */ |
787 | /* <JUMP> <offset> */ |
788 | TRACE(("|%p|%p|JUMP %d\n" , ctx->pattern, |
789 | ctx->ptr, ctx->pattern[0])); |
790 | ctx->pattern += ctx->pattern[0]; |
791 | break; |
792 | |
793 | case SRE_OP_BRANCH: |
794 | /* alternation */ |
795 | /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */ |
796 | TRACE(("|%p|%p|BRANCH\n" , ctx->pattern, ctx->ptr)); |
797 | LASTMARK_SAVE(); |
798 | ctx->u.rep = state->repeat; |
799 | if (ctx->u.rep) |
800 | MARK_PUSH(ctx->lastmark); |
801 | for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { |
802 | if (ctx->pattern[1] == SRE_OP_LITERAL && |
803 | (ctx->ptr >= end || |
804 | (SRE_CODE) *ctx->ptr != ctx->pattern[2])) |
805 | continue; |
806 | if (ctx->pattern[1] == SRE_OP_IN && |
807 | (ctx->ptr >= end || |
808 | !SRE(charset)(state, ctx->pattern + 3, |
809 | (SRE_CODE) *ctx->ptr))) |
810 | continue; |
811 | state->ptr = ctx->ptr; |
812 | DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); |
813 | if (ret) { |
814 | if (ctx->u.rep) |
815 | MARK_POP_DISCARD(ctx->lastmark); |
816 | RETURN_ON_ERROR(ret); |
817 | RETURN_SUCCESS; |
818 | } |
819 | if (ctx->u.rep) |
820 | MARK_POP_KEEP(ctx->lastmark); |
821 | LASTMARK_RESTORE(); |
822 | } |
823 | if (ctx->u.rep) |
824 | MARK_POP_DISCARD(ctx->lastmark); |
825 | RETURN_FAILURE; |
826 | |
827 | case SRE_OP_REPEAT_ONE: |
828 | /* match repeated sequence (maximizing regexp) */ |
829 | |
830 | /* this operator only works if the repeated item is |
831 | exactly one character wide, and we're not already |
832 | collecting backtracking points. for other cases, |
833 | use the MAX_REPEAT operator */ |
834 | |
835 | /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ |
836 | |
837 | TRACE(("|%p|%p|REPEAT_ONE %d %d\n" , ctx->pattern, ctx->ptr, |
838 | ctx->pattern[1], ctx->pattern[2])); |
839 | |
840 | if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr) |
841 | RETURN_FAILURE; /* cannot match */ |
842 | |
843 | state->ptr = ctx->ptr; |
844 | |
845 | ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]); |
846 | RETURN_ON_ERROR(ret); |
847 | DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); |
848 | ctx->count = ret; |
849 | ctx->ptr += ctx->count; |
850 | |
851 | /* when we arrive here, count contains the number of |
852 | matches, and ctx->ptr points to the tail of the target |
853 | string. check if the rest of the pattern matches, |
854 | and backtrack if not. */ |
855 | |
856 | if (ctx->count < (Py_ssize_t) ctx->pattern[1]) |
857 | RETURN_FAILURE; |
858 | |
859 | if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && |
860 | ctx->ptr == state->end && |
861 | !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) |
862 | { |
863 | /* tail is empty. we're finished */ |
864 | state->ptr = ctx->ptr; |
865 | RETURN_SUCCESS; |
866 | } |
867 | |
868 | LASTMARK_SAVE(); |
869 | |
870 | if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) { |
871 | /* tail starts with a literal. skip positions where |
872 | the rest of the pattern cannot possibly match */ |
873 | ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; |
874 | for (;;) { |
875 | while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && |
876 | (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { |
877 | ctx->ptr--; |
878 | ctx->count--; |
879 | } |
880 | if (ctx->count < (Py_ssize_t) ctx->pattern[1]) |
881 | break; |
882 | state->ptr = ctx->ptr; |
883 | DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1, |
884 | ctx->pattern+ctx->pattern[0]); |
885 | if (ret) { |
886 | RETURN_ON_ERROR(ret); |
887 | RETURN_SUCCESS; |
888 | } |
889 | |
890 | LASTMARK_RESTORE(); |
891 | |
892 | ctx->ptr--; |
893 | ctx->count--; |
894 | } |
895 | |
896 | } else { |
897 | /* general case */ |
898 | while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) { |
899 | state->ptr = ctx->ptr; |
900 | DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2, |
901 | ctx->pattern+ctx->pattern[0]); |
902 | if (ret) { |
903 | RETURN_ON_ERROR(ret); |
904 | RETURN_SUCCESS; |
905 | } |
906 | ctx->ptr--; |
907 | ctx->count--; |
908 | LASTMARK_RESTORE(); |
909 | } |
910 | } |
911 | RETURN_FAILURE; |
912 | |
913 | case SRE_OP_MIN_REPEAT_ONE: |
914 | /* match repeated sequence (minimizing regexp) */ |
915 | |
916 | /* this operator only works if the repeated item is |
917 | exactly one character wide, and we're not already |
918 | collecting backtracking points. for other cases, |
919 | use the MIN_REPEAT operator */ |
920 | |
921 | /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ |
922 | |
923 | TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n" , ctx->pattern, ctx->ptr, |
924 | ctx->pattern[1], ctx->pattern[2])); |
925 | |
926 | if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr) |
927 | RETURN_FAILURE; /* cannot match */ |
928 | |
929 | state->ptr = ctx->ptr; |
930 | |
931 | if (ctx->pattern[1] == 0) |
932 | ctx->count = 0; |
933 | else { |
934 | /* count using pattern min as the maximum */ |
935 | ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]); |
936 | RETURN_ON_ERROR(ret); |
937 | DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); |
938 | if (ret < (Py_ssize_t) ctx->pattern[1]) |
939 | /* didn't match minimum number of times */ |
940 | RETURN_FAILURE; |
941 | /* advance past minimum matches of repeat */ |
942 | ctx->count = ret; |
943 | ctx->ptr += ctx->count; |
944 | } |
945 | |
946 | if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS && |
947 | !(ctx->toplevel && |
948 | ((state->match_all && ctx->ptr != state->end) || |
949 | (state->must_advance && ctx->ptr == state->start)))) |
950 | { |
951 | /* tail is empty. we're finished */ |
952 | state->ptr = ctx->ptr; |
953 | RETURN_SUCCESS; |
954 | |
955 | } else { |
956 | /* general case */ |
957 | LASTMARK_SAVE(); |
958 | while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT |
959 | || ctx->count <= (Py_ssize_t)ctx->pattern[2]) { |
960 | state->ptr = ctx->ptr; |
961 | DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one, |
962 | ctx->pattern+ctx->pattern[0]); |
963 | if (ret) { |
964 | RETURN_ON_ERROR(ret); |
965 | RETURN_SUCCESS; |
966 | } |
967 | state->ptr = ctx->ptr; |
968 | ret = SRE(count)(state, ctx->pattern+3, 1); |
969 | RETURN_ON_ERROR(ret); |
970 | DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); |
971 | if (ret == 0) |
972 | break; |
973 | assert(ret == 1); |
974 | ctx->ptr++; |
975 | ctx->count++; |
976 | LASTMARK_RESTORE(); |
977 | } |
978 | } |
979 | RETURN_FAILURE; |
980 | |
981 | case SRE_OP_REPEAT: |
982 | /* create repeat context. all the hard work is done |
983 | by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ |
984 | /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */ |
985 | TRACE(("|%p|%p|REPEAT %d %d\n" , ctx->pattern, ctx->ptr, |
986 | ctx->pattern[1], ctx->pattern[2])); |
987 | |
988 | /* install new repeat context */ |
989 | ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); |
990 | if (!ctx->u.rep) { |
991 | PyErr_NoMemory(); |
992 | RETURN_FAILURE; |
993 | } |
994 | ctx->u.rep->count = -1; |
995 | ctx->u.rep->pattern = ctx->pattern; |
996 | ctx->u.rep->prev = state->repeat; |
997 | ctx->u.rep->last_ptr = NULL; |
998 | state->repeat = ctx->u.rep; |
999 | |
1000 | state->ptr = ctx->ptr; |
1001 | DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); |
1002 | state->repeat = ctx->u.rep->prev; |
1003 | PyObject_Free(ctx->u.rep); |
1004 | |
1005 | if (ret) { |
1006 | RETURN_ON_ERROR(ret); |
1007 | RETURN_SUCCESS; |
1008 | } |
1009 | RETURN_FAILURE; |
1010 | |
1011 | case SRE_OP_MAX_UNTIL: |
1012 | /* maximizing repeat */ |
1013 | /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */ |
1014 | |
1015 | /* FIXME: we probably need to deal with zero-width |
1016 | matches in here... */ |
1017 | |
1018 | ctx->u.rep = state->repeat; |
1019 | if (!ctx->u.rep) |
1020 | RETURN_ERROR(SRE_ERROR_STATE); |
1021 | |
1022 | state->ptr = ctx->ptr; |
1023 | |
1024 | ctx->count = ctx->u.rep->count+1; |
1025 | |
1026 | TRACE(("|%p|%p|MAX_UNTIL %zd\n" , ctx->pattern, |
1027 | ctx->ptr, ctx->count)); |
1028 | |
1029 | if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) { |
1030 | /* not enough matches */ |
1031 | ctx->u.rep->count = ctx->count; |
1032 | DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, |
1033 | ctx->u.rep->pattern+3); |
1034 | if (ret) { |
1035 | RETURN_ON_ERROR(ret); |
1036 | RETURN_SUCCESS; |
1037 | } |
1038 | ctx->u.rep->count = ctx->count-1; |
1039 | state->ptr = ctx->ptr; |
1040 | RETURN_FAILURE; |
1041 | } |
1042 | |
1043 | if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] || |
1044 | ctx->u.rep->pattern[2] == SRE_MAXREPEAT) && |
1045 | state->ptr != ctx->u.rep->last_ptr) { |
1046 | /* we may have enough matches, but if we can |
1047 | match another item, do so */ |
1048 | ctx->u.rep->count = ctx->count; |
1049 | LASTMARK_SAVE(); |
1050 | MARK_PUSH(ctx->lastmark); |
1051 | /* zero-width match protection */ |
1052 | DATA_PUSH(&ctx->u.rep->last_ptr); |
1053 | ctx->u.rep->last_ptr = state->ptr; |
1054 | DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, |
1055 | ctx->u.rep->pattern+3); |
1056 | DATA_POP(&ctx->u.rep->last_ptr); |
1057 | if (ret) { |
1058 | MARK_POP_DISCARD(ctx->lastmark); |
1059 | RETURN_ON_ERROR(ret); |
1060 | RETURN_SUCCESS; |
1061 | } |
1062 | MARK_POP(ctx->lastmark); |
1063 | LASTMARK_RESTORE(); |
1064 | ctx->u.rep->count = ctx->count-1; |
1065 | state->ptr = ctx->ptr; |
1066 | } |
1067 | |
1068 | /* cannot match more repeated items here. make sure the |
1069 | tail matches */ |
1070 | state->repeat = ctx->u.rep->prev; |
1071 | DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern); |
1072 | RETURN_ON_SUCCESS(ret); |
1073 | state->repeat = ctx->u.rep; |
1074 | state->ptr = ctx->ptr; |
1075 | RETURN_FAILURE; |
1076 | |
1077 | case SRE_OP_MIN_UNTIL: |
1078 | /* minimizing repeat */ |
1079 | /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */ |
1080 | |
1081 | ctx->u.rep = state->repeat; |
1082 | if (!ctx->u.rep) |
1083 | RETURN_ERROR(SRE_ERROR_STATE); |
1084 | |
1085 | state->ptr = ctx->ptr; |
1086 | |
1087 | ctx->count = ctx->u.rep->count+1; |
1088 | |
1089 | TRACE(("|%p|%p|MIN_UNTIL %zd %p\n" , ctx->pattern, |
1090 | ctx->ptr, ctx->count, ctx->u.rep->pattern)); |
1091 | |
1092 | if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) { |
1093 | /* not enough matches */ |
1094 | ctx->u.rep->count = ctx->count; |
1095 | DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, |
1096 | ctx->u.rep->pattern+3); |
1097 | if (ret) { |
1098 | RETURN_ON_ERROR(ret); |
1099 | RETURN_SUCCESS; |
1100 | } |
1101 | ctx->u.rep->count = ctx->count-1; |
1102 | state->ptr = ctx->ptr; |
1103 | RETURN_FAILURE; |
1104 | } |
1105 | |
1106 | LASTMARK_SAVE(); |
1107 | |
1108 | /* see if the tail matches */ |
1109 | state->repeat = ctx->u.rep->prev; |
1110 | DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern); |
1111 | if (ret) { |
1112 | RETURN_ON_ERROR(ret); |
1113 | RETURN_SUCCESS; |
1114 | } |
1115 | |
1116 | state->repeat = ctx->u.rep; |
1117 | state->ptr = ctx->ptr; |
1118 | |
1119 | LASTMARK_RESTORE(); |
1120 | |
1121 | if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2] |
1122 | && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) || |
1123 | state->ptr == ctx->u.rep->last_ptr) |
1124 | RETURN_FAILURE; |
1125 | |
1126 | ctx->u.rep->count = ctx->count; |
1127 | /* zero-width match protection */ |
1128 | DATA_PUSH(&ctx->u.rep->last_ptr); |
1129 | ctx->u.rep->last_ptr = state->ptr; |
1130 | DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, |
1131 | ctx->u.rep->pattern+3); |
1132 | DATA_POP(&ctx->u.rep->last_ptr); |
1133 | if (ret) { |
1134 | RETURN_ON_ERROR(ret); |
1135 | RETURN_SUCCESS; |
1136 | } |
1137 | ctx->u.rep->count = ctx->count-1; |
1138 | state->ptr = ctx->ptr; |
1139 | RETURN_FAILURE; |
1140 | |
1141 | case SRE_OP_GROUPREF: |
1142 | /* match backreference */ |
1143 | TRACE(("|%p|%p|GROUPREF %d\n" , ctx->pattern, |
1144 | ctx->ptr, ctx->pattern[0])); |
1145 | i = ctx->pattern[0]; |
1146 | { |
1147 | Py_ssize_t groupref = i+i; |
1148 | if (groupref >= state->lastmark) { |
1149 | RETURN_FAILURE; |
1150 | } else { |
1151 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; |
1152 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; |
1153 | if (!p || !e || e < p) |
1154 | RETURN_FAILURE; |
1155 | while (p < e) { |
1156 | if (ctx->ptr >= end || *ctx->ptr != *p) |
1157 | RETURN_FAILURE; |
1158 | p++; |
1159 | ctx->ptr++; |
1160 | } |
1161 | } |
1162 | } |
1163 | ctx->pattern++; |
1164 | break; |
1165 | |
1166 | case SRE_OP_GROUPREF_IGNORE: |
1167 | /* match backreference */ |
1168 | TRACE(("|%p|%p|GROUPREF_IGNORE %d\n" , ctx->pattern, |
1169 | ctx->ptr, ctx->pattern[0])); |
1170 | i = ctx->pattern[0]; |
1171 | { |
1172 | Py_ssize_t groupref = i+i; |
1173 | if (groupref >= state->lastmark) { |
1174 | RETURN_FAILURE; |
1175 | } else { |
1176 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; |
1177 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; |
1178 | if (!p || !e || e < p) |
1179 | RETURN_FAILURE; |
1180 | while (p < e) { |
1181 | if (ctx->ptr >= end || |
1182 | sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p)) |
1183 | RETURN_FAILURE; |
1184 | p++; |
1185 | ctx->ptr++; |
1186 | } |
1187 | } |
1188 | } |
1189 | ctx->pattern++; |
1190 | break; |
1191 | |
1192 | case SRE_OP_GROUPREF_UNI_IGNORE: |
1193 | /* match backreference */ |
1194 | TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n" , ctx->pattern, |
1195 | ctx->ptr, ctx->pattern[0])); |
1196 | i = ctx->pattern[0]; |
1197 | { |
1198 | Py_ssize_t groupref = i+i; |
1199 | if (groupref >= state->lastmark) { |
1200 | RETURN_FAILURE; |
1201 | } else { |
1202 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; |
1203 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; |
1204 | if (!p || !e || e < p) |
1205 | RETURN_FAILURE; |
1206 | while (p < e) { |
1207 | if (ctx->ptr >= end || |
1208 | sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p)) |
1209 | RETURN_FAILURE; |
1210 | p++; |
1211 | ctx->ptr++; |
1212 | } |
1213 | } |
1214 | } |
1215 | ctx->pattern++; |
1216 | break; |
1217 | |
1218 | case SRE_OP_GROUPREF_LOC_IGNORE: |
1219 | /* match backreference */ |
1220 | TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n" , ctx->pattern, |
1221 | ctx->ptr, ctx->pattern[0])); |
1222 | i = ctx->pattern[0]; |
1223 | { |
1224 | Py_ssize_t groupref = i+i; |
1225 | if (groupref >= state->lastmark) { |
1226 | RETURN_FAILURE; |
1227 | } else { |
1228 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; |
1229 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; |
1230 | if (!p || !e || e < p) |
1231 | RETURN_FAILURE; |
1232 | while (p < e) { |
1233 | if (ctx->ptr >= end || |
1234 | sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p)) |
1235 | RETURN_FAILURE; |
1236 | p++; |
1237 | ctx->ptr++; |
1238 | } |
1239 | } |
1240 | } |
1241 | ctx->pattern++; |
1242 | break; |
1243 | |
1244 | case SRE_OP_GROUPREF_EXISTS: |
1245 | TRACE(("|%p|%p|GROUPREF_EXISTS %d\n" , ctx->pattern, |
1246 | ctx->ptr, ctx->pattern[0])); |
1247 | /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */ |
1248 | i = ctx->pattern[0]; |
1249 | { |
1250 | Py_ssize_t groupref = i+i; |
1251 | if (groupref >= state->lastmark) { |
1252 | ctx->pattern += ctx->pattern[1]; |
1253 | break; |
1254 | } else { |
1255 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; |
1256 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; |
1257 | if (!p || !e || e < p) { |
1258 | ctx->pattern += ctx->pattern[1]; |
1259 | break; |
1260 | } |
1261 | } |
1262 | } |
1263 | ctx->pattern += 2; |
1264 | break; |
1265 | |
1266 | case SRE_OP_ASSERT: |
1267 | /* assert subpattern */ |
1268 | /* <ASSERT> <skip> <back> <pattern> */ |
1269 | TRACE(("|%p|%p|ASSERT %d\n" , ctx->pattern, |
1270 | ctx->ptr, ctx->pattern[1])); |
1271 | if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1]) |
1272 | RETURN_FAILURE; |
1273 | state->ptr = ctx->ptr - ctx->pattern[1]; |
1274 | DO_JUMP0(JUMP_ASSERT, jump_assert, ctx->pattern+2); |
1275 | RETURN_ON_FAILURE(ret); |
1276 | ctx->pattern += ctx->pattern[0]; |
1277 | break; |
1278 | |
1279 | case SRE_OP_ASSERT_NOT: |
1280 | /* assert not subpattern */ |
1281 | /* <ASSERT_NOT> <skip> <back> <pattern> */ |
1282 | TRACE(("|%p|%p|ASSERT_NOT %d\n" , ctx->pattern, |
1283 | ctx->ptr, ctx->pattern[1])); |
1284 | if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) { |
1285 | state->ptr = ctx->ptr - ctx->pattern[1]; |
1286 | DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); |
1287 | if (ret) { |
1288 | RETURN_ON_ERROR(ret); |
1289 | RETURN_FAILURE; |
1290 | } |
1291 | } |
1292 | ctx->pattern += ctx->pattern[0]; |
1293 | break; |
1294 | |
1295 | case SRE_OP_FAILURE: |
1296 | /* immediate failure */ |
1297 | TRACE(("|%p|%p|FAILURE\n" , ctx->pattern, ctx->ptr)); |
1298 | RETURN_FAILURE; |
1299 | |
1300 | default: |
1301 | TRACE(("|%p|%p|UNKNOWN %d\n" , ctx->pattern, ctx->ptr, |
1302 | ctx->pattern[-1])); |
1303 | RETURN_ERROR(SRE_ERROR_ILLEGAL); |
1304 | } |
1305 | } |
1306 | |
1307 | exit: |
1308 | ctx_pos = ctx->last_ctx_pos; |
1309 | jump = ctx->jump; |
1310 | DATA_POP_DISCARD(ctx); |
1311 | if (ctx_pos == -1) |
1312 | return ret; |
1313 | DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); |
1314 | |
1315 | switch (jump) { |
1316 | case JUMP_MAX_UNTIL_2: |
1317 | TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n" , ctx->pattern, ctx->ptr)); |
1318 | goto jump_max_until_2; |
1319 | case JUMP_MAX_UNTIL_3: |
1320 | TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n" , ctx->pattern, ctx->ptr)); |
1321 | goto jump_max_until_3; |
1322 | case JUMP_MIN_UNTIL_2: |
1323 | TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n" , ctx->pattern, ctx->ptr)); |
1324 | goto jump_min_until_2; |
1325 | case JUMP_MIN_UNTIL_3: |
1326 | TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n" , ctx->pattern, ctx->ptr)); |
1327 | goto jump_min_until_3; |
1328 | case JUMP_BRANCH: |
1329 | TRACE(("|%p|%p|JUMP_BRANCH\n" , ctx->pattern, ctx->ptr)); |
1330 | goto jump_branch; |
1331 | case JUMP_MAX_UNTIL_1: |
1332 | TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n" , ctx->pattern, ctx->ptr)); |
1333 | goto jump_max_until_1; |
1334 | case JUMP_MIN_UNTIL_1: |
1335 | TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n" , ctx->pattern, ctx->ptr)); |
1336 | goto jump_min_until_1; |
1337 | case JUMP_REPEAT: |
1338 | TRACE(("|%p|%p|JUMP_REPEAT\n" , ctx->pattern, ctx->ptr)); |
1339 | goto jump_repeat; |
1340 | case JUMP_REPEAT_ONE_1: |
1341 | TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n" , ctx->pattern, ctx->ptr)); |
1342 | goto jump_repeat_one_1; |
1343 | case JUMP_REPEAT_ONE_2: |
1344 | TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n" , ctx->pattern, ctx->ptr)); |
1345 | goto jump_repeat_one_2; |
1346 | case JUMP_MIN_REPEAT_ONE: |
1347 | TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n" , ctx->pattern, ctx->ptr)); |
1348 | goto jump_min_repeat_one; |
1349 | case JUMP_ASSERT: |
1350 | TRACE(("|%p|%p|JUMP_ASSERT\n" , ctx->pattern, ctx->ptr)); |
1351 | goto jump_assert; |
1352 | case JUMP_ASSERT_NOT: |
1353 | TRACE(("|%p|%p|JUMP_ASSERT_NOT\n" , ctx->pattern, ctx->ptr)); |
1354 | goto jump_assert_not; |
1355 | case JUMP_NONE: |
1356 | TRACE(("|%p|%p|RETURN %zd\n" , ctx->pattern, |
1357 | ctx->ptr, ret)); |
1358 | break; |
1359 | } |
1360 | |
1361 | return ret; /* should never get here */ |
1362 | } |
1363 | |
1364 | /* need to reset capturing groups between two SRE(match) callings in loops */ |
1365 | #define RESET_CAPTURE_GROUP() \ |
1366 | do { state->lastmark = state->lastindex = -1; } while (0) |
1367 | |
1368 | LOCAL(Py_ssize_t) |
1369 | SRE(search)(SRE_STATE* state, SRE_CODE* pattern) |
1370 | { |
1371 | SRE_CHAR* ptr = (SRE_CHAR *)state->start; |
1372 | SRE_CHAR* end = (SRE_CHAR *)state->end; |
1373 | Py_ssize_t status = 0; |
1374 | Py_ssize_t prefix_len = 0; |
1375 | Py_ssize_t prefix_skip = 0; |
1376 | SRE_CODE* prefix = NULL; |
1377 | SRE_CODE* charset = NULL; |
1378 | SRE_CODE* overlap = NULL; |
1379 | int flags = 0; |
1380 | |
1381 | if (ptr > end) |
1382 | return 0; |
1383 | |
1384 | if (pattern[0] == SRE_OP_INFO) { |
1385 | /* optimization info block */ |
1386 | /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ |
1387 | |
1388 | flags = pattern[2]; |
1389 | |
1390 | if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) { |
1391 | TRACE(("reject (got %u chars, need %u)\n" , |
1392 | (unsigned int)(end - ptr), pattern[3])); |
1393 | return 0; |
1394 | } |
1395 | if (pattern[3] > 1) { |
1396 | /* adjust end point (but make sure we leave at least one |
1397 | character in there, so literal search will work) */ |
1398 | end -= pattern[3] - 1; |
1399 | if (end <= ptr) |
1400 | end = ptr; |
1401 | } |
1402 | |
1403 | if (flags & SRE_INFO_PREFIX) { |
1404 | /* pattern starts with a known prefix */ |
1405 | /* <length> <skip> <prefix data> <overlap data> */ |
1406 | prefix_len = pattern[5]; |
1407 | prefix_skip = pattern[6]; |
1408 | prefix = pattern + 7; |
1409 | overlap = prefix + prefix_len - 1; |
1410 | } else if (flags & SRE_INFO_CHARSET) |
1411 | /* pattern starts with a character from a known set */ |
1412 | /* <charset> */ |
1413 | charset = pattern + 5; |
1414 | |
1415 | pattern += 1 + pattern[1]; |
1416 | } |
1417 | |
1418 | TRACE(("prefix = %p %zd %zd\n" , |
1419 | prefix, prefix_len, prefix_skip)); |
1420 | TRACE(("charset = %p\n" , charset)); |
1421 | |
1422 | if (prefix_len == 1) { |
1423 | /* pattern starts with a literal character */ |
1424 | SRE_CHAR c = (SRE_CHAR) prefix[0]; |
1425 | #if SIZEOF_SRE_CHAR < 4 |
1426 | if ((SRE_CODE) c != prefix[0]) |
1427 | return 0; /* literal can't match: doesn't fit in char width */ |
1428 | #endif |
1429 | end = (SRE_CHAR *)state->end; |
1430 | state->must_advance = 0; |
1431 | while (ptr < end) { |
1432 | while (*ptr != c) { |
1433 | if (++ptr >= end) |
1434 | return 0; |
1435 | } |
1436 | TRACE(("|%p|%p|SEARCH LITERAL\n" , pattern, ptr)); |
1437 | state->start = ptr; |
1438 | state->ptr = ptr + prefix_skip; |
1439 | if (flags & SRE_INFO_LITERAL) |
1440 | return 1; /* we got all of it */ |
1441 | status = SRE(match)(state, pattern + 2*prefix_skip, 0); |
1442 | if (status != 0) |
1443 | return status; |
1444 | ++ptr; |
1445 | RESET_CAPTURE_GROUP(); |
1446 | } |
1447 | return 0; |
1448 | } |
1449 | |
1450 | if (prefix_len > 1) { |
1451 | /* pattern starts with a known prefix. use the overlap |
1452 | table to skip forward as fast as we possibly can */ |
1453 | Py_ssize_t i = 0; |
1454 | |
1455 | end = (SRE_CHAR *)state->end; |
1456 | if (prefix_len > end - ptr) |
1457 | return 0; |
1458 | #if SIZEOF_SRE_CHAR < 4 |
1459 | for (i = 0; i < prefix_len; i++) |
1460 | if ((SRE_CODE)(SRE_CHAR) prefix[i] != prefix[i]) |
1461 | return 0; /* literal can't match: doesn't fit in char width */ |
1462 | #endif |
1463 | while (ptr < end) { |
1464 | SRE_CHAR c = (SRE_CHAR) prefix[0]; |
1465 | while (*ptr++ != c) { |
1466 | if (ptr >= end) |
1467 | return 0; |
1468 | } |
1469 | if (ptr >= end) |
1470 | return 0; |
1471 | |
1472 | i = 1; |
1473 | state->must_advance = 0; |
1474 | do { |
1475 | if (*ptr == (SRE_CHAR) prefix[i]) { |
1476 | if (++i != prefix_len) { |
1477 | if (++ptr >= end) |
1478 | return 0; |
1479 | continue; |
1480 | } |
1481 | /* found a potential match */ |
1482 | TRACE(("|%p|%p|SEARCH SCAN\n" , pattern, ptr)); |
1483 | state->start = ptr - (prefix_len - 1); |
1484 | state->ptr = ptr - (prefix_len - prefix_skip - 1); |
1485 | if (flags & SRE_INFO_LITERAL) |
1486 | return 1; /* we got all of it */ |
1487 | status = SRE(match)(state, pattern + 2*prefix_skip, 0); |
1488 | if (status != 0) |
1489 | return status; |
1490 | /* close but no cigar -- try again */ |
1491 | if (++ptr >= end) |
1492 | return 0; |
1493 | RESET_CAPTURE_GROUP(); |
1494 | } |
1495 | i = overlap[i]; |
1496 | } while (i != 0); |
1497 | } |
1498 | return 0; |
1499 | } |
1500 | |
1501 | if (charset) { |
1502 | /* pattern starts with a character from a known set */ |
1503 | end = (SRE_CHAR *)state->end; |
1504 | state->must_advance = 0; |
1505 | for (;;) { |
1506 | while (ptr < end && !SRE(charset)(state, charset, *ptr)) |
1507 | ptr++; |
1508 | if (ptr >= end) |
1509 | return 0; |
1510 | TRACE(("|%p|%p|SEARCH CHARSET\n" , pattern, ptr)); |
1511 | state->start = ptr; |
1512 | state->ptr = ptr; |
1513 | status = SRE(match)(state, pattern, 0); |
1514 | if (status != 0) |
1515 | break; |
1516 | ptr++; |
1517 | RESET_CAPTURE_GROUP(); |
1518 | } |
1519 | } else { |
1520 | /* general case */ |
1521 | assert(ptr <= end); |
1522 | TRACE(("|%p|%p|SEARCH\n" , pattern, ptr)); |
1523 | state->start = state->ptr = ptr; |
1524 | status = SRE(match)(state, pattern, 1); |
1525 | state->must_advance = 0; |
1526 | while (status == 0 && ptr < end) { |
1527 | ptr++; |
1528 | RESET_CAPTURE_GROUP(); |
1529 | TRACE(("|%p|%p|SEARCH\n" , pattern, ptr)); |
1530 | state->start = state->ptr = ptr; |
1531 | status = SRE(match)(state, pattern, 0); |
1532 | } |
1533 | } |
1534 | |
1535 | return status; |
1536 | } |
1537 | |
1538 | #undef SRE_CHAR |
1539 | #undef SIZEOF_SRE_CHAR |
1540 | #undef SRE |
1541 | |
1542 | /* vim:ts=4:sw=4:et |
1543 | */ |
1544 | |