1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Regular expression representation. |
6 | // Tested by parse_test.cc |
7 | |
8 | #include "re2/regexp.h" |
9 | |
10 | #include <stddef.h> |
11 | #include <stdint.h> |
12 | #include <string.h> |
13 | #include <algorithm> |
14 | #include <map> |
15 | #include <string> |
16 | #include <vector> |
17 | |
18 | #include "absl/base/call_once.h" |
19 | #include "absl/base/macros.h" |
20 | #include "absl/synchronization/mutex.h" |
21 | #include "util/logging.h" |
22 | #include "util/utf.h" |
23 | #include "re2/pod_array.h" |
24 | #include "re2/walker-inl.h" |
25 | |
26 | namespace re2 { |
27 | |
28 | // Constructor. Allocates vectors as appropriate for operator. |
29 | Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) |
30 | : op_(static_cast<uint8_t>(op)), |
31 | simple_(false), |
32 | parse_flags_(static_cast<uint16_t>(parse_flags)), |
33 | ref_(1), |
34 | nsub_(0), |
35 | down_(NULL) { |
36 | subone_ = NULL; |
37 | memset(the_union_, 0, sizeof the_union_); |
38 | } |
39 | |
40 | // Destructor. Assumes already cleaned up children. |
41 | // Private: use Decref() instead of delete to destroy Regexps. |
42 | // Can't call Decref on the sub-Regexps here because |
43 | // that could cause arbitrarily deep recursion, so |
44 | // required Decref() to have handled them for us. |
45 | Regexp::~Regexp() { |
46 | if (nsub_ > 0) |
47 | LOG(DFATAL) << "Regexp not destroyed." ; |
48 | |
49 | switch (op_) { |
50 | default: |
51 | break; |
52 | case kRegexpCapture: |
53 | delete name_; |
54 | break; |
55 | case kRegexpLiteralString: |
56 | delete[] runes_; |
57 | break; |
58 | case kRegexpCharClass: |
59 | if (cc_) |
60 | cc_->Delete(); |
61 | delete ccb_; |
62 | break; |
63 | } |
64 | } |
65 | |
66 | // If it's possible to destroy this regexp without recurring, |
67 | // do so and return true. Else return false. |
68 | bool Regexp::QuickDestroy() { |
69 | if (nsub_ == 0) { |
70 | delete this; |
71 | return true; |
72 | } |
73 | return false; |
74 | } |
75 | |
76 | // Lazily allocated. |
77 | static absl::Mutex* ref_mutex; |
78 | static std::map<Regexp*, int>* ref_map; |
79 | |
80 | int Regexp::Ref() { |
81 | if (ref_ < kMaxRef) |
82 | return ref_; |
83 | |
84 | absl::MutexLock l(ref_mutex); |
85 | return (*ref_map)[this]; |
86 | } |
87 | |
88 | // Increments reference count, returns object as convenience. |
89 | Regexp* Regexp::Incref() { |
90 | if (ref_ >= kMaxRef-1) { |
91 | static absl::once_flag ref_once; |
92 | absl::call_once(ref_once, []() { |
93 | ref_mutex = new absl::Mutex; |
94 | ref_map = new std::map<Regexp*, int>; |
95 | }); |
96 | |
97 | // Store ref count in overflow map. |
98 | absl::MutexLock l(ref_mutex); |
99 | if (ref_ == kMaxRef) { |
100 | // already overflowed |
101 | (*ref_map)[this]++; |
102 | } else { |
103 | // overflowing now |
104 | (*ref_map)[this] = kMaxRef; |
105 | ref_ = kMaxRef; |
106 | } |
107 | return this; |
108 | } |
109 | |
110 | ref_++; |
111 | return this; |
112 | } |
113 | |
114 | // Decrements reference count and deletes this object if count reaches 0. |
115 | void Regexp::Decref() { |
116 | if (ref_ == kMaxRef) { |
117 | // Ref count is stored in overflow map. |
118 | absl::MutexLock l(ref_mutex); |
119 | int r = (*ref_map)[this] - 1; |
120 | if (r < kMaxRef) { |
121 | ref_ = static_cast<uint16_t>(r); |
122 | ref_map->erase(this); |
123 | } else { |
124 | (*ref_map)[this] = r; |
125 | } |
126 | return; |
127 | } |
128 | ref_--; |
129 | if (ref_ == 0) |
130 | Destroy(); |
131 | } |
132 | |
133 | // Deletes this object; ref count has count reached 0. |
134 | void Regexp::Destroy() { |
135 | if (QuickDestroy()) |
136 | return; |
137 | |
138 | // Handle recursive Destroy with explicit stack |
139 | // to avoid arbitrarily deep recursion on process stack [sigh]. |
140 | down_ = NULL; |
141 | Regexp* stack = this; |
142 | while (stack != NULL) { |
143 | Regexp* re = stack; |
144 | stack = re->down_; |
145 | if (re->ref_ != 0) |
146 | LOG(DFATAL) << "Bad reference count " << re->ref_; |
147 | if (re->nsub_ > 0) { |
148 | Regexp** subs = re->sub(); |
149 | for (int i = 0; i < re->nsub_; i++) { |
150 | Regexp* sub = subs[i]; |
151 | if (sub == NULL) |
152 | continue; |
153 | if (sub->ref_ == kMaxRef) |
154 | sub->Decref(); |
155 | else |
156 | --sub->ref_; |
157 | if (sub->ref_ == 0 && !sub->QuickDestroy()) { |
158 | sub->down_ = stack; |
159 | stack = sub; |
160 | } |
161 | } |
162 | if (re->nsub_ > 1) |
163 | delete[] subs; |
164 | re->nsub_ = 0; |
165 | } |
166 | delete re; |
167 | } |
168 | } |
169 | |
170 | void Regexp::AddRuneToString(Rune r) { |
171 | DCHECK(op_ == kRegexpLiteralString); |
172 | if (nrunes_ == 0) { |
173 | // start with 8 |
174 | runes_ = new Rune[8]; |
175 | } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { |
176 | // double on powers of two |
177 | Rune *old = runes_; |
178 | runes_ = new Rune[nrunes_ * 2]; |
179 | for (int i = 0; i < nrunes_; i++) |
180 | runes_[i] = old[i]; |
181 | delete[] old; |
182 | } |
183 | |
184 | runes_[nrunes_++] = r; |
185 | } |
186 | |
187 | Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { |
188 | Regexp* re = new Regexp(kRegexpHaveMatch, flags); |
189 | re->match_id_ = match_id; |
190 | return re; |
191 | } |
192 | |
193 | Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { |
194 | // Squash **, ++ and ??. |
195 | if (op == sub->op() && flags == sub->parse_flags()) |
196 | return sub; |
197 | |
198 | // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because |
199 | // op is Star/Plus/Quest, we just have to check that sub->op() is too. |
200 | if ((sub->op() == kRegexpStar || |
201 | sub->op() == kRegexpPlus || |
202 | sub->op() == kRegexpQuest) && |
203 | flags == sub->parse_flags()) { |
204 | // If sub is Star, no need to rewrite it. |
205 | if (sub->op() == kRegexpStar) |
206 | return sub; |
207 | |
208 | // Rewrite sub to Star. |
209 | Regexp* re = new Regexp(kRegexpStar, flags); |
210 | re->AllocSub(1); |
211 | re->sub()[0] = sub->sub()[0]->Incref(); |
212 | sub->Decref(); // We didn't consume the reference after all. |
213 | return re; |
214 | } |
215 | |
216 | Regexp* re = new Regexp(op, flags); |
217 | re->AllocSub(1); |
218 | re->sub()[0] = sub; |
219 | return re; |
220 | } |
221 | |
222 | Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { |
223 | return StarPlusOrQuest(kRegexpPlus, sub, flags); |
224 | } |
225 | |
226 | Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { |
227 | return StarPlusOrQuest(kRegexpStar, sub, flags); |
228 | } |
229 | |
230 | Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { |
231 | return StarPlusOrQuest(kRegexpQuest, sub, flags); |
232 | } |
233 | |
234 | Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, |
235 | ParseFlags flags, bool can_factor) { |
236 | if (nsub == 1) |
237 | return sub[0]; |
238 | |
239 | if (nsub == 0) { |
240 | if (op == kRegexpAlternate) |
241 | return new Regexp(kRegexpNoMatch, flags); |
242 | else |
243 | return new Regexp(kRegexpEmptyMatch, flags); |
244 | } |
245 | |
246 | PODArray<Regexp*> subcopy; |
247 | if (op == kRegexpAlternate && can_factor) { |
248 | // Going to edit sub; make a copy so we don't step on caller. |
249 | subcopy = PODArray<Regexp*>(nsub); |
250 | memmove(subcopy.data(), sub, nsub * sizeof sub[0]); |
251 | sub = subcopy.data(); |
252 | nsub = FactorAlternation(sub, nsub, flags); |
253 | if (nsub == 1) { |
254 | Regexp* re = sub[0]; |
255 | return re; |
256 | } |
257 | } |
258 | |
259 | if (nsub > kMaxNsub) { |
260 | // Too many subexpressions to fit in a single Regexp. |
261 | // Make a two-level tree. Two levels gets us to 65535^2. |
262 | int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; |
263 | Regexp* re = new Regexp(op, flags); |
264 | re->AllocSub(nbigsub); |
265 | Regexp** subs = re->sub(); |
266 | for (int i = 0; i < nbigsub - 1; i++) |
267 | subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); |
268 | subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, |
269 | nsub - (nbigsub-1)*kMaxNsub, flags, |
270 | false); |
271 | return re; |
272 | } |
273 | |
274 | Regexp* re = new Regexp(op, flags); |
275 | re->AllocSub(nsub); |
276 | Regexp** subs = re->sub(); |
277 | for (int i = 0; i < nsub; i++) |
278 | subs[i] = sub[i]; |
279 | return re; |
280 | } |
281 | |
282 | Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { |
283 | return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); |
284 | } |
285 | |
286 | Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { |
287 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); |
288 | } |
289 | |
290 | Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { |
291 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); |
292 | } |
293 | |
294 | Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { |
295 | Regexp* re = new Regexp(kRegexpCapture, flags); |
296 | re->AllocSub(1); |
297 | re->sub()[0] = sub; |
298 | re->cap_ = cap; |
299 | return re; |
300 | } |
301 | |
302 | Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { |
303 | Regexp* re = new Regexp(kRegexpRepeat, flags); |
304 | re->AllocSub(1); |
305 | re->sub()[0] = sub; |
306 | re->min_ = min; |
307 | re->max_ = max; |
308 | return re; |
309 | } |
310 | |
311 | Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { |
312 | Regexp* re = new Regexp(kRegexpLiteral, flags); |
313 | re->rune_ = rune; |
314 | return re; |
315 | } |
316 | |
317 | Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { |
318 | if (nrunes <= 0) |
319 | return new Regexp(kRegexpEmptyMatch, flags); |
320 | if (nrunes == 1) |
321 | return NewLiteral(runes[0], flags); |
322 | Regexp* re = new Regexp(kRegexpLiteralString, flags); |
323 | for (int i = 0; i < nrunes; i++) |
324 | re->AddRuneToString(runes[i]); |
325 | return re; |
326 | } |
327 | |
328 | Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { |
329 | Regexp* re = new Regexp(kRegexpCharClass, flags); |
330 | re->cc_ = cc; |
331 | return re; |
332 | } |
333 | |
334 | void Regexp::Swap(Regexp* that) { |
335 | // Regexp is not trivially copyable, so we cannot freely copy it with |
336 | // memmove(3), but swapping objects like so is safe for our purposes. |
337 | char tmp[sizeof *this]; |
338 | void* vthis = reinterpret_cast<void*>(this); |
339 | void* vthat = reinterpret_cast<void*>(that); |
340 | memmove(tmp, vthis, sizeof *this); |
341 | memmove(vthis, vthat, sizeof *this); |
342 | memmove(vthat, tmp, sizeof *this); |
343 | } |
344 | |
345 | // Tests equality of all top-level structure but not subregexps. |
346 | static bool TopEqual(Regexp* a, Regexp* b) { |
347 | if (a->op() != b->op()) |
348 | return false; |
349 | |
350 | switch (a->op()) { |
351 | case kRegexpNoMatch: |
352 | case kRegexpEmptyMatch: |
353 | case kRegexpAnyChar: |
354 | case kRegexpAnyByte: |
355 | case kRegexpBeginLine: |
356 | case kRegexpEndLine: |
357 | case kRegexpWordBoundary: |
358 | case kRegexpNoWordBoundary: |
359 | case kRegexpBeginText: |
360 | return true; |
361 | |
362 | case kRegexpEndText: |
363 | // The parse flags remember whether it's \z or (?-m:$), |
364 | // which matters when testing against PCRE. |
365 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; |
366 | |
367 | case kRegexpLiteral: |
368 | return a->rune() == b->rune() && |
369 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; |
370 | |
371 | case kRegexpLiteralString: |
372 | return a->nrunes() == b->nrunes() && |
373 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && |
374 | memcmp(a->runes(), b->runes(), |
375 | a->nrunes() * sizeof a->runes()[0]) == 0; |
376 | |
377 | case kRegexpAlternate: |
378 | case kRegexpConcat: |
379 | return a->nsub() == b->nsub(); |
380 | |
381 | case kRegexpStar: |
382 | case kRegexpPlus: |
383 | case kRegexpQuest: |
384 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; |
385 | |
386 | case kRegexpRepeat: |
387 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && |
388 | a->min() == b->min() && |
389 | a->max() == b->max(); |
390 | |
391 | case kRegexpCapture: |
392 | return a->cap() == b->cap() && a->name() == b->name(); |
393 | |
394 | case kRegexpHaveMatch: |
395 | return a->match_id() == b->match_id(); |
396 | |
397 | case kRegexpCharClass: { |
398 | CharClass* acc = a->cc(); |
399 | CharClass* bcc = b->cc(); |
400 | return acc->size() == bcc->size() && |
401 | acc->end() - acc->begin() == bcc->end() - bcc->begin() && |
402 | memcmp(acc->begin(), bcc->begin(), |
403 | (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; |
404 | } |
405 | } |
406 | |
407 | LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); |
408 | return 0; |
409 | } |
410 | |
411 | bool Regexp::Equal(Regexp* a, Regexp* b) { |
412 | if (a == NULL || b == NULL) |
413 | return a == b; |
414 | |
415 | if (!TopEqual(a, b)) |
416 | return false; |
417 | |
418 | // Fast path: |
419 | // return without allocating vector if there are no subregexps. |
420 | switch (a->op()) { |
421 | case kRegexpAlternate: |
422 | case kRegexpConcat: |
423 | case kRegexpStar: |
424 | case kRegexpPlus: |
425 | case kRegexpQuest: |
426 | case kRegexpRepeat: |
427 | case kRegexpCapture: |
428 | break; |
429 | |
430 | default: |
431 | return true; |
432 | } |
433 | |
434 | // Committed to doing real work. |
435 | // The stack (vector) has pairs of regexps waiting to |
436 | // be compared. The regexps are only equal if |
437 | // all the pairs end up being equal. |
438 | std::vector<Regexp*> stk; |
439 | |
440 | for (;;) { |
441 | // Invariant: TopEqual(a, b) == true. |
442 | Regexp* a2; |
443 | Regexp* b2; |
444 | switch (a->op()) { |
445 | default: |
446 | break; |
447 | case kRegexpAlternate: |
448 | case kRegexpConcat: |
449 | for (int i = 0; i < a->nsub(); i++) { |
450 | a2 = a->sub()[i]; |
451 | b2 = b->sub()[i]; |
452 | if (!TopEqual(a2, b2)) |
453 | return false; |
454 | stk.push_back(a2); |
455 | stk.push_back(b2); |
456 | } |
457 | break; |
458 | |
459 | case kRegexpStar: |
460 | case kRegexpPlus: |
461 | case kRegexpQuest: |
462 | case kRegexpRepeat: |
463 | case kRegexpCapture: |
464 | a2 = a->sub()[0]; |
465 | b2 = b->sub()[0]; |
466 | if (!TopEqual(a2, b2)) |
467 | return false; |
468 | // Really: |
469 | // stk.push_back(a2); |
470 | // stk.push_back(b2); |
471 | // break; |
472 | // but faster to assign directly and loop. |
473 | a = a2; |
474 | b = b2; |
475 | continue; |
476 | } |
477 | |
478 | size_t n = stk.size(); |
479 | if (n == 0) |
480 | break; |
481 | |
482 | DCHECK_GE(n, 2); |
483 | a = stk[n-2]; |
484 | b = stk[n-1]; |
485 | stk.resize(n-2); |
486 | } |
487 | |
488 | return true; |
489 | } |
490 | |
491 | // Keep in sync with enum RegexpStatusCode in regexp.h |
492 | static const char *kErrorStrings[] = { |
493 | "no error" , |
494 | "unexpected error" , |
495 | "invalid escape sequence" , |
496 | "invalid character class" , |
497 | "invalid character class range" , |
498 | "missing ]" , |
499 | "missing )" , |
500 | "unexpected )" , |
501 | "trailing \\" , |
502 | "no argument for repetition operator" , |
503 | "invalid repetition size" , |
504 | "bad repetition operator" , |
505 | "invalid perl operator" , |
506 | "invalid UTF-8" , |
507 | "invalid named capture group" , |
508 | }; |
509 | |
510 | std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { |
511 | if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings)) |
512 | code = kRegexpInternalError; |
513 | return kErrorStrings[code]; |
514 | } |
515 | |
516 | std::string RegexpStatus::Text() const { |
517 | if (error_arg_.empty()) |
518 | return CodeText(code_); |
519 | std::string s; |
520 | s.append(CodeText(code_)); |
521 | s.append(": " ); |
522 | s.append(error_arg_.data(), error_arg_.size()); |
523 | return s; |
524 | } |
525 | |
526 | void RegexpStatus::Copy(const RegexpStatus& status) { |
527 | code_ = status.code_; |
528 | error_arg_ = status.error_arg_; |
529 | } |
530 | |
531 | typedef int Ignored; // Walker<void> doesn't exist |
532 | |
533 | // Walker subclass to count capturing parens in regexp. |
534 | class NumCapturesWalker : public Regexp::Walker<Ignored> { |
535 | public: |
536 | NumCapturesWalker() : ncapture_(0) {} |
537 | int ncapture() { return ncapture_; } |
538 | |
539 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
540 | if (re->op() == kRegexpCapture) |
541 | ncapture_++; |
542 | return ignored; |
543 | } |
544 | |
545 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
546 | // Should never be called: we use Walk(), not WalkExponential(). |
547 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
548 | LOG(DFATAL) << "NumCapturesWalker::ShortVisit called" ; |
549 | #endif |
550 | return ignored; |
551 | } |
552 | |
553 | private: |
554 | int ncapture_; |
555 | |
556 | NumCapturesWalker(const NumCapturesWalker&) = delete; |
557 | NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; |
558 | }; |
559 | |
560 | int Regexp::NumCaptures() { |
561 | NumCapturesWalker w; |
562 | w.Walk(this, 0); |
563 | return w.ncapture(); |
564 | } |
565 | |
566 | // Walker class to build map of named capture groups and their indices. |
567 | class NamedCapturesWalker : public Regexp::Walker<Ignored> { |
568 | public: |
569 | NamedCapturesWalker() : map_(NULL) {} |
570 | ~NamedCapturesWalker() { delete map_; } |
571 | |
572 | std::map<std::string, int>* TakeMap() { |
573 | std::map<std::string, int>* m = map_; |
574 | map_ = NULL; |
575 | return m; |
576 | } |
577 | |
578 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
579 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
580 | // Allocate map once we find a name. |
581 | if (map_ == NULL) |
582 | map_ = new std::map<std::string, int>; |
583 | |
584 | // Record first occurrence of each name. |
585 | // (The rule is that if you have the same name |
586 | // multiple times, only the leftmost one counts.) |
587 | map_->insert({*re->name(), re->cap()}); |
588 | } |
589 | return ignored; |
590 | } |
591 | |
592 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
593 | // Should never be called: we use Walk(), not WalkExponential(). |
594 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
595 | LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called" ; |
596 | #endif |
597 | return ignored; |
598 | } |
599 | |
600 | private: |
601 | std::map<std::string, int>* map_; |
602 | |
603 | NamedCapturesWalker(const NamedCapturesWalker&) = delete; |
604 | NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; |
605 | }; |
606 | |
607 | std::map<std::string, int>* Regexp::NamedCaptures() { |
608 | NamedCapturesWalker w; |
609 | w.Walk(this, 0); |
610 | return w.TakeMap(); |
611 | } |
612 | |
613 | // Walker class to build map from capture group indices to their names. |
614 | class CaptureNamesWalker : public Regexp::Walker<Ignored> { |
615 | public: |
616 | CaptureNamesWalker() : map_(NULL) {} |
617 | ~CaptureNamesWalker() { delete map_; } |
618 | |
619 | std::map<int, std::string>* TakeMap() { |
620 | std::map<int, std::string>* m = map_; |
621 | map_ = NULL; |
622 | return m; |
623 | } |
624 | |
625 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
626 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
627 | // Allocate map once we find a name. |
628 | if (map_ == NULL) |
629 | map_ = new std::map<int, std::string>; |
630 | |
631 | (*map_)[re->cap()] = *re->name(); |
632 | } |
633 | return ignored; |
634 | } |
635 | |
636 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
637 | // Should never be called: we use Walk(), not WalkExponential(). |
638 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
639 | LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called" ; |
640 | #endif |
641 | return ignored; |
642 | } |
643 | |
644 | private: |
645 | std::map<int, std::string>* map_; |
646 | |
647 | CaptureNamesWalker(const CaptureNamesWalker&) = delete; |
648 | CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; |
649 | }; |
650 | |
651 | std::map<int, std::string>* Regexp::CaptureNames() { |
652 | CaptureNamesWalker w; |
653 | w.Walk(this, 0); |
654 | return w.TakeMap(); |
655 | } |
656 | |
657 | void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, |
658 | std::string* bytes) { |
659 | if (latin1) { |
660 | bytes->resize(nrunes); |
661 | for (int i = 0; i < nrunes; i++) |
662 | (*bytes)[i] = static_cast<char>(runes[i]); |
663 | } else { |
664 | bytes->resize(nrunes * UTFmax); // worst case |
665 | char* p = &(*bytes)[0]; |
666 | for (int i = 0; i < nrunes; i++) |
667 | p += runetochar(p, &runes[i]); |
668 | bytes->resize(p - &(*bytes)[0]); |
669 | bytes->shrink_to_fit(); |
670 | } |
671 | } |
672 | |
673 | // Determines whether regexp matches must be anchored |
674 | // with a fixed string prefix. If so, returns the prefix and |
675 | // the regexp that remains after the prefix. The prefix might |
676 | // be ASCII case-insensitive. |
677 | bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, |
678 | Regexp** suffix) { |
679 | prefix->clear(); |
680 | *foldcase = false; |
681 | *suffix = NULL; |
682 | |
683 | // No need for a walker: the regexp must be of the form |
684 | // 1. some number of ^ anchors |
685 | // 2. a literal char or string |
686 | // 3. the rest |
687 | if (op_ != kRegexpConcat) |
688 | return false; |
689 | int i = 0; |
690 | while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) |
691 | i++; |
692 | if (i == 0 || i >= nsub_) |
693 | return false; |
694 | Regexp* re = sub()[i]; |
695 | if (re->op_ != kRegexpLiteral && |
696 | re->op_ != kRegexpLiteralString) |
697 | return false; |
698 | i++; |
699 | if (i < nsub_) { |
700 | for (int j = i; j < nsub_; j++) |
701 | sub()[j]->Incref(); |
702 | *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); |
703 | } else { |
704 | *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); |
705 | } |
706 | |
707 | bool latin1 = (re->parse_flags() & Latin1) != 0; |
708 | Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; |
709 | int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; |
710 | ConvertRunesToBytes(latin1, runes, nrunes, prefix); |
711 | *foldcase = (re->parse_flags() & FoldCase) != 0; |
712 | return true; |
713 | } |
714 | |
715 | // Determines whether regexp matches must be unanchored |
716 | // with a fixed string prefix. If so, returns the prefix. |
717 | // The prefix might be ASCII case-insensitive. |
718 | bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { |
719 | prefix->clear(); |
720 | *foldcase = false; |
721 | |
722 | // No need for a walker: the regexp must either begin with or be |
723 | // a literal char or string. We "see through" capturing groups, |
724 | // but make no effort to glue multiple prefix fragments together. |
725 | Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; |
726 | while (re->op_ == kRegexpCapture) { |
727 | re = re->sub()[0]; |
728 | if (re->op_ == kRegexpConcat && re->nsub_ > 0) |
729 | re = re->sub()[0]; |
730 | } |
731 | if (re->op_ != kRegexpLiteral && |
732 | re->op_ != kRegexpLiteralString) |
733 | return false; |
734 | |
735 | bool latin1 = (re->parse_flags() & Latin1) != 0; |
736 | Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; |
737 | int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; |
738 | ConvertRunesToBytes(latin1, runes, nrunes, prefix); |
739 | *foldcase = (re->parse_flags() & FoldCase) != 0; |
740 | return true; |
741 | } |
742 | |
743 | // Character class builder is a balanced binary tree (STL set) |
744 | // containing non-overlapping, non-abutting RuneRanges. |
745 | // The less-than operator used in the tree treats two |
746 | // ranges as equal if they overlap at all, so that |
747 | // lookups for a particular Rune are possible. |
748 | |
749 | CharClassBuilder::CharClassBuilder() { |
750 | nrunes_ = 0; |
751 | upper_ = 0; |
752 | lower_ = 0; |
753 | } |
754 | |
755 | // Add lo-hi to the class; return whether class got bigger. |
756 | bool CharClassBuilder::AddRange(Rune lo, Rune hi) { |
757 | if (hi < lo) |
758 | return false; |
759 | |
760 | if (lo <= 'z' && hi >= 'A') { |
761 | // Overlaps some alpha, maybe not all. |
762 | // Update bitmaps telling which ASCII letters are in the set. |
763 | Rune lo1 = std::max<Rune>(lo, 'A'); |
764 | Rune hi1 = std::min<Rune>(hi, 'Z'); |
765 | if (lo1 <= hi1) |
766 | upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); |
767 | |
768 | lo1 = std::max<Rune>(lo, 'a'); |
769 | hi1 = std::min<Rune>(hi, 'z'); |
770 | if (lo1 <= hi1) |
771 | lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); |
772 | } |
773 | |
774 | { // Check whether lo, hi is already in the class. |
775 | iterator it = ranges_.find(RuneRange(lo, lo)); |
776 | if (it != end() && it->lo <= lo && hi <= it->hi) |
777 | return false; |
778 | } |
779 | |
780 | // Look for a range abutting lo on the left. |
781 | // If it exists, take it out and increase our range. |
782 | if (lo > 0) { |
783 | iterator it = ranges_.find(RuneRange(lo-1, lo-1)); |
784 | if (it != end()) { |
785 | lo = it->lo; |
786 | if (it->hi > hi) |
787 | hi = it->hi; |
788 | nrunes_ -= it->hi - it->lo + 1; |
789 | ranges_.erase(it); |
790 | } |
791 | } |
792 | |
793 | // Look for a range abutting hi on the right. |
794 | // If it exists, take it out and increase our range. |
795 | if (hi < Runemax) { |
796 | iterator it = ranges_.find(RuneRange(hi+1, hi+1)); |
797 | if (it != end()) { |
798 | hi = it->hi; |
799 | nrunes_ -= it->hi - it->lo + 1; |
800 | ranges_.erase(it); |
801 | } |
802 | } |
803 | |
804 | // Look for ranges between lo and hi. Take them out. |
805 | // This is only safe because the set has no overlapping ranges. |
806 | // We've already removed any ranges abutting lo and hi, so |
807 | // any that overlap [lo, hi] must be contained within it. |
808 | for (;;) { |
809 | iterator it = ranges_.find(RuneRange(lo, hi)); |
810 | if (it == end()) |
811 | break; |
812 | nrunes_ -= it->hi - it->lo + 1; |
813 | ranges_.erase(it); |
814 | } |
815 | |
816 | // Finally, add [lo, hi]. |
817 | nrunes_ += hi - lo + 1; |
818 | ranges_.insert(RuneRange(lo, hi)); |
819 | return true; |
820 | } |
821 | |
822 | void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { |
823 | for (iterator it = cc->begin(); it != cc->end(); ++it) |
824 | AddRange(it->lo, it->hi); |
825 | } |
826 | |
827 | bool CharClassBuilder::Contains(Rune r) { |
828 | return ranges_.find(RuneRange(r, r)) != end(); |
829 | } |
830 | |
831 | // Does the character class behave the same on A-Z as on a-z? |
832 | bool CharClassBuilder::FoldsASCII() { |
833 | return ((upper_ ^ lower_) & AlphaMask) == 0; |
834 | } |
835 | |
836 | CharClassBuilder* CharClassBuilder::Copy() { |
837 | CharClassBuilder* cc = new CharClassBuilder; |
838 | for (iterator it = begin(); it != end(); ++it) |
839 | cc->ranges_.insert(RuneRange(it->lo, it->hi)); |
840 | cc->upper_ = upper_; |
841 | cc->lower_ = lower_; |
842 | cc->nrunes_ = nrunes_; |
843 | return cc; |
844 | } |
845 | |
846 | |
847 | |
848 | void CharClassBuilder::RemoveAbove(Rune r) { |
849 | if (r >= Runemax) |
850 | return; |
851 | |
852 | if (r < 'z') { |
853 | if (r < 'a') |
854 | lower_ = 0; |
855 | else |
856 | lower_ &= AlphaMask >> ('z' - r); |
857 | } |
858 | |
859 | if (r < 'Z') { |
860 | if (r < 'A') |
861 | upper_ = 0; |
862 | else |
863 | upper_ &= AlphaMask >> ('Z' - r); |
864 | } |
865 | |
866 | for (;;) { |
867 | |
868 | iterator it = ranges_.find(RuneRange(r + 1, Runemax)); |
869 | if (it == end()) |
870 | break; |
871 | RuneRange rr = *it; |
872 | ranges_.erase(it); |
873 | nrunes_ -= rr.hi - rr.lo + 1; |
874 | if (rr.lo <= r) { |
875 | rr.hi = r; |
876 | ranges_.insert(rr); |
877 | nrunes_ += rr.hi - rr.lo + 1; |
878 | } |
879 | } |
880 | } |
881 | |
882 | void CharClassBuilder::Negate() { |
883 | // Build up negation and then copy in. |
884 | // Could edit ranges in place, but C++ won't let me. |
885 | std::vector<RuneRange> v; |
886 | v.reserve(ranges_.size() + 1); |
887 | |
888 | // In negation, first range begins at 0, unless |
889 | // the current class begins at 0. |
890 | iterator it = begin(); |
891 | if (it == end()) { |
892 | v.push_back(RuneRange(0, Runemax)); |
893 | } else { |
894 | int nextlo = 0; |
895 | if (it->lo == 0) { |
896 | nextlo = it->hi + 1; |
897 | ++it; |
898 | } |
899 | for (; it != end(); ++it) { |
900 | v.push_back(RuneRange(nextlo, it->lo - 1)); |
901 | nextlo = it->hi + 1; |
902 | } |
903 | if (nextlo <= Runemax) |
904 | v.push_back(RuneRange(nextlo, Runemax)); |
905 | } |
906 | |
907 | ranges_.clear(); |
908 | for (size_t i = 0; i < v.size(); i++) |
909 | ranges_.insert(v[i]); |
910 | |
911 | upper_ = AlphaMask & ~upper_; |
912 | lower_ = AlphaMask & ~lower_; |
913 | nrunes_ = Runemax+1 - nrunes_; |
914 | } |
915 | |
916 | // Character class is a sorted list of ranges. |
917 | // The ranges are allocated in the same block as the header, |
918 | // necessitating a special allocator and Delete method. |
919 | |
920 | CharClass* CharClass::New(size_t maxranges) { |
921 | CharClass* cc; |
922 | uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; |
923 | cc = reinterpret_cast<CharClass*>(data); |
924 | cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); |
925 | cc->nranges_ = 0; |
926 | cc->folds_ascii_ = false; |
927 | cc->nrunes_ = 0; |
928 | return cc; |
929 | } |
930 | |
931 | void CharClass::Delete() { |
932 | uint8_t* data = reinterpret_cast<uint8_t*>(this); |
933 | delete[] data; |
934 | } |
935 | |
936 | CharClass* CharClass::Negate() { |
937 | CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1)); |
938 | cc->folds_ascii_ = folds_ascii_; |
939 | cc->nrunes_ = Runemax + 1 - nrunes_; |
940 | int n = 0; |
941 | int nextlo = 0; |
942 | for (CharClass::iterator it = begin(); it != end(); ++it) { |
943 | if (it->lo == nextlo) { |
944 | nextlo = it->hi + 1; |
945 | } else { |
946 | cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); |
947 | nextlo = it->hi + 1; |
948 | } |
949 | } |
950 | if (nextlo <= Runemax) |
951 | cc->ranges_[n++] = RuneRange(nextlo, Runemax); |
952 | cc->nranges_ = n; |
953 | return cc; |
954 | } |
955 | |
956 | bool CharClass::Contains(Rune r) const { |
957 | RuneRange* rr = ranges_; |
958 | int n = nranges_; |
959 | while (n > 0) { |
960 | int m = n/2; |
961 | if (rr[m].hi < r) { |
962 | rr += m+1; |
963 | n -= m+1; |
964 | } else if (r < rr[m].lo) { |
965 | n = m; |
966 | } else { // rr[m].lo <= r && r <= rr[m].hi |
967 | return true; |
968 | } |
969 | } |
970 | return false; |
971 | } |
972 | |
973 | CharClass* CharClassBuilder::GetCharClass() { |
974 | CharClass* cc = CharClass::New(ranges_.size()); |
975 | int n = 0; |
976 | for (iterator it = begin(); it != end(); ++it) |
977 | cc->ranges_[n++] = *it; |
978 | cc->nranges_ = n; |
979 | DCHECK_LE(n, static_cast<int>(ranges_.size())); |
980 | cc->nrunes_ = nrunes_; |
981 | cc->folds_ascii_ = FoldsASCII(); |
982 | return cc; |
983 | } |
984 | |
985 | } // namespace re2 |
986 | |