1// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression representation.
6// Tested by parse_test.cc
7
8#include "re2/regexp.h"
9
10#include <stddef.h>
11#include <stdint.h>
12#include <string.h>
13#include <algorithm>
14#include <map>
15#include <string>
16#include <vector>
17
18#include "absl/base/call_once.h"
19#include "absl/base/macros.h"
20#include "absl/synchronization/mutex.h"
21#include "util/logging.h"
22#include "util/utf.h"
23#include "re2/pod_array.h"
24#include "re2/walker-inl.h"
25
26namespace re2 {
27
28// Constructor. Allocates vectors as appropriate for operator.
29Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
30 : op_(static_cast<uint8_t>(op)),
31 simple_(false),
32 parse_flags_(static_cast<uint16_t>(parse_flags)),
33 ref_(1),
34 nsub_(0),
35 down_(NULL) {
36 subone_ = NULL;
37 memset(the_union_, 0, sizeof the_union_);
38}
39
40// Destructor. Assumes already cleaned up children.
41// Private: use Decref() instead of delete to destroy Regexps.
42// Can't call Decref on the sub-Regexps here because
43// that could cause arbitrarily deep recursion, so
44// required Decref() to have handled them for us.
45Regexp::~Regexp() {
46 if (nsub_ > 0)
47 LOG(DFATAL) << "Regexp not destroyed.";
48
49 switch (op_) {
50 default:
51 break;
52 case kRegexpCapture:
53 delete name_;
54 break;
55 case kRegexpLiteralString:
56 delete[] runes_;
57 break;
58 case kRegexpCharClass:
59 if (cc_)
60 cc_->Delete();
61 delete ccb_;
62 break;
63 }
64}
65
66// If it's possible to destroy this regexp without recurring,
67// do so and return true. Else return false.
68bool Regexp::QuickDestroy() {
69 if (nsub_ == 0) {
70 delete this;
71 return true;
72 }
73 return false;
74}
75
76// Lazily allocated.
77static absl::Mutex* ref_mutex;
78static std::map<Regexp*, int>* ref_map;
79
80int Regexp::Ref() {
81 if (ref_ < kMaxRef)
82 return ref_;
83
84 absl::MutexLock l(ref_mutex);
85 return (*ref_map)[this];
86}
87
88// Increments reference count, returns object as convenience.
89Regexp* Regexp::Incref() {
90 if (ref_ >= kMaxRef-1) {
91 static absl::once_flag ref_once;
92 absl::call_once(ref_once, []() {
93 ref_mutex = new absl::Mutex;
94 ref_map = new std::map<Regexp*, int>;
95 });
96
97 // Store ref count in overflow map.
98 absl::MutexLock l(ref_mutex);
99 if (ref_ == kMaxRef) {
100 // already overflowed
101 (*ref_map)[this]++;
102 } else {
103 // overflowing now
104 (*ref_map)[this] = kMaxRef;
105 ref_ = kMaxRef;
106 }
107 return this;
108 }
109
110 ref_++;
111 return this;
112}
113
114// Decrements reference count and deletes this object if count reaches 0.
115void Regexp::Decref() {
116 if (ref_ == kMaxRef) {
117 // Ref count is stored in overflow map.
118 absl::MutexLock l(ref_mutex);
119 int r = (*ref_map)[this] - 1;
120 if (r < kMaxRef) {
121 ref_ = static_cast<uint16_t>(r);
122 ref_map->erase(this);
123 } else {
124 (*ref_map)[this] = r;
125 }
126 return;
127 }
128 ref_--;
129 if (ref_ == 0)
130 Destroy();
131}
132
133// Deletes this object; ref count has count reached 0.
134void Regexp::Destroy() {
135 if (QuickDestroy())
136 return;
137
138 // Handle recursive Destroy with explicit stack
139 // to avoid arbitrarily deep recursion on process stack [sigh].
140 down_ = NULL;
141 Regexp* stack = this;
142 while (stack != NULL) {
143 Regexp* re = stack;
144 stack = re->down_;
145 if (re->ref_ != 0)
146 LOG(DFATAL) << "Bad reference count " << re->ref_;
147 if (re->nsub_ > 0) {
148 Regexp** subs = re->sub();
149 for (int i = 0; i < re->nsub_; i++) {
150 Regexp* sub = subs[i];
151 if (sub == NULL)
152 continue;
153 if (sub->ref_ == kMaxRef)
154 sub->Decref();
155 else
156 --sub->ref_;
157 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
158 sub->down_ = stack;
159 stack = sub;
160 }
161 }
162 if (re->nsub_ > 1)
163 delete[] subs;
164 re->nsub_ = 0;
165 }
166 delete re;
167 }
168}
169
170void Regexp::AddRuneToString(Rune r) {
171 DCHECK(op_ == kRegexpLiteralString);
172 if (nrunes_ == 0) {
173 // start with 8
174 runes_ = new Rune[8];
175 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
176 // double on powers of two
177 Rune *old = runes_;
178 runes_ = new Rune[nrunes_ * 2];
179 for (int i = 0; i < nrunes_; i++)
180 runes_[i] = old[i];
181 delete[] old;
182 }
183
184 runes_[nrunes_++] = r;
185}
186
187Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
189 re->match_id_ = match_id;
190 return re;
191}
192
193Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194 // Squash **, ++ and ??.
195 if (op == sub->op() && flags == sub->parse_flags())
196 return sub;
197
198 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
199 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
200 if ((sub->op() == kRegexpStar ||
201 sub->op() == kRegexpPlus ||
202 sub->op() == kRegexpQuest) &&
203 flags == sub->parse_flags()) {
204 // If sub is Star, no need to rewrite it.
205 if (sub->op() == kRegexpStar)
206 return sub;
207
208 // Rewrite sub to Star.
209 Regexp* re = new Regexp(kRegexpStar, flags);
210 re->AllocSub(1);
211 re->sub()[0] = sub->sub()[0]->Incref();
212 sub->Decref(); // We didn't consume the reference after all.
213 return re;
214 }
215
216 Regexp* re = new Regexp(op, flags);
217 re->AllocSub(1);
218 re->sub()[0] = sub;
219 return re;
220}
221
222Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223 return StarPlusOrQuest(kRegexpPlus, sub, flags);
224}
225
226Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
227 return StarPlusOrQuest(kRegexpStar, sub, flags);
228}
229
230Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
231 return StarPlusOrQuest(kRegexpQuest, sub, flags);
232}
233
234Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235 ParseFlags flags, bool can_factor) {
236 if (nsub == 1)
237 return sub[0];
238
239 if (nsub == 0) {
240 if (op == kRegexpAlternate)
241 return new Regexp(kRegexpNoMatch, flags);
242 else
243 return new Regexp(kRegexpEmptyMatch, flags);
244 }
245
246 PODArray<Regexp*> subcopy;
247 if (op == kRegexpAlternate && can_factor) {
248 // Going to edit sub; make a copy so we don't step on caller.
249 subcopy = PODArray<Regexp*>(nsub);
250 memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
251 sub = subcopy.data();
252 nsub = FactorAlternation(sub, nsub, flags);
253 if (nsub == 1) {
254 Regexp* re = sub[0];
255 return re;
256 }
257 }
258
259 if (nsub > kMaxNsub) {
260 // Too many subexpressions to fit in a single Regexp.
261 // Make a two-level tree. Two levels gets us to 65535^2.
262 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
263 Regexp* re = new Regexp(op, flags);
264 re->AllocSub(nbigsub);
265 Regexp** subs = re->sub();
266 for (int i = 0; i < nbigsub - 1; i++)
267 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
268 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
269 nsub - (nbigsub-1)*kMaxNsub, flags,
270 false);
271 return re;
272 }
273
274 Regexp* re = new Regexp(op, flags);
275 re->AllocSub(nsub);
276 Regexp** subs = re->sub();
277 for (int i = 0; i < nsub; i++)
278 subs[i] = sub[i];
279 return re;
280}
281
282Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
283 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
284}
285
286Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
287 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
288}
289
290Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
291 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
292}
293
294Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
295 Regexp* re = new Regexp(kRegexpCapture, flags);
296 re->AllocSub(1);
297 re->sub()[0] = sub;
298 re->cap_ = cap;
299 return re;
300}
301
302Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
303 Regexp* re = new Regexp(kRegexpRepeat, flags);
304 re->AllocSub(1);
305 re->sub()[0] = sub;
306 re->min_ = min;
307 re->max_ = max;
308 return re;
309}
310
311Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
312 Regexp* re = new Regexp(kRegexpLiteral, flags);
313 re->rune_ = rune;
314 return re;
315}
316
317Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
318 if (nrunes <= 0)
319 return new Regexp(kRegexpEmptyMatch, flags);
320 if (nrunes == 1)
321 return NewLiteral(runes[0], flags);
322 Regexp* re = new Regexp(kRegexpLiteralString, flags);
323 for (int i = 0; i < nrunes; i++)
324 re->AddRuneToString(runes[i]);
325 return re;
326}
327
328Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
329 Regexp* re = new Regexp(kRegexpCharClass, flags);
330 re->cc_ = cc;
331 return re;
332}
333
334void Regexp::Swap(Regexp* that) {
335 // Regexp is not trivially copyable, so we cannot freely copy it with
336 // memmove(3), but swapping objects like so is safe for our purposes.
337 char tmp[sizeof *this];
338 void* vthis = reinterpret_cast<void*>(this);
339 void* vthat = reinterpret_cast<void*>(that);
340 memmove(tmp, vthis, sizeof *this);
341 memmove(vthis, vthat, sizeof *this);
342 memmove(vthat, tmp, sizeof *this);
343}
344
345// Tests equality of all top-level structure but not subregexps.
346static bool TopEqual(Regexp* a, Regexp* b) {
347 if (a->op() != b->op())
348 return false;
349
350 switch (a->op()) {
351 case kRegexpNoMatch:
352 case kRegexpEmptyMatch:
353 case kRegexpAnyChar:
354 case kRegexpAnyByte:
355 case kRegexpBeginLine:
356 case kRegexpEndLine:
357 case kRegexpWordBoundary:
358 case kRegexpNoWordBoundary:
359 case kRegexpBeginText:
360 return true;
361
362 case kRegexpEndText:
363 // The parse flags remember whether it's \z or (?-m:$),
364 // which matters when testing against PCRE.
365 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
366
367 case kRegexpLiteral:
368 return a->rune() == b->rune() &&
369 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
370
371 case kRegexpLiteralString:
372 return a->nrunes() == b->nrunes() &&
373 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
374 memcmp(a->runes(), b->runes(),
375 a->nrunes() * sizeof a->runes()[0]) == 0;
376
377 case kRegexpAlternate:
378 case kRegexpConcat:
379 return a->nsub() == b->nsub();
380
381 case kRegexpStar:
382 case kRegexpPlus:
383 case kRegexpQuest:
384 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
385
386 case kRegexpRepeat:
387 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
388 a->min() == b->min() &&
389 a->max() == b->max();
390
391 case kRegexpCapture:
392 return a->cap() == b->cap() && a->name() == b->name();
393
394 case kRegexpHaveMatch:
395 return a->match_id() == b->match_id();
396
397 case kRegexpCharClass: {
398 CharClass* acc = a->cc();
399 CharClass* bcc = b->cc();
400 return acc->size() == bcc->size() &&
401 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
402 memcmp(acc->begin(), bcc->begin(),
403 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
404 }
405 }
406
407 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
408 return 0;
409}
410
411bool Regexp::Equal(Regexp* a, Regexp* b) {
412 if (a == NULL || b == NULL)
413 return a == b;
414
415 if (!TopEqual(a, b))
416 return false;
417
418 // Fast path:
419 // return without allocating vector if there are no subregexps.
420 switch (a->op()) {
421 case kRegexpAlternate:
422 case kRegexpConcat:
423 case kRegexpStar:
424 case kRegexpPlus:
425 case kRegexpQuest:
426 case kRegexpRepeat:
427 case kRegexpCapture:
428 break;
429
430 default:
431 return true;
432 }
433
434 // Committed to doing real work.
435 // The stack (vector) has pairs of regexps waiting to
436 // be compared. The regexps are only equal if
437 // all the pairs end up being equal.
438 std::vector<Regexp*> stk;
439
440 for (;;) {
441 // Invariant: TopEqual(a, b) == true.
442 Regexp* a2;
443 Regexp* b2;
444 switch (a->op()) {
445 default:
446 break;
447 case kRegexpAlternate:
448 case kRegexpConcat:
449 for (int i = 0; i < a->nsub(); i++) {
450 a2 = a->sub()[i];
451 b2 = b->sub()[i];
452 if (!TopEqual(a2, b2))
453 return false;
454 stk.push_back(a2);
455 stk.push_back(b2);
456 }
457 break;
458
459 case kRegexpStar:
460 case kRegexpPlus:
461 case kRegexpQuest:
462 case kRegexpRepeat:
463 case kRegexpCapture:
464 a2 = a->sub()[0];
465 b2 = b->sub()[0];
466 if (!TopEqual(a2, b2))
467 return false;
468 // Really:
469 // stk.push_back(a2);
470 // stk.push_back(b2);
471 // break;
472 // but faster to assign directly and loop.
473 a = a2;
474 b = b2;
475 continue;
476 }
477
478 size_t n = stk.size();
479 if (n == 0)
480 break;
481
482 DCHECK_GE(n, 2);
483 a = stk[n-2];
484 b = stk[n-1];
485 stk.resize(n-2);
486 }
487
488 return true;
489}
490
491// Keep in sync with enum RegexpStatusCode in regexp.h
492static const char *kErrorStrings[] = {
493 "no error",
494 "unexpected error",
495 "invalid escape sequence",
496 "invalid character class",
497 "invalid character class range",
498 "missing ]",
499 "missing )",
500 "unexpected )",
501 "trailing \\",
502 "no argument for repetition operator",
503 "invalid repetition size",
504 "bad repetition operator",
505 "invalid perl operator",
506 "invalid UTF-8",
507 "invalid named capture group",
508};
509
510std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
511 if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
512 code = kRegexpInternalError;
513 return kErrorStrings[code];
514}
515
516std::string RegexpStatus::Text() const {
517 if (error_arg_.empty())
518 return CodeText(code_);
519 std::string s;
520 s.append(CodeText(code_));
521 s.append(": ");
522 s.append(error_arg_.data(), error_arg_.size());
523 return s;
524}
525
526void RegexpStatus::Copy(const RegexpStatus& status) {
527 code_ = status.code_;
528 error_arg_ = status.error_arg_;
529}
530
531typedef int Ignored; // Walker<void> doesn't exist
532
533// Walker subclass to count capturing parens in regexp.
534class NumCapturesWalker : public Regexp::Walker<Ignored> {
535 public:
536 NumCapturesWalker() : ncapture_(0) {}
537 int ncapture() { return ncapture_; }
538
539 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
540 if (re->op() == kRegexpCapture)
541 ncapture_++;
542 return ignored;
543 }
544
545 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546 // Should never be called: we use Walk(), not WalkExponential().
547#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
548 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
549#endif
550 return ignored;
551 }
552
553 private:
554 int ncapture_;
555
556 NumCapturesWalker(const NumCapturesWalker&) = delete;
557 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558};
559
560int Regexp::NumCaptures() {
561 NumCapturesWalker w;
562 w.Walk(this, 0);
563 return w.ncapture();
564}
565
566// Walker class to build map of named capture groups and their indices.
567class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568 public:
569 NamedCapturesWalker() : map_(NULL) {}
570 ~NamedCapturesWalker() { delete map_; }
571
572 std::map<std::string, int>* TakeMap() {
573 std::map<std::string, int>* m = map_;
574 map_ = NULL;
575 return m;
576 }
577
578 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579 if (re->op() == kRegexpCapture && re->name() != NULL) {
580 // Allocate map once we find a name.
581 if (map_ == NULL)
582 map_ = new std::map<std::string, int>;
583
584 // Record first occurrence of each name.
585 // (The rule is that if you have the same name
586 // multiple times, only the leftmost one counts.)
587 map_->insert({*re->name(), re->cap()});
588 }
589 return ignored;
590 }
591
592 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
593 // Should never be called: we use Walk(), not WalkExponential().
594#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
595 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596#endif
597 return ignored;
598 }
599
600 private:
601 std::map<std::string, int>* map_;
602
603 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
604 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
605};
606
607std::map<std::string, int>* Regexp::NamedCaptures() {
608 NamedCapturesWalker w;
609 w.Walk(this, 0);
610 return w.TakeMap();
611}
612
613// Walker class to build map from capture group indices to their names.
614class CaptureNamesWalker : public Regexp::Walker<Ignored> {
615 public:
616 CaptureNamesWalker() : map_(NULL) {}
617 ~CaptureNamesWalker() { delete map_; }
618
619 std::map<int, std::string>* TakeMap() {
620 std::map<int, std::string>* m = map_;
621 map_ = NULL;
622 return m;
623 }
624
625 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
626 if (re->op() == kRegexpCapture && re->name() != NULL) {
627 // Allocate map once we find a name.
628 if (map_ == NULL)
629 map_ = new std::map<int, std::string>;
630
631 (*map_)[re->cap()] = *re->name();
632 }
633 return ignored;
634 }
635
636 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
637 // Should never be called: we use Walk(), not WalkExponential().
638#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
639 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
640#endif
641 return ignored;
642 }
643
644 private:
645 std::map<int, std::string>* map_;
646
647 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
648 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
649};
650
651std::map<int, std::string>* Regexp::CaptureNames() {
652 CaptureNamesWalker w;
653 w.Walk(this, 0);
654 return w.TakeMap();
655}
656
657void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
658 std::string* bytes) {
659 if (latin1) {
660 bytes->resize(nrunes);
661 for (int i = 0; i < nrunes; i++)
662 (*bytes)[i] = static_cast<char>(runes[i]);
663 } else {
664 bytes->resize(nrunes * UTFmax); // worst case
665 char* p = &(*bytes)[0];
666 for (int i = 0; i < nrunes; i++)
667 p += runetochar(p, &runes[i]);
668 bytes->resize(p - &(*bytes)[0]);
669 bytes->shrink_to_fit();
670 }
671}
672
673// Determines whether regexp matches must be anchored
674// with a fixed string prefix. If so, returns the prefix and
675// the regexp that remains after the prefix. The prefix might
676// be ASCII case-insensitive.
677bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
678 Regexp** suffix) {
679 prefix->clear();
680 *foldcase = false;
681 *suffix = NULL;
682
683 // No need for a walker: the regexp must be of the form
684 // 1. some number of ^ anchors
685 // 2. a literal char or string
686 // 3. the rest
687 if (op_ != kRegexpConcat)
688 return false;
689 int i = 0;
690 while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
691 i++;
692 if (i == 0 || i >= nsub_)
693 return false;
694 Regexp* re = sub()[i];
695 if (re->op_ != kRegexpLiteral &&
696 re->op_ != kRegexpLiteralString)
697 return false;
698 i++;
699 if (i < nsub_) {
700 for (int j = i; j < nsub_; j++)
701 sub()[j]->Incref();
702 *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
703 } else {
704 *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
705 }
706
707 bool latin1 = (re->parse_flags() & Latin1) != 0;
708 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
709 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
710 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
711 *foldcase = (re->parse_flags() & FoldCase) != 0;
712 return true;
713}
714
715// Determines whether regexp matches must be unanchored
716// with a fixed string prefix. If so, returns the prefix.
717// The prefix might be ASCII case-insensitive.
718bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
719 prefix->clear();
720 *foldcase = false;
721
722 // No need for a walker: the regexp must either begin with or be
723 // a literal char or string. We "see through" capturing groups,
724 // but make no effort to glue multiple prefix fragments together.
725 Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
726 while (re->op_ == kRegexpCapture) {
727 re = re->sub()[0];
728 if (re->op_ == kRegexpConcat && re->nsub_ > 0)
729 re = re->sub()[0];
730 }
731 if (re->op_ != kRegexpLiteral &&
732 re->op_ != kRegexpLiteralString)
733 return false;
734
735 bool latin1 = (re->parse_flags() & Latin1) != 0;
736 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
737 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
738 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
739 *foldcase = (re->parse_flags() & FoldCase) != 0;
740 return true;
741}
742
743// Character class builder is a balanced binary tree (STL set)
744// containing non-overlapping, non-abutting RuneRanges.
745// The less-than operator used in the tree treats two
746// ranges as equal if they overlap at all, so that
747// lookups for a particular Rune are possible.
748
749CharClassBuilder::CharClassBuilder() {
750 nrunes_ = 0;
751 upper_ = 0;
752 lower_ = 0;
753}
754
755// Add lo-hi to the class; return whether class got bigger.
756bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
757 if (hi < lo)
758 return false;
759
760 if (lo <= 'z' && hi >= 'A') {
761 // Overlaps some alpha, maybe not all.
762 // Update bitmaps telling which ASCII letters are in the set.
763 Rune lo1 = std::max<Rune>(lo, 'A');
764 Rune hi1 = std::min<Rune>(hi, 'Z');
765 if (lo1 <= hi1)
766 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
767
768 lo1 = std::max<Rune>(lo, 'a');
769 hi1 = std::min<Rune>(hi, 'z');
770 if (lo1 <= hi1)
771 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
772 }
773
774 { // Check whether lo, hi is already in the class.
775 iterator it = ranges_.find(RuneRange(lo, lo));
776 if (it != end() && it->lo <= lo && hi <= it->hi)
777 return false;
778 }
779
780 // Look for a range abutting lo on the left.
781 // If it exists, take it out and increase our range.
782 if (lo > 0) {
783 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
784 if (it != end()) {
785 lo = it->lo;
786 if (it->hi > hi)
787 hi = it->hi;
788 nrunes_ -= it->hi - it->lo + 1;
789 ranges_.erase(it);
790 }
791 }
792
793 // Look for a range abutting hi on the right.
794 // If it exists, take it out and increase our range.
795 if (hi < Runemax) {
796 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
797 if (it != end()) {
798 hi = it->hi;
799 nrunes_ -= it->hi - it->lo + 1;
800 ranges_.erase(it);
801 }
802 }
803
804 // Look for ranges between lo and hi. Take them out.
805 // This is only safe because the set has no overlapping ranges.
806 // We've already removed any ranges abutting lo and hi, so
807 // any that overlap [lo, hi] must be contained within it.
808 for (;;) {
809 iterator it = ranges_.find(RuneRange(lo, hi));
810 if (it == end())
811 break;
812 nrunes_ -= it->hi - it->lo + 1;
813 ranges_.erase(it);
814 }
815
816 // Finally, add [lo, hi].
817 nrunes_ += hi - lo + 1;
818 ranges_.insert(RuneRange(lo, hi));
819 return true;
820}
821
822void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
823 for (iterator it = cc->begin(); it != cc->end(); ++it)
824 AddRange(it->lo, it->hi);
825}
826
827bool CharClassBuilder::Contains(Rune r) {
828 return ranges_.find(RuneRange(r, r)) != end();
829}
830
831// Does the character class behave the same on A-Z as on a-z?
832bool CharClassBuilder::FoldsASCII() {
833 return ((upper_ ^ lower_) & AlphaMask) == 0;
834}
835
836CharClassBuilder* CharClassBuilder::Copy() {
837 CharClassBuilder* cc = new CharClassBuilder;
838 for (iterator it = begin(); it != end(); ++it)
839 cc->ranges_.insert(RuneRange(it->lo, it->hi));
840 cc->upper_ = upper_;
841 cc->lower_ = lower_;
842 cc->nrunes_ = nrunes_;
843 return cc;
844}
845
846
847
848void CharClassBuilder::RemoveAbove(Rune r) {
849 if (r >= Runemax)
850 return;
851
852 if (r < 'z') {
853 if (r < 'a')
854 lower_ = 0;
855 else
856 lower_ &= AlphaMask >> ('z' - r);
857 }
858
859 if (r < 'Z') {
860 if (r < 'A')
861 upper_ = 0;
862 else
863 upper_ &= AlphaMask >> ('Z' - r);
864 }
865
866 for (;;) {
867
868 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
869 if (it == end())
870 break;
871 RuneRange rr = *it;
872 ranges_.erase(it);
873 nrunes_ -= rr.hi - rr.lo + 1;
874 if (rr.lo <= r) {
875 rr.hi = r;
876 ranges_.insert(rr);
877 nrunes_ += rr.hi - rr.lo + 1;
878 }
879 }
880}
881
882void CharClassBuilder::Negate() {
883 // Build up negation and then copy in.
884 // Could edit ranges in place, but C++ won't let me.
885 std::vector<RuneRange> v;
886 v.reserve(ranges_.size() + 1);
887
888 // In negation, first range begins at 0, unless
889 // the current class begins at 0.
890 iterator it = begin();
891 if (it == end()) {
892 v.push_back(RuneRange(0, Runemax));
893 } else {
894 int nextlo = 0;
895 if (it->lo == 0) {
896 nextlo = it->hi + 1;
897 ++it;
898 }
899 for (; it != end(); ++it) {
900 v.push_back(RuneRange(nextlo, it->lo - 1));
901 nextlo = it->hi + 1;
902 }
903 if (nextlo <= Runemax)
904 v.push_back(RuneRange(nextlo, Runemax));
905 }
906
907 ranges_.clear();
908 for (size_t i = 0; i < v.size(); i++)
909 ranges_.insert(v[i]);
910
911 upper_ = AlphaMask & ~upper_;
912 lower_ = AlphaMask & ~lower_;
913 nrunes_ = Runemax+1 - nrunes_;
914}
915
916// Character class is a sorted list of ranges.
917// The ranges are allocated in the same block as the header,
918// necessitating a special allocator and Delete method.
919
920CharClass* CharClass::New(size_t maxranges) {
921 CharClass* cc;
922 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
923 cc = reinterpret_cast<CharClass*>(data);
924 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
925 cc->nranges_ = 0;
926 cc->folds_ascii_ = false;
927 cc->nrunes_ = 0;
928 return cc;
929}
930
931void CharClass::Delete() {
932 uint8_t* data = reinterpret_cast<uint8_t*>(this);
933 delete[] data;
934}
935
936CharClass* CharClass::Negate() {
937 CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
938 cc->folds_ascii_ = folds_ascii_;
939 cc->nrunes_ = Runemax + 1 - nrunes_;
940 int n = 0;
941 int nextlo = 0;
942 for (CharClass::iterator it = begin(); it != end(); ++it) {
943 if (it->lo == nextlo) {
944 nextlo = it->hi + 1;
945 } else {
946 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
947 nextlo = it->hi + 1;
948 }
949 }
950 if (nextlo <= Runemax)
951 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
952 cc->nranges_ = n;
953 return cc;
954}
955
956bool CharClass::Contains(Rune r) const {
957 RuneRange* rr = ranges_;
958 int n = nranges_;
959 while (n > 0) {
960 int m = n/2;
961 if (rr[m].hi < r) {
962 rr += m+1;
963 n -= m+1;
964 } else if (r < rr[m].lo) {
965 n = m;
966 } else { // rr[m].lo <= r && r <= rr[m].hi
967 return true;
968 }
969 }
970 return false;
971}
972
973CharClass* CharClassBuilder::GetCharClass() {
974 CharClass* cc = CharClass::New(ranges_.size());
975 int n = 0;
976 for (iterator it = begin(); it != end(); ++it)
977 cc->ranges_[n++] = *it;
978 cc->nranges_ = n;
979 DCHECK_LE(n, static_cast<int>(ranges_.size()));
980 cc->nrunes_ = nrunes_;
981 cc->folds_ascii_ = FoldsASCII();
982 return cc;
983}
984
985} // namespace re2
986