1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "re2/prefilter.h"
6
7#include <stddef.h>
8#include <stdint.h>
9#include <string>
10#include <vector>
11
12#include "absl/strings/str_format.h"
13#include "util/logging.h"
14#include "util/utf.h"
15#include "re2/re2.h"
16#include "re2/unicode_casefold.h"
17#include "re2/walker-inl.h"
18
19namespace re2 {
20
21static const bool ExtraDebug = false;
22
23typedef std::set<std::string>::iterator SSIter;
24typedef std::set<std::string>::const_iterator ConstSSIter;
25
26// Initializes a Prefilter, allocating subs_ as necessary.
27Prefilter::Prefilter(Op op) {
28 op_ = op;
29 subs_ = NULL;
30 if (op_ == AND || op_ == OR)
31 subs_ = new std::vector<Prefilter*>;
32}
33
34// Destroys a Prefilter.
35Prefilter::~Prefilter() {
36 if (subs_) {
37 for (size_t i = 0; i < subs_->size(); i++)
38 delete (*subs_)[i];
39 delete subs_;
40 subs_ = NULL;
41 }
42}
43
44// Simplify if the node is an empty Or or And.
45Prefilter* Prefilter::Simplify() {
46 if (op_ != AND && op_ != OR) {
47 return this;
48 }
49
50 // Nothing left in the AND/OR.
51 if (subs_->empty()) {
52 if (op_ == AND)
53 op_ = ALL; // AND of nothing is true
54 else
55 op_ = NONE; // OR of nothing is false
56
57 return this;
58 }
59
60 // Just one subnode: throw away wrapper.
61 if (subs_->size() == 1) {
62 Prefilter* a = (*subs_)[0];
63 subs_->clear();
64 delete this;
65 return a->Simplify();
66 }
67
68 return this;
69}
70
71// Combines two Prefilters together to create an "op" (AND or OR).
72// The passed Prefilters will be part of the returned Prefilter or deleted.
73// Does lots of work to avoid creating unnecessarily complicated structures.
74Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
75 // If a, b can be rewritten as op, do so.
76 a = a->Simplify();
77 b = b->Simplify();
78
79 // Canonicalize: a->op <= b->op.
80 if (a->op() > b->op()) {
81 Prefilter* t = a;
82 a = b;
83 b = t;
84 }
85
86 // Trivial cases.
87 // ALL AND b = b
88 // NONE OR b = b
89 // ALL OR b = ALL
90 // NONE AND b = NONE
91 // Don't need to look at b, because of canonicalization above.
92 // ALL and NONE are smallest opcodes.
93 if (a->op() == ALL || a->op() == NONE) {
94 if ((a->op() == ALL && op == AND) ||
95 (a->op() == NONE && op == OR)) {
96 delete a;
97 return b;
98 } else {
99 delete b;
100 return a;
101 }
102 }
103
104 // If a and b match op, merge their contents.
105 if (a->op() == op && b->op() == op) {
106 for (size_t i = 0; i < b->subs()->size(); i++) {
107 Prefilter* bb = (*b->subs())[i];
108 a->subs()->push_back(bb);
109 }
110 b->subs()->clear();
111 delete b;
112 return a;
113 }
114
115 // If a already has the same op as the op that is under construction
116 // add in b (similarly if b already has the same op, add in a).
117 if (b->op() == op) {
118 Prefilter* t = a;
119 a = b;
120 b = t;
121 }
122 if (a->op() == op) {
123 a->subs()->push_back(b);
124 return a;
125 }
126
127 // Otherwise just return the op.
128 Prefilter* c = new Prefilter(op);
129 c->subs()->push_back(a);
130 c->subs()->push_back(b);
131 return c;
132}
133
134Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
135 return AndOr(AND, a, b);
136}
137
138Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
139 return AndOr(OR, a, b);
140}
141
142static void SimplifyStringSet(std::set<std::string>* ss) {
143 // Now make sure that the strings aren't redundant. For example, if
144 // we know "ab" is a required string, then it doesn't help at all to
145 // know that "abc" is also a required string, so delete "abc". This
146 // is because, when we are performing a string search to filter
147 // regexps, matching "ab" will already allow this regexp to be a
148 // candidate for match, so further matching "abc" is redundant.
149 // Note that we must ignore "" because find() would find it at the
150 // start of everything and thus we would end up erasing everything.
151 for (SSIter i = ss->begin(); i != ss->end(); ++i) {
152 if (i->empty())
153 continue;
154 SSIter j = i;
155 ++j;
156 while (j != ss->end()) {
157 if (j->find(*i) != std::string::npos) {
158 j = ss->erase(j);
159 continue;
160 }
161 ++j;
162 }
163 }
164}
165
166Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) {
167 Prefilter* or_prefilter = new Prefilter(NONE);
168 SimplifyStringSet(ss);
169 for (SSIter i = ss->begin(); i != ss->end(); ++i)
170 or_prefilter = Or(or_prefilter, FromString(*i));
171 return or_prefilter;
172}
173
174static Rune ToLowerRune(Rune r) {
175 if (r < Runeself) {
176 if ('A' <= r && r <= 'Z')
177 r += 'a' - 'A';
178 return r;
179 }
180
181 const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
182 if (f == NULL || r < f->lo)
183 return r;
184 return ApplyFold(f, r);
185}
186
187static Rune ToLowerRuneLatin1(Rune r) {
188 if ('A' <= r && r <= 'Z')
189 r += 'a' - 'A';
190 return r;
191}
192
193Prefilter* Prefilter::FromString(const std::string& str) {
194 Prefilter* m = new Prefilter(Prefilter::ATOM);
195 m->atom_ = str;
196 return m;
197}
198
199// Information about a regexp used during computation of Prefilter.
200// Can be thought of as information about the set of strings matching
201// the given regular expression.
202class Prefilter::Info {
203 public:
204 Info();
205 ~Info();
206
207 // More constructors. They delete their Info* arguments.
208 static Info* Alt(Info* a, Info* b);
209 static Info* Concat(Info* a, Info* b);
210 static Info* And(Info* a, Info* b);
211 static Info* Star(Info* a);
212 static Info* Plus(Info* a);
213 static Info* Quest(Info* a);
214 static Info* EmptyString();
215 static Info* NoMatch();
216 static Info* AnyCharOrAnyByte();
217 static Info* CClass(CharClass* cc, bool latin1);
218 static Info* Literal(Rune r);
219 static Info* LiteralLatin1(Rune r);
220 static Info* AnyMatch();
221
222 // Format Info as a string.
223 std::string ToString();
224
225 // Caller takes ownership of the Prefilter.
226 Prefilter* TakeMatch();
227
228 std::set<std::string>& exact() { return exact_; }
229
230 bool is_exact() const { return is_exact_; }
231
232 class Walker;
233
234 private:
235 std::set<std::string> exact_;
236
237 // When is_exact_ is true, the strings that match
238 // are placed in exact_. When it is no longer an exact
239 // set of strings that match this RE, then is_exact_
240 // is false and the match_ contains the required match
241 // criteria.
242 bool is_exact_;
243
244 // Accumulated Prefilter query that any
245 // match for this regexp is guaranteed to match.
246 Prefilter* match_;
247};
248
249
250Prefilter::Info::Info()
251 : is_exact_(false),
252 match_(NULL) {
253}
254
255Prefilter::Info::~Info() {
256 delete match_;
257}
258
259Prefilter* Prefilter::Info::TakeMatch() {
260 if (is_exact_) {
261 match_ = Prefilter::OrStrings(&exact_);
262 is_exact_ = false;
263 }
264 Prefilter* m = match_;
265 match_ = NULL;
266 return m;
267}
268
269// Format a Info in string form.
270std::string Prefilter::Info::ToString() {
271 if (is_exact_) {
272 int n = 0;
273 std::string s;
274 for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
275 if (n++ > 0)
276 s += ",";
277 s += *i;
278 }
279 return s;
280 }
281
282 if (match_)
283 return match_->DebugString();
284
285 return "";
286}
287
288// Add the strings from src to dst.
289static void CopyIn(const std::set<std::string>& src,
290 std::set<std::string>* dst) {
291 for (ConstSSIter i = src.begin(); i != src.end(); ++i)
292 dst->insert(*i);
293}
294
295// Add the cross-product of a and b to dst.
296// (For each string i in a and j in b, add i+j.)
297static void CrossProduct(const std::set<std::string>& a,
298 const std::set<std::string>& b,
299 std::set<std::string>* dst) {
300 for (ConstSSIter i = a.begin(); i != a.end(); ++i)
301 for (ConstSSIter j = b.begin(); j != b.end(); ++j)
302 dst->insert(*i + *j);
303}
304
305// Concats a and b. Requires that both are exact sets.
306// Forms an exact set that is a crossproduct of a and b.
307Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
308 if (a == NULL)
309 return b;
310 DCHECK(a->is_exact_);
311 DCHECK(b && b->is_exact_);
312 Info *ab = new Info();
313
314 CrossProduct(a->exact_, b->exact_, &ab->exact_);
315 ab->is_exact_ = true;
316
317 delete a;
318 delete b;
319 return ab;
320}
321
322// Constructs an inexact Info for ab given a and b.
323// Used only when a or b is not exact or when the
324// exact cross product is likely to be too big.
325Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
326 if (a == NULL)
327 return b;
328 if (b == NULL)
329 return a;
330
331 Info *ab = new Info();
332
333 ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
334 ab->is_exact_ = false;
335 delete a;
336 delete b;
337 return ab;
338}
339
340// Constructs Info for a|b given a and b.
341Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
342 Info *ab = new Info();
343
344 if (a->is_exact_ && b->is_exact_) {
345 CopyIn(a->exact_, &ab->exact_);
346 CopyIn(b->exact_, &ab->exact_);
347 ab->is_exact_ = true;
348 } else {
349 // Either a or b has is_exact_ = false. If the other
350 // one has is_exact_ = true, we move it to match_ and
351 // then create a OR of a,b. The resulting Info has
352 // is_exact_ = false.
353 ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
354 ab->is_exact_ = false;
355 }
356
357 delete a;
358 delete b;
359 return ab;
360}
361
362// Constructs Info for a? given a.
363Prefilter::Info* Prefilter::Info::Quest(Info *a) {
364 Info *ab = new Info();
365
366 ab->is_exact_ = false;
367 ab->match_ = new Prefilter(ALL);
368 delete a;
369 return ab;
370}
371
372// Constructs Info for a* given a.
373// Same as a? -- not much to do.
374Prefilter::Info* Prefilter::Info::Star(Info *a) {
375 return Quest(a);
376}
377
378// Constructs Info for a+ given a. If a was exact set, it isn't
379// anymore.
380Prefilter::Info* Prefilter::Info::Plus(Info *a) {
381 Info *ab = new Info();
382
383 ab->match_ = a->TakeMatch();
384 ab->is_exact_ = false;
385
386 delete a;
387 return ab;
388}
389
390static std::string RuneToString(Rune r) {
391 char buf[UTFmax];
392 int n = runetochar(buf, &r);
393 return std::string(buf, n);
394}
395
396static std::string RuneToStringLatin1(Rune r) {
397 char c = r & 0xff;
398 return std::string(&c, 1);
399}
400
401// Constructs Info for literal rune.
402Prefilter::Info* Prefilter::Info::Literal(Rune r) {
403 Info* info = new Info();
404 info->exact_.insert(RuneToString(ToLowerRune(r)));
405 info->is_exact_ = true;
406 return info;
407}
408
409// Constructs Info for literal rune for Latin1 encoded string.
410Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
411 Info* info = new Info();
412 info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
413 info->is_exact_ = true;
414 return info;
415}
416
417// Constructs Info for dot (any character) or \C (any byte).
418Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
419 Prefilter::Info* info = new Prefilter::Info();
420 info->match_ = new Prefilter(ALL);
421 return info;
422}
423
424// Constructs Prefilter::Info for no possible match.
425Prefilter::Info* Prefilter::Info::NoMatch() {
426 Prefilter::Info* info = new Prefilter::Info();
427 info->match_ = new Prefilter(NONE);
428 return info;
429}
430
431// Constructs Prefilter::Info for any possible match.
432// This Prefilter::Info is valid for any regular expression,
433// since it makes no assertions whatsoever about the
434// strings being matched.
435Prefilter::Info* Prefilter::Info::AnyMatch() {
436 Prefilter::Info *info = new Prefilter::Info();
437 info->match_ = new Prefilter(ALL);
438 return info;
439}
440
441// Constructs Prefilter::Info for just the empty string.
442Prefilter::Info* Prefilter::Info::EmptyString() {
443 Prefilter::Info* info = new Prefilter::Info();
444 info->is_exact_ = true;
445 info->exact_.insert("");
446 return info;
447}
448
449// Constructs Prefilter::Info for a character class.
450typedef CharClass::iterator CCIter;
451Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
452 bool latin1) {
453 if (ExtraDebug) {
454 LOG(ERROR) << "CharClassInfo:";
455 for (CCIter i = cc->begin(); i != cc->end(); ++i)
456 LOG(ERROR) << " " << i->lo << "-" << i->hi;
457 }
458
459 // If the class is too large, it's okay to overestimate.
460 if (cc->size() > 10)
461 return AnyCharOrAnyByte();
462
463 Prefilter::Info *a = new Prefilter::Info();
464 for (CCIter i = cc->begin(); i != cc->end(); ++i)
465 for (Rune r = i->lo; r <= i->hi; r++) {
466 if (latin1) {
467 a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
468 } else {
469 a->exact_.insert(RuneToString(ToLowerRune(r)));
470 }
471 }
472
473
474 a->is_exact_ = true;
475
476 if (ExtraDebug)
477 LOG(ERROR) << " = " << a->ToString();
478
479 return a;
480}
481
482class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
483 public:
484 Walker(bool latin1) : latin1_(latin1) {}
485
486 virtual Info* PostVisit(
487 Regexp* re, Info* parent_arg,
488 Info* pre_arg,
489 Info** child_args, int nchild_args);
490
491 virtual Info* ShortVisit(
492 Regexp* re,
493 Info* parent_arg);
494
495 bool latin1() { return latin1_; }
496 private:
497 bool latin1_;
498
499 Walker(const Walker&) = delete;
500 Walker& operator=(const Walker&) = delete;
501};
502
503Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
504 if (ExtraDebug)
505 LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
506
507 bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
508 Prefilter::Info::Walker w(latin1);
509 Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
510
511 if (w.stopped_early()) {
512 delete info;
513 return NULL;
514 }
515
516 return info;
517}
518
519Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
520 Regexp* re, Prefilter::Info* parent_arg) {
521 return AnyMatch();
522}
523
524// Constructs the Prefilter::Info for the given regular expression.
525// Assumes re is simplified.
526Prefilter::Info* Prefilter::Info::Walker::PostVisit(
527 Regexp* re, Prefilter::Info* parent_arg,
528 Prefilter::Info* pre_arg, Prefilter::Info** child_args,
529 int nchild_args) {
530 Prefilter::Info *info;
531 switch (re->op()) {
532 default:
533 case kRegexpRepeat:
534 LOG(DFATAL) << "Bad regexp op " << re->op();
535 info = EmptyString();
536 break;
537
538 case kRegexpNoMatch:
539 info = NoMatch();
540 break;
541
542 // These ops match the empty string:
543 case kRegexpEmptyMatch: // anywhere
544 case kRegexpBeginLine: // at beginning of line
545 case kRegexpEndLine: // at end of line
546 case kRegexpBeginText: // at beginning of text
547 case kRegexpEndText: // at end of text
548 case kRegexpWordBoundary: // at word boundary
549 case kRegexpNoWordBoundary: // not at word boundary
550 info = EmptyString();
551 break;
552
553 case kRegexpLiteral:
554 if (latin1()) {
555 info = LiteralLatin1(re->rune());
556 }
557 else {
558 info = Literal(re->rune());
559 }
560 break;
561
562 case kRegexpLiteralString:
563 if (re->nrunes() == 0) {
564 info = NoMatch();
565 break;
566 }
567 if (latin1()) {
568 info = LiteralLatin1(re->runes()[0]);
569 for (int i = 1; i < re->nrunes(); i++) {
570 info = Concat(info, LiteralLatin1(re->runes()[i]));
571 }
572 } else {
573 info = Literal(re->runes()[0]);
574 for (int i = 1; i < re->nrunes(); i++) {
575 info = Concat(info, Literal(re->runes()[i]));
576 }
577 }
578 break;
579
580 case kRegexpConcat: {
581 // Accumulate in info.
582 // Exact is concat of recent contiguous exact nodes.
583 info = NULL;
584 Info* exact = NULL;
585 for (int i = 0; i < nchild_args; i++) {
586 Info* ci = child_args[i]; // child info
587 if (!ci->is_exact() ||
588 (exact && ci->exact().size() * exact->exact().size() > 16)) {
589 // Exact run is over.
590 info = And(info, exact);
591 exact = NULL;
592 // Add this child's info.
593 info = And(info, ci);
594 } else {
595 // Append to exact run.
596 exact = Concat(exact, ci);
597 }
598 }
599 info = And(info, exact);
600 }
601 break;
602
603 case kRegexpAlternate:
604 info = child_args[0];
605 for (int i = 1; i < nchild_args; i++)
606 info = Alt(info, child_args[i]);
607 break;
608
609 case kRegexpStar:
610 info = Star(child_args[0]);
611 break;
612
613 case kRegexpQuest:
614 info = Quest(child_args[0]);
615 break;
616
617 case kRegexpPlus:
618 info = Plus(child_args[0]);
619 break;
620
621 case kRegexpAnyChar:
622 case kRegexpAnyByte:
623 // Claim nothing, except that it's not empty.
624 info = AnyCharOrAnyByte();
625 break;
626
627 case kRegexpCharClass:
628 info = CClass(re->cc(), latin1());
629 break;
630
631 case kRegexpCapture:
632 // These don't affect the set of matching strings.
633 info = child_args[0];
634 break;
635 }
636
637 if (ExtraDebug)
638 LOG(ERROR) << "BuildInfo " << re->ToString()
639 << ": " << (info ? info->ToString() : "");
640
641 return info;
642}
643
644
645Prefilter* Prefilter::FromRegexp(Regexp* re) {
646 if (re == NULL)
647 return NULL;
648
649 Regexp* simple = re->Simplify();
650 if (simple == NULL)
651 return NULL;
652
653 Prefilter::Info* info = BuildInfo(simple);
654 simple->Decref();
655 if (info == NULL)
656 return NULL;
657
658 Prefilter* m = info->TakeMatch();
659 delete info;
660 return m;
661}
662
663std::string Prefilter::DebugString() const {
664 switch (op_) {
665 default:
666 LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
667 return absl::StrFormat("op%d", op_);
668 case NONE:
669 return "*no-matches*";
670 case ATOM:
671 return atom_;
672 case ALL:
673 return "";
674 case AND: {
675 std::string s = "";
676 for (size_t i = 0; i < subs_->size(); i++) {
677 if (i > 0)
678 s += " ";
679 Prefilter* sub = (*subs_)[i];
680 s += sub ? sub->DebugString() : "<nil>";
681 }
682 return s;
683 }
684 case OR: {
685 std::string s = "(";
686 for (size_t i = 0; i < subs_->size(); i++) {
687 if (i > 0)
688 s += "|";
689 Prefilter* sub = (*subs_)[i];
690 s += sub ? sub->DebugString() : "<nil>";
691 }
692 s += ")";
693 return s;
694 }
695 }
696}
697
698Prefilter* Prefilter::FromRE2(const RE2* re2) {
699 if (re2 == NULL)
700 return NULL;
701
702 Regexp* regexp = re2->Regexp();
703 if (regexp == NULL)
704 return NULL;
705
706 return FromRegexp(regexp);
707}
708
709
710} // namespace re2
711