1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#ifndef RE2_FILTERED_RE2_H_
6#define RE2_FILTERED_RE2_H_
7
8// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
9// It provides a prefilter mechanism that helps in cutting down the
10// number of regexps that need to be actually searched.
11//
12// By design, it does not include a string matching engine. This is to
13// allow the user of the class to use their favorite string matching
14// engine. The overall flow is: Add all the regexps using Add, then
15// Compile the FilteredRE2. Compile returns strings that need to be
16// matched. Note that the returned strings are lowercased and distinct.
17// For applying regexps to a search text, the caller does the string
18// matching using the returned strings. When doing the string match,
19// note that the caller has to do that in a case-insensitive way or
20// on a lowercased version of the search text. Then call FirstMatch
21// or AllMatches with a vector of indices of strings that were found
22// in the text to get the actual regexp matches.
23
24#include <memory>
25#include <string>
26#include <vector>
27
28#include "absl/strings/string_view.h"
29#include "re2/re2.h"
30
31namespace re2 {
32
33class PrefilterTree;
34
35class FilteredRE2 {
36 public:
37 FilteredRE2();
38 explicit FilteredRE2(int min_atom_len);
39 ~FilteredRE2();
40
41 // Not copyable.
42 FilteredRE2(const FilteredRE2&) = delete;
43 FilteredRE2& operator=(const FilteredRE2&) = delete;
44 // Movable.
45 FilteredRE2(FilteredRE2&& other);
46 FilteredRE2& operator=(FilteredRE2&& other);
47
48 // Uses RE2 constructor to create a RE2 object (re). Returns
49 // re->error_code(). If error_code is other than NoError, then re is
50 // deleted and not added to re2_vec_.
51 RE2::ErrorCode Add(absl::string_view pattern,
52 const RE2::Options& options,
53 int* id);
54
55 // Prepares the regexps added by Add for filtering. Returns a set
56 // of strings that the caller should check for in candidate texts.
57 // The returned strings are lowercased and distinct. When doing
58 // string matching, it should be performed in a case-insensitive
59 // way or the search text should be lowercased first. Call after
60 // all Add calls are done.
61 void Compile(std::vector<std::string>* strings_to_match);
62
63 // Returns the index of the first matching regexp.
64 // Returns -1 on no match. Can be called prior to Compile.
65 // Does not do any filtering: simply tries to Match the
66 // regexps in a loop.
67 int SlowFirstMatch(absl::string_view text) const;
68
69 // Returns the index of the first matching regexp.
70 // Returns -1 on no match. Compile has to be called before
71 // calling this.
72 int FirstMatch(absl::string_view text,
73 const std::vector<int>& atoms) const;
74
75 // Returns the indices of all matching regexps, after first clearing
76 // matched_regexps.
77 bool AllMatches(absl::string_view text,
78 const std::vector<int>& atoms,
79 std::vector<int>* matching_regexps) const;
80
81 // Returns the indices of all potentially matching regexps after first
82 // clearing potential_regexps.
83 // A regexp is potentially matching if it passes the filter.
84 // If a regexp passes the filter it may still not match.
85 // A regexp that does not pass the filter is guaranteed to not match.
86 void AllPotentials(const std::vector<int>& atoms,
87 std::vector<int>* potential_regexps) const;
88
89 // The number of regexps added.
90 int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
91
92 // Get the individual RE2 objects.
93 const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
94
95 private:
96 // Print prefilter.
97 void PrintPrefilter(int regexpid);
98
99 // Useful for testing and debugging.
100 void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
101 std::vector<int>* passed_regexps);
102
103 // All the regexps in the FilteredRE2.
104 std::vector<RE2*> re2_vec_;
105
106 // Has the FilteredRE2 been compiled using Compile()
107 bool compiled_;
108
109 // An AND-OR tree of string atoms used for filtering regexps.
110 std::unique_ptr<PrefilterTree> prefilter_tree_;
111};
112
113} // namespace re2
114
115#endif // RE2_FILTERED_RE2_H_
116