1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #ifndef RE2_FILTERED_RE2_H_ |
6 | #define RE2_FILTERED_RE2_H_ |
7 | |
8 | // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. |
9 | // It provides a prefilter mechanism that helps in cutting down the |
10 | // number of regexps that need to be actually searched. |
11 | // |
12 | // By design, it does not include a string matching engine. This is to |
13 | // allow the user of the class to use their favorite string matching |
14 | // engine. The overall flow is: Add all the regexps using Add, then |
15 | // Compile the FilteredRE2. Compile returns strings that need to be |
16 | // matched. Note that the returned strings are lowercased and distinct. |
17 | // For applying regexps to a search text, the caller does the string |
18 | // matching using the returned strings. When doing the string match, |
19 | // note that the caller has to do that in a case-insensitive way or |
20 | // on a lowercased version of the search text. Then call FirstMatch |
21 | // or AllMatches with a vector of indices of strings that were found |
22 | // in the text to get the actual regexp matches. |
23 | |
24 | #include <memory> |
25 | #include <string> |
26 | #include <vector> |
27 | |
28 | #include "absl/strings/string_view.h" |
29 | #include "re2/re2.h" |
30 | |
31 | namespace re2 { |
32 | |
33 | class PrefilterTree; |
34 | |
35 | class FilteredRE2 { |
36 | public: |
37 | FilteredRE2(); |
38 | explicit FilteredRE2(int min_atom_len); |
39 | ~FilteredRE2(); |
40 | |
41 | // Not copyable. |
42 | FilteredRE2(const FilteredRE2&) = delete; |
43 | FilteredRE2& operator=(const FilteredRE2&) = delete; |
44 | // Movable. |
45 | FilteredRE2(FilteredRE2&& other); |
46 | FilteredRE2& operator=(FilteredRE2&& other); |
47 | |
48 | // Uses RE2 constructor to create a RE2 object (re). Returns |
49 | // re->error_code(). If error_code is other than NoError, then re is |
50 | // deleted and not added to re2_vec_. |
51 | RE2::ErrorCode Add(absl::string_view pattern, |
52 | const RE2::Options& options, |
53 | int* id); |
54 | |
55 | // Prepares the regexps added by Add for filtering. Returns a set |
56 | // of strings that the caller should check for in candidate texts. |
57 | // The returned strings are lowercased and distinct. When doing |
58 | // string matching, it should be performed in a case-insensitive |
59 | // way or the search text should be lowercased first. Call after |
60 | // all Add calls are done. |
61 | void Compile(std::vector<std::string>* strings_to_match); |
62 | |
63 | // Returns the index of the first matching regexp. |
64 | // Returns -1 on no match. Can be called prior to Compile. |
65 | // Does not do any filtering: simply tries to Match the |
66 | // regexps in a loop. |
67 | int SlowFirstMatch(absl::string_view text) const; |
68 | |
69 | // Returns the index of the first matching regexp. |
70 | // Returns -1 on no match. Compile has to be called before |
71 | // calling this. |
72 | int FirstMatch(absl::string_view text, |
73 | const std::vector<int>& atoms) const; |
74 | |
75 | // Returns the indices of all matching regexps, after first clearing |
76 | // matched_regexps. |
77 | bool AllMatches(absl::string_view text, |
78 | const std::vector<int>& atoms, |
79 | std::vector<int>* matching_regexps) const; |
80 | |
81 | // Returns the indices of all potentially matching regexps after first |
82 | // clearing potential_regexps. |
83 | // A regexp is potentially matching if it passes the filter. |
84 | // If a regexp passes the filter it may still not match. |
85 | // A regexp that does not pass the filter is guaranteed to not match. |
86 | void AllPotentials(const std::vector<int>& atoms, |
87 | std::vector<int>* potential_regexps) const; |
88 | |
89 | // The number of regexps added. |
90 | int NumRegexps() const { return static_cast<int>(re2_vec_.size()); } |
91 | |
92 | // Get the individual RE2 objects. |
93 | const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; } |
94 | |
95 | private: |
96 | // Print prefilter. |
97 | void PrintPrefilter(int regexpid); |
98 | |
99 | // Useful for testing and debugging. |
100 | void RegexpsGivenStrings(const std::vector<int>& matched_atoms, |
101 | std::vector<int>* passed_regexps); |
102 | |
103 | // All the regexps in the FilteredRE2. |
104 | std::vector<RE2*> re2_vec_; |
105 | |
106 | // Has the FilteredRE2 been compiled using Compile() |
107 | bool compiled_; |
108 | |
109 | // An AND-OR tree of string atoms used for filtering regexps. |
110 | std::unique_ptr<PrefilterTree> prefilter_tree_; |
111 | }; |
112 | |
113 | } // namespace re2 |
114 | |
115 | #endif // RE2_FILTERED_RE2_H_ |
116 | |