1// Copyright 2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "re2/prefilter_tree.h"
6
7#include <stddef.h>
8#include <algorithm>
9#include <cmath>
10#include <map>
11#include <memory>
12#include <string>
13#include <utility>
14#include <vector>
15
16#include "absl/strings/str_format.h"
17#include "util/logging.h"
18#include "re2/prefilter.h"
19#include "re2/re2.h"
20
21namespace re2 {
22
23static const bool ExtraDebug = false;
24
25PrefilterTree::PrefilterTree()
26 : compiled_(false),
27 min_atom_len_(3) {
28}
29
30PrefilterTree::PrefilterTree(int min_atom_len)
31 : compiled_(false),
32 min_atom_len_(min_atom_len) {
33}
34
35PrefilterTree::~PrefilterTree() {
36 for (size_t i = 0; i < prefilter_vec_.size(); i++)
37 delete prefilter_vec_[i];
38}
39
40void PrefilterTree::Add(Prefilter* prefilter) {
41 if (compiled_) {
42 LOG(DFATAL) << "Add called after Compile.";
43 return;
44 }
45 if (prefilter != NULL && !KeepNode(prefilter)) {
46 delete prefilter;
47 prefilter = NULL;
48 }
49
50 prefilter_vec_.push_back(prefilter);
51}
52
53void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
54 if (compiled_) {
55 LOG(DFATAL) << "Compile called already.";
56 return;
57 }
58
59 // Some legacy users of PrefilterTree call Compile() before
60 // adding any regexps and expect Compile() to have no effect.
61 if (prefilter_vec_.empty())
62 return;
63
64 compiled_ = true;
65
66 NodeMap nodes;
67 AssignUniqueIds(&nodes, atom_vec);
68 if (ExtraDebug)
69 PrintDebugInfo(&nodes);
70}
71
72Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
73 std::string node_string = NodeString(node);
74 NodeMap::iterator iter = nodes->find(node_string);
75 if (iter == nodes->end())
76 return NULL;
77 return (*iter).second;
78}
79
80std::string PrefilterTree::NodeString(Prefilter* node) const {
81 // Adding the operation disambiguates AND/OR/atom nodes.
82 std::string s = absl::StrFormat("%d", node->op()) + ":";
83 if (node->op() == Prefilter::ATOM) {
84 s += node->atom();
85 } else {
86 for (size_t i = 0; i < node->subs()->size(); i++) {
87 if (i > 0)
88 s += ',';
89 s += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
90 }
91 }
92 return s;
93}
94
95bool PrefilterTree::KeepNode(Prefilter* node) const {
96 if (node == NULL)
97 return false;
98
99 switch (node->op()) {
100 default:
101 LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
102 return false;
103
104 case Prefilter::ALL:
105 case Prefilter::NONE:
106 return false;
107
108 case Prefilter::ATOM:
109 return node->atom().size() >= static_cast<size_t>(min_atom_len_);
110
111 case Prefilter::AND: {
112 int j = 0;
113 std::vector<Prefilter*>* subs = node->subs();
114 for (size_t i = 0; i < subs->size(); i++)
115 if (KeepNode((*subs)[i]))
116 (*subs)[j++] = (*subs)[i];
117 else
118 delete (*subs)[i];
119
120 subs->resize(j);
121 return j > 0;
122 }
123
124 case Prefilter::OR:
125 for (size_t i = 0; i < node->subs()->size(); i++)
126 if (!KeepNode((*node->subs())[i]))
127 return false;
128 return true;
129 }
130}
131
132void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
133 std::vector<std::string>* atom_vec) {
134 atom_vec->clear();
135
136 // Build vector of all filter nodes, sorted topologically
137 // from top to bottom in v.
138 std::vector<Prefilter*> v;
139
140 // Add the top level nodes of each regexp prefilter.
141 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
142 Prefilter* f = prefilter_vec_[i];
143 if (f == NULL)
144 unfiltered_.push_back(static_cast<int>(i));
145
146 // We push NULL also on to v, so that we maintain the
147 // mapping of index==regexpid for level=0 prefilter nodes.
148 v.push_back(f);
149 }
150
151 // Now add all the descendant nodes.
152 for (size_t i = 0; i < v.size(); i++) {
153 Prefilter* f = v[i];
154 if (f == NULL)
155 continue;
156 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
157 const std::vector<Prefilter*>& subs = *f->subs();
158 for (size_t j = 0; j < subs.size(); j++)
159 v.push_back(subs[j]);
160 }
161 }
162
163 // Identify unique nodes.
164 int unique_id = 0;
165 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
166 Prefilter *node = v[i];
167 if (node == NULL)
168 continue;
169 node->set_unique_id(-1);
170 Prefilter* canonical = CanonicalNode(nodes, node);
171 if (canonical == NULL) {
172 // Any further nodes that have the same node string
173 // will find this node as the canonical node.
174 nodes->emplace(NodeString(node), node);
175 if (node->op() == Prefilter::ATOM) {
176 atom_vec->push_back(node->atom());
177 atom_index_to_id_.push_back(unique_id);
178 }
179 node->set_unique_id(unique_id++);
180 } else {
181 node->set_unique_id(canonical->unique_id());
182 }
183 }
184 entries_.resize(unique_id);
185
186 // Fill the entries.
187 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
188 Prefilter* prefilter = v[i];
189 if (prefilter == NULL)
190 continue;
191 if (CanonicalNode(nodes, prefilter) != prefilter)
192 continue;
193 int id = prefilter->unique_id();
194 switch (prefilter->op()) {
195 default:
196 LOG(DFATAL) << "Unexpected op: " << prefilter->op();
197 return;
198
199 case Prefilter::ATOM:
200 entries_[id].propagate_up_at_count = 1;
201 break;
202
203 case Prefilter::OR:
204 case Prefilter::AND: {
205 // For each child, we append our id to the child's list of
206 // parent ids... unless we happen to have done so already.
207 // The number of appends is the number of unique children,
208 // which allows correct upward propagation from AND nodes.
209 int up_count = 0;
210 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
211 int child_id = (*prefilter->subs())[j]->unique_id();
212 std::vector<int>& parents = entries_[child_id].parents;
213 if (parents.empty() || parents.back() != id) {
214 parents.push_back(id);
215 up_count++;
216 }
217 }
218 entries_[id].propagate_up_at_count =
219 prefilter->op() == Prefilter::AND ? up_count : 1;
220 break;
221 }
222 }
223 }
224
225 // For top level nodes, populate regexp id.
226 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
227 if (prefilter_vec_[i] == NULL)
228 continue;
229 int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
230 DCHECK_LE(0, id);
231 Entry* entry = &entries_[id];
232 entry->regexps.push_back(static_cast<int>(i));
233 }
234
235 // Lastly, using probability-based heuristics, we identify nodes
236 // that trigger too many parents and then we try to prune edges.
237 // We use logarithms below to avoid the likelihood of underflow.
238 double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
239 // Hoisted this above the loop so that we don't thrash the heap.
240 std::vector<std::pair<size_t, int>> entries_by_num_edges;
241 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
242 Prefilter* prefilter = v[i];
243 // Pruning applies only to AND nodes because it "just" reduces
244 // precision; applied to OR nodes, it would break correctness.
245 if (prefilter == NULL || prefilter->op() != Prefilter::AND)
246 continue;
247 if (CanonicalNode(nodes, prefilter) != prefilter)
248 continue;
249 int id = prefilter->unique_id();
250
251 // Sort the current node's children by the numbers of parents.
252 entries_by_num_edges.clear();
253 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
254 int child_id = (*prefilter->subs())[j]->unique_id();
255 const std::vector<int>& parents = entries_[child_id].parents;
256 entries_by_num_edges.emplace_back(parents.size(), child_id);
257 }
258 std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
259
260 // A running estimate of how many regexps will be triggered by
261 // pruning the remaining children's edges to the current node.
262 // Our nominal target is one, so the threshold is log(1) == 0;
263 // pruning occurs iff the child has more than nine edges left.
264 double log_num_triggered = log_num_regexps;
265 for (const auto& pair : entries_by_num_edges) {
266 int child_id = pair.second;
267 std::vector<int>& parents = entries_[child_id].parents;
268 if (log_num_triggered > 0.) {
269 log_num_triggered += std::log(parents.size());
270 log_num_triggered -= log_num_regexps;
271 } else if (parents.size() > 9) {
272 auto it = std::find(parents.begin(), parents.end(), id);
273 if (it != parents.end()) {
274 parents.erase(it);
275 entries_[id].propagate_up_at_count--;
276 }
277 }
278 }
279 }
280}
281
282// Functions for triggering during search.
283void PrefilterTree::RegexpsGivenStrings(
284 const std::vector<int>& matched_atoms,
285 std::vector<int>* regexps) const {
286 regexps->clear();
287 if (!compiled_) {
288 // Some legacy users of PrefilterTree call Compile() before
289 // adding any regexps and expect Compile() to have no effect.
290 // This kludge is a counterpart to that kludge.
291 if (prefilter_vec_.empty())
292 return;
293
294 LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
295 for (size_t i = 0; i < prefilter_vec_.size(); i++)
296 regexps->push_back(static_cast<int>(i));
297 } else {
298 IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
299 std::vector<int> matched_atom_ids;
300 for (size_t j = 0; j < matched_atoms.size(); j++)
301 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
302 PropagateMatch(matched_atom_ids, &regexps_map);
303 for (IntMap::iterator it = regexps_map.begin();
304 it != regexps_map.end();
305 ++it)
306 regexps->push_back(it->index());
307
308 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
309 }
310 std::sort(regexps->begin(), regexps->end());
311}
312
313void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
314 IntMap* regexps) const {
315 IntMap count(static_cast<int>(entries_.size()));
316 IntMap work(static_cast<int>(entries_.size()));
317 for (size_t i = 0; i < atom_ids.size(); i++)
318 work.set(atom_ids[i], 1);
319 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
320 const Entry& entry = entries_[it->index()];
321 // Record regexps triggered.
322 for (size_t i = 0; i < entry.regexps.size(); i++)
323 regexps->set(entry.regexps[i], 1);
324 int c;
325 // Pass trigger up to parents.
326 for (int j : entry.parents) {
327 const Entry& parent = entries_[j];
328 // Delay until all the children have succeeded.
329 if (parent.propagate_up_at_count > 1) {
330 if (count.has_index(j)) {
331 c = count.get_existing(j) + 1;
332 count.set_existing(j, c);
333 } else {
334 c = 1;
335 count.set_new(j, c);
336 }
337 if (c < parent.propagate_up_at_count)
338 continue;
339 }
340 // Trigger the parent.
341 work.set(j, 1);
342 }
343 }
344}
345
346// Debugging help.
347void PrefilterTree::PrintPrefilter(int regexpid) {
348 LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
349}
350
351void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
352 LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
353 LOG(ERROR) << "#Unique Nodes: " << entries_.size();
354
355 for (size_t i = 0; i < entries_.size(); i++) {
356 const std::vector<int>& parents = entries_[i].parents;
357 const std::vector<int>& regexps = entries_[i].regexps;
358 LOG(ERROR) << "EntryId: " << i
359 << " N: " << parents.size() << " R: " << regexps.size();
360 for (int parent : parents)
361 LOG(ERROR) << parent;
362 }
363 LOG(ERROR) << "Map:";
364 for (NodeMap::const_iterator iter = nodes->begin();
365 iter != nodes->end(); ++iter)
366 LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
367 << " Str: " << (*iter).first;
368}
369
370std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
371 std::string node_string = "";
372 if (node->op() == Prefilter::ATOM) {
373 DCHECK(!node->atom().empty());
374 node_string += node->atom();
375 } else {
376 // Adding the operation disambiguates AND and OR nodes.
377 node_string += node->op() == Prefilter::AND ? "AND" : "OR";
378 node_string += "(";
379 for (size_t i = 0; i < node->subs()->size(); i++) {
380 if (i > 0)
381 node_string += ',';
382 node_string += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
383 node_string += ":";
384 node_string += DebugNodeString((*node->subs())[i]);
385 }
386 node_string += ")";
387 }
388 return node_string;
389}
390
391} // namespace re2
392