1 | // Copyright 2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | #include "re2/prefilter_tree.h" |
6 | |
7 | #include <stddef.h> |
8 | #include <algorithm> |
9 | #include <cmath> |
10 | #include <map> |
11 | #include <memory> |
12 | #include <string> |
13 | #include <utility> |
14 | #include <vector> |
15 | |
16 | #include "absl/strings/str_format.h" |
17 | #include "util/logging.h" |
18 | #include "re2/prefilter.h" |
19 | #include "re2/re2.h" |
20 | |
21 | namespace re2 { |
22 | |
23 | static const bool = false; |
24 | |
25 | PrefilterTree::PrefilterTree() |
26 | : compiled_(false), |
27 | min_atom_len_(3) { |
28 | } |
29 | |
30 | PrefilterTree::PrefilterTree(int min_atom_len) |
31 | : compiled_(false), |
32 | min_atom_len_(min_atom_len) { |
33 | } |
34 | |
35 | PrefilterTree::~PrefilterTree() { |
36 | for (size_t i = 0; i < prefilter_vec_.size(); i++) |
37 | delete prefilter_vec_[i]; |
38 | } |
39 | |
40 | void PrefilterTree::Add(Prefilter* prefilter) { |
41 | if (compiled_) { |
42 | LOG(DFATAL) << "Add called after Compile." ; |
43 | return; |
44 | } |
45 | if (prefilter != NULL && !KeepNode(prefilter)) { |
46 | delete prefilter; |
47 | prefilter = NULL; |
48 | } |
49 | |
50 | prefilter_vec_.push_back(prefilter); |
51 | } |
52 | |
53 | void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { |
54 | if (compiled_) { |
55 | LOG(DFATAL) << "Compile called already." ; |
56 | return; |
57 | } |
58 | |
59 | // Some legacy users of PrefilterTree call Compile() before |
60 | // adding any regexps and expect Compile() to have no effect. |
61 | if (prefilter_vec_.empty()) |
62 | return; |
63 | |
64 | compiled_ = true; |
65 | |
66 | NodeMap nodes; |
67 | AssignUniqueIds(&nodes, atom_vec); |
68 | if (ExtraDebug) |
69 | PrintDebugInfo(&nodes); |
70 | } |
71 | |
72 | Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { |
73 | std::string node_string = NodeString(node); |
74 | NodeMap::iterator iter = nodes->find(node_string); |
75 | if (iter == nodes->end()) |
76 | return NULL; |
77 | return (*iter).second; |
78 | } |
79 | |
80 | std::string PrefilterTree::NodeString(Prefilter* node) const { |
81 | // Adding the operation disambiguates AND/OR/atom nodes. |
82 | std::string s = absl::StrFormat("%d" , node->op()) + ":" ; |
83 | if (node->op() == Prefilter::ATOM) { |
84 | s += node->atom(); |
85 | } else { |
86 | for (size_t i = 0; i < node->subs()->size(); i++) { |
87 | if (i > 0) |
88 | s += ','; |
89 | s += absl::StrFormat("%d" , (*node->subs())[i]->unique_id()); |
90 | } |
91 | } |
92 | return s; |
93 | } |
94 | |
95 | bool PrefilterTree::KeepNode(Prefilter* node) const { |
96 | if (node == NULL) |
97 | return false; |
98 | |
99 | switch (node->op()) { |
100 | default: |
101 | LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op(); |
102 | return false; |
103 | |
104 | case Prefilter::ALL: |
105 | case Prefilter::NONE: |
106 | return false; |
107 | |
108 | case Prefilter::ATOM: |
109 | return node->atom().size() >= static_cast<size_t>(min_atom_len_); |
110 | |
111 | case Prefilter::AND: { |
112 | int j = 0; |
113 | std::vector<Prefilter*>* subs = node->subs(); |
114 | for (size_t i = 0; i < subs->size(); i++) |
115 | if (KeepNode((*subs)[i])) |
116 | (*subs)[j++] = (*subs)[i]; |
117 | else |
118 | delete (*subs)[i]; |
119 | |
120 | subs->resize(j); |
121 | return j > 0; |
122 | } |
123 | |
124 | case Prefilter::OR: |
125 | for (size_t i = 0; i < node->subs()->size(); i++) |
126 | if (!KeepNode((*node->subs())[i])) |
127 | return false; |
128 | return true; |
129 | } |
130 | } |
131 | |
132 | void PrefilterTree::AssignUniqueIds(NodeMap* nodes, |
133 | std::vector<std::string>* atom_vec) { |
134 | atom_vec->clear(); |
135 | |
136 | // Build vector of all filter nodes, sorted topologically |
137 | // from top to bottom in v. |
138 | std::vector<Prefilter*> v; |
139 | |
140 | // Add the top level nodes of each regexp prefilter. |
141 | for (size_t i = 0; i < prefilter_vec_.size(); i++) { |
142 | Prefilter* f = prefilter_vec_[i]; |
143 | if (f == NULL) |
144 | unfiltered_.push_back(static_cast<int>(i)); |
145 | |
146 | // We push NULL also on to v, so that we maintain the |
147 | // mapping of index==regexpid for level=0 prefilter nodes. |
148 | v.push_back(f); |
149 | } |
150 | |
151 | // Now add all the descendant nodes. |
152 | for (size_t i = 0; i < v.size(); i++) { |
153 | Prefilter* f = v[i]; |
154 | if (f == NULL) |
155 | continue; |
156 | if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { |
157 | const std::vector<Prefilter*>& subs = *f->subs(); |
158 | for (size_t j = 0; j < subs.size(); j++) |
159 | v.push_back(subs[j]); |
160 | } |
161 | } |
162 | |
163 | // Identify unique nodes. |
164 | int unique_id = 0; |
165 | for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
166 | Prefilter *node = v[i]; |
167 | if (node == NULL) |
168 | continue; |
169 | node->set_unique_id(-1); |
170 | Prefilter* canonical = CanonicalNode(nodes, node); |
171 | if (canonical == NULL) { |
172 | // Any further nodes that have the same node string |
173 | // will find this node as the canonical node. |
174 | nodes->emplace(NodeString(node), node); |
175 | if (node->op() == Prefilter::ATOM) { |
176 | atom_vec->push_back(node->atom()); |
177 | atom_index_to_id_.push_back(unique_id); |
178 | } |
179 | node->set_unique_id(unique_id++); |
180 | } else { |
181 | node->set_unique_id(canonical->unique_id()); |
182 | } |
183 | } |
184 | entries_.resize(unique_id); |
185 | |
186 | // Fill the entries. |
187 | for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
188 | Prefilter* prefilter = v[i]; |
189 | if (prefilter == NULL) |
190 | continue; |
191 | if (CanonicalNode(nodes, prefilter) != prefilter) |
192 | continue; |
193 | int id = prefilter->unique_id(); |
194 | switch (prefilter->op()) { |
195 | default: |
196 | LOG(DFATAL) << "Unexpected op: " << prefilter->op(); |
197 | return; |
198 | |
199 | case Prefilter::ATOM: |
200 | entries_[id].propagate_up_at_count = 1; |
201 | break; |
202 | |
203 | case Prefilter::OR: |
204 | case Prefilter::AND: { |
205 | // For each child, we append our id to the child's list of |
206 | // parent ids... unless we happen to have done so already. |
207 | // The number of appends is the number of unique children, |
208 | // which allows correct upward propagation from AND nodes. |
209 | int up_count = 0; |
210 | for (size_t j = 0; j < prefilter->subs()->size(); j++) { |
211 | int child_id = (*prefilter->subs())[j]->unique_id(); |
212 | std::vector<int>& parents = entries_[child_id].parents; |
213 | if (parents.empty() || parents.back() != id) { |
214 | parents.push_back(id); |
215 | up_count++; |
216 | } |
217 | } |
218 | entries_[id].propagate_up_at_count = |
219 | prefilter->op() == Prefilter::AND ? up_count : 1; |
220 | break; |
221 | } |
222 | } |
223 | } |
224 | |
225 | // For top level nodes, populate regexp id. |
226 | for (size_t i = 0; i < prefilter_vec_.size(); i++) { |
227 | if (prefilter_vec_[i] == NULL) |
228 | continue; |
229 | int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); |
230 | DCHECK_LE(0, id); |
231 | Entry* entry = &entries_[id]; |
232 | entry->regexps.push_back(static_cast<int>(i)); |
233 | } |
234 | |
235 | // Lastly, using probability-based heuristics, we identify nodes |
236 | // that trigger too many parents and then we try to prune edges. |
237 | // We use logarithms below to avoid the likelihood of underflow. |
238 | double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size()); |
239 | // Hoisted this above the loop so that we don't thrash the heap. |
240 | std::vector<std::pair<size_t, int>> entries_by_num_edges; |
241 | for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
242 | Prefilter* prefilter = v[i]; |
243 | // Pruning applies only to AND nodes because it "just" reduces |
244 | // precision; applied to OR nodes, it would break correctness. |
245 | if (prefilter == NULL || prefilter->op() != Prefilter::AND) |
246 | continue; |
247 | if (CanonicalNode(nodes, prefilter) != prefilter) |
248 | continue; |
249 | int id = prefilter->unique_id(); |
250 | |
251 | // Sort the current node's children by the numbers of parents. |
252 | entries_by_num_edges.clear(); |
253 | for (size_t j = 0; j < prefilter->subs()->size(); j++) { |
254 | int child_id = (*prefilter->subs())[j]->unique_id(); |
255 | const std::vector<int>& parents = entries_[child_id].parents; |
256 | entries_by_num_edges.emplace_back(parents.size(), child_id); |
257 | } |
258 | std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end()); |
259 | |
260 | // A running estimate of how many regexps will be triggered by |
261 | // pruning the remaining children's edges to the current node. |
262 | // Our nominal target is one, so the threshold is log(1) == 0; |
263 | // pruning occurs iff the child has more than nine edges left. |
264 | double log_num_triggered = log_num_regexps; |
265 | for (const auto& pair : entries_by_num_edges) { |
266 | int child_id = pair.second; |
267 | std::vector<int>& parents = entries_[child_id].parents; |
268 | if (log_num_triggered > 0.) { |
269 | log_num_triggered += std::log(parents.size()); |
270 | log_num_triggered -= log_num_regexps; |
271 | } else if (parents.size() > 9) { |
272 | auto it = std::find(parents.begin(), parents.end(), id); |
273 | if (it != parents.end()) { |
274 | parents.erase(it); |
275 | entries_[id].propagate_up_at_count--; |
276 | } |
277 | } |
278 | } |
279 | } |
280 | } |
281 | |
282 | // Functions for triggering during search. |
283 | void PrefilterTree::RegexpsGivenStrings( |
284 | const std::vector<int>& matched_atoms, |
285 | std::vector<int>* regexps) const { |
286 | regexps->clear(); |
287 | if (!compiled_) { |
288 | // Some legacy users of PrefilterTree call Compile() before |
289 | // adding any regexps and expect Compile() to have no effect. |
290 | // This kludge is a counterpart to that kludge. |
291 | if (prefilter_vec_.empty()) |
292 | return; |
293 | |
294 | LOG(ERROR) << "RegexpsGivenStrings called before Compile." ; |
295 | for (size_t i = 0; i < prefilter_vec_.size(); i++) |
296 | regexps->push_back(static_cast<int>(i)); |
297 | } else { |
298 | IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); |
299 | std::vector<int> matched_atom_ids; |
300 | for (size_t j = 0; j < matched_atoms.size(); j++) |
301 | matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); |
302 | PropagateMatch(matched_atom_ids, ®exps_map); |
303 | for (IntMap::iterator it = regexps_map.begin(); |
304 | it != regexps_map.end(); |
305 | ++it) |
306 | regexps->push_back(it->index()); |
307 | |
308 | regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); |
309 | } |
310 | std::sort(regexps->begin(), regexps->end()); |
311 | } |
312 | |
313 | void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids, |
314 | IntMap* regexps) const { |
315 | IntMap count(static_cast<int>(entries_.size())); |
316 | IntMap work(static_cast<int>(entries_.size())); |
317 | for (size_t i = 0; i < atom_ids.size(); i++) |
318 | work.set(atom_ids[i], 1); |
319 | for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { |
320 | const Entry& entry = entries_[it->index()]; |
321 | // Record regexps triggered. |
322 | for (size_t i = 0; i < entry.regexps.size(); i++) |
323 | regexps->set(entry.regexps[i], 1); |
324 | int c; |
325 | // Pass trigger up to parents. |
326 | for (int j : entry.parents) { |
327 | const Entry& parent = entries_[j]; |
328 | // Delay until all the children have succeeded. |
329 | if (parent.propagate_up_at_count > 1) { |
330 | if (count.has_index(j)) { |
331 | c = count.get_existing(j) + 1; |
332 | count.set_existing(j, c); |
333 | } else { |
334 | c = 1; |
335 | count.set_new(j, c); |
336 | } |
337 | if (c < parent.propagate_up_at_count) |
338 | continue; |
339 | } |
340 | // Trigger the parent. |
341 | work.set(j, 1); |
342 | } |
343 | } |
344 | } |
345 | |
346 | // Debugging help. |
347 | void PrefilterTree::PrintPrefilter(int regexpid) { |
348 | LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); |
349 | } |
350 | |
351 | void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { |
352 | LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); |
353 | LOG(ERROR) << "#Unique Nodes: " << entries_.size(); |
354 | |
355 | for (size_t i = 0; i < entries_.size(); i++) { |
356 | const std::vector<int>& parents = entries_[i].parents; |
357 | const std::vector<int>& regexps = entries_[i].regexps; |
358 | LOG(ERROR) << "EntryId: " << i |
359 | << " N: " << parents.size() << " R: " << regexps.size(); |
360 | for (int parent : parents) |
361 | LOG(ERROR) << parent; |
362 | } |
363 | LOG(ERROR) << "Map:" ; |
364 | for (NodeMap::const_iterator iter = nodes->begin(); |
365 | iter != nodes->end(); ++iter) |
366 | LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() |
367 | << " Str: " << (*iter).first; |
368 | } |
369 | |
370 | std::string PrefilterTree::DebugNodeString(Prefilter* node) const { |
371 | std::string node_string = "" ; |
372 | if (node->op() == Prefilter::ATOM) { |
373 | DCHECK(!node->atom().empty()); |
374 | node_string += node->atom(); |
375 | } else { |
376 | // Adding the operation disambiguates AND and OR nodes. |
377 | node_string += node->op() == Prefilter::AND ? "AND" : "OR" ; |
378 | node_string += "(" ; |
379 | for (size_t i = 0; i < node->subs()->size(); i++) { |
380 | if (i > 0) |
381 | node_string += ','; |
382 | node_string += absl::StrFormat("%d" , (*node->subs())[i]->unique_id()); |
383 | node_string += ":" ; |
384 | node_string += DebugNodeString((*node->subs())[i]); |
385 | } |
386 | node_string += ")" ; |
387 | } |
388 | return node_string; |
389 | } |
390 | |
391 | } // namespace re2 |
392 | |