1// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#ifndef RE2_WALKER_INL_H_
6#define RE2_WALKER_INL_H_
7
8// Helper class for traversing Regexps without recursion.
9// Clients should declare their own subclasses that override
10// the PreVisit and PostVisit methods, which are called before
11// and after visiting the subexpressions.
12
13// Not quite the Visitor pattern, because (among other things)
14// the Visitor pattern is recursive.
15
16#include <stack>
17
18#include "absl/base/macros.h"
19#include "util/logging.h"
20#include "re2/regexp.h"
21
22namespace re2 {
23
24template<typename T> struct WalkState;
25
26template<typename T> class Regexp::Walker {
27 public:
28 Walker();
29 virtual ~Walker();
30
31 // Virtual method called before visiting re's children.
32 // PreVisit passes ownership of its return value to its caller.
33 // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
34 // and passed to the child PreVisits and PostVisits as parent_arg.
35 // At the top-most Regexp, parent_arg is arg passed to walk.
36 // If PreVisit sets *stop to true, the walk does not recurse
37 // into the children. Instead it behaves as though the return
38 // value from PreVisit is the return value from PostVisit.
39 // The default PreVisit returns parent_arg.
40 virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
41
42 // Virtual method called after visiting re's children.
43 // The pre_arg is the T that PreVisit returned.
44 // The child_args is a vector of the T that the child PostVisits returned.
45 // PostVisit takes ownership of pre_arg.
46 // PostVisit takes ownership of the Ts
47 // in *child_args, but not the vector itself.
48 // PostVisit passes ownership of its return value
49 // to its caller.
50 // The default PostVisit simply returns pre_arg.
51 virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
52 T* child_args, int nchild_args);
53
54 // Virtual method called to copy a T,
55 // when Walk notices that more than one child is the same re.
56 virtual T Copy(T arg);
57
58 // Virtual method called to do a "quick visit" of the re,
59 // but not its children. Only called once the visit budget
60 // has been used up and we're trying to abort the walk
61 // as quickly as possible. Should return a value that
62 // makes sense for the parent PostVisits still to be run.
63 // This function is (hopefully) only called by
64 // WalkExponential, but must be implemented by all clients,
65 // just in case.
66 virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
67
68 // Walks over a regular expression.
69 // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
70 // Returns the T returned by PostVisit on re.
71 T Walk(Regexp* re, T top_arg);
72
73 // Like Walk, but doesn't use Copy. This can lead to
74 // exponential runtimes on cross-linked Regexps like the
75 // ones generated by Simplify. To help limit this,
76 // at most max_visits nodes will be visited and then
77 // the walk will be cut off early.
78 // If the walk *is* cut off early, ShortVisit(re)
79 // will be called on regexps that cannot be fully
80 // visited rather than calling PreVisit/PostVisit.
81 T WalkExponential(Regexp* re, T top_arg, int max_visits);
82
83 // Clears the stack. Should never be necessary, since
84 // Walk always enters and exits with an empty stack.
85 // Logs DFATAL if stack is not already clear.
86 void Reset();
87
88 // Returns whether walk was cut off.
89 bool stopped_early() { return stopped_early_; }
90
91 private:
92 // Walk state for the entire traversal.
93 std::stack<WalkState<T>> stack_;
94 bool stopped_early_;
95 int max_visits_;
96
97 T WalkInternal(Regexp* re, T top_arg, bool use_copy);
98
99 Walker(const Walker&) = delete;
100 Walker& operator=(const Walker&) = delete;
101};
102
103template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
104 T parent_arg,
105 bool* stop) {
106 return parent_arg;
107}
108
109template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
110 T parent_arg,
111 T pre_arg,
112 T* child_args,
113 int nchild_args) {
114 return pre_arg;
115}
116
117template<typename T> T Regexp::Walker<T>::Copy(T arg) {
118 return arg;
119}
120
121// State about a single level in the traversal.
122template<typename T> struct WalkState {
123 WalkState(Regexp* re, T parent)
124 : re(re),
125 n(-1),
126 parent_arg(parent),
127 child_args(NULL) { }
128
129 Regexp* re; // The regexp
130 int n; // The index of the next child to process; -1 means need to PreVisit
131 T parent_arg; // Accumulated arguments.
132 T pre_arg;
133 T child_arg; // One-element buffer for child_args.
134 T* child_args;
135};
136
137template<typename T> Regexp::Walker<T>::Walker() {
138 stopped_early_ = false;
139}
140
141template<typename T> Regexp::Walker<T>::~Walker() {
142 Reset();
143}
144
145// Clears the stack. Should never be necessary, since
146// Walk always enters and exits with an empty stack.
147// Logs DFATAL if stack is not already clear.
148template<typename T> void Regexp::Walker<T>::Reset() {
149 if (!stack_.empty()) {
150 LOG(DFATAL) << "Stack not empty.";
151 while (!stack_.empty()) {
152 if (stack_.top().re->nsub_ > 1)
153 delete[] stack_.top().child_args;
154 stack_.pop();
155 }
156 }
157}
158
159template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
160 bool use_copy) {
161 Reset();
162
163 if (re == NULL) {
164 LOG(DFATAL) << "Walk NULL";
165 return top_arg;
166 }
167
168 stack_.push(WalkState<T>(re, top_arg));
169
170 WalkState<T>* s;
171 for (;;) {
172 T t;
173 s = &stack_.top();
174 re = s->re;
175 switch (s->n) {
176 case -1: {
177 if (--max_visits_ < 0) {
178 stopped_early_ = true;
179 t = ShortVisit(re, s->parent_arg);
180 break;
181 }
182 bool stop = false;
183 s->pre_arg = PreVisit(re, s->parent_arg, &stop);
184 if (stop) {
185 t = s->pre_arg;
186 break;
187 }
188 s->n = 0;
189 s->child_args = NULL;
190 if (re->nsub_ == 1)
191 s->child_args = &s->child_arg;
192 else if (re->nsub_ > 1)
193 s->child_args = new T[re->nsub_];
194 ABSL_FALLTHROUGH_INTENDED;
195 }
196 default: {
197 if (re->nsub_ > 0) {
198 Regexp** sub = re->sub();
199 if (s->n < re->nsub_) {
200 if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
201 s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
202 s->n++;
203 } else {
204 stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
205 }
206 continue;
207 }
208 }
209
210 t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
211 if (re->nsub_ > 1)
212 delete[] s->child_args;
213 break;
214 }
215 }
216
217 // We've finished stack_.top().
218 // Update next guy down.
219 stack_.pop();
220 if (stack_.empty())
221 return t;
222 s = &stack_.top();
223 if (s->child_args != NULL)
224 s->child_args[s->n] = t;
225 else
226 s->child_arg = t;
227 s->n++;
228 }
229}
230
231template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
232 // Without the exponential walking behavior,
233 // this budget should be more than enough for any
234 // regexp, and yet not enough to get us in trouble
235 // as far as CPU time.
236 max_visits_ = 1000000;
237 return WalkInternal(re, top_arg, true);
238}
239
240template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
241 int max_visits) {
242 max_visits_ = max_visits;
243 return WalkInternal(re, top_arg, false);
244}
245
246} // namespace re2
247
248#endif // RE2_WALKER_INL_H_
249