1 | // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Regular expression interface RE2. |
6 | // |
7 | // Originally the PCRE C++ wrapper, but adapted to use |
8 | // the new automata-based regular expression engines. |
9 | |
10 | #include "re2/re2.h" |
11 | |
12 | #include <assert.h> |
13 | #include <ctype.h> |
14 | #include <errno.h> |
15 | #ifdef _MSC_VER |
16 | #include <intrin.h> |
17 | #endif |
18 | #include <stdint.h> |
19 | #include <stdlib.h> |
20 | #include <string.h> |
21 | #include <algorithm> |
22 | #include <atomic> |
23 | #include <iterator> |
24 | #include <string> |
25 | #include <utility> |
26 | #include <vector> |
27 | |
28 | #include "absl/base/macros.h" |
29 | #include "absl/container/fixed_array.h" |
30 | #include "absl/strings/str_format.h" |
31 | #include "util/logging.h" |
32 | #include "util/strutil.h" |
33 | #include "util/utf.h" |
34 | #include "re2/prog.h" |
35 | #include "re2/regexp.h" |
36 | #include "re2/sparse_array.h" |
37 | |
38 | namespace re2 { |
39 | |
40 | // Maximum number of args we can set |
41 | static const int kMaxArgs = 16; |
42 | static const int kVecSize = 1+kMaxArgs; |
43 | |
44 | const int RE2::Options::kDefaultMaxMem; // initialized in re2.h |
45 | |
46 | RE2::Options::Options(RE2::CannedOptions opt) |
47 | : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
48 | posix_syntax_(opt == RE2::POSIX), |
49 | longest_match_(opt == RE2::POSIX), |
50 | log_errors_(opt != RE2::Quiet), |
51 | max_mem_(kDefaultMaxMem), |
52 | literal_(false), |
53 | never_nl_(false), |
54 | dot_nl_(false), |
55 | never_capture_(false), |
56 | case_sensitive_(true), |
57 | perl_classes_(false), |
58 | word_boundary_(false), |
59 | one_line_(false) { |
60 | } |
61 | |
62 | // static empty objects for use as const references. |
63 | // To avoid global constructors, allocated in RE2::Init(). |
64 | static const std::string* empty_string; |
65 | static const std::map<std::string, int>* empty_named_groups; |
66 | static const std::map<int, std::string>* empty_group_names; |
67 | |
68 | // Converts from Regexp error code to RE2 error code. |
69 | // Maybe some day they will diverge. In any event, this |
70 | // hides the existence of Regexp from RE2 users. |
71 | static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { |
72 | switch (code) { |
73 | case re2::kRegexpSuccess: |
74 | return RE2::NoError; |
75 | case re2::kRegexpInternalError: |
76 | return RE2::ErrorInternal; |
77 | case re2::kRegexpBadEscape: |
78 | return RE2::ErrorBadEscape; |
79 | case re2::kRegexpBadCharClass: |
80 | return RE2::ErrorBadCharClass; |
81 | case re2::kRegexpBadCharRange: |
82 | return RE2::ErrorBadCharRange; |
83 | case re2::kRegexpMissingBracket: |
84 | return RE2::ErrorMissingBracket; |
85 | case re2::kRegexpMissingParen: |
86 | return RE2::ErrorMissingParen; |
87 | case re2::kRegexpUnexpectedParen: |
88 | return RE2::ErrorUnexpectedParen; |
89 | case re2::kRegexpTrailingBackslash: |
90 | return RE2::ErrorTrailingBackslash; |
91 | case re2::kRegexpRepeatArgument: |
92 | return RE2::ErrorRepeatArgument; |
93 | case re2::kRegexpRepeatSize: |
94 | return RE2::ErrorRepeatSize; |
95 | case re2::kRegexpRepeatOp: |
96 | return RE2::ErrorRepeatOp; |
97 | case re2::kRegexpBadPerlOp: |
98 | return RE2::ErrorBadPerlOp; |
99 | case re2::kRegexpBadUTF8: |
100 | return RE2::ErrorBadUTF8; |
101 | case re2::kRegexpBadNamedCapture: |
102 | return RE2::ErrorBadNamedCapture; |
103 | } |
104 | return RE2::ErrorInternal; |
105 | } |
106 | |
107 | static std::string trunc(absl::string_view pattern) { |
108 | if (pattern.size() < 100) |
109 | return std::string(pattern); |
110 | return std::string(pattern.substr(0, 100)) + "..." ; |
111 | } |
112 | |
113 | |
114 | RE2::RE2(const char* pattern) { |
115 | Init(pattern, DefaultOptions); |
116 | } |
117 | |
118 | RE2::RE2(const std::string& pattern) { |
119 | Init(pattern, DefaultOptions); |
120 | } |
121 | |
122 | RE2::RE2(absl::string_view pattern) { |
123 | Init(pattern, DefaultOptions); |
124 | } |
125 | |
126 | RE2::RE2(absl::string_view pattern, const Options& options) { |
127 | Init(pattern, options); |
128 | } |
129 | |
130 | int RE2::Options::ParseFlags() const { |
131 | int flags = Regexp::ClassNL; |
132 | switch (encoding()) { |
133 | default: |
134 | if (log_errors()) |
135 | LOG(ERROR) << "Unknown encoding " << encoding(); |
136 | break; |
137 | case RE2::Options::EncodingUTF8: |
138 | break; |
139 | case RE2::Options::EncodingLatin1: |
140 | flags |= Regexp::Latin1; |
141 | break; |
142 | } |
143 | |
144 | if (!posix_syntax()) |
145 | flags |= Regexp::LikePerl; |
146 | |
147 | if (literal()) |
148 | flags |= Regexp::Literal; |
149 | |
150 | if (never_nl()) |
151 | flags |= Regexp::NeverNL; |
152 | |
153 | if (dot_nl()) |
154 | flags |= Regexp::DotNL; |
155 | |
156 | if (never_capture()) |
157 | flags |= Regexp::NeverCapture; |
158 | |
159 | if (!case_sensitive()) |
160 | flags |= Regexp::FoldCase; |
161 | |
162 | if (perl_classes()) |
163 | flags |= Regexp::PerlClasses; |
164 | |
165 | if (word_boundary()) |
166 | flags |= Regexp::PerlB; |
167 | |
168 | if (one_line()) |
169 | flags |= Regexp::OneLine; |
170 | |
171 | return flags; |
172 | } |
173 | |
174 | void RE2::Init(absl::string_view pattern, const Options& options) { |
175 | static absl::once_flag empty_once; |
176 | absl::call_once(empty_once, []() { |
177 | empty_string = new std::string; |
178 | empty_named_groups = new std::map<std::string, int>; |
179 | empty_group_names = new std::map<int, std::string>; |
180 | }); |
181 | |
182 | pattern_.assign(pattern.data(), pattern.size()); |
183 | options_.Copy(options); |
184 | entire_regexp_ = NULL; |
185 | error_ = empty_string; |
186 | error_code_ = NoError; |
187 | error_arg_.clear(); |
188 | prefix_.clear(); |
189 | prefix_foldcase_ = false; |
190 | suffix_regexp_ = NULL; |
191 | prog_ = NULL; |
192 | num_captures_ = -1; |
193 | is_one_pass_ = false; |
194 | |
195 | rprog_ = NULL; |
196 | named_groups_ = NULL; |
197 | group_names_ = NULL; |
198 | |
199 | RegexpStatus status; |
200 | entire_regexp_ = Regexp::Parse( |
201 | pattern_, |
202 | static_cast<Regexp::ParseFlags>(options_.ParseFlags()), |
203 | &status); |
204 | if (entire_regexp_ == NULL) { |
205 | if (options_.log_errors()) { |
206 | LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " |
207 | << status.Text(); |
208 | } |
209 | error_ = new std::string(status.Text()); |
210 | error_code_ = RegexpErrorToRE2(status.code()); |
211 | error_arg_ = std::string(status.error_arg()); |
212 | return; |
213 | } |
214 | |
215 | re2::Regexp* suffix; |
216 | if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) |
217 | suffix_regexp_ = suffix; |
218 | else |
219 | suffix_regexp_ = entire_regexp_->Incref(); |
220 | |
221 | // Two thirds of the memory goes to the forward Prog, |
222 | // one third to the reverse prog, because the forward |
223 | // Prog has two DFAs but the reverse prog has one. |
224 | prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); |
225 | if (prog_ == NULL) { |
226 | if (options_.log_errors()) |
227 | LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'" ; |
228 | error_ = new std::string("pattern too large - compile failed" ); |
229 | error_code_ = RE2::ErrorPatternTooLarge; |
230 | return; |
231 | } |
232 | |
233 | // We used to compute this lazily, but it's used during the |
234 | // typical control flow for a match call, so we now compute |
235 | // it eagerly, which avoids the overhead of absl::once_flag. |
236 | num_captures_ = suffix_regexp_->NumCaptures(); |
237 | |
238 | // Could delay this until the first match call that |
239 | // cares about submatch information, but the one-pass |
240 | // machine's memory gets cut from the DFA memory budget, |
241 | // and that is harder to do if the DFA has already |
242 | // been built. |
243 | is_one_pass_ = prog_->IsOnePass(); |
244 | } |
245 | |
246 | // Returns rprog_, computing it if needed. |
247 | re2::Prog* RE2::ReverseProg() const { |
248 | absl::call_once(rprog_once_, [](const RE2* re) { |
249 | re->rprog_ = |
250 | re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); |
251 | if (re->rprog_ == NULL) { |
252 | if (re->options_.log_errors()) |
253 | LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'" ; |
254 | // We no longer touch error_ and error_code_ because failing to compile |
255 | // the reverse Prog is not a showstopper: falling back to NFA execution |
256 | // is fine. More importantly, an RE2 object is supposed to be logically |
257 | // immutable: whatever ok() would have returned after Init() completed, |
258 | // it should continue to return that no matter what ReverseProg() does. |
259 | } |
260 | }, this); |
261 | return rprog_; |
262 | } |
263 | |
264 | RE2::~RE2() { |
265 | if (suffix_regexp_) |
266 | suffix_regexp_->Decref(); |
267 | if (entire_regexp_) |
268 | entire_regexp_->Decref(); |
269 | delete prog_; |
270 | delete rprog_; |
271 | if (error_ != empty_string) |
272 | delete error_; |
273 | if (named_groups_ != NULL && named_groups_ != empty_named_groups) |
274 | delete named_groups_; |
275 | if (group_names_ != NULL && group_names_ != empty_group_names) |
276 | delete group_names_; |
277 | } |
278 | |
279 | int RE2::ProgramSize() const { |
280 | if (prog_ == NULL) |
281 | return -1; |
282 | return prog_->size(); |
283 | } |
284 | |
285 | int RE2::ReverseProgramSize() const { |
286 | if (prog_ == NULL) |
287 | return -1; |
288 | Prog* prog = ReverseProg(); |
289 | if (prog == NULL) |
290 | return -1; |
291 | return prog->size(); |
292 | } |
293 | |
294 | // Finds the most significant non-zero bit in n. |
295 | static int FindMSBSet(uint32_t n) { |
296 | DCHECK_NE(n, 0); |
297 | #if defined(__GNUC__) |
298 | return 31 ^ __builtin_clz(n); |
299 | #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) |
300 | unsigned long c; |
301 | _BitScanReverse(&c, n); |
302 | return static_cast<int>(c); |
303 | #else |
304 | int c = 0; |
305 | for (int shift = 1 << 4; shift != 0; shift >>= 1) { |
306 | uint32_t word = n >> shift; |
307 | if (word != 0) { |
308 | n = word; |
309 | c += shift; |
310 | } |
311 | } |
312 | return c; |
313 | #endif |
314 | } |
315 | |
316 | static int Fanout(Prog* prog, std::vector<int>* histogram) { |
317 | SparseArray<int> fanout(prog->size()); |
318 | prog->Fanout(&fanout); |
319 | int data[32] = {}; |
320 | int size = 0; |
321 | for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
322 | if (i->value() == 0) |
323 | continue; |
324 | uint32_t value = i->value(); |
325 | int bucket = FindMSBSet(value); |
326 | bucket += value & (value-1) ? 1 : 0; |
327 | ++data[bucket]; |
328 | size = std::max(size, bucket+1); |
329 | } |
330 | if (histogram != NULL) |
331 | histogram->assign(data, data+size); |
332 | return size-1; |
333 | } |
334 | |
335 | int RE2::ProgramFanout(std::vector<int>* histogram) const { |
336 | if (prog_ == NULL) |
337 | return -1; |
338 | return Fanout(prog_, histogram); |
339 | } |
340 | |
341 | int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { |
342 | if (prog_ == NULL) |
343 | return -1; |
344 | Prog* prog = ReverseProg(); |
345 | if (prog == NULL) |
346 | return -1; |
347 | return Fanout(prog, histogram); |
348 | } |
349 | |
350 | // Returns named_groups_, computing it if needed. |
351 | const std::map<std::string, int>& RE2::NamedCapturingGroups() const { |
352 | absl::call_once(named_groups_once_, [](const RE2* re) { |
353 | if (re->suffix_regexp_ != NULL) |
354 | re->named_groups_ = re->suffix_regexp_->NamedCaptures(); |
355 | if (re->named_groups_ == NULL) |
356 | re->named_groups_ = empty_named_groups; |
357 | }, this); |
358 | return *named_groups_; |
359 | } |
360 | |
361 | // Returns group_names_, computing it if needed. |
362 | const std::map<int, std::string>& RE2::CapturingGroupNames() const { |
363 | absl::call_once(group_names_once_, [](const RE2* re) { |
364 | if (re->suffix_regexp_ != NULL) |
365 | re->group_names_ = re->suffix_regexp_->CaptureNames(); |
366 | if (re->group_names_ == NULL) |
367 | re->group_names_ = empty_group_names; |
368 | }, this); |
369 | return *group_names_; |
370 | } |
371 | |
372 | /***** Convenience interfaces *****/ |
373 | |
374 | bool RE2::FullMatchN(absl::string_view text, const RE2& re, |
375 | const Arg* const args[], int n) { |
376 | return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); |
377 | } |
378 | |
379 | bool RE2::PartialMatchN(absl::string_view text, const RE2& re, |
380 | const Arg* const args[], int n) { |
381 | return re.DoMatch(text, UNANCHORED, NULL, args, n); |
382 | } |
383 | |
384 | bool RE2::ConsumeN(absl::string_view* input, const RE2& re, |
385 | const Arg* const args[], int n) { |
386 | size_t consumed; |
387 | if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { |
388 | input->remove_prefix(consumed); |
389 | return true; |
390 | } else { |
391 | return false; |
392 | } |
393 | } |
394 | |
395 | bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re, |
396 | const Arg* const args[], int n) { |
397 | size_t consumed; |
398 | if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { |
399 | input->remove_prefix(consumed); |
400 | return true; |
401 | } else { |
402 | return false; |
403 | } |
404 | } |
405 | |
406 | bool RE2::Replace(std::string* str, |
407 | const RE2& re, |
408 | absl::string_view rewrite) { |
409 | absl::string_view vec[kVecSize]; |
410 | int nvec = 1 + MaxSubmatch(rewrite); |
411 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
412 | return false; |
413 | if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) |
414 | return false; |
415 | if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) |
416 | return false; |
417 | |
418 | std::string s; |
419 | if (!re.Rewrite(&s, rewrite, vec, nvec)) |
420 | return false; |
421 | |
422 | assert(vec[0].data() >= str->data()); |
423 | assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); |
424 | str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
425 | return true; |
426 | } |
427 | |
428 | int RE2::GlobalReplace(std::string* str, |
429 | const RE2& re, |
430 | absl::string_view rewrite) { |
431 | absl::string_view vec[kVecSize]; |
432 | int nvec = 1 + MaxSubmatch(rewrite); |
433 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
434 | return false; |
435 | if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) |
436 | return false; |
437 | |
438 | const char* p = str->data(); |
439 | const char* ep = p + str->size(); |
440 | const char* lastend = NULL; |
441 | std::string out; |
442 | int count = 0; |
443 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
444 | // Iterate just once when fuzzing. Otherwise, we easily get bogged down |
445 | // and coverage is unlikely to improve despite significant expense. |
446 | while (p == str->data()) { |
447 | #else |
448 | while (p <= ep) { |
449 | #endif |
450 | if (!re.Match(*str, static_cast<size_t>(p - str->data()), |
451 | str->size(), UNANCHORED, vec, nvec)) |
452 | break; |
453 | if (p < vec[0].data()) |
454 | out.append(p, vec[0].data() - p); |
455 | if (vec[0].data() == lastend && vec[0].empty()) { |
456 | // Disallow empty match at end of last match: skip ahead. |
457 | // |
458 | // fullrune() takes int, not ptrdiff_t. However, it just looks |
459 | // at the leading byte and treats any length >= 4 the same. |
460 | if (re.options().encoding() == RE2::Options::EncodingUTF8 && |
461 | fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { |
462 | // re is in UTF-8 mode and there is enough left of str |
463 | // to allow us to advance by up to UTFmax bytes. |
464 | Rune r; |
465 | int n = chartorune(&r, p); |
466 | // Some copies of chartorune have a bug that accepts |
467 | // encodings of values in (10FFFF, 1FFFFF] as valid. |
468 | if (r > Runemax) { |
469 | n = 1; |
470 | r = Runeerror; |
471 | } |
472 | if (!(n == 1 && r == Runeerror)) { // no decoding error |
473 | out.append(p, n); |
474 | p += n; |
475 | continue; |
476 | } |
477 | } |
478 | // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, |
479 | // we fell through from above and the GIGO principle applies. |
480 | if (p < ep) |
481 | out.append(p, 1); |
482 | p++; |
483 | continue; |
484 | } |
485 | re.Rewrite(&out, rewrite, vec, nvec); |
486 | p = vec[0].data() + vec[0].size(); |
487 | lastend = p; |
488 | count++; |
489 | } |
490 | |
491 | if (count == 0) |
492 | return 0; |
493 | |
494 | if (p < ep) |
495 | out.append(p, ep - p); |
496 | using std::swap; |
497 | swap(out, *str); |
498 | return count; |
499 | } |
500 | |
501 | bool RE2::(absl::string_view text, |
502 | const RE2& re, |
503 | absl::string_view rewrite, |
504 | std::string* out) { |
505 | absl::string_view vec[kVecSize]; |
506 | int nvec = 1 + MaxSubmatch(rewrite); |
507 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
508 | return false; |
509 | if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) |
510 | return false; |
511 | if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) |
512 | return false; |
513 | |
514 | out->clear(); |
515 | return re.Rewrite(out, rewrite, vec, nvec); |
516 | } |
517 | |
518 | std::string RE2::QuoteMeta(absl::string_view unquoted) { |
519 | std::string result; |
520 | result.reserve(unquoted.size() << 1); |
521 | |
522 | // Escape any ascii character not in [A-Za-z_0-9]. |
523 | // |
524 | // Note that it's legal to escape a character even if it has no |
525 | // special meaning in a regular expression -- so this function does |
526 | // that. (This also makes it identical to the perl function of the |
527 | // same name except for the null-character special case; |
528 | // see `perldoc -f quotemeta`.) |
529 | for (size_t ii = 0; ii < unquoted.size(); ++ii) { |
530 | // Note that using 'isalnum' here raises the benchmark time from |
531 | // 32ns to 58ns: |
532 | if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
533 | (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
534 | (unquoted[ii] < '0' || unquoted[ii] > '9') && |
535 | unquoted[ii] != '_' && |
536 | // If this is the part of a UTF8 or Latin1 character, we need |
537 | // to copy this byte without escaping. Experimentally this is |
538 | // what works correctly with the regexp library. |
539 | !(unquoted[ii] & 128)) { |
540 | if (unquoted[ii] == '\0') { // Special handling for null chars. |
541 | // Note that this special handling is not strictly required for RE2, |
542 | // but this quoting is required for other regexp libraries such as |
543 | // PCRE. |
544 | // Can't use "\\0" since the next character might be a digit. |
545 | result += "\\x00" ; |
546 | continue; |
547 | } |
548 | result += '\\'; |
549 | } |
550 | result += unquoted[ii]; |
551 | } |
552 | |
553 | return result; |
554 | } |
555 | |
556 | bool RE2::PossibleMatchRange(std::string* min, std::string* max, |
557 | int maxlen) const { |
558 | if (prog_ == NULL) |
559 | return false; |
560 | |
561 | int n = static_cast<int>(prefix_.size()); |
562 | if (n > maxlen) |
563 | n = maxlen; |
564 | |
565 | // Determine initial min max from prefix_ literal. |
566 | *min = prefix_.substr(0, n); |
567 | *max = prefix_.substr(0, n); |
568 | if (prefix_foldcase_) { |
569 | // prefix is ASCII lowercase; change *min to uppercase. |
570 | for (int i = 0; i < n; i++) { |
571 | char& c = (*min)[i]; |
572 | if ('a' <= c && c <= 'z') |
573 | c += 'A' - 'a'; |
574 | } |
575 | } |
576 | |
577 | // Add to prefix min max using PossibleMatchRange on regexp. |
578 | std::string dmin, dmax; |
579 | maxlen -= n; |
580 | if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { |
581 | min->append(dmin); |
582 | max->append(dmax); |
583 | } else if (!max->empty()) { |
584 | // prog_->PossibleMatchRange has failed us, |
585 | // but we still have useful information from prefix_. |
586 | // Round up *max to allow any possible suffix. |
587 | PrefixSuccessor(max); |
588 | } else { |
589 | // Nothing useful. |
590 | *min = "" ; |
591 | *max = "" ; |
592 | return false; |
593 | } |
594 | |
595 | return true; |
596 | } |
597 | |
598 | // Avoid possible locale nonsense in standard strcasecmp. |
599 | // The string a is known to be all lowercase. |
600 | static int ascii_strcasecmp(const char* a, const char* b, size_t len) { |
601 | const char* ae = a + len; |
602 | |
603 | for (; a < ae; a++, b++) { |
604 | uint8_t x = *a; |
605 | uint8_t y = *b; |
606 | if ('A' <= y && y <= 'Z') |
607 | y += 'a' - 'A'; |
608 | if (x != y) |
609 | return x - y; |
610 | } |
611 | return 0; |
612 | } |
613 | |
614 | |
615 | /***** Actual matching and rewriting code *****/ |
616 | |
617 | bool RE2::Match(absl::string_view text, |
618 | size_t startpos, |
619 | size_t endpos, |
620 | Anchor re_anchor, |
621 | absl::string_view* submatch, |
622 | int nsubmatch) const { |
623 | if (!ok()) { |
624 | if (options_.log_errors()) |
625 | LOG(ERROR) << "Invalid RE2: " << *error_; |
626 | return false; |
627 | } |
628 | |
629 | if (startpos > endpos || endpos > text.size()) { |
630 | if (options_.log_errors()) |
631 | LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" |
632 | << "startpos: " << startpos << ", " |
633 | << "endpos: " << endpos << ", " |
634 | << "text size: " << text.size() << "]" ; |
635 | return false; |
636 | } |
637 | |
638 | absl::string_view subtext = text; |
639 | subtext.remove_prefix(startpos); |
640 | subtext.remove_suffix(text.size() - endpos); |
641 | |
642 | // Use DFAs to find exact location of match, filter out non-matches. |
643 | |
644 | // Don't ask for the location if we won't use it. |
645 | // SearchDFA can do extra optimizations in that case. |
646 | absl::string_view match; |
647 | absl::string_view* matchp = &match; |
648 | if (nsubmatch == 0) |
649 | matchp = NULL; |
650 | |
651 | int ncap = 1 + NumberOfCapturingGroups(); |
652 | if (ncap > nsubmatch) |
653 | ncap = nsubmatch; |
654 | |
655 | // If the regexp is anchored explicitly, must not be in middle of text. |
656 | if (prog_->anchor_start() && startpos != 0) |
657 | return false; |
658 | if (prog_->anchor_end() && endpos != text.size()) |
659 | return false; |
660 | |
661 | // If the regexp is anchored explicitly, update re_anchor |
662 | // so that we can potentially fall into a faster case below. |
663 | if (prog_->anchor_start() && prog_->anchor_end()) |
664 | re_anchor = ANCHOR_BOTH; |
665 | else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) |
666 | re_anchor = ANCHOR_START; |
667 | |
668 | // Check for the required prefix, if any. |
669 | size_t prefixlen = 0; |
670 | if (!prefix_.empty()) { |
671 | if (startpos != 0) |
672 | return false; |
673 | prefixlen = prefix_.size(); |
674 | if (prefixlen > subtext.size()) |
675 | return false; |
676 | if (prefix_foldcase_) { |
677 | if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
678 | return false; |
679 | } else { |
680 | if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
681 | return false; |
682 | } |
683 | subtext.remove_prefix(prefixlen); |
684 | // If there is a required prefix, the anchor must be at least ANCHOR_START. |
685 | if (re_anchor != ANCHOR_BOTH) |
686 | re_anchor = ANCHOR_START; |
687 | } |
688 | |
689 | Prog::Anchor anchor = Prog::kUnanchored; |
690 | Prog::MatchKind kind = Prog::kFirstMatch; |
691 | if (options_.longest_match()) |
692 | kind = Prog::kLongestMatch; |
693 | |
694 | bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; |
695 | bool can_bit_state = prog_->CanBitState(); |
696 | size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); |
697 | |
698 | #ifdef RE2_HAVE_THREAD_LOCAL |
699 | hooks::context = this; |
700 | #endif |
701 | bool dfa_failed = false; |
702 | bool skipped_test = false; |
703 | switch (re_anchor) { |
704 | default: |
705 | LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; |
706 | return false; |
707 | |
708 | case UNANCHORED: { |
709 | if (prog_->anchor_end()) { |
710 | // This is a very special case: we don't need the forward DFA because |
711 | // we already know where the match must end! Instead, the reverse DFA |
712 | // can say whether there is a match and (optionally) where it starts. |
713 | Prog* prog = ReverseProg(); |
714 | if (prog == NULL) { |
715 | // Fall back to NFA below. |
716 | skipped_test = true; |
717 | break; |
718 | } |
719 | if (!prog->SearchDFA(subtext, text, Prog::kAnchored, |
720 | Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { |
721 | if (dfa_failed) { |
722 | if (options_.log_errors()) |
723 | LOG(ERROR) << "DFA out of memory: " |
724 | << "pattern length " << pattern_.size() << ", " |
725 | << "program size " << prog->size() << ", " |
726 | << "list count " << prog->list_count() << ", " |
727 | << "bytemap range " << prog->bytemap_range(); |
728 | // Fall back to NFA below. |
729 | skipped_test = true; |
730 | break; |
731 | } |
732 | return false; |
733 | } |
734 | if (matchp == NULL) // Matched. Don't care where. |
735 | return true; |
736 | break; |
737 | } |
738 | |
739 | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
740 | matchp, &dfa_failed, NULL)) { |
741 | if (dfa_failed) { |
742 | if (options_.log_errors()) |
743 | LOG(ERROR) << "DFA out of memory: " |
744 | << "pattern length " << pattern_.size() << ", " |
745 | << "program size " << prog_->size() << ", " |
746 | << "list count " << prog_->list_count() << ", " |
747 | << "bytemap range " << prog_->bytemap_range(); |
748 | // Fall back to NFA below. |
749 | skipped_test = true; |
750 | break; |
751 | } |
752 | return false; |
753 | } |
754 | if (matchp == NULL) // Matched. Don't care where. |
755 | return true; |
756 | // SearchDFA set match.end() but didn't know where the |
757 | // match started. Run the regexp backward from match.end() |
758 | // to find the longest possible match -- that's where it started. |
759 | Prog* prog = ReverseProg(); |
760 | if (prog == NULL) { |
761 | // Fall back to NFA below. |
762 | skipped_test = true; |
763 | break; |
764 | } |
765 | if (!prog->SearchDFA(match, text, Prog::kAnchored, |
766 | Prog::kLongestMatch, &match, &dfa_failed, NULL)) { |
767 | if (dfa_failed) { |
768 | if (options_.log_errors()) |
769 | LOG(ERROR) << "DFA out of memory: " |
770 | << "pattern length " << pattern_.size() << ", " |
771 | << "program size " << prog->size() << ", " |
772 | << "list count " << prog->list_count() << ", " |
773 | << "bytemap range " << prog->bytemap_range(); |
774 | // Fall back to NFA below. |
775 | skipped_test = true; |
776 | break; |
777 | } |
778 | if (options_.log_errors()) |
779 | LOG(ERROR) << "SearchDFA inconsistency" ; |
780 | return false; |
781 | } |
782 | break; |
783 | } |
784 | |
785 | case ANCHOR_BOTH: |
786 | case ANCHOR_START: |
787 | if (re_anchor == ANCHOR_BOTH) |
788 | kind = Prog::kFullMatch; |
789 | anchor = Prog::kAnchored; |
790 | |
791 | // If only a small amount of text and need submatch |
792 | // information anyway and we're going to use OnePass or BitState |
793 | // to get it, we might as well not even bother with the DFA: |
794 | // OnePass or BitState will be fast enough. |
795 | // On tiny texts, OnePass outruns even the DFA, and |
796 | // it doesn't have the shared state and occasional mutex that |
797 | // the DFA does. |
798 | if (can_one_pass && text.size() <= 4096 && |
799 | (ncap > 1 || text.size() <= 16)) { |
800 | skipped_test = true; |
801 | break; |
802 | } |
803 | if (can_bit_state && text.size() <= bit_state_text_max_size && |
804 | ncap > 1) { |
805 | skipped_test = true; |
806 | break; |
807 | } |
808 | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
809 | &match, &dfa_failed, NULL)) { |
810 | if (dfa_failed) { |
811 | if (options_.log_errors()) |
812 | LOG(ERROR) << "DFA out of memory: " |
813 | << "pattern length " << pattern_.size() << ", " |
814 | << "program size " << prog_->size() << ", " |
815 | << "list count " << prog_->list_count() << ", " |
816 | << "bytemap range " << prog_->bytemap_range(); |
817 | // Fall back to NFA below. |
818 | skipped_test = true; |
819 | break; |
820 | } |
821 | return false; |
822 | } |
823 | break; |
824 | } |
825 | |
826 | if (!skipped_test && ncap <= 1) { |
827 | // We know exactly where it matches. That's enough. |
828 | if (ncap == 1) |
829 | submatch[0] = match; |
830 | } else { |
831 | absl::string_view subtext1; |
832 | if (skipped_test) { |
833 | // DFA ran out of memory or was skipped: |
834 | // need to search in entire original text. |
835 | subtext1 = subtext; |
836 | } else { |
837 | // DFA found the exact match location: |
838 | // let NFA run an anchored, full match search |
839 | // to find submatch locations. |
840 | subtext1 = match; |
841 | anchor = Prog::kAnchored; |
842 | kind = Prog::kFullMatch; |
843 | } |
844 | |
845 | if (can_one_pass && anchor != Prog::kUnanchored) { |
846 | if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { |
847 | if (!skipped_test && options_.log_errors()) |
848 | LOG(ERROR) << "SearchOnePass inconsistency" ; |
849 | return false; |
850 | } |
851 | } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) { |
852 | if (!prog_->SearchBitState(subtext1, text, anchor, |
853 | kind, submatch, ncap)) { |
854 | if (!skipped_test && options_.log_errors()) |
855 | LOG(ERROR) << "SearchBitState inconsistency" ; |
856 | return false; |
857 | } |
858 | } else { |
859 | if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { |
860 | if (!skipped_test && options_.log_errors()) |
861 | LOG(ERROR) << "SearchNFA inconsistency" ; |
862 | return false; |
863 | } |
864 | } |
865 | } |
866 | |
867 | // Adjust overall match for required prefix that we stripped off. |
868 | if (prefixlen > 0 && nsubmatch > 0) |
869 | submatch[0] = absl::string_view(submatch[0].data() - prefixlen, |
870 | submatch[0].size() + prefixlen); |
871 | |
872 | // Zero submatches that don't exist in the regexp. |
873 | for (int i = ncap; i < nsubmatch; i++) |
874 | submatch[i] = absl::string_view(); |
875 | return true; |
876 | } |
877 | |
878 | // Internal matcher - like Match() but takes Args not string_views. |
879 | bool RE2::DoMatch(absl::string_view text, |
880 | Anchor re_anchor, |
881 | size_t* consumed, |
882 | const Arg* const* args, |
883 | int n) const { |
884 | if (!ok()) { |
885 | if (options_.log_errors()) |
886 | LOG(ERROR) << "Invalid RE2: " << *error_; |
887 | return false; |
888 | } |
889 | |
890 | if (NumberOfCapturingGroups() < n) { |
891 | // RE has fewer capturing groups than number of Arg pointers passed in. |
892 | return false; |
893 | } |
894 | |
895 | // Count number of capture groups needed. |
896 | int nvec; |
897 | if (n == 0 && consumed == NULL) |
898 | nvec = 0; |
899 | else |
900 | nvec = n+1; |
901 | |
902 | absl::FixedArray<absl::string_view, kVecSize> vec_storage(nvec); |
903 | absl::string_view* vec = vec_storage.data(); |
904 | |
905 | if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { |
906 | return false; |
907 | } |
908 | |
909 | if (consumed != NULL) |
910 | *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text)); |
911 | |
912 | if (n == 0 || args == NULL) { |
913 | // We are not interested in results |
914 | return true; |
915 | } |
916 | |
917 | // If we got here, we must have matched the whole pattern. |
918 | for (int i = 0; i < n; i++) { |
919 | absl::string_view s = vec[i+1]; |
920 | if (!args[i]->Parse(s.data(), s.size())) { |
921 | // TODO: Should we indicate what the error was? |
922 | return false; |
923 | } |
924 | } |
925 | |
926 | return true; |
927 | } |
928 | |
929 | // Checks that the rewrite string is well-formed with respect to this |
930 | // regular expression. |
931 | bool RE2::CheckRewriteString(absl::string_view rewrite, |
932 | std::string* error) const { |
933 | int max_token = -1; |
934 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
935 | s < end; s++) { |
936 | int c = *s; |
937 | if (c != '\\') { |
938 | continue; |
939 | } |
940 | if (++s == end) { |
941 | *error = "Rewrite schema error: '\\' not allowed at end." ; |
942 | return false; |
943 | } |
944 | c = *s; |
945 | if (c == '\\') { |
946 | continue; |
947 | } |
948 | if (!isdigit(c)) { |
949 | *error = "Rewrite schema error: " |
950 | "'\\' must be followed by a digit or '\\'." ; |
951 | return false; |
952 | } |
953 | int n = (c - '0'); |
954 | if (max_token < n) { |
955 | max_token = n; |
956 | } |
957 | } |
958 | |
959 | if (max_token > NumberOfCapturingGroups()) { |
960 | *error = absl::StrFormat( |
961 | "Rewrite schema requests %d matches, but the regexp only has %d " |
962 | "parenthesized subexpressions." , |
963 | max_token, NumberOfCapturingGroups()); |
964 | return false; |
965 | } |
966 | return true; |
967 | } |
968 | |
969 | // Returns the maximum submatch needed for the rewrite to be done by Replace(). |
970 | // E.g. if rewrite == "foo \\2,\\1", returns 2. |
971 | int RE2::MaxSubmatch(absl::string_view rewrite) { |
972 | int max = 0; |
973 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
974 | s < end; s++) { |
975 | if (*s == '\\') { |
976 | s++; |
977 | int c = (s < end) ? *s : -1; |
978 | if (isdigit(c)) { |
979 | int n = (c - '0'); |
980 | if (n > max) |
981 | max = n; |
982 | } |
983 | } |
984 | } |
985 | return max; |
986 | } |
987 | |
988 | // Append the "rewrite" string, with backslash subsitutions from "vec", |
989 | // to string "out". |
990 | bool RE2::Rewrite(std::string* out, |
991 | absl::string_view rewrite, |
992 | const absl::string_view* vec, |
993 | int veclen) const { |
994 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
995 | s < end; s++) { |
996 | if (*s != '\\') { |
997 | out->push_back(*s); |
998 | continue; |
999 | } |
1000 | s++; |
1001 | int c = (s < end) ? *s : -1; |
1002 | if (isdigit(c)) { |
1003 | int n = (c - '0'); |
1004 | if (n >= veclen) { |
1005 | if (options_.log_errors()) { |
1006 | LOG(ERROR) << "invalid substitution \\" << n |
1007 | << " from " << veclen << " groups" ; |
1008 | } |
1009 | return false; |
1010 | } |
1011 | absl::string_view snip = vec[n]; |
1012 | if (!snip.empty()) |
1013 | out->append(snip.data(), snip.size()); |
1014 | } else if (c == '\\') { |
1015 | out->push_back('\\'); |
1016 | } else { |
1017 | if (options_.log_errors()) |
1018 | LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
1019 | return false; |
1020 | } |
1021 | } |
1022 | return true; |
1023 | } |
1024 | |
1025 | /***** Parsers for various types *****/ |
1026 | |
1027 | namespace re2_internal { |
1028 | |
1029 | template <> |
1030 | bool Parse(const char* str, size_t n, void* dest) { |
1031 | // We fail if somebody asked us to store into a non-NULL void* pointer |
1032 | return (dest == NULL); |
1033 | } |
1034 | |
1035 | template <> |
1036 | bool Parse(const char* str, size_t n, std::string* dest) { |
1037 | if (dest == NULL) return true; |
1038 | dest->assign(str, n); |
1039 | return true; |
1040 | } |
1041 | |
1042 | template <> |
1043 | bool Parse(const char* str, size_t n, absl::string_view* dest) { |
1044 | if (dest == NULL) return true; |
1045 | *dest = absl::string_view(str, n); |
1046 | return true; |
1047 | } |
1048 | |
1049 | template <> |
1050 | bool Parse(const char* str, size_t n, char* dest) { |
1051 | if (n != 1) return false; |
1052 | if (dest == NULL) return true; |
1053 | *dest = str[0]; |
1054 | return true; |
1055 | } |
1056 | |
1057 | template <> |
1058 | bool Parse(const char* str, size_t n, signed char* dest) { |
1059 | if (n != 1) return false; |
1060 | if (dest == NULL) return true; |
1061 | *dest = str[0]; |
1062 | return true; |
1063 | } |
1064 | |
1065 | template <> |
1066 | bool Parse(const char* str, size_t n, unsigned char* dest) { |
1067 | if (n != 1) return false; |
1068 | if (dest == NULL) return true; |
1069 | *dest = str[0]; |
1070 | return true; |
1071 | } |
1072 | |
1073 | // Largest number spec that we are willing to parse |
1074 | static const int kMaxNumberLength = 32; |
1075 | |
1076 | // REQUIRES "buf" must have length at least nbuf. |
1077 | // Copies "str" into "buf" and null-terminates. |
1078 | // Overwrites *np with the new length. |
1079 | static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, |
1080 | size_t* np, bool accept_spaces) { |
1081 | size_t n = *np; |
1082 | if (n == 0) return "" ; |
1083 | if (n > 0 && isspace(*str)) { |
1084 | // We are less forgiving than the strtoxxx() routines and do not |
1085 | // allow leading spaces. We do allow leading spaces for floats. |
1086 | if (!accept_spaces) { |
1087 | return "" ; |
1088 | } |
1089 | while (n > 0 && isspace(*str)) { |
1090 | n--; |
1091 | str++; |
1092 | } |
1093 | } |
1094 | |
1095 | // Although buf has a fixed maximum size, we can still handle |
1096 | // arbitrarily large integers correctly by omitting leading zeros. |
1097 | // (Numbers that are still too long will be out of range.) |
1098 | // Before deciding whether str is too long, |
1099 | // remove leading zeros with s/000+/00/. |
1100 | // Leaving the leading two zeros in place means that |
1101 | // we don't change 0000x123 (invalid) into 0x123 (valid). |
1102 | // Skip over leading - before replacing. |
1103 | bool neg = false; |
1104 | if (n >= 1 && str[0] == '-') { |
1105 | neg = true; |
1106 | n--; |
1107 | str++; |
1108 | } |
1109 | |
1110 | if (n >= 3 && str[0] == '0' && str[1] == '0') { |
1111 | while (n >= 3 && str[2] == '0') { |
1112 | n--; |
1113 | str++; |
1114 | } |
1115 | } |
1116 | |
1117 | if (neg) { // make room in buf for - |
1118 | n++; |
1119 | str--; |
1120 | } |
1121 | |
1122 | if (n > nbuf-1) return "" ; |
1123 | |
1124 | memmove(buf, str, n); |
1125 | if (neg) { |
1126 | buf[0] = '-'; |
1127 | } |
1128 | buf[n] = '\0'; |
1129 | *np = n; |
1130 | return buf; |
1131 | } |
1132 | |
1133 | template <> |
1134 | bool Parse(const char* str, size_t n, float* dest) { |
1135 | if (n == 0) return false; |
1136 | static const int kMaxLength = 200; |
1137 | char buf[kMaxLength+1]; |
1138 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1139 | char* end; |
1140 | errno = 0; |
1141 | float r = strtof(str, &end); |
1142 | if (end != str + n) return false; // Leftover junk |
1143 | if (errno) return false; |
1144 | if (dest == NULL) return true; |
1145 | *dest = r; |
1146 | return true; |
1147 | } |
1148 | |
1149 | template <> |
1150 | bool Parse(const char* str, size_t n, double* dest) { |
1151 | if (n == 0) return false; |
1152 | static const int kMaxLength = 200; |
1153 | char buf[kMaxLength+1]; |
1154 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1155 | char* end; |
1156 | errno = 0; |
1157 | double r = strtod(str, &end); |
1158 | if (end != str + n) return false; // Leftover junk |
1159 | if (errno) return false; |
1160 | if (dest == NULL) return true; |
1161 | *dest = r; |
1162 | return true; |
1163 | } |
1164 | |
1165 | template <> |
1166 | bool Parse(const char* str, size_t n, long* dest, int radix) { |
1167 | if (n == 0) return false; |
1168 | char buf[kMaxNumberLength+1]; |
1169 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1170 | char* end; |
1171 | errno = 0; |
1172 | long r = strtol(str, &end, radix); |
1173 | if (end != str + n) return false; // Leftover junk |
1174 | if (errno) return false; |
1175 | if (dest == NULL) return true; |
1176 | *dest = r; |
1177 | return true; |
1178 | } |
1179 | |
1180 | template <> |
1181 | bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { |
1182 | if (n == 0) return false; |
1183 | char buf[kMaxNumberLength+1]; |
1184 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1185 | if (str[0] == '-') { |
1186 | // strtoul() will silently accept negative numbers and parse |
1187 | // them. This module is more strict and treats them as errors. |
1188 | return false; |
1189 | } |
1190 | |
1191 | char* end; |
1192 | errno = 0; |
1193 | unsigned long r = strtoul(str, &end, radix); |
1194 | if (end != str + n) return false; // Leftover junk |
1195 | if (errno) return false; |
1196 | if (dest == NULL) return true; |
1197 | *dest = r; |
1198 | return true; |
1199 | } |
1200 | |
1201 | template <> |
1202 | bool Parse(const char* str, size_t n, short* dest, int radix) { |
1203 | long r; |
1204 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1205 | if ((short)r != r) return false; // Out of range |
1206 | if (dest == NULL) return true; |
1207 | *dest = (short)r; |
1208 | return true; |
1209 | } |
1210 | |
1211 | template <> |
1212 | bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { |
1213 | unsigned long r; |
1214 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1215 | if ((unsigned short)r != r) return false; // Out of range |
1216 | if (dest == NULL) return true; |
1217 | *dest = (unsigned short)r; |
1218 | return true; |
1219 | } |
1220 | |
1221 | template <> |
1222 | bool Parse(const char* str, size_t n, int* dest, int radix) { |
1223 | long r; |
1224 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1225 | if ((int)r != r) return false; // Out of range |
1226 | if (dest == NULL) return true; |
1227 | *dest = (int)r; |
1228 | return true; |
1229 | } |
1230 | |
1231 | template <> |
1232 | bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { |
1233 | unsigned long r; |
1234 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1235 | if ((unsigned int)r != r) return false; // Out of range |
1236 | if (dest == NULL) return true; |
1237 | *dest = (unsigned int)r; |
1238 | return true; |
1239 | } |
1240 | |
1241 | template <> |
1242 | bool Parse(const char* str, size_t n, long long* dest, int radix) { |
1243 | if (n == 0) return false; |
1244 | char buf[kMaxNumberLength+1]; |
1245 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1246 | char* end; |
1247 | errno = 0; |
1248 | long long r = strtoll(str, &end, radix); |
1249 | if (end != str + n) return false; // Leftover junk |
1250 | if (errno) return false; |
1251 | if (dest == NULL) return true; |
1252 | *dest = r; |
1253 | return true; |
1254 | } |
1255 | |
1256 | template <> |
1257 | bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { |
1258 | if (n == 0) return false; |
1259 | char buf[kMaxNumberLength+1]; |
1260 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1261 | if (str[0] == '-') { |
1262 | // strtoull() will silently accept negative numbers and parse |
1263 | // them. This module is more strict and treats them as errors. |
1264 | return false; |
1265 | } |
1266 | char* end; |
1267 | errno = 0; |
1268 | unsigned long long r = strtoull(str, &end, radix); |
1269 | if (end != str + n) return false; // Leftover junk |
1270 | if (errno) return false; |
1271 | if (dest == NULL) return true; |
1272 | *dest = r; |
1273 | return true; |
1274 | } |
1275 | |
1276 | } // namespace re2_internal |
1277 | |
1278 | namespace hooks { |
1279 | |
1280 | #ifdef RE2_HAVE_THREAD_LOCAL |
1281 | thread_local const RE2* context = NULL; |
1282 | #endif |
1283 | |
1284 | template <typename T> |
1285 | union Hook { |
1286 | void Store(T* cb) { cb_.store(cb, std::memory_order_release); } |
1287 | T* Load() const { return cb_.load(std::memory_order_acquire); } |
1288 | |
1289 | #if !defined(__clang__) && defined(_MSC_VER) |
1290 | // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, |
1291 | // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC. |
1292 | static_assert(ATOMIC_POINTER_LOCK_FREE == 2, |
1293 | "std::atomic<T*> must be always lock-free" ); |
1294 | T* cb_for_constinit_; |
1295 | #endif |
1296 | |
1297 | std::atomic<T*> cb_; |
1298 | }; |
1299 | |
1300 | template <typename T> |
1301 | static void DoNothing(const T&) {} |
1302 | |
1303 | #define DEFINE_HOOK(type, name) \ |
1304 | static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \ |
1305 | void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \ |
1306 | type##Callback* Get##type##Hook() { return name##_hook.Load(); } |
1307 | |
1308 | DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) |
1309 | DEFINE_HOOK(DFASearchFailure, dfa_search_failure) |
1310 | |
1311 | #undef DEFINE_HOOK |
1312 | |
1313 | } // namespace hooks |
1314 | |
1315 | } // namespace re2 |
1316 | |