1// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression interface RE2.
6//
7// Originally the PCRE C++ wrapper, but adapted to use
8// the new automata-based regular expression engines.
9
10#include "re2/re2.h"
11
12#include <assert.h>
13#include <ctype.h>
14#include <errno.h>
15#ifdef _MSC_VER
16#include <intrin.h>
17#endif
18#include <stdint.h>
19#include <stdlib.h>
20#include <string.h>
21#include <algorithm>
22#include <atomic>
23#include <iterator>
24#include <string>
25#include <utility>
26#include <vector>
27
28#include "absl/base/macros.h"
29#include "absl/container/fixed_array.h"
30#include "absl/strings/str_format.h"
31#include "util/logging.h"
32#include "util/strutil.h"
33#include "util/utf.h"
34#include "re2/prog.h"
35#include "re2/regexp.h"
36#include "re2/sparse_array.h"
37
38namespace re2 {
39
40// Maximum number of args we can set
41static const int kMaxArgs = 16;
42static const int kVecSize = 1+kMaxArgs;
43
44const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
45
46RE2::Options::Options(RE2::CannedOptions opt)
47 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
48 posix_syntax_(opt == RE2::POSIX),
49 longest_match_(opt == RE2::POSIX),
50 log_errors_(opt != RE2::Quiet),
51 max_mem_(kDefaultMaxMem),
52 literal_(false),
53 never_nl_(false),
54 dot_nl_(false),
55 never_capture_(false),
56 case_sensitive_(true),
57 perl_classes_(false),
58 word_boundary_(false),
59 one_line_(false) {
60}
61
62// static empty objects for use as const references.
63// To avoid global constructors, allocated in RE2::Init().
64static const std::string* empty_string;
65static const std::map<std::string, int>* empty_named_groups;
66static const std::map<int, std::string>* empty_group_names;
67
68// Converts from Regexp error code to RE2 error code.
69// Maybe some day they will diverge. In any event, this
70// hides the existence of Regexp from RE2 users.
71static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
72 switch (code) {
73 case re2::kRegexpSuccess:
74 return RE2::NoError;
75 case re2::kRegexpInternalError:
76 return RE2::ErrorInternal;
77 case re2::kRegexpBadEscape:
78 return RE2::ErrorBadEscape;
79 case re2::kRegexpBadCharClass:
80 return RE2::ErrorBadCharClass;
81 case re2::kRegexpBadCharRange:
82 return RE2::ErrorBadCharRange;
83 case re2::kRegexpMissingBracket:
84 return RE2::ErrorMissingBracket;
85 case re2::kRegexpMissingParen:
86 return RE2::ErrorMissingParen;
87 case re2::kRegexpUnexpectedParen:
88 return RE2::ErrorUnexpectedParen;
89 case re2::kRegexpTrailingBackslash:
90 return RE2::ErrorTrailingBackslash;
91 case re2::kRegexpRepeatArgument:
92 return RE2::ErrorRepeatArgument;
93 case re2::kRegexpRepeatSize:
94 return RE2::ErrorRepeatSize;
95 case re2::kRegexpRepeatOp:
96 return RE2::ErrorRepeatOp;
97 case re2::kRegexpBadPerlOp:
98 return RE2::ErrorBadPerlOp;
99 case re2::kRegexpBadUTF8:
100 return RE2::ErrorBadUTF8;
101 case re2::kRegexpBadNamedCapture:
102 return RE2::ErrorBadNamedCapture;
103 }
104 return RE2::ErrorInternal;
105}
106
107static std::string trunc(absl::string_view pattern) {
108 if (pattern.size() < 100)
109 return std::string(pattern);
110 return std::string(pattern.substr(0, 100)) + "...";
111}
112
113
114RE2::RE2(const char* pattern) {
115 Init(pattern, DefaultOptions);
116}
117
118RE2::RE2(const std::string& pattern) {
119 Init(pattern, DefaultOptions);
120}
121
122RE2::RE2(absl::string_view pattern) {
123 Init(pattern, DefaultOptions);
124}
125
126RE2::RE2(absl::string_view pattern, const Options& options) {
127 Init(pattern, options);
128}
129
130int RE2::Options::ParseFlags() const {
131 int flags = Regexp::ClassNL;
132 switch (encoding()) {
133 default:
134 if (log_errors())
135 LOG(ERROR) << "Unknown encoding " << encoding();
136 break;
137 case RE2::Options::EncodingUTF8:
138 break;
139 case RE2::Options::EncodingLatin1:
140 flags |= Regexp::Latin1;
141 break;
142 }
143
144 if (!posix_syntax())
145 flags |= Regexp::LikePerl;
146
147 if (literal())
148 flags |= Regexp::Literal;
149
150 if (never_nl())
151 flags |= Regexp::NeverNL;
152
153 if (dot_nl())
154 flags |= Regexp::DotNL;
155
156 if (never_capture())
157 flags |= Regexp::NeverCapture;
158
159 if (!case_sensitive())
160 flags |= Regexp::FoldCase;
161
162 if (perl_classes())
163 flags |= Regexp::PerlClasses;
164
165 if (word_boundary())
166 flags |= Regexp::PerlB;
167
168 if (one_line())
169 flags |= Regexp::OneLine;
170
171 return flags;
172}
173
174void RE2::Init(absl::string_view pattern, const Options& options) {
175 static absl::once_flag empty_once;
176 absl::call_once(empty_once, []() {
177 empty_string = new std::string;
178 empty_named_groups = new std::map<std::string, int>;
179 empty_group_names = new std::map<int, std::string>;
180 });
181
182 pattern_.assign(pattern.data(), pattern.size());
183 options_.Copy(options);
184 entire_regexp_ = NULL;
185 error_ = empty_string;
186 error_code_ = NoError;
187 error_arg_.clear();
188 prefix_.clear();
189 prefix_foldcase_ = false;
190 suffix_regexp_ = NULL;
191 prog_ = NULL;
192 num_captures_ = -1;
193 is_one_pass_ = false;
194
195 rprog_ = NULL;
196 named_groups_ = NULL;
197 group_names_ = NULL;
198
199 RegexpStatus status;
200 entire_regexp_ = Regexp::Parse(
201 pattern_,
202 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
203 &status);
204 if (entire_regexp_ == NULL) {
205 if (options_.log_errors()) {
206 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
207 << status.Text();
208 }
209 error_ = new std::string(status.Text());
210 error_code_ = RegexpErrorToRE2(status.code());
211 error_arg_ = std::string(status.error_arg());
212 return;
213 }
214
215 re2::Regexp* suffix;
216 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
217 suffix_regexp_ = suffix;
218 else
219 suffix_regexp_ = entire_regexp_->Incref();
220
221 // Two thirds of the memory goes to the forward Prog,
222 // one third to the reverse prog, because the forward
223 // Prog has two DFAs but the reverse prog has one.
224 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
225 if (prog_ == NULL) {
226 if (options_.log_errors())
227 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
228 error_ = new std::string("pattern too large - compile failed");
229 error_code_ = RE2::ErrorPatternTooLarge;
230 return;
231 }
232
233 // We used to compute this lazily, but it's used during the
234 // typical control flow for a match call, so we now compute
235 // it eagerly, which avoids the overhead of absl::once_flag.
236 num_captures_ = suffix_regexp_->NumCaptures();
237
238 // Could delay this until the first match call that
239 // cares about submatch information, but the one-pass
240 // machine's memory gets cut from the DFA memory budget,
241 // and that is harder to do if the DFA has already
242 // been built.
243 is_one_pass_ = prog_->IsOnePass();
244}
245
246// Returns rprog_, computing it if needed.
247re2::Prog* RE2::ReverseProg() const {
248 absl::call_once(rprog_once_, [](const RE2* re) {
249 re->rprog_ =
250 re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
251 if (re->rprog_ == NULL) {
252 if (re->options_.log_errors())
253 LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
254 // We no longer touch error_ and error_code_ because failing to compile
255 // the reverse Prog is not a showstopper: falling back to NFA execution
256 // is fine. More importantly, an RE2 object is supposed to be logically
257 // immutable: whatever ok() would have returned after Init() completed,
258 // it should continue to return that no matter what ReverseProg() does.
259 }
260 }, this);
261 return rprog_;
262}
263
264RE2::~RE2() {
265 if (suffix_regexp_)
266 suffix_regexp_->Decref();
267 if (entire_regexp_)
268 entire_regexp_->Decref();
269 delete prog_;
270 delete rprog_;
271 if (error_ != empty_string)
272 delete error_;
273 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
274 delete named_groups_;
275 if (group_names_ != NULL && group_names_ != empty_group_names)
276 delete group_names_;
277}
278
279int RE2::ProgramSize() const {
280 if (prog_ == NULL)
281 return -1;
282 return prog_->size();
283}
284
285int RE2::ReverseProgramSize() const {
286 if (prog_ == NULL)
287 return -1;
288 Prog* prog = ReverseProg();
289 if (prog == NULL)
290 return -1;
291 return prog->size();
292}
293
294// Finds the most significant non-zero bit in n.
295static int FindMSBSet(uint32_t n) {
296 DCHECK_NE(n, 0);
297#if defined(__GNUC__)
298 return 31 ^ __builtin_clz(n);
299#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
300 unsigned long c;
301 _BitScanReverse(&c, n);
302 return static_cast<int>(c);
303#else
304 int c = 0;
305 for (int shift = 1 << 4; shift != 0; shift >>= 1) {
306 uint32_t word = n >> shift;
307 if (word != 0) {
308 n = word;
309 c += shift;
310 }
311 }
312 return c;
313#endif
314}
315
316static int Fanout(Prog* prog, std::vector<int>* histogram) {
317 SparseArray<int> fanout(prog->size());
318 prog->Fanout(&fanout);
319 int data[32] = {};
320 int size = 0;
321 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
322 if (i->value() == 0)
323 continue;
324 uint32_t value = i->value();
325 int bucket = FindMSBSet(value);
326 bucket += value & (value-1) ? 1 : 0;
327 ++data[bucket];
328 size = std::max(size, bucket+1);
329 }
330 if (histogram != NULL)
331 histogram->assign(data, data+size);
332 return size-1;
333}
334
335int RE2::ProgramFanout(std::vector<int>* histogram) const {
336 if (prog_ == NULL)
337 return -1;
338 return Fanout(prog_, histogram);
339}
340
341int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
342 if (prog_ == NULL)
343 return -1;
344 Prog* prog = ReverseProg();
345 if (prog == NULL)
346 return -1;
347 return Fanout(prog, histogram);
348}
349
350// Returns named_groups_, computing it if needed.
351const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
352 absl::call_once(named_groups_once_, [](const RE2* re) {
353 if (re->suffix_regexp_ != NULL)
354 re->named_groups_ = re->suffix_regexp_->NamedCaptures();
355 if (re->named_groups_ == NULL)
356 re->named_groups_ = empty_named_groups;
357 }, this);
358 return *named_groups_;
359}
360
361// Returns group_names_, computing it if needed.
362const std::map<int, std::string>& RE2::CapturingGroupNames() const {
363 absl::call_once(group_names_once_, [](const RE2* re) {
364 if (re->suffix_regexp_ != NULL)
365 re->group_names_ = re->suffix_regexp_->CaptureNames();
366 if (re->group_names_ == NULL)
367 re->group_names_ = empty_group_names;
368 }, this);
369 return *group_names_;
370}
371
372/***** Convenience interfaces *****/
373
374bool RE2::FullMatchN(absl::string_view text, const RE2& re,
375 const Arg* const args[], int n) {
376 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
377}
378
379bool RE2::PartialMatchN(absl::string_view text, const RE2& re,
380 const Arg* const args[], int n) {
381 return re.DoMatch(text, UNANCHORED, NULL, args, n);
382}
383
384bool RE2::ConsumeN(absl::string_view* input, const RE2& re,
385 const Arg* const args[], int n) {
386 size_t consumed;
387 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
388 input->remove_prefix(consumed);
389 return true;
390 } else {
391 return false;
392 }
393}
394
395bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re,
396 const Arg* const args[], int n) {
397 size_t consumed;
398 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
399 input->remove_prefix(consumed);
400 return true;
401 } else {
402 return false;
403 }
404}
405
406bool RE2::Replace(std::string* str,
407 const RE2& re,
408 absl::string_view rewrite) {
409 absl::string_view vec[kVecSize];
410 int nvec = 1 + MaxSubmatch(rewrite);
411 if (nvec > 1 + re.NumberOfCapturingGroups())
412 return false;
413 if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
414 return false;
415 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
416 return false;
417
418 std::string s;
419 if (!re.Rewrite(&s, rewrite, vec, nvec))
420 return false;
421
422 assert(vec[0].data() >= str->data());
423 assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
424 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
425 return true;
426}
427
428int RE2::GlobalReplace(std::string* str,
429 const RE2& re,
430 absl::string_view rewrite) {
431 absl::string_view vec[kVecSize];
432 int nvec = 1 + MaxSubmatch(rewrite);
433 if (nvec > 1 + re.NumberOfCapturingGroups())
434 return false;
435 if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
436 return false;
437
438 const char* p = str->data();
439 const char* ep = p + str->size();
440 const char* lastend = NULL;
441 std::string out;
442 int count = 0;
443#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
444 // Iterate just once when fuzzing. Otherwise, we easily get bogged down
445 // and coverage is unlikely to improve despite significant expense.
446 while (p == str->data()) {
447#else
448 while (p <= ep) {
449#endif
450 if (!re.Match(*str, static_cast<size_t>(p - str->data()),
451 str->size(), UNANCHORED, vec, nvec))
452 break;
453 if (p < vec[0].data())
454 out.append(p, vec[0].data() - p);
455 if (vec[0].data() == lastend && vec[0].empty()) {
456 // Disallow empty match at end of last match: skip ahead.
457 //
458 // fullrune() takes int, not ptrdiff_t. However, it just looks
459 // at the leading byte and treats any length >= 4 the same.
460 if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
461 fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
462 // re is in UTF-8 mode and there is enough left of str
463 // to allow us to advance by up to UTFmax bytes.
464 Rune r;
465 int n = chartorune(&r, p);
466 // Some copies of chartorune have a bug that accepts
467 // encodings of values in (10FFFF, 1FFFFF] as valid.
468 if (r > Runemax) {
469 n = 1;
470 r = Runeerror;
471 }
472 if (!(n == 1 && r == Runeerror)) { // no decoding error
473 out.append(p, n);
474 p += n;
475 continue;
476 }
477 }
478 // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
479 // we fell through from above and the GIGO principle applies.
480 if (p < ep)
481 out.append(p, 1);
482 p++;
483 continue;
484 }
485 re.Rewrite(&out, rewrite, vec, nvec);
486 p = vec[0].data() + vec[0].size();
487 lastend = p;
488 count++;
489 }
490
491 if (count == 0)
492 return 0;
493
494 if (p < ep)
495 out.append(p, ep - p);
496 using std::swap;
497 swap(out, *str);
498 return count;
499}
500
501bool RE2::Extract(absl::string_view text,
502 const RE2& re,
503 absl::string_view rewrite,
504 std::string* out) {
505 absl::string_view vec[kVecSize];
506 int nvec = 1 + MaxSubmatch(rewrite);
507 if (nvec > 1 + re.NumberOfCapturingGroups())
508 return false;
509 if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
510 return false;
511 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
512 return false;
513
514 out->clear();
515 return re.Rewrite(out, rewrite, vec, nvec);
516}
517
518std::string RE2::QuoteMeta(absl::string_view unquoted) {
519 std::string result;
520 result.reserve(unquoted.size() << 1);
521
522 // Escape any ascii character not in [A-Za-z_0-9].
523 //
524 // Note that it's legal to escape a character even if it has no
525 // special meaning in a regular expression -- so this function does
526 // that. (This also makes it identical to the perl function of the
527 // same name except for the null-character special case;
528 // see `perldoc -f quotemeta`.)
529 for (size_t ii = 0; ii < unquoted.size(); ++ii) {
530 // Note that using 'isalnum' here raises the benchmark time from
531 // 32ns to 58ns:
532 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
533 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
534 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
535 unquoted[ii] != '_' &&
536 // If this is the part of a UTF8 or Latin1 character, we need
537 // to copy this byte without escaping. Experimentally this is
538 // what works correctly with the regexp library.
539 !(unquoted[ii] & 128)) {
540 if (unquoted[ii] == '\0') { // Special handling for null chars.
541 // Note that this special handling is not strictly required for RE2,
542 // but this quoting is required for other regexp libraries such as
543 // PCRE.
544 // Can't use "\\0" since the next character might be a digit.
545 result += "\\x00";
546 continue;
547 }
548 result += '\\';
549 }
550 result += unquoted[ii];
551 }
552
553 return result;
554}
555
556bool RE2::PossibleMatchRange(std::string* min, std::string* max,
557 int maxlen) const {
558 if (prog_ == NULL)
559 return false;
560
561 int n = static_cast<int>(prefix_.size());
562 if (n > maxlen)
563 n = maxlen;
564
565 // Determine initial min max from prefix_ literal.
566 *min = prefix_.substr(0, n);
567 *max = prefix_.substr(0, n);
568 if (prefix_foldcase_) {
569 // prefix is ASCII lowercase; change *min to uppercase.
570 for (int i = 0; i < n; i++) {
571 char& c = (*min)[i];
572 if ('a' <= c && c <= 'z')
573 c += 'A' - 'a';
574 }
575 }
576
577 // Add to prefix min max using PossibleMatchRange on regexp.
578 std::string dmin, dmax;
579 maxlen -= n;
580 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
581 min->append(dmin);
582 max->append(dmax);
583 } else if (!max->empty()) {
584 // prog_->PossibleMatchRange has failed us,
585 // but we still have useful information from prefix_.
586 // Round up *max to allow any possible suffix.
587 PrefixSuccessor(max);
588 } else {
589 // Nothing useful.
590 *min = "";
591 *max = "";
592 return false;
593 }
594
595 return true;
596}
597
598// Avoid possible locale nonsense in standard strcasecmp.
599// The string a is known to be all lowercase.
600static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
601 const char* ae = a + len;
602
603 for (; a < ae; a++, b++) {
604 uint8_t x = *a;
605 uint8_t y = *b;
606 if ('A' <= y && y <= 'Z')
607 y += 'a' - 'A';
608 if (x != y)
609 return x - y;
610 }
611 return 0;
612}
613
614
615/***** Actual matching and rewriting code *****/
616
617bool RE2::Match(absl::string_view text,
618 size_t startpos,
619 size_t endpos,
620 Anchor re_anchor,
621 absl::string_view* submatch,
622 int nsubmatch) const {
623 if (!ok()) {
624 if (options_.log_errors())
625 LOG(ERROR) << "Invalid RE2: " << *error_;
626 return false;
627 }
628
629 if (startpos > endpos || endpos > text.size()) {
630 if (options_.log_errors())
631 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
632 << "startpos: " << startpos << ", "
633 << "endpos: " << endpos << ", "
634 << "text size: " << text.size() << "]";
635 return false;
636 }
637
638 absl::string_view subtext = text;
639 subtext.remove_prefix(startpos);
640 subtext.remove_suffix(text.size() - endpos);
641
642 // Use DFAs to find exact location of match, filter out non-matches.
643
644 // Don't ask for the location if we won't use it.
645 // SearchDFA can do extra optimizations in that case.
646 absl::string_view match;
647 absl::string_view* matchp = &match;
648 if (nsubmatch == 0)
649 matchp = NULL;
650
651 int ncap = 1 + NumberOfCapturingGroups();
652 if (ncap > nsubmatch)
653 ncap = nsubmatch;
654
655 // If the regexp is anchored explicitly, must not be in middle of text.
656 if (prog_->anchor_start() && startpos != 0)
657 return false;
658 if (prog_->anchor_end() && endpos != text.size())
659 return false;
660
661 // If the regexp is anchored explicitly, update re_anchor
662 // so that we can potentially fall into a faster case below.
663 if (prog_->anchor_start() && prog_->anchor_end())
664 re_anchor = ANCHOR_BOTH;
665 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
666 re_anchor = ANCHOR_START;
667
668 // Check for the required prefix, if any.
669 size_t prefixlen = 0;
670 if (!prefix_.empty()) {
671 if (startpos != 0)
672 return false;
673 prefixlen = prefix_.size();
674 if (prefixlen > subtext.size())
675 return false;
676 if (prefix_foldcase_) {
677 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
678 return false;
679 } else {
680 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
681 return false;
682 }
683 subtext.remove_prefix(prefixlen);
684 // If there is a required prefix, the anchor must be at least ANCHOR_START.
685 if (re_anchor != ANCHOR_BOTH)
686 re_anchor = ANCHOR_START;
687 }
688
689 Prog::Anchor anchor = Prog::kUnanchored;
690 Prog::MatchKind kind = Prog::kFirstMatch;
691 if (options_.longest_match())
692 kind = Prog::kLongestMatch;
693
694 bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture;
695 bool can_bit_state = prog_->CanBitState();
696 size_t bit_state_text_max_size = prog_->bit_state_text_max_size();
697
698#ifdef RE2_HAVE_THREAD_LOCAL
699 hooks::context = this;
700#endif
701 bool dfa_failed = false;
702 bool skipped_test = false;
703 switch (re_anchor) {
704 default:
705 LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
706 return false;
707
708 case UNANCHORED: {
709 if (prog_->anchor_end()) {
710 // This is a very special case: we don't need the forward DFA because
711 // we already know where the match must end! Instead, the reverse DFA
712 // can say whether there is a match and (optionally) where it starts.
713 Prog* prog = ReverseProg();
714 if (prog == NULL) {
715 // Fall back to NFA below.
716 skipped_test = true;
717 break;
718 }
719 if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
720 Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
721 if (dfa_failed) {
722 if (options_.log_errors())
723 LOG(ERROR) << "DFA out of memory: "
724 << "pattern length " << pattern_.size() << ", "
725 << "program size " << prog->size() << ", "
726 << "list count " << prog->list_count() << ", "
727 << "bytemap range " << prog->bytemap_range();
728 // Fall back to NFA below.
729 skipped_test = true;
730 break;
731 }
732 return false;
733 }
734 if (matchp == NULL) // Matched. Don't care where.
735 return true;
736 break;
737 }
738
739 if (!prog_->SearchDFA(subtext, text, anchor, kind,
740 matchp, &dfa_failed, NULL)) {
741 if (dfa_failed) {
742 if (options_.log_errors())
743 LOG(ERROR) << "DFA out of memory: "
744 << "pattern length " << pattern_.size() << ", "
745 << "program size " << prog_->size() << ", "
746 << "list count " << prog_->list_count() << ", "
747 << "bytemap range " << prog_->bytemap_range();
748 // Fall back to NFA below.
749 skipped_test = true;
750 break;
751 }
752 return false;
753 }
754 if (matchp == NULL) // Matched. Don't care where.
755 return true;
756 // SearchDFA set match.end() but didn't know where the
757 // match started. Run the regexp backward from match.end()
758 // to find the longest possible match -- that's where it started.
759 Prog* prog = ReverseProg();
760 if (prog == NULL) {
761 // Fall back to NFA below.
762 skipped_test = true;
763 break;
764 }
765 if (!prog->SearchDFA(match, text, Prog::kAnchored,
766 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
767 if (dfa_failed) {
768 if (options_.log_errors())
769 LOG(ERROR) << "DFA out of memory: "
770 << "pattern length " << pattern_.size() << ", "
771 << "program size " << prog->size() << ", "
772 << "list count " << prog->list_count() << ", "
773 << "bytemap range " << prog->bytemap_range();
774 // Fall back to NFA below.
775 skipped_test = true;
776 break;
777 }
778 if (options_.log_errors())
779 LOG(ERROR) << "SearchDFA inconsistency";
780 return false;
781 }
782 break;
783 }
784
785 case ANCHOR_BOTH:
786 case ANCHOR_START:
787 if (re_anchor == ANCHOR_BOTH)
788 kind = Prog::kFullMatch;
789 anchor = Prog::kAnchored;
790
791 // If only a small amount of text and need submatch
792 // information anyway and we're going to use OnePass or BitState
793 // to get it, we might as well not even bother with the DFA:
794 // OnePass or BitState will be fast enough.
795 // On tiny texts, OnePass outruns even the DFA, and
796 // it doesn't have the shared state and occasional mutex that
797 // the DFA does.
798 if (can_one_pass && text.size() <= 4096 &&
799 (ncap > 1 || text.size() <= 16)) {
800 skipped_test = true;
801 break;
802 }
803 if (can_bit_state && text.size() <= bit_state_text_max_size &&
804 ncap > 1) {
805 skipped_test = true;
806 break;
807 }
808 if (!prog_->SearchDFA(subtext, text, anchor, kind,
809 &match, &dfa_failed, NULL)) {
810 if (dfa_failed) {
811 if (options_.log_errors())
812 LOG(ERROR) << "DFA out of memory: "
813 << "pattern length " << pattern_.size() << ", "
814 << "program size " << prog_->size() << ", "
815 << "list count " << prog_->list_count() << ", "
816 << "bytemap range " << prog_->bytemap_range();
817 // Fall back to NFA below.
818 skipped_test = true;
819 break;
820 }
821 return false;
822 }
823 break;
824 }
825
826 if (!skipped_test && ncap <= 1) {
827 // We know exactly where it matches. That's enough.
828 if (ncap == 1)
829 submatch[0] = match;
830 } else {
831 absl::string_view subtext1;
832 if (skipped_test) {
833 // DFA ran out of memory or was skipped:
834 // need to search in entire original text.
835 subtext1 = subtext;
836 } else {
837 // DFA found the exact match location:
838 // let NFA run an anchored, full match search
839 // to find submatch locations.
840 subtext1 = match;
841 anchor = Prog::kAnchored;
842 kind = Prog::kFullMatch;
843 }
844
845 if (can_one_pass && anchor != Prog::kUnanchored) {
846 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
847 if (!skipped_test && options_.log_errors())
848 LOG(ERROR) << "SearchOnePass inconsistency";
849 return false;
850 }
851 } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) {
852 if (!prog_->SearchBitState(subtext1, text, anchor,
853 kind, submatch, ncap)) {
854 if (!skipped_test && options_.log_errors())
855 LOG(ERROR) << "SearchBitState inconsistency";
856 return false;
857 }
858 } else {
859 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
860 if (!skipped_test && options_.log_errors())
861 LOG(ERROR) << "SearchNFA inconsistency";
862 return false;
863 }
864 }
865 }
866
867 // Adjust overall match for required prefix that we stripped off.
868 if (prefixlen > 0 && nsubmatch > 0)
869 submatch[0] = absl::string_view(submatch[0].data() - prefixlen,
870 submatch[0].size() + prefixlen);
871
872 // Zero submatches that don't exist in the regexp.
873 for (int i = ncap; i < nsubmatch; i++)
874 submatch[i] = absl::string_view();
875 return true;
876}
877
878// Internal matcher - like Match() but takes Args not string_views.
879bool RE2::DoMatch(absl::string_view text,
880 Anchor re_anchor,
881 size_t* consumed,
882 const Arg* const* args,
883 int n) const {
884 if (!ok()) {
885 if (options_.log_errors())
886 LOG(ERROR) << "Invalid RE2: " << *error_;
887 return false;
888 }
889
890 if (NumberOfCapturingGroups() < n) {
891 // RE has fewer capturing groups than number of Arg pointers passed in.
892 return false;
893 }
894
895 // Count number of capture groups needed.
896 int nvec;
897 if (n == 0 && consumed == NULL)
898 nvec = 0;
899 else
900 nvec = n+1;
901
902 absl::FixedArray<absl::string_view, kVecSize> vec_storage(nvec);
903 absl::string_view* vec = vec_storage.data();
904
905 if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
906 return false;
907 }
908
909 if (consumed != NULL)
910 *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text));
911
912 if (n == 0 || args == NULL) {
913 // We are not interested in results
914 return true;
915 }
916
917 // If we got here, we must have matched the whole pattern.
918 for (int i = 0; i < n; i++) {
919 absl::string_view s = vec[i+1];
920 if (!args[i]->Parse(s.data(), s.size())) {
921 // TODO: Should we indicate what the error was?
922 return false;
923 }
924 }
925
926 return true;
927}
928
929// Checks that the rewrite string is well-formed with respect to this
930// regular expression.
931bool RE2::CheckRewriteString(absl::string_view rewrite,
932 std::string* error) const {
933 int max_token = -1;
934 for (const char *s = rewrite.data(), *end = s + rewrite.size();
935 s < end; s++) {
936 int c = *s;
937 if (c != '\\') {
938 continue;
939 }
940 if (++s == end) {
941 *error = "Rewrite schema error: '\\' not allowed at end.";
942 return false;
943 }
944 c = *s;
945 if (c == '\\') {
946 continue;
947 }
948 if (!isdigit(c)) {
949 *error = "Rewrite schema error: "
950 "'\\' must be followed by a digit or '\\'.";
951 return false;
952 }
953 int n = (c - '0');
954 if (max_token < n) {
955 max_token = n;
956 }
957 }
958
959 if (max_token > NumberOfCapturingGroups()) {
960 *error = absl::StrFormat(
961 "Rewrite schema requests %d matches, but the regexp only has %d "
962 "parenthesized subexpressions.",
963 max_token, NumberOfCapturingGroups());
964 return false;
965 }
966 return true;
967}
968
969// Returns the maximum submatch needed for the rewrite to be done by Replace().
970// E.g. if rewrite == "foo \\2,\\1", returns 2.
971int RE2::MaxSubmatch(absl::string_view rewrite) {
972 int max = 0;
973 for (const char *s = rewrite.data(), *end = s + rewrite.size();
974 s < end; s++) {
975 if (*s == '\\') {
976 s++;
977 int c = (s < end) ? *s : -1;
978 if (isdigit(c)) {
979 int n = (c - '0');
980 if (n > max)
981 max = n;
982 }
983 }
984 }
985 return max;
986}
987
988// Append the "rewrite" string, with backslash subsitutions from "vec",
989// to string "out".
990bool RE2::Rewrite(std::string* out,
991 absl::string_view rewrite,
992 const absl::string_view* vec,
993 int veclen) const {
994 for (const char *s = rewrite.data(), *end = s + rewrite.size();
995 s < end; s++) {
996 if (*s != '\\') {
997 out->push_back(*s);
998 continue;
999 }
1000 s++;
1001 int c = (s < end) ? *s : -1;
1002 if (isdigit(c)) {
1003 int n = (c - '0');
1004 if (n >= veclen) {
1005 if (options_.log_errors()) {
1006 LOG(ERROR) << "invalid substitution \\" << n
1007 << " from " << veclen << " groups";
1008 }
1009 return false;
1010 }
1011 absl::string_view snip = vec[n];
1012 if (!snip.empty())
1013 out->append(snip.data(), snip.size());
1014 } else if (c == '\\') {
1015 out->push_back('\\');
1016 } else {
1017 if (options_.log_errors())
1018 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
1019 return false;
1020 }
1021 }
1022 return true;
1023}
1024
1025/***** Parsers for various types *****/
1026
1027namespace re2_internal {
1028
1029template <>
1030bool Parse(const char* str, size_t n, void* dest) {
1031 // We fail if somebody asked us to store into a non-NULL void* pointer
1032 return (dest == NULL);
1033}
1034
1035template <>
1036bool Parse(const char* str, size_t n, std::string* dest) {
1037 if (dest == NULL) return true;
1038 dest->assign(str, n);
1039 return true;
1040}
1041
1042template <>
1043bool Parse(const char* str, size_t n, absl::string_view* dest) {
1044 if (dest == NULL) return true;
1045 *dest = absl::string_view(str, n);
1046 return true;
1047}
1048
1049template <>
1050bool Parse(const char* str, size_t n, char* dest) {
1051 if (n != 1) return false;
1052 if (dest == NULL) return true;
1053 *dest = str[0];
1054 return true;
1055}
1056
1057template <>
1058bool Parse(const char* str, size_t n, signed char* dest) {
1059 if (n != 1) return false;
1060 if (dest == NULL) return true;
1061 *dest = str[0];
1062 return true;
1063}
1064
1065template <>
1066bool Parse(const char* str, size_t n, unsigned char* dest) {
1067 if (n != 1) return false;
1068 if (dest == NULL) return true;
1069 *dest = str[0];
1070 return true;
1071}
1072
1073// Largest number spec that we are willing to parse
1074static const int kMaxNumberLength = 32;
1075
1076// REQUIRES "buf" must have length at least nbuf.
1077// Copies "str" into "buf" and null-terminates.
1078// Overwrites *np with the new length.
1079static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
1080 size_t* np, bool accept_spaces) {
1081 size_t n = *np;
1082 if (n == 0) return "";
1083 if (n > 0 && isspace(*str)) {
1084 // We are less forgiving than the strtoxxx() routines and do not
1085 // allow leading spaces. We do allow leading spaces for floats.
1086 if (!accept_spaces) {
1087 return "";
1088 }
1089 while (n > 0 && isspace(*str)) {
1090 n--;
1091 str++;
1092 }
1093 }
1094
1095 // Although buf has a fixed maximum size, we can still handle
1096 // arbitrarily large integers correctly by omitting leading zeros.
1097 // (Numbers that are still too long will be out of range.)
1098 // Before deciding whether str is too long,
1099 // remove leading zeros with s/000+/00/.
1100 // Leaving the leading two zeros in place means that
1101 // we don't change 0000x123 (invalid) into 0x123 (valid).
1102 // Skip over leading - before replacing.
1103 bool neg = false;
1104 if (n >= 1 && str[0] == '-') {
1105 neg = true;
1106 n--;
1107 str++;
1108 }
1109
1110 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1111 while (n >= 3 && str[2] == '0') {
1112 n--;
1113 str++;
1114 }
1115 }
1116
1117 if (neg) { // make room in buf for -
1118 n++;
1119 str--;
1120 }
1121
1122 if (n > nbuf-1) return "";
1123
1124 memmove(buf, str, n);
1125 if (neg) {
1126 buf[0] = '-';
1127 }
1128 buf[n] = '\0';
1129 *np = n;
1130 return buf;
1131}
1132
1133template <>
1134bool Parse(const char* str, size_t n, float* dest) {
1135 if (n == 0) return false;
1136 static const int kMaxLength = 200;
1137 char buf[kMaxLength+1];
1138 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1139 char* end;
1140 errno = 0;
1141 float r = strtof(str, &end);
1142 if (end != str + n) return false; // Leftover junk
1143 if (errno) return false;
1144 if (dest == NULL) return true;
1145 *dest = r;
1146 return true;
1147}
1148
1149template <>
1150bool Parse(const char* str, size_t n, double* dest) {
1151 if (n == 0) return false;
1152 static const int kMaxLength = 200;
1153 char buf[kMaxLength+1];
1154 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1155 char* end;
1156 errno = 0;
1157 double r = strtod(str, &end);
1158 if (end != str + n) return false; // Leftover junk
1159 if (errno) return false;
1160 if (dest == NULL) return true;
1161 *dest = r;
1162 return true;
1163}
1164
1165template <>
1166bool Parse(const char* str, size_t n, long* dest, int radix) {
1167 if (n == 0) return false;
1168 char buf[kMaxNumberLength+1];
1169 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1170 char* end;
1171 errno = 0;
1172 long r = strtol(str, &end, radix);
1173 if (end != str + n) return false; // Leftover junk
1174 if (errno) return false;
1175 if (dest == NULL) return true;
1176 *dest = r;
1177 return true;
1178}
1179
1180template <>
1181bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
1182 if (n == 0) return false;
1183 char buf[kMaxNumberLength+1];
1184 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1185 if (str[0] == '-') {
1186 // strtoul() will silently accept negative numbers and parse
1187 // them. This module is more strict and treats them as errors.
1188 return false;
1189 }
1190
1191 char* end;
1192 errno = 0;
1193 unsigned long r = strtoul(str, &end, radix);
1194 if (end != str + n) return false; // Leftover junk
1195 if (errno) return false;
1196 if (dest == NULL) return true;
1197 *dest = r;
1198 return true;
1199}
1200
1201template <>
1202bool Parse(const char* str, size_t n, short* dest, int radix) {
1203 long r;
1204 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1205 if ((short)r != r) return false; // Out of range
1206 if (dest == NULL) return true;
1207 *dest = (short)r;
1208 return true;
1209}
1210
1211template <>
1212bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
1213 unsigned long r;
1214 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1215 if ((unsigned short)r != r) return false; // Out of range
1216 if (dest == NULL) return true;
1217 *dest = (unsigned short)r;
1218 return true;
1219}
1220
1221template <>
1222bool Parse(const char* str, size_t n, int* dest, int radix) {
1223 long r;
1224 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1225 if ((int)r != r) return false; // Out of range
1226 if (dest == NULL) return true;
1227 *dest = (int)r;
1228 return true;
1229}
1230
1231template <>
1232bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
1233 unsigned long r;
1234 if (!Parse(str, n, &r, radix)) return false; // Could not parse
1235 if ((unsigned int)r != r) return false; // Out of range
1236 if (dest == NULL) return true;
1237 *dest = (unsigned int)r;
1238 return true;
1239}
1240
1241template <>
1242bool Parse(const char* str, size_t n, long long* dest, int radix) {
1243 if (n == 0) return false;
1244 char buf[kMaxNumberLength+1];
1245 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1246 char* end;
1247 errno = 0;
1248 long long r = strtoll(str, &end, radix);
1249 if (end != str + n) return false; // Leftover junk
1250 if (errno) return false;
1251 if (dest == NULL) return true;
1252 *dest = r;
1253 return true;
1254}
1255
1256template <>
1257bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
1258 if (n == 0) return false;
1259 char buf[kMaxNumberLength+1];
1260 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1261 if (str[0] == '-') {
1262 // strtoull() will silently accept negative numbers and parse
1263 // them. This module is more strict and treats them as errors.
1264 return false;
1265 }
1266 char* end;
1267 errno = 0;
1268 unsigned long long r = strtoull(str, &end, radix);
1269 if (end != str + n) return false; // Leftover junk
1270 if (errno) return false;
1271 if (dest == NULL) return true;
1272 *dest = r;
1273 return true;
1274}
1275
1276} // namespace re2_internal
1277
1278namespace hooks {
1279
1280#ifdef RE2_HAVE_THREAD_LOCAL
1281thread_local const RE2* context = NULL;
1282#endif
1283
1284template <typename T>
1285union Hook {
1286 void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
1287 T* Load() const { return cb_.load(std::memory_order_acquire); }
1288
1289#if !defined(__clang__) && defined(_MSC_VER)
1290 // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
1291 // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
1292 static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
1293 "std::atomic<T*> must be always lock-free");
1294 T* cb_for_constinit_;
1295#endif
1296
1297 std::atomic<T*> cb_;
1298};
1299
1300template <typename T>
1301static void DoNothing(const T&) {}
1302
1303#define DEFINE_HOOK(type, name) \
1304 static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
1305 void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
1306 type##Callback* Get##type##Hook() { return name##_hook.Load(); }
1307
1308DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
1309DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
1310
1311#undef DEFINE_HOOK
1312
1313} // namespace hooks
1314
1315} // namespace re2
1316