re2.cc source code [tensorflow/external/com_googlesource_code_re2/re2/re2.cc]

1	// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression interface RE2.
6	//
7	// Originally the PCRE C++ wrapper, but adapted to use
8	// the new automata-based regular expression engines.
9
10	#include "re2/re2.h"
11
12	#include <assert.h>
13	#include <ctype.h>
14	#include <errno.h>
15	#ifdef _MSC_VER
16	#include <intrin.h>
17	#endif
18	#include <stdint.h>
19	#include <stdlib.h>
20	#include <string.h>
21	#include <algorithm>
22	#include <atomic>
23	#include <iterator>
24	#include <string>
25	#include <utility>
26	#include <vector>
27
28	#include "absl/base/macros.h"
29	#include "absl/container/fixed_array.h"
30	#include "absl/strings/str_format.h"
31	#include "util/logging.h"
32	#include "util/strutil.h"
33	#include "util/utf.h"
34	#include "re2/prog.h"
35	#include "re2/regexp.h"
36	#include "re2/sparse_array.h"
37
38	namespace re2 {
39
40	// Maximum number of args we can set
41	static const int kMaxArgs = `16`;
42	static const int kVecSize = `1`+kMaxArgs;
43
44	const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
45
46	RE2::Options::Options(RE2::CannedOptions opt)
47	: encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
48	posix_syntax_(opt == RE2::POSIX),
49	longest_match_(opt == RE2::POSIX),
50	log_errors_(opt != RE2::Quiet),
51	max_mem_(kDefaultMaxMem),
52	literal_(false),
53	never_nl_(false),
54	dot_nl_(false),
55	never_capture_(false),
56	case_sensitive_(true),
57	perl_classes_(false),
58	word_boundary_(false),
59	one_line_(false) {
60	}
61
62	// static empty objects for use as const references.
63	// To avoid global constructors, allocated in RE2::Init().
64	static const std::string* empty_string;
65	static const std::map<std::string, int>* empty_named_groups;
66	static const std::map<int, std::string>* empty_group_names;
67
68	// Converts from Regexp error code to RE2 error code.
69	// Maybe some day they will diverge. In any event, this
70	// hides the existence of Regexp from RE2 users.
71	static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
72	switch (code) {
73	case re2::kRegexpSuccess:
74	return RE2::NoError;
75	case re2::kRegexpInternalError:
76	return RE2::ErrorInternal;
77	case re2::kRegexpBadEscape:
78	return RE2::ErrorBadEscape;
79	case re2::kRegexpBadCharClass:
80	return RE2::ErrorBadCharClass;
81	case re2::kRegexpBadCharRange:
82	return RE2::ErrorBadCharRange;
83	case re2::kRegexpMissingBracket:
84	return RE2::ErrorMissingBracket;
85	case re2::kRegexpMissingParen:
86	return RE2::ErrorMissingParen;
87	case re2::kRegexpUnexpectedParen:
88	return RE2::ErrorUnexpectedParen;
89	case re2::kRegexpTrailingBackslash:
90	return RE2::ErrorTrailingBackslash;
91	case re2::kRegexpRepeatArgument:
92	return RE2::ErrorRepeatArgument;
93	case re2::kRegexpRepeatSize:
94	return RE2::ErrorRepeatSize;
95	case re2::kRegexpRepeatOp:
96	return RE2::ErrorRepeatOp;
97	case re2::kRegexpBadPerlOp:
98	return RE2::ErrorBadPerlOp;
99	case re2::kRegexpBadUTF8:
100	return RE2::ErrorBadUTF8;
101	case re2::kRegexpBadNamedCapture:
102	return RE2::ErrorBadNamedCapture;
103	}
104	return RE2::ErrorInternal;
105	}
106
107	static std::string trunc(absl::string_view pattern) {
108	if (pattern.size() < `100`)
109	return std::string (pattern);
110	return std::string (pattern.substr(`0`, `100`)) + "...";
111	}
112
113
114	RE2::RE2(const char* pattern) {
115	Init(pattern, DefaultOptions);
116	}
117
118	RE2::RE2(const std::string& pattern) {
119	Init(pattern, DefaultOptions);
120	}
121
122	RE2::RE2(absl::string_view pattern) {
123	Init(pattern, DefaultOptions);
124	}
125
126	RE2::RE2(absl::string_view pattern, const Options& options) {
127	Init(pattern, options);
128	}
129
130	int RE2::Options::ParseFlags() const {
131	int flags = Regexp::ClassNL;
132	switch (encoding()) {
133	default:
134	if (log_errors())
135	LOG(ERROR) << "Unknown encoding " << encoding();
136	break;
137	case RE2::Options::EncodingUTF8:
138	break;
139	case RE2::Options::EncodingLatin1:
140	flags \|= Regexp::Latin1;
141	break;
142	}
143
144	if (!posix_syntax())
145	flags \|= Regexp::LikePerl;
146
147	if (literal())
148	flags \|= Regexp::Literal;
149
150	if (never_nl())
151	flags \|= Regexp::NeverNL;
152
153	if (dot_nl())
154	flags \|= Regexp::DotNL;
155
156	if (never_capture())
157	flags \|= Regexp::NeverCapture;
158
159	if (!case_sensitive())
160	flags \|= Regexp::FoldCase;
161
162	if (perl_classes())
163	flags \|= Regexp::PerlClasses;
164
165	if (word_boundary())
166	flags \|= Regexp::PerlB;
167
168	if (one_line())
169	flags \|= Regexp::OneLine;
170
171	return flags;
172	}
173
174	void RE2::Init(absl::string_view pattern, const Options& options) {
175	static absl::once_flag empty_once;
176	absl::call_once(empty_once, []() {
177	empty_string = new std::string;
178	empty_named_groups = new std::map<std::string, int>;
179	empty_group_names = new std::map<int, std::string>;
180	});
181
182	pattern_.assign(pattern.data(), pattern.size());
183	options_.Copy(options);
184	entire_regexp_ = NULL;
185	error_ = empty_string;
186	error_code_ = NoError;
187	error_arg_.clear();
188	prefix_.clear();
189	prefix_foldcase_ = false;
190	suffix_regexp_ = NULL;
191	prog_ = NULL;
192	num_captures_ = -`1`;
193	is_one_pass_ = false;
194
195	rprog_ = NULL;
196	named_groups_ = NULL;
197	group_names_ = NULL;
198
199	RegexpStatus status;
200	entire_regexp_ = Regexp::Parse(
201	pattern_,
202	static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
203	&status);
204	if (entire_regexp_ == NULL) {
205	if (options_.log_errors()) {
206	LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
207	<< status.Text();
208	}
209	error_ = new std::string(status.Text());
210	error_code_ = RegexpErrorToRE2(status.code());
211	error_arg_ = std::string (status.error_arg());
212	return;
213	}
214
215	re2::Regexp* suffix;
216	if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
217	suffix_regexp_ = suffix;
218	else
219	suffix_regexp_ = entire_regexp_->Incref();
220
221	// Two thirds of the memory goes to the forward Prog,
222	// one third to the reverse prog, because the forward
223	// Prog has two DFAs but the reverse prog has one.
224	prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*`2`/`3`);
225	if (prog_ == NULL) {
226	if (options_.log_errors())
227	LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
228	error_ = new std::string ("pattern too large - compile failed");
229	error_code_ = RE2::ErrorPatternTooLarge;
230	return;
231	}
232
233	// We used to compute this lazily, but it's used during the
234	// typical control flow for a match call, so we now compute
235	// it eagerly, which avoids the overhead of absl::once_flag.
236	num_captures_ = suffix_regexp_->NumCaptures();
237
238	// Could delay this until the first match call that
239	// cares about submatch information, but the one-pass
240	// machine's memory gets cut from the DFA memory budget,
241	// and that is harder to do if the DFA has already
242	// been built.
243	is_one_pass_ = prog_->IsOnePass();
244	}
245
246	// Returns rprog_, computing it if needed.
247	re2::Prog* RE2::ReverseProg() const {
248	absl::call_once(rprog_once_, [](const RE2* re) {
249	re->rprog_ =
250	re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / `3`);
251	if (re->rprog_ == NULL) {
252	if (re->options_.log_errors())
253	LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
254	// We no longer touch error_ and error_code_ because failing to compile
255	// the reverse Prog is not a showstopper: falling back to NFA execution
256	// is fine. More importantly, an RE2 object is supposed to be logically
257	// immutable: whatever ok() would have returned after Init() completed,
258	// it should continue to return that no matter what ReverseProg() does.
259	}
260	}, this);
261	return rprog_;
262	}
263
264	RE2::~RE2() {
265	if (suffix_regexp_)
266	suffix_regexp_->Decref();
267	if (entire_regexp_)
268	entire_regexp_->Decref();
269	delete prog_;
270	delete rprog_;
271	if (error_ != empty_string)
272	delete error_;
273	if (named_groups_ != NULL && named_groups_ != empty_named_groups)
274	delete named_groups_;
275	if (group_names_ != NULL && group_names_ != empty_group_names)
276	delete group_names_;
277	}
278
279	int RE2::ProgramSize() const {
280	if (prog_ == NULL)
281	return -`1`;
282	return prog_->size();
283	}
284
285	int RE2::ReverseProgramSize() const {
286	if (prog_ == NULL)
287	return -`1`;
288	Prog* prog = ReverseProg();
289	if (prog == NULL)
290	return -`1`;
291	return prog->size();
292	}
293
294	// Finds the most significant non-zero bit in n.
295	static int FindMSBSet(uint32_t n) {
296	DCHECK_NE(n, `0`);
297	#if defined(__GNUC__)
298	return `31` ^ __builtin_clz(n);
299	#elif defined(_MSC_VER) && (defined(_M_X64) \|\| defined(_M_IX86))
300	unsigned long c;
301	_BitScanReverse(&c, n);
302	return static_cast<int>(c);
303	#else
304	int c = `0`;
305	for (int shift = `1` << `4`; shift != `0`; shift >>= `1`) {
306	uint32_t word = n >> shift;
307	if (word != `0`) {
308	n = word;
309	c += shift;
310	}
311	}
312	return c;
313	#endif
314	}
315
316	static int Fanout(Prog* prog, std::vector<int>* histogram) {
317	SparseArray<int> fanout(prog->size());
318	prog->Fanout(&fanout);
319	int data[`32`] = {};
320	int size = `0`;
321	for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
322	if (i->value() == `0`)
323	continue;
324	uint32_t value = i->value();
325	int bucket = FindMSBSet(value);
326	bucket += value & (value-`1`) ? `1` : `0`;
327	++data[bucket];
328	size = std::max(size, bucket+`1`);
329	}
330	if (histogram != NULL)
331	histogram->assign(data, data+size);
332	return size-`1`;
333	}
334
335	int RE2::ProgramFanout(std::vector<int>* histogram) const {
336	if (prog_ == NULL)
337	return -`1`;
338	return Fanout(prog_, histogram);
339	}
340
341	int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
342	if (prog_ == NULL)
343	return -`1`;
344	Prog* prog = ReverseProg();
345	if (prog == NULL)
346	return -`1`;
347	return Fanout(prog, histogram);
348	}
349
350	// Returns named_groups_, computing it if needed.
351	const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
352	absl::call_once(named_groups_once_, [](const RE2* re) {
353	if (re->suffix_regexp_ != NULL)
354	re->named_groups_ = re->suffix_regexp_->NamedCaptures();
355	if (re->named_groups_ == NULL)
356	re->named_groups_ = empty_named_groups;
357	}, this);
358	return *named_groups_;
359	}
360
361	// Returns group_names_, computing it if needed.
362	const std::map<int, std::string>& RE2::CapturingGroupNames() const {
363	absl::call_once(group_names_once_, [](const RE2* re) {
364	if (re->suffix_regexp_ != NULL)
365	re->group_names_ = re->suffix_regexp_->CaptureNames();
366	if (re->group_names_ == NULL)
367	re->group_names_ = empty_group_names;
368	}, this);
369	return *group_names_;
370	}
371
372	/** Convenience interfaces **/
373
374	bool RE2::FullMatchN(absl::string_view text, const RE2& re,
375	const Arg* const args[], int n) {
376	return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
377	}
378
379	bool RE2::PartialMatchN(absl::string_view text, const RE2& re,
380	const Arg* const args[], int n) {
381	return re.DoMatch(text, UNANCHORED, NULL, args, n);
382	}
383
384	bool RE2::ConsumeN(absl::string_view* input, const RE2& re,
385	const Arg* const args[], int n) {
386	size_t consumed;
387	if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
388	input->remove_prefix(consumed);
389	return true;
390	} else {
391	return false;
392	}
393	}
394
395	bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re,
396	const Arg* const args[], int n) {
397	size_t consumed;
398	if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
399	input->remove_prefix(consumed);
400	return true;
401	} else {
402	return false;
403	}
404	}
405
406	bool RE2::Replace(std::string* str,
407	const RE2& re,
408	absl::string_view rewrite) {
409	absl::string_view vec[kVecSize];
410	int nvec = `1` + MaxSubmatch(rewrite);
411	if (nvec > `1` + re.NumberOfCapturingGroups())
412	return false;
413	if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
414	return false;
415	if (!re.Match(*str, `0`, str->size(), UNANCHORED, vec, nvec))
416	return false;
417
418	std::string s;
419	if (!re.Rewrite(&s, rewrite, vec, nvec))
420	return false;
421
422	assert(vec[`0`].data() >= str->data());
423	assert(vec[`0`].data() + vec[`0`].size() <= str->data() + str->size());
424	str->replace(vec[`0`].data() - str->data(), vec[`0`].size(), s);
425	return true;
426	}
427
428	int RE2::GlobalReplace(std::string* str,
429	const RE2& re,
430	absl::string_view rewrite) {
431	absl::string_view vec[kVecSize];
432	int nvec = `1` + MaxSubmatch(rewrite);
433	if (nvec > `1` + re.NumberOfCapturingGroups())
434	return false;
435	if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
436	return false;
437
438	const char* p = str->data();
439	const char* ep = p + str->size();
440	const char* lastend = NULL;
441	std::string out;
442	int count = `0`;
443	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
444	// Iterate just once when fuzzing. Otherwise, we easily get bogged down
445	// and coverage is unlikely to improve despite significant expense.
446	while (p == str->data()) {
447	#else
448	while (p <= ep) {
449	#endif
450	if (!re.Match(str, static_cast*<size_t>(p - str->data()),
451	str->size(), UNANCHORED, vec, nvec))
452	break;
453	if (p < vec[`0`].data())
454	out.append(p, vec[`0`].data() - p);
455	if (vec[`0`].data() == lastend && vec[`0`].empty()) {
456	// Disallow empty match at end of last match: skip ahead.
457	//
458	// fullrune() takes int, not ptrdiff_t. However, it just looks
459	// at the leading byte and treats any length >= 4 the same.
460	if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
461	fullrune(p, static_cast<int>(std::min(ptrdiff_t{`4`}, ep - p)))) {
462	// re is in UTF-8 mode and there is enough left of str
463	// to allow us to advance by up to UTFmax bytes.
464	Rune r;
465	int n = chartorune(&r, p);
466	// Some copies of chartorune have a bug that accepts
467	// encodings of values in (10FFFF, 1FFFFF] as valid.
468	if (r > Runemax) {
469	n = `1`;
470	r = Runeerror;
471	}
472	if (!(n == `1` && r == Runeerror)) { // no decoding error
473	out.append(p, n);
474	p += n;
475	continue;
476	}
477	}
478	// Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
479	// we fell through from above and the GIGO principle applies.
480	if (p < ep)
481	out.append(p, `1`);
482	p++;
483	continue;
484	}
485	re.Rewrite(&out, rewrite, vec, nvec);
486	p = vec[`0`].data() + vec[`0`].size();
487	lastend = p;
488	count++;
489	}
490
491	if (count == `0`)
492	return `0`;
493
494	if (p < ep)
495	out.append(p, ep - p);
496	using std::swap;
497	swap(out, *str);
498	return count;
499	}
500
501	bool RE2::Extract(absl::string_view text,
502	const RE2& re,
503	absl::string_view rewrite,
504	std::string* out) {
505	absl::string_view vec[kVecSize];
506	int nvec = `1` + MaxSubmatch(rewrite);
507	if (nvec > `1` + re.NumberOfCapturingGroups())
508	return false;
509	if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec)))
510	return false;
511	if (!re.Match(text, `0`, text.size(), UNANCHORED, vec, nvec))
512	return false;
513
514	out->clear();
515	return re.Rewrite(out, rewrite, vec, nvec);
516	}
517
518	std::string RE2::QuoteMeta(absl::string_view unquoted) {
519	std::string result;
520	result.reserve(unquoted.size() << `1`);
521
522	// Escape any ascii character not in [A-Za-z_0-9].
523	//
524	// Note that it's legal to escape a character even if it has no
525	// special meaning in a regular expression -- so this function does
526	// that. (This also makes it identical to the perl function of the
527	// same name except for the null-character special case;
528	// see `perldoc -f quotemeta`.)
529	for (size_t ii = `0`; ii < unquoted.size(); ++ii) {
530	// Note that using 'isalnum' here raises the benchmark time from
531	// 32ns to 58ns:
532	if ((unquoted [ii] < `'a'` \|\| unquoted [ii] > `'z'`) &&
533	(unquoted [ii] < `'A'` \|\| unquoted [ii] > `'Z'`) &&
534	(unquoted [ii] < `'0'` \|\| unquoted [ii] > `'9'`) &&
535	unquoted [ii] != `'_'` &&
536	// If this is the part of a UTF8 or Latin1 character, we need
537	// to copy this byte without escaping. Experimentally this is
538	// what works correctly with the regexp library.
539	!(unquoted [ii] & `128`)) {
540	if (unquoted [ii] == `'\0'`) { // Special handling for null chars.
541	// Note that this special handling is not strictly required for RE2,
542	// but this quoting is required for other regexp libraries such as
543	// PCRE.
544	// Can't use "\\0" since the next character might be a digit.
545	result += "\\x00";
546	continue;
547	}
548	result += `'\\'`;
549	}
550	result += unquoted [ii];
551	}
552
553	return result;
554	}
555
556	bool RE2::PossibleMatchRange(std::string* min, std::string* max,
557	int maxlen) const {
558	if (prog_ == NULL)
559	return false;
560
561	int n = static_cast<int>(prefix_.size());
562	if (n > maxlen)
563	n = maxlen;
564
565	// Determine initial min max from prefix_ literal.
566	*min = prefix_.substr(`0`, n);
567	*max = prefix_.substr(`0`, n);
568	if (prefix_foldcase_) {
569	// prefix is ASCII lowercase; change min to uppercase.*
570	for (int i = `0`; i < n; i++) {
571	char& c = (*min)[i];
572	if (`'a'` <= c && c <= `'z'`)
573	c += `'A'` - `'a'`;
574	}
575	}
576
577	// Add to prefix min max using PossibleMatchRange on regexp.
578	std::string dmin, dmax;
579	maxlen -= n;
580	if (maxlen > `0` && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
581	min->append(dmin);
582	max->append(dmax);
583	} else if (!max->empty()) {
584	// prog_->PossibleMatchRange has failed us,
585	// but we still have useful information from prefix_.
586	// Round up max to allow any possible suffix.*
587	PrefixSuccessor(max);
588	} else {
589	// Nothing useful.
590	*min = "";
591	*max = "";
592	return false;
593	}
594
595	return true;
596	}
597
598	// Avoid possible locale nonsense in standard strcasecmp.
599	// The string a is known to be all lowercase.
600	static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
601	const char* ae = a + len;
602
603	for (; a < ae; a++, b++) {
604	uint8_t x = *a;
605	uint8_t y = *b;
606	if (`'A'` <= y && y <= `'Z'`)
607	y += `'a'` - `'A'`;
608	if (x != y)
609	return x - y;
610	}
611	return `0`;
612	}
613
614
615	/** Actual matching and rewriting code **/
616
617	bool RE2::Match(absl::string_view text,
618	size_t startpos,
619	size_t endpos,
620	Anchor re_anchor,
621	absl::string_view* submatch,
622	int nsubmatch) const {
623	if (!ok()) {
624	if (options_.log_errors())
625	LOG(ERROR) << "Invalid RE2: " << *error_;
626	return false;
627	}
628
629	if (startpos > endpos \|\| endpos > text.size()) {
630	if (options_.log_errors())
631	LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
632	<< "startpos: " << startpos << ", "
633	<< "endpos: " << endpos << ", "
634	<< "text size: " << text.size() << "]";
635	return false;
636	}
637
638	absl::string_view subtext = text;
639	subtext.remove_prefix(startpos);
640	subtext.remove_suffix(text.size() - endpos);
641
642	// Use DFAs to find exact location of match, filter out non-matches.
643
644	// Don't ask for the location if we won't use it.
645	// SearchDFA can do extra optimizations in that case.
646	absl::string_view match;
647	absl::string_view* matchp = &match;
648	if (nsubmatch == `0`)
649	matchp = NULL;
650
651	int ncap = `1` + NumberOfCapturingGroups();
652	if (ncap > nsubmatch)
653	ncap = nsubmatch;
654
655	// If the regexp is anchored explicitly, must not be in middle of text.
656	if (prog_->anchor_start() && startpos != `0`)
657	return false;
658	if (prog_->anchor_end() && endpos != text.size())
659	return false;
660
661	// If the regexp is anchored explicitly, update re_anchor
662	// so that we can potentially fall into a faster case below.
663	if (prog_->anchor_start() && prog_->anchor_end())
664	re_anchor = ANCHOR_BOTH;
665	else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
666	re_anchor = ANCHOR_START;
667
668	// Check for the required prefix, if any.
669	size_t prefixlen = `0`;
670	if (!prefix_.empty()) {
671	if (startpos != `0`)
672	return false;
673	prefixlen = prefix_.size();
674	if (prefixlen > subtext.size())
675	return false;
676	if (prefix_foldcase_) {
677	if (ascii_strcasecmp(&prefix_[`0`], subtext.data(), prefixlen) != `0`)
678	return false;
679	} else {
680	if (memcmp(&prefix_[`0`], subtext.data(), prefixlen) != `0`)
681	return false;
682	}
683	subtext.remove_prefix(prefixlen);
684	// If there is a required prefix, the anchor must be at least ANCHOR_START.
685	if (re_anchor != ANCHOR_BOTH)
686	re_anchor = ANCHOR_START;
687	}
688
689	Prog::Anchor anchor = Prog::kUnanchored;
690	Prog::MatchKind kind = Prog::kFirstMatch;
691	if (options_.longest_match())
692	kind = Prog::kLongestMatch;
693
694	bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture;
695	bool can_bit_state = prog_->CanBitState();
696	size_t bit_state_text_max_size = prog_->bit_state_text_max_size();
697
698	#ifdef RE2_HAVE_THREAD_LOCAL
699	hooks::context = this;
700	#endif
701	bool dfa_failed = false;
702	bool skipped_test = false;
703	switch (re_anchor) {
704	default:
705	LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
706	return false;
707
708	case UNANCHORED: {
709	if (prog_->anchor_end()) {
710	// This is a very special case: we don't need the forward DFA because
711	// we already know where the match must end! Instead, the reverse DFA
712	// can say whether there is a match and (optionally) where it starts.
713	Prog* prog = ReverseProg();
714	if (prog == NULL) {
715	// Fall back to NFA below.
716	skipped_test = true;
717	break;
718	}
719	if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
720	Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
721	if (dfa_failed) {
722	if (options_.log_errors())
723	LOG(ERROR) << "DFA out of memory: "
724	<< "pattern length " << pattern_.size() << ", "
725	<< "program size " << prog->size() << ", "
726	<< "list count " << prog->list_count() << ", "
727	<< "bytemap range " << prog->bytemap_range();
728	// Fall back to NFA below.
729	skipped_test = true;
730	break;
731	}
732	return false;
733	}
734	if (matchp == NULL) // Matched. Don't care where.
735	return true;
736	break;
737	}
738
739	if (!prog_->SearchDFA(subtext, text, anchor, kind,
740	matchp, &dfa_failed, NULL)) {
741	if (dfa_failed) {
742	if (options_.log_errors())
743	LOG(ERROR) << "DFA out of memory: "
744	<< "pattern length " << pattern_.size() << ", "
745	<< "program size " << prog_->size() << ", "
746	<< "list count " << prog_->list_count() << ", "
747	<< "bytemap range " << prog_->bytemap_range();
748	// Fall back to NFA below.
749	skipped_test = true;
750	break;
751	}
752	return false;
753	}
754	if (matchp == NULL) // Matched. Don't care where.
755	return true;
756	// SearchDFA set match.end() but didn't know where the
757	// match started. Run the regexp backward from match.end()
758	// to find the longest possible match -- that's where it started.
759	Prog* prog = ReverseProg();
760	if (prog == NULL) {
761	// Fall back to NFA below.
762	skipped_test = true;
763	break;
764	}
765	if (!prog->SearchDFA(match, text, Prog::kAnchored,
766	Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
767	if (dfa_failed) {
768	if (options_.log_errors())
769	LOG(ERROR) << "DFA out of memory: "
770	<< "pattern length " << pattern_.size() << ", "
771	<< "program size " << prog->size() << ", "
772	<< "list count " << prog->list_count() << ", "
773	<< "bytemap range " << prog->bytemap_range();
774	// Fall back to NFA below.
775	skipped_test = true;
776	break;
777	}
778	if (options_.log_errors())
779	LOG(ERROR) << "SearchDFA inconsistency";
780	return false;
781	}
782	break;
783	}
784
785	case ANCHOR_BOTH:
786	case ANCHOR_START:
787	if (re_anchor == ANCHOR_BOTH)
788	kind = Prog::kFullMatch;
789	anchor = Prog::kAnchored;
790
791	// If only a small amount of text and need submatch
792	// information anyway and we're going to use OnePass or BitState
793	// to get it, we might as well not even bother with the DFA:
794	// OnePass or BitState will be fast enough.
795	// On tiny texts, OnePass outruns even the DFA, and
796	// it doesn't have the shared state and occasional mutex that
797	// the DFA does.
798	if (can_one_pass && text.size() <= `4096` &&
799	(ncap > `1` \|\| text.size() <= `16`)) {
800	skipped_test = true;
801	break;
802	}
803	if (can_bit_state && text.size() <= bit_state_text_max_size &&
804	ncap > `1`) {
805	skipped_test = true;
806	break;
807	}
808	if (!prog_->SearchDFA(subtext, text, anchor, kind,
809	&match, &dfa_failed, NULL)) {
810	if (dfa_failed) {
811	if (options_.log_errors())
812	LOG(ERROR) << "DFA out of memory: "
813	<< "pattern length " << pattern_.size() << ", "
814	<< "program size " << prog_->size() << ", "
815	<< "list count " << prog_->list_count() << ", "
816	<< "bytemap range " << prog_->bytemap_range();
817	// Fall back to NFA below.
818	skipped_test = true;
819	break;
820	}
821	return false;
822	}
823	break;
824	}
825
826	if (!skipped_test && ncap <= `1`) {
827	// We know exactly where it matches. That's enough.
828	if (ncap == `1`)
829	submatch[`0`] = match;
830	} else {
831	absl::string_view subtext1;
832	if (skipped_test) {
833	// DFA ran out of memory or was skipped:
834	// need to search in entire original text.
835	subtext1 = subtext;
836	} else {
837	// DFA found the exact match location:
838	// let NFA run an anchored, full match search
839	// to find submatch locations.
840	subtext1 = match;
841	anchor = Prog::kAnchored;
842	kind = Prog::kFullMatch;
843	}
844
845	if (can_one_pass && anchor != Prog::kUnanchored) {
846	if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
847	if (!skipped_test && options_.log_errors())
848	LOG(ERROR) << "SearchOnePass inconsistency";
849	return false;
850	}
851	} else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) {
852	if (!prog_->SearchBitState(subtext1, text, anchor,
853	kind, submatch, ncap)) {
854	if (!skipped_test && options_.log_errors())
855	LOG(ERROR) << "SearchBitState inconsistency";
856	return false;
857	}
858	} else {
859	if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
860	if (!skipped_test && options_.log_errors())
861	LOG(ERROR) << "SearchNFA inconsistency";
862	return false;
863	}
864	}
865	}
866
867	// Adjust overall match for required prefix that we stripped off.
868	if (prefixlen > `0` && nsubmatch > `0`)
869	submatch[`0`] = absl::string_view (submatch[`0`].data() - prefixlen,
870	submatch[`0`].size() + prefixlen);
871
872	// Zero submatches that don't exist in the regexp.
873	for (int i = ncap; i < nsubmatch; i++)
874	submatch[i] = absl::string_view ();
875	return true;
876	}
877
878	// Internal matcher - like Match() but takes Args not string_views.
879	bool RE2::DoMatch(absl::string_view text,
880	Anchor re_anchor,
881	size_t* consumed,
882	const Arg* const* args,
883	int n) const {
884	if (!ok()) {
885	if (options_.log_errors())
886	LOG(ERROR) << "Invalid RE2: " << *error_;
887	return false;
888	}
889
890	if (NumberOfCapturingGroups() < n) {
891	// RE has fewer capturing groups than number of Arg pointers passed in.
892	return false;
893	}
894
895	// Count number of capture groups needed.
896	int nvec;
897	if (n == `0` && consumed == NULL)
898	nvec = `0`;
899	else
900	nvec = n+`1`;
901
902	absl::FixedArray<absl::string_view, kVecSize> vec_storage(nvec);
903	absl::string_view* vec = vec_storage.data();
904
905	if (!Match(text, `0`, text.size(), re_anchor, vec, nvec)) {
906	return false;
907	}
908
909	if (consumed != NULL)
910	consumed = static_cast*<size_t>(EndPtr(vec[`0`]) - BeginPtr(text));
911
912	if (n == `0` \|\| args == NULL) {
913	// We are not interested in results
914	return true;
915	}
916
917	// If we got here, we must have matched the whole pattern.
918	for (int i = `0`; i < n; i++) {
919	absl::string_view s = vec[i+`1`];
920	if (!args[i]->Parse(s.data(), s.size())) {
921	// TODO: Should we indicate what the error was?
922	return false;
923	}
924	}
925
926	return true;
927	}
928
929	// Checks that the rewrite string is well-formed with respect to this
930	// regular expression.
931	bool RE2::CheckRewriteString(absl::string_view rewrite,
932	std::string* error) const {
933	int max_token = -`1`;
934	for (const char s = rewrite.data(), end = s + rewrite.size();
935	s < end; s++) {
936	int c = *s;
937	if (c != `'\\'`) {
938	continue;
939	}
940	if (++s == end) {
941	*error = "Rewrite schema error: '\\' not allowed at end.";
942	return false;
943	}
944	c = *s;
945	if (c == `'\\'`) {
946	continue;
947	}
948	if (!isdigit(c)) {
949	*error = "Rewrite schema error: "
950	"'\\' must be followed by a digit or '\\'.";
951	return false;
952	}
953	int n = (c - `'0'`);
954	if (max_token < n) {
955	max_token = n;
956	}
957	}
958
959	if (max_token > NumberOfCapturingGroups()) {
960	*error = absl::StrFormat(
961	"Rewrite schema requests %d matches, but the regexp only has %d "
962	"parenthesized subexpressions.",
963	max_token, NumberOfCapturingGroups());
964	return false;
965	}
966	return true;
967	}
968
969	// Returns the maximum submatch needed for the rewrite to be done by Replace().
970	// E.g. if rewrite == "foo \\2,\\1", returns 2.
971	int RE2::MaxSubmatch(absl::string_view rewrite) {
972	int max = `0`;
973	for (const char s = rewrite.data(), end = s + rewrite.size();
974	s < end; s++) {
975	if (*s == `'\\'`) {
976	s++;
977	int c = (s < end) ? *s : -`1`;
978	if (isdigit(c)) {
979	int n = (c - `'0'`);
980	if (n > max)
981	max = n;
982	}
983	}
984	}
985	return max;
986	}
987
988	// Append the "rewrite" string, with backslash subsitutions from "vec",
989	// to string "out".
990	bool RE2::Rewrite(std::string* out,
991	absl::string_view rewrite,
992	const absl::string_view* vec,
993	int veclen) const {
994	for (const char s = rewrite.data(), end = s + rewrite.size();
995	s < end; s++) {
996	if (*s != `'\\'`) {
997	out->push_back(*s);
998	continue;
999	}
1000	s++;
1001	int c = (s < end) ? *s : -`1`;
1002	if (isdigit(c)) {
1003	int n = (c - `'0'`);
1004	if (n >= veclen) {
1005	if (options_.log_errors()) {
1006	LOG(ERROR) << "invalid substitution \\" << n
1007	<< " from " << veclen << " groups";
1008	}
1009	return false;
1010	}
1011	absl::string_view snip = vec[n];
1012	if (!snip.empty())
1013	out->append(snip.data(), snip.size());
1014	} else if (c == `'\\'`) {
1015	out->push_back(`'\\'`);
1016	} else {
1017	if (options_.log_errors())
1018	LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
1019	return false;
1020	}
1021	}
1022	return true;
1023	}
1024
1025	/** Parsers for various types **/
1026
1027	namespace re2_internal {
1028
1029	template <>
1030	bool Parse(const char* str, size_t n, void* dest) {
1031	// We fail if somebody asked us to store into a non-NULL void pointer*
1032	return (dest == NULL);
1033	}
1034
1035	template <>
1036	bool Parse(const char* str, size_t n, std::string* dest) {
1037	if (dest == NULL) return true;
1038	dest->assign(str, n);
1039	return true;
1040	}
1041
1042	template <>
1043	bool Parse(const char* str, size_t n, absl::string_view* dest) {
1044	if (dest == NULL) return true;
1045	*dest = absl::string_view (str, n);
1046	return true;
1047	}
1048
1049	template <>
1050	bool Parse(const char* str, size_t n, char* dest) {
1051	if (n != `1`) return false;
1052	if (dest == NULL) return true;
1053	*dest = str[`0`];
1054	return true;
1055	}
1056
1057	template <>
1058	bool Parse(const char* str, size_t n, signed char* dest) {
1059	if (n != `1`) return false;
1060	if (dest == NULL) return true;
1061	*dest = str[`0`];
1062	return true;
1063	}
1064
1065	template <>
1066	bool Parse(const char* str, size_t n, unsigned char* dest) {
1067	if (n != `1`) return false;
1068	if (dest == NULL) return true;
1069	*dest = str[`0`];
1070	return true;
1071	}
1072
1073	// Largest number spec that we are willing to parse
1074	static const int kMaxNumberLength = `32`;
1075
1076	// REQUIRES "buf" must have length at least nbuf.
1077	// Copies "str" into "buf" and null-terminates.
1078	// Overwrites np with the new length.*
1079	static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
1080	size_t* np, bool accept_spaces) {
1081	size_t n = *np;
1082	if (n == `0`) return "";
1083	if (n > `0` && isspace(*str)) {
1084	// We are less forgiving than the strtoxxx() routines and do not
1085	// allow leading spaces. We do allow leading spaces for floats.
1086	if (!accept_spaces) {
1087	return "";
1088	}
1089	while (n > `0` && isspace(*str)) {
1090	n--;
1091	str++;
1092	}
1093	}
1094
1095	// Although buf has a fixed maximum size, we can still handle
1096	// arbitrarily large integers correctly by omitting leading zeros.
1097	// (Numbers that are still too long will be out of range.)
1098	// Before deciding whether str is too long,
1099	// remove leading zeros with s/000+/00/.
1100	// Leaving the leading two zeros in place means that
1101	// we don't change 0000x123 (invalid) into 0x123 (valid).
1102	// Skip over leading - before replacing.
1103	bool neg = false;
1104	if (n >= `1` && str[`0`] == `'-'`) {
1105	neg = true;
1106	n--;
1107	str++;
1108	}
1109
1110	if (n >= `3` && str[`0`] == `'0'` && str[`1`] == `'0'`) {
1111	while (n >= `3` && str[`2`] == `'0'`) {
1112	n--;
1113	str++;
1114	}
1115	}
1116
1117	if (neg) { // make room in buf for -
1118	n++;
1119	str--;
1120	}
1121
1122	if (n > nbuf-`1`) return "";
1123
1124	memmove(buf, str, n);
1125	if (neg) {
1126	buf[`0`] = `'-'`;
1127	}
1128	buf[n] = `'\0'`;
1129	*np = n;
1130	return buf;
1131	}
1132
1133	template <>
1134	bool Parse(const char* str, size_t n, float* dest) {
1135	if (n == `0`) return false;
1136	static const int kMaxLength = `200`;
1137	char buf[kMaxLength+`1`];
1138	str = TerminateNumber(buf, sizeof buf, str, &n, true);
1139	char* end;
1140	errno = `0`;
1141	float r = strtof(str, &end);
1142	if (end != str + n) return false; // Leftover junk
1143	if (errno) return false;
1144	if (dest == NULL) return true;
1145	*dest = r;
1146	return true;
1147	}
1148
1149	template <>
1150	bool Parse(const char* str, size_t n, double* dest) {
1151	if (n == `0`) return false;
1152	static const int kMaxLength = `200`;
1153	char buf[kMaxLength+`1`];
1154	str = TerminateNumber(buf, sizeof buf, str, &n, true);
1155	char* end;
1156	errno = `0`;
1157	double r = strtod(str, &end);
1158	if (end != str + n) return false; // Leftover junk
1159	if (errno) return false;
1160	if (dest == NULL) return true;
1161	*dest = r;
1162	return true;
1163	}
1164
1165	template <>
1166	bool Parse(const char* str, size_t n, long* dest, int radix) {
1167	if (n == `0`) return false;
1168	char buf[kMaxNumberLength+`1`];
1169	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1170	char* end;
1171	errno = `0`;
1172	long r = strtol(str, &end, radix);
1173	if (end != str + n) return false; // Leftover junk
1174	if (errno) return false;
1175	if (dest == NULL) return true;
1176	*dest = r;
1177	return true;
1178	}
1179
1180	template <>
1181	bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
1182	if (n == `0`) return false;
1183	char buf[kMaxNumberLength+`1`];
1184	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1185	if (str[`0`] == `'-'`) {
1186	// strtoul() will silently accept negative numbers and parse
1187	// them. This module is more strict and treats them as errors.
1188	return false;
1189	}
1190
1191	char* end;
1192	errno = `0`;
1193	unsigned long r = strtoul(str, &end, radix);
1194	if (end != str + n) return false; // Leftover junk
1195	if (errno) return false;
1196	if (dest == NULL) return true;
1197	*dest = r;
1198	return true;
1199	}
1200
1201	template <>
1202	bool Parse(const char* str, size_t n, short* dest, int radix) {
1203	long r;
1204	if (!Parse(str, n, &r, radix)) return false; // Could not parse
1205	if ((short)r != r) return false; // Out of range
1206	if (dest == NULL) return true;
1207	dest = (short*)r;
1208	return true;
1209	}
1210
1211	template <>
1212	bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
1213	unsigned long r;
1214	if (!Parse(str, n, &r, radix)) return false; // Could not parse
1215	if ((unsigned short)r != r) return false; // Out of range
1216	if (dest == NULL) return true;
1217	dest = (unsigned* short)r;
1218	return true;
1219	}
1220
1221	template <>
1222	bool Parse(const char* str, size_t n, int* dest, int radix) {
1223	long r;
1224	if (!Parse(str, n, &r, radix)) return false; // Could not parse
1225	if ((int)r != r) return false; // Out of range
1226	if (dest == NULL) return true;
1227	dest = (int*)r;
1228	return true;
1229	}
1230
1231	template <>
1232	bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
1233	unsigned long r;
1234	if (!Parse(str, n, &r, radix)) return false; // Could not parse
1235	if ((unsigned int)r != r) return false; // Out of range
1236	if (dest == NULL) return true;
1237	dest = (unsigned* int)r;
1238	return true;
1239	}
1240
1241	template <>
1242	bool Parse(const char* str, size_t n, long long* dest, int radix) {
1243	if (n == `0`) return false;
1244	char buf[kMaxNumberLength+`1`];
1245	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1246	char* end;
1247	errno = `0`;
1248	long long r = strtoll(str, &end, radix);
1249	if (end != str + n) return false; // Leftover junk
1250	if (errno) return false;
1251	if (dest == NULL) return true;
1252	*dest = r;
1253	return true;
1254	}
1255
1256	template <>
1257	bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
1258	if (n == `0`) return false;
1259	char buf[kMaxNumberLength+`1`];
1260	str = TerminateNumber(buf, sizeof buf, str, &n, false);
1261	if (str[`0`] == `'-'`) {
1262	// strtoull() will silently accept negative numbers and parse
1263	// them. This module is more strict and treats them as errors.
1264	return false;
1265	}
1266	char* end;
1267	errno = `0`;
1268	unsigned long long r = strtoull(str, &end, radix);
1269	if (end != str + n) return false; // Leftover junk
1270	if (errno) return false;
1271	if (dest == NULL) return true;
1272	*dest = r;
1273	return true;
1274	}
1275
1276	} // namespace re2_internal
1277
1278	namespace hooks {
1279
1280	#ifdef RE2_HAVE_THREAD_LOCAL
1281	thread_local const RE2* context = NULL;
1282	#endif
1283
1284	template <typename T>
1285	union Hook {
1286	void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
1287	T* Load() const { return cb_.load(std::memory_order_acquire); }
1288
1289	#if !defined(__clang__) && defined(_MSC_VER)
1290	// Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
1291	// this is a gross hack to make std::atomic<T> constant-initialized on MSVC.*
1292	static_assert(ATOMIC_POINTER_LOCK_FREE == `2`,
1293	"std::atomic<T*> must be always lock-free");
1294	T* cb_for_constinit_;
1295	#endif
1296
1297	std::atomic<T*> cb_;
1298	};
1299
1300	template <typename T>
1301	static void DoNothing(const T&) {}
1302
1303	#define DEFINE_HOOK(type, name) \
1304	static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
1305	void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
1306	type##Callback* Get##type##Hook() { return name##_hook.Load(); }
1307
1308	DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
1309	DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
1310
1311	#undef DEFINE_HOOK
1312
1313	} // namespace hooks
1314
1315	} // namespace re2
1316

Browse the source code of tensorflow/external/com_googlesource_code_re2/re2/re2.cc