regexp.cc source code [tensorflow/external/com_googlesource_code_re2/re2/regexp.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression representation.
6	// Tested by parse_test.cc
7
8	#include "re2/regexp.h"
9
10	#include <stddef.h>
11	#include <stdint.h>
12	#include <string.h>
13	#include <algorithm>
14	#include <map>
15	#include <string>
16	#include <vector>
17
18	#include "absl/base/call_once.h"
19	#include "absl/base/macros.h"
20	#include "absl/synchronization/mutex.h"
21	#include "util/logging.h"
22	#include "util/utf.h"
23	#include "re2/pod_array.h"
24	#include "re2/walker-inl.h"
25
26	namespace re2 {
27
28	// Constructor. Allocates vectors as appropriate for operator.
29	Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
30	: op_(static_cast<uint8_t>(op)),
31	simple_(false),
32	parse_flags_(static_cast<uint16_t>(parse_flags)),
33	ref_(`1`),
34	nsub_(`0`),
35	down_(NULL) {
36	subone_ = NULL;
37	memset(the_union_, `0`, sizeof the_union_);
38	}
39
40	// Destructor. Assumes already cleaned up children.
41	// Private: use Decref() instead of delete to destroy Regexps.
42	// Can't call Decref on the sub-Regexps here because
43	// that could cause arbitrarily deep recursion, so
44	// required Decref() to have handled them for us.
45	Regexp::~Regexp() {
46	if (nsub_ > `0`)
47	LOG(DFATAL) << "Regexp not destroyed.";
48
49	switch (op_) {
50	default:
51	break;
52	case kRegexpCapture:
53	delete name_;
54	break;
55	case kRegexpLiteralString:
56	delete[] runes_;
57	break;
58	case kRegexpCharClass:
59	if (cc_)
60	cc_->Delete();
61	delete ccb_;
62	break;
63	}
64	}
65
66	// If it's possible to destroy this regexp without recurring,
67	// do so and return true. Else return false.
68	bool Regexp::QuickDestroy() {
69	if (nsub_ == `0`) {
70	delete this;
71	return true;
72	}
73	return false;
74	}
75
76	// Lazily allocated.
77	static absl::Mutex* ref_mutex;
78	static std::map<Regexp, int> ref_map;
79
80	int Regexp::Ref() {
81	if (ref_ < kMaxRef)
82	return ref_;
83
84	absl::MutexLock l(ref_mutex);
85	return (ref_map)[this*];
86	}
87
88	// Increments reference count, returns object as convenience.
89	Regexp* Regexp::Incref() {
90	if (ref_ >= kMaxRef-`1`) {
91	static absl::once_flag ref_once;
92	absl::call_once(ref_once, []() {
93	ref_mutex = new absl::Mutex;
94	ref_map = new std::map<Regexp, int*>;
95	});
96
97	// Store ref count in overflow map.
98	absl::MutexLock l(ref_mutex);
99	if (ref_ == kMaxRef) {
100	// already overflowed
101	(ref_map)[this*]++;
102	} else {
103	// overflowing now
104	(ref_map)[this*] = kMaxRef;
105	ref_ = kMaxRef;
106	}
107	return this;
108	}
109
110	ref_++;
111	return this;
112	}
113
114	// Decrements reference count and deletes this object if count reaches 0.
115	void Regexp::Decref() {
116	if (ref_ == kMaxRef) {
117	// Ref count is stored in overflow map.
118	absl::MutexLock l(ref_mutex);
119	int r = (ref_map)[this*] - `1`;
120	if (r < kMaxRef) {
121	ref_ = static_cast<uint16_t>(r);
122	ref_map->erase(this);
123	} else {
124	(ref_map)[this*] = r;
125	}
126	return;
127	}
128	ref_--;
129	if (ref_ == `0`)
130	Destroy();
131	}
132
133	// Deletes this object; ref count has count reached 0.
134	void Regexp::Destroy() {
135	if (QuickDestroy())
136	return;
137
138	// Handle recursive Destroy with explicit stack
139	// to avoid arbitrarily deep recursion on process stack [sigh].
140	down_ = NULL;
141	Regexp* stack = this;
142	while (stack != NULL) {
143	Regexp* re = stack;
144	stack = re->down_;
145	if (re->ref_ != `0`)
146	LOG(DFATAL) << "Bad reference count " << re->ref_;
147	if (re->nsub_ > `0`) {
148	Regexp** subs = re->sub();
149	for (int i = `0`; i < re->nsub_; i++) {
150	Regexp* sub = subs[i];
151	if (sub == NULL)
152	continue;
153	if (sub->ref_ == kMaxRef)
154	sub->Decref();
155	else
156	--sub->ref_;
157	if (sub->ref_ == `0` && !sub->QuickDestroy()) {
158	sub->down_ = stack;
159	stack = sub;
160	}
161	}
162	if (re->nsub_ > `1`)
163	delete[] subs;
164	re->nsub_ = `0`;
165	}
166	delete re;
167	}
168	}
169
170	void Regexp::AddRuneToString(Rune r) {
171	DCHECK(op_ == kRegexpLiteralString);
172	if (nrunes_ == `0`) {
173	// start with 8
174	runes_ = new Rune[`8`];
175	} else if (nrunes_ >= `8` && (nrunes_ & (nrunes_ - `1`)) == `0`) {
176	// double on powers of two
177	Rune *old = runes_;
178	runes_ = new Rune[nrunes_ * `2`];
179	for (int i = `0`; i < nrunes_; i++)
180	runes_[i] = old[i];
181	delete[] old;
182	}
183
184	runes_[nrunes_++] = r;
185	}
186
187	Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188	Regexp* re = new Regexp (kRegexpHaveMatch, flags);
189	re->match_id_ = match_id;
190	return re;
191	}
192
193	Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194	// Squash , ++ and ??.
195	if (op == sub->op() && flags == sub->parse_flags())
196	return sub;
197
198	// Squash +, ?, +, +?, ?* and ?+. They all squash to , so because
199	// op is Star/Plus/Quest, we just have to check that sub->op() is too.
200	if ((sub->op() == kRegexpStar \|\|
201	sub->op() == kRegexpPlus \|\|
202	sub->op() == kRegexpQuest) &&
203	flags == sub->parse_flags()) {
204	// If sub is Star, no need to rewrite it.
205	if (sub->op() == kRegexpStar)
206	return sub;
207
208	// Rewrite sub to Star.
209	Regexp* re = new Regexp (kRegexpStar, flags);
210	re->AllocSub(`1`);
211	re->sub()[`0`] = sub->sub()[`0`]->Incref();
212	sub->Decref(); // We didn't consume the reference after all.
213	return re;
214	}
215
216	Regexp* re = new Regexp (op, flags);
217	re->AllocSub(`1`);
218	re->sub()[`0`] = sub;
219	return re;
220	}
221
222	Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223	return StarPlusOrQuest(kRegexpPlus, sub, flags);
224	}
225
226	Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
227	return StarPlusOrQuest(kRegexpStar, sub, flags);
228	}
229
230	Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
231	return StarPlusOrQuest(kRegexpQuest, sub, flags);
232	}
233
234	Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235	ParseFlags flags, bool can_factor) {
236	if (nsub == `1`)
237	return sub[`0`];
238
239	if (nsub == `0`) {
240	if (op == kRegexpAlternate)
241	return new Regexp (kRegexpNoMatch, flags);
242	else
243	return new Regexp (kRegexpEmptyMatch, flags);
244	}
245
246	PODArray<Regexp*> subcopy;
247	if (op == kRegexpAlternate && can_factor) {
248	// Going to edit sub; make a copy so we don't step on caller.
249	subcopy = PODArray<Regexp*>(nsub);
250	memmove(subcopy.data(), sub, nsub * sizeof sub[`0`]);
251	sub = subcopy.data();
252	nsub = FactorAlternation(sub, nsub, flags);
253	if (nsub == `1`) {
254	Regexp* re = sub[`0`];
255	return re;
256	}
257	}
258
259	if (nsub > kMaxNsub) {
260	// Too many subexpressions to fit in a single Regexp.
261	// Make a two-level tree. Two levels gets us to 65535^2.
262	int nbigsub = (nsub+kMaxNsub-`1`)/kMaxNsub;
263	Regexp* re = new Regexp (op, flags);
264	re->AllocSub(nbigsub);
265	Regexp** subs = re->sub();
266	for (int i = `0`; i < nbigsub - `1`; i++)
267	subs[i] = ConcatOrAlternate(op, sub+ikMaxNsub, kMaxNsub, flags, false*);
268	subs[nbigsub - `1`] = ConcatOrAlternate(op, sub+(nbigsub-`1`)*kMaxNsub,
269	nsub - (nbigsub-`1`)*kMaxNsub, flags,
270	false);
271	return re;
272	}
273
274	Regexp* re = new Regexp (op, flags);
275	re->AllocSub(nsub);
276	Regexp** subs = re->sub();
277	for (int i = `0`; i < nsub; i++)
278	subs[i] = sub[i];
279	return re;
280	}
281
282	Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
283	return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
284	}
285
286	Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
287	return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
288	}
289
290	Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
291	return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
292	}
293
294	Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
295	Regexp* re = new Regexp (kRegexpCapture, flags);
296	re->AllocSub(`1`);
297	re->sub()[`0`] = sub;
298	re->cap_ = cap;
299	return re;
300	}
301
302	Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
303	Regexp* re = new Regexp (kRegexpRepeat, flags);
304	re->AllocSub(`1`);
305	re->sub()[`0`] = sub;
306	re->min_ = min;
307	re->max_ = max;
308	return re;
309	}
310
311	Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
312	Regexp* re = new Regexp (kRegexpLiteral, flags);
313	re->rune_ = rune;
314	return re;
315	}
316
317	Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
318	if (nrunes <= `0`)
319	return new Regexp (kRegexpEmptyMatch, flags);
320	if (nrunes == `1`)
321	return NewLiteral(runes[`0`], flags);
322	Regexp* re = new Regexp (kRegexpLiteralString, flags);
323	for (int i = `0`; i < nrunes; i++)
324	re->AddRuneToString(runes[i]);
325	return re;
326	}
327
328	Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
329	Regexp* re = new Regexp (kRegexpCharClass, flags);
330	re->cc_ = cc;
331	return re;
332	}
333
334	void Regexp::Swap(Regexp* that) {
335	// Regexp is not trivially copyable, so we cannot freely copy it with
336	// memmove(3), but swapping objects like so is safe for our purposes.
337	char tmp[sizeof *this];
338	void* vthis = reinterpret_cast<void>(this*);
339	void* vthat = reinterpret_cast<void*>(that);
340	memmove(tmp, vthis, sizeof *this);
341	memmove(vthis, vthat, sizeof *this);
342	memmove(vthat, tmp, sizeof *this);
343	}
344
345	// Tests equality of all top-level structure but not subregexps.
346	static bool TopEqual(Regexp* a, Regexp* b) {
347	if (a->op() != b->op())
348	return false;
349
350	switch (a->op()) {
351	case kRegexpNoMatch:
352	case kRegexpEmptyMatch:
353	case kRegexpAnyChar:
354	case kRegexpAnyByte:
355	case kRegexpBeginLine:
356	case kRegexpEndLine:
357	case kRegexpWordBoundary:
358	case kRegexpNoWordBoundary:
359	case kRegexpBeginText:
360	return true;
361
362	case kRegexpEndText:
363	// The parse flags remember whether it's \z or (?-m:$),
364	// which matters when testing against PCRE.
365	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == `0`;
366
367	case kRegexpLiteral:
368	return a->rune() == b->rune() &&
369	((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == `0`;
370
371	case kRegexpLiteralString:
372	return a->nrunes() == b->nrunes() &&
373	((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == `0` &&
374	memcmp(a->runes(), b->runes(),
375	a->nrunes() * sizeof a->runes()[`0`]) == `0`;
376
377	case kRegexpAlternate:
378	case kRegexpConcat:
379	return a->nsub() == b->nsub();
380
381	case kRegexpStar:
382	case kRegexpPlus:
383	case kRegexpQuest:
384	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == `0`;
385
386	case kRegexpRepeat:
387	return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == `0` &&
388	a->min() == b->min() &&
389	a->max() == b->max();
390
391	case kRegexpCapture:
392	return a->cap() == b->cap() && a->name() == b->name();
393
394	case kRegexpHaveMatch:
395	return a->match_id() == b->match_id();
396
397	case kRegexpCharClass: {
398	CharClass* acc = a->cc();
399	CharClass* bcc = b->cc();
400	return acc->size() == bcc->size() &&
401	acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
402	memcmp(acc->begin(), bcc->begin(),
403	(acc->end() - acc->begin()) * sizeof acc->begin()[`0`]) == `0`;
404	}
405	}
406
407	LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
408	return `0`;
409	}
410
411	bool Regexp::Equal(Regexp* a, Regexp* b) {
412	if (a == NULL \|\| b == NULL)
413	return a == b;
414
415	if (!TopEqual(a, b))
416	return false;
417
418	// Fast path:
419	// return without allocating vector if there are no subregexps.
420	switch (a->op()) {
421	case kRegexpAlternate:
422	case kRegexpConcat:
423	case kRegexpStar:
424	case kRegexpPlus:
425	case kRegexpQuest:
426	case kRegexpRepeat:
427	case kRegexpCapture:
428	break;
429
430	default:
431	return true;
432	}
433
434	// Committed to doing real work.
435	// The stack (vector) has pairs of regexps waiting to
436	// be compared. The regexps are only equal if
437	// all the pairs end up being equal.
438	std::vector<Regexp*> stk;
439
440	for (;;) {
441	// Invariant: TopEqual(a, b) == true.
442	Regexp* a2;
443	Regexp* b2;
444	switch (a->op()) {
445	default:
446	break;
447	case kRegexpAlternate:
448	case kRegexpConcat:
449	for (int i = `0`; i < a->nsub(); i++) {
450	a2 = a->sub()[i];
451	b2 = b->sub()[i];
452	if (!TopEqual(a2, b2))
453	return false;
454	stk.push_back(a2);
455	stk.push_back(b2);
456	}
457	break;
458
459	case kRegexpStar:
460	case kRegexpPlus:
461	case kRegexpQuest:
462	case kRegexpRepeat:
463	case kRegexpCapture:
464	a2 = a->sub()[`0`];
465	b2 = b->sub()[`0`];
466	if (!TopEqual(a2, b2))
467	return false;
468	// Really:
469	// stk.push_back(a2);
470	// stk.push_back(b2);
471	// break;
472	// but faster to assign directly and loop.
473	a = a2;
474	b = b2;
475	continue;
476	}
477
478	size_t n = stk.size();
479	if (n == `0`)
480	break;
481
482	DCHECK_GE(n, `2`);
483	a = stk [n-`2`];
484	b = stk [n-`1`];
485	stk.resize(n-`2`);
486	}
487
488	return true;
489	}
490
491	// Keep in sync with enum RegexpStatusCode in regexp.h
492	static const char *kErrorStrings[] = {
493	"no error",
494	"unexpected error",
495	"invalid escape sequence",
496	"invalid character class",
497	"invalid character class range",
498	"missing ]",
499	"missing )",
500	"unexpected )",
501	"trailing \\",
502	"no argument for repetition operator",
503	"invalid repetition size",
504	"bad repetition operator",
505	"invalid perl operator",
506	"invalid UTF-8",
507	"invalid named capture group",
508	};
509
510	std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
511	if (code < `0` \|\| code >= ABSL_ARRAYSIZE(kErrorStrings))
512	code = kRegexpInternalError;
513	return kErrorStrings[code];
514	}
515
516	std::string RegexpStatus::Text() const {
517	if (error_arg_.empty())
518	return CodeText(code_);
519	std::string s;
520	s.append(CodeText(code_));
521	s.append(": ");
522	s.append(error_arg_.data(), error_arg_.size());
523	return s;
524	}
525
526	void RegexpStatus::Copy(const RegexpStatus& status) {
527	code_ = status.code_;
528	error_arg_ = status.error_arg_;
529	}
530
531	typedef int Ignored; // Walker<void> doesn't exist
532
533	// Walker subclass to count capturing parens in regexp.
534	class NumCapturesWalker : public Regexp::Walker<Ignored> {
535	public:
536	NumCapturesWalker() : ncapture_(`0`) {}
537	int ncapture() { return ncapture_; }
538
539	virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
540	if (re->op() == kRegexpCapture)
541	ncapture_++;
542	return ignored;
543	}
544
545	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546	// Should never be called: we use Walk(), not WalkExponential().
547	#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
548	LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
549	#endif
550	return ignored;
551	}
552
553	private:
554	int ncapture_;
555
556	NumCapturesWalker(const NumCapturesWalker&) = delete;
557	NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558	};
559
560	int Regexp::NumCaptures() {
561	NumCapturesWalker w;
562	w.Walk(this, `0`);
563	return w.ncapture();
564	}
565
566	// Walker class to build map of named capture groups and their indices.
567	class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568	public:
569	NamedCapturesWalker() : map_(NULL) {}
570	~NamedCapturesWalker() { delete map_; }
571
572	std::map<std::string, int>* TakeMap() {
573	std::map<std::string, int>* m = map_;
574	map_ = NULL;
575	return m;
576	}
577
578	virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579	if (re->op() == kRegexpCapture && re->name() != NULL) {
580	// Allocate map once we find a name.
581	if (map_ == NULL)
582	map_ = new std::map<std::string, int>;
583
584	// Record first occurrence of each name.
585	// (The rule is that if you have the same name
586	// multiple times, only the leftmost one counts.)
587	map_->insert({*re->name(), re->cap()});
588	}
589	return ignored;
590	}
591
592	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
593	// Should never be called: we use Walk(), not WalkExponential().
594	#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
595	LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596	#endif
597	return ignored;
598	}
599
600	private:
601	std::map<std::string, int>* map_;
602
603	NamedCapturesWalker(const NamedCapturesWalker&) = delete;
604	NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
605	};
606
607	std::map<std::string, int>* Regexp::NamedCaptures() {
608	NamedCapturesWalker w;
609	w.Walk(this, `0`);
610	return w.TakeMap();
611	}
612
613	// Walker class to build map from capture group indices to their names.
614	class CaptureNamesWalker : public Regexp::Walker<Ignored> {
615	public:
616	CaptureNamesWalker() : map_(NULL) {}
617	~CaptureNamesWalker() { delete map_; }
618
619	std::map<int, std::string>* TakeMap() {
620	std::map<int, std::string>* m = map_;
621	map_ = NULL;
622	return m;
623	}
624
625	virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
626	if (re->op() == kRegexpCapture && re->name() != NULL) {
627	// Allocate map once we find a name.
628	if (map_ == NULL)
629	map_ = new std::map<int, std::string>;
630
631	(map_)[re->cap()] = re->name();
632	}
633	return ignored;
634	}
635
636	virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
637	// Should never be called: we use Walk(), not WalkExponential().
638	#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
639	LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
640	#endif
641	return ignored;
642	}
643
644	private:
645	std::map<int, std::string>* map_;
646
647	CaptureNamesWalker(const CaptureNamesWalker&) = delete;
648	CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
649	};
650
651	std::map<int, std::string>* Regexp::CaptureNames() {
652	CaptureNamesWalker w;
653	w.Walk(this, `0`);
654	return w.TakeMap();
655	}
656
657	void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
658	std::string* bytes) {
659	if (latin1) {
660	bytes->resize(nrunes);
661	for (int i = `0`; i < nrunes; i++)
662	(bytes)[i] = static_cast<char*>(runes[i]);
663	} else {
664	bytes->resize(nrunes * UTFmax); // worst case
665	char* p = &(*bytes)[`0`];
666	for (int i = `0`; i < nrunes; i++)
667	p += runetochar(p, &runes[i]);
668	bytes->resize(p - &(*bytes)[`0`]);
669	bytes->shrink_to_fit();
670	}
671	}
672
673	// Determines whether regexp matches must be anchored
674	// with a fixed string prefix. If so, returns the prefix and
675	// the regexp that remains after the prefix. The prefix might
676	// be ASCII case-insensitive.
677	bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
678	Regexp** suffix) {
679	prefix->clear();
680	foldcase = false*;
681	*suffix = NULL;
682
683	// No need for a walker: the regexp must be of the form
684	// 1. some number of ^ anchors
685	// 2. a literal char or string
686	// 3. the rest
687	if (op_ != kRegexpConcat)
688	return false;
689	int i = `0`;
690	while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
691	i++;
692	if (i == `0` \|\| i >= nsub_)
693	return false;
694	Regexp* re = sub()[i];
695	if (re->op_ != kRegexpLiteral &&
696	re->op_ != kRegexpLiteralString)
697	return false;
698	i++;
699	if (i < nsub_) {
700	for (int j = i; j < nsub_; j++)
701	sub()[j]->Incref();
702	*suffix = Concat(sub() + i, nsub_ - i, parse_flags());
703	} else {
704	suffix = new* Regexp (kRegexpEmptyMatch, parse_flags());
705	}
706
707	bool latin1 = (re->parse_flags() & Latin1) != `0`;
708	Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
709	int nrunes = re->op_ == kRegexpLiteral ? `1` : re->nrunes_;
710	ConvertRunesToBytes(latin1, runes, nrunes, prefix);
711	*foldcase = (re->parse_flags() & FoldCase) != `0`;
712	return true;
713	}
714
715	// Determines whether regexp matches must be unanchored
716	// with a fixed string prefix. If so, returns the prefix.
717	// The prefix might be ASCII case-insensitive.
718	bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
719	prefix->clear();
720	foldcase = false*;
721
722	// No need for a walker: the regexp must either begin with or be
723	// a literal char or string. We "see through" capturing groups,
724	// but make no effort to glue multiple prefix fragments together.
725	Regexp* re = op_ == kRegexpConcat && nsub_ > `0` ? sub()[`0`] : this;
726	while (re->op_ == kRegexpCapture) {
727	re = re->sub()[`0`];
728	if (re->op_ == kRegexpConcat && re->nsub_ > `0`)
729	re = re->sub()[`0`];
730	}
731	if (re->op_ != kRegexpLiteral &&
732	re->op_ != kRegexpLiteralString)
733	return false;
734
735	bool latin1 = (re->parse_flags() & Latin1) != `0`;
736	Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
737	int nrunes = re->op_ == kRegexpLiteral ? `1` : re->nrunes_;
738	ConvertRunesToBytes(latin1, runes, nrunes, prefix);
739	*foldcase = (re->parse_flags() & FoldCase) != `0`;
740	return true;
741	}
742
743	// Character class builder is a balanced binary tree (STL set)
744	// containing non-overlapping, non-abutting RuneRanges.
745	// The less-than operator used in the tree treats two
746	// ranges as equal if they overlap at all, so that
747	// lookups for a particular Rune are possible.
748
749	CharClassBuilder::CharClassBuilder() {
750	nrunes_ = `0`;
751	upper_ = `0`;
752	lower_ = `0`;
753	}
754
755	// Add lo-hi to the class; return whether class got bigger.
756	bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
757	if (hi < lo)
758	return false;
759
760	if (lo <= `'z'` && hi >= `'A'`) {
761	// Overlaps some alpha, maybe not all.
762	// Update bitmaps telling which ASCII letters are in the set.
763	Rune lo1 = std::max<Rune>(lo, `'A'`);
764	Rune hi1 = std::min<Rune>(hi, `'Z'`);
765	if (lo1 <= hi1)
766	upper_ \|= ((`1` << (hi1 - lo1 + `1`)) - `1`) << (lo1 - `'A'`);
767
768	lo1 = std::max<Rune>(lo, `'a'`);
769	hi1 = std::min<Rune>(hi, `'z'`);
770	if (lo1 <= hi1)
771	lower_ \|= ((`1` << (hi1 - lo1 + `1`)) - `1`) << (lo1 - `'a'`);
772	}
773
774	{ // Check whether lo, hi is already in the class.
775	iterator it = ranges_.find(RuneRange (lo, lo));
776	if (it != end() && it ->lo <= lo && hi <= it ->hi)
777	return false;
778	}
779
780	// Look for a range abutting lo on the left.
781	// If it exists, take it out and increase our range.
782	if (lo > `0`) {
783	iterator it = ranges_.find(RuneRange (lo-`1`, lo-`1`));
784	if (it != end()) {
785	lo = it ->lo;
786	if (it ->hi > hi)
787	hi = it ->hi;
788	nrunes_ -= it ->hi - it ->lo + `1`;
789	ranges_.erase(it);
790	}
791	}
792
793	// Look for a range abutting hi on the right.
794	// If it exists, take it out and increase our range.
795	if (hi < Runemax) {
796	iterator it = ranges_.find(RuneRange (hi+`1`, hi+`1`));
797	if (it != end()) {
798	hi = it ->hi;
799	nrunes_ -= it ->hi - it ->lo + `1`;
800	ranges_.erase(it);
801	}
802	}
803
804	// Look for ranges between lo and hi. Take them out.
805	// This is only safe because the set has no overlapping ranges.
806	// We've already removed any ranges abutting lo and hi, so
807	// any that overlap [lo, hi] must be contained within it.
808	for (;;) {
809	iterator it = ranges_.find(RuneRange (lo, hi));
810	if (it == end())
811	break;
812	nrunes_ -= it ->hi - it ->lo + `1`;
813	ranges_.erase(it);
814	}
815
816	// Finally, add [lo, hi].
817	nrunes_ += hi - lo + `1`;
818	ranges_.insert(RuneRange (lo, hi));
819	return true;
820	}
821
822	void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
823	for (iterator it = cc->begin(); it != cc->end(); ++it)
824	AddRange(it ->lo, it ->hi);
825	}
826
827	bool CharClassBuilder::Contains(Rune r) {
828	return ranges_.find(RuneRange (r, r)) != end();
829	}
830
831	// Does the character class behave the same on A-Z as on a-z?
832	bool CharClassBuilder::FoldsASCII() {
833	return ((upper_ ^ lower_) & AlphaMask) == `0`;
834	}
835
836	CharClassBuilder* CharClassBuilder::Copy() {
837	CharClassBuilder* cc = new CharClassBuilder;
838	for (iterator it = begin(); it != end(); ++it)
839	cc->ranges_.insert(RuneRange (it ->lo, it ->hi));
840	cc->upper_ = upper_;
841	cc->lower_ = lower_;
842	cc->nrunes_ = nrunes_;
843	return cc;
844	}
845
846
847
848	void CharClassBuilder::RemoveAbove(Rune r) {
849	if (r >= Runemax)
850	return;
851
852	if (r < `'z'`) {
853	if (r < `'a'`)
854	lower_ = `0`;
855	else
856	lower_ &= AlphaMask >> (`'z'` - r);
857	}
858
859	if (r < `'Z'`) {
860	if (r < `'A'`)
861	upper_ = `0`;
862	else
863	upper_ &= AlphaMask >> (`'Z'` - r);
864	}
865
866	for (;;) {
867
868	iterator it = ranges_.find(RuneRange (r + `1`, Runemax));
869	if (it == end())
870	break;
871	RuneRange rr = *it;
872	ranges_.erase(it);
873	nrunes_ -= rr.hi - rr.lo + `1`;
874	if (rr.lo <= r) {
875	rr.hi = r;
876	ranges_.insert(rr);
877	nrunes_ += rr.hi - rr.lo + `1`;
878	}
879	}
880	}
881
882	void CharClassBuilder::Negate() {
883	// Build up negation and then copy in.
884	// Could edit ranges in place, but C++ won't let me.
885	std::vector<RuneRange> v;
886	v.reserve(ranges_.size() + `1`);
887
888	// In negation, first range begins at 0, unless
889	// the current class begins at 0.
890	iterator it = begin();
891	if (it == end()) {
892	v.push_back(RuneRange (`0`, Runemax));
893	} else {
894	int nextlo = `0`;
895	if (it ->lo == `0`) {
896	nextlo = it ->hi + `1`;
897	++it;
898	}
899	for (; it != end(); ++it) {
900	v.push_back(RuneRange (nextlo, it ->lo - `1`));
901	nextlo = it ->hi + `1`;
902	}
903	if (nextlo <= Runemax)
904	v.push_back(RuneRange (nextlo, Runemax));
905	}
906
907	ranges_.clear();
908	for (size_t i = `0`; i < v.size(); i++)
909	ranges_.insert(v [i]);
910
911	upper_ = AlphaMask & ~upper_;
912	lower_ = AlphaMask & ~lower_;
913	nrunes_ = Runemax+`1` - nrunes_;
914	}
915
916	// Character class is a sorted list of ranges.
917	// The ranges are allocated in the same block as the header,
918	// necessitating a special allocator and Delete method.
919
920	CharClass* CharClass::New(size_t maxranges) {
921	CharClass* cc;
922	uint8_t* data = new uint8_t[sizeof cc + maxrangessizeof cc->ranges_[`0`]];
923	cc = reinterpret_cast<CharClass*>(data);
924	cc->ranges_ = reinterpret_cast<RuneRange>(data + sizeof* *cc);
925	cc->nranges_ = `0`;
926	cc->folds_ascii_ = false;
927	cc->nrunes_ = `0`;
928	return cc;
929	}
930
931	void CharClass::Delete() {
932	uint8_t* data = reinterpret_cast<uint8_t>(this*);
933	delete[] data;
934	}
935
936	CharClass* CharClass::Negate() {
937	CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+`1`));
938	cc->folds_ascii_ = folds_ascii_;
939	cc->nrunes_ = Runemax + `1` - nrunes_;
940	int n = `0`;
941	int nextlo = `0`;
942	for (CharClass::iterator it = begin(); it != end(); ++it) {
943	if (it->lo == nextlo) {
944	nextlo = it->hi + `1`;
945	} else {
946	cc->ranges_[n++] = RuneRange (nextlo, it->lo - `1`);
947	nextlo = it->hi + `1`;
948	}
949	}
950	if (nextlo <= Runemax)
951	cc->ranges_[n++] = RuneRange (nextlo, Runemax);
952	cc->nranges_ = n;
953	return cc;
954	}
955
956	bool CharClass::Contains(Rune r) const {
957	RuneRange* rr = ranges_;
958	int n = nranges_;
959	while (n > `0`) {
960	int m = n/`2`;
961	if (rr[m].hi < r) {
962	rr += m+`1`;
963	n -= m+`1`;
964	} else if (r < rr[m].lo) {
965	n = m;
966	} else { // rr[m].lo <= r && r <= rr[m].hi
967	return true;
968	}
969	}
970	return false;
971	}
972
973	CharClass* CharClassBuilder::GetCharClass() {
974	CharClass* cc = CharClass::New(ranges_.size());
975	int n = `0`;
976	for (iterator it = begin(); it != end(); ++it)
977	cc->ranges_[n++] = *it;
978	cc->nranges_ = n;
979	DCHECK_LE(n, static_cast<int>(ranges_.size()));
980	cc->nrunes_ = nrunes_;
981	cc->folds_ascii_ = FoldsASCII();
982	return cc;
983	}
984
985	} // namespace re2
986

Browse the source code of tensorflow/external/com_googlesource_code_re2/re2/regexp.cc