1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#ifndef TENSORFLOW_TSL_PLATFORM_SCANNER_H_
17#define TENSORFLOW_TSL_PLATFORM_SCANNER_H_
18
19#include <string>
20
21#include "tensorflow/tsl/platform/macros.h"
22#include "tensorflow/tsl/platform/str_util.h"
23#include "tensorflow/tsl/platform/stringpiece.h"
24
25namespace tsl {
26namespace strings {
27
28// Scanner provides simplified string parsing, in which a string is parsed as a
29// series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then
30// finally GetResult is called. If GetResult returns true, then it also returns
31// the remaining characters and any captured substring.
32//
33// The range to capture can be controlled with RestartCapture and StopCapture;
34// by default, all processed characters are captured.
35class Scanner {
36 public:
37 // Classes of characters. Each enum name is to be read as the union of the
38 // parts - e.g., class LETTER_DIGIT means the class includes all letters and
39 // all digits.
40 //
41 // LETTER means ascii letter a-zA-Z.
42 // DIGIT means ascii digit: 0-9.
43 enum CharClass {
44 // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest
45 // in scanner_test.cc
46 ALL,
47 DIGIT,
48 LETTER,
49 LETTER_DIGIT,
50 LETTER_DIGIT_DASH_UNDERSCORE,
51 LETTER_DIGIT_DASH_DOT_SLASH, // SLASH is / only, not backslash
52 LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE, // SLASH is / only, not backslash
53 LETTER_DIGIT_DOT,
54 LETTER_DIGIT_DOT_PLUS_MINUS,
55 LETTER_DIGIT_DOT_UNDERSCORE,
56 LETTER_DIGIT_UNDERSCORE,
57 LOWERLETTER,
58 LOWERLETTER_DIGIT,
59 LOWERLETTER_DIGIT_UNDERSCORE,
60 NON_ZERO_DIGIT,
61 SPACE,
62 UPPERLETTER,
63 RANGLE,
64 };
65
66 explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); }
67
68 // Consume the next character of the given class from input. If the next
69 // character is not in the class, then GetResult will ultimately return false.
70 Scanner& One(CharClass clz) {
71 if (cur_.empty() || !Matches(clz, cur_[0])) {
72 return Error();
73 }
74 cur_.remove_prefix(1);
75 return *this;
76 }
77
78 // Consume the next s.size() characters of the input, if they match <s>. If
79 // they don't match <s>, this is a no-op.
80 Scanner& ZeroOrOneLiteral(StringPiece s) {
81 str_util::ConsumePrefix(&cur_, s);
82 return *this;
83 }
84
85 // Consume the next s.size() characters of the input, if they match <s>. If
86 // they don't match <s>, then GetResult will ultimately return false.
87 Scanner& OneLiteral(StringPiece s) {
88 if (!str_util::ConsumePrefix(&cur_, s)) {
89 error_ = true;
90 }
91 return *this;
92 }
93
94 // Consume characters from the input as long as they match <clz>. Zero
95 // characters is still considered a match, so it will never cause GetResult to
96 // return false.
97 Scanner& Any(CharClass clz) {
98 while (!cur_.empty() && Matches(clz, cur_[0])) {
99 cur_.remove_prefix(1);
100 }
101 return *this;
102 }
103
104 // Shorthand for One(clz).Any(clz).
105 Scanner& Many(CharClass clz) { return One(clz).Any(clz); }
106
107 // Reset the capture start point.
108 //
109 // Later, when GetResult is called and if it returns true, the capture
110 // returned will start at the position at the time this was called.
111 Scanner& RestartCapture() {
112 capture_start_ = cur_.data();
113 capture_end_ = nullptr;
114 return *this;
115 }
116
117 // Stop capturing input.
118 //
119 // Later, when GetResult is called and if it returns true, the capture
120 // returned will end at the position at the time this was called.
121 Scanner& StopCapture() {
122 capture_end_ = cur_.data();
123 return *this;
124 }
125
126 // If not at the input of input, then GetResult will ultimately return false.
127 Scanner& Eos() {
128 if (!cur_.empty()) error_ = true;
129 return *this;
130 }
131
132 // Shorthand for Any(SPACE).
133 Scanner& AnySpace() { return Any(SPACE); }
134
135 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
136 Scanner& ScanUntil(char end_ch) {
137 ScanUntilImpl(end_ch, false);
138 return *this;
139 }
140
141 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
142 // Backslash escape sequences are skipped.
143 // Used for implementing quoted string scanning.
144 Scanner& ScanEscapedUntil(char end_ch) {
145 ScanUntilImpl(end_ch, true);
146 return *this;
147 }
148
149 // Return the next character that will be scanned, or <default_value> if there
150 // are no more characters to scan.
151 // Note that if a scan operation has failed (so GetResult() returns false),
152 // then the value of Peek may or may not have advanced since the scan
153 // operation that failed.
154 char Peek(char default_value = '\0') const {
155 return cur_.empty() ? default_value : cur_[0];
156 }
157
158 // Returns false if there are no remaining characters to consume.
159 int empty() const { return cur_.empty(); }
160
161 // Returns true if the input string successfully matched. When true is
162 // returned, the remaining string is returned in <remaining> and the captured
163 // string returned in <capture>, if non-NULL.
164 bool GetResult(StringPiece* remaining = nullptr,
165 StringPiece* capture = nullptr);
166
167 private:
168 void ScanUntilImpl(char end_ch, bool escaped);
169
170 Scanner& Error() {
171 error_ = true;
172 return *this;
173 }
174
175 static bool IsLetter(char ch) {
176 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
177 }
178
179 static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; }
180
181 static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; }
182
183 static bool IsSpace(char ch) {
184 return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
185 ch == '\r');
186 }
187
188 static bool Matches(CharClass clz, char ch) {
189 switch (clz) {
190 case ALL:
191 return true;
192 case DIGIT:
193 return IsDigit(ch);
194 case LETTER:
195 return IsLetter(ch);
196 case LETTER_DIGIT:
197 return IsLetter(ch) || IsDigit(ch);
198 case LETTER_DIGIT_DASH_UNDERSCORE:
199 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_');
200 case LETTER_DIGIT_DASH_DOT_SLASH:
201 return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
202 ch == '/';
203 case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE:
204 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
205 ch == '/' || ch == '_');
206 case LETTER_DIGIT_DOT:
207 return IsLetter(ch) || IsDigit(ch) || ch == '.';
208 case LETTER_DIGIT_DOT_PLUS_MINUS:
209 return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' ||
210 ch == '.';
211 case LETTER_DIGIT_DOT_UNDERSCORE:
212 return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_';
213 case LETTER_DIGIT_UNDERSCORE:
214 return IsLetter(ch) || IsDigit(ch) || ch == '_';
215 case LOWERLETTER:
216 return ch >= 'a' && ch <= 'z';
217 case LOWERLETTER_DIGIT:
218 return IsLowerLetter(ch) || IsDigit(ch);
219 case LOWERLETTER_DIGIT_UNDERSCORE:
220 return IsLowerLetter(ch) || IsDigit(ch) || ch == '_';
221 case NON_ZERO_DIGIT:
222 return IsDigit(ch) && ch != '0';
223 case SPACE:
224 return IsSpace(ch);
225 case UPPERLETTER:
226 return ch >= 'A' && ch <= 'Z';
227 case RANGLE:
228 return ch == '>';
229 }
230 return false;
231 }
232
233 StringPiece cur_;
234 const char* capture_start_ = nullptr;
235 const char* capture_end_ = nullptr;
236 bool error_ = false;
237
238 friend class ScannerTest;
239
240 TF_DISALLOW_COPY_AND_ASSIGN(Scanner);
241};
242
243} // namespace strings
244} // namespace tsl
245
246#endif // TENSORFLOW_TSL_PLATFORM_SCANNER_H_
247