1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_TSL_PLATFORM_SCANNER_H_ |
17 | #define TENSORFLOW_TSL_PLATFORM_SCANNER_H_ |
18 | |
19 | #include <string> |
20 | |
21 | #include "tensorflow/tsl/platform/macros.h" |
22 | #include "tensorflow/tsl/platform/str_util.h" |
23 | #include "tensorflow/tsl/platform/stringpiece.h" |
24 | |
25 | namespace tsl { |
26 | namespace strings { |
27 | |
28 | // Scanner provides simplified string parsing, in which a string is parsed as a |
29 | // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then |
30 | // finally GetResult is called. If GetResult returns true, then it also returns |
31 | // the remaining characters and any captured substring. |
32 | // |
33 | // The range to capture can be controlled with RestartCapture and StopCapture; |
34 | // by default, all processed characters are captured. |
35 | class Scanner { |
36 | public: |
37 | // Classes of characters. Each enum name is to be read as the union of the |
38 | // parts - e.g., class LETTER_DIGIT means the class includes all letters and |
39 | // all digits. |
40 | // |
41 | // LETTER means ascii letter a-zA-Z. |
42 | // DIGIT means ascii digit: 0-9. |
43 | enum CharClass { |
44 | // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest |
45 | // in scanner_test.cc |
46 | ALL, |
47 | DIGIT, |
48 | LETTER, |
49 | LETTER_DIGIT, |
50 | LETTER_DIGIT_DASH_UNDERSCORE, |
51 | LETTER_DIGIT_DASH_DOT_SLASH, // SLASH is / only, not backslash |
52 | LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE, // SLASH is / only, not backslash |
53 | LETTER_DIGIT_DOT, |
54 | LETTER_DIGIT_DOT_PLUS_MINUS, |
55 | LETTER_DIGIT_DOT_UNDERSCORE, |
56 | LETTER_DIGIT_UNDERSCORE, |
57 | LOWERLETTER, |
58 | LOWERLETTER_DIGIT, |
59 | LOWERLETTER_DIGIT_UNDERSCORE, |
60 | NON_ZERO_DIGIT, |
61 | SPACE, |
62 | UPPERLETTER, |
63 | RANGLE, |
64 | }; |
65 | |
66 | explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); } |
67 | |
68 | // Consume the next character of the given class from input. If the next |
69 | // character is not in the class, then GetResult will ultimately return false. |
70 | Scanner& One(CharClass clz) { |
71 | if (cur_.empty() || !Matches(clz, cur_[0])) { |
72 | return Error(); |
73 | } |
74 | cur_.remove_prefix(1); |
75 | return *this; |
76 | } |
77 | |
78 | // Consume the next s.size() characters of the input, if they match <s>. If |
79 | // they don't match <s>, this is a no-op. |
80 | Scanner& ZeroOrOneLiteral(StringPiece s) { |
81 | str_util::ConsumePrefix(&cur_, s); |
82 | return *this; |
83 | } |
84 | |
85 | // Consume the next s.size() characters of the input, if they match <s>. If |
86 | // they don't match <s>, then GetResult will ultimately return false. |
87 | Scanner& OneLiteral(StringPiece s) { |
88 | if (!str_util::ConsumePrefix(&cur_, s)) { |
89 | error_ = true; |
90 | } |
91 | return *this; |
92 | } |
93 | |
94 | // Consume characters from the input as long as they match <clz>. Zero |
95 | // characters is still considered a match, so it will never cause GetResult to |
96 | // return false. |
97 | Scanner& Any(CharClass clz) { |
98 | while (!cur_.empty() && Matches(clz, cur_[0])) { |
99 | cur_.remove_prefix(1); |
100 | } |
101 | return *this; |
102 | } |
103 | |
104 | // Shorthand for One(clz).Any(clz). |
105 | Scanner& Many(CharClass clz) { return One(clz).Any(clz); } |
106 | |
107 | // Reset the capture start point. |
108 | // |
109 | // Later, when GetResult is called and if it returns true, the capture |
110 | // returned will start at the position at the time this was called. |
111 | Scanner& RestartCapture() { |
112 | capture_start_ = cur_.data(); |
113 | capture_end_ = nullptr; |
114 | return *this; |
115 | } |
116 | |
117 | // Stop capturing input. |
118 | // |
119 | // Later, when GetResult is called and if it returns true, the capture |
120 | // returned will end at the position at the time this was called. |
121 | Scanner& StopCapture() { |
122 | capture_end_ = cur_.data(); |
123 | return *this; |
124 | } |
125 | |
126 | // If not at the input of input, then GetResult will ultimately return false. |
127 | Scanner& Eos() { |
128 | if (!cur_.empty()) error_ = true; |
129 | return *this; |
130 | } |
131 | |
132 | // Shorthand for Any(SPACE). |
133 | Scanner& AnySpace() { return Any(SPACE); } |
134 | |
135 | // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. |
136 | Scanner& ScanUntil(char end_ch) { |
137 | ScanUntilImpl(end_ch, false); |
138 | return *this; |
139 | } |
140 | |
141 | // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. |
142 | // Backslash escape sequences are skipped. |
143 | // Used for implementing quoted string scanning. |
144 | Scanner& ScanEscapedUntil(char end_ch) { |
145 | ScanUntilImpl(end_ch, true); |
146 | return *this; |
147 | } |
148 | |
149 | // Return the next character that will be scanned, or <default_value> if there |
150 | // are no more characters to scan. |
151 | // Note that if a scan operation has failed (so GetResult() returns false), |
152 | // then the value of Peek may or may not have advanced since the scan |
153 | // operation that failed. |
154 | char Peek(char default_value = '\0') const { |
155 | return cur_.empty() ? default_value : cur_[0]; |
156 | } |
157 | |
158 | // Returns false if there are no remaining characters to consume. |
159 | int empty() const { return cur_.empty(); } |
160 | |
161 | // Returns true if the input string successfully matched. When true is |
162 | // returned, the remaining string is returned in <remaining> and the captured |
163 | // string returned in <capture>, if non-NULL. |
164 | bool GetResult(StringPiece* remaining = nullptr, |
165 | StringPiece* capture = nullptr); |
166 | |
167 | private: |
168 | void ScanUntilImpl(char end_ch, bool escaped); |
169 | |
170 | Scanner& Error() { |
171 | error_ = true; |
172 | return *this; |
173 | } |
174 | |
175 | static bool IsLetter(char ch) { |
176 | return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); |
177 | } |
178 | |
179 | static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; } |
180 | |
181 | static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; } |
182 | |
183 | static bool IsSpace(char ch) { |
184 | return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' || |
185 | ch == '\r'); |
186 | } |
187 | |
188 | static bool Matches(CharClass clz, char ch) { |
189 | switch (clz) { |
190 | case ALL: |
191 | return true; |
192 | case DIGIT: |
193 | return IsDigit(ch); |
194 | case LETTER: |
195 | return IsLetter(ch); |
196 | case LETTER_DIGIT: |
197 | return IsLetter(ch) || IsDigit(ch); |
198 | case LETTER_DIGIT_DASH_UNDERSCORE: |
199 | return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_'); |
200 | case LETTER_DIGIT_DASH_DOT_SLASH: |
201 | return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || |
202 | ch == '/'; |
203 | case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE: |
204 | return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || |
205 | ch == '/' || ch == '_'); |
206 | case LETTER_DIGIT_DOT: |
207 | return IsLetter(ch) || IsDigit(ch) || ch == '.'; |
208 | case LETTER_DIGIT_DOT_PLUS_MINUS: |
209 | return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' || |
210 | ch == '.'; |
211 | case LETTER_DIGIT_DOT_UNDERSCORE: |
212 | return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_'; |
213 | case LETTER_DIGIT_UNDERSCORE: |
214 | return IsLetter(ch) || IsDigit(ch) || ch == '_'; |
215 | case LOWERLETTER: |
216 | return ch >= 'a' && ch <= 'z'; |
217 | case LOWERLETTER_DIGIT: |
218 | return IsLowerLetter(ch) || IsDigit(ch); |
219 | case LOWERLETTER_DIGIT_UNDERSCORE: |
220 | return IsLowerLetter(ch) || IsDigit(ch) || ch == '_'; |
221 | case NON_ZERO_DIGIT: |
222 | return IsDigit(ch) && ch != '0'; |
223 | case SPACE: |
224 | return IsSpace(ch); |
225 | case UPPERLETTER: |
226 | return ch >= 'A' && ch <= 'Z'; |
227 | case RANGLE: |
228 | return ch == '>'; |
229 | } |
230 | return false; |
231 | } |
232 | |
233 | StringPiece cur_; |
234 | const char* capture_start_ = nullptr; |
235 | const char* capture_end_ = nullptr; |
236 | bool error_ = false; |
237 | |
238 | friend class ScannerTest; |
239 | |
240 | TF_DISALLOW_COPY_AND_ASSIGN(Scanner); |
241 | }; |
242 | |
243 | } // namespace strings |
244 | } // namespace tsl |
245 | |
246 | #endif // TENSORFLOW_TSL_PLATFORM_SCANNER_H_ |
247 | |