1 | // |
2 | // Copyright 2017 The Abseil Authors. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | // you may not use this file except in compliance with the License. |
6 | // You may obtain a copy of the License at |
7 | // |
8 | // https://www.apache.org/licenses/LICENSE-2.0 |
9 | // |
10 | // Unless required by applicable law or agreed to in writing, software |
11 | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | // See the License for the specific language governing permissions and |
14 | // limitations under the License. |
15 | // |
16 | // ----------------------------------------------------------------------------- |
17 | // File: str_split.h |
18 | // ----------------------------------------------------------------------------- |
19 | // |
20 | // This file contains functions for splitting strings. It defines the main |
21 | // `StrSplit()` function, several delimiters for determining the boundaries on |
22 | // which to split the string, and predicates for filtering delimited results. |
23 | // `StrSplit()` adapts the returned collection to the type specified by the |
24 | // caller. |
25 | // |
26 | // Example: |
27 | // |
28 | // // Splits the given string on commas. Returns the results in a |
29 | // // vector of strings. |
30 | // std::vector<std::string> v = absl::StrSplit("a,b,c", ','); |
31 | // // Can also use "," |
32 | // // v[0] == "a", v[1] == "b", v[2] == "c" |
33 | // |
34 | // See StrSplit() below for more information. |
35 | #ifndef ABSL_STRINGS_STR_SPLIT_H_ |
36 | #define ABSL_STRINGS_STR_SPLIT_H_ |
37 | |
38 | #include <algorithm> |
39 | #include <cstddef> |
40 | #include <map> |
41 | #include <set> |
42 | #include <string> |
43 | #include <utility> |
44 | #include <vector> |
45 | |
46 | #include "absl/base/internal/raw_logging.h" |
47 | #include "absl/base/macros.h" |
48 | #include "absl/strings/internal/str_split_internal.h" |
49 | #include "absl/strings/string_view.h" |
50 | #include "absl/strings/strip.h" |
51 | |
52 | namespace absl { |
53 | ABSL_NAMESPACE_BEGIN |
54 | |
55 | //------------------------------------------------------------------------------ |
56 | // Delimiters |
57 | //------------------------------------------------------------------------------ |
58 | // |
59 | // `StrSplit()` uses delimiters to define the boundaries between elements in the |
60 | // provided input. Several `Delimiter` types are defined below. If a string |
61 | // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of |
62 | // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it |
63 | // were passed a `ByString` delimiter. |
64 | // |
65 | // A `Delimiter` is an object with a `Find()` function that knows how to find |
66 | // the first occurrence of itself in a given `absl::string_view`. |
67 | // |
68 | // The following `Delimiter` types are available for use within `StrSplit()`: |
69 | // |
70 | // - `ByString` (default for string arguments) |
71 | // - `ByChar` (default for a char argument) |
72 | // - `ByAnyChar` |
73 | // - `ByLength` |
74 | // - `MaxSplits` |
75 | // |
76 | // A Delimiter's `Find()` member function will be passed an input `text` that is |
77 | // to be split and a position (`pos`) to begin searching for the next delimiter |
78 | // in `text`. The returned absl::string_view should refer to the next occurrence |
79 | // (after `pos`) of the represented delimiter; this returned absl::string_view |
80 | // represents the next location where the input `text` should be broken. |
81 | // |
82 | // The returned absl::string_view may be zero-length if the Delimiter does not |
83 | // represent a part of the string (e.g., a fixed-length delimiter). If no |
84 | // delimiter is found in the input `text`, a zero-length absl::string_view |
85 | // referring to `text.end()` should be returned (e.g., |
86 | // `text.substr(text.size())`). It is important that the returned |
87 | // absl::string_view always be within the bounds of the input `text` given as an |
88 | // argument--it must not refer to a string that is physically located outside of |
89 | // the given string. |
90 | // |
91 | // The following example is a simple Delimiter object that is created with a |
92 | // single char and will look for that char in the text passed to the `Find()` |
93 | // function: |
94 | // |
95 | // struct SimpleDelimiter { |
96 | // const char c_; |
97 | // explicit SimpleDelimiter(char c) : c_(c) {} |
98 | // absl::string_view Find(absl::string_view text, size_t pos) { |
99 | // auto found = text.find(c_, pos); |
100 | // if (found == absl::string_view::npos) |
101 | // return text.substr(text.size()); |
102 | // |
103 | // return text.substr(found, 1); |
104 | // } |
105 | // }; |
106 | |
107 | // ByString |
108 | // |
109 | // A sub-string delimiter. If `StrSplit()` is passed a string in place of a |
110 | // `Delimiter` object, the string will be implicitly converted into a |
111 | // `ByString` delimiter. |
112 | // |
113 | // Example: |
114 | // |
115 | // // Because a string literal is converted to an `absl::ByString`, |
116 | // // the following two splits are equivalent. |
117 | // |
118 | // std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", "); |
119 | // |
120 | // using absl::ByString; |
121 | // std::vector<std::string> v2 = absl::StrSplit("a, b, c", |
122 | // ByString(", ")); |
// // v1[0] == "a", v1[1] == "b", v1[2] == "c" (and likewise for v2)
124 | class ByString { |
125 | public: |
126 | explicit ByString(absl::string_view sp); |
127 | absl::string_view Find(absl::string_view text, size_t pos) const; |
128 | |
129 | private: |
130 | const std::string delimiter_; |
131 | }; |
132 | |
133 | // ByChar |
134 | // |
135 | // A single character delimiter. `ByChar` is functionally equivalent to a |
136 | // 1-char string within a `ByString` delimiter, but slightly more efficient. |
137 | // |
138 | // Example: |
139 | // |
// // Because a char literal is converted to an `absl::ByChar`,
// // the following two splits are equivalent.
// std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
// using absl::ByChar;
// std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
// // v1[0] == "a", v1[1] == "b", v1[2] == "c" (and likewise for v2)
146 | // |
147 | // `ByChar` is also the default delimiter if a single character is given |
148 | // as the delimiter to `StrSplit()`. For example, the following calls are |
149 | // equivalent: |
150 | // |
151 | // std::vector<std::string> v = absl::StrSplit("a-b", '-'); |
152 | // |
153 | // using absl::ByChar; |
154 | // std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-')); |
155 | // |
156 | class ByChar { |
157 | public: |
158 | explicit ByChar(char c) : c_(c) {} |
159 | absl::string_view Find(absl::string_view text, size_t pos) const; |
160 | |
161 | private: |
162 | char c_; |
163 | }; |
164 | |
165 | // ByAnyChar |
166 | // |
167 | // A delimiter that will match any of the given byte-sized characters within |
168 | // its provided string. |
169 | // |
170 | // Note: this delimiter works with single-byte string data, but does not work |
171 | // with variable-width encodings, such as UTF-8. |
172 | // |
173 | // Example: |
174 | // |
175 | // using absl::ByAnyChar; |
176 | // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); |
177 | // // v[0] == "a", v[1] == "b", v[2] == "c" |
178 | // |
// If `ByAnyChar` is given the empty string, it behaves exactly like a
// `ByString` that was given the empty string: it matches each individual
// character in the input string.
181 | // |
182 | class ByAnyChar { |
183 | public: |
184 | explicit ByAnyChar(absl::string_view sp); |
185 | absl::string_view Find(absl::string_view text, size_t pos) const; |
186 | |
187 | private: |
188 | const std::string delimiters_; |
189 | }; |
190 | |
191 | // ByLength |
192 | // |
193 | // A delimiter for splitting into equal-length strings. The length argument to |
194 | // the constructor must be greater than 0. |
195 | // |
196 | // Note: this delimiter works with single-byte string data, but does not work |
197 | // with variable-width encodings, such as UTF-8. |
198 | // |
199 | // Example: |
200 | // |
201 | // using absl::ByLength; |
202 | // std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3)); |
//
204 | // // v[0] == "123", v[1] == "456", v[2] == "789" |
205 | // |
206 | // Note that the string does not have to be a multiple of the fixed split |
207 | // length. In such a case, the last substring will be shorter. |
208 | // |
209 | // using absl::ByLength; |
210 | // std::vector<std::string> v = absl::StrSplit("12345", ByLength(2)); |
211 | // |
212 | // // v[0] == "12", v[1] == "34", v[2] == "5" |
213 | class ByLength { |
214 | public: |
215 | explicit ByLength(ptrdiff_t length); |
216 | absl::string_view Find(absl::string_view text, size_t pos) const; |
217 | |
218 | private: |
219 | const ptrdiff_t length_; |
220 | }; |
221 | |
222 | namespace strings_internal { |
223 | |
224 | // A traits-like metafunction for selecting the default Delimiter object type |
225 | // for a particular Delimiter type. The base case simply exposes type Delimiter |
226 | // itself as the delimiter's Type. However, there are specializations for |
227 | // string-like objects that map them to the ByString delimiter object. |
228 | // This allows functions like absl::StrSplit() and absl::MaxSplits() to accept |
229 | // string-like objects (e.g., ',') as delimiter arguments but they will be |
230 | // treated as if a ByString delimiter was given. |
231 | template <typename Delimiter> |
232 | struct SelectDelimiter { |
233 | using type = Delimiter; |
234 | }; |
235 | |
236 | template <> |
237 | struct SelectDelimiter<char> { |
238 | using type = ByChar; |
239 | }; |
240 | template <> |
241 | struct SelectDelimiter<char*> { |
242 | using type = ByString; |
243 | }; |
244 | template <> |
245 | struct SelectDelimiter<const char*> { |
246 | using type = ByString; |
247 | }; |
248 | template <> |
249 | struct SelectDelimiter<absl::string_view> { |
250 | using type = ByString; |
251 | }; |
252 | template <> |
253 | struct SelectDelimiter<std::string> { |
254 | using type = ByString; |
255 | }; |
256 | |
257 | // Wraps another delimiter and sets a max number of matches for that delimiter. |
258 | template <typename Delimiter> |
259 | class MaxSplitsImpl { |
260 | public: |
261 | MaxSplitsImpl(Delimiter delimiter, int limit) |
262 | : delimiter_(delimiter), limit_(limit), count_(0) {} |
263 | absl::string_view Find(absl::string_view text, size_t pos) { |
264 | if (count_++ == limit_) { |
265 | return absl::string_view(text.data() + text.size(), |
266 | 0); // No more matches. |
267 | } |
268 | return delimiter_.Find(text, pos); |
269 | } |
270 | |
271 | private: |
272 | Delimiter delimiter_; |
273 | const int limit_; |
274 | int count_; |
275 | }; |
276 | |
277 | } // namespace strings_internal |
278 | |
279 | // MaxSplits() |
280 | // |
281 | // A delimiter that limits the number of matches which can occur to the passed |
282 | // `limit`. The last element in the returned collection will contain all |
283 | // remaining unsplit pieces, which may contain instances of the delimiter. |
// The collection will contain at most `limit` + 1 elements.
//
// Example:
286 | // |
287 | // using absl::MaxSplits; |
288 | // std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1)); |
289 | // |
290 | // // v[0] == "a", v[1] == "b,c" |
291 | template <typename Delimiter> |
292 | inline strings_internal::MaxSplitsImpl< |
293 | typename strings_internal::SelectDelimiter<Delimiter>::type> |
294 | MaxSplits(Delimiter delimiter, int limit) { |
295 | typedef |
296 | typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType; |
297 | return strings_internal::MaxSplitsImpl<DelimiterType>( |
298 | DelimiterType(delimiter), limit); |
299 | } |
300 | |
301 | //------------------------------------------------------------------------------ |
302 | // Predicates |
303 | //------------------------------------------------------------------------------ |
304 | // |
305 | // Predicates filter the results of a `StrSplit()` by determining whether or not |
306 | // a resultant element is included in the result set. A predicate may be passed |
307 | // as an optional third argument to the `StrSplit()` function. |
308 | // |
309 | // Predicates are unary functions (or functors) that take a single |
310 | // `absl::string_view` argument and return a bool indicating whether the |
311 | // argument should be included (`true`) or excluded (`false`). |
312 | // |
313 | // Predicates are useful when filtering out empty substrings. By default, empty |
314 | // substrings may be returned by `StrSplit()`, which is similar to the way split |
315 | // functions work in other programming languages. |
316 | |
317 | // AllowEmpty() |
318 | // |
319 | // Always returns `true`, indicating that all strings--including empty |
320 | // strings--should be included in the split output. This predicate is not |
321 | // strictly needed because this is the default behavior of `StrSplit()`; |
322 | // however, it might be useful at some call sites to make the intent explicit. |
323 | // |
324 | // Example: |
325 | // |
326 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty()); |
327 | // |
// // v[0] == " a ", v[1] == " ", v[2] == "", v[3] == "b", v[4] == ""
329 | struct AllowEmpty { |
330 | bool operator()(absl::string_view) const { return true; } |
331 | }; |
332 | |
333 | // SkipEmpty() |
334 | // |
335 | // Returns `false` if the given `absl::string_view` is empty, indicating that |
336 | // `StrSplit()` should omit the empty string. |
337 | // |
338 | // Example: |
339 | // |
340 | // std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty()); |
341 | // |
342 | // // v[0] == "a", v[1] == "b" |
343 | // |
344 | // Note: `SkipEmpty()` does not consider a string containing only whitespace |
345 | // to be empty. To skip such whitespace as well, use the `SkipWhitespace()` |
346 | // predicate. |
347 | struct SkipEmpty { |
348 | bool operator()(absl::string_view sp) const { return !sp.empty(); } |
349 | }; |
350 | |
351 | // SkipWhitespace() |
352 | // |
353 | // Returns `false` if the given `absl::string_view` is empty *or* contains only |
354 | // whitespace, indicating that `StrSplit()` should omit the string. |
355 | // |
356 | // Example: |
357 | // |
358 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", |
359 | // ',', SkipWhitespace()); |
360 | // // v[0] == " a ", v[1] == "b" |
361 | // |
362 | // // SkipEmpty() would return whitespace elements |
363 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty()); |
364 | // // v[0] == " a ", v[1] == " ", v[2] == "b" |
365 | struct SkipWhitespace { |
366 | bool operator()(absl::string_view sp) const { |
367 | sp = absl::StripAsciiWhitespace(sp); |
368 | return !sp.empty(); |
369 | } |
370 | }; |
371 | |
// SFINAE helper: names the type `int` when `T` is exactly `std::string` or
// `const std::string`, and is ill-formed for every other `T` (including
// reference types and volatile-qualified strings). Used as a defaulted
// non-type template parameter to enable the `StrSplit()` overloads that take
// a `std::string` temporary.
template <typename T>
using EnableSplitIfString = typename std::enable_if<
    std::is_same<typename std::remove_const<T>::type, std::string>::value,
    int>::type;
377 | |
378 | //------------------------------------------------------------------------------ |
379 | // StrSplit() |
380 | //------------------------------------------------------------------------------ |
381 | |
382 | // StrSplit() |
383 | // |
384 | // Splits a given string based on the provided `Delimiter` object, returning the |
385 | // elements within the type specified by the caller. Optionally, you may pass a |
386 | // `Predicate` to `StrSplit()` indicating whether to include or exclude the |
387 | // resulting element within the final result set. (See the overviews for |
388 | // Delimiters and Predicates above.) |
389 | // |
390 | // Example: |
391 | // |
392 | // std::vector<std::string> v = absl::StrSplit("a,b,c,d", ','); |
393 | // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d" |
394 | // |
395 | // You can also provide an explicit `Delimiter` object: |
396 | // |
397 | // Example: |
398 | // |
399 | // using absl::ByAnyChar; |
400 | // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); |
401 | // // v[0] == "a", v[1] == "b", v[2] == "c" |
402 | // |
403 | // See above for more information on delimiters. |
404 | // |
405 | // By default, empty strings are included in the result set. You can optionally |
406 | // include a third `Predicate` argument to apply a test for whether the |
407 | // resultant element should be included in the result set: |
408 | // |
409 | // Example: |
410 | // |
411 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", |
412 | // ',', SkipWhitespace()); |
413 | // // v[0] == " a ", v[1] == "b" |
414 | // |
415 | // See above for more information on predicates. |
416 | // |
417 | //------------------------------------------------------------------------------ |
418 | // StrSplit() Return Types |
419 | //------------------------------------------------------------------------------ |
420 | // |
421 | // The `StrSplit()` function adapts the returned collection to the collection |
422 | // specified by the caller (e.g. `std::vector` above). The returned collections |
423 | // may contain `std::string`, `absl::string_view` (in which case the original |
424 | // string being split must ensure that it outlives the collection), or any |
425 | // object that can be explicitly created from an `absl::string_view`. This |
426 | // behavior works for: |
427 | // |
428 | // 1) All standard STL containers including `std::vector`, `std::list`, |
// `std::deque`, `std::set`, `std::multiset`, `std::map`, and `std::multimap`
430 | // 2) `std::pair` (which is not actually a container). See below. |
431 | // |
432 | // Example: |
433 | // |
434 | // // The results are returned as `absl::string_view` objects. Note that we |
435 | // // have to ensure that the input string outlives any results. |
436 | // std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ','); |
437 | // |
438 | // // Stores results in a std::set<std::string>, which also performs |
439 | // // de-duplication and orders the elements in ascending order. |
440 | // std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ','); |
// // a contains "a", "b", "c" (in that order)
442 | // |
443 | // // `StrSplit()` can be used within a range-based for loop, in which case |
444 | // // each element will be of type `absl::string_view`. |
445 | // std::vector<std::string> v; |
446 | // for (const auto sv : absl::StrSplit("a,b,c", ',')) { |
447 | // if (sv != "b") v.emplace_back(sv); |
448 | // } |
449 | // // v[0] == "a", v[1] == "c" |
450 | // |
451 | // // Stores results in a map. The map implementation assumes that the input |
452 | // // is provided as a series of key/value pairs. For example, the 0th element |
453 | // // resulting from the split will be stored as a key to the 1st element. If |
454 | // // an odd number of elements are resolved, the last element is paired with |
455 | // // a default-constructed value (e.g., empty string). |
456 | // std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ','); |
457 | // // m["a"] == "b", m["c"] == "" // last component value equals "" |
458 | // |
459 | // Splitting to `std::pair` is an interesting case because it can hold only two |
460 | // elements and is not a collection type. When splitting to a `std::pair` the |
461 | // first two split strings become the `std::pair` `.first` and `.second` |
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
465 | // |
466 | // Example: |
467 | // |
468 | // // Stores first two split strings as the members in a std::pair. |
469 | // std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ','); |
470 | // // p.first == "a", p.second == "b" // "c" is omitted. |
471 | // |
472 | // The `StrSplit()` function can be used multiple times to perform more |
473 | // complicated splitting logic, such as intelligently parsing key-value pairs. |
474 | // |
475 | // Example: |
476 | // |
477 | // // The input string "a=b=c,d=e,f=,g" becomes |
478 | // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } |
479 | // std::map<std::string, std::string> m; |
480 | // for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) { |
481 | // m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1))); |
482 | // } |
483 | // EXPECT_EQ("b=c", m.find("a")->second); |
484 | // EXPECT_EQ("e", m.find("d")->second); |
485 | // EXPECT_EQ("", m.find("f")->second); |
486 | // EXPECT_EQ("", m.find("g")->second); |
487 | // |
488 | // WARNING: Due to a legacy bug that is maintained for backward compatibility, |
489 | // splitting the following empty string_views produces different results: |
490 | // |
491 | // absl::StrSplit(absl::string_view(""), '-'); // {""} |
492 | // absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""} |
493 | // |
494 | // Try not to depend on this distinction because the bug may one day be fixed. |
495 | template <typename Delimiter> |
496 | strings_internal::Splitter< |
497 | typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty, |
498 | absl::string_view> |
499 | StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) { |
500 | using DelimiterType = |
501 | typename strings_internal::SelectDelimiter<Delimiter>::type; |
502 | return strings_internal::Splitter<DelimiterType, AllowEmpty, |
503 | absl::string_view>( |
504 | text.value(), DelimiterType(d), AllowEmpty()); |
505 | } |
506 | |
507 | template <typename Delimiter, typename StringType, |
508 | EnableSplitIfString<StringType> = 0> |
509 | strings_internal::Splitter< |
510 | typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty, |
511 | std::string> |
512 | StrSplit(StringType&& text, Delimiter d) { |
513 | using DelimiterType = |
514 | typename strings_internal::SelectDelimiter<Delimiter>::type; |
515 | return strings_internal::Splitter<DelimiterType, AllowEmpty, std::string>( |
516 | std::move(text), DelimiterType(d), AllowEmpty()); |
517 | } |
518 | |
519 | template <typename Delimiter, typename Predicate> |
520 | strings_internal::Splitter< |
521 | typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate, |
522 | absl::string_view> |
523 | StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d, |
524 | Predicate p) { |
525 | using DelimiterType = |
526 | typename strings_internal::SelectDelimiter<Delimiter>::type; |
527 | return strings_internal::Splitter<DelimiterType, Predicate, |
528 | absl::string_view>( |
529 | text.value(), DelimiterType(d), std::move(p)); |
530 | } |
531 | |
532 | template <typename Delimiter, typename Predicate, typename StringType, |
533 | EnableSplitIfString<StringType> = 0> |
534 | strings_internal::Splitter< |
535 | typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate, |
536 | std::string> |
537 | StrSplit(StringType&& text, Delimiter d, Predicate p) { |
538 | using DelimiterType = |
539 | typename strings_internal::SelectDelimiter<Delimiter>::type; |
540 | return strings_internal::Splitter<DelimiterType, Predicate, std::string>( |
541 | std::move(text), DelimiterType(d), std::move(p)); |
542 | } |
543 | |
544 | ABSL_NAMESPACE_END |
545 | } // namespace absl |
546 | |
547 | #endif // ABSL_STRINGS_STR_SPLIT_H_ |
548 | |