1//
2// Copyright 2017 The Abseil Authors.
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// https://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// -----------------------------------------------------------------------------
17// File: str_split.h
18// -----------------------------------------------------------------------------
19//
20// This file contains functions for splitting strings. It defines the main
21// `StrSplit()` function, several delimiters for determining the boundaries on
22// which to split the string, and predicates for filtering delimited results.
23// `StrSplit()` adapts the returned collection to the type specified by the
24// caller.
25//
26// Example:
27//
28// // Splits the given string on commas. Returns the results in a
29// // vector of strings.
30// std::vector<std::string> v = absl::StrSplit("a,b,c", ',');
31// // Can also use ","
32// // v[0] == "a", v[1] == "b", v[2] == "c"
33//
34// See StrSplit() below for more information.
35#ifndef ABSL_STRINGS_STR_SPLIT_H_
36#define ABSL_STRINGS_STR_SPLIT_H_
37
#include <algorithm>
#include <cstddef>
#include <map>
#include <set>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include "absl/base/internal/raw_logging.h"
#include "absl/base/macros.h"
#include "absl/strings/internal/str_split_internal.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
51
52namespace absl {
53ABSL_NAMESPACE_BEGIN
54
55//------------------------------------------------------------------------------
56// Delimiters
57//------------------------------------------------------------------------------
58//
59// `StrSplit()` uses delimiters to define the boundaries between elements in the
60// provided input. Several `Delimiter` types are defined below. If a string
61// (`const char*`, `std::string`, or `absl::string_view`) is passed in place of
62// an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it
63// were passed a `ByString` delimiter.
64//
65// A `Delimiter` is an object with a `Find()` function that knows how to find
66// the first occurrence of itself in a given `absl::string_view`.
67//
68// The following `Delimiter` types are available for use within `StrSplit()`:
69//
70// - `ByString` (default for string arguments)
71// - `ByChar` (default for a char argument)
72// - `ByAnyChar`
73// - `ByLength`
74// - `MaxSplits`
75//
76// A Delimiter's `Find()` member function will be passed an input `text` that is
77// to be split and a position (`pos`) to begin searching for the next delimiter
78// in `text`. The returned absl::string_view should refer to the next occurrence
79// (after `pos`) of the represented delimiter; this returned absl::string_view
80// represents the next location where the input `text` should be broken.
81//
82// The returned absl::string_view may be zero-length if the Delimiter does not
83// represent a part of the string (e.g., a fixed-length delimiter). If no
84// delimiter is found in the input `text`, a zero-length absl::string_view
85// referring to `text.end()` should be returned (e.g.,
86// `text.substr(text.size())`). It is important that the returned
87// absl::string_view always be within the bounds of the input `text` given as an
88// argument--it must not refer to a string that is physically located outside of
89// the given string.
90//
91// The following example is a simple Delimiter object that is created with a
92// single char and will look for that char in the text passed to the `Find()`
93// function:
94//
95// struct SimpleDelimiter {
96// const char c_;
97// explicit SimpleDelimiter(char c) : c_(c) {}
98// absl::string_view Find(absl::string_view text, size_t pos) {
99// auto found = text.find(c_, pos);
100// if (found == absl::string_view::npos)
101// return text.substr(text.size());
102//
103// return text.substr(found, 1);
104// }
105// };
106
107// ByString
108//
109// A sub-string delimiter. If `StrSplit()` is passed a string in place of a
110// `Delimiter` object, the string will be implicitly converted into a
111// `ByString` delimiter.
112//
113// Example:
114//
115// // Because a string literal is converted to an `absl::ByString`,
116// // the following two splits are equivalent.
117//
118// std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", ");
119//
120// using absl::ByString;
121// std::vector<std::string> v2 = absl::StrSplit("a, b, c",
122// ByString(", "));
123// // v[0] == "a", v[1] == "b", v[2] == "c"
124class ByString {
125 public:
126 explicit ByString(absl::string_view sp);
127 absl::string_view Find(absl::string_view text, size_t pos) const;
128
129 private:
130 const std::string delimiter_;
131};
132
133// ByChar
134//
135// A single character delimiter. `ByChar` is functionally equivalent to a
136// 1-char string within a `ByString` delimiter, but slightly more efficient.
137//
138// Example:
139//
140// // Because a char literal is converted to a absl::ByChar,
141// // the following two splits are equivalent.
142// std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
143// using absl::ByChar;
144// std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
145// // v[0] == "a", v[1] == "b", v[2] == "c"
146//
147// `ByChar` is also the default delimiter if a single character is given
148// as the delimiter to `StrSplit()`. For example, the following calls are
149// equivalent:
150//
151// std::vector<std::string> v = absl::StrSplit("a-b", '-');
152//
153// using absl::ByChar;
154// std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-'));
155//
156class ByChar {
157 public:
158 explicit ByChar(char c) : c_(c) {}
159 absl::string_view Find(absl::string_view text, size_t pos) const;
160
161 private:
162 char c_;
163};
164
165// ByAnyChar
166//
167// A delimiter that will match any of the given byte-sized characters within
168// its provided string.
169//
170// Note: this delimiter works with single-byte string data, but does not work
171// with variable-width encodings, such as UTF-8.
172//
173// Example:
174//
175// using absl::ByAnyChar;
176// std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
177// // v[0] == "a", v[1] == "b", v[2] == "c"
178//
179// If `ByAnyChar` is given the empty string, it behaves exactly like
180// `ByString` and matches each individual character in the input string.
181//
182class ByAnyChar {
183 public:
184 explicit ByAnyChar(absl::string_view sp);
185 absl::string_view Find(absl::string_view text, size_t pos) const;
186
187 private:
188 const std::string delimiters_;
189};
190
191// ByLength
192//
193// A delimiter for splitting into equal-length strings. The length argument to
194// the constructor must be greater than 0.
195//
196// Note: this delimiter works with single-byte string data, but does not work
197// with variable-width encodings, such as UTF-8.
198//
199// Example:
200//
201// using absl::ByLength;
202// std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3));
203
204// // v[0] == "123", v[1] == "456", v[2] == "789"
205//
206// Note that the string does not have to be a multiple of the fixed split
207// length. In such a case, the last substring will be shorter.
208//
209// using absl::ByLength;
210// std::vector<std::string> v = absl::StrSplit("12345", ByLength(2));
211//
212// // v[0] == "12", v[1] == "34", v[2] == "5"
213class ByLength {
214 public:
215 explicit ByLength(ptrdiff_t length);
216 absl::string_view Find(absl::string_view text, size_t pos) const;
217
218 private:
219 const ptrdiff_t length_;
220};
221
222namespace strings_internal {
223
224// A traits-like metafunction for selecting the default Delimiter object type
225// for a particular Delimiter type. The base case simply exposes type Delimiter
226// itself as the delimiter's Type. However, there are specializations for
227// string-like objects that map them to the ByString delimiter object.
228// This allows functions like absl::StrSplit() and absl::MaxSplits() to accept
229// string-like objects (e.g., ',') as delimiter arguments but they will be
230// treated as if a ByString delimiter was given.
231template <typename Delimiter>
232struct SelectDelimiter {
233 using type = Delimiter;
234};
235
236template <>
237struct SelectDelimiter<char> {
238 using type = ByChar;
239};
240template <>
241struct SelectDelimiter<char*> {
242 using type = ByString;
243};
244template <>
245struct SelectDelimiter<const char*> {
246 using type = ByString;
247};
248template <>
249struct SelectDelimiter<absl::string_view> {
250 using type = ByString;
251};
252template <>
253struct SelectDelimiter<std::string> {
254 using type = ByString;
255};
256
257// Wraps another delimiter and sets a max number of matches for that delimiter.
258template <typename Delimiter>
259class MaxSplitsImpl {
260 public:
261 MaxSplitsImpl(Delimiter delimiter, int limit)
262 : delimiter_(delimiter), limit_(limit), count_(0) {}
263 absl::string_view Find(absl::string_view text, size_t pos) {
264 if (count_++ == limit_) {
265 return absl::string_view(text.data() + text.size(),
266 0); // No more matches.
267 }
268 return delimiter_.Find(text, pos);
269 }
270
271 private:
272 Delimiter delimiter_;
273 const int limit_;
274 int count_;
275};
276
277} // namespace strings_internal
278
279// MaxSplits()
280//
281// A delimiter that limits the number of matches which can occur to the passed
282// `limit`. The last element in the returned collection will contain all
283// remaining unsplit pieces, which may contain instances of the delimiter.
284// The collection will contain at most `limit` + 1 elements.
285// Example:
286//
287// using absl::MaxSplits;
288// std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1));
289//
290// // v[0] == "a", v[1] == "b,c"
291template <typename Delimiter>
292inline strings_internal::MaxSplitsImpl<
293 typename strings_internal::SelectDelimiter<Delimiter>::type>
294MaxSplits(Delimiter delimiter, int limit) {
295 typedef
296 typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType;
297 return strings_internal::MaxSplitsImpl<DelimiterType>(
298 DelimiterType(delimiter), limit);
299}
300
301//------------------------------------------------------------------------------
302// Predicates
303//------------------------------------------------------------------------------
304//
305// Predicates filter the results of a `StrSplit()` by determining whether or not
306// a resultant element is included in the result set. A predicate may be passed
307// as an optional third argument to the `StrSplit()` function.
308//
309// Predicates are unary functions (or functors) that take a single
310// `absl::string_view` argument and return a bool indicating whether the
311// argument should be included (`true`) or excluded (`false`).
312//
313// Predicates are useful when filtering out empty substrings. By default, empty
314// substrings may be returned by `StrSplit()`, which is similar to the way split
315// functions work in other programming languages.
316
317// AllowEmpty()
318//
319// Always returns `true`, indicating that all strings--including empty
320// strings--should be included in the split output. This predicate is not
321// strictly needed because this is the default behavior of `StrSplit()`;
322// however, it might be useful at some call sites to make the intent explicit.
323//
324// Example:
325//
326// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty());
327//
328// // v[0] == " a ", v[1] == " ", v[2] == "", v[3] = "b", v[4] == ""
329struct AllowEmpty {
330 bool operator()(absl::string_view) const { return true; }
331};
332
333// SkipEmpty()
334//
335// Returns `false` if the given `absl::string_view` is empty, indicating that
336// `StrSplit()` should omit the empty string.
337//
338// Example:
339//
340// std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty());
341//
342// // v[0] == "a", v[1] == "b"
343//
344// Note: `SkipEmpty()` does not consider a string containing only whitespace
345// to be empty. To skip such whitespace as well, use the `SkipWhitespace()`
346// predicate.
347struct SkipEmpty {
348 bool operator()(absl::string_view sp) const { return !sp.empty(); }
349};
350
351// SkipWhitespace()
352//
353// Returns `false` if the given `absl::string_view` is empty *or* contains only
354// whitespace, indicating that `StrSplit()` should omit the string.
355//
356// Example:
357//
358// std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
359// ',', SkipWhitespace());
360// // v[0] == " a ", v[1] == "b"
361//
362// // SkipEmpty() would return whitespace elements
363// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty());
364// // v[0] == " a ", v[1] == " ", v[2] == "b"
365struct SkipWhitespace {
366 bool operator()(absl::string_view sp) const {
367 sp = absl::StripAsciiWhitespace(sp);
368 return !sp.empty();
369 }
370};
371
// EnableSplitIfString
//
// SFINAE helper that names the type `int` when `T` is exactly `std::string` or
// `const std::string`, and is ill-formed for every other `T` (including
// reference types such as `std::string&`). The `StrSplit()` overloads that
// take `StringType&&` use it as a defaulted non-type template parameter, which
// restricts them to temporary strings: for an lvalue argument `StringType`
// would be deduced as a reference type and the overload drops out.
template <typename T>
using EnableSplitIfString =
    typename std::enable_if<std::is_same<T, std::string>::value ||
                                std::is_same<T, const std::string>::value,
                            int>::type;
377
378//------------------------------------------------------------------------------
379// StrSplit()
380//------------------------------------------------------------------------------
381
382// StrSplit()
383//
384// Splits a given string based on the provided `Delimiter` object, returning the
385// elements within the type specified by the caller. Optionally, you may pass a
386// `Predicate` to `StrSplit()` indicating whether to include or exclude the
387// resulting element within the final result set. (See the overviews for
388// Delimiters and Predicates above.)
389//
390// Example:
391//
392// std::vector<std::string> v = absl::StrSplit("a,b,c,d", ',');
393// // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d"
394//
395// You can also provide an explicit `Delimiter` object:
396//
397// Example:
398//
399// using absl::ByAnyChar;
400// std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
401// // v[0] == "a", v[1] == "b", v[2] == "c"
402//
403// See above for more information on delimiters.
404//
405// By default, empty strings are included in the result set. You can optionally
406// include a third `Predicate` argument to apply a test for whether the
407// resultant element should be included in the result set:
408//
409// Example:
410//
411// std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
412// ',', SkipWhitespace());
413// // v[0] == " a ", v[1] == "b"
414//
415// See above for more information on predicates.
416//
417//------------------------------------------------------------------------------
418// StrSplit() Return Types
419//------------------------------------------------------------------------------
420//
421// The `StrSplit()` function adapts the returned collection to the collection
422// specified by the caller (e.g. `std::vector` above). The returned collections
423// may contain `std::string`, `absl::string_view` (in which case the original
424// string being split must ensure that it outlives the collection), or any
425// object that can be explicitly created from an `absl::string_view`. This
426// behavior works for:
427//
// 1) All standard STL containers including `std::vector`, `std::list`,
//    `std::deque`, `std::set`, `std::multiset`, `std::map`, and
//    `std::multimap`
// 2) `std::pair` (which is not actually a container). See below.
431//
432// Example:
433//
434// // The results are returned as `absl::string_view` objects. Note that we
435// // have to ensure that the input string outlives any results.
436// std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
437//
438// // Stores results in a std::set<std::string>, which also performs
439// // de-duplication and orders the elements in ascending order.
440// std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
//   // a contains "a", "b", "c" (in that order)
442//
443// // `StrSplit()` can be used within a range-based for loop, in which case
444// // each element will be of type `absl::string_view`.
445// std::vector<std::string> v;
446// for (const auto sv : absl::StrSplit("a,b,c", ',')) {
447// if (sv != "b") v.emplace_back(sv);
448// }
449// // v[0] == "a", v[1] == "c"
450//
451// // Stores results in a map. The map implementation assumes that the input
452// // is provided as a series of key/value pairs. For example, the 0th element
453// // resulting from the split will be stored as a key to the 1st element. If
454// // an odd number of elements are resolved, the last element is paired with
455// // a default-constructed value (e.g., empty string).
456// std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
457// // m["a"] == "b", m["c"] == "" // last component value equals ""
458//
459// Splitting to `std::pair` is an interesting case because it can hold only two
460// elements and is not a collection type. When splitting to a `std::pair` the
461// first two split strings become the `std::pair` `.first` and `.second`
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
465//
466// Example:
467//
468// // Stores first two split strings as the members in a std::pair.
469// std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
470// // p.first == "a", p.second == "b" // "c" is omitted.
471//
472// The `StrSplit()` function can be used multiple times to perform more
473// complicated splitting logic, such as intelligently parsing key-value pairs.
474//
475// Example:
476//
477// // The input string "a=b=c,d=e,f=,g" becomes
478// // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" }
479// std::map<std::string, std::string> m;
480// for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) {
481// m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1)));
482// }
483// EXPECT_EQ("b=c", m.find("a")->second);
484// EXPECT_EQ("e", m.find("d")->second);
485// EXPECT_EQ("", m.find("f")->second);
486// EXPECT_EQ("", m.find("g")->second);
487//
488// WARNING: Due to a legacy bug that is maintained for backward compatibility,
489// splitting the following empty string_views produces different results:
490//
491// absl::StrSplit(absl::string_view(""), '-'); // {""}
492// absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""}
493//
494// Try not to depend on this distinction because the bug may one day be fixed.
495template <typename Delimiter>
496strings_internal::Splitter<
497 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty,
498 absl::string_view>
499StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) {
500 using DelimiterType =
501 typename strings_internal::SelectDelimiter<Delimiter>::type;
502 return strings_internal::Splitter<DelimiterType, AllowEmpty,
503 absl::string_view>(
504 text.value(), DelimiterType(d), AllowEmpty());
505}
506
507template <typename Delimiter, typename StringType,
508 EnableSplitIfString<StringType> = 0>
509strings_internal::Splitter<
510 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty,
511 std::string>
512StrSplit(StringType&& text, Delimiter d) {
513 using DelimiterType =
514 typename strings_internal::SelectDelimiter<Delimiter>::type;
515 return strings_internal::Splitter<DelimiterType, AllowEmpty, std::string>(
516 std::move(text), DelimiterType(d), AllowEmpty());
517}
518
519template <typename Delimiter, typename Predicate>
520strings_internal::Splitter<
521 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate,
522 absl::string_view>
523StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d,
524 Predicate p) {
525 using DelimiterType =
526 typename strings_internal::SelectDelimiter<Delimiter>::type;
527 return strings_internal::Splitter<DelimiterType, Predicate,
528 absl::string_view>(
529 text.value(), DelimiterType(d), std::move(p));
530}
531
532template <typename Delimiter, typename Predicate, typename StringType,
533 EnableSplitIfString<StringType> = 0>
534strings_internal::Splitter<
535 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate,
536 std::string>
537StrSplit(StringType&& text, Delimiter d, Predicate p) {
538 using DelimiterType =
539 typename strings_internal::SelectDelimiter<Delimiter>::type;
540 return strings_internal::Splitter<DelimiterType, Predicate, std::string>(
541 std::move(text), DelimiterType(d), std::move(p));
542}
543
544ABSL_NAMESPACE_END
545} // namespace absl
546
547#endif // ABSL_STRINGS_STR_SPLIT_H_
548