1 | /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #ifndef TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ |
16 | #define TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ |
17 | |
18 | #include "tensorflow/core/lib/core/status.h" |
19 | #include "tensorflow/core/lib/core/stringpiece.h" |
20 | |
21 | namespace tensorflow { |
22 | |
// The set of Unicode encodings understood by string ops such as
// tf.strings.unicode_encode and tf.strings.unicode_decode.
enum class UnicodeEncoding {
  UTF8,
  UTF16BE,
  UTF32BE,
};
26 | |
// The unit in which string lengths and offsets are measured. Used by string
// ops such as tf.strings.length and tf.substr.
// TODO(edloper): Add support for: UTF32_CHAR, etc.
enum class CharUnit {
  BYTE,
  UTF8_CHAR,
};
31 | |
// Returns true when `x` is a trail (continuation) byte, i.e. its two most
// significant bits are `10` (byte values 0x80 through 0xBF). Any other byte
// value yields false.
inline bool IsTrailByte(char x) {
  const unsigned char top_two_bits = static_cast<unsigned char>(x) & 0xC0;
  return top_two_bits == 0x80;
}
34 | |
// Sets `*encoding` to the UnicodeEncoding value selected by `str`.
//
// `encoding` is an output parameter; the result of the operation is reported
// through the returned Status.
// NOTE(review): the definition is not in this header, so the accepted
// spellings of `str` and the exact error returned for an unrecognized value
// should be confirmed against the implementation file.
Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding);
37 | |
// Sets `*unit` to the CharUnit value selected by `str`.
//
// `unit` is an output parameter; the result of the operation is reported
// through the returned Status.
// NOTE(review): the definition is not in this header, so the accepted
// spellings of `str` and the exact error returned for an unrecognized value
// should be confirmed against the implementation file.
Status ParseCharUnit(const string& str, CharUnit* unit);
40 | |
// Returns the number of Unicode characters in a UTF-8 string.
// Result may be incorrect if the input string is not valid UTF-8.
// NOTE(review): the count is returned as an int32; behavior for strings whose
// character count does not fit in an int32 is not visible from this header.
int32 UTF8StrLen(const string& str);
44 | |
45 | // Get the next UTF8 character position starting at the given position and |
46 | // skipping the given number of characters. Position is a byte offset, and |
47 | // should never be `null`. The function return true if successful. However, if |
48 | // the end of the string is reached before the requested characters, then the |
49 | // position will point to the end of string and this function will return false. |
50 | template <typename T> |
51 | bool ForwardNUTF8CharPositions(const StringPiece in, |
52 | const T num_utf8_chars_to_shift, T* pos) { |
53 | const size_t size = in.size(); |
54 | T utf8_chars_counted = 0; |
55 | while (utf8_chars_counted < num_utf8_chars_to_shift && *pos < size) { |
56 | // move forward one utf-8 character |
57 | do { |
58 | ++*pos; |
59 | } while (IsTrailByte(in[*pos]) && *pos < size); |
60 | ++utf8_chars_counted; |
61 | } |
62 | return utf8_chars_counted == num_utf8_chars_to_shift; |
63 | } |
64 | |
65 | // Get the previous UTF8 character position starting at the given position and |
66 | // skipping the given number of characters. Position is a byte offset with a |
67 | // positive value, relative to the beginning of the string, and should never be |
68 | // `null`. The function return true if successful. However, if the beginning of |
69 | // the string is reached before the requested character, then the position will |
70 | // point to the beginning of the string and this function will return false. |
71 | template <typename T> |
72 | bool BackNUTF8CharPositions(const StringPiece in, |
73 | const T num_utf8_chars_to_shift, T* pos) { |
74 | const size_t start = 0; |
75 | T utf8_chars_counted = 0; |
76 | while (utf8_chars_counted < num_utf8_chars_to_shift && (*pos > start)) { |
77 | // move back one utf-8 character |
78 | do { |
79 | --*pos; |
80 | } while (IsTrailByte(in[*pos]) && *pos > start); |
81 | ++utf8_chars_counted; |
82 | } |
83 | return utf8_chars_counted == num_utf8_chars_to_shift; |
84 | } |
85 | |
86 | } // namespace tensorflow |
87 | |
88 | #endif // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_ |
89 | |