1/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15#ifndef TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
16#define TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
17
18#include "tensorflow/core/lib/core/status.h"
19#include "tensorflow/core/lib/core/stringpiece.h"
20
21namespace tensorflow {
22
// Unicode encodings selectable by ops such as tf.strings.unicode_encode and
// tf.strings.unicode_decode.
enum class UnicodeEncoding {
  UTF8,
  UTF16BE,
  UTF32BE,
};
26
// Character units. Used by string ops such as tf.strings.length and tf.substr
// to select the unit that is being counted.
// TODO(edloper): Add support for: UTF32_CHAR, etc.
enum class CharUnit {
  BYTE,
  UTF8_CHAR,
};
31
// Reports whether `x` is a trailing (continuation) byte of a multi-byte UTF-8
// character, i.e. a byte whose two high bits are `10` (0x80 through 0xBF).
// ASCII bytes and UTF-8 lead bytes yield false.
inline bool IsTrailByte(char x) {
  const unsigned char byte = static_cast<unsigned char>(x);
  return (byte & 0xC0) == 0x80;
}
34
// Sets `*encoding` based on `str` and reports the outcome through the returned
// Status. Only declared here; the definition is outside this header.
// NOTE(review): presumably `str` names one of the UnicodeEncoding enumerators
// and an unrecognized name yields a non-OK Status -- confirm against the
// definition.
Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding);
37
// Sets `*unit` based on `str` and reports the outcome through the returned
// Status. Only declared here; the definition is outside this header.
// NOTE(review): presumably `str` names one of the CharUnit enumerators and an
// unrecognized name yields a non-OK Status -- confirm against the definition.
Status ParseCharUnit(const string& str, CharUnit* unit);
40
// Returns the number of Unicode characters in the UTF-8 string `str`.
// The result may be incorrect if `str` is not valid UTF-8; the return type
// carries no error status, so malformed input is not reported to the caller.
int32 UTF8StrLen(const string& str);
44
45// Get the next UTF8 character position starting at the given position and
46// skipping the given number of characters. Position is a byte offset, and
47// should never be `null`. The function return true if successful. However, if
48// the end of the string is reached before the requested characters, then the
49// position will point to the end of string and this function will return false.
50template <typename T>
51bool ForwardNUTF8CharPositions(const StringPiece in,
52 const T num_utf8_chars_to_shift, T* pos) {
53 const size_t size = in.size();
54 T utf8_chars_counted = 0;
55 while (utf8_chars_counted < num_utf8_chars_to_shift && *pos < size) {
56 // move forward one utf-8 character
57 do {
58 ++*pos;
59 } while (IsTrailByte(in[*pos]) && *pos < size);
60 ++utf8_chars_counted;
61 }
62 return utf8_chars_counted == num_utf8_chars_to_shift;
63}
64
65// Get the previous UTF8 character position starting at the given position and
66// skipping the given number of characters. Position is a byte offset with a
67// positive value, relative to the beginning of the string, and should never be
68// `null`. The function return true if successful. However, if the beginning of
69// the string is reached before the requested character, then the position will
70// point to the beginning of the string and this function will return false.
71template <typename T>
72bool BackNUTF8CharPositions(const StringPiece in,
73 const T num_utf8_chars_to_shift, T* pos) {
74 const size_t start = 0;
75 T utf8_chars_counted = 0;
76 while (utf8_chars_counted < num_utf8_chars_to_shift && (*pos > start)) {
77 // move back one utf-8 character
78 do {
79 --*pos;
80 } while (IsTrailByte(in[*pos]) && *pos > start);
81 ++utf8_chars_counted;
82 }
83 return utf8_chars_counted == num_utf8_chars_to_shift;
84}
85
86} // namespace tensorflow
87
88#endif // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
89