1 | /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #include "tensorflow/core/kernels/string_util.h" |
16 | |
17 | #include "tensorflow/core/lib/core/errors.h" |
18 | |
19 | namespace tensorflow { |
20 | |
21 | // Sets unit value based on str. |
22 | Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding) { |
23 | if (str == "UTF-8" ) { |
24 | *encoding = UnicodeEncoding::UTF8; |
25 | } else if (str == "UTF-16-BE" ) { |
26 | *encoding = UnicodeEncoding::UTF16BE; |
27 | } else if (str == "UTF-32-BE" ) { |
28 | *encoding = UnicodeEncoding::UTF32BE; |
29 | } else { |
30 | return errors::InvalidArgument( |
31 | strings::StrCat("Invalid encoding \"" , str, |
32 | "\": Should be one of: UTF-8, UTF-16-BE, UTF-32-BE" )); |
33 | } |
34 | return OkStatus(); |
35 | } |
36 | |
37 | // Sets unit value based on str. |
38 | Status ParseCharUnit(const string& str, CharUnit* unit) { |
39 | if (str == "BYTE" ) { |
40 | *unit = CharUnit::BYTE; |
41 | } else if (str == "UTF8_CHAR" ) { |
42 | *unit = CharUnit::UTF8_CHAR; |
43 | } else { |
44 | return errors::InvalidArgument(strings::StrCat( |
45 | "Invalid unit \"" , str, "\": Should be one of: BYTE, UTF8_CHAR" )); |
46 | } |
47 | return OkStatus(); |
48 | } |
49 | |
50 | // Return the number of Unicode characters in a UTF-8 string. |
51 | // Result may be incorrect if the input string is not valid UTF-8. |
52 | int32 UTF8StrLen(const string& str) { |
53 | const int32_t byte_size = str.size(); |
54 | const char* const end = str.data() + byte_size; |
55 | const char* ptr = str.data(); |
56 | int32_t skipped_count = 0; |
57 | while (ptr < end) { |
58 | skipped_count += IsTrailByte(*ptr++) ? 1 : 0; |
59 | } |
60 | const int32_t result = byte_size - skipped_count; |
61 | return result; |
62 | } |
63 | |
64 | } // namespace tensorflow |
65 | |