1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ |
17 | #define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ |
18 | |
19 | #include <string> |
20 | #include <unordered_map> |
21 | #include <vector> |
22 | |
23 | #include "tensorflow/core/example/example.pb.h" |
24 | #include "tensorflow/core/framework/allocator.h" |
25 | #include "tensorflow/core/framework/graph.pb.h" |
26 | #include "tensorflow/core/framework/op_kernel.h" |
27 | #include "tensorflow/core/framework/partial_tensor_shape.h" |
28 | #include "tensorflow/core/framework/tensor.h" |
29 | #include "tensorflow/core/framework/types.h" |
30 | #include "tensorflow/core/lib/gtl/array_slice.h" |
31 | #include "tensorflow/core/platform/types.h" |
32 | #include "tensorflow/core/util/sparse/sparse_tensor.h" |
33 | |
34 | namespace tensorflow { |
35 | namespace example { |
36 | |
37 | // FastParseExampleConfig defines how to parse features in Example. |
38 | // Each sub-config is responsible for one feature identified with feature_name. |
39 | // FastParseExampleConfig can't have two sub-configs with the same feature_name. |
40 | // dtype identifies the type of output vector and the kind of Feature expected |
41 | // in Example. |
42 | struct FastParseExampleConfig { |
43 | struct Dense { |
44 | Dense(StringPiece feature_name, DataType dtype, PartialTensorShape shape, |
45 | Tensor default_value, bool variable_length, |
46 | std::size_t elements_per_stride) |
47 | : feature_name(feature_name), // TODO(mrry): Switch to preallocated |
48 | // tstring when this is available. |
49 | dtype(dtype), |
50 | shape(std::move(shape)), |
51 | default_value(std::move(default_value)), |
52 | variable_length(variable_length), |
53 | elements_per_stride(elements_per_stride) {} |
54 | Dense() = default; |
55 | |
56 | tstring feature_name; |
57 | DataType dtype; |
58 | // These 2 fields correspond exactly to dense_shapes and dense_defaults in |
59 | // ParseExample op. |
60 | // Documentation is available in: tensorflow/core/ops/parsing_ops.cc |
61 | PartialTensorShape shape; |
62 | Tensor default_value; |
63 | bool variable_length; |
64 | std::size_t elements_per_stride; |
65 | }; |
66 | |
67 | struct Sparse { |
68 | Sparse(StringPiece feature_name, DataType dtype) |
69 | : feature_name(feature_name), // TODO(mrry): Switch to preallocated |
70 | // tstring when this is available. |
71 | dtype(dtype) {} |
72 | Sparse() = default; |
73 | |
74 | tstring feature_name; |
75 | DataType dtype; |
76 | }; |
77 | |
78 | struct Ragged { |
79 | Ragged(StringPiece feature_name, DataType dtype, DataType splits_dtype) |
80 | : feature_name(feature_name), // TODO(mrry): Switch to preallocated |
81 | // tstring when this is available. |
82 | dtype(dtype), |
83 | splits_dtype(splits_dtype) {} |
84 | Ragged() = default; |
85 | |
86 | tstring feature_name; |
87 | DataType dtype; |
88 | DataType splits_dtype; |
89 | }; |
90 | |
91 | std::vector<Dense> dense; |
92 | std::vector<Sparse> sparse; |
93 | std::vector<Ragged> ragged; |
94 | |
95 | // If `true`, `Result::feature_stats` will contain one |
96 | // `PerExampleFeatureStats` for each serialized example in the input. |
97 | bool collect_feature_stats = false; |
98 | }; |
99 | |
100 | // Statistics about the features in each example passed to |
101 | // `FastParse[Single]Example()`. |
102 | // |
103 | // TODO(b/111553342): The gathered statistics currently have two limitations: |
104 | // * Feature names that appear more than once will be counted multiple times. |
105 | // * The feature values count only represents the counts for features that were |
106 | // requested in the `FastParseExampleConfig`. |
107 | // These could be addressed with additional work at runtime. |
struct PerExampleFeatureStats {
  // How many feature names the example contains.
  size_t features_count{0};

  // Total number of values, summed over every feature that is parsed.
  size_t feature_values_count{0};
};
115 | |
116 | // This is exactly the output of TF's ParseExample Op. |
117 | // Documentation is available in: tensorflow/core/ops/parsing_ops.cc |
118 | struct Result { |
119 | std::vector<Tensor> sparse_indices; |
120 | std::vector<Tensor> sparse_values; |
121 | std::vector<Tensor> sparse_shapes; |
122 | std::vector<Tensor> dense_values; |
123 | std::vector<Tensor> ragged_values; |
124 | std::vector<Tensor> ragged_splits; |
125 | std::vector<Tensor> ragged_outer_splits; // For SequenceExamples |
126 | |
127 | // This vector will be populated with one element per example if |
128 | // `FastParseExampleConfig::collect_feature_stats` is set to `true`. |
129 | std::vector<PerExampleFeatureStats> feature_stats; |
130 | }; |
131 | |
132 | // Parses a batch of serialized Example protos and converts them into result |
133 | // according to given config. |
134 | // Given example names have to either be empty or the same size as serialized. |
135 | // example_names are used only for error messages. |
136 | Status FastParseExample(const FastParseExampleConfig& config, |
137 | gtl::ArraySlice<tstring> serialized, |
138 | gtl::ArraySlice<tstring> example_names, |
139 | thread::ThreadPool* thread_pool, Result* result); |
140 | |
141 | // TODO(mrry): Move the hash table construction into the config object. |
142 | typedef FastParseExampleConfig FastParseSingleExampleConfig; |
143 | |
144 | Status FastParseSingleExample(const FastParseSingleExampleConfig& config, |
145 | StringPiece serialized, Result* result); |
146 | |
147 | // Parses a batch of serialized SequenceExample protos and converts them into |
148 | // result according to given config. |
149 | // Given example names have to either be empty or the same size as serialized. |
150 | // example_names are used only for error messages. |
// (If is_batch=false, then this parses a single SequenceExample.)
152 | Status FastParseSequenceExample( |
153 | const example::FastParseExampleConfig& context_config, |
154 | const example::FastParseExampleConfig& feature_list_config, |
155 | gtl::ArraySlice<tstring> serialized, gtl::ArraySlice<tstring> example_names, |
156 | thread::ThreadPool* thread_pool, example::Result* context_result, |
157 | example::Result* feature_list_result, |
158 | std::vector<Tensor>* dense_feature_lengths, bool is_batch = true); |
159 | |
160 | // This function parses serialized Example and populates given example. |
161 | // It uses the same specialized parser as FastParseExample which is efficient. |
162 | // But then constructs Example which is relatively slow. |
163 | // It is exported here as a convenient API to test parser part separately. |
164 | bool TestFastParse(const string& serialized, Example* example); |
165 | |
166 | } // namespace example |
167 | } // namespace tensorflow |
168 | |
169 | #endif // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ |
170 | |