1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
17#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
18
19#include <string>
20#include <unordered_map>
21#include <vector>
22
23#include "tensorflow/core/example/example.pb.h"
24#include "tensorflow/core/framework/allocator.h"
25#include "tensorflow/core/framework/graph.pb.h"
26#include "tensorflow/core/framework/op_kernel.h"
27#include "tensorflow/core/framework/partial_tensor_shape.h"
28#include "tensorflow/core/framework/tensor.h"
29#include "tensorflow/core/framework/types.h"
30#include "tensorflow/core/lib/gtl/array_slice.h"
31#include "tensorflow/core/platform/types.h"
32#include "tensorflow/core/util/sparse/sparse_tensor.h"
33
34namespace tensorflow {
35namespace example {
36
37// FastParseExampleConfig defines how to parse features in Example.
38// Each sub-config is responsible for one feature identified with feature_name.
39// FastParseExampleConfig can't have two sub-configs with the same feature_name.
40// dtype identifies the type of output vector and the kind of Feature expected
41// in Example.
42struct FastParseExampleConfig {
43 struct Dense {
44 Dense(StringPiece feature_name, DataType dtype, PartialTensorShape shape,
45 Tensor default_value, bool variable_length,
46 std::size_t elements_per_stride)
47 : feature_name(feature_name), // TODO(mrry): Switch to preallocated
48 // tstring when this is available.
49 dtype(dtype),
50 shape(std::move(shape)),
51 default_value(std::move(default_value)),
52 variable_length(variable_length),
53 elements_per_stride(elements_per_stride) {}
54 Dense() = default;
55
56 tstring feature_name;
57 DataType dtype;
58 // These 2 fields correspond exactly to dense_shapes and dense_defaults in
59 // ParseExample op.
60 // Documentation is available in: tensorflow/core/ops/parsing_ops.cc
61 PartialTensorShape shape;
62 Tensor default_value;
63 bool variable_length;
64 std::size_t elements_per_stride;
65 };
66
67 struct Sparse {
68 Sparse(StringPiece feature_name, DataType dtype)
69 : feature_name(feature_name), // TODO(mrry): Switch to preallocated
70 // tstring when this is available.
71 dtype(dtype) {}
72 Sparse() = default;
73
74 tstring feature_name;
75 DataType dtype;
76 };
77
78 struct Ragged {
79 Ragged(StringPiece feature_name, DataType dtype, DataType splits_dtype)
80 : feature_name(feature_name), // TODO(mrry): Switch to preallocated
81 // tstring when this is available.
82 dtype(dtype),
83 splits_dtype(splits_dtype) {}
84 Ragged() = default;
85
86 tstring feature_name;
87 DataType dtype;
88 DataType splits_dtype;
89 };
90
91 std::vector<Dense> dense;
92 std::vector<Sparse> sparse;
93 std::vector<Ragged> ragged;
94
95 // If `true`, `Result::feature_stats` will contain one
96 // `PerExampleFeatureStats` for each serialized example in the input.
97 bool collect_feature_stats = false;
98};
99
100// Statistics about the features in each example passed to
101// `FastParse[Single]Example()`.
102//
103// TODO(b/111553342): The gathered statistics currently have two limitations:
104// * Feature names that appear more than once will be counted multiple times.
105// * The feature values count only represents the counts for features that were
106// requested in the `FastParseExampleConfig`.
107// These could be addressed with additional work at runtime.
struct PerExampleFeatureStats {
  // How many feature names the example contained.
  size_t features_count{0};

  // Total number of values, summed over every feature that was parsed.
  size_t feature_values_count{0};
};
115
116// This is exactly the output of TF's ParseExample Op.
117// Documentation is available in: tensorflow/core/ops/parsing_ops.cc
118struct Result {
119 std::vector<Tensor> sparse_indices;
120 std::vector<Tensor> sparse_values;
121 std::vector<Tensor> sparse_shapes;
122 std::vector<Tensor> dense_values;
123 std::vector<Tensor> ragged_values;
124 std::vector<Tensor> ragged_splits;
125 std::vector<Tensor> ragged_outer_splits; // For SequenceExamples
126
127 // This vector will be populated with one element per example if
128 // `FastParseExampleConfig::collect_feature_stats` is set to `true`.
129 std::vector<PerExampleFeatureStats> feature_stats;
130};
131
132// Parses a batch of serialized Example protos and converts them into result
133// according to given config.
134// Given example names have to either be empty or the same size as serialized.
135// example_names are used only for error messages.
136Status FastParseExample(const FastParseExampleConfig& config,
137 gtl::ArraySlice<tstring> serialized,
138 gtl::ArraySlice<tstring> example_names,
139 thread::ThreadPool* thread_pool, Result* result);
140
141// TODO(mrry): Move the hash table construction into the config object.
142typedef FastParseExampleConfig FastParseSingleExampleConfig;
143
144Status FastParseSingleExample(const FastParseSingleExampleConfig& config,
145 StringPiece serialized, Result* result);
146
147// Parses a batch of serialized SequenceExample protos and converts them into
148// result according to given config.
149// Given example names have to either be empty or the same size as serialized.
150// example_names are used only for error messages.
// (If is_batch=false, then this parses a single SequenceExample.)
152Status FastParseSequenceExample(
153 const example::FastParseExampleConfig& context_config,
154 const example::FastParseExampleConfig& feature_list_config,
155 gtl::ArraySlice<tstring> serialized, gtl::ArraySlice<tstring> example_names,
156 thread::ThreadPool* thread_pool, example::Result* context_result,
157 example::Result* feature_list_result,
158 std::vector<Tensor>* dense_feature_lengths, bool is_batch = true);
159
160// This function parses serialized Example and populates given example.
161// It uses the same specialized parser as FastParseExample which is efficient.
162// But then constructs Example which is relatively slow.
163// It is exported here as a convenient API to test parser part separately.
164bool TestFastParse(const string& serialized, Example* example);
165
166} // namespace example
167} // namespace tensorflow
168
169#endif // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
170