1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // This is an internal header file intended to only be included as the |
17 | // front-matter in the implementation files of various reduction ops. It |
18 | // is a header file because we split the various reduction ops into their |
19 | // own compilation units to get more parallelism in compilation. |
20 | |
21 | #ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_ |
22 | #define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_ |
23 | |
24 | #define EIGEN_USE_THREADS |
25 | |
26 | #include "third_party/eigen3/Eigen/Core" |
27 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
28 | |
29 | #include "tensorflow/core/framework/numeric_op.h" |
30 | #include "tensorflow/core/framework/op_kernel.h" |
31 | #include "tensorflow/core/framework/register_types.h" |
32 | #include "tensorflow/core/framework/tensor.h" |
33 | #include "tensorflow/core/framework/types.h" |
34 | #include "tensorflow/core/kernels/reduction_ops.h" |
35 | #include "tensorflow/core/kernels/transpose_functor.h" |
36 | #include "tensorflow/core/lib/core/status.h" |
37 | #include "tensorflow/core/lib/gtl/inlined_vector.h" |
38 | #include "tensorflow/core/platform/logging.h" |
39 | |
40 | namespace tensorflow { |
41 | |
42 | typedef Eigen::ThreadPoolDevice CPUDevice; |
43 | typedef Eigen::GpuDevice GPUDevice; |
44 | |
45 | template <typename Device> |
46 | struct Constants { |
47 | // Derive Index type. int (32-bit) or long (64-bit) depending on the |
48 | // compile-time configuration. "float" here is not relevant. |
49 | // TODO(zhifengc): Moves the definition to TTypes. |
50 | typedef TTypes<float>::Tensor::Index Index; |
51 | Eigen::array<Index, 1> kZero; |
52 | Eigen::array<Index, 1> kOne; |
53 | Eigen::array<Index, 2> kZeroTwo; |
54 | |
55 | Constants() { |
56 | kZero[0] = 0; |
57 | kOne[0] = 1; |
58 | kZeroTwo[0] = 0; |
59 | kZeroTwo[1] = 2; |
60 | } |
61 | }; |
62 | |
63 | struct ConstantsBase { |
64 | const Eigen::IndexList<Eigen::type2index<0>> kZero; |
65 | const Eigen::IndexList<Eigen::type2index<1>> kOne; |
66 | const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo; |
67 | }; |
68 | template <> |
69 | struct Constants<CPUDevice> : ConstantsBase {}; |
70 | |
71 | class ReductionHelper { |
72 | public: |
73 | ReductionHelper() : reduce_first_axis_(false) {} |
74 | |
75 | Status Simplify(const Tensor& data, const Tensor& axis, const bool keep_dims); |
76 | |
77 | // We need to do roughly: |
78 | // tmp_out = allocate(out_reshape()) |
79 | // tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes) |
80 | // out = tmp_out.reshape(out_shape) |
81 | |
82 | // The reduction result must be allocated with this shape. |
83 | TensorShape out_reshape() const; |
84 | |
85 | // The final output shape must be allocated with this shape. |
86 | TensorShape out_shape() const; |
87 | |
88 | // The reduction is on a reshaped tensor of this rank. |
89 | int ndims() const { return data_reshape_.size(); } |
90 | |
91 | // True if need to reduce the 0-th dimension. |
92 | bool reduce_first_axis() const { return reduce_first_axis_; } |
93 | |
94 | // The output is reshaped. |
95 | template <typename T, int N> |
96 | typename TTypes<T, N>::Tensor out(Tensor* out) { |
97 | return out->shaped<T, N>(out_reshape_); |
98 | } |
99 | |
100 | // The input is reshaped. |
101 | template <typename T, int N> |
102 | typename TTypes<T, N>::ConstTensor in(const Tensor& data) { |
103 | return data.shaped<T, N>(data_reshape_); |
104 | } |
105 | |
106 | // Shape of shuffled input |
107 | TensorShape data_reshape() const { |
108 | TensorShape shape; |
109 | for (auto s : data_reshape_) shape.AddDim(s); |
110 | return shape; |
111 | } |
112 | |
113 | // Shape with all reduction dimensions at the end |
114 | TensorShape shuffled_shape(); |
115 | |
116 | // Permutation of reduced dims needed to put reduction dimensions at the end |
117 | gtl::InlinedVector<int32, 8> permutation(); |
118 | |
119 | private: |
120 | bool reduce_first_axis_; // True if need to reduce the 0-th dimension. |
121 | gtl::InlinedVector<int64_t, 4> |
122 | data_reshape_; // Reshape data before reduction. |
123 | gtl::InlinedVector<int64_t, 4> out_shape_; // The final output shape. |
124 | gtl::InlinedVector<int64_t, 4> out_reshape_; // Reshape output for reduction. |
125 | }; |
126 | |
127 | // For operations where the output is a reduction function along some |
128 | // dimensions of the input. |
129 | template <typename Device, class T, typename Tperm, typename Reducer> |
130 | class ReductionOp : public OpKernel { |
131 | public: |
132 | explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) { |
133 | const DataType dt = DataTypeToEnum<T>::v(); |
134 | const DataType pt = DataTypeToEnum<Tperm>::v(); |
135 | OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, pt}, {dt})); |
136 | |
137 | OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims" , &keep_dims_)); |
138 | } |
139 | |
140 | void Compute(OpKernelContext* ctx) override { |
141 | const Tensor& data = ctx->input(0); |
142 | const Tensor& axes = ctx->input(1); |
143 | VLOG(1) << "data shape: " << data.shape().DebugString(); |
144 | VLOG(1) << "axes : " << axes.SummarizeValue(10); |
145 | |
146 | ReductionHelper helper; |
147 | OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_)); |
148 | CHECK_GE(helper.ndims(), 0); |
149 | |
150 | bool is_scalar_identity = functor::ReducerTraits<Reducer>::IsScalarIdentity; |
151 | bool is_trivial = helper.ndims() == 0 || |
152 | (helper.ndims() == 1 && !helper.reduce_first_axis()); |
153 | if (is_scalar_identity && is_trivial) { |
154 | Tensor out; |
155 | // Special case. Reduces nothing and does not alter the input values. |
156 | if (!out.CopyFrom(data, helper.out_shape())) { |
157 | ctx->SetStatus(errors::Internal("Error during reduction copy." )); |
158 | } |
159 | ctx->set_output(0, out); |
160 | return; |
161 | } |
162 | |
163 | // We must allocate temp tensors using the same alloc attr as |
164 | // output(0) because it is returned as output(0) in the end. |
165 | const AllocatorAttributes alloc_attr = ctx->output_alloc_attr(0); |
166 | |
167 | Tensor tmp_out; |
168 | typedef functor::ReduceFunctor<Device, Reducer> Functor; |
169 | Constants<Device> constants; |
170 | const Device& d = ctx->eigen_device<Device>(); |
171 | Reducer reducer; |
172 | |
173 | if (data.NumElements() > 0 && is_trivial && !is_scalar_identity) { |
174 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(ctx->expected_output_dtype(0), |
175 | TensorShape({data.NumElements()}), |
176 | &tmp_out, alloc_attr)); |
177 | Functor::Reduce(ctx, tmp_out.flat<T>(), |
178 | data.shaped<T, 2>({1, data.NumElements()}), |
179 | constants.kZero, reducer); |
180 | } else { |
181 | // A temporary tensor whose size matches the size of the reduced |
182 | // output. |
183 | OP_REQUIRES_OK( |
184 | ctx, ctx->allocate_temp(ctx->expected_output_dtype(0), |
185 | helper.out_reshape(), &tmp_out, alloc_attr)); |
186 | |
187 | if (tmp_out.NumElements() == 0) { |
188 | // Nothing to do, fall through to final reshaping. |
189 | } else if (data.NumElements() == 0) { |
190 | // Degenerate reduction where the input is empty but the output is |
191 | // nonempty (thus tmp_out.NumElements() > 0), and we must fill the |
192 | // output with identity elements. Example: tf.reduce_sum(tf.zeros((0, |
193 | // 3)), [0]). Eigen sometimes crashes in this case, so we do it |
194 | // manually. |
195 | Functor::FillIdentity(d, tmp_out.flat<T>(), reducer); |
196 | } else if ((helper.ndims() == 1) && helper.reduce_first_axis()) { |
197 | // Reduce to a scalar. |
198 | Functor::Reduce(ctx, helper.out<T, 0>(&tmp_out), helper.in<T, 1>(data), |
199 | constants.kZero, reducer); |
200 | } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) { |
201 | // Can be viewed as a reduction of a matrix along 1st dimension. |
202 | Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data), |
203 | constants.kZero, reducer); |
204 | } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) { |
205 | // Can be viewed as a reduction of a matrix along 2nd dimension. |
206 | Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data), |
207 | constants.kOne, reducer); |
208 | } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) { |
209 | // Can be viewed as a reduction of a 3D tensor along 1st and 3rd |
210 | // dimensions. |
211 | Functor::Reduce(ctx, helper.out<T, 1>(&tmp_out), helper.in<T, 3>(data), |
212 | constants.kZeroTwo, reducer); |
213 | } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) { |
214 | // Can be viewed as a reduction of a 3D tensor along 2nd dimension. |
215 | Functor::Reduce(ctx, helper.out<T, 2>(&tmp_out), helper.in<T, 3>(data), |
216 | constants.kOne, reducer); |
217 | } else { |
218 | // If we don't hit one of the cases above, transpose the data so that |
219 | // all reduced dimensions are last and reuse the 2-D -> 1-D case. |
220 | Tensor data_reshaped; |
221 | OP_REQUIRES(ctx, data_reshaped.CopyFrom(data, helper.data_reshape()), |
222 | errors::Internal("Error during reduction copy." )); |
223 | Tensor shuffled; |
224 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
225 | helper.shuffled_shape(), |
226 | &shuffled, alloc_attr)); |
227 | OP_REQUIRES_OK(ctx, DoTranspose(d, data_reshaped, helper.permutation(), |
228 | &shuffled)); |
229 | const int64_t unreduced = tmp_out.NumElements(); |
230 | const int64_t reduced = shuffled.NumElements() / unreduced; |
231 | const Tensor& const_shuffled = shuffled; |
232 | Functor::Reduce(ctx, tmp_out.flat<T>(), |
233 | const_shuffled.shaped<T, 2>({unreduced, reduced}), |
234 | constants.kOne, reducer); |
235 | } |
236 | } |
237 | |
238 | // Set the real output using the contents of the reduction but the |
239 | // real expected output shape. The number of elements should |
240 | // match between the two shapes. |
241 | Tensor out; |
242 | OP_REQUIRES(ctx, out.CopyFrom(tmp_out, helper.out_shape()), |
243 | errors::Internal("Error during reduction copy." )); |
244 | ctx->set_output(0, out); |
245 | } |
246 | |
247 | private: |
248 | // True if the number of dimensions should be maintained. |
249 | bool keep_dims_; |
250 | }; |
251 | |
252 | namespace functor { |
253 | |
254 | template <typename Device, typename Reducer> |
255 | struct ReduceFunctorBase { |
256 | template <typename OUT_T, typename IN_T, typename ReductionAxes> |
257 | static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, |
258 | const ReductionAxes& reduction_axes, |
259 | const Reducer& reducer) { |
260 | const Device& d = ctx->eigen_device<Device>(); |
261 | ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes, Reducer> reducer_impl; |
262 | reducer_impl(d, out, in, reduction_axes, reducer); |
263 | } |
264 | |
265 | template <typename OUT_T> |
266 | static void FillIdentity(const Device& d, OUT_T out, const Reducer& reducer) { |
267 | FillIdentityEigenImpl(d, out, reducer); |
268 | } |
269 | }; |
270 | |
271 | template <typename Reducer> |
272 | struct ReduceFunctor<CPUDevice, Reducer> |
273 | : ReduceFunctorBase<CPUDevice, Reducer> {}; |
274 | |
275 | } // namespace functor |
276 | } // namespace tensorflow |
277 | |
278 | #endif // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_ |
279 | |