1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // Output kernels for fusing computation into Eigen Tensor contractions: |
17 | // (1) FusedConv2DOp |
18 | // (2) FusedMatMulOp |
19 | // |
20 | // Supported fused computations: |
21 | // (1) {Conv2D/MatMul} + BiasAdd + <Activation> |
22 | // (2) {Conv2D/MatMul} + FusedBatchNorm + <Activation> |
23 | // |
24 | // Activation: Relu, Relu6, Elu, etc... |
25 | |
26 | #ifndef TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ |
27 | #define TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ |
28 | |
29 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
30 | #include "tensorflow/core/framework/op_kernel.h" |
31 | #include "tensorflow/core/framework/tensor.h" |
32 | #include "tensorflow/core/framework/tensor_types.h" |
33 | |
34 | namespace tensorflow { |
35 | |
// Identifies which computation is fused into the contraction output.
enum class FusedComputationType {
  // Sentinel value used before a fusion pattern has been matched.
  kUndefined,
  // {Conv2D/MatMul} + BiasAdd + <optional activation>.
  kBiasAdd,
  kBiasAddWithRelu,
  kBiasAddWithRelu6,
  kBiasAddWithTanh,
  kBiasAddWithSigmoid,
  kBiasAddWithElu,
  kBiasAddWithLeakyRelu,
  kBiasAddWithGeluApproximate,
  kBiasAddWithGeluExact,
  // {Conv2D/MatMul} + FusedBatchNorm + <optional activation>.
  kFusedBatchNorm,
  kFusedBatchNormWithRelu,
  kFusedBatchNormWithRelu6,
  kFusedBatchNormWithElu,
  kFusedBatchNormWithLeakyRelu
};
53 | |
// We have to pass around additional arguments for all possible fusion types.
// Only the fields relevant to the selected fusion are read; unused fields
// keep their defaults.
struct FusedComputationArgs {
  float epsilon = 0.0;          // Used by `FusedBatchNorm` fusion only
  float leakyrelu_alpha = 0.0;  // Used by `LeakyRelu` fusion only
};
59 | |
// A single supported fusion pattern: the canonical computation type together
// with the sequence of fused op names that selects it.
struct FusedComputationPattern {
  FusedComputationType fused_computation;
  std::vector<string> fused_ops;
};
64 | |
// Parses attributes from the kernel construction context and verifies that
// they specify a valid fused computation pattern (one of `patterns`). On
// success writes the matched computation type and its arguments through the
// two output pointers; otherwise returns a non-OK status.
Status InitializeFusedComputation(
    OpKernelConstruction* context, const string& kernel_name,
    const std::vector<FusedComputationPattern>& patterns,
    FusedComputationType* fused_computation,
    FusedComputationArgs* fused_computation_args);
72 | |
// Type alias for the tensor contraction output mapper.
//
// The mapper gives column-major access to an output block of an Eigen tensor
// contraction; column `c` of the block starts at `&mapper(0, c)`.
template <typename Scalar, typename StorageIndex>
using ContractionOutputMapper =
    Eigen::internal::blas_data_mapper<Scalar, StorageIndex, Eigen::ColMajor>;
77 | |
78 | // Returns input expression without any transformations. |
79 | struct Identity { |
80 | template <typename XprType> |
81 | static auto apply(XprType expr) -> XprType { |
82 | return expr; |
83 | }; |
84 | }; |
85 | |
// Applies `Relu` to the passed input expression: max(expr, 0).
struct Relu {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
    using Scalar = typename XprType::Scalar;
    return expr.cwiseMax(Scalar(0));
  }
};
94 | |
// Applies `Relu6` to the passed input expression: min(max(expr, 0), 6).
struct Relu6 {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())
                      .cwiseMin(std::declval<typename XprType::Scalar>())) {
    using Scalar = typename XprType::Scalar;
    return expr.cwiseMax(Scalar(0)).cwiseMin(Scalar(6));
  }
};
105 | |
// Applies `Elu` to the passed input expression:
//   elu(x) = x            if x >= 0
//   elu(x) = exp(x) - 1   otherwise
struct Elu {
  template <typename XprType>
  // NOTE: the trailing decltype must mirror the return expression exactly
  // (with `std::declval` standing in for the runtime scalar constants) so
  // that the deduced Eigen expression type matches the body.
  static auto apply(XprType expr) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr.exp() -
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr.exp() -
                    expr.constant(static_cast<typename XprType::Scalar>(1)),
                expr);
  };
};
120 | |
// Applies `LeakyRelu` to the passed input expression:
//   leaky_relu(x) = x           if x >= 0
//   leaky_relu(x) = alpha * x   otherwise
// `leakyrelu_alpha` is cast to the expression's Scalar type before use.
struct LeakyRelu {
  template <typename XprType>
  // NOTE: the trailing decltype must mirror the return expression exactly
  // (with `std::declval` standing in for the runtime scalars) so that the
  // deduced Eigen expression type matches the body.
  static auto apply(XprType expr, const float leakyrelu_alpha) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr *
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr * expr.constant(static_cast<typename XprType::Scalar>(
                           leakyrelu_alpha)),
                expr);
  };
};
135 | |
136 | template <typename T> |
137 | struct BiasAddArgs { |
138 | const T* bias_add_data = nullptr; |
139 | float leakyrelu_alpha; |
140 | |
141 | static bool IsSupported(FusedComputationType fusion) { |
142 | return fusion == FusedComputationType::kBiasAdd || |
143 | fusion == FusedComputationType::kBiasAddWithRelu || |
144 | fusion == FusedComputationType::kBiasAddWithRelu6 || |
145 | fusion == FusedComputationType::kBiasAddWithElu || |
146 | fusion == FusedComputationType::kBiasAddWithLeakyRelu; |
147 | } |
148 | }; |
149 | |
150 | template <typename T> |
151 | struct FusedBatchNormArgs { |
152 | const T* scale_data = nullptr; |
153 | const T* offset_data = nullptr; |
154 | const T* estimated_mean_data = nullptr; |
155 | const T* estimated_variance_data = nullptr; |
156 | |
157 | // Precomputed expression: |
158 | // scaling_factor = (estimated_variance + epsilon).rsqrt() * scale |
159 | Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor; |
160 | |
161 | float leakyrelu_alpha; |
162 | |
163 | static bool IsSupported(FusedComputationType fusion) { |
164 | return fusion == FusedComputationType::kFusedBatchNorm || |
165 | fusion == FusedComputationType::kFusedBatchNormWithRelu || |
166 | fusion == FusedComputationType::kFusedBatchNormWithRelu6 || |
167 | fusion == FusedComputationType::kFusedBatchNormWithElu || |
168 | fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu; |
169 | } |
170 | }; |
171 | |
172 | // TensorContraction swaps lhs with rhs, and changes layout from RowMajor |
173 | // (default in Tensorflow) to ColMajor (preferred in Eigen), and computes matmul |
174 | // using these tensors. |
175 | // |
176 | // (1) Spatial Convolution (see eigen_spatial_convolutions.h): |
177 | // |
178 | // TensorContraction output matrix (before reshape) has a ColMajor layout, and |
179 | // has dimensions: |
180 | // - rows: output_channels |
181 | // - cols: all other dimensions |
182 | // |
183 | // First element in every column is: |
184 | // [batch ??, height ??, width ??, out_channel = i] |
185 | // |
// We do not know the values of the 'batch', 'height', and 'width' dimensions
// here (if we knew the original dimensions, they could be computed from 'j').
//
// Each column of an output block is a contiguous slice along the output
// channel dimension, so we can use it to efficiently compute any
// transformation that depends only on a channel value (e.g. add channel
// bias).
193 | // |
194 | // (2) Matrix Multiplication (see matmul_op.cc): |
195 | // |
196 | // For the `MxK * KxN` matrix multiplication, output matrix has a `MxN` |
197 | // dimensions. Each column in output block is a slice of the innermost |
198 | // dimension of the output matrix starting at offset 'i'. |
199 | // |
200 | // Example: In Tensorflow MatMul [8x32] * [32x64], each output block column |
201 | // will correspond to MatMul output row of size 64 (because Tensorflow uses |
202 | // row major storage order). |
203 | |
204 | // Output kernel that fuses BiasAdd operation into the output of tensor |
205 | // contraction + activation function defined by Activation. |
206 | template <typename T, typename Activation = Identity> |
207 | struct BiasAddOutputKernel { |
208 | explicit BiasAddOutputKernel(const BiasAddArgs<T>& args) |
209 | : bias_data(args.bias_add_data) {} |
210 | |
211 | template <typename StorageIndex, typename Scalar> |
212 | EIGEN_ALWAYS_INLINE void operator()( |
213 | const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper, |
214 | const Eigen::TensorContractionParams& params, StorageIndex i, |
215 | StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { |
216 | DCHECK(params.swapped_arguments); |
217 | |
218 | const T* bias_base = bias_data + i; |
219 | typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows); |
220 | |
221 | for (int col = 0; col < num_cols; ++col) { |
222 | T* output_base = &output_mapper(0, col); |
223 | typename TTypes<T>::UnalignedTensor output(output_base, num_rows); |
224 | const auto expr = output + bias; |
225 | output = Activation::template apply<decltype(expr)>(expr); |
226 | } |
227 | } |
228 | |
229 | private: |
230 | const T* bias_data; |
231 | }; |
232 | |
233 | template <typename T> |
234 | struct BiasAddOutputKernel<T, LeakyRelu> { |
235 | explicit BiasAddOutputKernel(const BiasAddArgs<T>& args) |
236 | : bias_data(args.bias_add_data), leakyrelu_alpha(args.leakyrelu_alpha) {} |
237 | |
238 | template <typename StorageIndex, typename Scalar> |
239 | EIGEN_ALWAYS_INLINE void operator()( |
240 | const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper, |
241 | const Eigen::TensorContractionParams& params, StorageIndex i, |
242 | StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { |
243 | DCHECK(params.swapped_arguments); |
244 | |
245 | const T* bias_base = bias_data + i; |
246 | typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows); |
247 | |
248 | for (int col = 0; col < num_cols; ++col) { |
249 | T* output_base = &output_mapper(0, col); |
250 | typename TTypes<T>::UnalignedTensor output(output_base, num_rows); |
251 | const auto expr = output + bias; |
252 | output = LeakyRelu::template apply<decltype(expr)>(expr, leakyrelu_alpha); |
253 | } |
254 | } |
255 | |
256 | private: |
257 | const T* bias_data; |
258 | float leakyrelu_alpha; |
259 | }; |
260 | |
261 | // Output kernel that fuses FusedBatchNorm operation into the output of tensor |
262 | // contraction + activation function defined by Activation. |
263 | template <typename T, typename Activation = Identity> |
264 | struct FusedBatchNormOutputKernel { |
265 | FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args) |
266 | : epsilon(epsilon), |
267 | scaling_factor_data(args.scaling_factor.data()), |
268 | offset_data(args.offset_data), |
269 | estimated_mean_data(args.estimated_mean_data) {} |
270 | |
271 | template <typename StorageIndex, typename Scalar> |
272 | EIGEN_ALWAYS_INLINE void operator()( |
273 | const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper, |
274 | const Eigen::TensorContractionParams& params, StorageIndex i, |
275 | StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { |
276 | DCHECK(params.swapped_arguments); |
277 | |
278 | const T* scaling_factor_base = scaling_factor_data + i; |
279 | const T* offset_base = offset_data + i; |
280 | const T* mean_base = estimated_mean_data + i; |
281 | |
282 | typename TTypes<T>::UnalignedConstTensor scaling_factor(scaling_factor_base, |
283 | num_rows); |
284 | typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows); |
285 | typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows); |
286 | |
287 | for (int col = 0; col < num_cols; ++col) { |
288 | T* output_base = &output_mapper(0, col); |
289 | typename TTypes<T>::UnalignedTensor output(output_base, num_rows); |
290 | |
291 | auto scaled = (output - mean) * scaling_factor; |
292 | auto shifted = scaled + offset; |
293 | |
294 | output = Activation::template apply<decltype(shifted)>(shifted); |
295 | } |
296 | } |
297 | |
298 | private: |
299 | T epsilon; |
300 | const T* scaling_factor_data; |
301 | const T* offset_data; |
302 | const T* estimated_mean_data; |
303 | }; |
304 | |
305 | template <typename T> |
306 | struct FusedBatchNormOutputKernel<T, LeakyRelu> { |
307 | FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args) |
308 | : epsilon(epsilon), |
309 | scaling_factor_data(args.scaling_factor.data()), |
310 | offset_data(args.offset_data), |
311 | estimated_mean_data(args.estimated_mean_data), |
312 | leakyrelu_alpha(args.leakyrelu_alpha) {} |
313 | |
314 | template <typename StorageIndex, typename Scalar> |
315 | EIGEN_ALWAYS_INLINE void operator()( |
316 | const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper, |
317 | const Eigen::TensorContractionParams& params, StorageIndex i, |
318 | StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const { |
319 | DCHECK(params.swapped_arguments); |
320 | |
321 | const T* scaling_factor_base = scaling_factor_data + i; |
322 | const T* offset_base = offset_data + i; |
323 | const T* mean_base = estimated_mean_data + i; |
324 | |
325 | typename TTypes<T>::UnalignedConstTensor scaling_factor(scaling_factor_base, |
326 | num_rows); |
327 | typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows); |
328 | typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows); |
329 | |
330 | for (int col = 0; col < num_cols; ++col) { |
331 | T* output_base = &output_mapper(0, col); |
332 | typename TTypes<T>::UnalignedTensor output(output_base, num_rows); |
333 | |
334 | auto scaled = (output - mean) * scaling_factor; |
335 | auto shifted = scaled + offset; |
336 | |
337 | output = LeakyRelu::template apply<decltype(shifted)>(shifted, |
338 | leakyrelu_alpha); |
339 | } |
340 | } |
341 | |
342 | private: |
343 | T epsilon; |
344 | const T* scaling_factor_data; |
345 | const T* offset_data; |
346 | const T* estimated_mean_data; |
347 | float leakyrelu_alpha; |
348 | }; |
349 | |
// Type aliases for the output kernels, purely for the sake of better launch
// dispatching code readability.
//
// BiasAdd-based fusions.
template <typename T>
using WithBiasAdd = BiasAddOutputKernel<T>;
template <typename T>
using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
template <typename T>
using WithBiasAddAndRelu6 = BiasAddOutputKernel<T, Relu6>;
template <typename T>
using WithBiasAddAndElu = BiasAddOutputKernel<T, Elu>;
template <typename T>
using WithBiasAddAndLeakyRelu = BiasAddOutputKernel<T, LeakyRelu>;
// FusedBatchNorm-based fusions.
template <typename T>
using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
template <typename T>
using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
template <typename T>
using WithFusedBatchNormAndRelu6 = FusedBatchNormOutputKernel<T, Relu6>;
template <typename T>
using WithFusedBatchNormAndElu = FusedBatchNormOutputKernel<T, Elu>;
template <typename T>
using WithFusedBatchNormAndLeakyRelu = FusedBatchNormOutputKernel<T, LeakyRelu>;
372 | |
373 | template <typename T> |
374 | Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs<T>* args, |
375 | const float* leakyrelu_alpha = nullptr) { |
376 | // Bias of the following dimensions: [ output_depth ] |
377 | const Tensor& bias = context->input(2); |
378 | |
379 | if (bias.dims() != 1) |
380 | return errors::InvalidArgument("bias must be 1-dimensional" , |
381 | bias.shape().DebugString()); |
382 | |
383 | const auto data_ptr = [](const Tensor& tensor) -> const T* { |
384 | return reinterpret_cast<const T*>(tensor.tensor_data().data()); |
385 | }; |
386 | |
387 | args->bias_add_data = data_ptr(bias); |
388 | |
389 | if (leakyrelu_alpha) { |
390 | args->leakyrelu_alpha = *leakyrelu_alpha; |
391 | } |
392 | |
393 | return OkStatus(); |
394 | } |
395 | |
396 | template <typename T> |
397 | Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon, |
398 | FusedBatchNormArgs<T>* args, |
399 | const float* leakyrelu_alpha = nullptr) { |
400 | const Tensor& scale = context->input(2); |
401 | const Tensor& offset = context->input(3); |
402 | const Tensor& estimated_mean = context->input(4); |
403 | const Tensor& estimated_variance = context->input(5); |
404 | |
405 | if (scale.dims() != 1) |
406 | return errors::InvalidArgument("scale must be 1-dimensional" , |
407 | scale.shape().DebugString()); |
408 | if (offset.dims() != 1) |
409 | return errors::InvalidArgument("offset must be 1-dimensional" , |
410 | offset.shape().DebugString()); |
411 | if (estimated_mean.dims() != 1) |
412 | return errors::InvalidArgument("estimated_mean must be 1-dimensional" , |
413 | estimated_mean.shape().DebugString()); |
414 | if (estimated_variance.dims() != 1) |
415 | return errors::InvalidArgument("estimated_variance must be 1-dimensional" , |
416 | estimated_variance.shape().DebugString()); |
417 | |
418 | const auto data_ptr = [](const Tensor& tensor) -> const T* { |
419 | return reinterpret_cast<const T*>(tensor.tensor_data().data()); |
420 | }; |
421 | |
422 | args->scale_data = data_ptr(scale); |
423 | args->offset_data = data_ptr(offset); |
424 | args->estimated_mean_data = data_ptr(estimated_mean); |
425 | args->estimated_variance_data = data_ptr(estimated_variance); |
426 | |
427 | // Precompute scaling factor once for all output blocks (kernels). |
428 | args->scaling_factor = |
429 | (estimated_variance.flat<T>() + static_cast<T>(epsilon)).rsqrt() * |
430 | scale.flat<T>(); |
431 | |
432 | if (leakyrelu_alpha) { |
433 | args->leakyrelu_alpha = *leakyrelu_alpha; |
434 | } |
435 | |
436 | return OkStatus(); |
437 | } |
438 | |
439 | } // namespace tensorflow |
440 | |
441 | #endif // TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ |
442 | |