/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Output kernels for fusing computation into Eigen Tensor contractions:
//   (1) FusedConv2DOp
//   (2) FusedMatMulOp
//
// Supported fused computations:
//   (1) {Conv2D/MatMul} + BiasAdd + <Activation>
//   (2) {Conv2D/MatMul} + FusedBatchNorm + <Activation>
//
// Activation: Relu, Relu6, Elu, etc...

#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_
#define TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"

namespace tensorflow {

enum class FusedComputationType {
  kUndefined,
  kBiasAdd,
  kBiasAddWithRelu,
  kBiasAddWithRelu6,
  kBiasAddWithTanh,
  kBiasAddWithSigmoid,
  kBiasAddWithElu,
  kBiasAddWithLeakyRelu,
  kBiasAddWithGeluApproximate,
  kBiasAddWithGeluExact,
  kFusedBatchNorm,
  kFusedBatchNormWithRelu,
  kFusedBatchNormWithRelu6,
  kFusedBatchNormWithElu,
  kFusedBatchNormWithLeakyRelu
};

// We have to pass around additional arguments for all possible fusion types.
struct FusedComputationArgs {
  float epsilon = 0.0;          // Used by `FusedBatchNorm` fusion only
  float leakyrelu_alpha = 0.0;  // Used by `LeakyRelu` fusion only
};

struct FusedComputationPattern {
  FusedComputationType fused_computation;
  std::vector<string> fused_ops;
};
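
// For example, a BiasAdd + Relu fusion could be described as follows (a
// sketch; the actual pattern lists are defined by the fused kernels that use
// this header):
//
//   FusedComputationPattern pattern = {
//       FusedComputationType::kBiasAddWithRelu, {"BiasAdd", "Relu"}};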

// Parses attributes from the kernel construction context, and verifies that
// they specify a valid fused computation pattern.
Status InitializeFusedComputation(
    OpKernelConstruction* context, const string& kernel_name,
    const std::vector<FusedComputationPattern>& patterns,
    FusedComputationType* fused_computation,
    FusedComputationArgs* fused_computation_args);
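
// Usage sketch (illustrative only; `patterns` would be a kernel-specific list
// like the example above):
//
//   FusedComputationType fused_computation = FusedComputationType::kUndefined;
//   FusedComputationArgs fused_computation_args;
//   OP_REQUIRES_OK(context,
//                  InitializeFusedComputation(
//                      context, "FusedConv2D", patterns, &fused_computation,
//                      &fused_computation_args));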

// Type alias for the tensor contraction output mapper.
template <typename Scalar, typename StorageIndex>
using ContractionOutputMapper =
    Eigen::internal::blas_data_mapper<Scalar, StorageIndex, Eigen::ColMajor>;

// Returns input expression without any transformations.
struct Identity {
  template <typename XprType>
  static auto apply(XprType expr) -> XprType {
    return expr;
  };
};

// Applies `Relu` to the passed input expression.
struct Relu {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
  };
};

// Applies `Relu6` to the passed input expression.
struct Relu6 {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())
                      .cwiseMin(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0))
        .cwiseMin(static_cast<typename XprType::Scalar>(6));
  };
};

// Applies `Elu` to the passed input expression: `exp(x) - 1` for negative
// inputs, identity otherwise.
struct Elu {
  template <typename XprType>
  static auto apply(XprType expr) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr.exp() -
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr.exp() -
                    expr.constant(static_cast<typename XprType::Scalar>(1)),
                expr);
  };
};

// Applies `LeakyRelu` to the passed input expression: `leakyrelu_alpha * x`
// for negative inputs, identity otherwise.
struct LeakyRelu {
  template <typename XprType>
  static auto apply(XprType expr, const float leakyrelu_alpha) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr *
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr * expr.constant(static_cast<typename XprType::Scalar>(
                           leakyrelu_alpha)),
                expr);
  };
};

template <typename T>
struct BiasAddArgs {
  const T* bias_add_data = nullptr;
  float leakyrelu_alpha;

  static bool IsSupported(FusedComputationType fusion) {
    return fusion == FusedComputationType::kBiasAdd ||
           fusion == FusedComputationType::kBiasAddWithRelu ||
           fusion == FusedComputationType::kBiasAddWithRelu6 ||
           fusion == FusedComputationType::kBiasAddWithElu ||
           fusion == FusedComputationType::kBiasAddWithLeakyRelu;
  }
};

template <typename T>
struct FusedBatchNormArgs {
  const T* scale_data = nullptr;
  const T* offset_data = nullptr;
  const T* estimated_mean_data = nullptr;
  const T* estimated_variance_data = nullptr;

  // Precomputed expression:
  //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
  //
  // With it, inference-mode batch normalization of a value `x` reduces to
  //   (x - estimated_mean) * scaling_factor + offset
  Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;

  float leakyrelu_alpha;

  static bool IsSupported(FusedComputationType fusion) {
    return fusion == FusedComputationType::kFusedBatchNorm ||
           fusion == FusedComputationType::kFusedBatchNormWithRelu ||
           fusion == FusedComputationType::kFusedBatchNormWithRelu6 ||
           fusion == FusedComputationType::kFusedBatchNormWithElu ||
           fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu;
  }
};

// TensorContraction swaps lhs with rhs, changes the layout from RowMajor
// (default in TensorFlow) to ColMajor (preferred in Eigen), and computes the
// matmul using these tensors.
//
// (1) Spatial Convolution (see eigen_spatial_convolutions.h):
//
//   The TensorContraction output matrix (before reshape) has a ColMajor
//   layout, and has dimensions:
//     - rows: output_channels
//     - cols: all other dimensions
//
//   The first element in every column is:
//     [batch ??, height ??, width ??, out_channel = i]
//
//   We do not know the values of 'batch', 'height', and 'width' here (if we
//   knew the original dimensions, they could be computed from 'j').
//
//   Each column of an output block is a contiguous slice along the output
//   channel dimension, so we can use it to efficiently compute any
//   transformation that depends only on a channel value (e.g. add a channel
//   bias).
//
// (2) Matrix Multiplication (see matmul_op.cc):
//
//   For the `MxK * KxN` matrix multiplication, the output matrix has `MxN`
//   dimensions. Each column in an output block is a slice of the innermost
//   dimension of the output matrix starting at offset 'i'.
//
//   Example: in a TensorFlow MatMul [8x32] * [32x64], each output block column
//   will correspond to a MatMul output row of size 64 (because TensorFlow uses
//   row-major storage order).
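
// For example, for a convolution with 64 output channels, an output block
// with `i = 32` and `num_rows = 16` covers output channels [32, 48): the
// kernels below read the bias (or batch norm parameters) starting at offset
// `i` and apply them to every column of the block.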

// Output kernel that fuses BiasAdd operation into the output of tensor
// contraction + activation function defined by Activation.
template <typename T, typename Activation = Identity>
struct BiasAddOutputKernel {
  explicit BiasAddOutputKernel(const BiasAddArgs<T>& args)
      : bias_data(args.bias_add_data) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    // Rows [i, i + num_rows) of the output block map to output channels, so
    // the matching bias slice starts at offset `i`.
    const T* bias_base = bias_data + i;
    typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = Activation::template apply<decltype(expr)>(expr);
    }
  }

 private:
  const T* bias_data;
};

// Specialization for `LeakyRelu`, which is the only activation that requires
// an extra argument (`leakyrelu_alpha`).
template <typename T>
struct BiasAddOutputKernel<T, LeakyRelu> {
  explicit BiasAddOutputKernel(const BiasAddArgs<T>& args)
      : bias_data(args.bias_add_data), leakyrelu_alpha(args.leakyrelu_alpha) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* bias_base = bias_data + i;
    typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = LeakyRelu::template apply<decltype(expr)>(expr, leakyrelu_alpha);
    }
  }

 private:
  const T* bias_data;
  float leakyrelu_alpha;
};

// Output kernel that fuses FusedBatchNorm operation into the output of tensor
// contraction + activation function defined by Activation.
template <typename T, typename Activation = Identity>
struct FusedBatchNormOutputKernel {
  FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args)
      : epsilon(epsilon),
        scaling_factor_data(args.scaling_factor.data()),
        offset_data(args.offset_data),
        estimated_mean_data(args.estimated_mean_data) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    // Batch norm parameters for the output channels covered by this block.
    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename TTypes<T>::UnalignedConstTensor scaling_factor(
        scaling_factor_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);

      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = Activation::template apply<decltype(shifted)>(shifted);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
};

// Specialization for `LeakyRelu`, which is the only activation that requires
// an extra argument (`leakyrelu_alpha`).
template <typename T>
struct FusedBatchNormOutputKernel<T, LeakyRelu> {
  FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args)
      : epsilon(epsilon),
        scaling_factor_data(args.scaling_factor.data()),
        offset_data(args.offset_data),
        estimated_mean_data(args.estimated_mean_data),
        leakyrelu_alpha(args.leakyrelu_alpha) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename TTypes<T>::UnalignedConstTensor scaling_factor(
        scaling_factor_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);

      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = LeakyRelu::template apply<decltype(shifted)>(shifted,
                                                            leakyrelu_alpha);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
  float leakyrelu_alpha;
};

// Type aliases for the output kernels, purely for the sake of better launch
// dispatching code readability.
template <typename T>
using WithBiasAdd = BiasAddOutputKernel<T>;
template <typename T>
using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
template <typename T>
using WithBiasAddAndRelu6 = BiasAddOutputKernel<T, Relu6>;
template <typename T>
using WithBiasAddAndElu = BiasAddOutputKernel<T, Elu>;
template <typename T>
using WithBiasAddAndLeakyRelu = BiasAddOutputKernel<T, LeakyRelu>;
template <typename T>
using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
template <typename T>
using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
template <typename T>
using WithFusedBatchNormAndRelu6 = FusedBatchNormOutputKernel<T, Relu6>;
template <typename T>
using WithFusedBatchNormAndElu = FusedBatchNormOutputKernel<T, Elu>;
template <typename T>
using WithFusedBatchNormAndLeakyRelu = FusedBatchNormOutputKernel<T, LeakyRelu>;

template <typename T>
Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs<T>* args,
                       const float* leakyrelu_alpha = nullptr) {
  // Bias of the following dimensions: [ output_depth ]
  const Tensor& bias = context->input(2);

  if (bias.dims() != 1)
    return errors::InvalidArgument("bias must be 1-dimensional",
                                   bias.shape().DebugString());

  const auto data_ptr = [](const Tensor& tensor) -> const T* {
    return reinterpret_cast<const T*>(tensor.tensor_data().data());
  };

  args->bias_add_data = data_ptr(bias);

  if (leakyrelu_alpha) {
    args->leakyrelu_alpha = *leakyrelu_alpha;
  }

  return OkStatus();
}
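
// Usage sketch for the BiasAdd output kernels (illustrative only; `lhs`,
// `rhs`, `contract_dims`, `out` and `device` are hypothetical, and the real
// launch code lives in the fused Conv2D/MatMul kernels):
//
//   BiasAddArgs<T> bias_add_args;
//   TF_RETURN_IF_ERROR(InitBiasAddArgs<T>(context, &bias_add_args));
//   out.device(device) = lhs.contract(rhs, contract_dims,
//                                     WithBiasAddAndRelu<T>(bias_add_args));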

template <typename T>
Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon,
                              FusedBatchNormArgs<T>* args,
                              const float* leakyrelu_alpha = nullptr) {
  const Tensor& scale = context->input(2);
  const Tensor& offset = context->input(3);
  const Tensor& estimated_mean = context->input(4);
  const Tensor& estimated_variance = context->input(5);

  if (scale.dims() != 1)
    return errors::InvalidArgument("scale must be 1-dimensional",
                                   scale.shape().DebugString());
  if (offset.dims() != 1)
    return errors::InvalidArgument("offset must be 1-dimensional",
                                   offset.shape().DebugString());
  if (estimated_mean.dims() != 1)
    return errors::InvalidArgument("estimated_mean must be 1-dimensional",
                                   estimated_mean.shape().DebugString());
  if (estimated_variance.dims() != 1)
    return errors::InvalidArgument("estimated_variance must be 1-dimensional",
                                   estimated_variance.shape().DebugString());

  const auto data_ptr = [](const Tensor& tensor) -> const T* {
    return reinterpret_cast<const T*>(tensor.tensor_data().data());
  };

  args->scale_data = data_ptr(scale);
  args->offset_data = data_ptr(offset);
  args->estimated_mean_data = data_ptr(estimated_mean);
  args->estimated_variance_data = data_ptr(estimated_variance);

  // Precompute scaling factor once for all output blocks (kernels).
  args->scaling_factor =
      (estimated_variance.flat<T>() + static_cast<T>(epsilon)).rsqrt() *
      scale.flat<T>();

  if (leakyrelu_alpha) {
    args->leakyrelu_alpha = *leakyrelu_alpha;
  }

  return OkStatus();
}
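
// The FusedBatchNorm launch is analogous to the BiasAdd sketch above
// (illustrative only):
//
//   FusedBatchNormArgs<T> fbn_args;
//   TF_RETURN_IF_ERROR(InitFusedBatchNormArgs<T>(context, epsilon, &fbn_args));
//   out.device(device) = lhs.contract(
//       rhs, contract_dims, WithFusedBatchNormAndRelu<T>(epsilon, fbn_args));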

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_