/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Implements convolution operations with other kernels baked into the
// processing, to optimize latency and memory usage:
// - Conv2D + BiasAdd + <Activation>
// - Conv2D + FusedBatchNorm + <Activation>
//
// Activation: Relu, Relu6, Elu, etc...
//
// Kernels for convolutions fused with image transformations (resize and mirror
// padding) are defined in `conv_ops_fused_image_transform.cc`.
//
// For the CPU device we implement fusion with an Eigen tensor contraction
// output kernel. For the GPU device we rely on CuDNN primitives.
//
// NOTE: GPU only supports fusion of Conv2D + BiasAdd + <activation>, where the
// activation is one of Relu, Relu6, Elu or LeakyRelu (for int8/qint8, only an
// optional Relu).
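//
// Illustrative example (graph-level view, not an API defined in this file):
// a subgraph
//
//   y = Relu(BiasAdd(Conv2D(x, filter), bias))
//
// is typically rewritten by the grappler remapper into a single `_FusedConv2D`
// node with fused_ops = ["BiasAdd", "Relu"], which is then dispatched to the
// LaunchFusedConv2DOp implementations below.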

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_ops.h"
#include "tensorflow/core/kernels/fused_eigen_output_kernels.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/profiler/lib/scoped_annotation.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/gpu_asm_opts.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.h"
#include "tensorflow/compiler/xla/stream_executor/tf_allocator_adapter.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/util/autotune_maps/conv_autotune_maps.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
struct LaunchFusedConv2DOp {
  void operator()(OpKernelContext* context, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& input,
                  const Tensor& filter, FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output);
};

// This is a CPU-only implementation that uses Eigen contraction output
// kernels.
//
// Dispatch 2D convolution to the appropriate primitive operation:
// (1) MatMul for the case of 1x1 convolution.
// (2) MatMul for the case when the filter size equals the input size.
// (3) General spatial 2D convolution for all other cases.
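//
// For example (case 1, for illustration): a 1x1 filter with unit strides over
// an NHWC input of shape [N, H, W, C_in] collapses into a single
// [N * H * W, C_in] x [C_in, C_out] matrix multiplication, with the fused
// output kernel applied to the contraction result.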
template <typename T>
class LaunchFusedConv2DWithOutputKernel {
 public:
  LaunchFusedConv2DWithOutputKernel(
      int row_stride, int col_stride,      //
      int row_dilation, int col_dilation,  //
      Padding padding, const std::vector<int64_t>& explicit_paddings)
      : row_stride_(row_stride),
        col_stride_(col_stride),
        row_dilation_(row_dilation),
        col_dilation_(col_dilation),
        padding_(padding),
        explicit_paddings_(explicit_paddings) {}

  template <typename OutputKernel>
  void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx,
                  const Tensor& input, const Tensor& filter, Tensor* output) {
    // Wrap output_kernel into type erased wrapper to reduce the number of
    // unique template instantiations for Eigen Tensor contraction expressions.
    OutputKernelWrapper output_kernel_wrapper(
        [&output_kernel](
            const ContractionOutputMapper<T, Eigen::Index>& output_mapper,
            const Eigen::TensorContractionParams& params, Eigen::Index i,
            Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) {
          output_kernel(output_mapper, params, i, j, num_rows, num_cols);
        });

    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 &&
        row_stride_ == 1 && col_stride_ == 1 && padding_ != EXPLICIT) {
      int conv_width = 1;  // Width for the convolution step.
      for (int i = 0; i < 3; ++i) {
        conv_width *= output->dim_size(i);
      }

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<CPUDevice, T, OutputKernelWrapper>()(
          ctx->eigen_device<CPUDevice>(),
          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
          dim_pair, std::move(output_kernel_wrapper));

    } else if (filter.dim_size(0) == input.dim_size(1) &&
               filter.dim_size(1) == input.dim_size(2) && row_dilation_ == 1 &&
               col_dilation_ == 1 && padding_ == VALID) {
      // If the input data and filter have the same height/width,
      // reduce the 2D convolution to matrix multiplication.
      const auto k =  // Length of reduction dimension.
          filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2);

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<CPUDevice, T, OutputKernelWrapper>()(
          ctx->eigen_device<CPUDevice>(),
          output->shaped<T, 2>({input.dim_size(0), filter.dim_size(3)}),
          input.shaped<T, 2>({input.dim_size(0), k}),
          filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair,
          std::move(output_kernel_wrapper));

    } else {
      if (padding_ == EXPLICIT) {
        functor::SpatialConvolution<CPUDevice, T, OutputKernelWrapper>()(
            ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
            input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_,
            col_stride_, row_dilation_, col_dilation_,
            static_cast<int>(explicit_paddings_[2]),
            static_cast<int>(explicit_paddings_[3]),
            static_cast<int>(explicit_paddings_[4]),
            static_cast<int>(explicit_paddings_[5]),
            std::move(output_kernel_wrapper));
      } else {
        functor::SpatialConvolution<CPUDevice, T, OutputKernelWrapper>()(
            ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
            input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_,
            col_stride_, row_dilation_, col_dilation_,
            BrainPadding2EigenPadding(padding_),
            std::move(output_kernel_wrapper));
      }
    }
  }

 private:
  // Wrap output_kernel into type erased struct to reduce the number of unique
  // template instantiations for Eigen Tensor contraction expressions.
  //
  // We do not pass std::function directly as an output kernel because it blows
  // up the binary size in debug mode with super long symbol names.
  struct OutputKernelWrapper {
    using OutputKernelFn =
        std::function<void(const ContractionOutputMapper<T, Eigen::Index>&,
                           const Eigen::TensorContractionParams&, Eigen::Index,
                           Eigen::Index, Eigen::Index, Eigen::Index)>;

    explicit OutputKernelWrapper(OutputKernelFn fn)
        : output_kernel_fn(std::move(fn)) {}

    void operator()(
        const ContractionOutputMapper<T, Eigen::Index>& output_mapper,
        const Eigen::TensorContractionParams& params, Eigen::Index i,
        Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) const {
      output_kernel_fn(output_mapper, params, i, j, num_rows, num_cols);
    }

    OutputKernelFn output_kernel_fn;
  };

  int row_stride_;
  int col_stride_;
  int row_dilation_;
  int col_dilation_;
  const Padding padding_;
  const std::vector<int64_t>& explicit_paddings_;
};
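
// For example (usage within this file): the CPU specialization of
// LaunchFusedConv2DOp below constructs this launcher and invokes it as
//
//   conv2d(WithBiasAddAndRelu<T>(bias_add_args), context, input, filter,
//          output);
//
// passing one of the fused output kernels from fused_eigen_output_kernels.h.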

template <typename T>
struct LaunchFusedConv2DOp<CPUDevice, T> {
  void operator()(OpKernelContext* context, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& input,
                  const Tensor& filter, const FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output) {
    OP_REQUIRES(context, dimensions.in_depth == filter.dim_size(2),
                errors::Unimplemented("Fused conv implementation does not "
                                      "support grouped convolutions for now."));
    OP_REQUIRES(context, params.data_format == FORMAT_NHWC,
                errors::Unimplemented("Fused conv implementation only supports "
                                      "NHWC tensor format for now."));
    OP_REQUIRES(context, DataTypeToEnum<T>::value != DT_HALF,
                errors::Unimplemented("Fused conv implementation with half "
                                      "precision is not supported on CPU."));

    BiasAddArgs<T> bias_add_args;
    if (BiasAddArgs<T>::IsSupported(fusion)) {
      if (fusion == FusedComputationType::kBiasAddWithLeakyRelu) {
        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add_args,
                                                &fusion_args.leakyrelu_alpha));
      } else {
        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add_args));
      }
    }

    FusedBatchNormArgs<T> fused_batch_norm_args;
    if (FusedBatchNormArgs<T>::IsSupported(fusion)) {
      if (fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu) {
        OP_REQUIRES_OK(context,
                       InitFusedBatchNormArgs(context, fusion_args.epsilon,
                                              &fused_batch_norm_args,
                                              &fusion_args.leakyrelu_alpha));
      } else {
        OP_REQUIRES_OK(context,
                       InitFusedBatchNormArgs(context, fusion_args.epsilon,
                                              &fused_batch_norm_args));
      }
    }

    LaunchFusedConv2DWithOutputKernel<T> conv2d(
        dimensions.stride_rows, dimensions.stride_cols,
        dimensions.dilation_rows, dimensions.dilation_cols, params.padding,
        params.explicit_paddings);

    switch (fusion) {
      case FusedComputationType::kUndefined:
        OP_REQUIRES_OK(context, errors::Internal("Fusion type is undefined"));
        break;
      case FusedComputationType::kBiasAdd:
        conv2d(WithBiasAdd<T>(bias_add_args), context, input, filter, output);
        break;
      case FusedComputationType::kBiasAddWithRelu:
        conv2d(WithBiasAddAndRelu<T>(bias_add_args), context, input, filter,
               output);
        break;
      case FusedComputationType::kBiasAddWithRelu6:
        conv2d(WithBiasAddAndRelu6<T>(bias_add_args), context, input, filter,
               output);
        break;
      case FusedComputationType::kBiasAddWithLeakyRelu:
        conv2d(WithBiasAddAndLeakyRelu<T>(bias_add_args), context, input,
               filter, output);
        break;
      case FusedComputationType::kBiasAddWithElu:
        conv2d(WithBiasAddAndElu<T>(bias_add_args), context, input, filter,
               output);
        break;
      case FusedComputationType::kFusedBatchNorm:
        conv2d(
            WithFusedBatchNorm<T>(fusion_args.epsilon, fused_batch_norm_args),
            context, input, filter, output);
        break;
      case FusedComputationType::kFusedBatchNormWithRelu:
        conv2d(WithFusedBatchNormAndRelu<T>(fusion_args.epsilon,
                                            fused_batch_norm_args),
               context, input, filter, output);
        break;
      case FusedComputationType::kFusedBatchNormWithRelu6:
        conv2d(WithFusedBatchNormAndRelu6<T>(fusion_args.epsilon,
                                             fused_batch_norm_args),
               context, input, filter, output);
        break;
      case FusedComputationType::kFusedBatchNormWithLeakyRelu:
        conv2d(WithFusedBatchNormAndLeakyRelu<T>(fusion_args.epsilon,
                                                 fused_batch_norm_args),
               context, input, filter, output);
        break;
      case FusedComputationType::kFusedBatchNormWithElu:
        conv2d(WithFusedBatchNormAndElu<T>(fusion_args.epsilon,
                                           fused_batch_norm_args),
               context, input, filter, output);
        break;
      default:
        OP_REQUIRES_OK(context, errors::Internal("Fusion type is unsupported"));
        break;
    }
  }
};

template <>
struct LaunchFusedConv2DOp<CPUDevice, int8>;

template <>
struct LaunchFusedConv2DOp<CPUDevice, qint8>;

#if GOOGLE_CUDA

inline int64_t ConvolveScratchSize() {
  static int64_t convolve_scratch_size = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );
  return convolve_scratch_size;
}
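
// For example (assuming the usual GetDnnWorkspaceLimit semantics): setting
// TF_CUDNN_WORKSPACE_LIMIT_IN_MB=512 caps the cuDNN scratch space at 512 MiB,
// while leaving the variable unset keeps the 4GB default above.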

template <typename T>
struct LaunchFusedConv2DOp<GPUDevice, T> {
  void operator()(OpKernelContext* context, bool use_cudnn,
                  bool cudnn_use_autotune, const Tensor& input_param,
                  const Tensor& filter, FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output) {
    OP_REQUIRES(
        context,
        params.data_format == FORMAT_NHWC || params.data_format == FORMAT_NCHW,
        errors::Unimplemented("Fused conv implementation only supports "
                              "NHWC and NCHW tensor formats for now."));

    auto* stream = context->op_device_context()->stream();
    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
    OP_REQUIRES(
        context, use_cudnn,
        errors::Unimplemented("FusedConv2D for GPU is not currently supported "
                              "without cudnn"));

    bool is_supported_activation =
        fusion == FusedComputationType::kBiasAddWithRelu ||
        fusion == FusedComputationType::kBiasAddWithRelu6 ||
        fusion == FusedComputationType::kBiasAddWithElu ||
        fusion == FusedComputationType::kBiasAddWithLeakyRelu;
    OP_REQUIRES(
        context, is_supported_activation,
        errors::Unimplemented("FusedConv2D implementation only supports "
                              "fusing with `BiasAdd + Relu|Relu6|Elu|LeakyRelu`"
                              " for now."));

    Tensor input = input_param;

    const int64_t in_batch = GetTensorDim(input, params.data_format, 'N');
    int64_t in_rows = GetTensorDim(input, params.data_format, 'H');
    int64_t in_cols = GetTensorDim(input, params.data_format, 'W');
    const int64_t in_depths = GetTensorDim(input, params.data_format, 'C');

    const int64_t patch_rows = filter.dim_size(0);
    const int64_t patch_cols = filter.dim_size(1);
    const int64_t patch_depths = filter.dim_size(2);

    const int64_t out_batch = GetTensorDim(*output, params.data_format, 'N');
    const int64_t out_rows = GetTensorDim(*output, params.data_format, 'H');
    const int64_t out_cols = GetTensorDim(*output, params.data_format, 'W');
    const int64_t out_depths = GetTensorDim(*output, params.data_format, 'C');

    // Bias of the following dimensions: [ output_depth ]
    const Tensor& bias = context->input(2);
    OP_REQUIRES(context, bias.dims() == 1,
                errors::InvalidArgument("bias must be 1-dimensional",
                                        bias.shape().DebugString()));
    OP_REQUIRES(context, bias.dim_size(0) == out_depths,
                errors::InvalidArgument("bias depth must be equal to out depth",
                                        bias.shape().DebugString()));

    const int64_t common_padding_rows =
        std::min(dimensions.pad_rows_before, dimensions.pad_rows_after);
    const int64_t common_padding_cols =
        std::min(dimensions.pad_cols_before, dimensions.pad_cols_after);
    if (dimensions.pad_rows_before != dimensions.pad_rows_after ||
        dimensions.pad_cols_before != dimensions.pad_cols_after) {
      // cuDNN only supports padding the same amount on the left and right
      // sides, and on the top and bottom sides. So we manually create a new
      // padded input tensor such that we can pass it to cuDNN.

      // TODO(reedwm): In some cases, we can avoid an allocation even if the two
      // padding sides are different. For example, if the input is 2x2, the
      // filter is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the
      // result is equivalent to as if the padding is (1, 1, 1, 1). Changing the
      // padding in such a way would allow us to avoid the allocation.
      Tensor transformed_input;
      const int64_t padding_rows_diff =
          std::abs(dimensions.pad_rows_after - dimensions.pad_rows_before);
      const int64_t padding_cols_diff =
          std::abs(dimensions.pad_cols_after - dimensions.pad_cols_before);
      const int64_t new_in_rows = in_rows + padding_rows_diff;
      const int64_t new_in_cols = in_cols + padding_cols_diff;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(
                         DataTypeToEnum<T>::value,
                         ShapeFromFormat(params.data_format, in_batch,
                                         new_in_rows, new_in_cols, in_depths),
                         &transformed_input));
      const int64_t input_pad_top =
          dimensions.pad_rows_before - common_padding_rows;
      const int64_t input_pad_bottom =
          dimensions.pad_rows_after - common_padding_rows;
      const int64_t input_pad_left =
          dimensions.pad_cols_before - common_padding_cols;
      const int64_t input_pad_right =
          dimensions.pad_cols_after - common_padding_cols;
      bool in_bounds =
          FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
          FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
          FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
          FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
      if (!in_bounds) {
        context->SetStatus(errors::InvalidArgument("Padding is too large."));
        return;
      }
      functor::PadInput<GPUDevice, T, int, 4>()(
          context->eigen_device<GPUDevice>(),
          To32Bit(input_param.tensor<T, 4>()),
          {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
          {{static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_right)}},
          To32Bit(transformed_input.tensor<T, 4>()), params.data_format, T{});
      input = transformed_input;
      in_rows = new_in_rows;
      in_cols = new_in_cols;
    }

    const bool compute_in_nhwc = DataTypeToEnum<T>::value == DT_HALF &&
                                 stream->GetCudaComputeCapability().IsAtLeast(
                                     se::CudaComputeCapability::VOLTA);
    if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) {
      // Convert the input tensor from NHWC to NCHW.
      TensorShape nchw_shape =
          ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
      if (in_depths > 1) {
        Tensor transformed_input;
        OP_REQUIRES_OK(context,
                       context->allocate_temp(DataTypeToEnum<T>::value,
                                              nchw_shape, &transformed_input));
        functor::NHWCToNCHW<GPUDevice, T, 4>()(
            context->eigen_device<GPUDevice>(),
            const_cast<const Tensor&>(input).tensor<T, 4>(),
            transformed_input.tensor<T, 4>());
        input = transformed_input;
      } else {
        // If depth <= 1, then just reshape.
        CHECK(input.CopyFrom(input, nchw_shape));  // Crash OK
      }
    }

    CHECK(common_padding_rows >= 0) << "Negative padding rows";  // Crash OK
    CHECK(common_padding_cols >= 0) << "Negative padding cols";  // Crash OK

    se::dnn::ActivationMode dnn_activation_mode;
    switch (fusion) {
      case FusedComputationType::kBiasAddWithRelu:
        dnn_activation_mode = se::dnn::ActivationMode::kRelu;
        break;
      case FusedComputationType::kBiasAddWithRelu6:
        dnn_activation_mode = se::dnn::ActivationMode::kRelu6;
        break;
      case FusedComputationType::kBiasAddWithElu:
        dnn_activation_mode = se::dnn::ActivationMode::kElu;
        break;
      case FusedComputationType::kBiasAddWithLeakyRelu:
        dnn_activation_mode = se::dnn::ActivationMode::kLeakyRelu;
        break;
      default:
        LOG(FATAL) << "Unsupported fusion type";  // Crash OK
    }

    const TensorFormat compute_data_format =
        compute_in_nhwc ? FORMAT_NHWC : FORMAT_NCHW;
    constexpr auto kComputeInNHWC =
        std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
                        se::dnn::FilterLayout::kOutputYXInput);
    constexpr auto kComputeInNCHW =
        std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
                        se::dnn::FilterLayout::kOutputInputYX);
    se::dnn::DataLayout compute_data_layout;
    se::dnn::FilterLayout filter_layout;
    std::tie(compute_data_layout, filter_layout) =
        compute_in_nhwc ? kComputeInNHWC : kComputeInNCHW;

    se::dnn::BatchDescriptor input_desc;
    input_desc.set_count(in_batch)
        .set_feature_map_count(in_depths)
        .set_height(in_rows)
        .set_width(in_cols)
        .set_layout(compute_data_layout);
    se::dnn::FilterDescriptor filter_desc;
    filter_desc.set_input_filter_height(patch_rows)
        .set_input_filter_width(patch_cols)
        .set_input_feature_map_count(patch_depths)
        .set_output_feature_map_count(filter.dim_size(3))
        .set_layout(filter_layout);
    se::dnn::BatchDescriptor bias_desc;
    bias_desc.set_count(1)
        .set_height(1)
        .set_width(1)
        .set_feature_map_count(out_depths)
        .set_layout(compute_data_layout);
    se::dnn::ConvolutionDescriptor conv_desc;
    conv_desc.set_vertical_dilation_rate(dimensions.dilation_rows)
        .set_horizontal_dilation_rate(dimensions.dilation_cols)
        .set_vertical_filter_stride(dimensions.stride_rows)
        .set_horizontal_filter_stride(dimensions.stride_cols)
        .set_zero_padding_height(common_padding_rows)
        .set_zero_padding_width(common_padding_cols)
        .set_group_count(in_depths / patch_depths);
    se::dnn::BatchDescriptor output_desc;
    output_desc.set_count(out_batch)
        .set_height(out_rows)
        .set_width(out_cols)
        .set_feature_map_count(out_depths)
        .set_layout(compute_data_layout);

    Tensor transformed_filter;
    const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
      VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
              << " to " << ToString(dst_format);

      TensorShape dst_shape =
          dst_format == FORMAT_OIHW
              ? TensorShape({filter.dim_size(3), filter.dim_size(2),
                             filter.dim_size(0), filter.dim_size(1)})
              : TensorShape({filter.dim_size(3), filter.dim_size(0),
                             filter.dim_size(1), filter.dim_size(2)});

      TF_RETURN_IF_ERROR(context->allocate_temp(
          DataTypeToEnum<T>::value, dst_shape, &transformed_filter));
      functor::TransformFilter<GPUDevice, T, int, 4>()(
          context->eigen_device<GPUDevice>(), dst_format,
          To32Bit(filter.tensor<T, 4>()),
          To32Bit(transformed_filter.tensor<T, 4>()));

      return OkStatus();
    };
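
    // For example (dimension names illustrative): with compute_in_nhwc the
    // HWIO filter [Kh, Kw, Ci, Co] is rearranged to OHWI [Co, Kh, Kw, Ci];
    // otherwise to OIHW [Co, Ci, Kh, Kw], matching the filter_layout selected
    // above.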

    if (compute_in_nhwc) {
      OP_REQUIRES_OK(context, transform_filter(FORMAT_OHWI));
    } else {
      OP_REQUIRES_OK(context, transform_filter(FORMAT_OIHW));
    }

    Tensor transformed_output;
    if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) {
      // Only allocate temporary memory when a layout transformation is needed.
      OP_REQUIRES_OK(context,
                     context->allocate_temp(
                         DataTypeToEnum<T>::value,
                         ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
                                         out_cols, out_depths),
                         &transformed_output));
    } else {
      transformed_output = *output;
    }

    const auto tensor_on_device = [](const Tensor& t) -> se::DeviceMemory<T> {
      return AsDeviceMemory(t.template flat<T>().data(),
                            t.template flat<T>().size());
    };

    se::DeviceMemory<T> input_ptr = tensor_on_device(input);
    se::DeviceMemory<T> filter_ptr = tensor_on_device(transformed_filter);
    se::DeviceMemory<T> bias_ptr = tensor_on_device(bias);
    se::DeviceMemory<T> output_ptr = tensor_on_device(transformed_output);

    // We do not use side inputs, so we can safely pass nullptr.
    se::DeviceMemory<T> side_input_ptr =
        AsDeviceMemory(static_cast<T*>(nullptr), 0);

    constexpr double kConvScale = 1.0;
    constexpr double kSideInputScale = 0.0;
    double leakyrelu_alpha = fusion_args.leakyrelu_alpha;

    int device_id = stream->parent()->device_ordinal();
    DataType dtype = input.dtype();
    ConvParameters conv_parameters = {
        in_batch,                      // batch
        in_depths,                     // in_depths
        {{in_rows,                     // in_rows
          in_cols}},                   // in_cols
        compute_data_format,           // compute_data_format
        out_depths,                    // out_depths
        {{patch_rows,                  // filter_rows
          patch_cols,                  // filter_cols
          patch_depths}},              // filter_depths
        {{dimensions.dilation_rows,    // dilation_rows
          dimensions.dilation_cols}},  // dilation_cols
        {{dimensions.stride_rows,      // stride_rows
          dimensions.stride_cols}},    // stride_cols
        {{common_padding_rows,         // padding_rows
          common_padding_cols}},       // padding_cols
        dtype,                         // tensor datatype
        device_id,                     // device_id
        conv_desc.group_count(),
        ConvParameters::FusionInfo{kConvScale, kSideInputScale, leakyrelu_alpha,
                                   dnn_activation_mode,  // activation_mode
                                   /*is_contrib=*/false}};

    se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;

    auto entry_or = AutotuneFusedConv<T>(
        cudnn_use_autotune, FusedConvAutotuneMap::GetInstance(),
        conv_parameters, context, input_desc, filter_desc, bias_desc,
        output_desc, conv_desc, dnn_activation_mode, kConvScale,
        kSideInputScale, leakyrelu_alpha, input_ptr, filter_ptr, output_ptr,
        bias_ptr, side_input_ptr, ConvolveScratchSize());
    OP_REQUIRES_OK(context, entry_or.status());
    auto autotune_entry = std::move(entry_or).value();

    DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
    Status cudnn_launch_status;
    if (!autotune_entry.is_algorithm_config()) {
      auto& runners = autotune_entry.GetOpRunners();
      se::dnn::FusedConvOp::Config config{se::dnn::ConvolutionKind::FORWARD,
                                          element_type,
                                          element_type,
                                          element_type,
                                          kConvScale,
                                          kSideInputScale,
                                          leakyrelu_alpha,
                                          input_desc,
                                          filter_desc,
                                          bias_desc,
                                          output_desc,
                                          conv_desc,
                                          dnn_activation_mode};
      auto primary_or = runners.primary->GetOrCreateRunner(config, stream);
      OP_REQUIRES_OK(context, primary_or.status());
      auto* primary = primary_or.value();

      const se::dnn::FusedConvRunner* no_scratch_fallback = nullptr;
      if (runners.no_scratch_fallback) {
        auto no_scratch_fallback_or =
            runners.no_scratch_fallback->GetOrCreateRunner(config, stream);
        OP_REQUIRES_OK(context, no_scratch_fallback_or.status());
        no_scratch_fallback = no_scratch_fallback_or.value();
      }

      auto runner_and_scratch_or =
          AllocateScratchOrFallback<se::dnn::FusedConvOp::Signature>(
              &scratch_allocator, primary, no_scratch_fallback);
      OP_REQUIRES_OK(context, runner_and_scratch_or.status());
      auto runner_and_scratch = std::move(runner_and_scratch_or).value();
      auto& runner =
          *std::get<const se::dnn::FusedConvRunner*>(runner_and_scratch);
      cudnn_launch_status = runner(
          stream, nullptr, std::get<se::DeviceMemoryBase>(runner_and_scratch),
          input_ptr, filter_ptr, side_input_ptr, bias_ptr, output_ptr);
    } else {
      cudnn_launch_status = stream->FusedConvolveWithAlgorithm(
          input_desc, input_ptr,            // input
          kConvScale,                       // input_scale
          filter_desc, filter_ptr,          // filter
          conv_desc,                        // conv
          side_input_ptr, kSideInputScale,  // side_input
          bias_desc, bias_ptr,              // bias
          dnn_activation_mode,              // activation
          output_desc, &output_ptr,         // output
          &scratch_allocator, autotune_entry.GetAlgorithmConfig(), nullptr);
    }

    OP_REQUIRES_OK(context, cudnn_launch_status);

    // Convert the output tensor back from NCHW to NHWC.
    if (!compute_in_nhwc && params.data_format == FORMAT_NHWC) {
      functor::NCHWToNHWC<GPUDevice, T, 4>()(
          context->eigen_device<GPUDevice>(),
          const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
          output->tensor<T, 4>());
    }
  }
};

template <>
struct LaunchFusedConv2DOp<GPUDevice, int8>;

template <>
struct LaunchFusedConv2DOp<GPUDevice, qint8>;

#endif  // GOOGLE_CUDA

template <typename Device, typename T>
class FusedConv2DOp : public OpKernel {
 public:
  explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));

    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    cudnn_use_autotune_ = CudnnUseAutotune();

    using FCT = FusedComputationType;

    std::vector<FusedComputationPattern> patterns;
    if (std::is_same<Device, CPUDevice>::value) {
      patterns = {
          {FCT::kBiasAdd, {"BiasAdd"}},
          {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}},
          {FCT::kBiasAddWithRelu6, {"BiasAdd", "Relu6"}},
          {FCT::kBiasAddWithElu, {"BiasAdd", "Elu"}},
          {FCT::kBiasAddWithLeakyRelu, {"BiasAdd", "LeakyRelu"}},
          {FCT::kFusedBatchNorm, {"FusedBatchNorm"}},
          {FCT::kFusedBatchNormWithRelu, {"FusedBatchNorm", "Relu"}},
          {FCT::kFusedBatchNormWithRelu6, {"FusedBatchNorm", "Relu6"}},
          {FCT::kFusedBatchNormWithElu, {"FusedBatchNorm", "Elu"}},
          {FCT::kFusedBatchNormWithLeakyRelu, {"FusedBatchNorm", "LeakyRelu"}},
      };
    }

    // NOTE(ezhulenev): CuDNN's `cudnnConvolutionBiasActivationForward`
    // nominally supports an identity activation function, which in theory
    // should allow fusing a convolution with just BiasAdd. In practice it does
    // not work: cuDNN ignores this parameter and always applies Relu.
    if (std::is_same<Device, GPUDevice>::value) {
      if (std::is_same<T, int8>::value || std::is_same<T, qint8>::value) {
        patterns = {{FCT::kBiasAdd, {"BiasAdd"}},
                    {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}};
      } else {
        patterns = {
            {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}},
            {FCT::kBiasAddWithRelu6, {"BiasAdd", "Relu6"}},
            {FCT::kBiasAddWithElu, {"BiasAdd", "Elu"}},
            {FCT::kBiasAddWithLeakyRelu, {"BiasAdd", "LeakyRelu"}},
        };
      }
    }

    OP_REQUIRES_OK(context, InitializeFusedComputation(
                                context, "Conv2D", patterns,
                                &fused_computation_, &fused_computation_args_));
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth]
    const Tensor& filter = context->input(1);

    Conv2DDimensions dimensions;
    OP_REQUIRES_OK(context,
                   ComputeConv2DDimension(params_, input, filter, &dimensions));

    TensorShape out_shape = ShapeFromFormat(
        params_.data_format, dimensions.batch, dimensions.out_rows,
        dimensions.out_cols, dimensions.out_depth);

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "FusedConv2D: in_depth = " << dimensions.in_depth
            << ", patch_depth = " << dimensions.patch_depth
            << ", input_cols = " << dimensions.input_cols
            << ", filter_cols = " << dimensions.filter_cols
            << ", input_rows = " << dimensions.input_rows
            << ", filter_rows = " << dimensions.filter_rows
            << ", stride_rows = " << dimensions.stride_rows
            << ", stride_cols = " << dimensions.stride_cols
            << ", dilation_rows = " << dimensions.dilation_rows
            << ", dilation_cols = " << dimensions.dilation_cols
            << ", out_depth = " << dimensions.out_depth;

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

    LaunchFusedConv2DOp<Device, T>()(context, use_cudnn_, cudnn_use_autotune_,
                                     input, filter, fused_computation_,
                                     fused_computation_args_, params_,
                                     dimensions, output);
  }

 private:
  Conv2DParameters params_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;

  FusedComputationType fused_computation_ = FusedComputationType::kUndefined;
  FusedComputationArgs fused_computation_args_;

  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
};

// Registration of the CPU implementations.
#define REGISTER_FUSED_CPU_CONV2D(T)                                  \
  REGISTER_KERNEL_BUILDER(                                            \
      Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      FusedConv2DOp<CPUDevice, T>);
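
// Example use of the registration macro above (for illustration only; the
// actual instantiations live in separate per-dtype .cc translation units, not
// in this header):
//
//   REGISTER_FUSED_CPU_CONV2D(float);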

#if GOOGLE_CUDA

#define DECLARE_FUNCTOR_GPU_SPEC(T)                                     \
  template <>                                                           \
  void TransformFilter<GPUDevice, T, int, 4>::operator()(               \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,         \
      typename TTypes<T, 4, int>::ConstTensor in,                       \
      typename TTypes<T, 4, int>::Tensor out);                          \
  extern template struct TransformFilter<GPUDevice, T, int, 4>;         \
  template <>                                                           \
  void PadInput<GPUDevice, T, int, 4>::operator()(                      \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,   \
      const std::array<int, 2>& padding_left,                           \
      const std::array<int, 2>& padding_right,                          \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
      const T& padding_value);                                          \
  extern template struct PadInput<GPUDevice, T, int, 4>

// Registration of the GPU implementations.
#define REGISTER_FUSED_GPU_CONV2D(T)                      \
  REGISTER_KERNEL_BUILDER(Name("_FusedConv2D")            \
                              .Device(DEVICE_GPU)         \
                              .TypeConstraint<T>("T")     \
                              .HostMemory("host_args"),   \
                          FusedConv2DOp<GPUDevice, T>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_