/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include <utility>

#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_3d.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/profiler/lib/scoped_annotation.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/proto/proto_utils.h"
using stream_executor::dnn::DimIndex;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.h"
#include "tensorflow/compiler/xla/stream_executor/tf_allocator_adapter.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
struct LaunchConvOp;

template <typename T>
struct LaunchConvOp<CPUDevice, T> {
  static void launch(OpKernelContext* context, bool cudnn_use_autotune,
                     const Tensor& input, const Tensor& filter,
                     const std::array<int64, 3>& dilations,
                     const std::array<int64, 3>& strides, const Padding padding,
                     TensorFormat data_format, Tensor* output) {
    OP_REQUIRES(context, data_format == FORMAT_NHWC,
                errors::InvalidArgument("CPU implementation of Conv3D "
                                        "currently only supports the NHWC "
                                        "tensor format."));
    OP_REQUIRES(context,
                dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1,
                errors::InvalidArgument("CPU implementation of Conv3D "
                                        "currently only supports dilated rates "
                                        "of 1."));
    OP_REQUIRES(context, filter.dim_size(3) == input.dim_size(input.dims() - 1),
                errors::InvalidArgument(
                    "Number of channels in filter (", filter.dim_size(3),
                    ") must match last dimension of input (",
                    input.dim_size(input.dims() - 1), ")"));
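    // `strides` and `dilations` are ordered {plane, row, col} (the "z, y, x"
    // order assembled in Conv3DOp::Compute() below); note that the
    // CuboidConvolution call below consumes the stride entries in reverse
    // index order.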
    functor::CuboidConvolution<CPUDevice, T>()(
        context->eigen_device<CPUDevice>(), output->tensor<T, 5>(),
        input.tensor<T, 5>(), filter.tensor<T, 5>(), strides[2], strides[1],
        strides[0], BrainPadding2EigenPadding(padding));
  }
};

template <typename Device, typename T>
class Conv3DOp : public BinaryOp<T> {
 public:
  explicit Conv3DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 5,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES(
        context,
        (GetTensorDim(stride_, data_format_, 'N') == 1 &&
         GetTensorDim(stride_, data_format_, 'C') == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES(
        context,
        (GetTensorDim(stride_, data_format_, '0') > 0 &&
         GetTensorDim(stride_, data_format_, '1') > 0 &&
         GetTensorDim(stride_, data_format_, '2') > 0),
        errors::InvalidArgument("Spatial strides should be larger than 0."));
    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_));
    OP_REQUIRES(context, dilation_.size() == 5,
                errors::InvalidArgument("Dilation rates field must "
                                        "specify 5 dimensions"));
    OP_REQUIRES(context,
                (GetTensorDim(dilation_, data_format_, 'N') == 1 &&
                 GetTensorDim(dilation_, data_format_, 'C') == 1),
                errors::InvalidArgument(
                    "Current implementation does not yet support "
                    "dilation rates in the batch and depth dimensions."));
    OP_REQUIRES(
        context,
        (GetTensorDim(dilation_, data_format_, '0') > 0 &&
         GetTensorDim(dilation_, data_format_, '1') > 0 &&
         GetTensorDim(dilation_, data_format_, '2') > 0),
        errors::InvalidArgument("Dilated rates should be larger than 0."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    cudnn_use_autotune_ = CudnnUseAutotune();
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_z, in_y, in_x, in_channels ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_z, filter_y, filter_x, in_channels, out_channels]
    const Tensor& filter = context->input(1);

    // NOTE: The ordering of the spatial dimensions is arbitrary, but has to be
    // kept consistent between input/filter/output.
    OP_REQUIRES(context, input.dims() == 5,
                errors::InvalidArgument("input must be 5-dimensional"));
    OP_REQUIRES(context, filter.dims() == 5,
                errors::InvalidArgument("filter must be 5-dimensional"));

    const int64_t in_depth = GetTensorDim(input, data_format_, 'C');
    const int64_t in_batch = GetTensorDim(input, data_format_, 'N');

    const int64_t filter_depth = filter.dim_size(3);
    const int64_t out_depth = filter.dim_size(4);

    OP_REQUIRES(context, filter_depth != 0,
                errors::InvalidArgument("filter_depth must be non-zero"));
    OP_REQUIRES(context, in_depth % filter_depth == 0,
                errors::InvalidArgument(
                    "Input depth must be evenly divisible by filter depth: ",
                    in_depth, " vs ", filter_depth));
    OP_REQUIRES(
        context, filter.NumElements() > 0,
        errors::InvalidArgument("filter must not have zero elements "
                                "(i.e. all dimensions must be non-zero)"));

    // Dimension order for these arrays is: z, y, x.
    std::array<int64_t, 3> input_size = {
        {GetTensorDim(input, data_format_, '0'),
         GetTensorDim(input, data_format_, '1'),
         GetTensorDim(input, data_format_, '2')}};
    std::array<int64_t, 3> filter_size = {
        {filter.dim_size(0), filter.dim_size(1), filter.dim_size(2)}};
    std::array<int64_t, 3> dilations = {
        {GetTensorDim(dilation_, data_format_, '0'),
         GetTensorDim(dilation_, data_format_, '1'),
         GetTensorDim(dilation_, data_format_, '2')}};
    std::array<int64_t, 3> strides = {
        {GetTensorDim(stride_, data_format_, '0'),
         GetTensorDim(stride_, data_format_, '1'),
         GetTensorDim(stride_, data_format_, '2')}};
    std::array<int64_t, 3> out, padding;

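    // Get3dOutputSizeV2 computes, for each spatial dimension i:
    //   SAME:  out[i] = ceil(input_size[i] / strides[i])
    //   VALID: out[i] = ceil((input_size[i] -
    //                         (filter_size[i] - 1) * dilations[i]) / strides[i])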
    OP_REQUIRES_OK(
        context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides,
                                   padding_, &out, &padding));
    TensorShape out_shape = ShapeFromFormat(
        data_format_, in_batch, {{out[0], out[1], out[2]}}, out_depth);
    Tensor* output;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    // Return early if nothing to do.
    if (out_shape.num_elements() == 0) return;

    LaunchConvOp<Device, T>::launch(context, cudnn_use_autotune_, input, filter,
                                    dilations, strides, padding_, data_format_,
                                    output);
  }

 private:
  std::vector<int32> dilation_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool cudnn_use_autotune_;
};

#define REGISTER_CPU_KERNEL(T)                                  \
  REGISTER_KERNEL_BUILDER(                                      \
      Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      Conv3DOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// A dummy type to group forward convolution autotune results together.
struct Conv3dAutotuneGroup {
  static string name() { return "Conv3d"; }
};

typedef AutotuneSingleton<Conv3dAutotuneGroup, ConvParameters,
                          AutotuneEntry<se::dnn::ConvOp>>
    AutotuneConv3d;

// TODO(mjanusz): Share logic with 2d implementation as much as possible.
template <typename T>
struct LaunchConvOp<GPUDevice, T> {
  static void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
                     const Tensor& input_param, const Tensor& filter,
                     const std::array<int64, 3>& dilations,
                     const std::array<int64, 3>& strides, const Padding padding,
                     TensorFormat data_format, Tensor* output) {
    auto* stream = ctx->op_device_context()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

    Tensor input = input_param;

    const int64_t in_batch = GetTensorDim(input, data_format, 'N');
    int64_t in_planes = GetTensorDim(input, data_format, '0');
    int64_t in_rows = GetTensorDim(input, data_format, '1');
    int64_t in_cols = GetTensorDim(input, data_format, '2');
    const int64_t in_depth = GetTensorDim(input, data_format, 'C');

    const int64_t filter_planes = filter.dim_size(0);
    const int64_t filter_rows = filter.dim_size(1);
    const int64_t filter_cols = filter.dim_size(2);
    const int64_t filter_depth = filter.dim_size(3);
    const int64_t out_depth = filter.dim_size(4);

    int64_t pad_planes = 0, pad_rows = 0, pad_cols = 0;
    int64_t out_planes = GetTensorDim(*output, data_format, '0');
    int64_t out_rows = GetTensorDim(*output, data_format, '1');
    int64_t out_cols = GetTensorDim(*output, data_format, '2');

    if (padding == Padding::SAME) {
      pad_planes = std::max<int64_t>(
          0, (out_planes - 1) * strides[0] + filter_planes - in_planes);
      pad_rows = std::max<int64_t>(
          0, (out_rows - 1) * strides[1] + filter_rows - in_rows);
      pad_cols = std::max<int64_t>(
          0, (out_cols - 1) * strides[2] + filter_cols - in_cols);
    }
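    // Total SAME padding per dimension is the amount needed for the strided
    // windows to cover the input exactly. For example, with in_rows = 5,
    // filter_rows = 3 and strides[1] = 2: out_rows = ceil(5 / 2) = 3 and
    // pad_rows = max(0, (3 - 1) * 2 + 3 - 5) = 2, i.e. one row on each side.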

    bool is_grouped_convolution = filter_depth != in_depth;

    // NOTE: This only works in NHWC.
    if (!is_grouped_convolution && filter_planes == 1 && filter_rows == 1 &&
        filter_cols == 1 && dilations[0] == 1 && dilations[1] == 1 &&
        dilations[2] == 1 && strides[0] == 1 && strides[1] == 1 &&
        strides[2] == 1 && data_format == FORMAT_NHWC) {
      // 1x1 filter, so call cublas directly.
      const uint64 m = in_batch * in_planes * in_rows * in_cols;
      const uint64 k = in_depth;
      const uint64 n = out_depth;
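      // With a 1x1x1 filter and unit strides/dilations, every output voxel is
      // just its length-k input channel vector times the [k, n] filter matrix,
      // so the whole op is one [m, k] x [k, n] GEMM over the
      // m = batch * planes * rows * cols spatial positions.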

      auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                  input.template flat<T>().size());
      auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                  filter.template flat<T>().size());
      auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                  output->template flat<T>().size());

      auto no_transpose = se::blas::Transpose::kNoTranspose;
      OP_REQUIRES_OK(
          ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr,
                                    n, a_ptr, k, &c_ptr, n,
                                    se::blas::kDefaultComputePrecision));
      return;
    } else if (!is_grouped_convolution && filter_planes == in_planes &&
               filter_rows == in_rows && filter_cols == in_cols &&
               padding == Padding::VALID && data_format == FORMAT_NHWC) {
      // The input data and filter have the same planes/height/width, so call
      // cublas directly.
      const uint64 m = in_batch;
      const uint64 k = in_planes * in_rows * in_cols * in_depth;
      const uint64 n = out_depth;
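      // Here the filter covers the entire (VALID) input volume, so each batch
      // element yields a single output position: an [m, k] x [k, n] GEMM with
      // m = batch and k = the full per-example input volume.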

      auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                  input.template flat<T>().size());
      auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                  filter.template flat<T>().size());
      auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                  output->template flat<T>().size());

      auto no_transpose = se::blas::Transpose::kNoTranspose;
      OP_REQUIRES_OK(
          ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr,
                                    n, a_ptr, k, &c_ptr, n,
                                    se::blas::kDefaultComputePrecision));
      return;
    }

    if (padding == Padding::SAME) {
      const bool rows_odd = (pad_rows % 2 != 0);
      const bool cols_odd = (pad_cols % 2 != 0);
      const bool planes_odd = (pad_planes % 2 != 0);

      // Necessary because cuDNN only supports symmetric padding.
      // TODO(mjanusz): Consider making this optional? This would save some
      // overhead and would work as long as an op trained this way is only
      // used on GPU.
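      // For example, in_rows = 4, filter_rows = 2, stride 1 gives out_rows = 4
      // and pad_rows = 1: the input is explicitly padded by one trailing row
      // here, and pad_rows / 2 == 0 is what gets passed to cuDNN below.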
      if (rows_odd || cols_odd || planes_odd) {
        const int64_t new_in_rows = in_rows + rows_odd;
        const int64_t new_in_cols = in_cols + cols_odd;
        const int64_t new_in_planes = in_planes + planes_odd;

        Tensor transformed_input;
        TensorShape transformed_shape = ShapeFromFormat(
            data_format, in_batch, {{new_in_planes, new_in_rows, new_in_cols}},
            in_depth);
        OP_REQUIRES_OK(
            ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, transformed_shape,
                                    &transformed_input));

        functor::PadInput<GPUDevice, T, int, 5>()(
            ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
            {{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
            To32Bit(transformed_input.tensor<T, 5>()), data_format, T{});
        input = transformed_input;
        in_rows = new_in_rows;
        in_cols = new_in_cols;
        in_planes = new_in_planes;
      }
    }

#if GOOGLE_CUDA
    const bool compute_in_nhwc =
        CUDNN_VERSION >= 8000 && DataTypeToEnum<T>::value == DT_HALF;
#else
    // fast NHWC implementation is a CUDA only feature
    const bool compute_in_nhwc = false;
#endif
    const TensorFormat compute_data_format =
        (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
                                                        : FORMAT_NCHW;

    VLOG(3) << "Compute Conv3D with cuDNN:"
            << " data_format=" << ToString(data_format)
            << " compute_data_format=" << ToString(compute_data_format);

    if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
      VLOG(4) << "Convert the input tensor from NDHWC to NCDHW.";
      const TensorShape nchw_shape = ShapeFromFormat(
          FORMAT_NCHW, in_batch, {{in_planes, in_rows, in_cols}}, in_depth);
      if (in_depth > 1) {
        Tensor transformed_input;
        OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                               nchw_shape, &transformed_input));
        // input: [b, x, y, z, d]
        // t_input: [b, d, x, y, z]
        // NCDHW is the only format universally supported by cuDNN.
        functor::NHWCToNCHW<GPUDevice, T, 5>()(
            ctx->eigen_device<GPUDevice>(),
            const_cast<const Tensor&>(input).tensor<T, 5>(),
            transformed_input.tensor<T, 5>());
        input = transformed_input;
      } else {
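        // With a single channel, NDHWC and NCDHW describe the same memory
        // layout, so reinterpreting the buffer with the NCDHW shape suffices.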
        CHECK(input.CopyFrom(input, nchw_shape));
      }
    } else {
      CHECK(data_format == compute_data_format)  // Crash OK
          << "Illegal data and compute format pair:"
          << " data_format=" << ToString(data_format)
          << " compute_data_format=" << ToString(compute_data_format);
    }

    constexpr auto kComputeInNHWC =
        std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
                        se::dnn::FilterLayout::kOutputYXInput);
    constexpr auto kComputeInNCHW =
        std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
                        se::dnn::FilterLayout::kOutputInputYX);

    se::dnn::DataLayout compute_data_layout;
    se::dnn::FilterLayout filter_layout;

    std::tie(compute_data_layout, filter_layout) =
        compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;

    CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0)
        << "Negative paddings: (" << pad_rows << ", " << pad_cols << ", "
        << pad_planes << ")";
    se::dnn::BatchDescriptor input_desc(3);
    input_desc.set_count(in_batch)
        .set_feature_map_count(in_depth)
        .set_spatial_dim(DimIndex::X, in_cols)
        .set_spatial_dim(DimIndex::Y, in_rows)
        .set_spatial_dim(DimIndex::Z, in_planes)
        .set_layout(compute_data_layout);
    se::dnn::BatchDescriptor output_desc(3);
    output_desc.set_count(in_batch)
        .set_spatial_dim(DimIndex::X, out_cols)
        .set_spatial_dim(DimIndex::Y, out_rows)
        .set_spatial_dim(DimIndex::Z, out_planes)
        .set_feature_map_count(out_depth)
        .set_layout(compute_data_layout);
    se::dnn::FilterDescriptor filter_desc(3);
    filter_desc.set_spatial_dim(DimIndex::X, filter_cols)
        .set_spatial_dim(DimIndex::Y, filter_rows)
        .set_spatial_dim(DimIndex::Z, filter_planes)
        .set_input_feature_map_count(filter_depth)
        .set_output_feature_map_count(out_depth)
        .set_layout(filter_layout);
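    // Any odd SAME padding was already folded into the explicit PadInput
    // above, so the symmetric pad / 2 values handed to cuDNN here are exact.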
    se::dnn::ConvolutionDescriptor conv_desc(3);
    conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
        .set_dilation_rate(DimIndex::Y, dilations[1])
        .set_dilation_rate(DimIndex::Z, dilations[0])
        .set_filter_stride(DimIndex::X, strides[2])
        .set_filter_stride(DimIndex::Y, strides[1])
        .set_filter_stride(DimIndex::Z, strides[0])
        .set_zero_padding(DimIndex::X, pad_cols / 2)
        .set_zero_padding(DimIndex::Y, pad_rows / 2)
        .set_zero_padding(DimIndex::Z, pad_planes / 2)
        .set_group_count(in_depth / filter_depth);

    Tensor transformed_filter;
    auto dst_format =
        compute_data_format == FORMAT_NCHW ? FORMAT_OIHW : FORMAT_OHWI;
    VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
            << " to " << ToString(dst_format);
    TensorShape dst_shape =
        dst_format == FORMAT_OIHW
            ? TensorShape({filter.dim_size(4), filter.dim_size(3),
                           filter.dim_size(0), filter.dim_size(1),
                           filter.dim_size(2)})
            : TensorShape({filter.dim_size(4), filter.dim_size(0),
                           filter.dim_size(1), filter.dim_size(2),
                           filter.dim_size(3)});
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
                                           &transformed_filter));
    // filter: [x, y, z, in, out]
    // t_filter: [out, in, x, y, z] (NCDHW) or
    // t_filter: [out, x, y, z, in] (NDHWC)
    functor::TransformFilter<GPUDevice, T, int, 5>()(
        ctx->eigen_device<GPUDevice>(), dst_format,
        To32Bit(filter.tensor<T, 5>()),
        To32Bit(transformed_filter.tensor<T, 5>()));

    Tensor transformed_output;
    if (data_format != compute_data_format) {
      VLOG(4) << "Allocate temporary memory for output in compute data format";
      OP_REQUIRES_OK(
          ctx,
          ctx->allocate_temp(
              DataTypeToEnum<T>::value,
              ShapeFromFormat(FORMAT_NCHW, in_batch,
                              {{out_planes, out_rows, out_cols}}, out_depth),
              &transformed_output));
    } else {
      transformed_output = *output;
    }

    auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                    input.template flat<T>().size());
    auto filter_ptr =
        AsDeviceMemory(transformed_filter.template flat<T>().data(),
                       transformed_filter.template flat<T>().size());
    auto output_ptr =
        AsDeviceMemory(transformed_output.template flat<T>().data(),
                       transformed_output.template flat<T>().size());

    static int64_t ConvolveScratchSize = GetDnnWorkspaceLimitOrDefault();

    int device_id = stream->parent()->device_ordinal();
    DataType dtype = input.dtype();
    ConvParameters conv_parameters = {
        in_batch,
        in_depth,
        {{in_planes, in_rows, in_cols}},
        compute_data_format,
        out_depth,
        {{filter_planes, filter_rows, filter_cols}},
        {{dilations[0], dilations[1], dilations[2]}},
        {{strides[0], strides[1], strides[2]}},
        {{pad_planes, pad_rows, pad_cols}},
        dtype,
        device_id,
        conv_desc.group_count()};
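    // These parameters form the autotune cache key: convolutions with the same
    // shapes, strides, padding, dtype and device reuse the algorithm selected
    // by a previous autotuning pass instead of re-profiling.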

    using se::dnn::AlgorithmConfig;
    using se::dnn::AlgorithmDesc;
    using se::dnn::ProfileResult;

    auto config_or = AutotuneUnfusedConv(
        cudnn_use_autotune, AutotuneConv3d::GetInstance(), conv_parameters, ctx,
        se::dnn::ConvolutionKind::FORWARD, input_desc, input_ptr, filter_desc,
        filter_ptr, conv_desc, output_desc, output_ptr, ConvolveScratchSize);
    OP_REQUIRES_OK(ctx, config_or.status());
    auto autotune_entry = std::move(config_or).value();

    DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
    Status cudnn_launch_status = LaunchAutotunedConv(
        autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
        stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
        output_desc, output_ptr);
    if (!cudnn_launch_status.ok()) {
      ctx->SetStatus(cudnn_launch_status);
      return;
    }

    if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
      VLOG(4) << "Convert the output tensor back from NCDHW to NDHWC.";
      // t_output: [b, out, x, y, z]
      // output: [b, x, y, z, out]
      functor::NCHWToNHWC<GPUDevice, T, 5>()(
          ctx->eigen_device<GPUDevice>(),
          const_cast<const Tensor&>(transformed_output).tensor<T, 5>(),
          output->tensor<T, 5>());
    }
  }
};

// Forward declarations of the functor specializations for GPU.
// This ensures that the custom implementation is used instead of the default
// Eigen one (which is used for CPU).
namespace functor {
#define DECLARE_GPU_SPEC(T)                                           \
  template <>                                                         \
  void TransformFilter<GPUDevice, T, int, 5>::operator()(             \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,       \
      typename TTypes<T, 5, int>::ConstTensor in,                     \
      typename TTypes<T, 5, int>::Tensor out);                        \
  template <>                                                         \
  void ReverseTransformFilter<GPUDevice, T, 5>::operator()(           \
      const GPUDevice& d, FilterTensorFormat src_filter_format,       \
      typename TTypes<T, 5>::ConstTensor in,                          \
      typename TTypes<T, 5>::Tensor out);                             \
  template <>                                                         \
  void PadInput<GPUDevice, T, int, 5>::operator()(                    \
      const GPUDevice& d, typename TTypes<T, 5, int>::ConstTensor in, \
      const std::array<int, 3>& padding_left,                         \
      const std::array<int, 3>& padding_right,                        \
      typename TTypes<T, 5, int>::Tensor out, TensorFormat format,    \
      const T& padding_value);                                        \
  template <>                                                         \
  void NHWCToNCHW<GPUDevice, T, 5>::operator()(                       \
      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
      typename TTypes<T, 5>::Tensor out);                             \
  template <>                                                         \
  void NCHWToNHWC<GPUDevice, T, 5>::operator()(                       \
      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
      typename TTypes<T, 5>::Tensor out);

DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC

}  // namespace functor

// Registration of the GPU implementations.
REGISTER_KERNEL_BUILDER(
    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    Conv3DOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    Conv3DOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    Conv3DOp<GPUDevice, double>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow