/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#include "tensorflow/core/kernels/conv_grad_input_ops.h"

#include <utility>

#include "tensorflow/core/profiler/lib/scoped_annotation.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// To be used inside depthwise_conv_grad_op.cc.
template struct LaunchConv2DBackpropInputOp<CPUDevice, bfloat16>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;

// GPU definitions.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// The slow version (but compiles for GPU)

// A dummy type to group forward backward data autotune results together.
struct ConvBackwardDataAutotuneGroup {
  static string name() { return "ConvBwdData"; }
};

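// Caches the autotuned backward-data algorithm per ConvParameters key, so that
// identical convolutions on the same device reuse a previously selected
// algorithm instead of re-running autotuning.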
typedef AutotuneSingleton<ConvBackwardDataAutotuneGroup, ConvParameters,
                          AutotuneEntry<se::dnn::ConvOp>>
    AutotuneConvBwdData;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Computes backprop input using Eigen::SpatialConvolutionBackwardInput on GPU
// for int32 inputs.
template <>
struct LaunchConv2DBackpropInputOp<GPUDevice, int32> {
  void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                  const Tensor& out_backprop, const Tensor& filter,
                  int row_dilation, int col_dilation, int row_stride,
                  int col_stride, const Padding& padding,
                  const std::vector<int64_t>& explicit_paddings,
                  Tensor* in_backprop, TensorFormat data_format) {
    LaunchConv2DBackpropInputOpImpl<GPUDevice, int32> launcher;
    launcher(ctx, use_cudnn, cudnn_use_autotune, out_backprop, filter,
             row_dilation, col_dilation, row_stride, col_stride, padding,
             explicit_paddings, in_backprop, data_format);
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
    OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
    const Tensor& out_backprop, const Tensor& filter, int row_dilation,
    int col_dilation, int row_stride, int col_stride, const Padding& padding,
    const std::vector<int64_t>& explicit_paddings, Tensor* in_backprop,
    TensorFormat data_format) {
  using se::dnn::AlgorithmConfig;
  using se::dnn::AlgorithmDesc;
  using se::dnn::ProfileResult;

  std::vector<int32> strides(4, 1);
  std::vector<int32> dilations(4, 1);
  auto input_h = GetTensorDimIndex(data_format, 'H');
  auto input_w = GetTensorDimIndex(data_format, 'W');
  strides[input_h] = row_stride;
  strides[input_w] = col_stride;
  dilations[input_h] = row_dilation;
  dilations[input_w] = col_dilation;
  TensorShape input_shape = in_backprop->shape();

  const TensorShape& filter_shape = filter.shape();
  ConvBackpropDimensions dims;
  OP_REQUIRES_OK(
      ctx, ConvBackpropComputeDimensionsV2(
               "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape,
               filter_shape, out_backprop.shape(), dilations, strides, padding,
               explicit_paddings, data_format, &dims));

  int64_t padding_top = -1, padding_bottom = -1;
  int64_t padding_left = -1, padding_right = -1;
  if (padding == EXPLICIT) {
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
                             &padding_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
                             &padding_right);
  }
  int64_t expected_out_rows, expected_out_cols;
  // The function is guaranteed to succeed because we checked the output and
  // padding was valid earlier.
  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
      &padding_bottom));
  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
      &padding_right));
  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);

  auto* stream = ctx->op_device_context()->stream();
  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

  if (!use_cudnn) {
    ctx->SetStatus(errors::Unimplemented(
        "Conv2DBackpropInput for GPU is not currently supported "
        "without cudnn"));
    return;
  }

  // If the filter in-depth (filter_shape.dim_size(2)) is 1 and smaller than
  // the input depth, it's a depthwise convolution. More generally, if the
  // filter in-depth divides but is smaller than the input depth, it is a
  // grouped convolution.
  bool is_grouped_convolution = filter_shape.dim_size(2) != dims.in_depth;
  if (dims.spatial_dims[0].filter_size == 1 &&
      dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
      dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
    // 1x1 filter, so call cublas directly.
    const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
                     dims.spatial_dims[1].input_size;
    const uint64 k = dims.out_depth;
    const uint64 n = dims.in_depth;
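    // With a 1x1, stride-1 filter the backprop is a plain matrix product:
    //   in_backprop[m, n] = out_backprop[m, k] * filter[n, k]^T,
    // where the HWIO filter degenerates to an (in_depth, out_depth) matrix.
    // The GEMM below expresses this in the column-major layout BLAS expects.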

    auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                out_backprop.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                in_backprop->template flat<T>().size());

    auto transpose = se::blas::Transpose::kTranspose;
    auto no_transpose = se::blas::Transpose::kNoTranspose;

    OP_REQUIRES_OK(ctx, stream->ThenBlasGemm(
                            transpose, no_transpose, n, m, k, b_ptr, k, a_ptr,
                            k, &c_ptr, n, se::blas::kDefaultComputePrecision));
    return;
  } else if (dims.spatial_dims[0].filter_size ==
                 dims.spatial_dims[0].input_size &&
             dims.spatial_dims[1].filter_size ==
                 dims.spatial_dims[1].input_size &&
             !is_grouped_convolution && padding == VALID &&
             data_format == FORMAT_NHWC) {
    // The input data and filter have the same height/width, and we are not
    // using grouped convolution, so call cublas directly.
    const uint64 m = dims.batch_size;
    const uint64 k = dims.out_depth;
    const uint64 n = dims.spatial_dims[0].input_size *
                     dims.spatial_dims[1].input_size * dims.in_depth;
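    // Here the convolution produces a single output position per image, so
    // out_backprop is effectively (batch, out_depth) and the whole HWIO filter
    // flattens to an (H * W * in_depth, out_depth) matrix; the same
    // in_backprop = out_backprop * filter^T GEMM formulation applies.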

    auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                out_backprop.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                in_backprop->template flat<T>().size());

    auto transpose = se::blas::Transpose::kTranspose;
    auto no_transpose = se::blas::Transpose::kNoTranspose;

    OP_REQUIRES_OK(ctx, stream->ThenBlasGemm(
                            transpose, no_transpose, n, m, k, b_ptr, k, a_ptr,
                            k, &c_ptr, n, se::blas::kDefaultComputePrecision));
    return;
  }

  const int64_t common_padding_rows = std::min(padding_top, padding_bottom);
  const int64_t common_padding_cols = std::min(padding_left, padding_right);
  TensorShape compatible_input_shape;
  if (padding_top != padding_bottom || padding_left != padding_right) {
    // Pad the input in the same way we did during the forward pass, so that
    // cuDNN or MIOpen receives the same input during the backward pass
    // function as it did during the forward pass function.
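    // For example, SAME padding of a 4x4 input with a 2x2 filter and stride 1
    // produces (padding_top, padding_bottom) = (0, 1). cuDNN takes a single
    // symmetric padding value per spatial dimension, so we use the common
    // (smaller) padding here and slice off the extra row/column afterwards.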
    const int64_t padding_rows_diff = std::abs(padding_bottom - padding_top);
    const int64_t padding_cols_diff = std::abs(padding_right - padding_left);
    const int64_t new_in_rows =
        dims.spatial_dims[0].input_size + padding_rows_diff;
    const int64_t new_in_cols =
        dims.spatial_dims[1].input_size + padding_cols_diff;
    compatible_input_shape = ShapeFromFormat(
        data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth);
  } else {
    compatible_input_shape = input_shape;
  }

  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
      << "Negative row or col paddings: (" << common_padding_rows << ", "
      << common_padding_cols << ")";

  // The Tensor Core in NVIDIA Volta+ GPUs supports efficient convolution with
  // fp16 in NHWC data layout. In all other configurations it's more efficient
  // to run computation in NCHW data format.
  const bool compute_in_nhwc = DataTypeToEnum<T>::value == DT_HALF &&
                               stream->GetCudaComputeCapability().IsAtLeast(
                                   se::CudaComputeCapability::VOLTA);

  // We only do one directional conversion: NHWC->NCHW. We never convert in
  // the other direction. Grappler layout optimizer selects the preferred
  // layout and adds necessary annotations to the graph.
  const TensorFormat compute_data_format =
      (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
                                                      : FORMAT_NCHW;

  VLOG(3) << "Compute Conv2DBackpropInput with cuDNN:"
          << " data_format=" << ToString(data_format)
          << " compute_data_format=" << ToString(compute_data_format);

  constexpr auto kComputeInNHWC =
      std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
                      se::dnn::FilterLayout::kOutputYXInput);
  constexpr auto kComputeInNCHW =
      std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
                      se::dnn::FilterLayout::kOutputInputYX);

  se::dnn::DataLayout compute_data_layout;
  se::dnn::FilterLayout filter_layout;

  std::tie(compute_data_layout, filter_layout) =
      compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(dims.batch_size)
      .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
      .set_width(GetTensorDim(compatible_input_shape, data_format, 'W'))
      .set_feature_map_count(dims.in_depth)
      .set_layout(compute_data_layout);
  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(dims.batch_size)
      .set_height(dims.spatial_dims[0].output_size)
      .set_width(dims.spatial_dims[1].output_size)
      .set_feature_map_count(dims.out_depth)
      .set_layout(compute_data_layout);
  se::dnn::FilterDescriptor filter_desc;
  filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
      .set_input_filter_width(dims.spatial_dims[1].filter_size)
      .set_input_feature_map_count(filter_shape.dim_size(2))
      .set_output_feature_map_count(filter_shape.dim_size(3))
      .set_layout(filter_layout);
  se::dnn::ConvolutionDescriptor conv_desc;
  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
      .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
      .set_zero_padding_height(common_padding_rows)
      .set_zero_padding_width(common_padding_cols)
      .set_group_count(dims.in_depth / filter_shape.dim_size(2));
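  // group_count is 1 for a dense convolution; for grouped (including
  // depthwise) convolutions the filter's input-feature dimension is a proper
  // divisor of in_depth.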

  // Tensorflow filter format: HWIO
  // cuDNN filter formats: (data format) -> (filter format)
  //   (1) NCHW -> OIHW
  //   (2) NHWC -> OHWI

  Tensor transformed_filter;
  const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
    VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
            << " to " << ToString(dst_format);

    TensorShape dst_shape =
        dst_format == FORMAT_OIHW
            ? TensorShape({filter.dim_size(3), filter.dim_size(2),
                           filter.dim_size(0), filter.dim_size(1)})
            : TensorShape({filter.dim_size(3), filter.dim_size(0),
                           filter.dim_size(1), filter.dim_size(2)});

    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
                                          &transformed_filter));
    functor::TransformFilter<GPUDevice, T, int, 4>()(
        ctx->eigen_device<GPUDevice>(), dst_format,
        To32Bit(filter.tensor<T, 4>()),
        To32Bit(transformed_filter.tensor<T, 4>()));

    return OkStatus();
  };

  if (compute_data_format == FORMAT_NCHW) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
  } else if (compute_data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
  } else {
    ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
                                           ToString(compute_data_format)));
    return;
  }

  Tensor transformed_out_backprop;
  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the `out_backprop` tensor from NHWC to NCHW.";
    TensorShape compute_shape = ShapeFromFormat(
        compute_data_format, dims.batch_size, dims.spatial_dims[0].output_size,
        dims.spatial_dims[1].output_size, dims.out_depth);
    if (dims.out_depth > 1) {
      OP_REQUIRES_OK(ctx,
                     ctx->allocate_temp(DataTypeToEnum<T>::value, compute_shape,
                                        &transformed_out_backprop));
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          ctx->eigen_device<GPUDevice>(), out_backprop.tensor<T, 4>(),
          transformed_out_backprop.tensor<T, 4>());
    } else {
      // If depth <= 1, then just reshape.
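      // (With out_depth <= 1 the NHWC and NCHW layouts describe the same
      // memory order, so no transposition kernel is needed.)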
      CHECK(transformed_out_backprop.CopyFrom(out_backprop, compute_shape));
    }
  } else {
    transformed_out_backprop = out_backprop;
  }

  Tensor pre_transformed_in_backprop;
  OP_REQUIRES_OK(
      ctx, ctx->allocate_temp(
               DataTypeToEnum<T>::value,
               ShapeFromFormat(
                   compute_data_format,
                   GetTensorDim(compatible_input_shape, data_format, 'N'),
                   GetTensorDim(compatible_input_shape, data_format, 'H'),
                   GetTensorDim(compatible_input_shape, data_format, 'W'),
                   GetTensorDim(compatible_input_shape, data_format, 'C')),
               &pre_transformed_in_backprop));

  auto out_backprop_ptr =
      AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
                     transformed_out_backprop.template flat<T>().size());
  auto filter_ptr =
      AsDeviceMemory(transformed_filter.template flat<T>().data(),
                     transformed_filter.template flat<T>().size());
  auto in_backprop_ptr =
      AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                     pre_transformed_in_backprop.template flat<T>().size());

  static int64_t ConvolveBackwardDataScratchSize =
      GetDnnWorkspaceLimitOrDefault();

  int device_id = stream->parent()->device_ordinal();
  DataType dtype = out_backprop.dtype();
  ConvParameters conv_parameters = {
      dims.batch_size,                     // batch
      dims.in_depth,                       // in_depths
      {{input_desc.height(),               // in_rows
        input_desc.width()}},              // in_cols
      compute_data_format,                 // compute_data_format
      dims.out_depth,                      // out_depths
      {{dims.spatial_dims[0].filter_size,  // filter_rows
        dims.spatial_dims[1].filter_size,  // filter_cols
        filter_shape.dim_size(2)}},        // filter_depths
      {{dims.spatial_dims[0].dilation,     // dilation_rows
        dims.spatial_dims[1].dilation}},   // dilation_cols
      {{dims.spatial_dims[0].stride,       // stride_rows
        dims.spatial_dims[1].stride}},     // stride_cols
      {{common_padding_rows,               // padding_rows
        common_padding_cols}},             // padding_cols
      dtype,                               // tensor data type
      device_id,                           // device_id
      conv_desc.group_count()              // group_count
  };
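  // conv_parameters is the cache key for AutotuneConvBwdData: convolutions
  // with the same shapes, strides, dtype, device and group count reuse the
  // algorithm selected below instead of re-running autotuning.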

  auto entry_or = AutotuneUnfusedConv(
      cudnn_use_autotune, AutotuneConvBwdData::GetInstance(), conv_parameters,
      ctx, se::dnn::ConvolutionKind::BACKWARD_DATA, input_desc, in_backprop_ptr,
      filter_desc, filter_ptr, conv_desc, output_desc, out_backprop_ptr,
      ConvolveBackwardDataScratchSize);
  OP_REQUIRES_OK(ctx, entry_or.status());
  auto autotune_entry = std::move(entry_or).value();

  DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
  Status cudnn_launch_status =
      LaunchAutotunedConv(autotune_entry, &scratch_allocator,
                          se::dnn::ConvolutionKind::BACKWARD_DATA, stream,
                          input_desc, in_backprop_ptr, filter_desc, filter_ptr,
                          conv_desc, output_desc, out_backprop_ptr);
  if (!cudnn_launch_status.ok()) {
    ctx->SetStatus(cudnn_launch_status);
    return;
  }

  if (padding_top != padding_bottom || padding_left != padding_right) {
    Tensor in_backprop_remove_padding;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(
                 DataTypeToEnum<T>::value,
                 ShapeFromFormat(compute_data_format,
                                 GetTensorDim(input_shape, data_format, 'N'),
                                 GetTensorDim(input_shape, data_format, 'H'),
                                 GetTensorDim(input_shape, data_format, 'W'),
                                 GetTensorDim(input_shape, data_format, 'C')),
                 &in_backprop_remove_padding));

    // Remove the padding that was added to the input shape above.
    const int64_t input_pad_top = padding_top - common_padding_rows;
    const int64_t input_pad_bottom = padding_bottom - common_padding_rows;
    const int64_t input_pad_left = padding_left - common_padding_cols;
    const int64_t input_pad_right = padding_right - common_padding_cols;
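    // The negative padding amounts passed to PadInput below make it crop
    // rather than pad, trimming the extra rows/columns that were introduced to
    // keep the padding symmetric.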
    functor::PadInput<GPUDevice, T, int, 4>()(
        ctx->template eigen_device<GPUDevice>(),
        To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
                    .tensor<T, 4>()),
        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
        {{static_cast<int>(-input_pad_bottom),
          static_cast<int>(-input_pad_right)}},
        To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
        T{});

    pre_transformed_in_backprop = in_backprop_remove_padding;
  }

  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        ctx->eigen_device<GPUDevice>(),
        toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
        in_backprop->tensor<T, 4>());
  } else {
    *in_backprop = pre_transformed_in_backprop;
  }
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                             \
  template <>                                                           \
  void TransformFilter<GPUDevice, T, int, 4>::operator()(               \
      const GPUDevice& d, FilterTensorFormat dst_filter_format,         \
      typename TTypes<T, 4, int>::ConstTensor in,                       \
      typename TTypes<T, 4, int>::Tensor out);                          \
  extern template struct TransformFilter<GPUDevice, T, int, 4>;         \
  template <>                                                           \
  void PadInput<GPUDevice, T, int, 4>::operator()(                      \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,   \
      const std::array<int, 2>& padding_left,                           \
      const std::array<int, 2>& padding_right,                          \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
      const T& padding_value);                                          \
  extern template struct PadInput<GPUDevice, T, int, 4>;

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
#undef DECLARE_GPU_SPEC

template <>
void SpatialConvolutionBackwardInputFunc<GPUDevice, int32>::operator()(
    const GPUDevice&, typename TTypes<int32, 4>::Tensor,
    typename TTypes<int32, 4>::ConstTensor,
    typename TTypes<int32, 4>::ConstTensor, Eigen::DenseIndex,
    Eigen::DenseIndex, Eigen::DenseIndex, Eigen::DenseIndex);
extern template struct SpatialConvolutionBackwardInputFunc<GPUDevice, int32>;

template <>
void SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
    GPUDevice, int32>::operator()(const GPUDevice&,
                                  typename TTypes<int32, 4>::Tensor,
                                  typename TTypes<int32, 4>::ConstTensor,
                                  typename TTypes<int32, 4>::ConstTensor,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex,
                                  Eigen::DenseIndex, Eigen::DenseIndex);
extern template struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
    GPUDevice, int32>;

}  // namespace functor

REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<double>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, double>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<Eigen::half>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("input_sizes"),
                        Conv2DBackpropInputOp<GPUDevice, int32>);

// To be used inside depthwise_conv_grad_op.cc.
// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow