/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/pooling_ops_common.h"

#include <vector>

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif  // GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#if TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#endif
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

namespace {
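// RawType maps a quantized element type to its underlying storage type
// (qint8 -> int8) so that device buffers can be described without the
// quantized wrapper; all other types map to themselves.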
template <typename T>
struct RawType {
  using type = T;
};

template <>
struct RawType<qint8> {
  using type = int8;
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
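// Pads the two spatial dimensions of `in` with -infinity so that the padded
// elements can never be selected by a max-pooling window. This is used below
// to emulate asymmetric explicit padding, which cuDNN/MIOpen do not support
// directly.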
template <typename T>
struct PadInputWithNegativeInf {
  Status operator()(const GPUDevice& d,
                    typename TTypes<T, 4, int>::ConstTensor in,
                    int input_pad_top, int input_pad_bottom, int input_pad_left,
                    int input_pad_right, typename TTypes<T, 4, int>::Tensor out,
                    TensorFormat format) {
    T padding_value = -std::numeric_limits<T>::infinity();
    functor::PadInput<GPUDevice, T, int, 4>()(
        d, in, {{input_pad_top, input_pad_left}},
        {{input_pad_bottom, input_pad_right}}, out, format, padding_value);
    return OkStatus();
  }
};

template <>
struct PadInputWithNegativeInf<qint8> {
  Status operator()(const GPUDevice& d,
                    typename TTypes<qint8, 4, int>::ConstTensor in,
                    int input_pad_top, int input_pad_bottom, int input_pad_left,
                    int input_pad_right,
                    typename TTypes<qint8, 4, int>::Tensor out,
                    TensorFormat format) {
    return errors::InvalidArgument(
        "Explicit padding not yet supported with qint8");
  }
};

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace
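// Returns an error unless each explicit padding amount is strictly smaller
// than the pooling window extent in the corresponding dimension.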
Status CheckPaddingSize(int64_t window_rows, int64_t window_cols,
                        int64_t pad_top, int64_t pad_bottom, int64_t pad_left,
                        int64_t pad_right) {
  if (!FastBoundsCheck(pad_top, window_rows)) {
    return errors::InvalidArgument("Top padding ", pad_top,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_rows);
  }
  if (!FastBoundsCheck(pad_bottom, window_rows)) {
    return errors::InvalidArgument("Bottom padding ", pad_bottom,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_rows);
  }
  if (!FastBoundsCheck(pad_left, window_cols)) {
    return errors::InvalidArgument("Left padding ", pad_left,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_cols);
  }
  if (!FastBoundsCheck(pad_right, window_cols)) {
    return errors::InvalidArgument("Right padding ", pad_right,
                                   " needs to be smaller than the "
                                   "window size ",
                                   window_cols);
  }
  return OkStatus();
}

PoolParameters::PoolParameters(OpKernelContext* context,
                               const std::vector<int32>& ksize,
                               const std::vector<int32>& stride,
                               Padding padding,
                               std::vector<int64_t> explicit_paddings,
                               TensorFormat data_format,
                               const TensorShape& tensor_in_shape) {
  // For maxpooling, tensor_in should have 2 spatial dimensions.
  // Note: the total number of dimensions could be 4 for NHWC, NCHW,
  // or 5 for NCHW_VECT_C.
  OP_REQUIRES(context,
              GetTensorSpatialDims(tensor_in_shape.dims(), data_format) == 2,
              errors::InvalidArgument(
                  "tensor_in_shape must have 2 spatial dimensions. ",
                  tensor_in_shape.dims(), " ", data_format));

  this->data_format = data_format;
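  // NCHW_VECT_C packs 4 channel values into the innermost dimension, so the
  // logical depth is 4x the stored channel dimension.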
  depth = GetTensorDim(tensor_in_shape, data_format, 'C') *
          (data_format == FORMAT_NCHW_VECT_C ? 4 : 1);
  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
  window_rows = GetTensorDim(ksize, data_format, 'H');
  window_cols = GetTensorDim(ksize, data_format, 'W');
  depth_window = GetTensorDim(ksize, data_format, 'C');
  row_stride = GetTensorDim(stride, data_format, 'H');
  col_stride = GetTensorDim(stride, data_format, 'W');
  depth_stride = GetTensorDim(stride, data_format, 'C');

  // We only support 2D pooling across width/height and depthwise
  // pooling, not a combination.
  OP_REQUIRES(context,
              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
              errors::Unimplemented(
                  "MaxPooling supports exactly one of pooling across depth "
                  "or pooling across width/height."));
  if (padding == Padding::EXPLICIT) {
    OP_REQUIRES_OK(context, CheckValidPadding(padding, explicit_paddings,
                                              /*num_dims=*/4, data_format));
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &pad_top,
                             &pad_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &pad_left,
                             &pad_right);
    OP_REQUIRES_OK(context, CheckPaddingSize(window_rows, window_cols, pad_top,
                                             pad_bottom, pad_left, pad_right));
  }

  if (depth_window == 1) {
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                tensor_in_rows, window_rows, row_stride,
                                padding, &out_height, &pad_top, &pad_bottom));
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                tensor_in_cols, window_cols, col_stride,
                                padding, &out_width, &pad_left, &pad_right));
    pad_depth = 0;
    out_depth = depth;
  } else {
    OP_REQUIRES(context, depth_window > 0,
                errors::InvalidArgument("depth_window must be positive"));
    // Our current version of depthwise max pooling does not support
    // any padding, and expects the depth_window to equal the
    // depth_stride (no overlapping).
    OP_REQUIRES(
        context, depth % depth_window == 0,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to evenly divide the input depth"));
    OP_REQUIRES(
        context, depth_stride == depth_window,
        errors::Unimplemented("Depthwise max pooling requires the depth "
                              "window to equal the depth stride"));

    // The current version of depthwise max is only implemented on CPU.
    OP_REQUIRES(context,
                (DeviceType(static_cast<Device*>(context->device())
                                ->attributes()
                                .device_type()) == DeviceType(DEVICE_CPU)),
                errors::Unimplemented("Depthwise max pooling is currently "
                                      "only implemented for CPU devices."));

    pad_depth = 0;
    out_depth = depth / depth_window;
  }
}

TensorShape PoolParameters::forward_output_shape() {
  if (depth_window == 1) {
    // Spatial pooling
    return ShapeFromFormat(data_format, tensor_in_batch, out_height, out_width,
                           depth);
  } else {
    // Depthwise pooling
    return TensorShape(
        {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
  }
}

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename T>
void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                              se::dnn::PoolingMode pooling_mode,
                              const std::vector<int32>& size,
                              const std::vector<int32>& stride, Padding padding,
                              std::vector<int64_t> explicit_paddings,
                              TensorFormat data_format, const Tensor& tensor_in,
                              const TensorShape& tensor_out_shape,
                              bool propagate_nans) {
  Tensor* tensor_out = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_out_shape, &tensor_out));
  if (tensor_in.shape().num_elements() == 0) {
    return;
  }

  PoolParameters params{
      context, size, stride, padding,
      explicit_paddings, data_format, tensor_in.shape()};
  if (!context->status().ok()) {
    return;
  }

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
  int tensor_in_cols = params.tensor_in_cols;
  int tensor_in_rows = params.tensor_in_rows;

#if CUDNN_VERSION < 7300
  /// cuDNN versions before 7.3 do not support the NHWC format, so convert the
  /// input to NCHW before calling cuDNN. Remove this path once those versions
  /// no longer need to be supported.
  Tensor transformed_input;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(),
                                                data_format),
                                &transformed_input));
    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                           tensor_in.tensor<T, 4>(),
                                           transformed_input.tensor<T, 4>());
  } else {
    transformed_input = tensor_in;
  }
  Tensor transformed_output;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DataTypeToEnum<T>::value,
                                ShapeFromFormat(FORMAT_NCHW, tensor_out_shape,
                                                data_format),
                                &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
  Tensor transformed_input = tensor_in;
  auto& transformed_output = *tensor_out;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    case FORMAT_NCHW_VECT_C:
      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
      // emulated via NHWC.
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
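      // Treat each packed group of 4 channels as its own NHWC image with
      // depth 4; the batch dimension absorbs the remaining C/4 factor.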
      batch_size *= depth / 4;
      depth = 4;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif
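  // For SAME/VALID padding (and symmetric EXPLICIT padding) cuDNN applies the
  // padding itself; the manual re-padding below is only needed when the
  // explicit padding is asymmetric.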
  int64_t vertical_padding = params.pad_top;
  int64_t horizontal_padding = params.pad_left;

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // cuDNN only supports padding the same amount on the left and right sides,
    // and on the top and bottom sides. So we manually create a new padded
    // input tensor such that we can pass it to cuDNN.
    const int64_t common_padding_rows =
        std::min(params.pad_top, params.pad_bottom);
    const int64_t common_padding_cols =
        std::min(params.pad_left, params.pad_right);

    Tensor padded_input;
    const int64_t padding_rows_diff =
        std::abs(params.pad_top - params.pad_bottom);
    const int64_t padding_cols_diff =
        std::abs(params.pad_left - params.pad_right);

    const int64_t new_in_rows = tensor_in_rows + padding_rows_diff;
    const int64_t new_in_cols = tensor_in_cols + padding_cols_diff;

    OP_REQUIRES_OK(
        context,
        context->allocate_temp(DataTypeToEnum<T>::value,
                               ShapeFromFormat(data_format, batch_size,
                                               new_in_rows, new_in_cols, depth),
                               &padded_input));
    const int64_t input_pad_top = params.pad_top - common_padding_rows;
    const int64_t input_pad_bottom = params.pad_bottom - common_padding_rows;
    const int64_t input_pad_left = params.pad_left - common_padding_cols;
    const int64_t input_pad_right = params.pad_right - common_padding_cols;

    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      context->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }

    // We need to call the const version of transformed_input.tensor()
    const Tensor& const_transformed_input = transformed_input;
    OP_REQUIRES_OK(
        context,
        PadInputWithNegativeInf<T>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
            To32Bit(padded_input.tensor<T, 4>()), data_format));
    transformed_input = padded_input;
    vertical_padding = common_padding_rows;
    horizontal_padding = common_padding_cols;
    tensor_in_rows = new_in_rows;
    tensor_in_cols = new_in_cols;
  }

  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(vertical_padding)
      .set_horizontal_padding(horizontal_padding)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(batch_size)
      .set_height(tensor_in_rows)
      .set_width(tensor_in_cols)
      .set_feature_map_count(depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(batch_size)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(depth)
      .set_layout(data_layout);
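  // Wrap the input/output buffers as StreamExecutor device memory. RawType
  // strips the quantized wrapper for qint8 so the buffers are described in
  // terms of their raw storage type.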
  auto input_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_input.template flat<T>().data()),
                     transformed_input.template flat<T>().size());

  auto output_data =
      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
                         transformed_output.template flat<T>().data()),
                     transformed_output.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  OP_REQUIRES_OK(context, stream->ThenPoolForward(
                              pooling_desc, input_desc, input_data, output_desc,
                              &output_data, &scratch_allocator));
#else
  OP_REQUIRES_OK(context,
                 stream->ThenPoolForward(pooling_desc, input_desc, input_data,
                                         output_desc, &output_data));
#endif

#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    using RT = typename RawType<T>::type;
    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_output).template tensor<RT, 4>(),
        tensor_out->tensor<RT, 4>());
  }
#endif
}

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                              \
  template <>                                                            \
  void PadInput<GPUDevice, T, int, 4>::operator()(                       \
      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,    \
      const std::array<int, 2>& padding_left,                            \
      const std::array<int, 2>& padding_right,                           \
      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,  \
      const T& padding_value);                                           \
  extern template struct PadInput<GPUDevice, T, int, 4>;

DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(double);
DECLARE_GPU_SPEC(int32);
}  // namespace functor

template <typename T>
void DnnPoolingGradOp<T>::Compute(
    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
    const std::vector<int32>& size, const std::vector<int32>& stride,
    Padding padding, std::vector<int64_t> explicit_paddings,
    TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape,
    bool propagate_nans) {
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";

  Tensor* input_backprop = nullptr;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_in_shape, &input_backprop));
  if (tensor_in_shape.num_elements() == 0) {
    return;
  }

  PoolParameters params{context, size, stride, padding,
                        explicit_paddings, data_format, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }
  if (tensor_out) {
    OP_REQUIRES(context, tensor_out->shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected orig_output shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", tensor_out->shape()));
  }
  OP_REQUIRES(context, out_backprop.shape() == params.forward_output_shape(),
              errors::InvalidArgument("Expected grad shape to be ",
                                      params.forward_output_shape(),
                                      ", but got ", out_backprop.shape()));

  TensorFormat transformed_input_data_format = data_format;

#if CUDNN_VERSION < 7300
  /// cuDNN versions before 7.3 do not support the NHWC format, so convert the
  /// tensors to NCHW before calling cuDNN. Remove this path once those
  /// versions no longer need to be supported.
  Tensor transformed_input;
  TensorShape transformed_input_shape;
  if (data_format == FORMAT_NHWC || !tensor_in) {
    transformed_input_shape =
        ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_input_shape,
                                                   &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  TensorShape transformed_output_shape;
  if (data_format == FORMAT_NHWC || !tensor_out) {
    transformed_output_shape =
        ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format);
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   transformed_output_shape,
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_input_shape,
                                          &transformed_input_backprop));
  } else {
    transformed_input_backprop = *input_backprop;
  }
  Tensor transformed_output_backprop;
  if (data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          transformed_output_shape,
                                          &transformed_output_backprop));
  } else {
    transformed_output_backprop = out_backprop;
  }

  if (data_format == FORMAT_NHWC) {
    /// Convert the data from NHWC to NCHW if necessary.
    if (tensor_in) {
      // For AvgPoolGrad, the original input tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_in->tensor<T, 4>(),
                                             transformed_input.tensor<T, 4>());
      transformed_input_data_format = FORMAT_NCHW;
    }
    if (tensor_out) {
      // For AvgPoolGrad, the original output tensor is not necessary. However,
      // cudnn still requires it to run, although it does not affect the
      // results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                             tensor_out->tensor<T, 4>(),
                                             transformed_output.tensor<T, 4>());
    }
    functor::NHWCToNCHW<GPUDevice, T, 4>()(
        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
        transformed_output_backprop.tensor<T, 4>());
  }
  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
#else
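  // cuDNN 7.3+ supports NHWC directly, so no layout transformation is needed.
  // For AvgPoolGrad the original input/output tensors are not provided;
  // allocate placeholders of the right shape since the DNN API still expects
  // them.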
  Tensor transformed_input;
  if (!tensor_in) {
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::value,
                                          tensor_in_shape, &transformed_input));
  } else {
    transformed_input = *tensor_in;
  }
  Tensor transformed_output;
  if (!tensor_out) {
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value,
                                                   out_backprop.shape(),
                                                   &transformed_output));
  } else {
    transformed_output = *tensor_out;
  }
  Tensor transformed_input_backprop = *input_backprop;
  Tensor transformed_output_backprop = out_backprop;
  se::dnn::DataLayout data_layout;
  switch (data_format) {
    case FORMAT_NHWC:
      data_layout = se::dnn::DataLayout::kBatchYXDepth;
      break;
    case FORMAT_NCHW:
      data_layout = se::dnn::DataLayout::kBatchDepthYX;
      break;
    default:
      OP_REQUIRES(context, false,
                  errors::InvalidArgument("Unsupported format: ",
                                          ToString(data_format)));
  }
#endif  // CUDNN_VERSION < 7300

  int64_t vertical_padding = params.pad_top;
  int64_t horizontal_padding = params.pad_left;

  int batch_size = params.tensor_in_batch;
  int depth = params.depth;
  int tensor_in_cols = params.tensor_in_cols;
  int tensor_in_rows = params.tensor_in_rows;

  int64_t input_pad_top = 0;
  int64_t input_pad_bottom = 0;
  int64_t input_pad_left = 0;
  int64_t input_pad_right = 0;
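  // When explicit padding is asymmetric, the gradient is computed with
  // respect to the manually padded input; the extra border is cropped off
  // again after the DNN call.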
  Tensor transformed_and_padded_input_backprop;

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // Pad the input in the same way as in the forward pass, so that cuDNN or
    // MIOpen receives the same input during the backward pass as it did
    // during the forward pass.
    const int64_t common_padding_rows =
        std::min(params.pad_top, params.pad_bottom);
    const int64_t common_padding_cols =
        std::min(params.pad_left, params.pad_right);

    Tensor padded_input;
    const int64_t padding_rows_diff =
        std::abs(params.pad_top - params.pad_bottom);
    const int64_t padding_cols_diff =
        std::abs(params.pad_left - params.pad_right);

    const int64_t new_in_rows = tensor_in_rows + padding_rows_diff;
    const int64_t new_in_cols = tensor_in_cols + padding_cols_diff;
    VLOG(2) << "Creating padded input tensor: "
            << " original rows=" << tensor_in_rows
            << " original cols=" << tensor_in_cols
            << " padded rows=" << new_in_rows
            << " padded cols=" << new_in_cols << " depth=" << depth
            << " batch_size=" << batch_size << " kernel_rows="
            << params.window_rows << " kernel_cols=" << params.window_cols
            << " stride_rows=" << params.row_stride;

    OP_REQUIRES_OK(
        context, context->allocate_temp(
                     DataTypeToEnum<T>::value,
                     ShapeFromFormat(transformed_input_data_format, batch_size,
                                     new_in_rows, new_in_cols, depth),
                     &padded_input));

    OP_REQUIRES_OK(
        context, context->allocate_temp(
                     DataTypeToEnum<T>::value,
                     ShapeFromFormat(transformed_input_data_format, batch_size,
                                     new_in_rows, new_in_cols, depth),
                     &transformed_and_padded_input_backprop));

    input_pad_top = params.pad_top - common_padding_rows;
    input_pad_bottom = params.pad_bottom - common_padding_rows;
    input_pad_left = params.pad_left - common_padding_cols;
    input_pad_right = params.pad_right - common_padding_cols;

    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      context->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }
    // The PadInputWithNegativeInf functor requires a const input tensor.
    const Tensor& const_transformed_input = transformed_input;
    OP_REQUIRES_OK(
        context,
        PadInputWithNegativeInf<T>()(
            context->eigen_device<GPUDevice>(),
            To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
            To32Bit(padded_input.tensor<T, 4>()),
            transformed_input_data_format));

    transformed_input = padded_input;

    vertical_padding = common_padding_rows;
    horizontal_padding = common_padding_cols;
    VLOG(2) << "vertical padding set to: " << vertical_padding
            << " horizontal padding set to: " << horizontal_padding;
    tensor_in_rows = new_in_rows;
    tensor_in_cols = new_in_cols;
  } else {
    transformed_and_padded_input_backprop = transformed_input_backprop;
  }

  /// Get ready to call cudnn
  se::dnn::PoolingDescriptor pooling_desc;
  pooling_desc.set_pooling_mode(pooling_mode)
      .set_window_height(params.window_rows)
      .set_window_width(params.window_cols)
      .set_vertical_stride(params.row_stride)
      .set_horizontal_stride(params.col_stride)
      .set_vertical_padding(vertical_padding)
      .set_horizontal_padding(horizontal_padding)
      .set_propagate_nans(propagate_nans);

  se::dnn::BatchDescriptor orig_output_desc;
  orig_output_desc.set_count(params.tensor_in_batch)
      .set_height(params.out_height)
      .set_width(params.out_width)
      .set_feature_map_count(params.depth)
      .set_layout(data_layout);

  se::dnn::BatchDescriptor orig_input_desc;
  orig_input_desc.set_count(params.tensor_in_batch)
      .set_height(tensor_in_rows)
      .set_width(tensor_in_cols)
      .set_feature_map_count(params.depth)
      .set_layout(data_layout);

  auto orig_output_data =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());
  auto orig_input_data =
      AsDeviceMemory(transformed_input.template flat<T>().data(),
                     transformed_input.template flat<T>().size());
  auto output_backprop_data =
      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                     transformed_output_backprop.template flat<T>().size());
  auto input_backprop_data = AsDeviceMemory(
      transformed_and_padded_input_backprop.template flat<T>().data(),
      transformed_and_padded_input_backprop.template flat<T>().size());

  auto* stream = context->op_device_context()->stream();
  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

#if TENSORFLOW_USE_ROCM
  static int64 PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  DnnScratchAllocator scratch_allocator(PoolingScratchSize, context);
  OP_REQUIRES_OK(context,
                 stream->ThenPoolBackward(
                     pooling_desc, orig_input_desc, orig_input_data,
                     orig_output_desc, orig_output_data, output_backprop_data,
                     &input_backprop_data, &scratch_allocator));
#else
  OP_REQUIRES_OK(context, stream->ThenPoolBackward(
                              pooling_desc, orig_input_desc, orig_input_data,
                              orig_output_desc, orig_output_data,
                              output_backprop_data, &input_backprop_data));
#endif

  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                              params.pad_left != params.pad_right)) {
    // Remove the padding that was added to the input shape above.
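    // Calling PadInput with negative padding amounts crops the tensor, which
    // extracts the gradient for the original (unpadded) input region.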
    functor::PadInput<GPUDevice, T, int, 4>()(
        context->eigen_device<GPUDevice>(),
        To32Bit(const_cast<const Tensor&>(transformed_and_padded_input_backprop)
                    .tensor<T, 4>()),
        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
        {{static_cast<int>(-input_pad_bottom),
          static_cast<int>(-input_pad_right)}},
        To32Bit(transformed_input_backprop.template tensor<T, 4>()),
        transformed_input_data_format, T{});
  }

#if CUDNN_VERSION < 7300
  if (data_format == FORMAT_NHWC) {
    /// Transform the output data from NCHW back to NHWC.
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        context->eigen_device<Device>(),
        toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
        input_backprop->tensor<T, 4>());
  }
#endif  // CUDNN_VERSION < 7300
}
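// Explicit instantiations of the forward and backward DNN pooling ops for the
// element types supported on GPU.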
#define DEFINE_DNN_OPS(T)             \
  template class DnnPoolingOp<T>;     \
  template class DnnPoolingGradOp<T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)

#if CUDNN_VERSION >= 7300
template class DnnPoolingOp<qint8>;
#endif

#undef DEFINE_DNN_OPS

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow