1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/core/kernels/pooling_ops_common.h" |
17 | |
18 | #include <vector> |
19 | |
20 | #include "tensorflow/core/common_runtime/device.h" |
21 | #include "tensorflow/core/framework/bounds_check.h" |
22 | #include "tensorflow/core/framework/kernel_shape_util.h" |
23 | #include "tensorflow/core/framework/register_types.h" |
24 | #include "tensorflow/core/framework/tensor.h" |
25 | |
26 | #if GOOGLE_CUDA |
27 | #include "third_party/gpus/cudnn/cudnn.h" |
28 | #endif // GOOGLE_CUDA |
29 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
30 | #include "tensorflow/core/kernels/conv_2d.h" |
31 | #include "tensorflow/core/kernels/gpu_utils.h" |
32 | #if TENSORFLOW_USE_ROCM |
33 | #include "tensorflow/core/kernels/conv_ops_gpu.h" |
34 | #endif |
35 | #include "tensorflow/core/kernels/pooling_ops_common_gpu.h" |
36 | #include "tensorflow/core/platform/stream_executor.h" |
37 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
38 | |
39 | namespace tensorflow { |
40 | |
41 | namespace { |
42 | |
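// Maps a TensorFlow element type to the raw scalar type used when handing
// device buffers to StreamExecutor; e.g. qint8 tensors are passed to cuDNN
// as plain int8 device memory.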
43 | template <typename T> |
44 | struct RawType { |
45 | using type = T; |
46 | }; |
47 | |
48 | template <> |
49 | struct RawType<qint8> { |
50 | using type = int8; |
51 | }; |
52 | |
53 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
54 | |
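// Pads the two spatial dimensions of `in` with -infinity so that max pooling
// over the padded region never selects a padded element. This is used below
// to emulate asymmetric explicit padding, which cuDNN does not support
// directly.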
55 | template <typename T> |
56 | struct PadInputWithNegativeInf { |
57 | Status operator()(const GPUDevice& d, |
58 | typename TTypes<T, 4, int>::ConstTensor in, |
59 | int input_pad_top, int input_pad_bottom, int input_pad_left, |
60 | int input_pad_right, typename TTypes<T, 4, int>::Tensor out, |
61 | TensorFormat format) { |
62 | T padding_value = -std::numeric_limits<T>::infinity(); |
63 | functor::PadInput<GPUDevice, T, int, 4>()( |
64 | d, in, {{input_pad_top, input_pad_left}}, |
65 | {{input_pad_bottom, input_pad_right}}, out, format, padding_value); |
66 | return OkStatus(); |
67 | } |
68 | }; |
69 | |
70 | template <> |
71 | struct PadInputWithNegativeInf<qint8> { |
72 | Status operator()(const GPUDevice& d, |
73 | typename TTypes<qint8, 4, int>::ConstTensor in, |
74 | int input_pad_top, int input_pad_bottom, int input_pad_left, |
75 | int input_pad_right, |
76 | typename TTypes<qint8, 4, int>::Tensor out, |
77 | TensorFormat format) { |
78 | return errors::InvalidArgument( |
79 | "Explicit padding not yet supported with qint8" ); |
80 | } |
81 | }; |
82 | |
83 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
84 | |
85 | } // namespace |
86 | |
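// Validates that each explicit padding value is strictly smaller than the
// corresponding pooling window dimension, since otherwise a pooling window
// could cover only padded elements. For example (illustrative values only),
// a 3x3 window admits top/bottom/left/right paddings of at most 2.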
87 | Status CheckPaddingSize(int64_t window_rows, int64_t window_cols, |
88 | int64_t pad_top, int64_t pad_bottom, int64_t pad_left, |
89 | int64_t pad_right) { |
90 | if (!FastBoundsCheck(pad_top, window_rows)) { |
91 | return errors::InvalidArgument("Top padding " , pad_top, |
92 | " needs to be smaller than the " |
93 | "window size " , |
94 | window_rows); |
95 | } |
96 | if (!FastBoundsCheck(pad_bottom, window_rows)) { |
97 | return errors::InvalidArgument("Bottom padding " , pad_bottom, |
98 | " needs to be smaller than the " |
99 | "window size " , |
100 | window_rows); |
101 | } |
102 | if (!FastBoundsCheck(pad_left, window_cols)) { |
103 | return errors::InvalidArgument("Left padding " , pad_left, |
104 | " needs to be smaller than the " |
105 | "window size " , |
106 | window_cols); |
107 | } |
108 | if (!FastBoundsCheck(pad_right, window_cols)) { |
109 | return errors::InvalidArgument("Right padding " , pad_right, |
110 | " needs to be smaller than the " |
111 | "window size " , |
112 | window_cols); |
113 | } |
114 | return OkStatus(); |
115 | } |
116 | |
117 | PoolParameters::PoolParameters(OpKernelContext* context, |
118 | const std::vector<int32>& ksize, |
119 | const std::vector<int32>& stride, |
120 | Padding padding, |
121 | std::vector<int64_t> explicit_paddings, |
122 | TensorFormat data_format, |
123 | const TensorShape& tensor_in_shape) { |
124 | // For maxpooling, tensor_in should have 2 spatial dimensions. |
125 | // Note: the total number of dimensions could be 4 for NHWC, NCHW, |
126 | // or 5 for NCHW_VECT_C. |
127 | OP_REQUIRES(context, |
128 | GetTensorSpatialDims(tensor_in_shape.dims(), data_format) == 2, |
129 | errors::InvalidArgument( |
130 | "tensor_in_shape must have 2 spatial dimensions. " , |
131 | tensor_in_shape.dims(), " " , data_format)); |
132 | |
133 | this->data_format = data_format; |
134 | depth = GetTensorDim(tensor_in_shape, data_format, 'C') * |
135 | (data_format == FORMAT_NCHW_VECT_C ? 4 : 1); |
136 | tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W'); |
137 | tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H'); |
138 | tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N'); |
139 | window_rows = GetTensorDim(ksize, data_format, 'H'); |
140 | window_cols = GetTensorDim(ksize, data_format, 'W'); |
141 | depth_window = GetTensorDim(ksize, data_format, 'C'); |
142 | row_stride = GetTensorDim(stride, data_format, 'H'); |
143 | col_stride = GetTensorDim(stride, data_format, 'W'); |
144 | depth_stride = GetTensorDim(stride, data_format, 'C'); |
145 | |
146 | // We only support 2D pooling across width/height and depthwise |
147 | // pooling, not a combination. |
148 | OP_REQUIRES(context, |
149 | (depth_window == 1 || (window_rows == 1 && window_cols == 1)), |
150 | errors::Unimplemented( |
151 | "MaxPooling supports exactly one of pooling across depth " |
152 | "or pooling across width/height." )); |
153 | if (padding == Padding::EXPLICIT) { |
154 | OP_REQUIRES_OK(context, CheckValidPadding(padding, explicit_paddings, |
155 | /*num_dims=*/4, data_format)); |
156 | GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &pad_top, |
157 | &pad_bottom); |
158 | GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &pad_left, |
159 | &pad_right); |
160 | OP_REQUIRES_OK(context, CheckPaddingSize(window_rows, window_cols, pad_top, |
161 | pad_bottom, pad_left, pad_right)); |
162 | } |
163 | |
164 | if (depth_window == 1) { |
165 | OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( |
166 | tensor_in_rows, window_rows, row_stride, |
167 | padding, &out_height, &pad_top, &pad_bottom)); |
168 | OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( |
169 | tensor_in_cols, window_cols, col_stride, |
170 | padding, &out_width, &pad_left, &pad_right)); |
171 | pad_depth = 0; |
172 | out_depth = depth; |
173 | } else { |
174 | OP_REQUIRES(context, depth_window > 0, |
175 | errors::InvalidArgument("depth_window must not be 0" )); |
176 | // Our current version of depthwise max pooling does not support |
177 | // any padding, and expects the depth_window to equal the |
178 | // depth_stride (no overlapping). |
179 | OP_REQUIRES( |
180 | context, depth % depth_window == 0, |
181 | errors::Unimplemented("Depthwise max pooling requires the depth " |
182 | "window to evenly divide the input depth" )); |
183 | OP_REQUIRES( |
184 | context, depth_stride == depth_window, |
185 | errors::Unimplemented("Depthwise max pooling requires the depth " |
186 | "window to equal the depth stride" )); |
187 | |
188 | // The current version of depthwise max is only implemented on CPU. |
189 | OP_REQUIRES(context, |
190 | (DeviceType(static_cast<Device*>(context->device()) |
191 | ->attributes() |
192 | .device_type()) == DeviceType(DEVICE_CPU)), |
193 | errors::Unimplemented("Depthwise max pooling is currently " |
194 | "only implemented for CPU devices." )); |
195 | |
196 | pad_depth = 0; |
197 | out_depth = depth / depth_window; |
198 | } |
199 | } |
200 | |
201 | TensorShape PoolParameters::forward_output_shape() { |
202 | if (depth_window == 1) { |
203 | // Spatial pooling |
204 | return ShapeFromFormat(data_format, tensor_in_batch, out_height, out_width, |
205 | depth); |
206 | } else { |
207 | // Depthwise pooling |
208 | return TensorShape( |
209 | {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth}); |
210 | } |
211 | } |
212 | |
213 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
214 | |
215 | template <typename T> |
216 | void DnnPoolingOp<T>::Compute(OpKernelContext* context, |
217 | se::dnn::PoolingMode pooling_mode, |
218 | const std::vector<int32>& size, |
219 | const std::vector<int32>& stride, Padding padding, |
220 | std::vector<int64_t> explicit_paddings, |
221 | TensorFormat data_format, const Tensor& tensor_in, |
222 | const TensorShape& tensor_out_shape, |
223 | bool propagate_nans) { |
224 | Tensor* tensor_out = nullptr; |
225 | OP_REQUIRES_OK(context, |
226 | context->allocate_output(0, tensor_out_shape, &tensor_out)); |
227 | if (tensor_in.shape().num_elements() == 0) { |
228 | return; |
229 | } |
230 | |
231 | PoolParameters params{ |
232 | context, size, stride, padding, |
233 | explicit_paddings, data_format, tensor_in.shape()}; |
234 | if (!context->status().ok()) { |
235 | return; |
236 | } |
237 | |
238 | int batch_size = params.tensor_in_batch; |
239 | int depth = params.depth; |
240 | int tensor_in_cols = params.tensor_in_cols; |
241 | int tensor_in_rows = params.tensor_in_rows; |
242 | |
243 | #if CUDNN_VERSION < 7300 |
  /// Earlier cuDNN versions do not support the NHWC format, so we convert the
  /// input to NCHW before calling cuDNN. Remove this workaround once those
  /// versions are no longer supported.
246 | Tensor transformed_input; |
247 | if (data_format == FORMAT_NHWC) { |
248 | OP_REQUIRES_OK(context, context->allocate_temp( |
249 | DataTypeToEnum<T>::value, |
250 | ShapeFromFormat(FORMAT_NCHW, tensor_in.shape(), |
251 | data_format), |
252 | &transformed_input)); |
253 | functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(), |
254 | tensor_in.tensor<T, 4>(), |
255 | transformed_input.tensor<T, 4>()); |
256 | } else { |
257 | transformed_input = tensor_in; |
258 | } |
259 | Tensor transformed_output; |
260 | if (data_format == FORMAT_NHWC) { |
261 | OP_REQUIRES_OK(context, context->allocate_temp( |
262 | DataTypeToEnum<T>::value, |
263 | ShapeFromFormat(FORMAT_NCHW, tensor_out_shape, |
264 | data_format), |
265 | &transformed_output)); |
266 | } else { |
267 | transformed_output = *tensor_out; |
268 | } |
269 | se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX; |
270 | #else |
271 | Tensor transformed_input = tensor_in; |
272 | auto& transformed_output = *tensor_out; |
273 | se::dnn::DataLayout data_layout; |
274 | switch (data_format) { |
275 | case FORMAT_NHWC: |
276 | data_layout = se::dnn::DataLayout::kBatchYXDepth; |
277 | break; |
278 | case FORMAT_NCHW: |
279 | data_layout = se::dnn::DataLayout::kBatchDepthYX; |
280 | break; |
281 | case FORMAT_NCHW_VECT_C: |
      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
      // emulated via NHWC by folding the vector-of-4 channels into the batch
      // dimension: e.g. a batch of 2 with depth 8 is pooled as a batch of 4
      // with depth 4.
284 | data_layout = se::dnn::DataLayout::kBatchYXDepth; |
285 | batch_size *= depth / 4; |
286 | depth = 4; |
287 | break; |
288 | default: |
289 | OP_REQUIRES(context, false, |
290 | errors::InvalidArgument("Unsupported format: " , |
291 | ToString(data_format))); |
292 | } |
293 | #endif |
294 | |
295 | int64_t vertical_padding = params.pad_top; |
296 | int64_t horizontal_padding = params.pad_left; |
297 | |
298 | if (padding == EXPLICIT && (params.pad_top != params.pad_bottom || |
299 | params.pad_left != params.pad_right)) { |
    // cuDNN only supports padding the same amount on the left and right
    // sides, and on the top and bottom sides. So we manually create a new
    // padded input tensor such that we can pass it to cuDNN. For example,
    // with pad_top=1 and pad_bottom=3, cuDNN pads one row on top and bottom,
    // and we pad the remaining two rows at the bottom ourselves.
303 | const int64_t common_padding_rows = |
304 | std::min(params.pad_top, params.pad_bottom); |
305 | const int64_t common_padding_cols = |
306 | std::min(params.pad_left, params.pad_right); |
307 | |
308 | Tensor padded_input; |
309 | const int64_t padding_rows_diff = |
310 | std::abs(params.pad_top - params.pad_bottom); |
311 | const int64_t padding_cols_diff = |
312 | std::abs(params.pad_left - params.pad_right); |
313 | |
314 | const int64_t new_in_rows = tensor_in_rows + padding_rows_diff; |
315 | const int64_t new_in_cols = tensor_in_cols + padding_cols_diff; |
316 | |
317 | OP_REQUIRES_OK( |
318 | context, |
319 | context->allocate_temp(DataTypeToEnum<T>::value, |
320 | ShapeFromFormat(data_format, batch_size, |
321 | new_in_rows, new_in_cols, depth), |
322 | &padded_input)); |
323 | const int64_t input_pad_top = params.pad_top - common_padding_rows; |
324 | const int64_t input_pad_bottom = params.pad_bottom - common_padding_rows; |
325 | const int64_t input_pad_left = params.pad_left - common_padding_cols; |
326 | const int64_t input_pad_right = params.pad_right - common_padding_cols; |
327 | |
328 | bool in_bounds = |
329 | FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) && |
330 | FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) && |
331 | FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) && |
332 | FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max()); |
333 | if (!in_bounds) { |
334 | context->SetStatus(errors::InvalidArgument("Padding is too large." )); |
335 | return; |
336 | } |
337 | |
    // PadInputWithNegativeInf requires a const input tensor, so bind a const
    // reference to transformed_input.
339 | const Tensor& const_transformed_input = transformed_input; |
340 | OP_REQUIRES_OK( |
341 | context, |
342 | PadInputWithNegativeInf<T>()( |
343 | context->eigen_device<GPUDevice>(), |
344 | To32Bit(const_transformed_input.tensor<T, 4>()), |
345 | static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom), |
346 | static_cast<int>(input_pad_left), static_cast<int>(input_pad_right), |
347 | To32Bit(padded_input.tensor<T, 4>()), data_format)); |
348 | transformed_input = padded_input; |
349 | vertical_padding = common_padding_rows; |
350 | horizontal_padding = common_padding_cols; |
351 | tensor_in_rows = new_in_rows; |
352 | tensor_in_cols = new_in_cols; |
353 | } |
354 | |
355 | se::dnn::PoolingDescriptor pooling_desc; |
356 | pooling_desc.set_pooling_mode(pooling_mode) |
357 | .set_window_height(params.window_rows) |
358 | .set_window_width(params.window_cols) |
359 | .set_vertical_stride(params.row_stride) |
360 | .set_horizontal_stride(params.col_stride) |
361 | .set_vertical_padding(vertical_padding) |
362 | .set_horizontal_padding(horizontal_padding) |
363 | .set_propagate_nans(propagate_nans); |
364 | |
365 | se::dnn::BatchDescriptor input_desc; |
366 | input_desc.set_count(batch_size) |
367 | .set_height(tensor_in_rows) |
368 | .set_width(tensor_in_cols) |
369 | .set_feature_map_count(depth) |
370 | .set_layout(data_layout); |
371 | |
372 | se::dnn::BatchDescriptor output_desc; |
373 | output_desc.set_count(batch_size) |
374 | .set_height(params.out_height) |
375 | .set_width(params.out_width) |
376 | .set_feature_map_count(depth) |
377 | .set_layout(data_layout); |
378 | |
379 | auto input_data = |
380 | AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>( |
381 | transformed_input.template flat<T>().data()), |
382 | transformed_input.template flat<T>().size()); |
383 | |
384 | auto output_data = |
385 | AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>( |
386 | transformed_output.template flat<T>().data()), |
387 | transformed_output.template flat<T>().size()); |
388 | |
389 | auto* stream = context->op_device_context()->stream(); |
390 | OP_REQUIRES(context, stream, errors::Internal("No GPU stream available." )); |
391 | |
392 | #if TENSORFLOW_USE_ROCM |
  static int64_t PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
396 | ); |
397 | |
398 | DnnScratchAllocator scratch_allocator(PoolingScratchSize, context); |
399 | OP_REQUIRES_OK(context, stream->ThenPoolForward( |
400 | pooling_desc, input_desc, input_data, output_desc, |
401 | &output_data, &scratch_allocator)); |
402 | #else |
403 | OP_REQUIRES_OK(context, |
404 | stream->ThenPoolForward(pooling_desc, input_desc, input_data, |
405 | output_desc, &output_data)); |
406 | #endif |
407 | |
408 | #if CUDNN_VERSION < 7300 |
409 | if (data_format == FORMAT_NHWC) { |
    /// Transform the output data from NCHW back to NHWC.
    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
    using RT = typename RawType<T>::type;
    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
        context->eigen_device<GPUDevice>(),
415 | toConstTensor(transformed_output).template tensor<RT, 4>(), |
416 | tensor_out->tensor<RT, 4>()); |
417 | } |
418 | #endif |
419 | } |
420 | |
421 | // Forward declarations of the functor specializations for GPU. |
422 | namespace functor { |
423 | #define DECLARE_GPU_SPEC(T) \ |
424 | template <> \ |
425 | void PadInput<GPUDevice, T, int, 4>::operator()( \ |
426 | const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ |
427 | const std::array<int, 2>& padding_left, \ |
428 | const std::array<int, 2>& padding_right, \ |
429 | typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \ |
430 | const T& padding_value); \ |
431 | extern template struct PadInput<GPUDevice, T, int, 4>; |
432 | |
433 | DECLARE_GPU_SPEC(float); |
434 | DECLARE_GPU_SPEC(Eigen::half); |
435 | DECLARE_GPU_SPEC(double); |
436 | DECLARE_GPU_SPEC(int32); |
437 | } // namespace functor |
438 | |
439 | template <typename T> |
440 | void DnnPoolingGradOp<T>::Compute( |
441 | OpKernelContext* context, se::dnn::PoolingMode pooling_mode, |
442 | const std::vector<int32>& size, const std::vector<int32>& stride, |
443 | Padding padding, std::vector<int64_t> explicit_paddings, |
444 | TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out, |
445 | const Tensor& out_backprop, const TensorShape& tensor_in_shape, |
446 | bool propagate_nans) { |
  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
        (tensor_in && tensor_out))
      << "For MaxPoolGrad, both tensor_in and tensor_out need to be "
         "specified";
451 | |
452 | Tensor* input_backprop = nullptr; |
453 | OP_REQUIRES_OK(context, |
454 | context->allocate_output(0, tensor_in_shape, &input_backprop)); |
455 | if (tensor_in_shape.num_elements() == 0) { |
456 | return; |
457 | } |
458 | |
459 | PoolParameters params{context, size, stride, padding, |
460 | explicit_paddings, data_format, tensor_in_shape}; |
461 | if (!context->status().ok()) { |
462 | return; |
463 | } |
464 | if (tensor_out) { |
465 | OP_REQUIRES(context, tensor_out->shape() == params.forward_output_shape(), |
466 | errors::InvalidArgument("Expected orig_output shape to be " , |
467 | params.forward_output_shape(), |
468 | ", but got " , tensor_out->shape())); |
469 | } |
470 | OP_REQUIRES(context, out_backprop.shape() == params.forward_output_shape(), |
471 | errors::InvalidArgument("Expected grad shape to be " , |
472 | params.forward_output_shape(), |
473 | ", but got " , out_backprop.shape())); |
474 | |
475 | TensorFormat transformed_input_data_format = data_format; |
476 | |
477 | #if CUDNN_VERSION < 7300 |
  /// Earlier cuDNN versions do not support the NHWC format, so we convert the
  /// input to NCHW before calling cuDNN. Remove this workaround once those
  /// versions are no longer supported.
480 | Tensor transformed_input; |
481 | TensorShape transformed_input_shape; |
482 | if (data_format == FORMAT_NHWC || !tensor_in) { |
483 | transformed_input_shape = |
484 | ShapeFromFormat(FORMAT_NCHW, tensor_in_shape, data_format); |
485 | OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value, |
486 | transformed_input_shape, |
487 | &transformed_input)); |
488 | } else { |
489 | transformed_input = *tensor_in; |
490 | } |
491 | Tensor transformed_output; |
492 | TensorShape transformed_output_shape; |
493 | if (data_format == FORMAT_NHWC || !tensor_out) { |
494 | transformed_output_shape = |
495 | ShapeFromFormat(FORMAT_NCHW, out_backprop.shape(), data_format); |
496 | OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value, |
497 | transformed_output_shape, |
498 | &transformed_output)); |
499 | } else { |
500 | transformed_output = *tensor_out; |
501 | } |
502 | Tensor transformed_input_backprop; |
503 | if (data_format == FORMAT_NHWC) { |
504 | OP_REQUIRES_OK(context, |
505 | context->allocate_temp(DataTypeToEnum<T>::value, |
506 | transformed_input_shape, |
507 | &transformed_input_backprop)); |
508 | } else { |
509 | transformed_input_backprop = *input_backprop; |
510 | } |
511 | Tensor transformed_output_backprop; |
512 | if (data_format == FORMAT_NHWC) { |
513 | OP_REQUIRES_OK(context, |
514 | context->allocate_temp(DataTypeToEnum<T>::value, |
515 | transformed_output_shape, |
516 | &transformed_output_backprop)); |
517 | } else { |
518 | transformed_output_backprop = out_backprop; |
519 | } |
520 | |
521 | if (data_format == FORMAT_NHWC) { |
522 | /// Convert the data from NHWC to NCHW if necessary. |
523 | if (tensor_in) { |
      // For AvgPoolGrad, the original input tensor is not needed. However,
      // cudnn still requires it to be passed in, even though it does not
      // affect the results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          context->eigen_device<GPUDevice>(), tensor_in->tensor<T, 4>(),
          transformed_input.tensor<T, 4>());
530 | transformed_input_data_format = FORMAT_NCHW; |
531 | } |
532 | if (tensor_out) { |
      // For AvgPoolGrad, the original output tensor is not needed. However,
      // cudnn still requires it to be passed in, even though it does not
      // affect the results.
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          context->eigen_device<GPUDevice>(), tensor_out->tensor<T, 4>(),
          transformed_output.tensor<T, 4>());
539 | } |
540 | functor::NHWCToNCHW<GPUDevice, T, 4>()( |
541 | context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), |
542 | transformed_output_backprop.tensor<T, 4>()); |
543 | } |
544 | se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX; |
545 | #else |
546 | Tensor transformed_input; |
547 | if (!tensor_in) { |
548 | OP_REQUIRES_OK(context, |
549 | context->allocate_temp(DataTypeToEnum<T>::value, |
550 | tensor_in_shape, &transformed_input)); |
551 | } else { |
552 | transformed_input = *tensor_in; |
553 | } |
554 | Tensor transformed_output; |
555 | if (!tensor_out) { |
556 | OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::value, |
557 | out_backprop.shape(), |
558 | &transformed_output)); |
559 | } else { |
560 | transformed_output = *tensor_out; |
561 | } |
562 | Tensor transformed_input_backprop = *input_backprop; |
563 | Tensor transformed_output_backprop = out_backprop; |
564 | se::dnn::DataLayout data_layout; |
565 | switch (data_format) { |
566 | case FORMAT_NHWC: |
567 | data_layout = se::dnn::DataLayout::kBatchYXDepth; |
568 | break; |
569 | case FORMAT_NCHW: |
570 | data_layout = se::dnn::DataLayout::kBatchDepthYX; |
571 | break; |
572 | default: |
573 | OP_REQUIRES(context, false, |
574 | errors::InvalidArgument("Unsupported format: " , |
575 | ToString(data_format))); |
576 | } |
577 | #endif // CUDNN_VERSION < 7300 |
578 | |
579 | int64_t vertical_padding = params.pad_top; |
580 | int64_t horizontal_padding = params.pad_left; |
581 | |
582 | int batch_size = params.tensor_in_batch; |
583 | int depth = params.depth; |
584 | int tensor_in_cols = params.tensor_in_cols; |
585 | int tensor_in_rows = params.tensor_in_rows; |
586 | |
587 | int64_t input_pad_top = 0; |
588 | int64_t input_pad_bottom = 0; |
589 | int64_t input_pad_left = 0; |
590 | int64_t input_pad_right = 0; |
591 | |
592 | Tensor transformed_and_padded_input_backprop; |
593 | |
594 | if (padding == EXPLICIT && (params.pad_top != params.pad_bottom || |
595 | params.pad_left != params.pad_right)) { |
    // Pad the input in the same way we did during the forward pass, so that
    // cuDNN or MIOpen receives the same input during the backward pass as it
    // did during the forward pass.
599 | const int64_t common_padding_rows = |
600 | std::min(params.pad_top, params.pad_bottom); |
601 | const int64_t common_padding_cols = |
602 | std::min(params.pad_left, params.pad_right); |
603 | |
604 | Tensor padded_input; |
605 | const int64_t padding_rows_diff = |
606 | std::abs(params.pad_top - params.pad_bottom); |
607 | const int64_t padding_cols_diff = |
608 | std::abs(params.pad_left - params.pad_right); |
609 | |
610 | const int64_t new_in_rows = tensor_in_rows + padding_rows_diff; |
611 | const int64_t new_in_cols = tensor_in_cols + padding_cols_diff; |
612 | |
613 | VLOG(2) << "Create new tensor: " |
614 | << " original rows=" << tensor_in_rows |
615 | << " original cols=" << tensor_in_cols |
616 | << " padding_rows=" << new_in_rows |
617 | << " padding_cols=" << new_in_cols << " depth= " << depth |
618 | << " batch_size=" << batch_size << " kernel_rows" |
619 | << params.window_rows << " kernel_col" << params.window_cols |
620 | << " stride_rows" << params.row_stride; |
621 | |
622 | OP_REQUIRES_OK( |
623 | context, context->allocate_temp( |
624 | DataTypeToEnum<T>::value, |
625 | ShapeFromFormat(transformed_input_data_format, batch_size, |
626 | new_in_rows, new_in_cols, depth), |
627 | &padded_input)); |
628 | |
629 | OP_REQUIRES_OK( |
630 | context, context->allocate_temp( |
631 | DataTypeToEnum<T>::value, |
632 | ShapeFromFormat(transformed_input_data_format, batch_size, |
633 | new_in_rows, new_in_cols, depth), |
634 | &transformed_and_padded_input_backprop)); |
635 | |
636 | input_pad_top = params.pad_top - common_padding_rows; |
637 | input_pad_bottom = params.pad_bottom - common_padding_rows; |
638 | input_pad_left = params.pad_left - common_padding_cols; |
639 | input_pad_right = params.pad_right - common_padding_cols; |
640 | |
641 | bool in_bounds = |
642 | FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) && |
643 | FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) && |
644 | FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) && |
645 | FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max()); |
646 | if (!in_bounds) { |
647 | context->SetStatus(errors::InvalidArgument("Padding is too large." )); |
648 | return; |
649 | } |
650 | |
651 | // PadInputWithNegativeInf functor requires input to be a const. |
652 | const Tensor& const_transformed_input = transformed_input; |
653 | OP_REQUIRES_OK( |
654 | context, |
655 | PadInputWithNegativeInf<T>()( |
656 | context->eigen_device<GPUDevice>(), |
657 | To32Bit(const_transformed_input.tensor<T, 4>()), |
658 | static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom), |
659 | static_cast<int>(input_pad_left), static_cast<int>(input_pad_right), |
660 | To32Bit(padded_input.tensor<T, 4>()), |
661 | transformed_input_data_format)); |
662 | |
663 | transformed_input = padded_input; |
664 | |
665 | vertical_padding = common_padding_rows; |
666 | horizontal_padding = common_padding_cols; |
667 | VLOG(2) << "vertical padding set to: " << vertical_padding |
668 | << " horizontal padding set to: " << horizontal_padding; |
669 | tensor_in_rows = new_in_rows; |
670 | tensor_in_cols = new_in_cols; |
671 | } else { |
672 | transformed_and_padded_input_backprop = transformed_input_backprop; |
673 | } |
674 | |
675 | /// Get ready to call cudnn |
676 | se::dnn::PoolingDescriptor pooling_desc; |
677 | pooling_desc.set_pooling_mode(pooling_mode) |
678 | .set_window_height(params.window_rows) |
679 | .set_window_width(params.window_cols) |
680 | .set_vertical_stride(params.row_stride) |
681 | .set_horizontal_stride(params.col_stride) |
682 | .set_vertical_padding(vertical_padding) |
683 | .set_horizontal_padding(horizontal_padding) |
684 | .set_propagate_nans(propagate_nans); |
685 | |
686 | se::dnn::BatchDescriptor orig_output_desc; |
687 | orig_output_desc.set_count(params.tensor_in_batch) |
688 | .set_height(params.out_height) |
689 | .set_width(params.out_width) |
690 | .set_feature_map_count(params.depth) |
691 | .set_layout(data_layout); |
692 | |
693 | se::dnn::BatchDescriptor orig_input_desc; |
694 | orig_input_desc.set_count(params.tensor_in_batch) |
695 | .set_height(tensor_in_rows) |
696 | .set_width(tensor_in_cols) |
697 | .set_feature_map_count(params.depth) |
698 | .set_layout(data_layout); |
699 | |
700 | auto orig_output_data = |
701 | AsDeviceMemory(transformed_output.template flat<T>().data(), |
702 | transformed_output.template flat<T>().size()); |
703 | auto orig_input_data = |
704 | AsDeviceMemory(transformed_input.template flat<T>().data(), |
705 | transformed_input.template flat<T>().size()); |
706 | auto output_backprop_data = |
707 | AsDeviceMemory(transformed_output_backprop.template flat<T>().data(), |
708 | transformed_output_backprop.template flat<T>().size()); |
709 | auto input_backprop_data = AsDeviceMemory( |
710 | transformed_and_padded_input_backprop.template flat<T>().data(), |
711 | transformed_and_padded_input_backprop.template flat<T>().size()); |
712 | |
713 | auto* stream = context->op_device_context()->stream(); |
714 | OP_REQUIRES(context, stream, errors::Internal("No GPU stream available." )); |
715 | |
716 | #if TENSORFLOW_USE_ROCM |
  static int64_t PoolingScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
720 | ); |
721 | |
722 | DnnScratchAllocator scratch_allocator(PoolingScratchSize, context); |
723 | OP_REQUIRES_OK(context, |
724 | stream->ThenPoolBackward( |
725 | pooling_desc, orig_input_desc, orig_input_data, |
726 | orig_output_desc, orig_output_data, output_backprop_data, |
727 | &input_backprop_data, &scratch_allocator)); |
728 | #else |
729 | OP_REQUIRES_OK(context, stream->ThenPoolBackward( |
730 | pooling_desc, orig_input_desc, orig_input_data, |
731 | orig_output_desc, orig_output_data, |
732 | output_backprop_data, &input_backprop_data)); |
733 | #endif |
734 | |
735 | if (padding == EXPLICIT && (params.pad_top != params.pad_bottom || |
736 | params.pad_left != params.pad_right)) { |
737 | // Remove the padding that was added to the input shape above. |
738 | functor::PadInput<GPUDevice, T, int, 4>()( |
739 | context->eigen_device<GPUDevice>(), |
740 | To32Bit(const_cast<const Tensor&>(transformed_and_padded_input_backprop) |
741 | .tensor<T, 4>()), |
742 | {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}}, |
743 | {{static_cast<int>(-input_pad_bottom), |
744 | static_cast<int>(-input_pad_right)}}, |
745 | To32Bit(transformed_input_backprop.template tensor<T, 4>()), |
746 | transformed_input_data_format, T{}); |
747 | } |
748 | |
749 | #if CUDNN_VERSION < 7300 |
750 | if (data_format == FORMAT_NHWC) { |
751 | /// Transform the output data from NCHW back to NHWC. |
752 | auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; |
753 | functor::NCHWToNHWC<GPUDevice, T, 4>()( |
754 | context->eigen_device<Device>(), |
755 | toConstTensor(transformed_input_backprop).template tensor<T, 4>(), |
756 | input_backprop->tensor<T, 4>()); |
757 | } |
758 | #endif // CUDNN_VERSION < 7300 |
759 | } |
760 | |
761 | #define DEFINE_DNN_OPS(T) \ |
762 | template class DnnPoolingOp<T>; \ |
763 | template class DnnPoolingGradOp<T>; |
764 | TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS) |
765 | |
766 | #if CUDNN_VERSION >= 7300 |
767 | template class DnnPoolingOp<qint8>; |
768 | #endif |
769 | |
770 | #undef DEFINE_DNN_OPS |
771 | |
772 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
773 | |
774 | } // namespace tensorflow |
775 | |