1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // See docs in ../ops/nn_ops.cc. |
17 | |
18 | #define EIGEN_USE_THREADS |
19 | |
20 | #include "tensorflow/core/kernels/maxpooling_op.h" |
21 | |
22 | #include <type_traits> |
23 | #include <vector> |
24 | |
25 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
26 | #include "tensorflow/core/common_runtime/device.h" |
27 | #include "tensorflow/core/framework/bounds_check.h" |
28 | #include "tensorflow/core/framework/numeric_op.h" |
29 | #include "tensorflow/core/framework/op_kernel.h" |
30 | #include "tensorflow/core/framework/register_types.h" |
31 | #include "tensorflow/core/framework/tensor.h" |
32 | #include "tensorflow/core/framework/tensor_shape.h" |
33 | #include "tensorflow/core/framework/tensor_slice.h" |
34 | #include "tensorflow/core/kernels/conv_2d.h" |
35 | #include "tensorflow/core/kernels/eigen_pooling.h" |
36 | #include "tensorflow/core/kernels/ops_util.h" |
37 | #include "tensorflow/core/kernels/pooling_ops_common.h" |
38 | #include "tensorflow/core/lib/core/errors.h" |
39 | #include "tensorflow/core/lib/gtl/array_slice.h" |
40 | #include "tensorflow/core/util/determinism.h" |
41 | #include "tensorflow/core/util/env_var.h" |
42 | #include "tensorflow/core/util/padding.h" |
43 | #include "tensorflow/core/util/tensor_format.h" |
44 | #include "tensorflow/core/util/use_cudnn.h" |
45 | |
46 | #if GOOGLE_CUDA |
47 | #include "third_party/gpus/cudnn/cudnn.h" |
48 | #endif // GOOGLE_CUDA |
49 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
50 | #include "tensorflow/core/kernels/maxpooling_op_gpu.h" |
51 | #include "tensorflow/core/kernels/pooling_ops_common_gpu.h" |
52 | #include "tensorflow/core/platform/stream_executor.h" |
53 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
54 | |
55 | namespace tensorflow { |
56 | |
57 | typedef Eigen::ThreadPoolDevice CPUDevice; |
58 | typedef Eigen::GpuDevice GPUDevice; |
59 | |
60 | const int kInvalidMaxPoolingIndex = -1; |
61 | |
62 | template <typename Device, typename T, typename Targmax> |
63 | static void SpatialMaxPoolWithArgMaxHelper( |
64 | OpKernelContext* context, Tensor* output, Tensor* output_arg_max, |
65 | Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop, |
66 | const PoolParameters& params, const bool include_batch_in_index) { |
67 | if (input_backprop != nullptr) { |
    OP_REQUIRES(
        context, include_batch_in_index,
        errors::Internal(
            "SpatialMaxPoolWithArgMaxHelper requires include_batch_in_index "
            "to be True when input_backprop != nullptr"));
    OP_REQUIRES(
        context, (std::is_same<Targmax, int64_t>::value),
        errors::Internal("SpatialMaxPoolWithArgMaxHelper requires Targmax "
                         "to be int64 when input_backprop != nullptr"));
77 | } |
78 | if (tensor_in.NumElements() == 0 || output->NumElements() == 0) return; |
79 | |
80 | typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> |
81 | ConstEigenMatrixMap; |
82 | typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> |
83 | EigenMatrixMap; |
84 | typedef Eigen::Map<Eigen::Matrix<Targmax, Eigen::Dynamic, Eigen::Dynamic>> |
85 | EigenIndexMatrixMap; |
86 | |
87 | ConstEigenMatrixMap in_mat( |
88 | tensor_in.flat<T>().data(), params.depth, |
89 | params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); |
90 | EigenMatrixMap out_mat( |
91 | output->flat<T>().data(), params.depth, |
92 | params.out_width * params.out_height * params.tensor_in_batch); |
93 | EigenIndexMatrixMap out_arg_max_mat( |
94 | output_arg_max->flat<Targmax>().data(), params.depth, |
95 | params.out_width * params.out_height * params.tensor_in_batch); |
96 | |
97 | const DeviceBase::CpuWorkerThreads& worker_threads = |
98 | *(context->device()->tensorflow_cpu_worker_threads()); |
99 | |
  // The code below does the following:
101 | // 1. Flattens the input and output tensors into two dimensional arrays. |
102 | // tensor_in_as_matrix: |
103 | // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) |
104 | // output_as_matrix: |
105 | // depth by (out_width * out_height * tensor_in_batch) |
106 | // |
107 | // 2. Walks through the set of columns in the flattened tensor_in_as_matrix, |
108 | // and updates the corresponding column(s) in output_as_matrix with the |
109 | // max value. |
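  //
  // For example, with include_batch_in_index=true, a maximum found at
  // position (b, h, w, d) of the NHWC input is recorded in the argmax output
  // as the flattened offset
  //   ((b * in_rows + h) * in_cols + w) * depth + d;
  // with include_batch_in_index=false the leading batch term is dropped. This
  // is the encoding written below and later inverted by the backprop pass.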
  auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                &output_arg_max, &out_backprop,
                include_batch_in_index](int64_t start, int64_t limit) {
113 | const int32_t depth = params.depth; |
114 | const int32_t in_rows = params.tensor_in_rows; |
115 | const int32_t in_cols = params.tensor_in_cols; |
116 | const int32_t pad_top = params.pad_top; |
117 | const int32_t pad_left = params.pad_left; |
118 | const int32_t window_rows = params.window_rows; |
119 | const int32_t window_cols = params.window_cols; |
120 | const int32_t row_stride = params.row_stride; |
121 | const int32_t col_stride = params.col_stride; |
122 | const int32_t out_height = params.out_height; |
123 | const int32_t out_width = params.out_width; |
124 | |
125 | { |
126 | // Initializes the output tensor with MIN<T>. |
127 | const int32_t output_image_size = out_height * out_width * depth; |
128 | EigenMatrixMap out_shard(out_mat.data() + start * output_image_size, 1, |
129 | (limit - start) * output_image_size); |
130 | out_shard.setConstant(Eigen::NumTraits<T>::lowest()); |
131 | EigenIndexMatrixMap out_arg_max_shard( |
132 | out_arg_max_mat.data() + start * output_image_size, 1, |
133 | (limit - start) * output_image_size); |
134 | out_arg_max_shard.setConstant(kInvalidMaxPoolingIndex); |
135 | } |
136 | |
137 | for (int64_t b = start; b < limit; ++b) { |
138 | for (int h = 0; h < in_rows; ++h) { |
139 | for (int w = 0; w < in_cols; ++w) { |
140 | // (h_start, h_end) * (w_start, w_end) is the range that the input |
141 | // vector projects to. |
142 | const int hpad = h + pad_top; |
143 | const int wpad = w + pad_left; |
144 | const int h_start = |
145 | (hpad < window_rows) ? 0 : (hpad - window_rows) / row_stride + 1; |
146 | const int h_end = std::min(hpad / row_stride + 1, out_height); |
147 | const int w_start = |
148 | (wpad < window_cols) ? 0 : (wpad - window_cols) / col_stride + 1; |
149 | const int w_end = std::min(wpad / col_stride + 1, out_width); |
150 | // compute elementwise max |
151 | const int64_t in_index = (b * in_rows + h) * in_cols + w; |
152 | for (int ph = h_start; ph < h_end; ++ph) { |
153 | const int64_t out_index_base = (b * out_height + ph) * out_width; |
154 | for (int pw = w_start; pw < w_end; ++pw) { |
155 | const int64_t out_index = out_index_base + pw; |
              // NOTE(zhengxq): not using the Eigen matrix operation for now.
158 | for (int d = 0; d < depth; ++d) { |
159 | const T& input_ref = in_mat.coeffRef(d, in_index); |
160 | T& output_ref = out_mat.coeffRef(d, out_index); |
161 | Targmax& out_arg_max_ref = |
162 | out_arg_max_mat.coeffRef(d, out_index); |
163 | if (output_ref < input_ref || |
164 | out_arg_max_ref == kInvalidMaxPoolingIndex) { |
165 | output_ref = input_ref; |
166 | if (include_batch_in_index) { |
167 | out_arg_max_ref = in_index * depth + d; |
168 | } else { |
169 | out_arg_max_ref = (h * in_cols + w) * depth + d; |
170 | } |
171 | } |
172 | } |
173 | } |
174 | } |
175 | } |
176 | } |
177 | } |
178 | |
179 | if (input_backprop != nullptr) { |
180 | auto input_backprop_flat = input_backprop->flat<T>(); |
181 | auto out_arg_max_flat = output_arg_max->flat<int64_t>(); |
182 | auto out_backprop_flat = out_backprop.flat<T>(); |
183 | |
184 | // Initialize output to 0. |
185 | const int64_t in_size = in_rows * in_cols * depth; |
186 | const int64_t in_start = start * in_size; |
187 | const int64_t in_end = limit * in_size; |
188 | EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1, |
189 | in_end - in_start); |
190 | in_shard.setConstant(T(0)); |
191 | |
192 | // Backpropagate. |
      const int64_t out_size = out_height * out_width * depth;
      const int64_t out_start = start * out_size;
      const int64_t out_end = limit * out_size;
      for (int64_t index = out_start; index < out_end; ++index) {
        const int64_t input_backprop_index = out_arg_max_flat(index);
        // Although this check runs in the inner loop, it is worth its cost:
        // it prevents a corrupt argmax value from causing memory corruption,
        // and benchmarks show the performance impact is quite small.
        if (FastBoundsCheck(input_backprop_index - in_start,
                            in_end - in_start) &&
            index < out_backprop.NumElements()) {
          input_backprop_flat(input_backprop_index) +=
              out_backprop_flat(index);
        }
      }
208 | } |
209 | }; |
210 | |
211 | const int64_t shard_cost = params.tensor_in_rows * params.tensor_in_cols * |
212 | params.depth * params.window_rows * |
213 | params.window_cols; |
214 | Shard(worker_threads.num_threads, worker_threads.workers, |
215 | params.tensor_in_batch, shard_cost, shard); |
216 | } |
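
// A minimal usage sketch (names here are illustrative, not part of any public
// API): the forward-with-argmax path calls this helper with a null
// input_backprop,
//
//   Tensor unused;
//   SpatialMaxPoolWithArgMaxHelper<CPUDevice, float, int64_t>(
//       context, &pooled, &argmax, /*input_backprop=*/nullptr, input, unused,
//       params, /*include_batch_in_index=*/true);
//
// while MaxPoolingGradOp below passes a non-null input_backprop so the same
// walk also scatters out_backprop values to the recorded argmax positions.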
217 | |
218 | // The operation to compute MaxPool gradients. |
219 | // It takes three inputs: |
220 | // - The original input tensor |
221 | // - The original output tensor |
222 | // - Backprop tensor for output |
223 | // It produces one output: backprop tensor for input. |
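//
// For example, pooling a 4x4 input with a 2x2 window and stride 2 yields a
// 2x2 output; each out_backprop element is then added to the one input
// position that held its window's maximum, while every other input gradient
// entry stays zero. Ties go to the first maximum encountered in scan order,
// matching the argmax recorded by the helper above.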
224 | template <class Device, class T> |
225 | class MaxPoolingGradOp : public OpKernel { |
226 | public: |
  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
                                "on device type ",
                                DeviceTypeString(context->device_type())));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(
          context, ksize_[3] == 1 && stride_[3] == 1,
          errors::Unimplemented(
              "MaxPoolingGrad is not yet supported on the depth dimension."));
    }

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
      OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                                /*num_dims=*/4, data_format_));
    }
  }
265 | |
266 | void Compute(OpKernelContext* context) override { |
267 | const Tensor& tensor_in = context->input(0); |
268 | const Tensor& tensor_out = context->input(1); |
269 | const Tensor& out_backprop = context->input(2); |
270 | |
271 | // For maxpooling, tensor_in should have 4 dimensions. |
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));
279 | |
280 | const TensorShape& output_shape = tensor_in.shape(); |
281 | |
282 | Tensor tensor_out_dup; |
283 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp( |
284 | {1}, DataTypeToEnum<T>::v(), tensor_out.shape(), |
285 | &tensor_out_dup)); |
286 | Tensor tensor_out_arg_max; |
287 | OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64_t>::v(), |
288 | tensor_out.shape(), |
289 | &tensor_out_arg_max)); |
290 | std::vector<int32> ksize = ksize_; |
291 | std::vector<int32> stride = stride_; |
292 | if (context->num_inputs() == 5) { |
293 | const Tensor& tensor_ksize = context->input(3); |
294 | auto value_ksize = tensor_ksize.flat<int32>(); |
295 | ksize.resize(tensor_ksize.shape().num_elements()); |
296 | std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); |
297 | |
298 | const Tensor& tensor_stride = context->input(4); |
299 | auto value_stride = tensor_stride.flat<int32>(); |
300 | stride.resize(tensor_stride.shape().num_elements()); |
301 | std::copy_n(&value_stride(0), stride.size(), stride.begin()); |
302 | } |
303 | |
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize[3] == 1 && stride[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGrad is not yet supported on the depth dimension."));
317 | |
318 | PoolParameters params{context, |
319 | ksize, |
320 | stride, |
321 | padding_, |
322 | explicit_paddings_, |
323 | FORMAT_NHWC, |
324 | tensor_in.shape()}; |
325 | if (!context->status().ok()) { |
326 | return; |
327 | } |
    OP_REQUIRES(context, tensor_out.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected orig_output shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", tensor_out.shape()));
    OP_REQUIRES(context, out_backprop.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected grad shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", out_backprop.shape()));
336 | |
337 | Tensor* output = nullptr; |
338 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
339 | {0}, 0, output_shape, &output)); |
340 | |
    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T, int64_t>(
        context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
        out_backprop, params, /*include_batch_in_index=*/true);
344 | } |
345 | |
346 | private: |
347 | std::vector<int32> ksize_; |
348 | std::vector<int32> stride_; |
349 | Padding padding_; |
350 | std::vector<int64_t> explicit_paddings_; |
351 | TensorFormat data_format_; |
352 | }; |
353 | |
354 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
355 | |
356 | template <class T> |
357 | class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel { |
358 | public: |
359 | typedef Eigen::GpuDevice Device; |
360 | |
  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
      OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                                /*num_dims=*/4, data_format_));
    }
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
  }
391 | |
392 | void Compute(OpKernelContext* context) override { |
393 | const Tensor& tensor_in = context->input(0); |
394 | const Tensor& tensor_out = context->input(1); |
395 | const Tensor& out_backprop = context->input(2); |
396 | |
397 | // For maxpooling, tensor_in should have 4 dimensions. |
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));
405 | |
406 | TensorShape output_shape = tensor_in.shape(); |
407 | |
408 | std::vector<int32> ksize = ksize_; |
409 | std::vector<int32> stride = stride_; |
410 | if (context->num_inputs() == 5) { |
411 | const Tensor& tensor_ksize = context->input(3); |
412 | auto value_ksize = tensor_ksize.flat<int32>(); |
413 | ksize.resize(tensor_ksize.shape().num_elements()); |
414 | std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); |
415 | |
416 | const Tensor& tensor_stride = context->input(4); |
417 | auto value_stride = tensor_stride.flat<int32>(); |
418 | stride.resize(tensor_stride.shape().num_elements()); |
419 | std::copy_n(&value_stride(0), stride.size(), stride.begin()); |
420 | } |
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
432 | int64_t pad_top, pad_bottom, pad_left, pad_right; |
433 | if (padding_ == Padding::EXPLICIT) { |
434 | GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', |
435 | /*pad_top=*/&pad_top, |
436 | /*pad_bottom=*/&pad_bottom); |
437 | GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', |
438 | /*pad_left=*/&pad_left, |
439 | /*pad_right=*/&pad_right); |
440 | } |
441 | DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize, |
442 | stride, padding_, explicit_paddings_, |
443 | data_format_, &tensor_in, &tensor_out, |
444 | out_backprop, output_shape, propagate_nans_); |
445 | } |
446 | |
447 | private: |
448 | std::vector<int32> ksize_; |
449 | std::vector<int32> stride_; |
450 | Padding padding_; |
451 | std::vector<int64_t> explicit_paddings_; |
452 | TensorFormat data_format_; |
453 | bool propagate_nans_; |
454 | }; |
455 | |
456 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
457 | |
// The operation to compute the gradient of MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for the input gradients (same shape as the original
//     input)
// It produces one output: the backprop tensor for the original output (same
// shape as the original output).
464 | template <class Device, class T> |
465 | class MaxPoolingGradGradOp : public OpKernel { |
466 | public: |
467 | explicit MaxPoolingGradGradOp(OpKernelConstruction* context) |
468 | : OpKernel(context) { |
469 | string data_format; |
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingGradGradOp only supports NHWC ",
            "on device type ", DeviceTypeString(context->device_type())));

    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));

    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
      OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
                  errors::Unimplemented("MaxPoolingGradGrad is not yet "
                                        "supported on the depth dimension."));
    }
497 | } |
498 | |
499 | void Compute(OpKernelContext* context) override { |
500 | const Tensor& tensor_in = context->input(0); |
501 | const Tensor& tensor_out = context->input(1); |
502 | const Tensor& out_grad_backprop = context->input(2); |
503 | |
504 | // For maxpooling, tensor_in should have 4 dimensions. |
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
513 | |
514 | std::vector<int32> ksize = ksize_; |
515 | std::vector<int32> stride = stride_; |
516 | if (context->num_inputs() == 5) { |
517 | const Tensor& tensor_ksize = context->input(3); |
518 | auto value_ksize = tensor_ksize.flat<int32>(); |
519 | ksize.resize(tensor_ksize.shape().num_elements()); |
520 | std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); |
521 | |
522 | const Tensor& tensor_stride = context->input(4); |
523 | auto value_stride = tensor_stride.flat<int32>(); |
524 | stride.resize(tensor_stride.shape().num_elements()); |
525 | std::copy_n(&value_stride(0), stride.size(), stride.begin()); |
526 | } |
527 | |
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(context, ksize[3] == 1 && stride[3] == 1,
                errors::Unimplemented("MaxPoolingGradGrad is not yet "
                                      "supported on the depth dimension."));
541 | |
542 | PoolParameters params{context, |
543 | ksize, |
544 | stride, |
545 | padding_, |
546 | /*explicit_paddings=*/{}, |
547 | FORMAT_NHWC, |
548 | tensor_in.shape()}; |
549 | if (!context->status().ok()) { |
550 | return; |
551 | } |
    OP_REQUIRES(context, tensor_out.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected orig_output shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", tensor_out.shape()));
    OP_REQUIRES(
        context, out_grad_backprop.shape() == tensor_in.shape(),
        errors::InvalidArgument("Expected grad shape to be ", tensor_in.shape(),
                                ", but got ", out_grad_backprop.shape()));
560 | |
561 | Tensor* output = nullptr; |
562 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
563 | {2}, 0, tensor_out.shape(), &output)); |
564 | |
565 | SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out, |
566 | out_grad_backprop, params, padding_); |
567 | } |
568 | |
569 | private: |
570 | void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff, |
571 | const Tensor& tensor_in, const Tensor& tensor_out, |
572 | const Tensor& top_diff, |
573 | const PoolParameters& params, |
574 | const Padding& padding) { |
575 | typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> |
576 | ConstEigenMatrixMap; |
577 | typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> |
578 | EigenMatrixMap; |
579 | |
580 | ConstEigenMatrixMap in_mat( |
581 | tensor_in.flat<T>().data(), params.depth, |
582 | params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); |
583 | ConstEigenMatrixMap out_mat( |
584 | tensor_out.flat<T>().data(), params.depth, |
585 | params.out_width * params.out_height * params.tensor_in_batch); |
586 | ConstEigenMatrixMap top_diff_mat( |
587 | top_diff.flat<T>().data(), params.depth, |
588 | params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); |
589 | EigenMatrixMap bottom_diff_mat( |
590 | bottom_diff->flat<T>().data(), params.depth, |
591 | params.out_width * params.out_height * params.tensor_in_batch); |
592 | |
593 | const DeviceBase::CpuWorkerThreads& worker_threads = |
594 | *(context->device()->tensorflow_cpu_worker_threads()); |
595 | |
    // The code below does the following:
597 | // 1. Flattens the input, output, top_diff and bottom_diff tensors into |
598 | // two dimensional arrays. |
599 | // tensor_in_as_matrix: |
600 | // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) |
601 | // tensor_out_as_matrix: |
602 | // depth by (out_width * out_height * tensor_in_batch) |
603 | // top_diff_as_matrix: |
604 | // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) |
605 | // bottom_diff_as_matrix: |
606 | // depth by (out_width * out_height * tensor_in_batch) |
607 | // |
608 | // 2. Walks through the set of columns in the flattened |
609 | // tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix |
610 | // and updates the column(s) corresponding to the maximum values in |
611 | // tensor_out_as_matrix with the corresponding values in |
612 | // top_diff_as_matrix. |
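    //
    // For example, if the forward maximum for output position (b, ph, pw, d)
    // came from input position (b, h*, w*, d), then
    //   bottom_diff(b, ph, pw, d) = top_diff(b, h*, w*, d);
    // when several inputs in a window tie with the output maximum, the first
    // match in scan order wins (see should_stop below).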
    auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
                     int64_t start, int64_t limit) {
615 | const int32_t depth = params.depth; |
616 | const int32_t in_rows = params.tensor_in_rows; |
617 | const int32_t in_cols = params.tensor_in_cols; |
618 | const int32_t pad_top = params.pad_top; |
619 | const int32_t pad_left = params.pad_left; |
620 | const int32_t window_rows = params.window_rows; |
621 | const int32_t window_cols = params.window_cols; |
622 | const int32_t row_stride = params.row_stride; |
623 | const int32_t col_stride = params.col_stride; |
624 | const int32_t out_height = params.out_height; |
625 | const int32_t out_width = params.out_width; |
626 | |
627 | { |
628 | // Initializes the output grad backprop tensor with 0. |
        const int32_t output_image_size = out_height * out_width * depth;
630 | EigenMatrixMap bottom_diff_shard( |
631 | bottom_diff_mat.data() + start * output_image_size, 1, |
632 | (limit - start) * output_image_size); |
633 | bottom_diff_shard.setZero(); |
634 | } |
635 | |
      for (int64_t b = start; b < limit; ++b) {
637 | for (int ph = 0; ph < out_height; ++ph) { |
638 | for (int pw = 0; pw < out_width; ++pw) { |
639 | // (h_start, h_end) * (w_start, w_end) is the range that the input |
640 | // vector projects to. |
641 | int h_start = ph * row_stride - pad_top; |
642 | const int h_end = std::min(h_start + window_rows, in_rows); |
643 | int w_start = pw * col_stride - pad_left; |
644 | const int w_end = std::min(w_start + window_cols, in_cols); |
645 | h_start = std::max(h_start, 0); |
646 | w_start = std::max(w_start, 0); |
647 | const int out_index = (b * out_height + ph) * out_width + pw; |
648 | // Find value corresponding to the input maximum in top_diff. |
649 | for (int d = 0; d < depth; ++d) { |
650 | const T& output_ref = out_mat.coeffRef(d, out_index); |
651 | bool should_stop = false; |
652 | for (int h = h_start; h < h_end && !should_stop; ++h) { |
653 | for (int w = w_start; w < w_end && !should_stop; ++w) { |
654 | const int in_index = (b * in_rows + h) * in_cols + w; |
655 | const T& input_ref = in_mat.coeffRef(d, in_index); |
656 | if (output_ref == input_ref) { |
657 | T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index); |
658 | bottom_diff_ref = top_diff_mat.coeffRef(d, in_index); |
659 | should_stop = true; |
660 | } |
661 | } |
662 | } |
663 | } |
664 | } |
665 | } |
666 | } |
667 | }; |
668 | |
669 | const int64_t shard_cost = params.out_width * params.out_height * |
670 | params.depth * params.window_rows * |
671 | params.window_cols; |
672 | Shard(worker_threads.num_threads, worker_threads.workers, |
673 | params.tensor_in_batch, shard_cost, shard); |
674 | } |
675 | |
676 | std::vector<int32> ksize_; |
677 | std::vector<int32> stride_; |
678 | Padding padding_; |
679 | TensorFormat data_format_; |
680 | }; |
681 | |
682 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
683 | |
684 | template <class T> |
685 | class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel { |
686 | public: |
687 | typedef Eigen::GpuDevice Device; |
688 | |
689 | explicit MaxPoolingGradGradOp(OpKernelConstruction* context) |
690 | : OpKernel(context) { |
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 3) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window strides field must "
                                          "specify 4 dimensions"));
      const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
711 | } |
712 | |
713 | void Compute(OpKernelContext* context) override { |
714 | const Tensor& tensor_in = context->input(0); |
715 | const Tensor& tensor_out = context->input(1); |
716 | const Tensor& out_grad_backprop = context->input(2); |
717 | |
    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_grad_backprop should have 4 dimensions.
    OP_REQUIRES(
        context, out_grad_backprop.dims() == 4,
        errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
727 | |
728 | Tensor* output = nullptr; |
729 | OP_REQUIRES_OK(context, |
730 | context->allocate_output(0, tensor_out.shape(), &output)); |
731 | |
732 | std::vector<int32> ksize = ksize_; |
733 | std::vector<int32> stride = stride_; |
734 | if (context->num_inputs() == 5) { |
735 | const Tensor& tensor_ksize = context->input(3); |
736 | auto value_ksize = tensor_ksize.flat<int32>(); |
737 | ksize.resize(tensor_ksize.shape().num_elements()); |
738 | std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); |
739 | |
740 | const Tensor& tensor_stride = context->input(4); |
741 | auto value_stride = tensor_stride.flat<int32>(); |
742 | stride.resize(tensor_stride.shape().num_elements()); |
743 | std::copy_n(&value_stride(0), stride.size(), stride.begin()); |
744 | } |
745 | |
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
757 | |
758 | PoolParameters params{context, |
759 | ksize, |
760 | stride, |
761 | padding_, |
762 | /*explicit_paddings=*/{}, |
763 | data_format_, |
764 | tensor_in.shape()}; |
765 | if (!context->status().ok()) { |
766 | return; |
767 | } |
    OP_REQUIRES(context, tensor_out.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected orig_output shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", tensor_out.shape()));
    OP_REQUIRES(
        context, out_grad_backprop.shape() == tensor_in.shape(),
        errors::InvalidArgument("Expected grad shape to be ", tensor_in.shape(),
                                ", but got ", out_grad_backprop.shape()));
776 | |
777 | functor::MaxPoolGradBackwardNoMask<T>()( |
778 | data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(), |
779 | params.tensor_in_batch, params.out_height, params.out_width, |
780 | params.depth, params.tensor_in_rows, params.tensor_in_cols, |
781 | params.window_rows, params.window_cols, params.row_stride, |
782 | params.col_stride, params.pad_top, params.pad_left, |
783 | out_grad_backprop.flat<T>().data(), output->flat<T>().data(), |
784 | context->eigen_device<Eigen::GpuDevice>()); |
785 | } |
786 | |
787 | private: |
788 | std::vector<int32> ksize_; |
789 | std::vector<int32> stride_; |
790 | Padding padding_; |
791 | TensorFormat data_format_; |
793 | }; |
794 | |
795 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
796 | |
797 | template <typename Device, typename T> |
798 | struct LaunchMaxPoolingNoMask; |
799 | |
800 | template <typename Device, typename T> |
801 | class MaxPoolingNoMaskOp : public OpKernel { |
802 | public: |
803 | explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) |
804 | : OpKernel(context) { |
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskOp only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, padding_ != EXPLICIT,
        errors::Unimplemented(
            "Explicit padding is not supported for MaxPoolingNoMaskOp."));
830 | } |
831 | |
832 | void Compute(OpKernelContext* context) override { |
833 | const Tensor& tensor_in = context->input(0); |
834 | |
835 | PoolParameters params{context, |
836 | ksize_, |
837 | stride_, |
838 | padding_, |
839 | /*explicit_paddings=*/{}, |
840 | data_format_, |
841 | tensor_in.shape()}; |
842 | if (!context->status().ok()) { |
843 | return; |
844 | } |
845 | |
846 | TensorShape out_shape({params.tensor_in_batch, params.out_height, |
847 | params.out_width, params.depth}); |
848 | Tensor* output = nullptr; |
849 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
850 | |
851 | LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, |
852 | output); |
853 | } |
854 | |
855 | private: |
856 | std::vector<int32> ksize_; |
857 | std::vector<int32> stride_; |
858 | Padding padding_; |
859 | TensorFormat data_format_; |
860 | }; |
861 | |
862 | template <typename Device, typename T> |
863 | class MaxPoolingNoMaskV2Op : public OpKernel { |
864 | public: |
865 | explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context) |
866 | : OpKernel(context) { |
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(
        context, data_format_ == FORMAT_NHWC,
        errors::InvalidArgument(
            "Default MaxPoolingNoMaskV2Op only supports NHWC on device type ",
            DeviceTypeString(context->device_type())));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
890 | } |
891 | |
892 | void Compute(OpKernelContext* context) override { |
893 | const Tensor& tensor_in = context->input(0); |
894 | |
895 | std::vector<int32> ksize = ksize_; |
896 | std::vector<int32> stride = stride_; |
897 | |
898 | if (context->num_inputs() != 1) { |
899 | const Tensor& tensor_ksize = context->input(1); |
900 | auto value_ksize = tensor_ksize.flat<int32>(); |
901 | ksize.resize(tensor_ksize.shape().num_elements()); |
902 | std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); |
903 | |
904 | const Tensor& tensor_stride = context->input(2); |
905 | auto value_stride = tensor_stride.flat<int32>(); |
906 | stride.resize(tensor_stride.shape().num_elements()); |
907 | std::copy_n(&value_stride(0), stride.size(), stride.begin()); |
908 | } |
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
918 | PoolParameters params{context, |
919 | ksize, |
920 | stride, |
921 | padding_, |
922 | /*explicit_paddings=*/{}, |
923 | data_format_, |
924 | tensor_in.shape()}; |
925 | if (!context->status().ok()) { |
926 | return; |
927 | } |
928 | |
929 | TensorShape out_shape({params.tensor_in_batch, params.out_height, |
930 | params.out_width, params.depth}); |
931 | Tensor* output = nullptr; |
932 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
933 | |
934 | LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, |
935 | output); |
936 | } |
937 | |
938 | private: |
939 | std::vector<int32> ksize_; |
940 | std::vector<int32> stride_; |
941 | Padding padding_; |
942 | TensorFormat data_format_; |
943 | }; |
944 | |
945 | template <typename Device, typename T, typename Targmax> |
946 | struct LaunchMaxPoolingWithArgmax; |
947 | |
948 | template <typename T, typename Targmax> |
949 | struct LaunchMaxPoolingWithArgmax<CPUDevice, T, Targmax> { |
950 | static void launch(OpKernelContext* context, const PoolParameters& params, |
951 | const Tensor& input, Tensor* output, Tensor* argmax, |
952 | bool propagate_nans, bool include_batch_in_index) { |
953 | Tensor unused; |
954 | SpatialMaxPoolWithArgMaxHelper<CPUDevice, T, Targmax>( |
955 | context, output, argmax, /*input_backprop=*/nullptr, input, unused, |
956 | params, include_batch_in_index); |
957 | } |
958 | }; |
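
// A tiny worked example (values illustrative): pooling the 1x2x2x1 NHWC input
// [[1, 3], [2, 4]] with a 2x2 window, stride 1, and VALID padding produces the
// output [[4]] and, with include_batch_in_index=false, the argmax [[3]] -- the
// flattened offset (h * in_cols + w) * depth + d of the maximum element.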
959 | |
960 | template <typename Device, typename T, typename Targmax> |
961 | class MaxPoolingWithArgmaxOp : public OpKernel { |
962 | public: |
963 | explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context) |
964 | : OpKernel(context) { |
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
981 | } |
982 | |
983 | void Compute(OpKernelContext* context) override { |
984 | const Tensor& tensor_in = context->input(0); |
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_in.NumElements() > 0,
                errors::InvalidArgument("tensor_in must not be empty"));
989 | |
990 | PoolParameters params{context, |
991 | ksize_, |
992 | stride_, |
993 | padding_, |
994 | /*explicit_paddings=*/{}, |
995 | FORMAT_NHWC, |
996 | tensor_in.shape()}; |
997 | if (!context->status().ok()) { |
998 | return; |
999 | } |
1000 | |
1001 | TensorShape out_shape({params.tensor_in_batch, params.out_height, |
1002 | params.out_width, params.depth}); |
1003 | Tensor* output = nullptr; |
1004 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
1005 | Tensor* argmax = nullptr; |
1006 | OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax)); |
1007 | |
1008 | LaunchMaxPoolingWithArgmax<Device, T, Targmax>::launch( |
1009 | context, params, tensor_in, output, argmax, propagate_nans_, |
1010 | include_batch_in_index_); |
1011 | } |
1012 | |
1013 | private: |
1014 | std::vector<int32> ksize_; |
1015 | std::vector<int32> stride_; |
1016 | Padding padding_; |
1017 | bool propagate_nans_; |
1018 | bool include_batch_in_index_; |
1019 | }; |
1020 | |
1021 | template <typename Device, typename T> |
1022 | struct LaunchMaxPoolingGradWithArgmax; |
1023 | |
1024 | template <typename T> |
1025 | struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> { |
1026 | typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> |
1027 | EigenMatrixMap; |
1028 | |
1029 | static void launch(OpKernelContext* context, const PoolParameters& params, |
1030 | const Tensor& grad_in, const Tensor& argmax, |
1031 | Tensor* grad_out, const bool include_batch_in_index) { |
1032 | const DeviceBase::CpuWorkerThreads& worker_threads = |
1033 | *(context->device()->tensorflow_cpu_worker_threads()); |
1034 | |
1035 | auto shard = [&grad_in, &argmax, &grad_out, include_batch_in_index]( |
1036 | int64_t start, int64_t limit) { |
1037 | const int64_t batch_size = |
1038 | GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N'); |
1039 | const int64_t output_size_per_batch = |
1040 | grad_out->NumElements() / batch_size; |
1041 | const int64_t input_size_per_batch = grad_in.NumElements() / batch_size; |
1042 | |
1043 | { |
1044 | auto grad_out_flat = grad_out->flat<T>(); |
1045 | auto argmax_flat = argmax.flat<int64_t>(); |
1046 | auto grad_in_flat = grad_in.flat<T>(); |
1047 | |
        const int64_t output_start = start * output_size_per_batch;
        const int64_t output_end = limit * output_size_per_batch;
        EigenMatrixMap grad_out_shard(grad_out_flat.data() + output_start, 1,
                                      output_end - output_start);
        grad_out_shard.setConstant(T(0));

        const int64_t input_start = start * input_size_per_batch;
        const int64_t input_end = limit * input_size_per_batch;
        for (int64_t index = input_start; index < input_end; index++) {
1057 | if (index >= argmax.NumElements()) { |
1058 | break; |
1059 | } |
1060 | int64_t grad_out_index = argmax_flat(index); |
1061 | if (!include_batch_in_index) { |
1062 | const int64_t cur_batch = index / input_size_per_batch; |
1063 | grad_out_index += cur_batch * output_size_per_batch; |
1064 | } |
1065 | CHECK(grad_out_index >= output_start && grad_out_index < output_end) |
1066 | << "Invalid output gradient index: " << grad_out_index << ", " |
1067 | << output_start << ", " << output_end; |
1068 | grad_out_flat(grad_out_index) += grad_in_flat(index); |
1069 | } |
1070 | } |
1071 | }; |
1072 | |
1073 | const int64_t batch_size = |
1074 | GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N'); |
1075 | const int64_t shard_cost = grad_out->NumElements() / batch_size; |
1076 | Shard(worker_threads.num_threads, worker_threads.workers, batch_size, |
1077 | shard_cost, shard); |
1078 | } |
1079 | }; |
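
// A small worked example of the index conversion above (sizes illustrative):
// with include_batch_in_index=false each argmax value is relative to its own
// batch, so if grad_out holds 16 elements per batch, an argmax of 5 read while
// processing batch 2 addresses flattened position 2 * 16 + 5 = 37 in grad_out.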
1080 | |
1081 | // TODO(b/175733711): Support int32 argmax type in MaxPoolGradWithArgmax op. |
1082 | template <typename Device, typename T> |
1083 | class MaxPoolingGradWithArgmaxOp : public OpKernel { |
1084 | public: |
1085 | explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context) |
1086 | : OpKernel(context) { |
1087 | string data_format_str; |
    if (std::is_same<Device, GPUDevice>::value) {
      OP_REQUIRES(context, !tensorflow::OpDeterminismRequired(),
                  errors::Unimplemented("Determinism is not yet supported "
                                        "for MaxPoolGradWithArgmax."));
    }
    auto status = context->GetAttr("data_format", &data_format_str);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
    }

    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
1113 | } |
1114 | |
1115 | void Compute(OpKernelContext* context) override { |
1116 | const Tensor& tensor_in = context->input(0); |
1117 | const Tensor& grad_in = context->input(1); |
1118 | const Tensor& argmax = context->input(2); |
1119 | |
1120 | PoolParameters params{context, |
1121 | ksize_, |
1122 | stride_, |
1123 | padding_, |
1124 | /*explicit_paddings=*/{}, |
1125 | FORMAT_NHWC, |
1126 | tensor_in.shape()}; |
1127 | if (!context->status().ok()) { |
1128 | return; |
1129 | } |
    OP_REQUIRES(context, grad_in.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected grad shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", grad_in.shape()));
    OP_REQUIRES(context, argmax.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected argmax shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", argmax.shape()));
1138 | |
1139 | TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows, |
1140 | params.tensor_in_cols, params.depth}); |
1141 | Tensor* grad_out = nullptr; |
1142 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
1143 | {0}, 0, out_shape, &grad_out)); |
1144 | |
1145 | if (out_shape.num_elements() == 0) return; // nothing to be done |
1146 | |
1147 | LaunchMaxPoolingGradWithArgmax<Device, T>::launch( |
1148 | context, params, grad_in, argmax, grad_out, include_batch_in_index_); |
1149 | } |
1150 | |
1151 | private: |
1152 | std::vector<int32> ksize_; |
1153 | std::vector<int32> stride_; |
1154 | Padding padding_; |
1155 | TensorFormat data_format_; |
1156 | bool include_batch_in_index_; |
1157 | }; |
1158 | |
1159 | template <typename Device, typename T> |
1160 | struct LaunchMaxPoolingGradGradWithArgmax; |
1161 | |
1162 | template <typename Device, typename T> |
1163 | class MaxPoolingGradGradWithArgmaxOp : public OpKernel { |
1164 | public: |
1165 | explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context) |
1166 | : OpKernel(context) { |
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                             &include_batch_in_index_));
1181 | } |
1182 | |
1183 | void Compute(OpKernelContext* context) override { |
1184 | const Tensor& tensor_in = context->input(0); |
1185 | const Tensor& grad_in = context->input(1); |
1186 | const Tensor& argmax = context->input(2); |
1187 | |
1188 | PoolParameters params{context, |
1189 | ksize_, |
1190 | stride_, |
1191 | padding_, |
1192 | /*explicit_paddings=*/{}, |
1193 | FORMAT_NHWC, |
1194 | tensor_in.shape()}; |
1195 | if (!context->status().ok()) { |
1196 | return; |
1197 | } |
    OP_REQUIRES(
        context, grad_in.shape() == tensor_in.shape(),
        errors::InvalidArgument("Expected grad shape to be ", tensor_in.shape(),
                                ", but got ", grad_in.shape()));
    OP_REQUIRES(context, argmax.shape() == params.forward_output_shape(),
                errors::InvalidArgument("Expected argmax shape to be ",
                                        params.forward_output_shape(),
                                        ", but got ", argmax.shape()));
1206 | |
1207 | TensorShape out_shape({params.tensor_in_batch, params.out_height, |
1208 | params.out_width, params.depth}); |
1209 | |
1210 | Tensor* grad_out = nullptr; |
1211 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
1212 | {0}, 0, out_shape, &grad_out)); |
1213 | |
1214 | LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch( |
1215 | context, params, grad_in, argmax, grad_out, include_batch_in_index_); |
1216 | } |
1217 | |
1218 | private: |
1219 | std::vector<int32> ksize_; |
1220 | std::vector<int32> stride_; |
1221 | Padding padding_; |
1222 | bool include_batch_in_index_; |
1223 | }; |
1224 | |
1225 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1226 | template <typename T> |
1227 | class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel { |
1228 | public: |
1229 | typedef GPUDevice Device; |
1230 | explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) |
1231 | : OpKernel(context) { |
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("explicit_paddings", &explicit_paddings_));
    const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
1255 | } |
1256 | |
1257 | void Compute(OpKernelContext* context) override { |
1258 | const Tensor& tensor_in = context->input(0); |
1259 | |
1260 | PoolParameters params{ |
1261 | context, ksize_, stride_, padding_, explicit_paddings_, |
1262 | data_format_, tensor_in.shape()}; |
1263 | if (!context->status().ok()) { |
1264 | return; |
1265 | } |
1266 | |
1267 | TensorShape out_shape = |
1268 | ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height, |
1269 | params.out_width, params.depth); |
1270 | |
1271 | // Degenerate pooling output should return an empty tensor. |
1272 | if (out_shape.num_elements() == 0) { |
1273 | Tensor* output = nullptr; |
1274 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
1275 | return; |
1276 | } |
1277 | |
1278 | // Assuming qint8 <--> NCHW_VECT_C (int8x4) here. |
1279 | constexpr bool is_int8x4 = std::is_same<T, qint8>::value; |
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));
1283 | |
1284 | #if CUDNN_VERSION >= 7300 |
1285 | DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_, |
1286 | stride_, padding_, explicit_paddings_, |
1287 | data_format_, tensor_in, out_shape, |
1288 | propagate_nans_); |
1289 | #else |
1290 | // These is_int8x4 checks avoid linker errors for missing qint8 kernels. |
1291 | if (!is_int8x4 && data_format_ == FORMAT_NCHW) { |
1292 | DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_, |
1293 | stride_, padding_, explicit_paddings_, |
1294 | data_format_, tensor_in, out_shape, |
1295 | propagate_nans_); |
1296 | } else { |
1297 | #if !defined(TENSORFLOW_USE_ROCM) |
      OP_REQUIRES(context, padding_ != EXPLICIT,
                  errors::Unimplemented("Explicit padding is not supported "
                                        "when CUDNN is not enabled."));
1301 | #endif |
1302 | Tensor* output = nullptr; |
1303 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
1304 | if (is_int8x4) { |
1305 | LaunchMaxPoolingNoMask_NCHW_VECT_C<Device>::launch(context, params, |
1306 | tensor_in, output); |
1307 | } else if (data_format_ == FORMAT_NHWC) { |
1308 | LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, |
1309 | output, propagate_nans_); |
1310 | } else { |
        LOG(FATAL) << "MaxPool currently only supports the following (layout, "
                      "type) combinations: (NHWC, non-qint8), "
                      "(NCHW, non-qint8) or (NCHW_VECT_C, qint8). The "
                      "requested combination ("
                   << ToString(data_format_) << ", "
                   << DataTypeString(DataTypeToEnum<T>::v())
                   << ") is not supported.";
1318 | } |
1319 | } |
1320 | #endif |
1321 | } |
1322 | |
1323 | private: |
1324 | std::vector<int32> ksize_; |
1325 | std::vector<int32> stride_; |
1326 | Padding padding_; |
1327 | std::vector<int64_t> explicit_paddings_; |
1328 | TensorFormat data_format_; |
1329 | bool propagate_nans_; |
1330 | }; |
1331 | |
1332 | template <typename T> |
1333 | class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel { |
1334 | public: |
1335 | typedef GPUDevice Device; |
1336 | explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context) |
1337 | : OpKernel(context) { |
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      const int32_t ksize_n = GetTensorDim(ksize_, data_format_, 'N');
      const int32_t stride_n = GetTensorDim(stride_, data_format_, 'N');
      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                   &propagate_nans_));
1360 | } |
1361 | |
1362 | void Compute(OpKernelContext* context) override { |
1363 | const Tensor& tensor_in = context->input(0); |
1364 | |
1365 | std::vector<int32> ksize = ksize_; |
1366 | std::vector<int32> stride = stride_; |
1367 | |
1368 | if (context->num_inputs() != 1) { |
1369 | const Tensor& tensor_ksize = context->input(1); |
1370 | auto value_ksize = tensor_ksize.flat<int32>(); |
1371 | ksize.resize(tensor_ksize.shape().num_elements()); |
1372 | std::copy_n(&value_ksize(0), ksize.size(), ksize.begin()); |
1373 | |
1374 | const Tensor& tensor_stride = context->input(2); |
1375 | auto value_stride = tensor_stride.flat<int32>(); |
1376 | stride.resize(tensor_stride.shape().num_elements()); |
1377 | std::copy_n(&value_stride(0), stride.size(), stride.begin()); |
1378 | } |
    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    const int32_t ksize_n = GetTensorDim(ksize, data_format_, 'N');
    const int32_t stride_n = GetTensorDim(stride, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
1390 | |
1391 | PoolParameters params{context, |
1392 | ksize, |
1393 | stride, |
1394 | padding_, |
1395 | /*explicit_paddings=*/{}, |
1396 | data_format_, |
1397 | tensor_in.shape()}; |
1398 | if (!context->status().ok()) { |
1399 | return; |
1400 | } |
1401 | |
1402 | TensorShape out_shape = |
1403 | ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height, |
1404 | params.out_width, params.depth); |
1405 | if (data_format_ == FORMAT_NCHW) { |
1406 | DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize, |
1407 | stride, padding_, explicit_paddings_, |
1408 | data_format_, tensor_in, out_shape, |
1409 | propagate_nans_); |
1410 | } else { |
      CHECK(data_format_ == FORMAT_NHWC)
          << "MaxPool only supports NCHW or NHWC format";
1413 | Tensor* output = nullptr; |
1414 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
1415 | LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, |
1416 | output, propagate_nans_); |
1417 | } |
1418 | } |
1419 | |
1420 | private: |
1421 | std::vector<int32> ksize_; |
1422 | std::vector<int32> stride_; |
1423 | Padding padding_; |
1424 | std::vector<int64_t> explicit_paddings_; |
1425 | TensorFormat data_format_; |
1426 | bool propagate_nans_; |
1427 | }; |
1428 | |
1429 | template <typename T> |
1430 | struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> { |
1431 | static void launch(OpKernelContext* context, const PoolParameters& params, |
1432 | const Tensor& input, Tensor* output, bool propagate_nans) { |
1433 | bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()( |
1434 | input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, |
1435 | params.tensor_in_cols, params.depth, params.out_height, |
1436 | params.out_width, params.window_rows, params.window_cols, |
1437 | params.row_stride, params.col_stride, params.pad_top, params.pad_left, |
1438 | output->flat<T>().data(), nullptr, context->eigen_gpu_device(), |
1439 | propagate_nans, false); |
1440 | if (!status) { |
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardNoMask"));
1443 | } |
1444 | } |
1445 | }; |
1446 | |
1447 | template <typename T> |
1448 | struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T, int64_t> { |
1449 | static void launch(OpKernelContext* context, const PoolParameters& params, |
1450 | const Tensor& input, Tensor* output, Tensor* argmax, |
1451 | bool propagate_nans, bool include_batch_in_index) { |
1452 | bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()( |
1453 | input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, |
1454 | params.tensor_in_cols, params.depth, params.out_height, |
1455 | params.out_width, params.window_rows, params.window_cols, |
1456 | params.row_stride, params.col_stride, params.pad_top, params.pad_left, |
1457 | output->flat<T>().data(), |
1458 | reinterpret_cast<int64_t*>(argmax->flat<int64_t>().data()), |
1459 | context->eigen_gpu_device(), propagate_nans, include_batch_in_index); |
1460 | if (!status) { |
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
1463 | } |
1464 | } |
1465 | }; |
1466 | |
1467 | template <typename T> |
1468 | struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> { |
1469 | static void launch(OpKernelContext* context, const PoolParameters& params, |
1470 | const Tensor& grad_in, const Tensor& argmax, |
1471 | Tensor* grad_out, const bool include_batch_in_index) { |
1472 | const int input_size = params.tensor_in_batch * params.tensor_in_rows * |
1473 | params.tensor_in_cols * params.depth; |
1474 | const int output_size = params.tensor_in_batch * params.out_height * |
1475 | params.out_width * params.depth; |
1476 | const int top_offset = params.out_height * params.out_width * params.depth; |
1477 | const int bottom_offset = |
1478 | params.tensor_in_rows * params.tensor_in_cols * params.depth; |
1479 | bool status = functor::MaxPoolBackwardWithArgmax<T>()( |
1480 | output_size, input_size, grad_in.flat<T>().data(), |
1481 | reinterpret_cast<const int64_t*>(argmax.flat<int64_t>().data()), |
1482 | top_offset, bottom_offset, grad_out->flat<T>().data(), |
1483 | context->eigen_gpu_device(), include_batch_in_index); |
1484 | if (!status) { |
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
1487 | } |
1488 | } |
1489 | }; |
1490 | |
1491 | template <typename T> |
1492 | struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> { |
1493 | static void launch(OpKernelContext* context, const PoolParameters& params, |
1494 | const Tensor& grad_in, const Tensor& argmax, |
1495 | Tensor* grad_out, const bool include_batch_in_index) { |
1496 | const int input_size = params.tensor_in_batch * params.tensor_in_rows * |
1497 | params.tensor_in_cols * params.depth; |
1498 | const int output_size = params.tensor_in_batch * params.out_height * |
1499 | params.out_width * params.depth; |
1500 | const int top_offset = |
1501 | params.tensor_in_rows * params.tensor_in_cols * params.depth; |
1502 | const int bottom_offset = |
1503 | params.out_width * params.out_height * params.depth; |
1504 | bool status = functor::MaxPoolGradBackwardWithArgmax<T>()( |
1505 | output_size, input_size, grad_in.flat<T>().data(), |
1506 | reinterpret_cast<const int64_t*>(argmax.flat<int64_t>().data()), |
1507 | top_offset, bottom_offset, grad_out->flat<T>().data(), |
1508 | context->eigen_gpu_device(), include_batch_in_index); |
1509 | if (!status) { |
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
1512 | } |
1513 | } |
1514 | }; |
1515 | |
1516 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1517 | |
1518 | #define REGISTER_MAX_POOL_KERNELS(D, T) \ |
1519 | REGISTER_KERNEL_BUILDER( \ |
1520 | Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \ |
1521 | MaxPoolingGradOp<D##Device, T>); \ |
1522 | REGISTER_KERNEL_BUILDER( \ |
1523 | Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \ |
1524 | MaxPoolingGradGradOp<D##Device, T>); \ |
1525 | REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2") \ |
1526 | .Device(DEVICE_##D) \ |
1527 | .HostMemory("ksize") \ |
1528 | .HostMemory("strides") \ |
1529 | .TypeConstraint<T>("T"), \ |
1530 | MaxPoolingGradOp<D##Device, T>); \ |
1531 | REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2") \ |
1532 | .Device(DEVICE_##D) \ |
1533 | .HostMemory("ksize") \ |
1534 | .HostMemory("strides") \ |
1535 | .TypeConstraint<T>("T"), \ |
                          MaxPoolingGradGradOp<D##Device, T>);            \
1537 | REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") \ |
1538 | .Device(DEVICE_##D) \ |
1539 | .TypeConstraint<int64_t>("Targmax") \ |
1540 | .TypeConstraint<T>("T"), \ |
                          MaxPoolingWithArgmaxOp<D##Device, T, int64_t>);  \
1542 | REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") \ |
1543 | .Device(DEVICE_##D) \ |
1544 | .TypeConstraint<T>("T") \ |
1545 | .TypeConstraint<int64_t>("Targmax"), \ |
1546 | MaxPoolingGradWithArgmaxOp<D##Device, T>); |
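
// A minimal sketch of one expansion of the macro above, e.g.
// REGISTER_MAX_POOL_KERNELS(CPU, float) for its first entry (illustrative,
// not emitted verbatim):
//
//   // REGISTER_KERNEL_BUILDER(
//   //     Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//   //     MaxPoolingGradOp<CPUDevice, float>);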
1547 | |
// The kernels below are implemented only for the CPU device.
1549 | #define REGISTER_CPU_ONLY_POOL_KERNELS(T) \ |
1550 | REGISTER_KERNEL_BUILDER( \ |
1551 | Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ |
1552 | MaxPoolingOp<CPUDevice, T>); \ |
1553 | REGISTER_KERNEL_BUILDER( \ |
1554 | Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ |
1555 | MaxPoolingV2Op<CPUDevice, T>); \ |
1556 | REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") \ |
1557 | .Device(DEVICE_CPU) \ |
1558 | .TypeConstraint<int32>("Targmax") \ |
1559 | .TypeConstraint<T>("T"), \ |
1560 | MaxPoolingWithArgmaxOp<CPUDevice, T, int32>); |
1561 | TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS); |
1562 | #undef REGISTER_CPU_ONLY_POOL_KERNELS |
1563 | |
1564 | #define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T); |
1565 | TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS); |
#undef REGISTER_CPU_MAX_POOL_KERNELS
1567 | |
1568 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1569 | |
1570 | // Forward declarations for the functor specializations for GPU. |
1571 | namespace functor { |
1572 | #define DECLARE_GPU_SPEC(T) \ |
1573 | template <> \ |
1574 | void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \ |
1575 | const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \ |
1576 | typename TTypes<T, 4>::ConstTensor input, int window_rows, \ |
1577 | int window_cols, int row_stride, int col_stride, \ |
1578 | const Eigen::PaddingType& padding); \ |
1579 | extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>; |
1580 | |
1581 | TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); |
1582 | #undef DECLARE_GPU_SPEC |
1583 | } // namespace functor |
1584 | |
1585 | #define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T) |
1586 | TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS); |
1587 | #undef REGISTER_GPU_MAX_POOL_KERNELS |
1588 | |
// The kernels below are currently implemented only for the GPU device.
// Note(jiayq): Currently, the Caffe custom implementation is faster than the
// default Eigen implementation, so we are using the custom kernel as the
// default. However, you can explicitly invoke the Eigen version using
// kernel_label_map.
1594 | #define REGISTER_GPU_ONLY_POOL_KERNELS(T) \ |
1595 | REGISTER_KERNEL_BUILDER(Name("MaxPool") \ |
1596 | .Device(DEVICE_GPU) \ |
1597 | .TypeConstraint<T>("T") \ |
1598 | .Label("eigen_tensor"), \ |
1599 | MaxPoolingOp<GPUDevice, T>); \ |
1600 | REGISTER_KERNEL_BUILDER(Name("MaxPoolV2") \ |
1601 | .Device(DEVICE_GPU) \ |
1602 | .HostMemory("ksize") \ |
1603 | .HostMemory("strides") \ |
1604 | .TypeConstraint<T>("T") \ |
1605 | .Label("eigen_tensor"), \ |
1606 | MaxPoolingV2Op<GPUDevice, T>); \ |
1607 | REGISTER_KERNEL_BUILDER( \ |
1608 | Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ |
1609 | MaxPoolingNoMaskOp<GPUDevice, T>); \ |
1610 | REGISTER_KERNEL_BUILDER(Name("MaxPoolV2") \ |
1611 | .Device(DEVICE_GPU) \ |
1612 | .HostMemory("ksize") \ |
1613 | .HostMemory("strides") \ |
1614 | .TypeConstraint<T>("T"), \ |
1615 | MaxPoolingNoMaskV2Op<GPUDevice, T>); \ |
1616 | REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax") \ |
1617 | .Device(DEVICE_GPU) \ |
1618 | .TypeConstraint<T>("T") \ |
1619 | .TypeConstraint<int64_t>("Targmax"), \ |
1620 | MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>); |
1621 | TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS); |
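
// The .Label("eigen_tensor") registrations above are only chosen when a
// node's "_kernel" attribute carries that label (which is what the
// kernel_label_map mechanism mentioned earlier arranges); the unlabeled
// registrations remain the defaults.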
1622 | |
1623 | // TODO(b/65847473): Re-enable once the underlying build error is fixed. |
1624 | #if !defined(PLATFORM_WINDOWS) |
REGISTER_KERNEL_BUILDER(
    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
    MaxPoolingNoMaskOp<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T"),
                        MaxPoolingV2Op<GPUDevice, qint8>);

REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("ksize")
                            .HostMemory("strides")
                            .TypeConstraint<qint8>("T")
                            .Label("eigen_tensor"),
                        MaxPoolingV2Op<GPUDevice, qint8>);
1643 | #endif // !defined(PLATFORM_WINDOWS) |
1644 | |
1645 | #undef REGISTER_GPU_ONLY_POOL_KERNELS |
1646 | |
1647 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1648 | |
1649 | #undef REGISTER_MAX_POOL_KERNELS |
1650 | |
1651 | } // namespace tensorflow |
1652 | |