/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_

#include <vector>

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/avgpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// A helper class to manage sizes and shapes for pooling operations.
struct PoolParameters {
  // Updates context->status if there is an invalid input.
  // explicit_paddings has eight elements if padding == EXPLICIT, and zero
  // elements otherwise.
  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
                 const std::vector<int32>& stride, Padding padding,
                 std::vector<int64_t> explicit_paddings,
                 TensorFormat data_format, const TensorShape& tensor_in_shape);

  // Returns the shape of the output for "forward" pooling operations.
  TensorShape forward_output_shape();

  int depth;

  int tensor_in_cols;
  int tensor_in_rows;
  int tensor_in_batch;

  int window_rows;
  int window_cols;
  int depth_window;

  int row_stride;
  int col_stride;
  int depth_stride;

  int64_t out_height;
  int64_t out_width;
  int out_depth;

  int64_t pad_top;
  int64_t pad_bottom;
  int64_t pad_left;
  int64_t pad_right;

  int pad_depth;

  TensorFormat data_format;
};
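
// A minimal usage sketch (illustrative, based on how the kernels below use
// PoolParameters; not part of the original comments): a pooling kernel's
// Compute() typically builds PoolParameters from its attributes and the input
// shape, checks the context status, and allocates the output from
// forward_output_shape():
//
//   PoolParameters params{context,  ksize_,  stride_,  padding_,
//                         /*explicit_paddings=*/{}, FORMAT_NHWC,
//                         input.shape()};
//   if (!context->status().ok()) return;
//   Tensor* output = nullptr;
//   OP_REQUIRES_OK(context, context->allocate_output(
//                               0, params.forward_output_shape(), &output));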

// An implementation of MaxPooling (forward).
// TODO(yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op;
// QuantizedMaxPoolingOp depends on MaxPoolingOp, so keep it intact for now.
template <typename Device, typename T>
class MaxPoolingOp : public OpKernel {
 public:
  explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    for (int i = 0; i < ksize_.size(); ++i) {
      OP_REQUIRES(context, ksize_[i] > 0,
                  errors::InvalidArgument("Sliding window ksize for dimension ",
                                          i, " was zero or negative."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
    }
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{
        context,     ksize_,      stride_, padding_, explicit_paddings_,
        FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));
      OP_REQUIRES(
          context, padding_ != EXPLICIT,
          errors::Unimplemented("Depthwise max pooling does not support "
                                "explicit padding."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      // MaxPoolingOp is only called on the GPU when the "eigen_tensor" label
      // is used. In that case, explicit padding is not supported.
      if (std::is_same<Device, GPUDevice>::value &&
          padding_ == Padding::EXPLICIT) {
        context->SetStatus(errors::Unimplemented(
            "MaxPoolingOp does not support explicit padding."));
        return;
      }
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool, which does not handle
  // all of the same options as SpatialMaxPool (it assumes no padding and
  // places strict requirements on the stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
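  //
  // Illustrative example (not from the original comments): with depth 6 and
  // depth_window == depth_stride == 3, the flat NHWC input is viewed as a
  // 3 x (NumElements / 3) matrix, so each output element is the column-wise
  // max, i.e. the max over each consecutive group of 3 channel values.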
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      functor::SpatialMaxPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    } else {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
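              // Worked example (illustrative, not from the original source):
              // with window_rows = 3, row_stride = 2, pad_top = 1, and h = 4,
              // hpad = 5, so h_start = (5 - 3) / 2 + 1 = 2 and
              // h_end = min(5 / 2 + 1, out_height) = 3 (assuming out_height is
              // at least 3). Input row 4 therefore contributes only to output
              // row 2, whose window spans padded rows 4..6 and so contains
              // hpad = 5.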
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64_t> explicit_paddings_;
  TensorFormat data_format_;
};
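
// Illustrative registration sketch (an assumption for this edit; the actual
// registrations live in the corresponding .cc files, e.g. maxpooling_op.cc,
// and CPUDevice stands for the usual Eigen::ThreadPoolDevice typedef):
//
//   REGISTER_KERNEL_BUILDER(
//       Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       MaxPoolingOp<CPUDevice, float>);
//
// LaunchMaxPoolingNoMask_NCHW_VECT_C, declared below, launches a fused
// max-pool for qint8 inputs in NCHW_VECT_C layout, where groups of four int8
// channel values are packed into a single int32 (hence the reinterpret_casts
// in its GPU specialization).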

template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
#if GOOGLE_CUDA
    bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
        params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
        params.depth, params.out_height, params.out_width, params.window_rows,
        params.window_cols, params.row_stride, params.col_stride,
        params.pad_top, params.pad_left,
        reinterpret_cast<int32*>(output->flat<qint8>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(errors::Internal(
          "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
    }
#else
    // ROCm TODO: add support for __vmaxs4 on ROCm.
    context->SetStatus(errors::Internal(
        "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
#endif  // GOOGLE_CUDA
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

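// MaxPool forward kernel ("V2"). Unlike MaxPoolingOp above, when the op has
// more than one input it reads the window size and strides from its second
// and third inputs instead of from attributes, and it also accepts
// NCHW_VECT_C qint8 inputs on GPU.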
template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
 public:
  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context,
          data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C,
          errors::InvalidArgument(
              "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. Got: ",
              data_format));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

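    // This kernel never reads an explicit_paddings attribute, so an empty
    // list is passed to PoolParameters here.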
    PoolParameters params{
        context,
        ksize,
        stride,
        padding_,
        /*explicit_paddings=*/{},
        data_format_,
        tensor_in.shape(),
    };
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool, which does not handle
  // all of the same options as SpatialMaxPool (it assumes no padding and
  // places strict requirements on the stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      if (std::is_same<T, qint8>::value) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<GPUDevice>::launch(
            context, params, tensor_in, output);
      } else {
        functor::SpatialMaxPooling<Device, T>()(
            context->eigen_device<Device>(), output->tensor<T, 4>(),
            tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
            params.row_stride, params.col_stride, pt);
      }
    } else
#endif
    {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

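// Computes average pooling of `input` into `output` on the CPU, sharding the
// work across batches. Each output cell is the mean of the input cells that
// fall inside its (possibly padding-clipped) pooling window.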
template <typename Device, typename T>
void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                    const Tensor& input, const PoolParameters& params,
                    const Padding& padding) {
  if (output->NumElements() == 0) {
    return;
  }
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  auto in_flat = input.flat<T>();
  auto out_flat = output->flat<T>();

  auto shard = [&params, &in_flat, &out_flat](int64_t start, int64_t limit) {
    // Calculate indices for this shard's chunk of work.
    const int64_t input_image_size =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int64_t output_image_size =
        params.out_width * params.out_height * params.depth;
    const int64_t shard_batch_size = limit - start;

    ConstEigenMatrixMap in_mat(
        in_flat.data() + start * input_image_size, params.depth,
        params.tensor_in_cols * params.tensor_in_rows * shard_batch_size);
    EigenMatrixMap out_mat(
        out_flat.data() + start * output_image_size, params.depth,
        params.out_width * params.out_height * shard_batch_size);
    Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
    out_count.setZero();
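    // out_count(i) tracks how many input cells contribute to output column i,
    // so the division at the end averages over the cells that actually fall
    // inside the input rather than over the full window size.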

    // Initializes output to zero.
    out_mat.setZero();

    // The following code basically does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, and updates the corresponding column(s) in
    //    output_as_matrix with the average value.
    for (int b = 0; b < shard_batch_size; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + params.pad_top;
          const int wpad = w + params.pad_left;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min<int>(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min<int>(wpad / params.col_stride + 1, params.out_width);
          const int in_offset =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          Eigen::DSizes<Eigen::DenseIndex, 2> in_indices(0, in_offset);
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_offset =
                  (b * params.out_height + ph) * params.out_width + pw;
              out_mat.col(out_offset) += in_mat.col(in_offset);
              out_count(out_offset) += T(1);
            }
          }
        }
      }
    }

    DCHECK_GT(out_count.minCoeff(), T(0));
    out_mat.array().rowwise() /= out_count.transpose().array();
  };

  const int64_t work_unit_size =
      params.tensor_in_rows * params.tensor_in_cols * params.depth;
  // NOTE: Constants in the calculation below were estimated based on
  // benchmarking. Nanoseconds/work_unit for benchmarks ranged from 0.01 to
  // 0.001, so the factor 0.01 (i.e. 1/100), with a lower bound of 10000, was
  // chosen to keep the work unit cost in an operating range in which it
  // empirically performed best.
  const int64_t work_unit_cost = std::max(int64_t{10000}, work_unit_size / 100);
  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, work_unit_cost, shard);
}
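
// A minimal caller sketch (an illustration only; the real AvgPool kernels live
// in their own .cc files and may differ): build PoolParameters from the op's
// attributes, allocate the output from params.forward_output_shape(), and then
// dispatch, e.g.
//
//   SpatialAvgPool<CPUDevice, float>(context, output, input, params, padding_);
//
// where CPUDevice stands for the usual Eigen::ThreadPoolDevice typedef.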

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_