/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_

#include <vector>

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/avgpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// A helper class to manage sizes and shapes for pooling operations.
struct PoolParameters {
  // Updates context->status if there is an invalid input.
  // explicit_paddings has eight elements if padding==EXPLICIT, and zero
  // elements otherwise.
  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
                 const std::vector<int32>& stride, Padding padding,
                 std::vector<int64_t> explicit_paddings,
                 TensorFormat data_format, const TensorShape& tensor_in_shape);

  // Returns the shape of the output for "forward" pooling operations.
  TensorShape forward_output_shape();

  int depth;

  int tensor_in_cols;
  int tensor_in_rows;
  int tensor_in_batch;

  int window_rows;
  int window_cols;
  int depth_window;

  int row_stride;
  int col_stride;
  int depth_stride;

  int64_t out_height;
  int64_t out_width;
  int out_depth;

  int64_t pad_top;
  int64_t pad_bottom;
  int64_t pad_left;
  int64_t pad_right;

  int pad_depth;

  TensorFormat data_format;
};
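
// Illustration only (not part of the implementation): for NHWC inputs the
// spatial output dims produced by forward_output_shape() follow the usual
// GetWindowedOutputSize()-style rules, e.g. with VALID padding
//   out_height = (tensor_in_rows - window_rows) / row_stride + 1
// and with SAME padding
//   out_height = (tensor_in_rows + row_stride - 1) / row_stride,
// with out_width defined analogously; pad_top/pad_bottom/pad_left/pad_right
// hold the resulting padding amounts.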

// An implementation of MaxPooling (forward).
// TODO(yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op;
// QuantizedMaxPoolingOp depends on MaxPoolingOp, so keep it intact for now.
template <typename Device, typename T>
class MaxPoolingOp : public OpKernel {
 public:
  explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    for (int i = 0; i < ksize_.size(); ++i) {
      OP_REQUIRES(context, ksize_[i] > 0,
                  errors::InvalidArgument("Sliding window ksize for dimension ",
                                          i, " was zero or negative."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
    }
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{
        context, ksize_, stride_, padding_, explicit_paddings_,
        FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));
      OP_REQUIRES(
          context, padding_ != EXPLICIT,
          errors::Unimplemented("Depthwise max pooling does not support "
                                "explicit padding."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      // MaxPoolingOp is only called on the GPU when the eigen_tensor label
      // is used. In this case, explicit padding is not supported.
      if (std::is_same<Device, GPUDevice>::value &&
          padding_ == Padding::EXPLICIT) {
        context->SetStatus(errors::Unimplemented(
            "MaxPoolingOp does not support explicit padding."));
        return;
      }
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
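    // How this works (illustrative): because depth is the innermost (fastest
    // varying) dimension in NHWC, viewing the flat buffer as a
    // depth_window x (NumElements / depth_window) matrix puts each pooling
    // group of depth_window consecutive channel values into its own column,
    // so a column-wise max reduces every group to one output value. This
    // relies on depth_window == depth_stride, which is checked in Compute().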
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      functor::SpatialMaxPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    } else {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix,
      //    and updates the corresponding column(s) in output_as_matrix with
      //    the max value.
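      //
      // Worked example (illustrative only): with in_rows = in_cols = 4, a
      // 2x2 window, stride 2 and no padding, out_height = out_width = 2.
      // Input pixel (h, w) = (3, 3) then yields
      //   h_start = (3 - 2) / 2 + 1 = 1, h_end = min(3 / 2 + 1, 2) = 2,
      // and likewise for w, so that pixel only contributes to output cell
      // (1, 1), i.e. the bottom-right 2x2 window.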
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the
              // input vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64_t> explicit_paddings_;
  TensorFormat data_format_;
};
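
// A kernel registration for the op above typically lives in the corresponding
// .cc file; as an illustrative sketch only (device/type names are the usual
// ones and are not defined in this header):
//
//   REGISTER_KERNEL_BUILDER(
//       Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       MaxPoolingOp<Eigen::ThreadPoolDevice, float>);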

template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
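    // NCHW_VECT_C stores the channel dimension vectorized in groups of 4
    // qint8 values, which is why the buffers are reinterpreted as int32 words
    // (4 packed int8 channels per load/store) below.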
#if GOOGLE_CUDA
    bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
        params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
        params.depth, params.out_height, params.out_width, params.window_rows,
        params.window_cols, params.row_stride, params.col_stride,
        params.pad_top, params.pad_left,
        reinterpret_cast<int32*>(output->flat<qint8>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(errors::Internal(
          "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
    }
#else
    // ROCm TODO: add support for __vmaxs4 on ROCm.
    context->SetStatus(errors::Internal(
        "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
#endif  // GOOGLE_CUDA
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
 public:
  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context,
          data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C,
          errors::InvalidArgument(
              "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. Got: ",
              data_format));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

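    // MaxPoolV2 supplies the window and strides as runtime input tensors
    // (inputs 1 and 2) rather than attrs; when they are present, read the
    // four int32 values from each and override the attr-derived defaults.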
    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{
        context,
        ksize,
        stride,
        padding_,
        /*explicit_paddings=*/{},
        data_format_,
        tensor_in.shape(),
    };
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      if (std::is_same<T, qint8>::value) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<GPUDevice>::launch(
            context, params, tensor_in, output);
      } else {
        functor::SpatialMaxPooling<Device, T>()(
            context->eigen_device<Device>(), output->tensor<T, 4>(),
            tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
            params.row_stride, params.col_stride, pt);
      }
    } else
#endif
    {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix,
      //    and updates the corresponding column(s) in output_as_matrix with
      //    the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the
              // input vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                    const Tensor& input, const PoolParameters& params,
                    const Padding& padding) {
  if (output->NumElements() == 0) {
    return;
  }
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  auto in_flat = input.flat<T>();
  auto out_flat = output->flat<T>();

  auto shard = [&params, &in_flat, &out_flat](int64_t start, int64_t limit) {
    // Calculate indices for this shard's chunk of work.
    const int64_t input_image_size =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int64_t output_image_size =
        params.out_width * params.out_height * params.depth;
    const int64_t shard_batch_size = limit - start;

    ConstEigenMatrixMap in_mat(
        in_flat.data() + start * input_image_size, params.depth,
        params.tensor_in_cols * params.tensor_in_rows * shard_batch_size);
    EigenMatrixMap out_mat(
        out_flat.data() + start * output_image_size, params.depth,
        params.out_width * params.out_height * shard_batch_size);
    Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
    out_count.setZero();

    // Initializes output to zero.
    out_mat.setZero();

    // The following code basically does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix,
    //    and updates the corresponding column(s) in output_as_matrix with
    //    the average value.
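    //
    // With SAME padding, output cells near the borders cover fewer valid
    // input pixels than interior cells, so out_count records how many input
    // columns were accumulated into each output column and the final division
    // below turns the sums into proper averages.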
    for (int b = 0; b < shard_batch_size; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + params.pad_top;
          const int wpad = w + params.pad_left;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min<int>(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min<int>(wpad / params.col_stride + 1, params.out_width);
          const int in_offset =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          Eigen::DSizes<Eigen::DenseIndex, 2> in_indices(0, in_offset);
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_offset =
                  (b * params.out_height + ph) * params.out_width + pw;
              out_mat.col(out_offset) += in_mat.col(in_offset);
              out_count(out_offset) += T(1);
            }
          }
        }
      }
    }

    DCHECK_GT(out_count.minCoeff(), T(0));
    out_mat.array().rowwise() /= out_count.transpose().array();
  };

  const int64_t work_unit_size =
      params.tensor_in_rows * params.tensor_in_cols * params.depth;
  // NOTE: Constants in calculation below were estimated based on benchmarking.
  // Nanoseconds/work_unit for benchmarks ranged from 0.01 to 0.001, so the
  // factor 0.01 (i.e. 1/100), with a lower bound of 10000, was chosen to keep
  // the work unit cost in an operating range in which it empirically
  // performed best.
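  // For example (illustrative): a 32 x 32 x 64 image gives a work_unit_size
  // of 65,536, so work_unit_cost = max(10000, 655) = 10000, while a
  // 224 x 224 x 64 image gives 3,211,264 and a cost of 32,112.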
  const int64_t work_unit_cost =
      std::max(int64_t{10000}, work_unit_size / 100);
  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, work_unit_cost, shard);
}
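
// Typical call site (sketch only, assuming a CPU AvgPool kernel with members
// named as in the ops above; CPUDevice is not defined in this header):
//
//   SpatialAvgPool<CPUDevice, float>(context, output, tensor_in, params,
//                                    padding_);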

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_