/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_

#include <vector>

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/avgpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// A helper class to manage sizes and shapes for pooling operations.
struct PoolParameters {
  // Updates context->status if there is an invalid input.
  // explicit_paddings has eight elements if padding == EXPLICIT, and zero
  // elements otherwise.
  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
                 const std::vector<int32>& stride, Padding padding,
                 std::vector<int64_t> explicit_paddings,
                 TensorFormat data_format, const TensorShape& tensor_in_shape);

  // Returns the shape of the output for "forward" pooling operations.
  TensorShape forward_output_shape();

  int depth;

  int tensor_in_cols;
  int tensor_in_rows;
  int tensor_in_batch;

  int window_rows;
  int window_cols;
  int depth_window;

  int row_stride;
  int col_stride;
  int depth_stride;

  int64_t out_height;
  int64_t out_width;
  int out_depth;

  int64_t pad_top;
  int64_t pad_bottom;
  int64_t pad_left;
  int64_t pad_right;

  int pad_depth;

  TensorFormat data_format;
};
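
// A minimal usage sketch (illustrative, based on how the kernels below use
// PoolParameters; not part of the original comments): a pooling kernel's
// Compute() typically builds PoolParameters from its attributes and the input
// shape, checks the context status, and allocates the output from
// forward_output_shape():
//
//   PoolParameters params{context,  ksize_,  stride_,  padding_,
//                         /*explicit_paddings=*/{}, FORMAT_NHWC,
//                         input.shape()};
//   if (!context->status().ok()) return;
//   Tensor* output = nullptr;
//   OP_REQUIRES_OK(context, context->allocate_output(
//                               0, params.forward_output_shape(), &output));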

// An implementation of MaxPooling (forward).
// TODO(yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op;
// QuantizedMaxPoolingOp depends on MaxPoolingOp, so keep it intact for now.
template <typename Device, typename T>
class MaxPoolingOp : public OpKernel {
 public:
  explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    for (int i = 0; i < ksize_.size(); ++i) {
      OP_REQUIRES(context, ksize_[i] > 0,
                  errors::InvalidArgument("Sliding window ksize for dimension ",
                                          i, " was zero or negative."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
    }
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{
        context,     ksize_,      stride_, padding_, explicit_paddings_,
        FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));
      OP_REQUIRES(
          context, padding_ != EXPLICIT,
          errors::Unimplemented("Depthwise max pooling does not support "
                                "explicit padding."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      // MaxPoolingOp is only called on the GPU when the "eigen_tensor" label
      // is used. In that case, explicit padding is not supported.
      if (std::is_same<Device, GPUDevice>::value &&
          padding_ == Padding::EXPLICIT) {
        context->SetStatus(errors::Unimplemented(
            "MaxPoolingOp does not support explicit padding."));
        return;
      }
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool, which does not handle
  // all of the same options as SpatialMaxPool (it assumes no padding and
  // places strict requirements on the stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
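  //
  // Illustrative example (not from the original comments): with depth 6 and
  // depth_window == depth_stride == 3, the flat NHWC input is viewed as a
  // 3 x (NumElements / 3) matrix, so each output element is the column-wise
  // max, i.e. the max over each consecutive group of 3 channel values.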
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      functor::SpatialMaxPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    } else {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
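              // Worked example (illustrative, not from the original source):
              // with window_rows = 3, row_stride = 2, pad_top = 1, and h = 4,
              // hpad = 5, so h_start = (5 - 3) / 2 + 1 = 2 and
              // h_end = min(5 / 2 + 1, out_height) = 3 (assuming out_height is
              // at least 3). Input row 4 therefore contributes only to output
              // row 2, whose window spans padded rows 4..6 and so contains
              // hpad = 5.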
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64_t> explicit_paddings_;
  TensorFormat data_format_;
};
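
// Illustrative registration sketch (an assumption for this edit; the actual
// registrations live in the corresponding .cc files, e.g. maxpooling_op.cc,
// and CPUDevice stands for the usual Eigen::ThreadPoolDevice typedef):
//
//   REGISTER_KERNEL_BUILDER(
//       Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
//       MaxPoolingOp<CPUDevice, float>);
//
// LaunchMaxPoolingNoMask_NCHW_VECT_C, declared below, launches a fused
// max-pool for qint8 inputs in NCHW_VECT_C layout, where groups of four int8
// channel values are packed into a single int32 (hence the reinterpret_casts
// in its GPU specialization).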

template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
#if GOOGLE_CUDA
    bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
        params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
        params.depth, params.out_height, params.out_width, params.window_rows,
        params.window_cols, params.row_stride, params.col_stride,
        params.pad_top, params.pad_left,
        reinterpret_cast<int32*>(output->flat<qint8>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(errors::Internal(
          "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
    }
#else
    // ROCm TODO: add support for __vmaxs4 on ROCm.
    context->SetStatus(errors::Internal(
        "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
#endif  // GOOGLE_CUDA
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

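// MaxPool forward kernel ("V2"). Unlike MaxPoolingOp above, when the op has
// more than one input it reads the window size and strides from its second
// and third inputs instead of from attributes, and it also accepts
// NCHW_VECT_C qint8 inputs on GPU.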
template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
 public:
  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context,
          data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C,
          errors::InvalidArgument(
              "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. Got: ",
              data_format));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

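    // This kernel never reads an explicit_paddings attribute, so an empty
    // list is passed to PoolParameters here.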
    PoolParameters params{
        context,
        ksize,
        stride,
        padding_,
        /*explicit_paddings=*/{},
        data_format_,
        tensor_in.shape(),
    };
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation. A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool, which does not handle
  // all of the same options as SpatialMaxPool (it assumes no padding and
  // places strict requirements on the stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      if (std::is_same<T, qint8>::value) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<GPUDevice>::launch(
            context, params, tensor_in, output);
      } else {
        functor::SpatialMaxPooling<Device, T>()(
            context->eigen_device<Device>(), output->tensor<T, 4>(),
            tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
            params.row_stride, params.col_stride, pt);
      }
    } else
#endif
    {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The following code basically does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix, and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

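// Computes average pooling of `input` into `output` on the CPU, sharding the
// work across batches. Each output cell is the mean of the input cells that
// fall inside its (possibly padding-clipped) pooling window.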
template <typename Device, typename T>
void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                    const Tensor& input, const PoolParameters& params,
                    const Padding& padding) {
  if (output->NumElements() == 0) {
    return;
  }
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  auto in_flat = input.flat<T>();
  auto out_flat = output->flat<T>();

  auto shard = [&params, &in_flat, &out_flat](int64_t start, int64_t limit) {
    // Calculate indices for this shard's chunk of work.
    const int64_t input_image_size =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int64_t output_image_size =
        params.out_width * params.out_height * params.depth;
    const int64_t shard_batch_size = limit - start;

    ConstEigenMatrixMap in_mat(
        in_flat.data() + start * input_image_size, params.depth,
        params.tensor_in_cols * params.tensor_in_rows * shard_batch_size);
    EigenMatrixMap out_mat(
        out_flat.data() + start * output_image_size, params.depth,
        params.out_width * params.out_height * shard_batch_size);
    Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
    out_count.setZero();
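    // out_count(i) tracks how many input cells contribute to output column i,
    // so the division at the end averages over the cells that actually fall
    // inside the input rather than over the full window size.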

    // Initializes output to zero.
    out_mat.setZero();

    // The following code basically does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, and updates the corresponding column(s) in
    //    output_as_matrix with the average value.
    for (int b = 0; b < shard_batch_size; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + params.pad_top;
          const int wpad = w + params.pad_left;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min<int>(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min<int>(wpad / params.col_stride + 1, params.out_width);
          const int in_offset =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          Eigen::DSizes<Eigen::DenseIndex, 2> in_indices(0, in_offset);
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_offset =
                  (b * params.out_height + ph) * params.out_width + pw;
              out_mat.col(out_offset) += in_mat.col(in_offset);
              out_count(out_offset) += T(1);
            }
          }
        }
      }
    }

    DCHECK_GT(out_count.minCoeff(), T(0));
    out_mat.array().rowwise() /= out_count.transpose().array();
  };

  const int64_t work_unit_size =
      params.tensor_in_rows * params.tensor_in_cols * params.depth;
  // NOTE: Constants in the calculation below were estimated based on
  // benchmarking. Nanoseconds/work_unit for benchmarks ranged from 0.01 to
  // 0.001, so the factor 0.01 (i.e. 1/100), with a lower bound of 10000, was
  // chosen to keep the work unit cost in an operating range in which it
  // empirically performed best.
  const int64_t work_unit_cost = std::max(int64_t{10000}, work_unit_size / 100);
  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, work_unit_cost, shard);
}
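
// A minimal caller sketch (an illustration only; the real AvgPool kernels live
// in their own .cc files and may differ): build PoolParameters from the op's
// attributes, allocate the output from params.forward_output_shape(), and then
// dispatch, e.g.
//
//   SpatialAvgPool<CPUDevice, float>(context, output, input, params, padding_);
//
// where CPUDevice stands for the usual Eigen::ThreadPoolDevice typedef.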

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_