/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Implements convolution operations with image transformations (resize and
// mirror padding) baked into the processing, to optimize latency and memory
// usage.

#define EIGEN_USE_THREADS

#include <string>
#include <vector>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/conv_ops.h"
#include "tensorflow/core/kernels/gemm_functors.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/util/image_resizer_state.h"
#include "tensorflow/core/util/mirror_pad_mode.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {
namespace {

// We don't want to allocate a buffer to hold all the patches if the size is
// going to be extremely large, so break it into chunks if it's bigger than
// a limit. Each chunk will be processed serially, so we can refill the
// buffer for the next chunk and reuse it, keeping maximum memory size down.
// In this case, we've picked 16 megabytes as a reasonable limit for Android and
// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
const size_t kMaxChunkSize = (1 * 1024 * 1024);
#else
const size_t kMaxChunkSize = (16 * 1024 * 1024);
#endif
const size_t kResizeCacheSize = (8 * 1024 * 1024);

// Lookup method used when resizing.
enum SamplingMode {
  BILINEAR = 0,
  NEAREST = 1,
};

// Simple utility function used by FusedConv to multithread basic workloads. To
// use it, pass begin and end values for the full workload and a std::function
// that receives a subset of that through the begin and end values for each
// worker's task. The division of the full workload into worker tasks is handled
// by the multithreading logic. Here's an example of how to use it:
// std::vector<float> my_vector(100);
// ...
// FusedConvParallelFor(context, 0, 100,
//   [&my_vector](int64 task_begin, int64 task_end) {
//     for (int64 current = task_begin; current != task_end; ++current) {
//       my_vector[current] *= 10.0f;
//     }
// });
void FusedConvParallelFor(
    OpKernelContext* context, int64_t begin, int64_t end,
    const std::function<void(int64_t, int64_t)>& task_function) {
// On iOS, the thread management imposes a very big performance penalty, so
// just call the function directly with no multithreading.
#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
  task_function(begin, end);
#else
  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
  thread::ThreadPool* thread_pool = worker_threads.workers;
  const int64_t total_elements = end - begin;
  // This is a bit of an arbitrary number, but was found to work well for
  // typical models we've been profiling on various devices.
  const int64_t element_cost = 10000000;
  thread_pool->ParallelFor(
      total_elements, element_cost,
      [begin, task_function](int64_t begin_offset, int64_t end_offset) {
        const int64_t task_begin = begin + begin_offset;
        const int64_t task_end = begin + end_offset;
        task_function(task_begin, task_end);
      });
#endif
}

// Holds the state needed for the resizing subtasks.
template <class T1>
struct ResizeTaskParameters {
  ResizeTaskParameters() : st(false, false) {}

  int cache_height;
  T1* resize_cache;
  int cache_line_width;
  int input_width;
  int input_depth;
  int top_padding;
  int pad_offset;
  int64_t resized_height;
  ImageResizerState st;
  const T1* input_batch_start;
  int64_t cache_start_x;
  int64_t cache_end_x;
  int left_padding;
  int64_t resized_width;
  int64_t padded_width;
  int64_t padded_height;
};

template <class T1>
struct PerCacheLineParameters {
  PerCacheLineParameters() {}
  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
      : cache_line_start(other.cache_line_start),
        input_top_row_start(other.input_top_row_start),
        input_bottom_row_start(other.input_bottom_row_start),
        y_lerp(other.y_lerp) {}

  T1* cache_line_start;
  const T1* input_top_row_start;
  const T1* input_bottom_row_start;
  T1 y_lerp;
};

// Helper class to simplify bilinear filtering
template <class T1>
struct SampleRect {
  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
                                 const T1* in_bottom_left,
                                 const T1* in_bottom_right)
      : top_left(in_top_left),
        top_right(in_top_right),
        bottom_left(in_bottom_left),
        bottom_right(in_bottom_right) {}

  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
                                        T1 y_lerp) const {
    const T1 top =
        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
    const T1 bottom = bottom_left[channel] +
                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
    return top + (bottom - top) * y_lerp;
  }

  const T1* top_left;
  const T1* top_right;
  const T1* bottom_left;
  const T1* bottom_right;
};

// Calculates parameters which remain constant through a resize cache row.
template <class T1>
EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
    int64_t cache_height, int64_t cache_y, T1* resize_cache,
    int64_t cache_line_width, int64_t input_width, int64_t input_depth,
    int64_t top_padding, int64_t pad_offset, int64_t resized_height,
    const ImageResizerState& st, const T1* input_batch_start) {
  PerCacheLineParameters<T1> result;
  // The cache is organized so that the real y values of the resized image map
  // onto the actual cache values through a modulo scheme. This means that as we
  // progress downwards through the image, we keep reusing a small cache and so
  // keep memory usage down.
  int64_t cache_index_y;
  if (cache_y < 0) {
    cache_index_y = cache_height + (cache_y % cache_height);
  } else {
    cache_index_y = cache_y % cache_height;
  }
  result.cache_line_start =
      resize_cache + (cache_index_y * cache_line_width * input_depth);
  // This part is implementing the mirror padding that happens before resizing.
  float in_y = (cache_y - top_padding);
  if (in_y < 0) {
    in_y = -(in_y + 1.0f - pad_offset);
  } else if (in_y >= resized_height) {
    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
  }
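  // As a worked example of the reflection above: with top_padding = 2 and
  // REFLECT mode (pad_offset == 1), cache_y values 0 and 1 give in_y = -2 and
  // -1, which map back to source rows 2 and 1, so the edge row is not
  // repeated. In SYMMETRIC mode (pad_offset == 0) the same cache_y values map
  // to rows 1 and 0, so the edge row is included in the reflection.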
  // Here's where to do the actual resize.
  in_y *= st.height_scale;
  const int64_t top_y_index = static_cast<int64_t>(std::floor(in_y));
  const int64_t bottom_y_index =
      std::min(static_cast<int64_t>(std::ceil(in_y)), (st.in_height - 1));
  // Lerp is used for bilinear filtering when that's needed.
  result.y_lerp = static_cast<T1>(in_y - top_y_index);
  // Which rows of the original input image to pull the values from.
  result.input_top_row_start =
      input_batch_start + (top_y_index * input_width * input_depth);
  result.input_bottom_row_start =
      input_batch_start + (bottom_y_index * input_width * input_depth);
  return result;
}

template <class T1>
struct PerCachePixelParameters {
  PerCachePixelParameters() {}
  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
      : cache_line_pixel(other.cache_line_pixel),
        left_x_index(other.left_x_index),
        right_x_index(other.right_x_index),
        x_lerp(other.x_lerp) {}

  T1* cache_line_pixel;
  int64_t left_x_index;
  int64_t right_x_index;
  T1 x_lerp;
};

// Pulls out common parameters used for every resized pixel.
template <class T1>
EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
CalculatePerCachePixelParameters(int64_t cache_x, int64_t cache_start_x,
                                 T1* cache_line_start, int64_t input_depth,
                                 int64_t left_padding, int64_t pad_offset,
                                 int64_t resized_width,
                                 const ImageResizerState& st) {
  PerCachePixelParameters<T1> result;
  // Figure out where we're going to store the results of our transform.
  const int cache_index_x = cache_x - cache_start_x;
  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
  // Implement mirror padding by flipping in_x if it's off the edge.
  float in_x = (cache_x - left_padding);
  if (in_x < 0) {
    in_x = -(in_x + 1.0f - pad_offset);
  } else if (in_x >= resized_width) {
    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
  }
  // Resize the x parameters.
  in_x *= st.width_scale;
  // Get the x coordinates for the left and right pixels to pull from.
  result.left_x_index = static_cast<int64_t>(std::floor(in_x));
  result.right_x_index =
      std::min(static_cast<int64_t>(std::ceil(in_x)), (st.in_width - 1));
  // This x_lerp is used to blend pixels in bilinear filtering.
  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
  return result;
}

// Combines bilinear resizing and mirror padding into the im2col transformation
// stage of convolution.
template <class T1, class T2, class T3, class TGemmFunctor,
          SamplingMode SampleMode>
class FusedResizeAndPadConvFunctor {
 public:
  void operator()(OpKernelContext* context, const Tensor& input,
                  int input_batches, int resized_height, int resized_width,
                  int padded_height, int padded_width, int input_depth,
                  const T2* filter_data, int filter_height, int filter_width,
                  int filter_count, int stride_rows, int stride_cols,
                  Padding padding, T3* output_data, int output_height,
                  int output_width, const ImageResizerState& st,
                  int top_padding, int bottom_padding, int left_padding,
                  int right_padding, int pad_offset) {
    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
        (input_depth <= 0)) {
      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
                   << input_batches << ", " << padded_height << ", "
                   << padded_width << ", " << input_depth;
      return;
    }
    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
                   << filter_width << ", " << filter_height << ", "
                   << filter_count;
      return;
    }
    if ((output_width <= 0) || (output_height <= 0)) {
      LOG(WARNING) << "Conv2D was called with bad output width or height: "
                   << output_width << ", " << output_height;
      return;
    }
    OP_REQUIRES(
        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
        errors::InvalidArgument("Bad sample mode passed in", SampleMode));

    // These calculations define how the patches will be positioned within the
    // input image. The actual definitions are quite complex, and rely on the
    // previously-calculated output size.
    int filter_left_offset;
    int filter_top_offset;
    if (padding == VALID) {
      filter_left_offset =
          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
          2;
      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
                           padded_height + 1) /
                          2;
    } else {
      filter_left_offset =
          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
      filter_top_offset =
          ((output_height - 1) * stride_rows + filter_height - padded_height) /
          2;
    }
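    // For instance, with stride 1 and SAME padding, output_width equals
    // padded_width, so filter_left_offset reduces to (filter_width - 1) / 2
    // (1 for a 3x3 filter). With VALID padding and stride 1 it works out to
    // zero, since the first patch starts flush with the top-left corner.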

    ResizeTaskParameters<T1> task_params;
    task_params.input_depth = input_depth;
    task_params.top_padding = top_padding;
    task_params.pad_offset = pad_offset;
    task_params.resized_height = resized_height;
    task_params.st = st;
    task_params.left_padding = left_padding;
    task_params.resized_width = resized_width;
    task_params.padded_width = padded_width;
    task_params.padded_height = padded_height;

    // The im2col buffer has # of patches rows, and # of filters cols.
    // It's laid out like this, in row major order in memory:
    //        < filter value count >
    //   ^   +---------------------+
    // patch |                     |
    // count |                     |
    //   v   +---------------------+
    // Each patch row contains a filter_width x filter_height patch of the
    // input, with the depth channel as the most contiguous in memory, followed
    // by the width, then the height. This is the standard memory order in the
    // image world if it helps to visualize it.
    const int filter_value_count = filter_width * filter_height * input_depth;

    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
                errors::InvalidArgument("Im2Col patch too large for buffer"));
    const size_t patches_per_chunk =
        kMaxChunkSize / (filter_value_count * sizeof(T1));
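    // As a rough example of the chunking: for float data and a 3x3 filter over
    // 256 input channels, filter_value_count is 2304 and each patch takes
    // 9216 bytes, so the 16MB chunk limit used on non-iOS builds holds about
    // 1820 patches before the buffer has to be flushed through the GEMM below.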
    // Because memory allocation is very expensive on mobile platforms, try to
    // allocate a persistent buffer that will be kept around between calls. We
    // use TensorFlow's resource management to ensure that the memory will be
    // released when the session is over.
    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
          return OkStatus();
        };
    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
                                "Conv2d", "im2col_buffer",
                                &im2col_buffer_resource, creator));

    // Create a resize cache memory buffer that will hold the rows of
    // transformed and mirror padded input pixels, ready to be copied
    // into filter patches by im2col.
    // It's laid out like this, in row major order in memory:
    //         < cache line width >
    //   ^    +--------------------+
    // cache  |                    |
    // height |                    |
    //   v    +--------------------+
    // Each cache row contains a cache_line_width number of resized pixels,
    // each with input_depth channels. The cache height is typically less than
    // the full height the resized image would be, so it's filled up
    // incrementally as we progress downwards through the input creating im2col
    // patches.
    task_params.cache_start_x = -filter_left_offset;
    task_params.cache_end_x =
        (((output_width - 1) * stride_cols) - filter_left_offset) +
        filter_width;
    task_params.cache_line_width =
        task_params.cache_end_x - task_params.cache_start_x;
    task_params.cache_height =
        kResizeCacheSize / (task_params.cache_line_width * input_depth);
    const int needed_resize_cache_count =
        filter_height * task_params.cache_line_width * input_depth;
    OP_REQUIRES(context,
                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
                errors::InvalidArgument("Input too large for resize cache"));
    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
        resize_creator =
            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
              return OkStatus();
            };
    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
                                "Conv2d", "resize_cache",
                                &resize_cache_resource, resize_creator));

    // This means that multiple ops can't be run simultaneously on different
    // threads, because we have a single shared resource. The platforms this is
    // aimed at have intra-op parallelism as their focus though, so it shouldn't
    // be an issue.
    mutex_lock lock_buffer(im2col_buffer_resource->mu);
    core::ScopedUnref unref_buffer(im2col_buffer_resource);
    T1* im2col_buffer = im2col_buffer_resource->data;

    // This buffer is used as a fairly heavy-weight cache for the resized and
    // mirrored inputs to the im2col operation. The problem is that we want to
    // keep the memory usage down by not rendering the fully resized and padded
    // input tensor to the convolution into an entire buffer. The first approach
    // to avoid this was to fold the bilinear filtering and padding spatial
    // transformations into the im2col lookup itself. This successfully reduced
    // memory usage, but because im2col can access an individual pixel for many
    // different patches, the extra overhead of doing the same bilinear lookups
    // repeatedly became too expensive.
    // The resize cache is designed to avoid this problem by keeping a
    // horizontal slice of the resized and padded input to the im2col
    // precalculated, so that repeated accesses to the same pixel from different
    // filter patches can just be copied from this cache. It's organized as a
    // horizontal slice stretching across the whole virtual image, and as high
    // as the filter window, so that as the patch processing moves across all
    // the pixels are present, and before a new row of patches is started any
    // previously calculated rows that are needed are maintained, with new rows
    // calculated as required.
    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
    core::ScopedUnref unref_resized_cache(resize_cache_resource);
    task_params.resize_cache = resize_cache_resource->data;

    const T1* input_data = input.flat<T1>().data();
    const int64_t input_height = input.shape().dim_sizes()[1];
    task_params.input_width = input.shape().dim_sizes()[2];

    int end_cached_lines = std::numeric_limits<int>::min();

    for (int batch = 0; batch < input_batches; ++batch) {
      task_params.input_batch_start =
          input_data +
          (batch * input_height * task_params.input_width * input_depth);
      const int in_y_end =
          ((output_height * stride_rows) - filter_top_offset) + filter_height;
      for (int out_y = 0; out_y < output_height; ++out_y) {
        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
        const int cache_end_y = std::min(
            in_y_end, std::max((in_y_origin + task_params.cache_height),
                               end_cached_lines));
        if (end_cached_lines < (in_y_origin + filter_height)) {
          // This call breaks up the work required for calculating the mirror
          // padding and resizing across multiple threads.
          FusedConvParallelFor(
              context, cache_start_y, cache_end_y,
              [task_params](int64_t task_cache_start_y,
                            int64_t task_cache_end_y) {
                // This is a long and confusing function, but it's been laid out
                // this way to help with performance on some intensive models.
                // What it's doing is populating a cache of the original input
                // image, after it's been bilinear resized and had its edges
                // mirrored. This allows the following im2col code to access the
                // transformed pixels from this cache, without having to
                // repeatedly apply the expensive bilinear calculations as the
                // same pixels are accessed by different patches.
                // This is most effective when the stride is small and the
                // filter size is large, since that's when pixels are reused
                // most frequently as patches overlap.
                for (int cache_y = task_cache_start_y;
                     cache_y < task_cache_end_y; ++cache_y) {
                  // We organize the cache as a series of rows, each containing
                  // all the transformed pixels for a given line in the image.
                  // This cache is big enough to hold at least a filter's height
                  // worth of rows, but typically more, limited by the size of
                  // the cache buffer.
                  // We don't allocate an entire image's worth of rows though,
                  // because we're trying to keep memory usage down, so as we
                  // progress downwards through the im2col we periodically
                  // refresh the cache so that the next lines that are needed
                  // for that operation are always present.
                  // Work out the parameters that remain constant across the
                  // row we're calculating.
                  PerCacheLineParameters<T1> line_params(
                      CalculatePerCacheLineParameters<T1>(
                          task_params.cache_height, cache_y,
                          task_params.resize_cache,
                          task_params.cache_line_width, task_params.input_width,
                          task_params.input_depth, task_params.top_padding,
                          task_params.pad_offset, task_params.resized_height,
                          task_params.st, task_params.input_batch_start));
                  // Iterate through the resize cache row we're filling in.
                  for (int cache_x = task_params.cache_start_x;
                       cache_x < task_params.cache_end_x; ++cache_x) {
                    // Figure out what we need for the cache pixel we're
                    // populating.
                    PerCachePixelParameters<T1> pixel_params(
                        CalculatePerCachePixelParameters<T1>(
                            cache_x, task_params.cache_start_x,
                            line_params.cache_line_start,
                            task_params.input_depth, task_params.left_padding,
                            task_params.pad_offset, task_params.resized_width,
                            task_params.st));
                    // If the access is off the left, right, top, or bottom of
                    // the resized image, the conv padding means we should set
                    // it to zero.
                    if ((cache_x < 0) ||
                        (cache_x >= task_params.padded_width) ||
                        (cache_y < 0) ||
                        (cache_y >= task_params.padded_height)) {
                      std::fill_n(pixel_params.cache_line_pixel,
                                  task_params.input_depth, T1(0));
                    } else {
                      // There are two different sampling strategies for
                      // resizing. When using nearest, we can just do a
                      // straight copy of the pixel closest to our sample point,
                      // but bilinear requires a more complex calculation.
                      if (SampleMode == NEAREST) {
                        const T1* input_top_left_pixel =
                            line_params.input_top_row_start +
                            (pixel_params.left_x_index *
                             task_params.input_depth);

                        std::copy_n(input_top_left_pixel,
                                    task_params.input_depth,
                                    pixel_params.cache_line_pixel);
                      } else {
                        const SampleRect<T1> rect(
                            line_params.input_top_row_start +
                                (pixel_params.left_x_index *
                                 task_params.input_depth),
                            line_params.input_top_row_start +
                                (pixel_params.right_x_index *
                                 task_params.input_depth),
                            line_params.input_bottom_row_start +
                                (pixel_params.left_x_index *
                                 task_params.input_depth),
                            line_params.input_bottom_row_start +
                                (pixel_params.right_x_index *
                                 task_params.input_depth));
                        for (int in_channel = 0;
                             in_channel < task_params.input_depth;
                             ++in_channel) {
                          pixel_params.cache_line_pixel[in_channel] =
                              rect.BilinearSample(in_channel,
                                                  pixel_params.x_lerp,
                                                  line_params.y_lerp);
                        }
                      }
                    }
                  }
                }
              });
          end_cached_lines = cache_end_y;
        }
        for (int out_x = 0; out_x < output_width; ++out_x) {
          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
          const int patch_index = (batch * output_width * output_height) +
                                  (out_y * output_width) + out_x;
          const int patch_index_within_chunk = patch_index % patches_per_chunk;
          T1* im2col_patch_start =
              im2col_buffer + (patch_index_within_chunk * filter_value_count);
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            T1* im2col_row_start =
                im2col_patch_start +
                (filter_y * filter_width * task_params.input_depth);
            const int conv_in_y = in_y_origin + filter_y;
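            // Map the virtual image row back onto the rotating cache rows,
            // using the same modulo scheme that CalculatePerCacheLineParameters
            // used when the cache was filled.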
            int cache_index_y;
            if (conv_in_y < 0) {
              cache_index_y = task_params.cache_height +
                              (conv_in_y % task_params.cache_height);
            } else {
              cache_index_y = conv_in_y % task_params.cache_height;
            }
            T1* cache_line_start =
                task_params.resize_cache +
                (cache_index_y * task_params.cache_line_width *
                 task_params.input_depth);
            T1* cache_filter_row_start =
                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
                                    task_params.input_depth);
            std::copy_n(cache_filter_row_start,
                        (filter_width * task_params.input_depth),
                        im2col_row_start);
          }
          const bool is_last_in_chunk =
              (patch_index_within_chunk == (patches_per_chunk - 1));
          const bool is_last_overall =
              ((batch == (input_batches - 1)) &&
               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
          if (is_last_in_chunk || is_last_overall) {
            // Now we've assembled a set of image patches into a matrix, apply
            // a GEMM matrix multiply of the patches as rows, times the filter
            // weights in columns, to get partial results in the output
            // matrix.
            const int how_many_patches = patch_index_within_chunk + 1;
            const int m = how_many_patches;
            const int n = filter_count;
            const int k = filter_value_count;
            const int lda = filter_value_count;
            const int ldb = filter_count;
            const int ldc = filter_count;
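            // Here A is the row-major im2col buffer, viewed as a matrix of
            // shape [how_many_patches, filter_value_count], B is the filter
            // data viewed as [filter_value_count, filter_count], and the
            // result C is written straight into the output buffer as
            // [how_many_patches, filter_count].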
            const size_t start_patch_index =
                patch_index - (how_many_patches - 1);
            T3* chunk_output_data =
                output_data + (start_patch_index * filter_count);
            TGemmFunctor gemm_functor;
            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
                         chunk_output_data, ldc);
          }
        }
      }
    }
  }
};

}  // namespace

// Implements a version of convolution with bilinear resizing and mirror padding
// included.
template <class T, class TConvFunctor, bool DoResize>
class FusedResizeConv2DUsingGemmOp : public OpKernel {
 public:
  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
      : OpKernel(context) {
    if (DoResize) {
      OP_REQUIRES_OK(context,
                     context->GetAttr("resize_align_corners", &align_corners_));
    }
    MirrorPadMode mode;
    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));

    switch (mode) {
      case MirrorPadMode::SYMMETRIC: {
        offset_ = 0;
        break;
      }
      case MirrorPadMode::REFLECT: {
        offset_ = 1;
        break;
      }
      default:
        OP_REQUIRES(context, false,
                    errors::InvalidArgument(
                        "mode must be either REFLECT or SYMMETRIC."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    const int64_t stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
    const int64_t stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
    OP_REQUIRES(
        context, stride_n == 1 && stride_c == 1,
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);
    OP_REQUIRES(context, (input.shape().num_elements() > 0),
                errors::InvalidArgument("Input tensor can't be empty"));

    ImageResizerState st(false, false);
    if (DoResize) {
      st = ImageResizerState(align_corners_, false);
      st.ValidateAndCalculateOutputSize(context);
      if (!context->status().ok()) return;
    } else {
      // Set up the resize parameters to do no scaling at all.
      st.batch_size = input.dim_size(0);
      st.out_height = input.dim_size(1);
      st.out_width = input.dim_size(2);
      st.in_height = input.dim_size(1);
      st.in_width = input.dim_size(2);
      st.channels = input.dim_size(3);
      st.height_scale = 1.0f;
      st.width_scale = 1.0f;
    }
    TensorShape resized_shape(
        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
    int paddings_index;
    int filter_index;
    if (DoResize) {
      paddings_index = 2;
      filter_index = 3;
    } else {
      paddings_index = 1;
      filter_index = 2;
    }
    const Tensor& paddings = context->input(paddings_index);

    const int dims = resized_shape.dims();
    OP_REQUIRES(
        context,
        TensorShapeUtils::IsMatrix(paddings.shape()) &&
            paddings.dim_size(1) == 2,
        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                paddings.shape().DebugString()));
    OP_REQUIRES(
        context, dims == paddings.dim_size(0),
        errors::InvalidArgument(
            "The first dimension of paddings must be the rank of inputs: ",
            dims, " ", paddings.shape().DebugString(), " ",
            resized_shape.DebugString()));

    OP_REQUIRES(
        context, dims == 4,
        errors::InvalidArgument(
            "Fused mirror padding only supports four-dimensional inputs, but ",
            dims, " requested"));

    // Compute the shape of the output tensor, and allocate it.
    TensorShape padded_shape;
    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
    for (int d = 0; d < dims; ++d) {
      const int32_t before =
          paddings_matrix(d, 0);  // Pad before existing elements.
      const int32_t after =
          paddings_matrix(d, 1);  // Pad after existing elements.
      OP_REQUIRES(context, before >= 0 && after >= 0,
                  errors::InvalidArgument(
                      "paddings must be non-negative: ", before, " ", after));
      if (offset_ == 0) {  // SYMMETRIC mode.
        OP_REQUIRES(
            context,
            before <= resized_shape.dim_size(d) &&
                after <= resized_shape.dim_size(d),
            errors::InvalidArgument("paddings must be no greater "
                                    "than the dimension size: ",
                                    before, ", ", after, " greater than ",
                                    resized_shape.dim_size(d)));
      } else if (offset_ == 1) {  // REFLECT mode.
        OP_REQUIRES(
            context,
            before < resized_shape.dim_size(d) &&
                after < resized_shape.dim_size(d),
            errors::InvalidArgument("paddings must be less than"
                                    " the dimension size: ",
                                    before, ", ", after, " not less than ",
                                    resized_shape.dim_size(d)));
      }
      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
    }

    OP_REQUIRES(
        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
        errors::InvalidArgument(
            "Fused mirror padding only supports spatial padding, not batches: ",
            paddings.DebugString()));
    OP_REQUIRES(
        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
        errors::InvalidArgument(
            "Fused mirror padding only supports spatial padding, not channels: ",
            paddings.DebugString()));
    const int32_t top_padding = paddings_matrix(1, 0);
    const int32_t bottom_padding = paddings_matrix(1, 1);
    const int32_t left_padding = paddings_matrix(2, 0);
    const int32_t right_padding = paddings_matrix(2, 1);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth]
    const Tensor& filter = context->input(filter_index);

    // For 2D convolution, there should be 4 dimensions.
    OP_REQUIRES(context, padded_shape.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional",
                                        padded_shape.DebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().DebugString()));

    // We only check the first three dims, since the depth is accessed as an
    // int64 below.
    for (int i = 0; i < 3; i++) {
      OP_REQUIRES(
          context,
          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
          errors::InvalidArgument("filter too large"));
    }

    // The last dimension for input is in_depth. It must be the same as the
    // filter's in_depth.
    const int64_t in_depth = padded_shape.dim_size(3);
    OP_REQUIRES(context, in_depth == filter.dim_size(2),
                errors::InvalidArgument(
                    "input and filter must have the same depth: ", in_depth,
                    " vs ", filter.dim_size(2)));

    // The last dimension for filter is out_depth.
    const int out_depth = static_cast<int>(filter.dim_size(3));

    // The second dimension for input is rows/height.
    // The first dimension for filter is rows/height.
    const int64_t padded_rows_raw = padded_shape.dim_size(1);
    OP_REQUIRES(
        context,
        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
        errors::InvalidArgument("Input rows too large"));
    const int padded_rows = static_cast<int>(padded_rows_raw);
    const int filter_rows = static_cast<int>(filter.dim_size(0));
    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));

    // The third dimension for input is columns/width.
    // The second dimension for filter is columns/width.
    const int64_t padded_cols_raw = padded_shape.dim_size(2);
    OP_REQUIRES(
        context,
        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
        errors::InvalidArgument("Input cols too large"));
    const int padded_cols = static_cast<int>(padded_cols_raw);
    const int filter_cols = static_cast<int>(filter.dim_size(1));
    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));

    // The first dimension for input is batch.
    const int64_t batch_raw = padded_shape.dim_size(0);
    OP_REQUIRES(context,
                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
                errors::InvalidArgument("batch is too large"));
    const int batch = static_cast<int>(batch_raw);

    // For now we take the stride from the second and third dimensions only (we
    // do not support striding on the batch or depth dimension).
    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');

    int64_t out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
                                         padding_, &out_rows, &pad_rows));
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
                                         padding_, &out_cols, &pad_cols));
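    // GetWindowedOutputSize applies the standard convolution arithmetic: for
    // SAME padding out_rows is ceil(padded_rows / stride_rows), and for VALID
    // it is ceil((padded_rows - filter_rows + 1) / stride_rows), with the
    // matching calculation for the columns.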
    TensorShape out_shape =
        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
    OP_REQUIRES(context, (out_shape.num_elements() > 0),
                errors::InvalidArgument("Output tensor can't be empty"));

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
            << ", padded_cols = " << padded_cols
            << ", resized_cols = " << resized_cols
            << ", filter_cols = " << filter_cols
            << ", padded_rows = " << padded_rows
            << ", resized_rows = " << resized_rows
            << ", filter_rows = " << filter_rows
            << ", stride_rows = " << stride_rows
            << ", stride_cols = " << stride_cols
            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }
    TConvFunctor conv_functor;
    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
                 bottom_padding, left_padding, right_padding, offset_);
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;
  bool align_corners_;
  int offset_;

  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
};

#define REGISTER_FUSED(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("FusedResizeAndPadConv2D")                                     \
          .Device(DEVICE_CPU)                                             \
          .TypeConstraint<T>("T"),                                        \
      FusedResizeConv2DUsingGemmOp<                                       \
          T,                                                              \
          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
                                       BILINEAR>,                         \
          true>);

TF_CALL_half(REGISTER_FUSED);
TF_CALL_float(REGISTER_FUSED);
TF_CALL_double(REGISTER_FUSED);

#define REGISTER_PAD_ONLY_FUSED(T)                                        \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
      FusedResizeConv2DUsingGemmOp<                                       \
          T,                                                              \
          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
                                       NEAREST>,                          \
          false>);

TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
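
// The registrations above back the FusedResizeAndPadConv2D op (bilinear
// resize, then mirror pad, then convolution) and the FusedPadConv2D op, which
// skips the resize: it runs with DoResize=false and NEAREST sampling at an
// identity scale, so the cache is effectively a straight copy of the mirror
// padded input.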

}  // namespace tensorflow