1 | #pragma once |
2 | #include <ATen/core/Tensor.h> |
3 | #include <ATen/TensorUtils.h> |
4 | #include <ATen/detail/CUDAHooksInterface.h> |
5 | #include <ATen/native/DispatchStub.h> |
6 | #include <c10/util/env.h> |
#include <c10/util/irange.h>

#include <algorithm>
#include <array>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <tuple>
#include <vector>
8 | |
9 | namespace at { namespace native { |
10 | |
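// Function pointer types and dispatch stubs for the backward passes of the
// various convolution backends. Each backend registers its kernel against the
// matching stub via REGISTER_DISPATCH; the trailing std::array<bool, N> is an
// output mask selecting which gradients (input, weight[, bias]) to compute.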
11 | using conv_depthwise2d_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)( |
12 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
13 | at::IntArrayRef, at::IntArrayRef, std::array<bool, 2>); |
14 | DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub); |
15 | using conv_depthwise3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
16 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
17 | at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>); |
18 | DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub); |
19 | using cudnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)( |
20 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
21 | at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>); |
22 | DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); |
23 | using mps_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
24 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
25 | at::IntArrayRef, int64_t, std::array<bool,3>); |
26 | DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub); |
27 | using cudnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)( |
28 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
29 | at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>); |
30 | DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub); |
31 | using miopen_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
32 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
33 | at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>); |
34 | DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub); |
35 | using miopen_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
36 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
37 | at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>); |
38 | DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub); |
39 | using miopen_depthwise_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
40 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
41 | at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>); |
42 | DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub); |
43 | using mkldnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
44 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
45 | at::IntArrayRef, int64_t, std::array<bool,3>); |
46 | DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub); |
47 | using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional<Tensor>&, |
48 | IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t); |
49 | DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub); |
50 | using mkldnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
51 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
52 | at::IntArrayRef, at::IntArrayRef, int64_t, std::array<bool,3>); |
53 | DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub); |
54 | using slow_conv_dilated2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
55 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
56 | at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>); |
57 | DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub); |
58 | using slow_conv_dilated3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
59 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
60 | at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>); |
61 | DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub); |
62 | using slow_conv_transpose2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
63 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
64 | at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>); |
65 | DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub); |
66 | using slow_conv_transpose3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)( |
67 | const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, |
68 | at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>); |
69 | DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub); |
70 | |
71 | namespace { |
static bool cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true;
73 | } |
74 | |
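// Environment variables consulted by the cuDNN v8 API checks below:
//   TORCH_CUDNN_V8_API_DISABLED=1      fall back to the v7 API;
//   TORCH_CUDNN_V8_API_DEBUG=1         warn (up to 10 times) with the flag state;
//   TORCH_CUDNN_USE_HEURISTIC_MODE_B=1 enable cuDNN frontend heuristic mode B.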
static inline bool cudnnv8_enabled_check_debug() {
  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
  static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
  static uint8_t cudnnv8_debugcount = 0;
  if (cudnnv8_debug && cudnnv8_debugcount < 10) {
    TORCH_WARN("TORCH_CUDNN_V8_API_DEBUG ON, V8 ON: ", cudnnv8_flag,
               " TORCH_CUDNN_USE_HEURISTIC_MODE_B: ", cudnnv8_heuristic_mode_b);
    cudnnv8_debugcount++;
  }
  return cudnnv8_flag;
}
85 | |
86 | static inline bool cudnnv8_use_heur_mode_b() { |
87 | return cudnnv8_heuristic_mode_b; |
88 | } |
89 | |
90 | // Keep in sync with py::enum_ in Module.cpp |
91 | enum class ConvBackend { |
92 | CudaDepthwise2d, |
93 | CudaDepthwise3d, |
94 | Cudnn, |
95 | CudnnTranspose, |
96 | Empty, |
97 | Miopen, |
98 | MiopenDepthwise, |
99 | MiopenTranspose, |
100 | Mkldnn, |
101 | MkldnnTranspose, |
102 | MkldnnEmpty, |
103 | NnpackSpatial, |
104 | Overrideable, |
105 | Slow2d, |
106 | Slow3d, |
107 | SlowDilated2d, |
108 | SlowDilated3d, |
109 | SlowTranspose2d, |
110 | SlowTranspose3d, |
111 | Winograd3x3Depthwise, |
112 | Xnnpack2d, |
113 | Mps, |
114 | MpsTranspose, |
115 | }; |
116 | |
// Overload for selecting the convolution backend from the full set of convolution inputs.
// This overload is exposed to Python for testing, etc.
119 | TORCH_API ConvBackend select_conv_backend( |
120 | const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt, |
121 | IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, |
122 | bool transposed, SymIntArrayRef output_padding, int64_t groups, const at::OptionalSymIntArrayRef bias_sizes_opt); |
123 | |
124 | TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input, |
125 | const Tensor& weight, |
126 | const ConvBackend backend); |
127 | |
128 | // --------------------------------------------------------------------- |
129 | // |
130 | // Math |
131 | // |
132 | // --------------------------------------------------------------------- |
133 | |
134 | constexpr int input_batch_size_dim = 0; // also grad_input |
135 | constexpr int input_channels_dim = 1; |
136 | constexpr int output_batch_size_dim = 0; // also grad_output |
137 | constexpr int output_channels_dim = 1; |
138 | constexpr int weight_output_channels_dim = 0; |
139 | constexpr int weight_input_channels_dim = 1; |
140 | |
141 | // Often written as 2 + max_dim (extra dims for batch size and channels) |
142 | constexpr int max_dim = 3; |
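// e.g. tensors for a 3d convolution are (2 + 3)-dimensional: (N, C, D, H, W).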
143 | |
144 | // --------------------------------------------------------------------- |
145 | // |
146 | // Checking |
147 | // |
148 | // --------------------------------------------------------------------- |
149 | |
// Used to validate the padding, stride and dilation arguments.
static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name)
{
  TORCH_CHECK(args.size() <= expected_size,
              "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ",
              expected_size, " (while checking arguments for ", c, ")");
  TORCH_CHECK(args.size() >= expected_size,
              "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ",
              expected_size, " (while checking arguments for ", c, ")");

  auto num_negative_values = std::count_if(args.begin(), args.end(), [](int64_t x){ return x < 0; });
  if (num_negative_values > 0) {
    std::stringstream ss;
    ss << arg_name << " should be non-negative, but got (";
    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int64_t>(ss, ", "));
    ss << args.back() << ") (while checking arguments for " << c << ")";
    AT_ERROR(ss.str());
  }
}
169 | |
170 | |
171 | // NOTE [ Convolution checks ] |
172 | // |
// NB: For many call sites, it is not strictly necessary to check all of
// these relationships (for example, for forward convolution, we compute
// the size of output ourselves, so we don't actually need to check
// output). However, writing a single function that does everything
// means we get to reuse it for both forwards and all backwards
// variants, even when the set of "real" inputs varies. The magic of
// relational computing!
180 | // |
181 | // (There is one downside, which is that it is slightly harder to write |
182 | // error messages which are able to distinguish between real inputs |
183 | // (which the user can change) and computed inputs (which the user can |
184 | // only indirectly affect). It would be an interesting exercise to |
185 | // come up with a general framework to handle such situations.) |
186 | static void convolution_shape_check( |
187 | CheckedFrom c, |
188 | const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, |
189 | IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) |
190 | { |
  check_args(c, padding, input->dim() - 2, "padding");
  check_args(c, stride, padding.size(), "stride");
  check_args(c, dilation, padding.size(), "dilation");
194 | |
195 | // Input |
196 | checkDimRange(c, input, 3, 6 /* exclusive */); |
197 | checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups); |
198 | |
199 | // Weight |
200 | checkSameDim(c, input, weight); |
201 | |
202 | // TODO: check that output->size() matches output_sizes |
203 | // TODO: check that weight matches output->sizes() |
204 | checkSameDim(c, input, output); |
205 | } |
206 | |
207 | // NB: conv_output_size and conv_input_size are not bijections, |
208 | // as conv_output_size loses information; this is why conv_input_size |
209 | // takes an extra output_padding argument to resolve the ambiguity. |
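//
// For example, with a kernel of 3, stride 2 and no padding or dilation,
// spatial inputs of size 7 and size 8 both produce an output of size 3
// (floor((7 - 3) / 2) + 1 == floor((8 - 3) / 2) + 1 == 3). conv_input_size
// then recovers 7 with output_padding = 0 and 8 with output_padding = 1.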
210 | |
211 | template <typename T> |
212 | static inline std::vector<T> _conv_output_size( |
213 | ArrayRef<T> input_size, ArrayRef<T> weight_size, |
214 | ArrayRef<T> padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() |
215 | ) { |
216 | // ASSERT(input_size.size() > 2) |
217 | // ASSERT(input_size.size() == weight_size.size()) |
218 | bool has_dilation = !dilation.empty(); |
219 | auto dim = input_size.size(); |
220 | std::vector<T> output_size(dim); |
221 | output_size[0] = input_size[input_batch_size_dim]; |
222 | output_size[1] = weight_size[weight_output_channels_dim]; |
223 | for (const auto d : c10::irange(2, dim)) { |
224 | auto dilation_ = has_dilation ? dilation[d - 2] : 1; |
225 | auto kernel = dilation_ * (weight_size[d] - 1) + 1; |
226 | output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1; |
227 | } |
228 | return output_size; |
229 | } |
230 | |
231 | static inline std::vector<int64_t> conv_output_size( |
232 | IntArrayRef input_size, IntArrayRef weight_size, |
233 | IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() |
234 | ) { |
235 | return _conv_output_size(input_size, weight_size, padding, stride, dilation); |
236 | } |
237 | |
238 | static inline std::vector<c10::SymInt> conv_output_size( |
239 | SymIntArrayRef input_size, SymIntArrayRef weight_size, |
240 | SymIntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() |
241 | ) { |
242 | return _conv_output_size(input_size, weight_size, padding, stride, dilation); |
243 | } |
244 | |
245 | template <typename T> |
246 | std::vector<T> _conv_input_size( |
247 | ArrayRef<T> output_size, ArrayRef<T> weight_size, |
248 | ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups |
249 | ) { |
250 | // ASSERT(output_size.size() > 2) |
251 | // ASSERT(output_size.size() == weight_size.size()) |
252 | auto dim = output_size.size(); |
253 | std::vector<T> input_size(dim); |
254 | input_size[0] = output_size[output_batch_size_dim]; |
255 | input_size[1] = weight_size[weight_input_channels_dim] * groups; |
256 | for (const auto d : c10::irange(2, dim)) { |
257 | auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1; |
258 | input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) + |
259 | kernel + output_padding[d - 2]; |
260 | } |
261 | return input_size; |
262 | } |
263 | |
264 | static inline std::vector<c10::SymInt> conv_input_size( |
265 | SymIntArrayRef output_size, SymIntArrayRef weight_size, |
266 | SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups |
267 | ) { |
268 | return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); |
269 | } |
270 | |
271 | static inline std::vector<int64_t> conv_input_size( |
272 | IntArrayRef output_size, IntArrayRef weight_size, |
273 | IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups |
274 | ) { |
275 | return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); |
276 | } |
277 | |
278 | template <typename T> |
279 | std::vector<T> _conv_weight_size( |
280 | ArrayRef<T> input_size, ArrayRef<T> output_size, |
281 | ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups |
282 | ) { |
283 | auto dim = input_size.size(); |
284 | std::vector<T> weight_size(dim); |
285 | weight_size[0] = output_size[1]; |
286 | weight_size[1] = input_size[1] / groups; |
287 | for (const auto d : c10::irange(2, dim)) { |
288 | auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] |
289 | + padding[d - 2] * 2 - output_padding[d - 2]; |
290 | weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; |
291 | } |
292 | return weight_size; |
293 | } |
294 | |
295 | static inline std::vector<c10::SymInt> conv_weight_size( |
296 | SymIntArrayRef input_size, SymIntArrayRef output_size, |
297 | SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups |
298 | ) { |
299 | return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); |
300 | } |
301 | |
302 | static inline std::vector<int64_t> conv_weight_size( |
303 | IntArrayRef input_size, IntArrayRef output_size, |
304 | IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups |
305 | ) { |
306 | return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); |
307 | } |
308 | |
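// Reshapes a 1-d bias of shape (C,) to (1, C, 1, ..., 1) so that it
// broadcasts over the batch and spatial dimensions, e.g. (1, C, 1, 1)
// when dim == 4.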
309 | static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) { |
310 | std::vector<int64_t> shape(dim, 1); |
311 | shape[1] = -1; |
312 | return bias.reshape(shape); |
313 | } |
314 | |
315 | static inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { |
316 | // disable NHWC for float64 input. |
317 | if (!at::detail::getCUDAHooks().compiledWithCuDNN() || |
318 | input.scalar_type() == at::kDouble || |
319 | weight.scalar_type() == at::kDouble) { |
320 | return at::MemoryFormat::Contiguous; |
321 | } |
322 | long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); |
323 | auto input_memory_format = input.suggest_memory_format(); |
324 | auto weight_memory_format = weight.suggest_memory_format(); |
325 | auto weight_ndim = weight.ndimension(); |
326 | |
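  // versionCuDNN() encodes the version as major * 1000 + minor * 100 + patch,
  // so the 7603 below is cuDNN 7.6.3 (2d channels-last) and 8005 is
  // cuDNN 8.0.5 (3d channels-last).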
327 | bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && ( |
328 | (input_memory_format == at::MemoryFormat::ChannelsLast) || |
329 | (weight_memory_format == at::MemoryFormat::ChannelsLast) |
330 | ); |
331 | if (can_use_cudnn_channels_last_2d) { |
332 | return at::MemoryFormat::ChannelsLast; |
333 | } |
334 | |
335 | bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && ( |
336 | (input_memory_format == at::MemoryFormat::ChannelsLast3d) || |
337 | (weight_memory_format == at::MemoryFormat::ChannelsLast3d) |
338 | ); |
339 | if (can_use_cudnn_channels_last_3d) { |
340 | return at::MemoryFormat::ChannelsLast3d; |
341 | } |
342 | |
343 | return at::MemoryFormat::Contiguous; |
344 | } |
345 | |
346 | static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { |
347 | // disable NHWC for float64 input. |
348 | if (!at::detail::getCUDAHooks().compiledWithMIOpen() || |
349 | input.scalar_type() == at::kDouble || |
350 | weight.scalar_type() == at::kDouble) { |
351 | return false; |
352 | } |
353 | |
354 | auto input_memory_format = input.suggest_memory_format(); |
355 | auto weight_memory_format = weight.suggest_memory_format(); |
356 | |
357 | bool can_use_miopen_channels_last_2d = ( |
358 | (input_memory_format == at::MemoryFormat::ChannelsLast) || |
359 | (weight_memory_format == at::MemoryFormat::ChannelsLast) |
360 | ); |
361 | |
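  // Channels-last 3d is not currently enabled for MIOpen.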
362 | bool can_use_miopen_channels_last_3d = false; |
363 | |
364 | return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; |
365 | } |
366 | |
367 | static inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { |
368 | |
369 | // disable NHWC for float64 input. |
370 | if (input.scalar_type() == at::kDouble || |
371 | weight.scalar_type() == at::kDouble) { |
372 | return false; |
373 | } |
374 | |
375 | // disable NHWC for MkldnnCPU tensor. |
376 | if (input.is_mkldnn() || weight.is_mkldnn()) { |
377 | return false; |
378 | } |
379 | |
380 | auto input_memory_format = input.suggest_memory_format(); |
381 | auto weight_memory_format = weight.suggest_memory_format(); |
382 | |
383 | bool can_use_mkldnn_channels_last_2d = |
384 | (input_memory_format == at::MemoryFormat::ChannelsLast) || |
385 | (weight_memory_format == at::MemoryFormat::ChannelsLast); |
386 | |
387 | // TODO: add channels last 3d support |
388 | bool can_use_mkldnn_channels_last_3d = false; |
389 | |
390 | return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d; |
391 | } |
392 | |
393 | static inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { |
394 | |
395 | auto input_memory_format = input.suggest_memory_format(); |
396 | auto weight_memory_format = weight.suggest_memory_format(); |
397 | |
398 | bool can_use_thnn_channels_last_2d = input.device().is_cpu() && ( |
399 | (input_memory_format == at::MemoryFormat::ChannelsLast) || ( |
400 | weight_memory_format == at::MemoryFormat::ChannelsLast)); |
401 | |
402 | return can_use_thnn_channels_last_2d; |
403 | } |
404 | |
405 | }} // namespace at::native |
406 | |