#pragma once

// This file provides two functions to help write elementwise kernels:
//
//   cpu_kernel(TensorIterator iter, <lambda>)
//   cpu_kernel_vec(TensorIterator iter, <lambda>, <vec_lambda>)
//
// Both functions may generate vectorized code. The cpu_kernel implementation
// relies on the compiler's auto-vectorization. The cpu_kernel_vec
// implementation uses x86 SIMD intrinsics when available. These functions
// are only intended to be used in the ATen/native/cpu subdirectory, since files
// in other directories are not compiled with AVX/AVX2 enabled. See README.md
// for more details.
//
// For example, to write a multiplication kernel for float:
//
//   cpu_kernel(iter, [](float a, float b) { return a * b; });
//
// Or you may write:
//
//   cpu_kernel_vec(iter,
//     [](float a, float b) { return a * b; },
//     [](Vectorized<float> a, Vectorized<float> b) { return a * b; });
//
// See BinaryOpsKernel.cpp for the complete implementation
//

#include <stdint.h>
#include <c10/util/C++17.h>
#include <c10/util/Load.h>
#include <c10/util/irange.h>
#include <ATen/detail/FunctionTraits.h>
#include <ATen/native/cpu/IsContiguous.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/TensorIteratorDynamicCasting.h>
#include <ATen/cpu/vec/vec.h>

#include <utility>

namespace at { namespace native { inline namespace CPU_CAPABILITY {

using namespace vec;

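// `dereference` loads the i-th element of each argument tensor and packs the
// values into a tuple matching the op's parameter types: argument k is read
// from data[k] + i * strides[k] via c10::load.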
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
                 std::index_sequence<INDEX...>) {
  return std::make_tuple(
      c10::load<typename traits::template arg<INDEX>::type>(
          data[INDEX] + i * strides[INDEX])...);
}

template <typename traits>
typename traits::ArgsTuple
dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_impl<traits>(data, strides, i, Indices{});
}

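// `dereference_vec` is the vectorized analogue: it loads a Vectorized lane
// from each argument, except for the argument at (1-based) position S, for
// which the pre-broadcast scalar vector `opt_scalar` is substituted.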
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_vec_impl(char* C10_RESTRICT data[],
                     const typename traits::result_type& opt_scalar,
                     size_t S,
                     int64_t i,
                     std::index_sequence<INDEX...>) {
  using Vec = typename traits::result_type;
  using scalar_t = typename Vec::value_type;
  return std::make_tuple(
      S == INDEX + 1 ?
      opt_scalar :
      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
}

template <typename traits>
typename traits::ArgsTuple
dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_vec_impl<traits>(data, opt_scalar, S, i, Indices{});
}

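// `execute_op` is the scalar inner loop. The first overload (non-void result)
// writes op's return value to the output tensor at data[0]; the second
// overload (void result) invokes op purely for its side effects, which is
// used by kernels that have no outputs.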
template <typename func_t,
    typename std::enable_if<!std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  using result_type = typename traits::result_type;
  for (; i < n; i++) {
    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
    *out_ptr = c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[1],
        &strides[1],
        i));
  }
}

template <typename func_t,
    typename std::enable_if<std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  for (; i < n; i++) {
    c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[0],
        &strides[0],
        i));
  }
}

// Basic loop operation (one output, N inputs). May be auto-vectorized
// by the compiler. Supports inputs and outputs of different types.
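//
// A minimal sketch of the buffer layout this expects (hypothetical data,
// assuming a float(float, float) op): data[0] is the output, data[1..arity]
// are the inputs, and element i of tensor k lives at data[k] + i * strides[k].
//
//   float out[4], a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
//   char* data[] = {(char*)out, (char*)a, (char*)b};
//   int64_t strides[] = {sizeof(float), sizeof(float), sizeof(float)};
//   basic_loop(data, strides, 0, 4, [](float x, float y) { return x + y; });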
template <typename func_t>
static inline void
basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  constexpr int ntensors = traits::arity + 1;

  // Copying strides to temporary array helps auto vectorization in older GCC
  // versions.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  execute_op(data, strides, i, n, std::forward<func_t>(op));
}

// the recursive variadic template for iterating over the returned tuple
template<class T, size_t N>
struct TupleOutput {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    TupleOutput<T, N - 1>::handle(data, strides, i, tuple);

    auto output = std::get<N - 1>(tuple);
    using output_type = decltype(output);
    output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]);
    *out_ptr = output;
  }
};

// Base case for the above recursive template
template<class T>
struct TupleOutput<T, 1> {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    auto output = std::get<0>(tuple);
    using output_type = decltype(output);
    output_type* out_ptr = (output_type *)(data[0] + i * strides[0]);
    *out_ptr = output;
  }
};

template<class... Args>
void handle_tuple_outputs(char* C10_RESTRICT data[],
                          const int64_t* strides,
                          int64_t i,
                          const std::tuple<Args...> &tuple) {
  TupleOutput<decltype(tuple), sizeof...(Args)>::handle(data, strides, i, tuple);
}
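
// For example (illustrative), if the op returns std::tuple<float, int64_t>,
// handle_tuple_outputs stores std::get<0>(tuple) to data[0] + i * strides[0]
// and std::get<1>(tuple) to data[1] + i * strides[1].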

// Loop operation for `cpu_kernel_multiple_outputs`.
// 1. Use `c10::guts::apply` to invoke the lambda passed to
//    `cpu_kernel_multiple_outputs` with the dereferenced input arguments.
// 2. Iterate over the members of the returned tuple and write each one to the
//    corresponding output tensor via `handle_tuple_outputs`.
template <typename func_t>
static inline void
multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;

  using result_type = typename traits::result_type;
  constexpr int num_outputs = std::tuple_size<result_type>::value;
  constexpr int ntensors = traits::arity + num_outputs;

  // Copying strides to temporary array helps auto vectorization in older GCC
  // versions.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  for (; i < n; i++) {
    auto output = c10::guts::apply(op, dereference<traits>(
        &data[num_outputs],
        &strides[num_outputs],
        i));
    handle_tuple_outputs(data, strides, i, output);
  }
}

// Explicitly vectorized loop implementation. All inputs and outputs must be
// the same type and contiguous with one exception: a single input may be
// a scalar (stride 0). Its position is indicated by the argument `S`. If `S`
// is 0, then there are no scalar inputs.
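//
// For example (an illustrative case), for a binary op whose second input is a
// broadcast scalar, S == 2: `opt_scalar` is built from the single value at
// data[2], and dereference_vec substitutes it for that argument instead of
// loading a vector from memory.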
template <typename func_t, typename vec_func_t>
static inline void
vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
  using traits = function_traits<vec_func_t>;
  using scalar_t = typename function_traits<func_t>::result_type;
  using Vec = Vectorized<scalar_t>;
  constexpr int ntensors = traits::arity + 1;

  char* C10_RESTRICT data[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    data[arg] = data_[arg];
  }

  Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0));
  int64_t i = 0;
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
    auto out1 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args1));
    auto out2 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args2));
    out1.store(data[0] + i * sizeof(scalar_t));
    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
  }
  if (i < n) {
    int64_t strides[ntensors];
    for (const auto arg : c10::irange(ntensors)) {
      strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
    }
    basic_loop(data, strides, i, n, std::forward<func_t>(op));
  }
}

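// Recursively checks, at compile time over the argument indices, whether the
// strides describe a layout that is contiguous except for a single scalar
// (stride-0) input. Invokes `cb` with the 1-based position of that scalar
// input, or with 0 if no such layout matches.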
template <typename traits, typename cb_t>
static inline void unroll_contiguous_scalar_checks(
    const int64_t* /*strides*/,
    std::index_sequence<>,
    cb_t&& cb) {
  cb(0);
}

template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
static inline void unroll_contiguous_scalar_checks(
    const int64_t* strides,
    std::index_sequence<INDEX0, INDEX...>,
    cb_t&& cb) {
  if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
    cb(INDEX0 + 1);
  } else {
    unroll_contiguous_scalar_checks<traits>(strides, std::index_sequence<INDEX...>{}, std::forward<cb_t>(cb));
  }
}

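// 2-D loop body passed to TensorIterator::for_each / serial_for_each. For each
// row of length size0, it uses `vectorized_loop` when the inner dimension is
// contiguous (optionally with one scalar input) and falls back to `basic_loop`
// otherwise, then advances the data pointers by the outer strides.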
template <typename op_t, typename vop_t>
struct VectorizedLoop2d {
  op_t op;
  vop_t vop;

  using traits = function_traits<op_t>;
  static constexpr int ntensors = traits::arity + 1;
  using data_t = std::array<char*, ntensors>;

  VectorizedLoop2d(const op_t &op, vop_t vop):
    op(op), vop(std::move(vop)) {}

  static void advance(data_t &data, const int64_t *outer_strides) {
    for (const auto arg : c10::irange(data.size())) {
      data[arg] += outer_strides[arg];
    }
  }

  void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
    data_t data;
    std::copy_n(base, ntensors, data.data());
    const int64_t *outer_strides = &strides[ntensors];

    if (is_contiguous<traits>(strides)) {
      for (const auto i C10_UNUSED : c10::irange(size1)) {
        vectorized_loop(data.data(), size0, 0, op, vop);
        advance(data, outer_strides);
      }
    } else {
      using Indices = std::make_index_sequence<traits::arity>;
      unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
        if (idx) {
          for (const auto i C10_UNUSED : c10::irange(size1)) {
            vectorized_loop(data.data(), size0, idx, op, vop);
            advance(data, outer_strides);
          }
        } else {
          for (const auto i C10_UNUSED : c10::irange(size1)) {
            basic_loop(data.data(), strides, 0, size0, op);
            advance(data, outer_strides);
          }
        }
      });
    }
  }
};

template <typename op_t, typename vop_t>
VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
    const op_t &op, const vop_t &vop) {
  return VectorizedLoop2d<op_t, vop_t>(op, vop);
}

template <typename func_t>
void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  // this could be extended to work with void return types
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
  // dynamic casting not currently supported on CPU
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
    // basic_loop can handle 1d slices with arbitrary strides, and 1d slices are
    // all that iter.for_each ever sends to the loop lambda
    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, grain_size);
  iter.cast_outputs();
}

// This function helps write elementwise kernels that require multiple outputs.
// It follows a structure similar to that of cpu_kernel.
// Instead of the `basic_loop` function, a `multiple_outputs_loop` function is
// used to handle the multiple return values.
// For now the `needs_dynamic_casting` check is not added, since the lambda
// (`func_t`) passed to `multiple_outputs_loop` returns a `std::tuple` instead
// of a `scalar_t`.
// `gpu_kernel_multiple_outputs` is also implemented without this check; we
// could extend `needs_dynamic_casting` to support both `std::tuple` and
// `thrust::tuple` in the future.
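//
// For example (an illustrative sketch, not taken from an existing kernel), a
// kernel that produces both the sum and the difference of two float inputs
// could be written as:
//
//   cpu_kernel_multiple_outputs(iter, [](float a, float b) {
//     return std::tuple<float, float>(a + b, a - b);
//   });
//
// where `iter` is configured with two outputs followed by the two inputs.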
template <typename func_t>
void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
    multiple_outputs_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, grain_size);
  iter.cast_outputs();
}

template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  // this could be extended to work with void return types
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
  // dynamic casting not currently supported on CPU, but some kernels (like Fill)
  // explicitly dynamic_cast, so we give the opt-out of checking.
  c10::guts::if_constexpr<check_dynamic_cast>([&] {
    TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
  });

  iter.for_each(make_vectorized_loop2d(op, vop), grain_size);
  iter.cast_outputs();
}

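// Serial (single-threaded) variant of cpu_kernel, restricted to the given
// Range of the iterator. Unlike cpu_kernel, it also accepts a void-returning
// op, in which case the iterator must have no outputs and the op is invoked
// purely for its side effects. For example (an illustrative sketch, assuming
// `iter` was built with a single float input and no outputs):
//
//   int64_t count = 0;
//   cpu_serial_kernel(iter, [&](float a) -> void {
//     if (a > 0) count++;
//   });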
template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
  using traits = function_traits<func_t>;
  constexpr bool result_void = std::is_void<typename traits::result_type>::value;
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
                        ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));
  // dynamic casting not currently supported on CPU
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, range);
  iter.cast_outputs();
}

template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
  cpu_serial_kernel(iter, op, {0, iter.numel()});
}

375
376template <typename func_t, typename vec_func_t>
377void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
378 using traits = function_traits<func_t>;
379 // this could be extended to work with void return types
380 TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
381 TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
382 // dynamic casting not currently supported on CPU
383 TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
384
385 iter.serial_for_each(make_vectorized_loop2d(op, vop), range);
386 iter.cast_outputs();
387}
388
389template <typename func_t, typename vec_func_t>
390void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
391 cpu_serial_kernel_vec(iter, op, vop, {0, iter.numel()});
392}
393
394}}} // namespace at::native::<anonymous>
395