1 | #pragma once |
2 | #include <ATen/Config.h> |
3 | #include <c10/macros/Macros.h> |
4 | #include <functional> |
5 | #include <string> |
6 | |
7 | namespace at { |
8 | |
// Integer ceiling division: smallest integer >= x / y for positive y
// (e.g. divup(10, 3) == 4, divup(9, 3) == 3). Typically used to compute
// the number of grain_size-sized chunks covering a range.
inline int64_t divup(int64_t x, int64_t y) {
  const int64_t biased = x + y - 1; // bias numerator so truncation rounds up
  return biased / y;
}
12 | |
// Called during new thread initialization; sets up the thread's
// parallelization state (e.g. the backend's per-thread bookkeeping).
TORCH_API void init_num_threads();

// Sets the number of threads to be used in a parallel region
TORCH_API void set_num_threads(int);

// Returns the maximum number of threads that may be used in a parallel region
TORCH_API int get_num_threads();

// Returns the current thread number (starting from 0)
// in the current parallel region, or 0 in the sequential region
TORCH_API int get_thread_num();

// Checks whether the caller is currently running inside a parallel region
TORCH_API bool in_parallel_region();
28 | |
29 | namespace internal { |
30 | |
31 | // Initialise num_threads lazily at first parallel call |
32 | inline void lazy_init_num_threads() { |
33 | thread_local bool init = false; |
34 | if (C10_UNLIKELY(!init)) { |
35 | at::init_num_threads(); |
36 | init = true; |
37 | } |
38 | } |
39 | |
// Sets the calling thread's number as reported by at::get_thread_num().
// Internal: used by the parallel backends to label worker threads.
TORCH_API void set_thread_num(int);
41 | |
42 | class TORCH_API ThreadIdGuard { |
43 | public: |
44 | ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) { |
45 | set_thread_num(new_id); |
46 | } |
47 | |
48 | ~ThreadIdGuard() { |
49 | set_thread_num(old_id_); |
50 | } |
51 | |
52 | private: |
53 | int old_id_; |
54 | }; |
55 | |
56 | } // namespace internal |
57 | |
58 | /* |
59 | parallel_for |
60 | |
61 | begin: index at which to start applying user function |
62 | |
63 | end: index at which to stop applying user function |
64 | |
65 | grain_size: number of elements per chunk. impacts the degree of parallelization |
66 | |
67 | f: user function applied in parallel to the chunks, signature: |
68 | void f(int64_t begin, int64_t end) |
69 | |
70 | Warning: parallel_for does NOT copy thread local |
71 | states from the current thread to the worker threads. |
72 | This means for example that Tensor operations CANNOT be used in the |
73 | body of your function, only data pointers. |
74 | */ |
75 | template <class F> |
76 | inline void parallel_for( |
77 | const int64_t begin, |
78 | const int64_t end, |
79 | const int64_t grain_size, |
80 | const F& f); |
81 | |
82 | /* |
83 | parallel_reduce |
84 | |
85 | begin: index at which to start applying reduction |
86 | |
87 | end: index at which to stop applying reduction |
88 | |
89 | grain_size: number of elements per chunk. impacts number of elements in |
90 | intermediate results tensor and degree of parallelization. |
91 | |
92 | ident: identity for binary combination function sf. sf(ident, x) needs to return |
93 | x. |
94 | |
95 | f: function for reduction over a chunk. f needs to be of signature scalar_t |
96 | f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy) |
97 | |
98 | sf: function to combine two partial results. sf needs to be of signature |
99 | scalar_t sf(scalar_t x, scalar_t y) |
100 | |
101 | For example, you might have a tensor of 10000 entires and want to sum together |
102 | all the elements. Parallel_reduce with a grain_size of 2500 will then allocate |
103 | an intermediate result tensor with 4 elements. Then it will execute the function |
104 | "f" you provide and pass the beginning and end index of these chunks, so |
105 | 0-2499, 2500-4999, etc. and the combination identity. It will then write out |
106 | the result from each of these chunks into the intermediate result tensor. After |
107 | that it'll reduce the partial results from each chunk into a single number using |
108 | the combination function sf and the identity ident. For a total summation this |
109 | would be "+" and 0 respectively. This is similar to tbb's approach [1], where |
110 | you need to provide a function to accumulate a subrange, a function to combine |
111 | two partial results and an identity. |
112 | |
113 | Warning: parallel_reduce does NOT copy thread local |
114 | states from the current thread to the worker threads. |
115 | This means for example that Tensor operations CANNOT be used in the |
116 | body of your function, only data pointers. |
117 | |
118 | [1] https://software.intel.com/en-us/node/506154 |
119 | */ |
120 | template <class scalar_t, class F, class SF> |
121 | inline scalar_t parallel_reduce( |
122 | const int64_t begin, |
123 | const int64_t end, |
124 | const int64_t grain_size, |
125 | const scalar_t ident, |
126 | const F& f, |
127 | const SF& sf); |
128 | |
// Returns a detailed string describing parallelization settings
TORCH_API std::string get_parallel_info();

// Sets the number of threads used for inter-op parallelism
TORCH_API void set_num_interop_threads(int);

// Returns the number of threads used for inter-op parallelism
TORCH_API int get_num_interop_threads();

// Launches inter-op parallel task
TORCH_API void launch(std::function<void()> func);
namespace internal {
// Like launch(), but — judging by the name — without propagating the
// caller's thread-local state to the task; confirm in the implementation.
void launch_no_thread_state(std::function<void()> fn);
} // namespace internal

// Launches intra-op parallel task
TORCH_API void intraop_launch(std::function<void()> func);

// Returns the number of intra-op threads used by default
TORCH_API int intraop_default_num_threads();
149 | |
150 | } // namespace at |
151 | |
152 | #if AT_PARALLEL_OPENMP |
153 | #include <ATen/ParallelOpenMP.h> // IWYU pragma: keep |
154 | #elif AT_PARALLEL_NATIVE |
155 | #include <ATen/ParallelNative.h> // IWYU pragma: keep |
156 | #elif AT_PARALLEL_NATIVE_TBB |
157 | #include <ATen/ParallelNativeTBB.h> // IWYU pragma: keep |
158 | #endif |
159 | |
160 | #include <ATen/Parallel-inl.h> // IWYU pragma: keep |
161 | |