1 | #pragma once |
2 | #include <ATen/Config.h> |
3 | #include <c10/macros/Macros.h> |
4 | #include <functional> |
5 | #include <string> |
6 | |
7 | namespace at { |
8 | |
// Integer ceiling division: smallest integer >= x / y for positive y
// (e.g. divup(10, 3) == 4, divup(9, 3) == 3). Typically used to compute
// the number of grain_size-sized chunks covering a range.
inline int64_t divup(int64_t x, int64_t y) {
  const int64_t biased = x + y - 1; // bias numerator so truncation rounds up
  return biased / y;
}
12 | |
// Called during new thread initialization; sets up the thread's
// parallelization state (e.g. the backend's per-thread bookkeeping).
TORCH_API void init_num_threads();

// Sets the number of threads to be used in a parallel region
TORCH_API void set_num_threads(int);

// Returns the maximum number of threads that may be used in a parallel region
TORCH_API int get_num_threads();

// Returns the current thread number (starting from 0)
// in the current parallel region, or 0 in the sequential region
TORCH_API int get_thread_num();

// Checks whether the caller is currently running inside a parallel region
TORCH_API bool in_parallel_region();
28 | |
29 | namespace internal { |
30 | |
31 | // Initialise num_threads lazily at first parallel call |
32 | inline void lazy_init_num_threads() { |
33 | thread_local bool init = false; |
34 | if (C10_UNLIKELY(!init)) { |
35 | at::init_num_threads(); |
36 | init = true; |
37 | } |
38 | } |
39 | |
// Sets the calling thread's number as reported by at::get_thread_num().
// Internal: used by the parallel backends to label worker threads.
TORCH_API void set_thread_num(int);
41 | |
42 | class TORCH_API ThreadIdGuard { |
43 | public: |
44 | ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) { |
45 | set_thread_num(new_id); |
46 | } |
47 | |
48 | ~ThreadIdGuard() { |
49 | set_thread_num(old_id_); |
50 | } |
51 | |
52 | private: |
53 | int old_id_; |
54 | }; |
55 | |
56 | } // namespace internal |
57 | |
58 | /* |
59 | parallel_for |
60 | |
61 | begin: index at which to start applying user function |
62 | |
63 | end: index at which to stop applying user function |
64 | |
65 | grain_size: number of elements per chunk. impacts the degree of parallelization |
66 | |
67 | f: user function applied in parallel to the chunks, signature: |
68 | void f(int64_t begin, int64_t end) |
69 | |
70 | Warning: parallel_for does NOT copy thread local |
71 | states from the current thread to the worker threads. |
72 | This means for example that Tensor operations CANNOT be used in the |
73 | body of your function, only data pointers. |
74 | */ |
75 | template <class F> |
76 | inline void parallel_for( |
77 | const int64_t begin, |
78 | const int64_t end, |
79 | const int64_t grain_size, |
80 | const F& f); |
81 | |
82 | /* |
83 | parallel_reduce |
84 | |
85 | begin: index at which to start applying reduction |
86 | |
87 | end: index at which to stop applying reduction |
88 | |
89 | grain_size: number of elements per chunk. impacts number of elements in |
90 | intermediate results tensor and degree of parallelization. |
91 | |
92 | ident: identity for binary combination function sf. sf(ident, x) needs to return |
93 | x. |
94 | |
95 | f: function for reduction over a chunk. f needs to be of signature scalar_t |
96 | f(int64_t partial_begin, int64_t partial_end, scalar_t identifiy) |
97 | |
98 | sf: function to combine two partial results. sf needs to be of signature |
99 | scalar_t sf(scalar_t x, scalar_t y) |
100 | |
101 | For example, you might have a tensor of 10000 entires and want to sum together |
102 | all the elements. Parallel_reduce with a grain_size of 2500 will then allocate |
103 | an intermediate result tensor with 4 elements. Then it will execute the function |
104 | "f" you provide and pass the beginning and end index of these chunks, so |
105 | 0-2499, 2500-4999, etc. and the combination identity. It will then write out |
106 | the result from each of these chunks into the intermediate result tensor. After |
107 | that it'll reduce the partial results from each chunk into a single number using |
108 | the combination function sf and the identity ident. For a total summation this |
109 | would be "+" and 0 respectively. This is similar to tbb's approach [1], where |
110 | you need to provide a function to accumulate a subrange, a function to combine |
111 | two partial results and an identity. |
112 | |
113 | Warning: parallel_reduce does NOT copy thread local |
114 | states from the current thread to the worker threads. |
115 | This means for example that Tensor operations CANNOT be used in the |
116 | body of your function, only data pointers. |
117 | |
118 | [1] https://software.intel.com/en-us/node/506154 |
119 | */ |
120 | template <class scalar_t, class F, class SF> |
121 | inline scalar_t parallel_reduce( |
122 | const int64_t begin, |
123 | const int64_t end, |
124 | const int64_t grain_size, |
125 | const scalar_t ident, |
126 | const F& f, |
127 | const SF& sf); |
128 | |
// Returns a detailed string describing parallelization settings
TORCH_API std::string get_parallel_info();

// Sets the number of threads used for inter-op parallelism
TORCH_API void set_num_interop_threads(int);

// Returns the number of threads used for inter-op parallelism
TORCH_API int get_num_interop_threads();

// Launches inter-op parallel task
TORCH_API void launch(std::function<void()> func);
namespace internal {
// Like launch(), but — judging by the name — without propagating the
// caller's thread-local state to the task; confirm in the implementation.
void launch_no_thread_state(std::function<void()> fn);
} // namespace internal

// Launches intra-op parallel task
TORCH_API void intraop_launch(std::function<void()> func);

// Returns the number of intra-op threads used by default
TORCH_API int intraop_default_num_threads();
149 | |
150 | } // namespace at |
151 | |
152 | #if AT_PARALLEL_OPENMP |
153 | #include <ATen/ParallelOpenMP.h> // IWYU pragma: keep |
154 | #elif AT_PARALLEL_NATIVE |
155 | #include <ATen/ParallelNative.h> // IWYU pragma: keep |
156 | #elif AT_PARALLEL_NATIVE_TBB |
157 | #include <ATen/ParallelNativeTBB.h> // IWYU pragma: keep |
158 | #endif |
159 | |
160 | #include <ATen/Parallel-inl.h> // IWYU pragma: keep |
161 | |