#pragma once

#include <evaluator_common.h>
#include <executor.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/registry.h>

#include <c10/macros/Export.h>
#include <c10/util/ArrayRef.h>

#include <mutex>
#include <type_traits>
#include <unordered_map>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

class SegmentedGroup;
class FusionHeuristics;
class SchedulerRuntimeInfo;

// Utilities for benchmarking and profiling
struct ExecutorLog {
  std::shared_ptr<HeuristicParams> params = nullptr;
  FusionExecutor* fusion_executor = nullptr;
};

//! FusionKernelRuntime is the unified interface from fusion graphs into
//! caching, compilation into kernels, and kernel launches.
//!
//! Each instance is also a cache entry tracked by FusionKernelRuntimeCache.
//!
//! Two types of instance can be created: one for a complete/single-kernel
//! fusion and one for a segmented/multi-kernel fusion.
//! Conceptually this is a generalization of FusionExecutor that supports both
//! single-kernel and multi-kernel caching/compiling/launching.
class TORCH_CUDA_CU_API FusionKernelRuntime {
 public:
  explicit FusionKernelRuntime(
      Fusion* fusion,
      const KernelArgumentHolder& inputs);

  //! Type notations within FusionKernelRuntime Context
  using HashType = size_t;
  using SchedulerEntryPtr = std::unique_ptr<SchedulerEntry>;

  //! Evicts internally cached parameters based on input sizes.
  //! An interface used by runtime caches.
  void evictCache(size_t input_id) {
    for (auto& fe : executors_) {
      fe.evictCache(input_id);
    }
  }

  //! Queries whether we already have a compiled kernel ready for execution
  bool isCompiled() {
    std::unique_lock<std::mutex> lock0(mutex_, std::try_to_lock);
    std::unique_lock<std::mutex> lock1(compiling_, std::try_to_lock);
    if (!lock0.owns_lock() || !lock1.owns_lock()) {
      // compilation in progress
      return false;
    }

    return std::all_of(
        executors_.begin(), executors_.end(), [](const auto& executor) {
          return executor.compiled();
        });
  }

  //! Starts compilation asynchronously
  void startAsyncCompile(KernelArgumentHolder& inputs);
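
  //! A minimal usage sketch (hypothetical; assumes `args` is a populated
  //! KernelArgumentHolder and `runtime` points at this object):
  //!
  //!   runtime->startAsyncCompile(args);
  //!   while (!runtime->isCompiled()) {
  //!     std::this_thread::yield(); // poll until the async compile finishes
  //!   }
  //!   auto outputs = runtime->runWithInput(args);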

  //! Maps entries in `args` to fusion inputs.
  //! Note that this function also pushes extra entries, such as dimension
  //! extents, into `args` for expression-evaluator binding, so consider
  //! `args` polluted after this call and use it with caution.
  void mapFusionInputsToArgs(
      std::unordered_map<Val*, const ArgAbstract*>& tensor_map,
      KernelArgumentHolder& args);

  //! Unified interface to run the managed kernels with given input
  std::vector<at::Tensor> runWithInput(KernelArgumentHolder& args);

  //! Turns profiling on/off
  void profile(bool to_profile = true) {
    profiling_ = to_profile;
  }

  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& executor : executors_) {
      executor.disableLaunchParamCache();
    }
  }

  //! Internal knob for profiling shape inference
  void disableKernelLaunch() {
    for (auto& executor : executors_) {
      executor.setExecuteKernelFlag(false);
    }
  }

  //! Returns whether this runtime is segmented
  bool isSegmented() {
    return is_segmented_;
  }

  //! Returns the fusion segments if applicable
  SegmentedFusion* fusionSegments() {
    return segmented_fusion_.get();
  }

  //! Returns the list of heuristics in this runtime
  FusionHeuristics* schedulerHeuristics() {
    return heuristics_.get();
  }

  //! Returns the most recently used executor, corresponding to the
  //! most recent kernel launch.
  //! TODO: have an interface for grabbing all recent logs. Needs a buffer
  //! to hold recent logs.
  ExecutorLog getMostRecentExecutorLog() {
    TORCH_INTERNAL_ASSERT(
        profiling_, "Executor log is only produced in profiling mode");
    return most_recent_executor_log_;
  }
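
  //! A profiling sketch (hypothetical; assumes `args` is a populated
  //! KernelArgumentHolder):
  //!
  //!   runtime->profile(true);
  //!   auto outputs = runtime->runWithInput(args);
  //!   ExecutorLog log = runtime->getMostRecentExecutorLog();
  //!   // log.params holds the heuristics used by the launched kernel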

  // Tries to compute heuristics based on the SegmentedFusion managed
  // by this kernel runtime; returns nullopt if any segment cannot be
  // scheduled or the parameters don't match.
  using HeuristicsPtr = std::unique_ptr<FusionHeuristics>;
  c10::optional<HeuristicsPtr> getMaybeHeuristicsFor(
      const KernelArgumentHolder& args);

  //! Copies the launch params given in the parameter heuristics to prepare
  //! for a kernel launch with a new input dimension but the same heuristics
  void updateHeuristicsLaunchParams(FusionHeuristics* update_heuristics);

 private:
  //! Interface to run a single kernel, either one kernel for single-kernel
  //! fusions, or a kernel for a SegmentedGroup in a segmented fusion. Returns
  //! the kernel outputs.
  std::vector<at::Tensor> runKernelWithInput(
      KernelArgumentHolder& args,
      SegmentedGroup* sg);

  //! Interface to compile a single kernel, either one kernel for
  //! single-kernel fusions, or a kernel for a SegmentedGroup in a segmented
  //! fusion. Returns the kernel outputs as tensors that do not own memory.
  KernelArgumentHolder compileKernel(
      const KernelArgumentHolder& args,
      SegmentedGroup* sg);

  //! Interface to run the whole graph in a segmented fusion and return the
  //! complete fusion outputs.
  std::vector<at::Tensor> runMultiKernelWithInput(
      const at::ArrayRef<IValue>& inputs,
      size_t input_id);

  //! Access the list of schedulers maintained in this runtime instance
  const std::vector<SchedulerEntryPtr>& schedulers();

  void prepareRuntimeOrder();

 private:
  //! Entries indexed by groupID:
  //! Executors holding compiled kernels
  std::vector<FusionExecutor> executors_;

  //! Heuristics object holding scheduler entries for all segments
  std::unique_ptr<FusionHeuristics> heuristics_;

  // Whether this runtime instance is for a segmented fusion (true) or a
  // single-kernel fusion (false).
  bool is_segmented_ = true;

  //! Multi-kernel fusion segments, when applicable
  std::unique_ptr<SegmentedFusion> segmented_fusion_ = nullptr;

  //! Pre-allocated runtime workspace to speed up kernel launch preparation.
  struct RuntimeWorkSpace {
    //! Pre-determined order to run the segmented groups
    std::vector<SegmentedGroup*> group_run_order;

    //! Pre-determined order to bind tensor input meta data
    std::vector<Val*> group_extent_binding_order;
  } runtime_workspace_;

  //! Utility to speed up value evaluation at runtime
  std::unique_ptr<FusionPrecomputedValues> precomputed_values_;

  // States for profiling support
  bool profiling_ = false;

  std::mutex mutex_;
  // TODO: remove the `compiling_` mutex and rely on `mutex_` only.
  // We wouldn't need the second mutex if we could figure out how to pass a
  // unique_lock into the lambda.
  std::mutex compiling_;

  // The heuristics and executor for the most recent kernel launch
  ExecutorLog most_recent_executor_log_;
};

//! Encodes an input set into a unique id, which is used to short-cut cache
//! entry selection in our nested cache implementation and cut off overhead.
//!
//! We implement a naive LRU cache eviction policy here, since each entry in
//! `InputsIdLookup` is attached to a static input shape/stride, and the
//! cache could grow gigantic when input shapes do not stabilize to a finite
//! set.
//!
//! \note the uniqueness of the id generated for a given input set is only
//! local to the instance of `InputsIdLookup`.
//!
class TORCH_CUDA_CU_API InputsIdLookup : public NonCopyable {
 public:
  //! Constructor where the maximum cache size is fixed during init
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
  explicit InputsIdLookup(size_t max_cache_size = 100)
      : max_cache_size_(max_cache_size) {}

  //! Struct to hold the return value of lookupId.
  struct IdLookupReturn {
    size_t id = 0;
    size_t evict_id = 0;
    bool eviction = false;
  };

  //! Encodes each input set with a unique id.
  //! The returned data structure also indicates whether an eviction has
  //! happened within the lookup cache. This is needed because the lookup
  //! shortcut is also cached in the nested `GraphCache`,
  //! `FusionExecutorCache` and `FusionExecutor`.
  //! See [ Note -- 2 level cache implementation ].
  IdLookupReturn lookupId(const at::ArrayRef<IValue>& inputs);
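
  //! A caller-side sketch (hypothetical; `evictDownstream` and `useId` stand
  //! in for caller-provided hooks, not part of this API):
  //!
  //!   auto lookup = inputs_id_lookup.lookupId(inputs);
  //!   if (lookup.eviction) {
  //!     evictDownstream(lookup.evict_id); // drop entries keyed by evict_id
  //!   }
  //!   useId(lookup.id);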

  //! Debugging API that returns the size of the lookup table
  size_t size() const {
    return encoding_lookup_.size();
  }

 private:
  // String buffer to store encoded input meta information. Reusing this
  // buffer instead of a stringstream gives us a small perf gain.
  std::string encoding_; // Note: shared state, guarded by mutex_

  // mutex_ used to guard the reused encoding_
  std::mutex mutex_;

  //! Entry stored in `encoding_lookup_` to implement LRU
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  struct EncodingEntry {
    size_t id = 0;
    std::list<std::string>::iterator lru_iter;
  };

  //! Maximum cache size for LRU
  const size_t max_cache_size_;

  //! Next available unique id; we monotonically increase `current_id_` to
  //! avoid conflicts
  size_t current_id_ = 1;

  //! Entries in the cache. This list implements the LRU policy: entries are
  //! ordered by recency of use, with the most recently used entry placed at
  //! the beginning.
  std::list<std::string> used_entry_;

  //! Map from `std::string` to a unique id `size_t` (packaged in
  //! `EncodingEntry`). We store an iterator into `used_entry_` to implement
  //! LRU.
  std::unordered_map<std::string, EncodingEntry> encoding_lookup_;
};

//! [ Note -- 2 level cache implementation ]
//!
//! We have a 2-level cache to separate concerns and keep each level simpler.
//!
//! The 2-level hierarchically nested cache handles the code generation and
//! execution of a given PyTorch IR graph that is unique in its computational
//! graph (see note on unique computational graph below).
//!
//! The nested cache structures are:
//!   a. GraphCache
//!      - GraphCache translates PyTorch IR into Fusion IR and passes it to a
//!        `FusionExecutorCache`;
//!      - GraphCache assumes all inputs comply with the profiling
//!        information, mostly tensor size & contiguity (see note on unique
//!        computational graph). This assumption is ensured at runtime by
//!        `prim::CudaFusionGuard`;
//!   b. FusionExecutorCache
//!      - has a single `Fusion`, handles kernel scheduling and passes the
//!        scheduled tensors to `FusionExecutor` to generate code;
//!      - creates `FusionExecutor` instances to handle heuristics from
//!        dynamic shapes (varying tensor sizes);
//!      - creates `FusionExecutor` instances to handle different devices;
//!      - holds the input cache `InputsIdLookup`, which allows caching of
//!        heuristics and launch parameters to reduce latency.
//!
//! * note on unique computational graph
//! In theory, a computational graph should refer only to the computational
//! nodes in a subgraph and should remain agnostic to input meta info such as
//! shape, strides, type, etc. However, the contract here is fuzzy: different
//! executors apply their own protocol for what constitutes a unique
//! computational graph. E.g., the Legacy Executor embeds tensor type &
//! dimensionality in the graph, while the Profiling Executor keeps symbolic
//! shape as well as stride order in the graph.
//!
//! Our definition of a "unique" computational graph is aligned with `Fusion`
//! IR, hence the requirement extends to meta information on input tensors.
//! This means that, for each input tensor, the following properties are
//! fixed:
//!   a) stride order;
//!   b) contiguity information;
//!   c) broadcasting semantics (size-1 or not);
//!   d) rank;
//!   e) scalar type;
//!
//!
//! [ Note -- Segmented Fusion Tentative Design ]
//! Segmentation adds an extra dimension to caching. The initial
//! implementation assumes the graph partition strategy is independent of the
//! input pattern, which we can revisit once we have more advanced graph
//! segmentation logic. Each FusionExecutorCache corresponds to one graph and
//! one graph segmentation.
//!
//!
class TORCH_CUDA_CU_API FusionExecutorCache {
 public:
  //! Creates a new fusion executor cache at a given device to handle kernel
  //! generation for dynamic sizes.
  //! The executor cache takes ownership of `fusion`.
  explicit FusionExecutorCache(std::unique_ptr<Fusion> fusion);

  //! Executes the fusion graph with given inputs, creating `FusionExecutor`
  //! instances as needed. Note this function also handles permutation &
  //! input updates outside of codegen.
  std::vector<at::Tensor> runFusionWithInputs(
      const at::ArrayRef<IValue>& inputs);
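
  //! A minimal end-to-end sketch (hypothetical; `buildAddFusion` is an
  //! assumed helper that constructs a Fusion with one input and one output,
  //! not part of this API):
  //!
  //!   FusionExecutorCache fec(buildAddFusion());
  //!   at::Tensor t0 = at::randn({8, 8}, at::kCUDA);
  //!   auto outputs = fec.runFusionWithInputs({t0});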

  Fusion* fusion() {
    return fusion_.get();
  }

  void printFusion() {
    fusion_->printMath();
  }

  FusionKernelRuntime* getMostRecentKernelRuntime() {
    return most_recent_runtime_;
  }

  // TODO: in a follow up we need a global logging structure
  // to capture runtime profiling info. We also need to define
  // a suitable profiling window / buffer size.
  ExecutorLog getMostRecentExecutorInfo() {
    TORCH_INTERNAL_ASSERT(most_recent_runtime_ != nullptr);
    return most_recent_runtime_->getMostRecentExecutorLog();
  }

  void profile(bool to_profile) {
    profiling_ = to_profile;
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->profile(to_profile);
      }
    }
  }

  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->disableLaunchParamCache();
      }
    }
  }

  //! Internal knob for profiling shape inference
  void disableKernelLaunch() {
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->disableKernelLaunch();
      }
    }
  }

  //! Converts inputs from IValue to KernelArgumentHolder; also handles cache
  //! lookup
  KernelArgumentHolder prepareInputs(const at::ArrayRef<IValue>& inputs);

  //! Queries whether there's a kernel ready to go for the given inputs
  bool isCompiled(const at::ArrayRef<IValue>& inputs);

  //! Compiles a kernel executor for the given inputs. Note: the compilation
  //! is async, so there are some restrictions on the user side, e.g. don't
  //! overlap compilation and execution for the same FusionExecutor entry.
  //! This is experimental at the moment; please use with extra caution.
  void compileFusionAsync(const at::ArrayRef<IValue>& inputs);
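
  //! An async-compilation sketch (hypothetical; assumes `inputs` is an
  //! at::ArrayRef<IValue> of CUDA tensors):
  //!
  //!   fec.compileFusionAsync(inputs);
  //!   while (!fec.isCompiled(inputs)) {
  //!     // do other work; don't run the same entry while it compiles
  //!   }
  //!   auto outputs = fec.runFusionWithInputs(inputs);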

 private:
  //! Evicts the cached shortcut entry in `code_to_fe_lookup_` as well as the
  //! cached entry in `FusionExecutor`
  void evictCache(size_t cache_id);

  FusionKernelRuntime* getKernelRuntimeFor(const KernelArgumentHolder& inputs);

 private:
  //! original un-scheduled `Fusion`;
  std::unique_ptr<Fusion> fusion_;

  //! inputs to unique_id lookup table;
  InputsIdLookup inputs_id_lookup_;

  //! Graphs after input-dependent transforms
  std::unordered_map<size_t, std::vector<std::unique_ptr<FusionKernelRuntime>>>
      kernel_runtimes_;

  //! Logging state for most recent compilation
  bool profiling_ = false;

  //! Logging state for most recent compilation
  ExecutorLog most_recent_executor_log_;

  //! short-cut for cache hit
  std::unordered_map<size_t, FusionKernelRuntime*> id_to_kernel_runtime_;

  //! Profiling info:
  //! TODO: this can be largely expanded to look at complete
  //! caching profiles. Currently it just makes it easier to test
  FusionKernelRuntime* most_recent_runtime_ = nullptr;

  //! Indices of fusion outputs that are aliased to inputs. These are used
  //! only to support in-place updates and should have been dropped before
  //! pushing outputs to the stack.
  std::set<int> aliased_output_indices_;
};

class GraphCache {
 public:
  //! TODO: we should probably change shared_ptr to unique_ptr, as we want to
  //! claim ownership of the computational graph.
  //! Creates a GraphCache on a given graph;
  //! we extract the global stride index order and translate PyTorch JIT IR
  //! to Fusion IR.
  explicit GraphCache(const std::shared_ptr<Graph>& graph);

  //! Executes the graph with given inputs
  std::vector<at::Tensor> runGraphWithInputs(
      const at::ArrayRef<IValue>& inputs);
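
  //! A top-level usage sketch (hypothetical; assumes `graph` is a profiled
  //! TorchScript subgraph and `stack_inputs` its runtime inputs):
  //!
  //!   GraphCache graph_cache(graph); // translates PyTorch IR to Fusion IR
  //!   auto outputs = graph_cache.runGraphWithInputs(stack_inputs);
  //!   // scheduling, compilation and launch are resolved by the nested
  //!   // FusionExecutorCache (see [ Note -- 2 level cache implementation ])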

 private:
  //! construct FusionExecutorCache
  void createFusion(const std::shared_ptr<Graph>& graph);

 private:
  //! FusionExecutorCache that performs scheduling and kernel execution;
  std::unique_ptr<FusionExecutorCache> fusion_executor_cache_;

  //! number of outputs
  size_t num_of_outputs_ = 0;
};

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch