#pragma once

#include <evaluator_common.h>
#include <executor.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/registry.h>

#include <c10/macros/Export.h>
#include <c10/util/ArrayRef.h>

#include <mutex>
#include <type_traits>
#include <unordered_map>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

class SegmentedGroup;
class FusionHeuristics;
class SchedulerRuntimeInfo;

// Utilities for benchmarking and profiling
struct ExecutorLog {
  std::shared_ptr<HeuristicParams> params = nullptr;
  FusionExecutor* fusion_executor = nullptr;
};
//! FusionKernelRuntime is the unified interface that takes a fusion graph
//! through caching, kernel compilation, and kernel launch.
//!
//! Each instance is also a cache entry tracked by FusionKernelRuntimeCache.
//!
//! Two types of instances can be created: one for a complete/single-kernel
//! fusion and one for a segmented/multi-kernel fusion.
//! Conceptually this is a generalization of FusionExecutor that supports both
//! single-kernel and multi-kernel caching/compiling/launching.
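//!
//! A minimal usage sketch (assumptions: `fusion` is a valid, unscheduled
//! Fusion* and `args` is a KernelArgumentHolder already populated with the
//! runtime inputs; in practice construction and launch are driven by the
//! caches below rather than done by hand):
//!
//!   FusionKernelRuntime runtime(fusion, args);
//!   // Compiles on demand (or via startAsyncCompile) and then runs either
//!   // the single kernel or every segment in its pre-determined order.
//!   std::vector<at::Tensor> outputs = runtime.runWithInput(args);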
class TORCH_CUDA_CU_API FusionKernelRuntime {
 public:
  explicit FusionKernelRuntime(
      Fusion* fusion,
      const KernelArgumentHolder& inputs);

  //! Type notations used within the FusionKernelRuntime context
  using HashType = size_t;
  using SchedulerEntryPtr = std::unique_ptr<SchedulerEntry>;

  //! Evicts internally cached parameters based on input sizes.
  //! An interface used by runtime caches.
  void evictCache(size_t input_id) {
    for (auto& fe : executors_) {
      fe.evictCache(input_id);
    }
  }

  //! Query whether a compiled kernel is already available for execution
  bool isCompiled() {
    std::unique_lock<std::mutex> lock0(mutex_, std::try_to_lock);
    std::unique_lock<std::mutex> lock1(compiling_, std::try_to_lock);
    if (!lock0.owns_lock() || !lock1.owns_lock()) {
      // compilation in progress
      return false;
    }

    return std::all_of(
        executors_.begin(), executors_.end(), [](const auto& executor) {
          return executor.compiled();
        });
  }

  //! Starts compilation asynchronously
  void startAsyncCompile(KernelArgumentHolder& inputs);

  //! Maps entries in `args` to fusion inputs.
  //! Note that this function also pushes extra entries, such as dimension
  //! extents, into `args` for expression evaluator binding, so consider
  //! `args` polluted after this call and use it with caution.
  void mapFusionInputsToArgs(
      std::unordered_map<Val*, const ArgAbstract*>& tensor_map,
      KernelArgumentHolder& args);

  //! Unified interface to run the managed kernels with the given inputs
  std::vector<at::Tensor> runWithInput(KernelArgumentHolder& args);

  //! Turn profiling on/off
  void profile(bool to_profile = true) {
    profiling_ = to_profile;
  }

  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& executor : executors_) {
      executor.disableLaunchParamCache();
    }
  }

  //! Internal knob for profiling shape inference
  void disableKernelLaunch() {
    for (auto& executor : executors_) {
      executor.setExecuteKernelFlag(false);
    }
  }

  //! Returns whether this runtime is segmented
  bool isSegmented() {
    return is_segmented_;
  }

  //! Returns the fusion segments if applicable
  SegmentedFusion* fusionSegments() {
    return segmented_fusion_.get();
  }

  //! Returns the list of heuristics in this runtime
  FusionHeuristics* schedulerHeuristics() {
    return heuristics_.get();
  }

  //! Return the most recently used executor, corresponding to the
  //! most recent kernel launch.
  //! TODO: have an interface for grabbing all recent logs. Need to put a
  //! buffer space for recent logs
  ExecutorLog getMostRecentExecutorLog() {
    TORCH_INTERNAL_ASSERT(
        profiling_, "Executor log is only produced in profiling mode");
    return most_recent_executor_log_;
  }

  // Tries to compute heuristics based on the SegmentedFusion managed
  // in this kernel runtime, returning nullopt if any segment cannot be
  // scheduled or the parameters don't match
  using HeuristicsPtr = std::unique_ptr<FusionHeuristics>;
  c10::optional<HeuristicsPtr> getMaybeHeuristicsFor(
      const KernelArgumentHolder& args);

  //! Copy the launch params given in the parameter heuristics to prepare
  //! for a kernel launch with new input dimensions but the same heuristics
  void updateHeuristicsLaunchParams(FusionHeuristics* update_heuristics);
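
  //! A sketch of the expected caller pattern for the two methods above
  //! (hypothetical `runtime` and `args`; this mirrors how a cache would
  //! probe an existing runtime for reuse):
  //!
  //!   if (auto heuristics = runtime->getMaybeHeuristicsFor(args)) {
  //!     // Same schedulers apply; only refresh the launch parameters.
  //!     runtime->updateHeuristicsLaunchParams(heuristics->get());
  //!   } else {
  //!     // Heuristics don't match; a new FusionKernelRuntime is needed.
  //!   }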

 private:
  //! Interface to run a single kernel, either the one kernel of a
  //! single-kernel fusion, or the kernel for a SegmentedGroup in a segmented
  //! fusion. Returns the kernel outputs.
  std::vector<at::Tensor> runKernelWithInput(
      KernelArgumentHolder& args,
      SegmentedGroup* sg);

  //! Interface to compile a single kernel, either the one kernel of a
  //! single-kernel fusion, or the kernel for a SegmentedGroup in a segmented
  //! fusion. Returns the kernel outputs as tensors that don't own memory.
  KernelArgumentHolder compileKernel(
      const KernelArgumentHolder& args,
      SegmentedGroup* sg);

  //! Interface to run the whole graph in a segmented fusion and return the
  //! complete fusion outputs.
  std::vector<at::Tensor> runMultiKernelWithInput(
      const at::ArrayRef<IValue>& inputs,
      size_t input_id);

  //! Access the list of schedulers maintained in this runtime instance
  const std::vector<SchedulerEntryPtr>& schedulers();

  void prepareRuntimeOrder();

 private:
  //! Entries indexed by groupID:
  //! Executors holding compiled kernels
  std::vector<FusionExecutor> executors_;

  //! Heuristics object holding scheduler entries for all segments
  std::unique_ptr<FusionHeuristics> heuristics_;

  // Whether this runtime instance is for a single-kernel fusion (false) or a
  // segmented fusion (true).
  bool is_segmented_ = true;

  //! Multi-kernel fusion segments, when applicable
  std::unique_ptr<SegmentedFusion> segmented_fusion_ = nullptr;

  //! Pre-allocated runtime workspace to speed up kernel launch preparation.
  struct RuntimeWorkSpace {
    //! Pre-determined order to run the segmented groups
    std::vector<SegmentedGroup*> group_run_order;

    //! Pre-determined order to bind tensor input metadata
    std::vector<Val*> group_extent_binding_order;
  } runtime_workspace_;

  //! Utility to speed up value evaluation at runtime
  std::unique_ptr<FusionPrecomputedValues> precomputed_values_;

  // States for profiling support
  bool profiling_ = false;

  std::mutex mutex_;
  // TODO: remove the `compiling_` mutex and rely on `mutex_` only.
  // We don't need the second mutex if we could figure out how to pass a
  // unique_lock into the lambda.
  std::mutex compiling_;

  // The heuristics and executor for the most recent kernel launch
  ExecutorLog most_recent_executor_log_;
};

//! Encodes an input set to a unique id, which is used to short-cut cache
//! entry selection in our nested cache implementation to cut off overhead.
//!
//! We have implemented a naive LRU cache eviction policy here, since each
//! entry in `InputsIdLookup` is attached to a static input shape/stride, and
//! the cache could grow gigantic when input shapes do not stabilize to a
//! finite set.
//!
//! \note the uniqueness of the id generated for a given input set is only
//! local to the instance of `InputsIdLookup`.
//!
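//! A minimal usage sketch (assuming `inputs` is an at::ArrayRef<IValue>
//! holding the fusion inputs; the returned id is what the nested caches key
//! on):
//!
//!   InputsIdLookup lookup(/*max_cache_size=*/100);
//!   auto entry = lookup.lookupId(inputs);
//!   size_t unique_id = entry.id;
//!   if (entry.eviction) {
//!     // entry.evict_id fell out of the LRU window; downstream caches keyed
//!     // on it should evict their corresponding entries as well.
//!   }
//!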
class TORCH_CUDA_CU_API InputsIdLookup : public NonCopyable {
 public:
  //! constructor where maximum cache size is fixed during init
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
  explicit InputsIdLookup(size_t max_cache_size = 100)
      : max_cache_size_(max_cache_size){};

  //! struct to hold the return value of lookupId.
  struct IdLookupReturn {
    size_t id = 0;
    size_t evict_id = 0;
    bool eviction = false;
  };

  //! Encodes each input set with a unique id.
  //! The returned data structure also indicates whether an eviction has
  //! happened within the lookup cache. This is needed because the lookup
  //! shortcut is also cached in the nested `GraphCache`,
  //! `FusionExecutorCache` and `FusionExecutor`.
  //! See [ Note -- 2 level cache implementation ].
  IdLookupReturn lookupId(const at::ArrayRef<IValue>& inputs);

  //! debugging API that returns the size of the lookup table
  size_t size() const {
    return encoding_lookup_.size();
  }

 private:
  // String buffer to store encoded input meta information. Reusing the
  // buffer instead of a stringstream gives us a small perf gain.
  std::string encoding_; // Note: shared state, guarded by mutex_

  // mutex_ used to guard the reused encoding_
  std::mutex mutex_;

  //! entry stored in `encoding_lookup_` to implement LRU
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  struct EncodingEntry {
    size_t id = 0;
    std::list<std::string>::iterator lru_iter;
  };

  //! maximum cache size for LRU
  const size_t max_cache_size_;

  //! next available unique id; we monotonically increase `current_id_` to
  //! avoid conflicts
  size_t current_id_ = 1;

  //! entries in the cache, used to implement LRU: entries in the list are
  //! ordered by recent usage (the most recently used entry is placed at the
  //! beginning)
  std::list<std::string> used_entry_;

  //! map from `std::string` to a unique id `size_t` (packaged in
  //! `EncodingEntry`). We store an iterator into `used_entry_` to implement
  //! LRU.
  std::unordered_map<std::string, EncodingEntry> encoding_lookup_;
};

//! [ Note -- 2 level cache implementation ]
//!
//! We have a two-level cache as a separation of function to keep each level
//! simpler.
//!
//! The two-level, hierarchically nested cache handles the code generation
//! and execution of a given PyTorch IR graph that is unique in its
//! computational graph (see the note on unique computational graphs below).
//!
//! The nested cache structures are:
//!     a. GraphCache
//!        - GraphCache translates PyTorch IR into Fusion IR and passes it to
//!          a `FusionExecutorCache`;
//!        - GraphCache assumes all inputs comply with the profiling
//!          information, mostly tensor size & contiguity (see the note on
//!          unique computational graphs). The assumption is assured at
//!          runtime by `prim::CudaFusionGuard`;
//!     b. FusionExecutorCache
//!        - has a single `Fusion`; FusionExecutorCache handles kernel
//!          scheduling and passes scheduled tensors to `FusionExecutor` to
//!          generate code;
//!        - creates `FusionExecutor` instances to handle heuristics from
//!          dynamic shapes (varying tensor sizes);
//!        - creates `FusionExecutor` instances to handle different devices;
//!        - holds the input cache `InputsIdLookup`, which allows caching of
//!          heuristics and launch parameters to reduce latency.
//!
//! * note on unique computational graph
//! In theory, a computational graph should refer only to the computational
//! nodes in a subgraph and should remain agnostic to input meta info, like
//! shape, strides, type, etc. However, the contract right here is fuzzy.
//! Different executors apply their own protocol of what is a unique
//! computational graph. e.g. the Legacy Executor embeds tensor type &
//! dimensionality in the graph, while the Profiling Executor keeps symbolic
//! shape as well as stride order in the graph.
//!
//! Our definition of a "unique" computational graph is aligned with `Fusion`
//! IR, hence the requirement extends to meta information on input tensors.
//! This means that, for each input tensor, the following properties are
//! fixed:
//!     a) stride order;
//!     b) contiguity information;
//!     c) broadcasting semantics (size-1 or not);
//!     d) rank;
//!     e) scalar type;
//!
//!
//! [ Note -- Segmented Fusion Tentative Design ]
//! Segmentation adds an extra dimension in caching. The initial
//! implementation assumes the graph partition strategy is independent of the
//! input pattern, which we can revisit once we have more advanced graph
//! segmentation logic. Each FusionExecutorCache corresponds to one graph and
//! one graph segmentation.
//!
//!
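//! A minimal usage sketch (assuming `fusion` is a std::unique_ptr<Fusion>
//! built elsewhere, e.g. by GraphCache, and `inputs` holds the runtime
//! IValues):
//!
//!   FusionExecutorCache executor_cache(std::move(fusion));
//!   // Scheduling, compilation and kernel-runtime selection all happen
//!   // behind this call, keyed on the encoded input id.
//!   auto outputs = executor_cache.runFusionWithInputs(inputs);
//!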
class TORCH_CUDA_CU_API FusionExecutorCache {
 public:
  //! create a new fusion executor cache at a given device to handle kernel
  //! generation of dynamic sizes;
  //! the fusion executor cache takes ownership of `fusion`
  explicit FusionExecutorCache(std::unique_ptr<Fusion> fusion);

  //! Execute the fusion graph with given inputs, creating `FusionExecutor`s
  //! as needed. Note this function also handles permutation & input updates
  //! outside of codegen.
  std::vector<at::Tensor> runFusionWithInputs(
      const at::ArrayRef<IValue>& inputs);

  Fusion* fusion() {
    return fusion_.get();
  }

  void printFusion() {
    fusion_->printMath();
  }

  FusionKernelRuntime* getMostRecentKernelRuntime() {
    return most_recent_runtime_;
  }

  // TODO: in a follow-up we need a global logging structure
  // to capture runtime profiling info. We also need to define
  // a suitable profiling window / buffer size.
  ExecutorLog getMostRecentExecutorInfo() {
    TORCH_INTERNAL_ASSERT(most_recent_runtime_ != nullptr);
    return most_recent_runtime_->getMostRecentExecutorLog();
  }
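
  //! A profiling sketch (hypothetical `executor_cache` and `inputs`; the log
  //! only reflects the most recent launch, see the TODO above):
  //!
  //!   executor_cache.profile(true);
  //!   executor_cache.runFusionWithInputs(inputs);
  //!   ExecutorLog log = executor_cache.getMostRecentExecutorInfo();
  //!   // log.params / log.fusion_executor describe the launched kernel.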

  void profile(bool to_profile) {
    profiling_ = to_profile;
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->profile(to_profile);
      }
    }
  }

  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->disableLaunchParamCache();
      }
    }
  }

  //! Internal knob for profiling shape inference
  void disableKernelLaunch() {
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->disableKernelLaunch();
      }
    }
  }

  //! converts inputs from IValue to KernelArgumentHolder; also handles the
  //! cache lookup
  KernelArgumentHolder prepareInputs(const at::ArrayRef<IValue>& inputs);

  //! query whether there's a kernel ready to go for the given inputs
  bool isCompiled(const at::ArrayRef<IValue>& inputs);

  //! compile a kernel executor for the given inputs. Note: the compilation
  //! is async, so there are some restrictions on the user side, e.g. don't
  //! overlap compilation and execution for the same FusionExecutor entry.
  //! This is experimental at the moment, please use with extra caution.
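  //!
  //! A sketch of the intended pattern (hypothetical `executor_cache` and
  //! `inputs`; polling isCompiled() is one way to avoid overlapping
  //! compilation with execution):
  //!
  //!   executor_cache.compileFusionAsync(inputs);
  //!   while (!executor_cache.isCompiled(inputs)) {
  //!     std::this_thread::sleep_for(std::chrono::milliseconds(1));
  //!   }
  //!   auto outputs = executor_cache.runFusionWithInputs(inputs);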
  void compileFusionAsync(const at::ArrayRef<IValue>& inputs);

 private:
  //! evict the cached shortcut entry in `code_to_fe_lookup_` as well as the
  //! cached entry in `FusionExecutor`
  void evictCache(size_t cache_id);

  FusionKernelRuntime* getKernelRuntimeFor(const KernelArgumentHolder& inputs);

 private:
  //! original un-scheduled `Fusion`;
  std::unique_ptr<Fusion> fusion_;

  //! inputs to unique_id lookup table;
  InputsIdLookup inputs_id_lookup_;

  //! Graphs after input-dependent transforms
  std::unordered_map<size_t, std::vector<std::unique_ptr<FusionKernelRuntime>>>
      kernel_runtimes_;

  //! Logging state for most recent compilation
  bool profiling_ = false;

  //! Logging state for most recent compilation
  ExecutorLog most_recent_executor_log_;

  //! short-cut for cache hit
  std::unordered_map<size_t, FusionKernelRuntime*> id_to_kernel_runtime_;

  //! Profiling info:
  //! TODO: this can be largely expanded to look at complete
  //! caching profiles. Currently it just makes it easier to test
  FusionKernelRuntime* most_recent_runtime_ = nullptr;

  //! indices of fusion outputs that are aliased to inputs. These are used
  //! only to support in-place updates and should have been dropped before
  //! pushing outputs to the stack.
  std::set<int> aliased_output_indices_;
};

class GraphCache {
 public:
  //! TODO: we should probably change shared_ptr to unique_ptr, as we want to
  //! claim ownership of the computational graph.
  //! Create a GraphCache on a given graph;
  //! we extract the global stride index order and translate PyTorch JIT IR
  //! to Fusion IR.
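  //!
  //! A minimal usage sketch (assuming `graph` is the std::shared_ptr<Graph>
  //! whose inputs comply with the profiling information guarded by
  //! prim::CudaFusionGuard, and `stack` holds its inputs as IValues):
  //!
  //!   GraphCache graph_cache(graph);
  //!   auto outputs = graph_cache.runGraphWithInputs(
  //!       at::ArrayRef<IValue>(stack));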
  explicit GraphCache(const std::shared_ptr<Graph>& graph);

  //! execute the graph with given inputs
  std::vector<at::Tensor> runGraphWithInputs(
      const at::ArrayRef<IValue>& inputs);

 private:
  //! construct FusionExecutorCache
  void createFusion(const std::shared_ptr<Graph>& graph);

 private:
  //! FusionExecutorCache that performs scheduling and kernel execution;
  std::unique_ptr<FusionExecutorCache> fusion_executor_cache_;

  //! number of outputs
  size_t num_of_outputs_ = 0;
};

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch