#pragma once

#include <evaluator_common.h>
#include <executor.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/registry.h>

#include <c10/macros/Export.h>
#include <c10/util/ArrayRef.h>

#include <mutex>
#include <type_traits>
#include <unordered_map>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

class SegmentedGroup;
class FusionHeuristics;
class SchedulerRuntimeInfo;

// Utilities for benchmarking and profiling
struct ExecutorLog {
  std::shared_ptr<HeuristicParams> params = nullptr;
  FusionExecutor* fusion_executor = nullptr;
};

//! FusionKernelRuntime is the unified interface from fusion graphs into
//! caching, compilation into kernels, and kernel launches.
//!
//! Each instance is also a cache entry tracked by FusionKernelRuntimeCache.
//!
//! Two types of instance can be created: one for a complete/single-kernel
//! fusion and one for a segmented/multi-kernel fusion.
//! Conceptually this is a generalization of FusionExecutor that supports both
//! single-kernel and multi-kernel caching/compiling/launching.
class TORCH_CUDA_CU_API FusionKernelRuntime {
 public:
  explicit FusionKernelRuntime(
      Fusion* fusion,
      const KernelArgumentHolder& inputs);

  //! Type notations within FusionKernelRuntime Context
  using HashType = size_t;
  using SchedulerEntryPtr = std::unique_ptr<SchedulerEntry>;

  //! Evicts internally cached parameters based on input sizes.
  //! An interface used by runtime caches.
  void evictCache(size_t input_id) {
    for (auto& fe : executors_) {
      fe.evictCache(input_id);
    }
  }

  //! Queries whether we already have a compiled kernel ready for execution
  bool isCompiled() {
    std::unique_lock<std::mutex> lock0(mutex_, std::try_to_lock);
    std::unique_lock<std::mutex> lock1(compiling_, std::try_to_lock);
    if (!lock0.owns_lock() || !lock1.owns_lock()) {
      // compilation in progress
      return false;
    }

    return std::all_of(
        executors_.begin(), executors_.end(), [](const auto& executor) {
          return executor.compiled();
        });
  }

  //! Starts compilation asynchronously
  void startAsyncCompile(KernelArgumentHolder& inputs);
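
  //! A minimal usage sketch (hypothetical; assumes `args` is a populated
  //! KernelArgumentHolder and `runtime` points at this object):
  //!
  //!   runtime->startAsyncCompile(args);
  //!   while (!runtime->isCompiled()) {
  //!     std::this_thread::yield(); // poll until the async compile finishes
  //!   }
  //!   auto outputs = runtime->runWithInput(args);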

  //! Maps entries in `args` to fusion inputs.
  //! Note that this function also pushes extra entries, such as dimension
  //! extents, into `args` for expression-evaluator binding, so consider
  //! `args` polluted after this call and use it with caution.
  void mapFusionInputsToArgs(
      std::unordered_map<Val*, const ArgAbstract*>& tensor_map,
      KernelArgumentHolder& args);

  //! Unified interface to run the managed kernels with given input
  std::vector<at::Tensor> runWithInput(KernelArgumentHolder& args);

  //! Turns profiling on/off
  void profile(bool to_profile = true) {
    profiling_ = to_profile;
  }

  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& executor : executors_) {
      executor.disableLaunchParamCache();
    }
  }

  //! Internal knob for profiling shape inference
  void disableKernelLaunch() {
    for (auto& executor : executors_) {
      executor.setExecuteKernelFlag(false);
    }
  }

  //! Returns whether this runtime is segmented
  bool isSegmented() {
    return is_segmented_;
  }

  //! Returns the fusion segments if applicable
  SegmentedFusion* fusionSegments() {
    return segmented_fusion_.get();
  }

  //! Returns the list of heuristics in this runtime
  FusionHeuristics* schedulerHeuristics() {
    return heuristics_.get();
  }

  //! Returns the most recently used executor, corresponding to the
  //! most recent kernel launch.
  //! TODO: have an interface for grabbing all recent logs. Needs a buffer
  //! to hold recent logs.
  ExecutorLog getMostRecentExecutorLog() {
    TORCH_INTERNAL_ASSERT(
        profiling_, "Executor log is only produced in profiling mode");
    return most_recent_executor_log_;
  }
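
  //! A profiling sketch (hypothetical; assumes `args` is a populated
  //! KernelArgumentHolder):
  //!
  //!   runtime->profile(true);
  //!   auto outputs = runtime->runWithInput(args);
  //!   ExecutorLog log = runtime->getMostRecentExecutorLog();
  //!   // log.params holds the heuristics used by the launched kernel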

  // Tries to compute heuristics based on the SegmentedFusion managed
  // by this kernel runtime; returns nullopt if any segment cannot be
  // scheduled or the parameters don't match.
  using HeuristicsPtr = std::unique_ptr<FusionHeuristics>;
  c10::optional<HeuristicsPtr> getMaybeHeuristicsFor(
      const KernelArgumentHolder& args);

  //! Copies the launch params given in the parameter heuristics to prepare
  //! for a kernel launch with a new input dimension but the same heuristics
  void updateHeuristicsLaunchParams(FusionHeuristics* update_heuristics);

 private:
  //! Interface to run a single kernel, either one kernel for single-kernel
  //! fusions, or a kernel for a SegmentedGroup in a segmented fusion. Returns
  //! the kernel outputs.
  std::vector<at::Tensor> runKernelWithInput(
      KernelArgumentHolder& args,
      SegmentedGroup* sg);

  //! Interface to compile a single kernel, either one kernel for
  //! single-kernel fusions, or a kernel for a SegmentedGroup in a segmented
  //! fusion. Returns the kernel outputs as tensors that do not own memory.
  KernelArgumentHolder compileKernel(
      const KernelArgumentHolder& args,
      SegmentedGroup* sg);

  //! Interface to run the whole graph in a segmented fusion and return the
  //! complete fusion outputs.
  std::vector<at::Tensor> runMultiKernelWithInput(
      const at::ArrayRef<IValue>& inputs,
      size_t input_id);

  //! Access the list of schedulers maintained in this runtime instance
  const std::vector<SchedulerEntryPtr>& schedulers();

  void prepareRuntimeOrder();

 private:
  //! Entries indexed by groupID:
  //! Executors holding compiled kernels
  std::vector<FusionExecutor> executors_;

  //! Heuristics object holding scheduler entries for all segments
  std::unique_ptr<FusionHeuristics> heuristics_;

  // Whether this runtime instance is for a segmented fusion (true) or a
  // single-kernel fusion (false).
  bool is_segmented_ = true;

  //! Multi-kernel fusion segments, when applicable
  std::unique_ptr<SegmentedFusion> segmented_fusion_ = nullptr;

  //! Pre-allocated runtime workspace to speed up kernel launch preparation.
  struct RuntimeWorkSpace {
    //! Pre-determined order to run the segmented groups
    std::vector<SegmentedGroup*> group_run_order;

    //! Pre-determined order to bind tensor input meta data
    std::vector<Val*> group_extent_binding_order;
  } runtime_workspace_;

  //! Utility to speed up value evaluation at runtime
  std::unique_ptr<FusionPrecomputedValues> precomputed_values_;

  // States for profiling support
  bool profiling_ = false;

  std::mutex mutex_;
  // TODO: remove the `compiling_` mutex and rely on `mutex_` only.
  // We wouldn't need the second mutex if we could figure out how to pass a
  // unique_lock into the lambda.
  std::mutex compiling_;

  // The heuristics and executor for the most recent kernel launch
  ExecutorLog most_recent_executor_log_;
};

//! Encodes an input set into a unique id, which is used to short-cut cache
//! entry selection in our nested cache implementation and cut off overhead.
//!
//! We implement a naive LRU cache eviction policy here, since each entry in
//! `InputsIdLookup` is attached to a static input shape/stride, and the
//! cache could grow gigantic when input shapes do not stabilize to a finite
//! set.
//!
//! \note the uniqueness of the id generated for a given input set is only
//! local to the instance of `InputsIdLookup`.
//!
class TORCH_CUDA_CU_API InputsIdLookup : public NonCopyable {
 public:
  //! Constructor where the maximum cache size is fixed during init
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers)
  explicit InputsIdLookup(size_t max_cache_size = 100)
      : max_cache_size_(max_cache_size) {}

  //! Struct to hold the return value of lookupId.
  struct IdLookupReturn {
    size_t id = 0;
    size_t evict_id = 0;
    bool eviction = false;
  };

  //! Encodes each input set with a unique id.
  //! The returned data structure also indicates whether an eviction has
  //! happened within the lookup cache. This is needed because the lookup
  //! shortcut is also cached in the nested `GraphCache`,
  //! `FusionExecutorCache` and `FusionExecutor`.
  //! See [ Note -- 2 level cache implementation ].
  IdLookupReturn lookupId(const at::ArrayRef<IValue>& inputs);
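
  //! A caller-side sketch (hypothetical; `evictDownstream` and `useId` stand
  //! in for caller-provided hooks, not part of this API):
  //!
  //!   auto lookup = inputs_id_lookup.lookupId(inputs);
  //!   if (lookup.eviction) {
  //!     evictDownstream(lookup.evict_id); // drop entries keyed by evict_id
  //!   }
  //!   useId(lookup.id);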

  //! Debugging API that returns the size of the lookup table
  size_t size() const {
    return encoding_lookup_.size();
  }

 private:
  // String buffer to store encoded input meta information. Reusing this
  // buffer instead of a stringstream gives us a small perf gain.
  std::string encoding_; // Note: shared state, guarded by mutex_

  // mutex_ used to guard the reused encoding_
  std::mutex mutex_;

  //! Entry stored in `encoding_lookup_` to implement LRU
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
  struct EncodingEntry {
    size_t id = 0;
    std::list<std::string>::iterator lru_iter;
  };

  //! Maximum cache size for LRU
  const size_t max_cache_size_;

  //! Next available unique id; we monotonically increase `current_id_` to
  //! avoid conflicts
  size_t current_id_ = 1;

  //! Entries in the cache. This list implements the LRU policy: entries are
  //! ordered by recency of use, with the most recently used entry placed at
  //! the beginning.
  std::list<std::string> used_entry_;

  //! Map from `std::string` to a unique id `size_t` (packaged in
  //! `EncodingEntry`). We store an iterator into `used_entry_` to implement
  //! LRU.
  std::unordered_map<std::string, EncodingEntry> encoding_lookup_;
};

//! [ Note -- 2 level cache implementation ]
//!
//! We have a 2-level cache to separate concerns and keep each level simpler.
//!
//! The 2-level hierarchically nested cache handles the code generation and
//! execution of a given PyTorch IR graph that is unique in its computational
//! graph (see note on unique computational graph below).
//!
//! The nested cache structures are:
//!   a. GraphCache
//!      - GraphCache translates PyTorch IR into Fusion IR and passes it to a
//!        `FusionExecutorCache`;
//!      - GraphCache assumes all inputs comply with the profiling
//!        information, mostly tensor size & contiguity (see note on unique
//!        computational graph). This assumption is ensured at runtime by
//!        `prim::CudaFusionGuard`;
//!   b. FusionExecutorCache
//!      - has a single `Fusion`, handles kernel scheduling and passes the
//!        scheduled tensors to `FusionExecutor` to generate code;
//!      - creates `FusionExecutor` instances to handle heuristics from
//!        dynamic shapes (varying tensor sizes);
//!      - creates `FusionExecutor` instances to handle different devices;
//!      - holds the input cache `InputsIdLookup`, which allows caching of
//!        heuristics and launch parameters to reduce latency.
//!
//! * note on unique computational graph
//! In theory, a computational graph should refer only to the computational
//! nodes in a subgraph and should remain agnostic to input meta info such as
//! shape, strides, type, etc. However, the contract here is fuzzy: different
//! executors apply their own protocol for what constitutes a unique
//! computational graph. E.g., the Legacy Executor embeds tensor type &
//! dimensionality in the graph, while the Profiling Executor keeps symbolic
//! shape as well as stride order in the graph.
//!
//! Our definition of a "unique" computational graph is aligned with `Fusion`
//! IR, hence the requirement extends to meta information on input tensors.
//! This means that, for each input tensor, the following properties are
//! fixed:
//!   a) stride order;
//!   b) contiguity information;
//!   c) broadcasting semantics (size-1 or not);
//!   d) rank;
//!   e) scalar type;
//!
//!
//! [ Note -- Segmented Fusion Tentative Design ]
//! Segmentation adds an extra dimension to caching. The initial
//! implementation assumes the graph partition strategy is independent of the
//! input pattern, which we can revisit once we have more advanced graph
//! segmentation logic. Each FusionExecutorCache corresponds to one graph and
//! one graph segmentation.
//!
//!
class TORCH_CUDA_CU_API FusionExecutorCache {
 public:
  //! Creates a new fusion executor cache at a given device to handle kernel
  //! generation for dynamic sizes.
  //! The executor cache takes ownership of `fusion`.
  explicit FusionExecutorCache(std::unique_ptr<Fusion> fusion);

  //! Executes the fusion graph with given inputs, creating `FusionExecutor`
  //! instances as needed. Note this function also handles permutation &
  //! input updates outside of codegen.
  std::vector<at::Tensor> runFusionWithInputs(
      const at::ArrayRef<IValue>& inputs);
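
  //! A minimal end-to-end sketch (hypothetical; `buildAddFusion` is an
  //! assumed helper that constructs a Fusion with one input and one output,
  //! not part of this API):
  //!
  //!   FusionExecutorCache fec(buildAddFusion());
  //!   at::Tensor t0 = at::randn({8, 8}, at::kCUDA);
  //!   auto outputs = fec.runFusionWithInputs({t0});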

  Fusion* fusion() {
    return fusion_.get();
  }

  void printFusion() {
    fusion_->printMath();
  }

  FusionKernelRuntime* getMostRecentKernelRuntime() {
    return most_recent_runtime_;
  }

  // TODO: in a follow up we need a global logging structure
  // to capture runtime profiling info. We also need to define
  // a suitable profiling window / buffer size.
  ExecutorLog getMostRecentExecutorInfo() {
    TORCH_INTERNAL_ASSERT(most_recent_runtime_ != nullptr);
    return most_recent_runtime_->getMostRecentExecutorLog();
  }

  void profile(bool to_profile) {
    profiling_ = to_profile;
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->profile(to_profile);
      }
    }
  }

  //! Internal knob for profiling shape inference
  void disableLaunchParamCache() {
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->disableLaunchParamCache();
      }
    }
  }

  //! Internal knob for profiling shape inference
  void disableKernelLaunch() {
    for (auto& it : kernel_runtimes_) {
      for (auto& kernel_runtime : it.second) {
        kernel_runtime->disableKernelLaunch();
      }
    }
  }

  //! Converts inputs from IValue to KernelArgumentHolder; also handles cache
  //! lookup
  KernelArgumentHolder prepareInputs(const at::ArrayRef<IValue>& inputs);

  //! Queries whether there's a kernel ready to go for the given inputs
  bool isCompiled(const at::ArrayRef<IValue>& inputs);

  //! Compiles a kernel executor for the given inputs. Note: the compilation
  //! is async, so there are some restrictions on the user side, e.g. don't
  //! overlap compilation and execution for the same FusionExecutor entry.
  //! This is experimental at the moment; please use with extra caution.
  void compileFusionAsync(const at::ArrayRef<IValue>& inputs);
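
  //! An async-compilation sketch (hypothetical; assumes `inputs` is an
  //! at::ArrayRef<IValue> of CUDA tensors):
  //!
  //!   fec.compileFusionAsync(inputs);
  //!   while (!fec.isCompiled(inputs)) {
  //!     // do other work; don't run the same entry while it compiles
  //!   }
  //!   auto outputs = fec.runFusionWithInputs(inputs);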

 private:
  //! Evicts the cached shortcut entry in `code_to_fe_lookup_` as well as the
  //! cached entry in `FusionExecutor`
  void evictCache(size_t cache_id);

  FusionKernelRuntime* getKernelRuntimeFor(const KernelArgumentHolder& inputs);

 private:
  //! original un-scheduled `Fusion`;
  std::unique_ptr<Fusion> fusion_;

  //! inputs to unique_id lookup table;
  InputsIdLookup inputs_id_lookup_;

  //! Graphs after input-dependent transforms
  std::unordered_map<size_t, std::vector<std::unique_ptr<FusionKernelRuntime>>>
      kernel_runtimes_;

  //! Logging state for most recent compilation
  bool profiling_ = false;

  //! Logging state for most recent compilation
  ExecutorLog most_recent_executor_log_;

  //! short-cut for cache hit
  std::unordered_map<size_t, FusionKernelRuntime*> id_to_kernel_runtime_;

  //! Profiling info:
  //! TODO: this can be largely expanded to look at complete
  //! caching profiles. Currently it just makes it easier to test
  FusionKernelRuntime* most_recent_runtime_ = nullptr;

  //! Indices of fusion outputs that are aliased to inputs. These are used
  //! only to support in-place updates and should have been dropped before
  //! pushing outputs to the stack.
  std::set<int> aliased_output_indices_;
};

class GraphCache {
 public:
  //! TODO: we should probably change shared_ptr to unique_ptr, as we want to
  //! claim ownership of the computational graph.
  //! Creates a GraphCache on a given graph;
  //! we extract the global stride index order and translate PyTorch JIT IR
  //! to Fusion IR.
  explicit GraphCache(const std::shared_ptr<Graph>& graph);

  //! Executes the graph with given inputs
  std::vector<at::Tensor> runGraphWithInputs(
      const at::ArrayRef<IValue>& inputs);
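
  //! A top-level usage sketch (hypothetical; assumes `graph` is a profiled
  //! TorchScript subgraph and `stack_inputs` its runtime inputs):
  //!
  //!   GraphCache graph_cache(graph); // translates PyTorch IR to Fusion IR
  //!   auto outputs = graph_cache.runGraphWithInputs(stack_inputs);
  //!   // scheduling, compilation and launch are resolved by the nested
  //!   // FusionExecutorCache (see [ Note -- 2 level cache implementation ])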

 private:
  //! construct FusionExecutorCache
  void createFusion(const std::shared_ptr<Graph>& graph);

 private:
  //! FusionExecutorCache that performs scheduling and kernel execution;
  std::unique_ptr<FusionExecutorCache> fusion_executor_cache_;

  //! number of outputs
  size_t num_of_outputs_ = 0;
};

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch