1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
17#define GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
18
19#include "glow/Backends/BackendOptions.h"
20#include "glow/Graph/PlaceholderBindings.h"
21#include "glow/Quantization/Base/Base.h"
22#include "glow/Support/Error.h"
23
24namespace glow {
25namespace runtime {
26struct PartitionConfig;
27struct PrePartitionedConfig;
28class DeferredWeightLoader;
29} // namespace runtime
30
/// Map from Placeholders to their original name and index in the proto that
/// loaded them. Used to keep around info from when we import a proto so that
/// we can export it again later on.
34using LoadedPlaceholderNameMap =
35 std::unordered_map<const Placeholder *, std::pair<std::string, unsigned>>;
36
/// Map from the name of the original op that some quantization parameters were
/// loaded from to those associated quantization parameters.
39using OriginNameToTQPMap =
40 std::unordered_map<std::string, TensorQuantizationParams>;
41
42/// Configuration for different precision modes.
/// Configuration for different precision modes (profiling, quantization, and
/// float16 conversion) applied while compiling a Function.
struct PrecisionConfiguration {
  /// Enum for what kind of transformation should be done for Quantization.
  enum class QuantizationMode {
    None,     /// Perform no transformations for quantization.
    Quantize, /// Quantize the graph using previously gathered statistics.
    Profile,  /// Add profiling nodes for quantization statistics gathering.
  } quantMode{QuantizationMode::None};

  /// Configuration for Profiling.
  quantization::ProfilingConfiguration profConfig;

  /// Configuration for Quantization.
  quantization::QuantizationConfiguration quantConfig;

  /// Enum for what kind of float16 format should be used.
  enum class Float16Format {
    None,     /// No float16 format should be used.
    FP16,     /// FP16 format for float16 should be used.
    BFloat16, /// BFloat16 format for float16 should be used.
  } float16Format{
      Float16Format::FP16}; /// If convertToFp16, float16 format to be used.

  /// Whether to convert the FloatTy to Float16Ty in the Function.
  bool convertToFP16{false};

  /// Whether to convert UInt8FusedQTy to UInt8FusedFP16QTy in the Function.
  bool convertFusedToFP16{false};

  /// Whether to convert UInt4FusedFP16QTy to UInt8FusedFP16QTy in the
  /// Function, i.e. widen the 4-bit fused rowwise data to 8-bit while keeping
  /// FP16 scales/offsets. (Scale/offset precision changes are handled by the
  /// *ToFP32 options below.)
  bool convert4BitFusedTo8Bit{false};

  /// Whether to convert UInt8FusedFP16QTy to UInt8FusedQTy in the Function,
  /// i.e. convert the fused FP16 scales/offsets back to FP32.
  bool convert8BitFusedToFP32{false};

  /// Whether to convert UInt4FusedFP16QTy to UInt4FusedQTy in the Function,
  /// i.e. convert the fused FP16 scales/offsets back to FP32.
  bool convert4BitFusedToFP32{false};

  /// Whether to convert indices in FusedRowwiseSLWS to Int64ITy.
  bool convertIndicesToInt64{false};

  /// If convertToFP16, whether to convert input Placeholders.
  bool convertPlaceholdersToFP16{false};

  /// If convertToFP16, whether to convert Constants.
  bool convertConstantsToFP16{false};

  /// If convertToFp16, whether to skip converting the bias from fp32 to fp16
  /// in FC.
  bool skipBiasFp32tofp16Convert{false};

  /// If convertToFP16, whether to clip out-of-range FP values to the min/max
  /// of fp16.
  bool clipFP16{false};

  /// If clipFP16, whether to skip clipping inputs of Nodes.
  bool clipFP16SkipInputs{false};

  /// Whether to force FP16 accumulation for the SLS family of ops. Enabled by
  /// default.
  bool forceFP16AccumSLS{true};

  /// Used during Quantization and convertToFP16 to keep the original precision
  /// of specific node kinds (i.e. quantization/FP16 conversion would be skipped
  /// for any node kinds found here). Used during profiling to prevent nodes
  /// from being lowered before instrumenting the graph (e.g. do not lower group
  /// convolutions for profiling; see `-do-not-lower-nodes-for-profiling` in
  /// docs/Quantization.md).
  KindSet precisionModeKindSet;

  /// Whether to use the precisionModeKindSet as a whitelist instead of the
  /// default blacklist. Currently only supported for convertToFP16; this is
  /// checked by CompilationContext::verify().
  bool useSetAsWhitelist{false};

  /// Pointer to a map of loader names to loaded quant params. Not owned here.
  OriginNameToTQPMap *originNameToTQPMap{nullptr};

  /// If true, then discard original quantization params that are loaded, to
  /// instead track origin of quantization params in \ref originNameToTQPMap.
  bool loadUniquedDummyQParams{false};

  /// If true, when scales for qparams are loaded, they are clipped to
  /// kMinScaleFP16 if below kMinScaleFP16.
  bool zeroScaleFP16Clip{false};

  /// If true, then the model that is loaded is expected to have been originally
  /// serialized with dummy quantization parameters, and was replaced with
  /// actual quantization parameters when loaded in this compilation context.
  bool replaceDummyTQPs{false};

  /// If true, then we can safely assume that all qparams (even dummy qparams)
  /// are clipped inside the FP16 range.
  bool clipQuantRangeToFP16{false};

  /// Converts a float16 \p format into an ElemKind.
  /// \returns Float16Ty for FP16 and BFloat16Ty for BFloat16; any other value
  /// (including Float16Format::None, which has no corresponding ElemKind) is a
  /// programmer error and aborts via llvm_unreachable.
  static ElemKind getElementType(Float16Format format) {
    switch (format) {
    case Float16Format::FP16:
      return ElemKind::Float16Ty;
    case Float16Format::BFloat16:
      return ElemKind::BFloat16Ty;
    default:
      llvm_unreachable("Unknown float16 format");
    }
  }
};
146
147using QuantizationMode = PrecisionConfiguration::QuantizationMode;
148
149/// Options relevant to optimizations during compilation.
/// Options relevant to optimizations during compilation.
struct OptimizationOptions {
  /// Only lower, i.e. skip optimizations and precision transformations. Used
  /// for testing.
  llvm::SmallSet<Function *, 1> onlyLowerFuns;

  /// If true, perform compile-time computation of constant operations.
  bool enableConstantFolding{true};

  /// If true, perform compile-time deduplication of Constants.
  bool enableConstantDeduplication{true};

  /// For all Splats in the Function being optimized, if they are used by any
  /// Nodes listed in this set, then they will be materialized into Constants
  /// during Constant Folding. Populated with ConvolutionNodeKind by the
  /// default ctor below.
  KindSet materializeSplatsUsedBySet;

  /// If true, before any Function optimization, all the Constants will be
  /// temporarily replaced by Placeholders, preventing the Constants from being
  /// modified during the normal optimization pipeline. The original Constants
  /// will be put back in place automatically afterward, and then Constant
  /// Folding will be run.
  bool delayAndRecordConstantModification{false};

  /// A set used to hold all temporary PHs that were swapped in for real
  /// Constants when delayAndRecordConstantModification is set.
  std::unordered_set<Placeholder *> tempPHsForConstants;

  /// If true, then there will be no error checking for backend support during
  /// the optimization pipeline. Expected that the caller will check if desired
  /// later on.
  bool skipBackendSupportCheck{false};

  /// If true, this will merge ConvertTo and Quantize nodes into inputs and
  /// outputs of the Function. This means modifying the types of Placeholders
  /// and SaveNodes if they have a corresponding ElemKind conversion (ConvertTo,
  /// Quantize, Dequantize nodes). Note that this must be accompanied by
  /// modifying the Tensors backing Placeholders at runtime.
  bool foldElemKindConversionIntoIO{false};

  /// If true this will fold convertTo and Quantize nodes into only static
  /// placeholders. The conversion of the Tensors will be handled by the
  /// provisioner.
  bool foldStaticPlaceholderConversions{false};

  /// If true, this will direct the partitioner to use the SparseNN
  /// partitioning scheme.
  bool useSparseNNPartitioningScheme{false};

  /// If true, SparseNN partitioning scheme will add extra concats to the
  /// SLS partition for more efficient inter-partition transfers.
  bool sparseNNPartitioningAddSLSConcats{false};

  /// If true, SparseNN partitioning scheme will balance SLS tables across
  /// cards using a performance model.
  bool sparseNNPartitioningBalancePerfModel{false};

  /// If true, SparseNN partitioning scheme will move Layer Normalization
  /// nodes immediately following SLS into SLS partitions.
  bool sparseNNPartitioningPairLNWithSLS{false};

  /// If true, SparseNN partitioning scheme will move Tile
  /// nodes immediately following SLS for user embeddings into SLS partitions.
  bool sparseNNPartitioningPairTileWithSLS{false};

  /// SparseNN partitioning scheme will move nodes specified
  /// in a comma-separated string which immediately follow SLS nodes into SLS
  /// partitions. For example, to move Tanh and Concat, use "Tanh,Concat".
  std::string sparseNNPartitioningPairSLSWith{""};

  /// If "Concat" and "Tanh" are specified in sparseNNPartitioningPairSLSWith,
  /// this will split large Concats going into a Tanh sink to the specified
  /// size before moving them into SLS partitions.
  unsigned int sparseNNPartitioningConcatSplitSize{1};

  /// The number of cards over which to split SLS tables when using SparseNN
  /// partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCards{1};

  /// The number of bytes to allocate per card for SLS tables when using
  /// the SparseNN partitioning scheme.
  unsigned int sparseNNPartitioningSchemeSLSTableKBytesPerCard{0};

  /// The number of cores to assign to SLS partition when using SparseNN
  /// partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCoresSLS{1};

  /// The number of cores to assign to non-SLS partition when using SparseNN
  /// partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCoresOther{1};

  /// The algorithm used for Placement tagging in DAG Optimizer.
  std::string DAGOptimizerPlacementTaggingAlgorithm;

  /// The algorithm used for Parallelization tagging in DAG Optimizer.
  std::string DAGOptimizerParallelizationTaggingAlgorithm;

  /// The number of parallel chunks used in DAG Optimizer parallelization.
  int32_t DAGOptimizerNumParallelChunks;

  /// If it is true (false), perform (not perform) ASAP op placement in DAG
  /// optimization; If it is not set, use acc perf GFlag APLASAPPlacement to
  /// determine whether to perform ASAP op placement or not.
  llvm::Optional<bool> enableAPLASAPPlacement;

  /// If true does int64 to int32 type demotion if backend supports for
  /// specific nodes.
  bool enableTypeDemotion{true};

  /// If true, optimizations are allowed to change quantization scale/offset.
  bool enableQuantParamChanges{true};

  /// If true, ConcatNodes will not be merged during the optimizer.
  bool skipConcatMerging{false};

  /// If true, will sink tanh below concat.
  bool sinkTanhBelowConcat{false};

  /// Default ctor.
  OptimizationOptions() {
    // By default, always materialize Splats used by ConvolutionNodes, as
    // optimizations such as BatchNorm fusion depend on it.
    materializeSplatsUsedBySet.insert(Kinded::Kind::ConvolutionNodeKind);
  }
};
274
275/// Meta information produced during the compilation. Whereas the compile
276/// options should be interpreted as input variables for the compilation, the
277/// below structure is output information produced by the compilation process.
/// Meta information produced during the compilation. Whereas the compile
/// options should be interpreted as input variables for the compilation, the
/// below structure is output information produced by the compilation process.
struct CompilationInfo {
  /// The hash of the graph before the lowering stage. Zero until it is
  /// computed during compilation.
  llvm::hash_code graphPreLowerHash{0};
};
282
283/// Context for compilation.
/// Context for compilation. Bundles everything the compilation pipeline needs:
/// bindings for profiling, partitioning configs, precision/optimization
/// options, backend options, and various serialization/AOT flags.
struct CompilationContext {
  /// Used during Profiling. Not owned by this context.
  PlaceholderBindings *bindings{nullptr};

  /// Allows the user to specify user defined partitioning. Not owned.
  runtime::PartitionConfig *partitionConfig{nullptr};

  /// Allows a loader to store a pre-partitioned config. Not owned.
  runtime::PrePartitionedConfig *prepartitionedConfig{nullptr};

  /// If true the HostManager will try to use all available devices on the
  /// host.
  bool saturateHost{false};

  /// If greater than zero, this is the number of available devices that are
  /// used when saturateHost is enabled.
  /// If saturateKDevices is zero and saturateHost is enabled, all available
  /// devices will be saturated.
  unsigned saturateKDevices{0};

  /// Number of max active requests per instance of this network.
  unsigned maxActiveRequestsPerInstance{48};

  /// Used during Quantization and Profiling. Not owned.
  LoweredInfoMap *loweredInfoMap{nullptr};

  /// Set up during model loading to map from Placeholders in the Module to the
  /// symbolic name they were loaded with from the input model.
  LoadedPlaceholderNameMap loadedPHNames;

  /// Select whether in Training or Inference mode.
  enum class CompilationMode {
    Train, /// Compile the graph in preparation for training.
    Infer, /// Compile the graph for inference. Notice that this operation
           /// changes the graph in a way that is not reversible.
    NumCompilationModes, /// Used to count the number of CompilationModes.
  } compMode{CompilationMode::Infer};

  /// Options for the Backend to use.
  BackendOptions backendOpts;

  /// Options for the optimizations to use.
  OptimizationOptions optimizationOpts;

  /// Configuration for different precision modes.
  PrecisionConfiguration precisionConfig;

  /// Information produced during compilation.
  CompilationInfo info;

  /// How to annotate the compilation log filename.
  std::string compilationLogPrefix{"glow"};

  /// Pointer to deferredWeightLoader object, this is used for large model
  /// support. Not owned.
  runtime::DeferredWeightLoader *deferredWeightLoader{nullptr};

  /// Whether to print out issues/logging during compilation. Used for example
  /// to disable printing issues encountered during ConstantFolding.
  bool verboseCompile{true};

  /// Call dumpDag on each Function passed to the backend for compilation.
  bool dumpFinalGraph = false;

  /// Path where the dumped graphs should go, default "./".
  std::string dumpGraphPath = "./";

  /// Whether to skip stripping the module.
  bool skipModuleStrip{false};

  /// Enables Peer to Peer Tensor optimization.
  bool enableP2P{false};

  /// Enables Device Resident Tensor optimization.
  bool enableDRT{false};

  /// Number of times a function should be replicated on a device. This is
  /// enabled for single partition networks. For advanced replication setups
  /// use user-defined partitioning.
  unsigned replicationCount{1};

  /// Whether to serialize the DAG that has been optimized and partitioned.
  bool serializeCompiledDAG{false};

  /// Whether to return the Glow AOT serialized ONNX model as a string;
  /// If false, dump the model as an ONNX model file in local;
  /// If true, return the model string to glowAOTSerializationModelStrPtr;
  /// This is for Glow AOT compilation.
  bool returnGlowSerializedModelStr{false};

  /// Placeholder for the returned Glow AOT serialized ONNX model string.
  std::shared_ptr<std::string> glowAOTSerializationModelStrPtr{nullptr};

  /// Whether to use Zip mode to serialize the DAG that has been optimized and
  /// partitioned.
  bool useZipModeForSerializeCompiledDAG{false};

  /// Whether to save constant data into the serialized DAG.
  bool saveConstantInSerializeCompiledDAG{false};

  /// Whether to call the DAG optimizer after the DAG is created in
  /// HostManager.
  bool callDAGOptimizer{false};

  /// Whether to use AOT mode for DAG optimizer.
  bool useDAGOptimizerAOTMode{false};

  /// Whether we're loading a model that has been AOT optimized.
  bool loadingAOTModel{false};

  /// Whether to skip provisioning, e.g. if we're doing AOT optimization.
  bool skipProvisioning{false};

  /// Static placeholder type info used for AOT optimization.
  std::map<std::string, Type> staticPlaceholderTypesForAOT;

  /// Map from function name to its corresponding compiled serialized
  /// functions; Used in deserialization.
  std::unordered_map<std::string, std::shared_ptr<std::vector<char>>>
      nameToFunctions;

  /// Construct a context, optionally wiring in the profiling \p bindings_ and
  /// the quantization/profiling \p loweredInfoMap_ (both may be null; neither
  /// is owned).
  CompilationContext(PlaceholderBindings *bindings_ = nullptr,
                     LoweredInfoMap *loweredInfoMap_ = nullptr)
      : bindings(bindings_), loweredInfoMap(loweredInfoMap_) {}

  /// \returns an error if the CompilationContext is malformed for whatever
  /// configuration it is set up for, otherwise returns success.
  Error verify() const {
    // A whitelist of node kinds only makes sense when FP16 conversion is on.
    RETURN_ERR_IF_NOT(!precisionConfig.useSetAsWhitelist ||
                          precisionConfig.convertToFP16,
                      "Can only use the precisionModeKindSet as a whitelist in "
                      "convertToFP16 mode.");

    // Per-mode requirements: Profiling needs bindings and a lowered-info map
    // (and is incompatible with FP16 conversion); Quantization needs only the
    // lowered-info map.
    switch (precisionConfig.quantMode) {
    case QuantizationMode::Profile:
      RETURN_ERR_IF_NOT(bindings,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but bindings was not set.\n");

      RETURN_ERR_IF_NOT(loweredInfoMap,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but loweredInfoMap was not set.\n");

      RETURN_ERR_IF_NOT(!precisionConfig.convertToFP16,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "Converting to FP16 while profiling is unsupported.\n");
      break;

    case QuantizationMode::Quantize:
      RETURN_ERR_IF_NOT(
          loweredInfoMap, ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
          "In Quantization mode, but loweredInfoMap was not set.\n");
      break;

    case QuantizationMode::None:
      break;
    }

    // These two options are mutually exclusive: folding elem-kind conversions
    // into IO mutates Placeholders, which conflicts with delaying/recording
    // constant modification.
    RETURN_ERR_IF_NOT(!(optimizationOpts.foldElemKindConversionIntoIO &&
                        optimizationOpts.delayAndRecordConstantModification),
                      "Cannot currently perform elem kind merging into PHs "
                      "when also preventing constant modification.");

    // When serializing a compiled DAG without provisioning (AOT), either
    // delayAndRecordConstantModification (C2 flow) or
    // saveConstantInSerializeCompiledDAG (PyTorch flow) must be enabled.
    RETURN_ERR_IF_NOT(
        !(serializeCompiledDAG && skipProvisioning &&
          !optimizationOpts.delayAndRecordConstantModification &&
          !saveConstantInSerializeCompiledDAG),
        "When serializing the compiled DAG while skipping provisioning, C2 "
        "must also enable delayAndRecordConstantModification. PyTorch does not "
        "enable delayAndRecordConstantModification in this case, but "
        "saveConstantInSerializeCompiledDAG should be enabled");

    // Dummy qparams are tracked via originNameToTQPMap, so it must be set.
    RETURN_ERR_IF_NOT(
        !precisionConfig.loadUniquedDummyQParams ||
            precisionConfig.originNameToTQPMap,
        "If loading unique dummy QParams, must have valid originNameToTQPMap");

    // Clipping quant ranges to FP16 only makes sense under FP16 conversion.
    RETURN_ERR_IF_NOT(!precisionConfig.clipQuantRangeToFP16 ||
                          precisionConfig.convertToFP16,
                      "Assuming quant ranges are clipped to fp16 should only "
                      "be done along with fp16 conversion.");

    return Error::success();
  }
};
467
468using CompilationMode = CompilationContext::CompilationMode;
469
470}; // namespace glow
471
472#endif // GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
473