1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
17#define GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
18
19#include "glow/Backends/BackendOptions.h"
20#include "glow/Graph/PlaceholderBindings.h"
21#include "glow/Quantization/Base/Base.h"
22#include "glow/Support/Error.h"
23
24namespace glow {
25namespace runtime {
26struct PartitionConfig;
27struct PrePartitionedConfig;
28class DeferredWeightLoader;
29} // namespace runtime
30
/// Map from Placeholders to their original name and index in the proto that
/// loaded them. Used to keep around info from when we import a proto so that
/// we can export it again later on.
34using LoadedPlaceholderNameMap =
35 std::unordered_map<const Placeholder *, std::pair<std::string, unsigned>>;
36
/// Map from the name of the original op that some quantization parameters were
/// loaded from to those associated quantization parameters.
39using OriginNameToTQPMap =
40 std::unordered_map<std::string, TensorQuantizationParams>;
41
42/// Configuration for different precision modes.
/// Configuration for different precision modes (profiling, quantization, and
/// float16 conversion) applied while compiling a Function.
struct PrecisionConfiguration {
  /// Enum for what kind of transformation should be done for Quantization.
  enum class QuantizationMode {
    None,     /// Perform no transformations for quantization.
    Quantize, /// Quantize the graph using previously gathered statistics.
    Profile,  /// Add profiling nodes for quantization statistics gathering.
  } quantMode{QuantizationMode::None};

  /// Configuration for Profiling.
  quantization::ProfilingConfiguration profConfig;

  /// Configuration for Quantization.
  quantization::QuantizationConfiguration quantConfig;

  /// Enum for what kind of float16 format should be used.
  enum class Float16Format {
    None,     /// No float16 format should be used.
    FP16,     /// FP16 format for float16 should be used.
    BFloat16, /// BFloat16 format for float16 should be used.
  } float16Format{
      Float16Format::FP16}; /// If convertToFp16, float16 format to be used.

  /// Whether to convert the FloatTy to Float16Ty in the Function.
  bool convertToFP16{false};

  /// Whether to convert UInt8FusedQTy to UInt8FusedFP16QTy in the Function.
  bool convertFusedToFP16{false};

  /// Whether to convert UInt4FusedFP16QTy to UInt8FusedFP16QTy in the
  /// Function, i.e. widen the 4-bit fused rowwise data to 8-bit while keeping
  /// FP16 scales/offsets. (Scale/offset precision changes are handled by the
  /// *ToFP32 options below.)
  bool convert4BitFusedTo8Bit{false};

  /// Whether to convert UInt8FusedFP16QTy to UInt8FusedQTy in the Function,
  /// i.e. convert the fused FP16 scales/offsets back to FP32.
  bool convert8BitFusedToFP32{false};

  /// Whether to convert UInt4FusedFP16QTy to UInt4FusedQTy in the Function,
  /// i.e. convert the fused FP16 scales/offsets back to FP32.
  bool convert4BitFusedToFP32{false};

  /// Whether to convert indices in FusedRowwiseSLWS to Int64ITy.
  bool convertIndicesToInt64{false};

  /// If convertToFP16, whether to convert input Placeholders.
  bool convertPlaceholdersToFP16{false};

  /// If convertToFP16, whether to convert Constants.
  bool convertConstantsToFP16{false};

  /// If convertToFp16, whether to skip converting the bias from fp32 to fp16
  /// in FC.
  bool skipBiasFp32tofp16Convert{false};

  /// If convertToFP16, whether to clip out-of-range FP values to the min/max
  /// of fp16.
  bool clipFP16{false};

  /// If clipFP16, whether to skip clipping inputs of Nodes.
  bool clipFP16SkipInputs{false};

  /// Whether to force FP16 accumulation for the SLS family of ops. Enabled by
  /// default.
  bool forceFP16AccumSLS{true};

  /// Used during Quantization and convertToFP16 to keep the original precision
  /// of specific node kinds (i.e. quantization/FP16 conversion would be skipped
  /// for any node kinds found here). Used during profiling to prevent nodes
  /// from being lowered before instrumenting the graph (e.g. do not lower group
  /// convolutions for profiling; see `-do-not-lower-nodes-for-profiling` in
  /// docs/Quantization.md).
  KindSet precisionModeKindSet;

  /// Whether to use the precisionModeKindSet as a whitelist instead of the
  /// default blacklist. Currently only supported for convertToFP16; this is
  /// checked by CompilationContext::verify().
  bool useSetAsWhitelist{false};

  /// Pointer to a map of loader names to loaded quant params. Not owned here.
  OriginNameToTQPMap *originNameToTQPMap{nullptr};

  /// If true, then discard original quantization params that are loaded, to
  /// instead track origin of quantization params in \ref originNameToTQPMap.
  bool loadUniquedDummyQParams{false};

  /// If true, when scales for qparams are loaded, they are clipped to
  /// kMinScaleFP16 if below kMinScaleFP16.
  bool zeroScaleFP16Clip{false};

  /// If true, then the model that is loaded is expected to have been originally
  /// serialized with dummy quantization parameters, and was replaced with
  /// actual quantization parameters when loaded in this compilation context.
  bool replaceDummyTQPs{false};

  /// If true, then we can safely assume that all qparams (even dummy qparams)
  /// are clipped inside the FP16 range.
  bool clipQuantRangeToFP16{false};

  /// Converts a float16 \p format into an ElemKind.
  /// \returns Float16Ty for FP16 and BFloat16Ty for BFloat16; any other value
  /// (including Float16Format::None, which has no corresponding ElemKind) is a
  /// programmer error and aborts via llvm_unreachable.
  static ElemKind getElementType(Float16Format format) {
    switch (format) {
    case Float16Format::FP16:
      return ElemKind::Float16Ty;
    case Float16Format::BFloat16:
      return ElemKind::BFloat16Ty;
    default:
      llvm_unreachable("Unknown float16 format");
    }
  }
};
146
147using QuantizationMode = PrecisionConfiguration::QuantizationMode;
148
149/// Options relevant to optimizations during compilation.
/// Options relevant to optimizations during compilation.
struct OptimizationOptions {
  /// Only lower, i.e. skip optimizations and precision transformations. Used
  /// for testing.
  llvm::SmallSet<Function *, 1> onlyLowerFuns;

  /// If true, perform compile-time computation of constant operations.
  bool enableConstantFolding{true};

  /// If true, perform compile-time deduplication of Constants.
  bool enableConstantDeduplication{true};

  /// For all Splats in the Function being optimized, if they are used by any
  /// Nodes listed in this set, then they will be materialized into Constants
  /// during Constant Folding. Populated with ConvolutionNodeKind by the
  /// default ctor below.
  KindSet materializeSplatsUsedBySet;

  /// If true, before any Function optimization, all the Constants will be
  /// temporarily replaced by Placeholders, preventing the Constants from being
  /// modified during the normal optimization pipeline. The original Constants
  /// will be put back in place automatically afterward, and then Constant
  /// Folding will be run.
  bool delayAndRecordConstantModification{false};

  /// A set used to hold all temporary PHs that were swapped in for real
  /// Constants when delayAndRecordConstantModification is set.
  std::unordered_set<Placeholder *> tempPHsForConstants;

  /// If true, then there will be no error checking for backend support during
  /// the optimization pipeline. Expected that the caller will check if desired
  /// later on.
  bool skipBackendSupportCheck{false};

  /// If true, this will merge ConvertTo and Quantize nodes into inputs and
  /// outputs of the Function. This means modifying the types of Placeholders
  /// and SaveNodes if they have a corresponding ElemKind conversion (ConvertTo,
  /// Quantize, Dequantize nodes). Note that this must be accompanied by
  /// modifying the Tensors backing Placeholders at runtime.
  bool foldElemKindConversionIntoIO{false};

  /// If true this will fold convertTo and Quantize nodes into only static
  /// placeholders. The conversion of the Tensors will be handled by the
  /// provisioner.
  bool foldStaticPlaceholderConversions{false};

  /// If true, this will direct the partitioner to use the SparseNN
  /// partitioning scheme.
  bool useSparseNNPartitioningScheme{false};

  /// If true, SparseNN partitioning scheme will add extra concats to the
  /// SLS partition for more efficient inter-partition transfers.
  bool sparseNNPartitioningAddSLSConcats{false};

  /// If true, SparseNN partitioning scheme will balance SLS tables across
  /// cards using a performance model.
  bool sparseNNPartitioningBalancePerfModel{false};

  /// If true, SparseNN partitioning scheme will move Layer Normalization
  /// nodes immediately following SLS into SLS partitions.
  bool sparseNNPartitioningPairLNWithSLS{false};

  /// If true, SparseNN partitioning scheme will move Tile
  /// nodes immediately following SLS for user embeddings into SLS partitions.
  bool sparseNNPartitioningPairTileWithSLS{false};

  /// SparseNN partitioning scheme will move nodes specified
  /// in a comma-separated string which immediately follow SLS nodes into SLS
  /// partitions. For example, to move Tanh and Concat, use "Tanh,Concat".
  std::string sparseNNPartitioningPairSLSWith{""};

  /// If "Concat" and "Tanh" are specified in sparseNNPartitioningPairSLSWith,
  /// this will split large Concats going into a Tanh sink to the specified
  /// size before moving them into SLS partitions.
  unsigned int sparseNNPartitioningConcatSplitSize{1};

  /// The number of cards over which to split SLS tables when using SparseNN
  /// partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCards{1};

  /// The number of bytes to allocate per card for SLS tables when using
  /// the SparseNN partitioning scheme.
  unsigned int sparseNNPartitioningSchemeSLSTableKBytesPerCard{0};

  /// The number of cores to assign to SLS partition when using SparseNN
  /// partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCoresSLS{1};

  /// The number of cores to assign to non-SLS partition when using SparseNN
  /// partitioning scheme.
  unsigned int sparseNNPartitioningSchemeNumCoresOther{1};

  /// The algorithm used for Placement tagging in DAG Optimizer.
  std::string DAGOptimizerPlacementTaggingAlgorithm;

  /// The algorithm used for Parallelization tagging in DAG Optimizer.
  std::string DAGOptimizerParallelizationTaggingAlgorithm;

  /// The number of parallel chunks used in DAG Optimizer parallelization.
  int32_t DAGOptimizerNumParallelChunks;

  /// If it is true (false), perform (not perform) ASAP op placement in DAG
  /// optimization; If it is not set, use acc perf GFlag APLASAPPlacement to
  /// determine whether to perform ASAP op placement or not.
  llvm::Optional<bool> enableAPLASAPPlacement;

  /// If true does int64 to int32 type demotion if backend supports for
  /// specific nodes.
  bool enableTypeDemotion{true};

  /// If true, optimizations are allowed to change quantization scale/offset.
  bool enableQuantParamChanges{true};

  /// If true, ConcatNodes will not be merged during the optimizer.
  bool skipConcatMerging{false};

  /// If true, will sink tanh below concat.
  bool sinkTanhBelowConcat{false};

  /// Default ctor.
  OptimizationOptions() {
    // By default, always materialize Splats used by ConvolutionNodes, as
    // optimizations such as BatchNorm fusion depend on it.
    materializeSplatsUsedBySet.insert(Kinded::Kind::ConvolutionNodeKind);
  }
};
274
275/// Meta information produced during the compilation. Whereas the compile
276/// options should be interpreted as input variables for the compilation, the
277/// below structure is output information produced by the compilation process.
/// Meta information produced during the compilation. Whereas the compile
/// options should be interpreted as input variables for the compilation, the
/// below structure is output information produced by the compilation process.
struct CompilationInfo {
  /// The hash of the graph before the lowering stage. Zero until it is
  /// computed during compilation.
  llvm::hash_code graphPreLowerHash{0};
};
282
283/// Context for compilation.
/// Context for compilation. Bundles everything the compilation pipeline needs:
/// bindings for profiling, partitioning configs, precision/optimization
/// options, backend options, and various serialization/AOT flags.
struct CompilationContext {
  /// Used during Profiling. Not owned by this context.
  PlaceholderBindings *bindings{nullptr};

  /// Allows the user to specify user defined partitioning. Not owned.
  runtime::PartitionConfig *partitionConfig{nullptr};

  /// Allows a loader to store a pre-partitioned config. Not owned.
  runtime::PrePartitionedConfig *prepartitionedConfig{nullptr};

  /// If true the HostManager will try to use all available devices on the
  /// host.
  bool saturateHost{false};

  /// If greater than zero, this is the number of available devices that are
  /// used when saturateHost is enabled.
  /// If saturateKDevices is zero and saturateHost is enabled, all available
  /// devices will be saturated.
  unsigned saturateKDevices{0};

  /// Number of max active requests per instance of this network.
  unsigned maxActiveRequestsPerInstance{48};

  /// Used during Quantization and Profiling. Not owned.
  LoweredInfoMap *loweredInfoMap{nullptr};

  /// Set up during model loading to map from Placeholders in the Module to the
  /// symbolic name they were loaded with from the input model.
  LoadedPlaceholderNameMap loadedPHNames;

  /// Select whether in Training or Inference mode.
  enum class CompilationMode {
    Train, /// Compile the graph in preparation for training.
    Infer, /// Compile the graph for inference. Notice that this operation
           /// changes the graph in a way that is not reversible.
    NumCompilationModes, /// Used to count the number of CompilationModes.
  } compMode{CompilationMode::Infer};

  /// Options for the Backend to use.
  BackendOptions backendOpts;

  /// Options for the optimizations to use.
  OptimizationOptions optimizationOpts;

  /// Configuration for different precision modes.
  PrecisionConfiguration precisionConfig;

  /// Information produced during compilation.
  CompilationInfo info;

  /// How to annotate the compilation log filename.
  std::string compilationLogPrefix{"glow"};

  /// Pointer to deferredWeightLoader object, this is used for large model
  /// support. Not owned.
  runtime::DeferredWeightLoader *deferredWeightLoader{nullptr};

  /// Whether to print out issues/logging during compilation. Used for example
  /// to disable printing issues encountered during ConstantFolding.
  bool verboseCompile{true};

  /// Call dumpDag on each Function passed to the backend for compilation.
  bool dumpFinalGraph = false;

  /// Path where the dumped graphs should go, default "./".
  std::string dumpGraphPath = "./";

  /// Whether to skip stripping the module.
  bool skipModuleStrip{false};

  /// Enables Peer to Peer Tensor optimization.
  bool enableP2P{false};

  /// Enables Device Resident Tensor optimization.
  bool enableDRT{false};

  /// Number of times a function should be replicated on a device. This is
  /// enabled for single partition networks. For advanced replication setups
  /// use user-defined partitioning.
  unsigned replicationCount{1};

  /// Whether to serialize the DAG that has been optimized and partitioned.
  bool serializeCompiledDAG{false};

  /// Whether to return the Glow AOT serialized ONNX model as a string;
  /// If false, dump the model as an ONNX model file in local;
  /// If true, return the model string to glowAOTSerializationModelStrPtr;
  /// This is for Glow AOT compilation.
  bool returnGlowSerializedModelStr{false};

  /// Placeholder for the returned Glow AOT serialized ONNX model string.
  std::shared_ptr<std::string> glowAOTSerializationModelStrPtr{nullptr};

  /// Whether to use Zip mode to serialize the DAG that has been optimized and
  /// partitioned.
  bool useZipModeForSerializeCompiledDAG{false};

  /// Whether to save constant data into the serialized DAG.
  bool saveConstantInSerializeCompiledDAG{false};

  /// Whether to call the DAG optimizer after the DAG is created in
  /// HostManager.
  bool callDAGOptimizer{false};

  /// Whether to use AOT mode for DAG optimizer.
  bool useDAGOptimizerAOTMode{false};

  /// Whether we're loading a model that has been AOT optimized.
  bool loadingAOTModel{false};

  /// Whether to skip provisioning, e.g. if we're doing AOT optimization.
  bool skipProvisioning{false};

  /// Static placeholder type info used for AOT optimization.
  std::map<std::string, Type> staticPlaceholderTypesForAOT;

  /// Map from function name to its corresponding compiled serialized
  /// functions; Used in deserialization.
  std::unordered_map<std::string, std::shared_ptr<std::vector<char>>>
      nameToFunctions;

  /// Construct a context, optionally wiring in the profiling \p bindings_ and
  /// the quantization/profiling \p loweredInfoMap_ (both may be null; neither
  /// is owned).
  CompilationContext(PlaceholderBindings *bindings_ = nullptr,
                     LoweredInfoMap *loweredInfoMap_ = nullptr)
      : bindings(bindings_), loweredInfoMap(loweredInfoMap_) {}

  /// \returns an error if the CompilationContext is malformed for whatever
  /// configuration it is set up for, otherwise returns success.
  Error verify() const {
    // A whitelist of node kinds only makes sense when FP16 conversion is on.
    RETURN_ERR_IF_NOT(!precisionConfig.useSetAsWhitelist ||
                          precisionConfig.convertToFP16,
                      "Can only use the precisionModeKindSet as a whitelist in "
                      "convertToFP16 mode.");

    // Per-mode requirements: Profiling needs bindings and a lowered-info map
    // (and is incompatible with FP16 conversion); Quantization needs only the
    // lowered-info map.
    switch (precisionConfig.quantMode) {
    case QuantizationMode::Profile:
      RETURN_ERR_IF_NOT(bindings,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but bindings was not set.\n");

      RETURN_ERR_IF_NOT(loweredInfoMap,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but loweredInfoMap was not set.\n");

      RETURN_ERR_IF_NOT(!precisionConfig.convertToFP16,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "Converting to FP16 while profiling is unsupported.\n");
      break;

    case QuantizationMode::Quantize:
      RETURN_ERR_IF_NOT(
          loweredInfoMap, ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
          "In Quantization mode, but loweredInfoMap was not set.\n");
      break;

    case QuantizationMode::None:
      break;
    }

    // These two options are mutually exclusive: folding elem-kind conversions
    // into IO mutates Placeholders, which conflicts with delaying/recording
    // constant modification.
    RETURN_ERR_IF_NOT(!(optimizationOpts.foldElemKindConversionIntoIO &&
                        optimizationOpts.delayAndRecordConstantModification),
                      "Cannot currently perform elem kind merging into PHs "
                      "when also preventing constant modification.");

    // When serializing a compiled DAG without provisioning (AOT), either
    // delayAndRecordConstantModification (C2 flow) or
    // saveConstantInSerializeCompiledDAG (PyTorch flow) must be enabled.
    RETURN_ERR_IF_NOT(
        !(serializeCompiledDAG && skipProvisioning &&
          !optimizationOpts.delayAndRecordConstantModification &&
          !saveConstantInSerializeCompiledDAG),
        "When serializing the compiled DAG while skipping provisioning, C2 "
        "must also enable delayAndRecordConstantModification. PyTorch does not "
        "enable delayAndRecordConstantModification in this case, but "
        "saveConstantInSerializeCompiledDAG should be enabled");

    // Dummy qparams are tracked via originNameToTQPMap, so it must be set.
    RETURN_ERR_IF_NOT(
        !precisionConfig.loadUniquedDummyQParams ||
            precisionConfig.originNameToTQPMap,
        "If loading unique dummy QParams, must have valid originNameToTQPMap");

    // Clipping quant ranges to FP16 only makes sense under FP16 conversion.
    RETURN_ERR_IF_NOT(!precisionConfig.clipQuantRangeToFP16 ||
                          precisionConfig.convertToFP16,
                      "Assuming quant ranges are clipped to fp16 should only "
                      "be done along with fp16 conversion.");

    return Error::success();
  }
};
467
468using CompilationMode = CompilationContext::CompilationMode;
469
470}; // namespace glow
471
472#endif // GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H
473