1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #ifndef GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H |
17 | #define GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H |
18 | |
19 | #include "glow/Backends/BackendOptions.h" |
20 | #include "glow/Graph/PlaceholderBindings.h" |
21 | #include "glow/Quantization/Base/Base.h" |
22 | #include "glow/Support/Error.h" |
23 | |
24 | namespace glow { |
namespace runtime {
// Forward declarations only; the full definitions live in the runtime
// headers. Kept as forward declarations here to avoid pulling the runtime
// library into every user of the compilation context.
struct PartitionConfig;
struct PrePartitionedConfig;
class DeferredWeightLoader;
} // namespace runtime
30 | |
/// Map from Placeholders to their original name and index in the proto that
/// loaded them. Used to keep around info from when we import a proto so we can
/// then export it later on.
using LoadedPlaceholderNameMap =
    std::unordered_map<const Placeholder *, std::pair<std::string, unsigned>>;

/// Map from the name of the original op that some quantization parameters were
/// loaded from to those associated quantization parameters.
using OriginNameToTQPMap =
    std::unordered_map<std::string, TensorQuantizationParams>;
41 | |
/// Configuration for different precision modes.
struct PrecisionConfiguration {
  /// Enum for what kind of transformation should be done for Quantization.
  enum class QuantizationMode {
    None, /// Perform no transformations for quantization.
    Quantize, /// Quantize the graph using previously gathered statistics.
    Profile, /// Add profiling nodes for quantization statistics gathering.
  } quantMode{QuantizationMode::None};

  /// Configuration for Profiling.
  quantization::ProfilingConfiguration profConfig;

  /// Configuration for Quantization.
  quantization::QuantizationConfiguration quantConfig;

  /// Enum for what kind of float16 format should be used.
  enum class Float16Format {
    None, /// No float16 format should be used.
    FP16, /// FP16 format for float16 should be used.
    BFloat16, /// BFloat16 format for float16 should be used.
  } float16Format{
      Float16Format::FP16}; /// If convertToFP16, float16 format to be used.

  /// Whether to convert the FloatTy to Float16Ty in the Function.
  bool convertToFP16{false};

  /// Whether to convert UInt8FusedQTy to UInt8FusedFP16QTy in the Function.
  bool convertFusedToFP16{false};

  /// Whether to convert UInt4FusedFP16QTy to UInt8FusedQTy in the Function.
  bool convert4BitFusedTo8Bit{false};

  /// Whether to convert UInt8FusedFP16QTy to UInt8FusedQTy in the Function.
  bool convert8BitFusedToFP32{false};

  /// Whether to convert UInt4FusedFP16QTy to UInt4FusedQTy in the Function.
  bool convert4BitFusedToFP32{false};

  /// Whether to convert indices in FusedRowwiseSLWS to Int64ITy.
  bool convertIndicesToInt64{false};

  /// If convertToFP16, whether to convert input Placeholders.
  bool convertPlaceholdersToFP16{false};

  /// If convertToFP16, whether to convert Constants.
  bool convertConstantsToFP16{false};

  /// If convertToFP16, whether to skip converting the bias from fp32 to fp16
  /// in FC.
  bool skipBiasFp32tofp16Convert{false};

  /// If convertToFP16, whether to clip out-of-range FP values to the min/max of
  /// fp16.
  bool clipFP16{false};

  /// If clipFP16, whether to skip clipping inputs of Nodes.
  bool clipFP16SkipInputs{false};

  /// Whether to force FP16 accumulation for the SLS family of ops.
  bool forceFP16AccumSLS{true};

  /// Used during Quantization and convertToFP16 to keep the original precision
  /// of specific node kinds (i.e. quantization/FP16 conversion would be skipped
  /// for any node kinds found here). Used during profiling to prevent nodes
  /// from being lowered before instrumenting the graph (e.g. do not lower group
  /// convolutions for profiling; see `-do-not-lower-nodes-for-profiling` in
  /// docs/Quantization.md).
  KindSet precisionModeKindSet;

  /// Whether to use the precisionModeKindSet as a whitelist instead of the
  /// default blacklist. Currently only supported for convertToFP16.
  bool useSetAsWhitelist{false};

  /// Pointer to a map of loader names to loaded quant params.
  OriginNameToTQPMap *originNameToTQPMap{nullptr};

  /// If true, then discard original quantization params that are loaded, to
  /// instead track origin of quantization params in \ref originNameToTQPMap.
  bool loadUniquedDummyQParams{false};

  /// If true, when scales for qparams are loaded, they are clipped to
  /// kMinScaleFP16 if below kMinScaleFP16.
  bool zeroScaleFP16Clip{false};

  /// If true, then the model that is loaded is expected to have been originally
  /// serialized with dummy quantization parameters, and was replaced with
  /// actual quantization parameters when loaded in this compilation context.
  bool replaceDummyTQPs{false};

  /// If true, then we can safely assume that all qparams (even dummy qparams)
  /// are clipped inside the FP16 range.
  bool clipQuantRangeToFP16{false};

  /// Converts a float16 \p format into an ElemKind.
  static ElemKind getElementType(Float16Format format) {
    switch (format) {
    case Float16Format::FP16:
      return ElemKind::Float16Ty;
    case Float16Format::BFloat16:
      return ElemKind::BFloat16Ty;
    default:
      // Float16Format::None (or a corrupt value) has no element type.
      llvm_unreachable("Unknown float16 format" );
    }
  }
};
146 | |
147 | using QuantizationMode = PrecisionConfiguration::QuantizationMode; |
148 | |
149 | /// Options relevant to optimizations during compilation. |
150 | struct OptimizationOptions { |
151 | /// Only lower, i.e. skip optimizations and precision transformations. Used |
152 | /// for testing. |
153 | llvm::SmallSet<Function *, 1> onlyLowerFuns; |
154 | |
155 | /// If true, perform compile-time computation of constant operations. |
156 | bool enableConstantFolding{true}; |
157 | |
158 | /// If true, perform compile-time deduplication of Constants. |
159 | bool enableConstantDeduplication{true}; |
160 | |
161 | /// For all Splats in the Function being optimized, if they are used by any |
162 | /// Nodes listed in this set, then they will be materialized into Constants |
163 | /// during Constant Folding. |
164 | KindSet materializeSplatsUsedBySet; |
165 | |
166 | /// If true, before any Function optimization, all the Constants will be |
167 | /// temporarily replaced by Placeholders, preventing the Constants from being |
168 | /// modified during the normal optimization pipeline. The original Constants |
169 | /// will be put back in place automatically afterward, and then Constant |
170 | /// Folding will be run. |
171 | bool delayAndRecordConstantModification{false}; |
172 | |
173 | /// A set used to hold all temporary PHs that were swapped in for real PHs |
174 | /// when delayAndRecordConstantModification is set. |
175 | std::unordered_set<Placeholder *> tempPHsForConstants; |
176 | |
177 | /// If true, then there will be no error checking for backend support during |
178 | /// the optimization pipeline. Expected that the caller will check if desired |
179 | /// later on. |
180 | bool skipBackendSupportCheck{false}; |
181 | |
182 | /// If true, this will merge ConvertTo and Quantize nodes into inputs and |
183 | /// outputs of the Function. This means modifying the types of Placeholders |
184 | /// and SaveNodes if they have a corresponding ElemKind conversion (ConvertTo, |
185 | /// Quantize, Dequantize nodes). Note that this must be accompanied by |
186 | /// modifying the Tensors backing Placeholders at runtime. |
187 | bool foldElemKindConversionIntoIO{false}; |
188 | |
189 | /// If true this will fold convertTo and Quantize nodes into only static |
190 | /// placeholders. The conversion of the Tensors will be handled by the |
191 | /// provisioner. |
192 | bool foldStaticPlaceholderConversions{false}; |
193 | |
194 | /// If true, this will direct the partitioner to use SparseNN partitioning |
195 | /// scheme |
196 | bool useSparseNNPartitioningScheme{false}; |
197 | |
198 | /// If true, SparseNN partitioning scheme will add extra concats to the |
199 | /// SLS partition for more efficient inter-partition transfers |
200 | bool sparseNNPartitioningAddSLSConcats{false}; |
201 | |
202 | /// If true, SparseNN partitioning scheme will balance SLS tables across |
203 | /// cards using a performance model |
204 | bool sparseNNPartitioningBalancePerfModel{false}; |
205 | |
206 | /// If true, SparseNN partitioning scheme will move Layer Normalization |
207 | /// nodes immediately following SLS into SLS partitions |
208 | bool sparseNNPartitioningPairLNWithSLS{false}; |
209 | |
210 | /// If true, SparseNN partitioning scheme will move Tile |
211 | /// nodes immediately following SLS for user embeddings into SLS partitions |
212 | bool sparseNNPartitioningPairTileWithSLS{false}; |
213 | |
214 | /// SparseNN partitioning scheme will move nodes specified |
215 | /// in a comma-separated string which immediately follow SLS nodes into SLS |
216 | /// partitions. For example, to move Tanh and Concat, use "Tanh,Concat". |
217 | std::string sparseNNPartitioningPairSLSWith{"" }; |
218 | |
219 | // If "Concat" and "Tanh" are specified in sparseNNPartitioningPairSLSWith, |
220 | // this will split large Concats going into a Tanh sink to the specified size |
221 | // before moving them into SLS partitions |
222 | unsigned int sparseNNPartitioningConcatSplitSize{1}; |
223 | |
224 | /// The number of cards over which to split SLS tables when using SparseNN |
225 | /// partitioning scheme |
226 | unsigned int {1}; |
227 | |
228 | /// The number of bytes to allocate per card for SLS tables when using |
229 | /// the SparseNN partitioning scheme |
230 | unsigned int sparseNNPartitioningSchemeSLSTableKBytesPerCard{0}; |
231 | |
232 | /// The number of cores to assign to SLS partition when using SparseNN |
233 | /// partitioning scheme |
234 | unsigned int {1}; |
235 | |
236 | /// The number of cores to assign to non-SLS partition when using SparseNN |
237 | /// partitioning scheme |
238 | unsigned int {1}; |
239 | |
240 | /// The algorithm used for Placement tagging in DAG Optimizer |
241 | std::string DAGOptimizerPlacementTaggingAlgorithm; |
242 | |
243 | /// The algorithm used for Parallelization tagging in DAG Optimizer |
244 | std::string DAGOptimizerParallelizationTaggingAlgorithm; |
245 | |
246 | /// The number of parallel chunks used in DAG Optimizer parallelization |
247 | int32_t DAGOptimizerNumParallelChunks; |
248 | |
249 | /// If it is true (false), perform (not perform) ASAP op placement in DAG |
250 | /// optimization; If it is not set, use acc perf GFlag APLASAPPlacement to |
251 | /// determine whether to perform ASAP op placement or not |
252 | llvm::Optional<bool> enableAPLASAPPlacement; |
253 | |
254 | /// If true does int64 to int32 type demotion if backend supports for specific |
255 | /// nodes. |
256 | bool enableTypeDemotion{true}; |
257 | |
258 | /// If true, optimizations are allowed to change quantization scale/offset. |
259 | bool enableQuantParamChanges{true}; |
260 | |
261 | /// If true, ConcatNodes will not be merged during the optimizer. |
262 | bool skipConcatMerging{false}; |
263 | |
264 | /// If true, will sink tanh below concat |
265 | bool sinkTanhBelowConcat{false}; |
266 | |
267 | /// Default ctor. |
268 | OptimizationOptions() { |
269 | // By default, always materialize Splats used by ConvolutionNodes, as |
270 | // optimizations such as BatchNorm fusion depend on it. |
271 | materializeSplatsUsedBySet.insert(Kinded::Kind::ConvolutionNodeKind); |
272 | } |
273 | }; |
274 | |
/// Meta information produced during the compilation. Whereas the compile
/// options should be interpreted as input variables for the compilation, the
/// below structure is output information produced by the compilation process.
struct CompilationInfo {
  /// The hash of the graph before the lowering stage. Zero until the
  /// compilation pipeline fills it in.
  llvm::hash_code graphPreLowerHash{0};
};
282 | |
/// Context for compilation.
struct CompilationContext {
  /// Used during Profiling.
  PlaceholderBindings *bindings{nullptr};

  /// Allows the user to specify user defined partitioning.
  runtime::PartitionConfig *partitionConfig{nullptr};

  /// Allows a loader to store a pre-partitioned config.
  runtime::PrePartitionedConfig *prepartitionedConfig{nullptr};

  /// If true the HostManager will try to use all available devices on the host.
  bool saturateHost{false};

  /// If greater than zero, this is the number of available devices that are
  /// used when saturateHost is enabled.
  /// If saturateKDevices is zero and saturateHost is enabled, all available
  /// devices will be saturated.
  unsigned saturateKDevices{0};

  /// Number of max active requests per instance of this network.
  unsigned maxActiveRequestsPerInstance{48};

  /// Used during Quantization and Profiling.
  LoweredInfoMap *loweredInfoMap{nullptr};

  /// Set up during model loading to map from Placeholders in the Module to the
  /// symbolic name they were loaded with from the input model.
  LoadedPlaceholderNameMap loadedPHNames;

  /// Select whether in Training or Inference mode.
  enum class CompilationMode {
    Train, /// Compile the graph in preparation for training.
    Infer, /// Compile the graph for inference. Notice that this operation
           /// changes the graph in a way that is not reversible.
    NumCompilationModes, /// Used to count the number of CompilationModes.
  } compMode{CompilationMode::Infer};

  /// Options for the Backend to use.
  BackendOptions backendOpts;

  /// Options for the optimizations to use.
  OptimizationOptions optimizationOpts;

  /// Configuration for different precision modes.
  PrecisionConfiguration precisionConfig;

  /// Information produced during compilation.
  CompilationInfo info;

  /// How to annotate the compilation log filename.
  std::string compilationLogPrefix{"glow" };

  /// Pointer to deferredWeightLoader object, this is used for large model
  /// support.
  runtime::DeferredWeightLoader *deferredWeightLoader{nullptr};

  /// Whether to print out issues/logging during compilation. Used for example
  /// to disable printing issues encountered during ConstantFolding.
  bool verboseCompile{true};

  /// Call dumpDag on each Function passed to the backend for compilation.
  bool dumpFinalGraph = false;

  /// Path where the dumped graphs should go, default "./".
  std::string dumpGraphPath = "./" ;

  /// Whether to skip stripping the module.
  bool skipModuleStrip{false};

  /// Enables Peer to Peer Tensor optimization.
  bool enableP2P{false};

  /// Enables Device Resident Tensor optimization.
  bool enableDRT{false};

  /// Number of times a function should be replicated on a device. This is
  /// enabled for single partition networks. For advanced replication setups use
  /// user-defined partitioning.
  unsigned replicationCount{1};

  /// Whether to serialize the DAG that has been optimized and partitioned.
  bool serializeCompiledDAG{false};

  /// Whether to return the Glow AOT serialized ONNX model as a string;
  /// If false, dump the model as an ONNX model file in local;
  /// If true, return the model string to glowAOTSerializationModelStrPtr;
  /// This is for Glow AOT compilation.
  bool returnGlowSerializedModelStr{false};

  /// Placeholder for the returned Glow AOT serialized ONNX model string.
  std::shared_ptr<std::string> glowAOTSerializationModelStrPtr{nullptr};

  /// Whether to use Zip mode to serialize the DAG that has been optimized and
  /// partitioned.
  bool useZipModeForSerializeCompiledDAG{false};

  /// Whether to save constant data into the serialized DAG.
  bool saveConstantInSerializeCompiledDAG{false};

  /// Whether to call the DAG optimizer after the DAG is created in HostManager.
  bool callDAGOptimizer{false};

  /// Whether to use AOT mode for DAG optimizer.
  bool useDAGOptimizerAOTMode{false};

  /// Whether we're loading a model that has been AOT optimized.
  bool loadingAOTModel{false};

  /// Whether to skip provisioning, e.g. if we're doing AOT optimization.
  bool skipProvisioning{false};

  /// Static placeholder type info used for AOT optimization.
  std::map<std::string, Type> staticPlaceholderTypesForAOT;

  /// Map from function name to its corresponding compiled serialized functions;
  /// Used in deserialization.
  std::unordered_map<std::string, std::shared_ptr<std::vector<char>>>
      nameToFunctions;

  /// Constructor; optionally takes the \p bindings_ used during profiling and
  /// the \p loweredInfoMap_ used during quantization and profiling. Both may be
  /// null when the corresponding modes are not used (verify() enforces this).
  CompilationContext(PlaceholderBindings *bindings_ = nullptr,
                     LoweredInfoMap *loweredInfoMap_ = nullptr)
      : bindings(bindings_), loweredInfoMap(loweredInfoMap_) {}

  /// \returns an error if the CompilationContext is malformed for whatever
  /// configuration it is set up for, otherwise returns success.
  Error verify() const {
    // The whitelist interpretation of precisionModeKindSet is only implemented
    // by the FP16 conversion pass.
    RETURN_ERR_IF_NOT(!precisionConfig.useSetAsWhitelist ||
                          precisionConfig.convertToFP16,
                      "Can only use the precisionModeKindSet as a whitelist in "
                      "convertToFP16 mode." );

    // Per-mode requirements on bindings/loweredInfoMap.
    switch (precisionConfig.quantMode) {
    case QuantizationMode::Profile:
      RETURN_ERR_IF_NOT(bindings,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but bindings was not set.\n" );

      RETURN_ERR_IF_NOT(loweredInfoMap,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "In Profiling mode, but loweredInfoMap was not set.\n" );

      RETURN_ERR_IF_NOT(!precisionConfig.convertToFP16,
                        ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
                        "Converting to FP16 while profiling is unsupported.\n" );
      break;

    case QuantizationMode::Quantize:
      RETURN_ERR_IF_NOT(
          loweredInfoMap, ErrorValue::ErrorCode::COMPILE_CONTEXT_MALFORMED,
          "In Quantization mode, but loweredInfoMap was not set.\n" );
      break;

    case QuantizationMode::None:
      break;
    }

    RETURN_ERR_IF_NOT(!(optimizationOpts.foldElemKindConversionIntoIO &&
                        optimizationOpts.delayAndRecordConstantModification),
                      "Cannot currently perform elem kind merging into PHs "
                      "when also preventing constant modification." );

    // When serializing while skipping provisioning, constants must either be
    // recorded (delayAndRecordConstantModification) or saved into the
    // serialized DAG (saveConstantInSerializeCompiledDAG).
    RETURN_ERR_IF_NOT(
        !(serializeCompiledDAG && skipProvisioning &&
          !optimizationOpts.delayAndRecordConstantModification &&
          !saveConstantInSerializeCompiledDAG),
        "When serializing the compiled DAG while skipping provisioning, C2 "
        "must also enable delayAndRecordConstantModification. PyTorch does not "
        "enable delayAndRecordConstantModification in this case, but "
        "saveConstantInSerializeCompiledDAG should be enabled" );

    RETURN_ERR_IF_NOT(
        !precisionConfig.loadUniquedDummyQParams ||
            precisionConfig.originNameToTQPMap,
        "If loading unique dummy QParams, must have valid originNameToTQPMap" );

    RETURN_ERR_IF_NOT(!precisionConfig.clipQuantRangeToFP16 ||
                          precisionConfig.convertToFP16,
                      "Assuming quant ranges are clipped to fp16 should only "
                      "be done along with fp16 conversion." );

    return Error::success();
  }
};

using CompilationMode = CompilationContext::CompilationMode;
469 | |
470 | }; // namespace glow |
471 | |
472 | #endif // GLOW_OPTIMIZER_GRAPHOPTIMIZER_COMPILATIONCONTEXT_H |
473 | |