1 | //===- TargetTransformInfo.h ------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This pass exposes codegen information to IR-level passes. Every |
10 | /// transformation that uses codegen information is broken into three parts: |
11 | /// 1. The IR-level analysis pass. |
12 | /// 2. The IR-level transformation interface which provides the needed |
13 | /// information. |
14 | /// 3. Codegen-level implementation which uses target-specific hooks. |
15 | /// |
16 | /// This file defines #2, which is the interface that IR-level transformations |
17 | /// use for querying the codegen. |
18 | /// |
19 | //===----------------------------------------------------------------------===// |
20 | |
21 | #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H |
22 | #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H |
23 | |
24 | #include "llvm/IR/InstrTypes.h" |
25 | #include "llvm/IR/Operator.h" |
26 | #include "llvm/IR/PassManager.h" |
27 | #include "llvm/Pass.h" |
28 | #include "llvm/Support/AtomicOrdering.h" |
29 | #include "llvm/Support/BranchProbability.h" |
30 | #include "llvm/Support/DataTypes.h" |
31 | #include "llvm/Support/InstructionCost.h" |
32 | #include <functional> |
33 | #include <utility> |
34 | |
35 | namespace llvm { |
36 | |
37 | namespace Intrinsic { |
38 | typedef unsigned ID; |
39 | } |
40 | |
41 | class AssumptionCache; |
42 | class BlockFrequencyInfo; |
43 | class DominatorTree; |
44 | class BranchInst; |
45 | class CallBase; |
46 | class Function; |
47 | class GlobalValue; |
48 | class InstCombiner; |
class OptimizationRemarkEmitter;
50 | class IntrinsicInst; |
51 | class LoadInst; |
52 | class LoopAccessInfo; |
53 | class Loop; |
54 | class LoopInfo; |
55 | class ProfileSummaryInfo; |
56 | class RecurrenceDescriptor; |
57 | class SCEV; |
58 | class ScalarEvolution; |
59 | class StoreInst; |
60 | class SwitchInst; |
61 | class TargetLibraryInfo; |
62 | class Type; |
63 | class User; |
64 | class Value; |
65 | class VPIntrinsic; |
66 | struct KnownBits; |
67 | template <typename T> class Optional; |
68 | |
69 | /// Information about a load/store intrinsic defined by the target. |
70 | struct MemIntrinsicInfo { |
71 | /// This is the pointer that the intrinsic is loading from or storing to. |
72 | /// If this is non-null, then analysis/optimization passes can assume that |
73 | /// this intrinsic is functionally equivalent to a load/store from this |
74 | /// pointer. |
75 | Value *PtrVal = nullptr; |
76 | |
77 | // Ordering for atomic operations. |
78 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
79 | |
80 | // Same Id is set by the target for corresponding load/store intrinsics. |
81 | unsigned short MatchingId = 0; |
82 | |
83 | bool ReadMem = false; |
84 | bool WriteMem = false; |
85 | bool IsVolatile = false; |
86 | |
87 | bool isUnordered() const { |
88 | return (Ordering == AtomicOrdering::NotAtomic || |
89 | Ordering == AtomicOrdering::Unordered) && |
90 | !IsVolatile; |
91 | } |
92 | }; |
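
// Illustrative sketch only: how an analysis might consume this struct. The
// hook that populates it (getTgtMemIntrinsic) is assumed to be available on
// the TargetTransformInfo object TTI, and II is an IntrinsicInst* in scope.
//
//   MemIntrinsicInfo Info;
//   if (TTI.getTgtMemIntrinsic(II, Info) && Info.ReadMem && Info.isUnordered())
//     ; // II can be treated like a simple load from Info.PtrVal.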
93 | |
94 | /// Attributes of a target dependent hardware loop. |
95 | struct HardwareLoopInfo { |
96 | HardwareLoopInfo() = delete; |
97 | HardwareLoopInfo(Loop *L) : L(L) {} |
98 | Loop *L = nullptr; |
99 | BasicBlock *ExitBlock = nullptr; |
100 | BranchInst *ExitBranch = nullptr; |
101 | const SCEV *ExitCount = nullptr; |
102 | IntegerType *CountType = nullptr; |
103 | Value *LoopDecrement = nullptr; // Decrement the loop counter by this |
104 | // value in every iteration. |
105 | bool IsNestingLegal = false; // Can a hardware loop be a parent to |
106 | // another hardware loop? |
107 | bool CounterInReg = false; // Should loop counter be updated in |
108 | // the loop via a phi? |
109 | bool PerformEntryTest = false; // Generate the intrinsic which also performs |
110 | // icmp ne zero on the loop counter value and |
111 | // produces an i1 to guard the loop entry. |
112 | bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, |
113 | DominatorTree &DT, bool ForceNestedLoop = false, |
114 | bool ForceHardwareLoopPHI = false); |
115 | bool canAnalyze(LoopInfo &LI); |
116 | }; |
117 | |
118 | class IntrinsicCostAttributes { |
119 | const IntrinsicInst *II = nullptr; |
120 | Type *RetTy = nullptr; |
121 | Intrinsic::ID IID; |
122 | SmallVector<Type *, 4> ParamTys; |
123 | SmallVector<const Value *, 4> Arguments; |
124 | FastMathFlags FMF; |
125 | // If ScalarizationCost is UINT_MAX, the cost of scalarizing the |
126 | // arguments and the return value will be computed based on types. |
127 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
128 | |
129 | public: |
130 | IntrinsicCostAttributes( |
131 | Intrinsic::ID Id, const CallBase &CI, |
132 | InstructionCost ScalarCost = InstructionCost::getInvalid()); |
133 | |
134 | IntrinsicCostAttributes( |
135 | Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys, |
136 | FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr, |
137 | InstructionCost ScalarCost = InstructionCost::getInvalid()); |
138 | |
139 | IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy, |
140 | ArrayRef<const Value *> Args); |
141 | |
142 | IntrinsicCostAttributes( |
143 | Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args, |
144 | ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(), |
145 | const IntrinsicInst *I = nullptr, |
146 | InstructionCost ScalarCost = InstructionCost::getInvalid()); |
147 | |
148 | Intrinsic::ID getID() const { return IID; } |
149 | const IntrinsicInst *getInst() const { return II; } |
150 | Type *getReturnType() const { return RetTy; } |
151 | FastMathFlags getFlags() const { return FMF; } |
152 | InstructionCost getScalarizationCost() const { return ScalarizationCost; } |
153 | const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; } |
154 | const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; } |
155 | |
156 | bool isTypeBasedOnly() const { |
157 | return Arguments.empty(); |
158 | } |
159 | |
160 | bool skipScalarizationCost() const { return ScalarizationCost.isValid(); } |
161 | }; |
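
// Illustrative sketch only: a cost query built from an intrinsic call site.
// The getIntrinsicInstrCost hook is assumed to be available on the
// TargetTransformInfo object TTI, and CB is a CallBase known to call
// llvm.fmuladd.
//
//   IntrinsicCostAttributes ICA(Intrinsic::fmuladd, CB);
//   InstructionCost Cost = TTI.getIntrinsicInstrCost(
//       ICA, TargetTransformInfo::TCK_RecipThroughput);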
162 | |
163 | class TargetTransformInfo; |
164 | typedef TargetTransformInfo TTI; |
165 | |
166 | /// This pass provides access to the codegen interfaces that are needed |
167 | /// for IR-level transformations. |
168 | class TargetTransformInfo { |
169 | public: |
170 | /// Construct a TTI object using a type implementing the \c Concept |
171 | /// API below. |
172 | /// |
173 | /// This is used by targets to construct a TTI wrapping their target-specific |
174 | /// implementation that encodes appropriate costs for their target. |
175 | template <typename T> TargetTransformInfo(T Impl); |
176 | |
177 | /// Construct a baseline TTI object using a minimal implementation of |
178 | /// the \c Concept API below. |
179 | /// |
180 | /// The TTI implementation will reflect the information in the DataLayout |
181 | /// provided if non-null. |
182 | explicit TargetTransformInfo(const DataLayout &DL); |
183 | |
184 | // Provide move semantics. |
185 | TargetTransformInfo(TargetTransformInfo &&Arg); |
186 | TargetTransformInfo &operator=(TargetTransformInfo &&RHS); |
187 | |
188 | // We need to define the destructor out-of-line to define our sub-classes |
189 | // out-of-line. |
190 | ~TargetTransformInfo(); |
191 | |
192 | /// Handle the invalidation of this information. |
193 | /// |
194 | /// When used as a result of \c TargetIRAnalysis this method will be called |
195 | /// when the function this was computed for changes. When it returns false, |
196 | /// the information is preserved across those changes. |
197 | bool invalidate(Function &, const PreservedAnalyses &, |
198 | FunctionAnalysisManager::Invalidator &) { |
199 | // FIXME: We should probably in some way ensure that the subtarget |
200 | // information for a function hasn't changed. |
201 | return false; |
202 | } |
203 | |
204 | /// \name Generic Target Information |
205 | /// @{ |
206 | |
207 | /// The kind of cost model. |
208 | /// |
209 | /// There are several different cost models that can be customized by the |
210 | /// target. The normalization of each cost model may be target specific. |
211 | enum TargetCostKind { |
212 | TCK_RecipThroughput, ///< Reciprocal throughput. |
213 | TCK_Latency, ///< The latency of instruction. |
214 | TCK_CodeSize, ///< Instruction code size. |
215 | TCK_SizeAndLatency ///< The weighted sum of size and latency. |
216 | }; |
217 | |
218 | /// Query the cost of a specified instruction. |
219 | /// |
220 | /// Clients should use this interface to query the cost of an existing |
221 | /// instruction. The instruction must have a valid parent (basic block). |
222 | /// |
223 | /// Note, this method does not cache the cost calculation and it |
224 | /// can be expensive in some cases. |
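  ///
  /// A minimal usage sketch (illustrative; assumes a TargetTransformInfo
  /// reference TTI and a const Instruction *I with a parent block):
  /// \code
  ///   InstructionCost Cost =
  ///       TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
  ///   if (Cost.isValid() && Cost > TargetTransformInfo::TCC_Basic)
  ///     ; // Treat I as more expensive than a simple add.
  /// \endcode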
225 | InstructionCost getInstructionCost(const Instruction *I, |
226 | enum TargetCostKind kind) const { |
227 | InstructionCost Cost; |
228 | switch (kind) { |
229 | case TCK_RecipThroughput: |
230 | Cost = getInstructionThroughput(I); |
231 | break; |
232 | case TCK_Latency: |
233 | Cost = getInstructionLatency(I); |
234 | break; |
235 | case TCK_CodeSize: |
236 | case TCK_SizeAndLatency: |
237 | Cost = getUserCost(I, kind); |
238 | break; |
239 | } |
240 | return Cost; |
241 | } |
242 | |
243 | /// Underlying constants for 'cost' values in this interface. |
244 | /// |
245 | /// Many APIs in this interface return a cost. This enum defines the |
246 | /// fundamental values that should be used to interpret (and produce) those |
247 | /// costs. The costs are returned as an int rather than a member of this |
248 | /// enumeration because it is expected that the cost of one IR instruction |
249 | /// may have a multiplicative factor to it or otherwise won't fit directly |
250 | /// into the enum. Moreover, it is common to sum or average costs which works |
251 | /// better as simple integral values. Thus this enum only provides constants. |
252 | /// Also note that the returned costs are signed integers to make it natural |
253 | /// to add, subtract, and test with zero (a common boundary condition). It is |
254 | /// not expected that 2^32 is a realistic cost to be modeling at any point. |
255 | /// |
256 | /// Note that these costs should usually reflect the intersection of code-size |
257 | /// cost and execution cost. A free instruction is typically one that folds |
258 | /// into another instruction. For example, reg-to-reg moves can often be |
259 | /// skipped by renaming the registers in the CPU, but they still are encoded |
260 | /// and thus wouldn't be considered 'free' here. |
261 | enum TargetCostConstants { |
262 | TCC_Free = 0, ///< Expected to fold away in lowering. |
263 | TCC_Basic = 1, ///< The cost of a typical 'add' instruction. |
264 | TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86. |
265 | }; |
266 | |
267 | /// Estimate the cost of a GEP operation when lowered. |
268 | InstructionCost |
269 | getGEPCost(Type *PointeeType, const Value *Ptr, |
270 | ArrayRef<const Value *> Operands, |
271 | TargetCostKind CostKind = TCK_SizeAndLatency) const; |
272 | |
273 | /// \returns A value by which our inlining threshold should be multiplied. |
274 | /// This is primarily used to bump up the inlining threshold wholesale on |
275 | /// targets where calls are unusually expensive. |
276 | /// |
277 | /// TODO: This is a rather blunt instrument. Perhaps altering the costs of |
278 | /// individual classes of instructions would be better. |
279 | unsigned getInliningThresholdMultiplier() const; |
280 | |
281 | /// \returns A value to be added to the inlining threshold. |
282 | unsigned adjustInliningThreshold(const CallBase *CB) const; |
283 | |
284 | /// \returns Vector bonus in percent. |
285 | /// |
286 | /// Vector bonuses: We want to more aggressively inline vector-dense kernels |
287 | /// and apply this bonus based on the percentage of vector instructions. A |
288 | /// bonus is applied if the vector instructions exceed 50% and half that |
  /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat
290 | /// arbitrary and evolved over time by accident as much as because they are |
291 | /// principled bonuses. |
292 | /// FIXME: It would be nice to base the bonus values on something more |
  /// scientific. A target may have no bonus on vector instructions.
294 | int getInlinerVectorBonusPercent() const; |
295 | |
296 | /// \return the expected cost of a memcpy, which could e.g. depend on the |
297 | /// source/destination type and alignment and the number of bytes copied. |
298 | InstructionCost getMemcpyCost(const Instruction *I) const; |
299 | |
300 | /// \return The estimated number of case clusters when lowering \p 'SI'. |
301 | /// \p JTSize Set a jump table size only when \p SI is suitable for a jump |
302 | /// table. |
303 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
304 | unsigned &JTSize, |
305 | ProfileSummaryInfo *PSI, |
306 | BlockFrequencyInfo *BFI) const; |
307 | |
308 | /// Estimate the cost of a given IR user when lowered. |
309 | /// |
310 | /// This can estimate the cost of either a ConstantExpr or Instruction when |
311 | /// lowered. |
312 | /// |
  /// \p Operands is a list of operands which can be a result of transformations
  /// of the current operands. The number of operands on the list must equal
  /// the number of current operands the IR user has, and their order on the
  /// list must match the order of the current operands.
318 | /// |
319 | /// The returned cost is defined in terms of \c TargetCostConstants, see its |
320 | /// comments for a detailed explanation of the cost values. |
321 | InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands, |
322 | TargetCostKind CostKind) const; |
323 | |
324 | /// This is a helper function which calls the two-argument getUserCost |
325 | /// with \p Operands which are the current operands U has. |
326 | InstructionCost getUserCost(const User *U, TargetCostKind CostKind) const { |
327 | SmallVector<const Value *, 4> Operands(U->operand_values()); |
328 | return getUserCost(U, Operands, CostKind); |
329 | } |
330 | |
331 | /// If a branch or a select condition is skewed in one direction by more than |
332 | /// this factor, it is very likely to be predicted correctly. |
333 | BranchProbability getPredictableBranchThreshold() const; |
334 | |
335 | /// Return true if branch divergence exists. |
336 | /// |
337 | /// Branch divergence has a significantly negative impact on GPU performance |
338 | /// when threads in the same wavefront take different paths due to conditional |
339 | /// branches. |
340 | bool hasBranchDivergence() const; |
341 | |
342 | /// Return true if the target prefers to use GPU divergence analysis to |
343 | /// replace the legacy version. |
344 | bool useGPUDivergenceAnalysis() const; |
345 | |
346 | /// Returns whether V is a source of divergence. |
347 | /// |
348 | /// This function provides the target-dependent information for |
349 | /// the target-independent LegacyDivergenceAnalysis. LegacyDivergenceAnalysis |
350 | /// first builds the dependency graph, and then runs the reachability |
351 | /// algorithm starting with the sources of divergence. |
352 | bool isSourceOfDivergence(const Value *V) const; |
353 | |
  /// Returns true for the target-specific set of operations that produce a
  /// uniform result even when given non-uniform arguments.
357 | bool isAlwaysUniform(const Value *V) const; |
358 | |
359 | /// Returns the address space ID for a target's 'flat' address space. Note |
360 | /// this is not necessarily the same as addrspace(0), which LLVM sometimes |
361 | /// refers to as the generic address space. The flat address space is a |
  /// generic address space that can be used to access multiple segments of
  /// memory with different address spaces. Access of a memory location through
  /// a pointer with this address space is expected to be legal but slower
365 | /// compared to the same memory location accessed through a pointer with a |
366 | /// different address space. |
  ///
368 | /// This is for targets with different pointer representations which can |
369 | /// be converted with the addrspacecast instruction. If a pointer is converted |
370 | /// to this address space, optimizations should attempt to replace the access |
371 | /// with the source address space. |
372 | /// |
373 | /// \returns ~0u if the target does not have such a flat address space to |
374 | /// optimize away. |
375 | unsigned getFlatAddressSpace() const; |
376 | |
377 | /// Return any intrinsic address operand indexes which may be rewritten if |
378 | /// they use a flat address space pointer. |
379 | /// |
380 | /// \returns true if the intrinsic was handled. |
381 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
382 | Intrinsic::ID IID) const; |
383 | |
384 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const; |
385 | |
386 | /// Return true if globals in this address space can have initializers other |
387 | /// than `undef`. |
388 | bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const; |
389 | |
390 | unsigned getAssumedAddrSpace(const Value *V) const; |
391 | |
392 | std::pair<const Value *, unsigned> |
393 | getPredicatedAddrSpace(const Value *V) const; |
394 | |
395 | /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p |
396 | /// NewV, which has a different address space. This should happen for every |
397 | /// operand index that collectFlatAddressOperands returned for the intrinsic. |
398 | /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the |
399 | /// new value (which may be the original \p II with modified operands). |
400 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
401 | Value *NewV) const; |
402 | |
403 | /// Test whether calls to a function lower to actual program function |
404 | /// calls. |
405 | /// |
406 | /// The idea is to test whether the program is likely to require a 'call' |
407 | /// instruction or equivalent in order to call the given function. |
408 | /// |
  /// FIXME: It's not clear that this is a good or useful query API. Clients
410 | /// should probably move to simpler cost metrics using the above. |
411 | /// Alternatively, we could split the cost interface into distinct code-size |
412 | /// and execution-speed costs. This would allow modelling the core of this |
413 | /// query more accurately as a call is a single small instruction, but |
414 | /// incurs significant execution cost. |
415 | bool isLoweredToCall(const Function *F) const; |
416 | |
417 | struct LSRCost { |
418 | /// TODO: Some of these could be merged. Also, a lexical ordering |
419 | /// isn't always optimal. |
420 | unsigned Insns; |
421 | unsigned NumRegs; |
422 | unsigned AddRecCost; |
423 | unsigned NumIVMuls; |
424 | unsigned NumBaseAdds; |
425 | unsigned ImmCost; |
426 | unsigned SetupCost; |
427 | unsigned ScaleCost; |
428 | }; |
429 | |
430 | /// Parameters that control the generic loop unrolling transformation. |
431 | struct UnrollingPreferences { |
432 | /// The cost threshold for the unrolled loop. Should be relative to the |
433 | /// getUserCost values returned by this API, and the expectation is that |
434 | /// the unrolled loop's instructions when run through that interface should |
435 | /// not exceed this cost. However, this is only an estimate. Also, specific |
436 | /// loops may be unrolled even with a cost above this threshold if deemed |
437 | /// profitable. Set this to UINT_MAX to disable the loop body cost |
438 | /// restriction. |
439 | unsigned Threshold; |
440 | /// If complete unrolling will reduce the cost of the loop, we will boost |
441 | /// the Threshold by a certain percent to allow more aggressive complete |
442 | /// unrolling. This value provides the maximum boost percentage that we |
443 | /// can apply to Threshold (The value should be no less than 100). |
444 | /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost, |
445 | /// MaxPercentThresholdBoost / 100) |
446 | /// E.g. if complete unrolling reduces the loop execution time by 50% |
447 | /// then we boost the threshold by the factor of 2x. If unrolling is not |
448 | /// expected to reduce the running time, then we do not increase the |
449 | /// threshold. |
450 | unsigned MaxPercentThresholdBoost; |
451 | /// The cost threshold for the unrolled loop when optimizing for size (set |
452 | /// to UINT_MAX to disable). |
453 | unsigned OptSizeThreshold; |
454 | /// The cost threshold for the unrolled loop, like Threshold, but used |
455 | /// for partial/runtime unrolling (set to UINT_MAX to disable). |
456 | unsigned PartialThreshold; |
457 | /// The cost threshold for the unrolled loop when optimizing for size, like |
458 | /// OptSizeThreshold, but used for partial/runtime unrolling (set to |
459 | /// UINT_MAX to disable). |
460 | unsigned PartialOptSizeThreshold; |
461 | /// A forced unrolling factor (the number of concatenated bodies of the |
462 | /// original loop in the unrolled loop body). When set to 0, the unrolling |
463 | /// transformation will select an unrolling factor based on the current cost |
464 | /// threshold and other factors. |
465 | unsigned Count; |
466 | /// Default unroll count for loops with run-time trip count. |
467 | unsigned DefaultUnrollRuntimeCount; |
468 | // Set the maximum unrolling factor. The unrolling factor may be selected |
469 | // using the appropriate cost threshold, but may not exceed this number |
470 | // (set to UINT_MAX to disable). This does not apply in cases where the |
471 | // loop is being fully unrolled. |
472 | unsigned MaxCount; |
473 | /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but |
474 | /// applies even if full unrolling is selected. This allows a target to fall |
475 | /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount. |
476 | unsigned FullUnrollMaxCount; |
477 | // Represents number of instructions optimized when "back edge" |
478 | // becomes "fall through" in unrolled loop. |
479 | // For now we count a conditional branch on a backedge and a comparison |
480 | // feeding it. |
481 | unsigned BEInsns; |
482 | /// Allow partial unrolling (unrolling of loops to expand the size of the |
483 | /// loop body, not only to eliminate small constant-trip-count loops). |
484 | bool Partial; |
485 | /// Allow runtime unrolling (unrolling of loops to expand the size of the |
486 | /// loop body even when the number of loop iterations is not known at |
487 | /// compile time). |
488 | bool Runtime; |
489 | /// Allow generation of a loop remainder (extra iterations after unroll). |
490 | bool AllowRemainder; |
491 | /// Allow emitting expensive instructions (such as divisions) when computing |
492 | /// the trip count of a loop for runtime unrolling. |
493 | bool AllowExpensiveTripCount; |
494 | /// Apply loop unroll on any kind of loop |
495 | /// (mainly to loops that fail runtime unrolling). |
496 | bool Force; |
497 | /// Allow using trip count upper bound to unroll loops. |
498 | bool UpperBound; |
499 | /// Allow unrolling of all the iterations of the runtime loop remainder. |
500 | bool UnrollRemainder; |
501 | /// Allow unroll and jam. Used to enable unroll and jam for the target. |
502 | bool UnrollAndJam; |
503 | /// Threshold for unroll and jam, for inner loop size. The 'Threshold' |
504 | /// value above is used during unroll and jam for the outer loop size. |
505 | /// This value is used in the same manner to limit the size of the inner |
506 | /// loop. |
507 | unsigned UnrollAndJamInnerLoopThreshold; |
508 | /// Don't allow loop unrolling to simulate more than this number of |
509 | /// iterations when checking full unroll profitability |
510 | unsigned MaxIterationsCountToAnalyze; |
511 | }; |
512 | |
513 | /// Get target-customized preferences for the generic loop unrolling |
514 | /// transformation. The caller will initialize UP with the current |
515 | /// target-independent defaults. |
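  ///
  /// A hedged sketch of a target override; the implementation class name and
  /// the chosen values are purely illustrative, not defaults of any target:
  /// \code
  ///   void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
  ///                                           UnrollingPreferences &UP,
  ///                                           OptimizationRemarkEmitter *ORE) {
  ///     UP.Partial = UP.Runtime = true; // Allow partial and runtime unrolling.
  ///     UP.PartialThreshold = 60;       // Tighten the partial-unroll budget.
  ///   }
  /// \endcode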
  void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                               UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const;
519 | |
520 | /// Query the target whether it would be profitable to convert the given loop |
521 | /// into a hardware loop. |
522 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
523 | AssumptionCache &AC, TargetLibraryInfo *LibInfo, |
524 | HardwareLoopInfo &HWLoopInfo) const; |
525 | |
  /// Query the target whether it would be preferred to create a predicated
527 | /// vector loop, which can avoid the need to emit a scalar epilogue loop. |
528 | bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, |
529 | AssumptionCache &AC, TargetLibraryInfo *TLI, |
530 | DominatorTree *DT, |
531 | const LoopAccessInfo *LAI) const; |
532 | |
533 | /// Query the target whether lowering of the llvm.get.active.lane.mask |
534 | /// intrinsic is supported. |
535 | bool emitGetActiveLaneMask() const; |
536 | |
537 | // Parameters that control the loop peeling transformation |
538 | struct PeelingPreferences { |
    /// A forced peeling factor (the number of bodies of the original loop
    /// that should be peeled off before the loop body). When set to 0, a
    /// peeling factor based on profile information and other factors will be
    /// used.
542 | unsigned PeelCount; |
543 | /// Allow peeling off loop iterations. |
544 | bool AllowPeeling; |
545 | /// Allow peeling off loop iterations for loop nests. |
546 | bool AllowLoopNestsPeeling; |
    /// Allow peeling based on profile. Used to enable peeling off all
    /// iterations based on the provided profile.
549 | /// If the value is true the peeling cost model can decide to peel only |
550 | /// some iterations and in this case it will set this to false. |
551 | bool PeelProfiledIterations; |
552 | }; |
553 | |
554 | /// Get target-customized preferences for the generic loop peeling |
555 | /// transformation. The caller will initialize \p PP with the current |
556 | /// target-independent defaults with information from \p L and \p SE. |
557 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
558 | PeelingPreferences &PP) const; |
559 | |
560 | /// Targets can implement their own combinations for target-specific |
561 | /// intrinsics. This function will be called from the InstCombine pass every |
562 | /// time a target-specific intrinsic is encountered. |
563 | /// |
  /// \returns None to not do anything target specific or a value that will be
  /// returned from the InstCombiner. It is possible to stop further processing
  /// of the intrinsic by returning nullptr.
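  ///
  /// A hedged sketch of a target override; the implementation class and the
  /// intrinsic ID checked are illustrative placeholders:
  /// \code
  ///   Optional<Instruction *>
  ///   MyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
  ///                                   IntrinsicInst &II) const {
  ///     // Fold a hypothetical target intrinsic known to return its first
  ///     // argument unchanged.
  ///     if (II.getIntrinsicID() == MyTargetPassthroughIntrinsicID)
  ///       return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  ///     return None; // Nothing target specific to do.
  ///   }
  /// \endcode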
567 | Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
568 | IntrinsicInst &II) const; |
569 | /// Can be used to implement target-specific instruction combining. |
570 | /// \see instCombineIntrinsic |
571 | Optional<Value *> |
572 | simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
573 | APInt DemandedMask, KnownBits &Known, |
574 | bool &KnownBitsComputed) const; |
575 | /// Can be used to implement target-specific instruction combining. |
576 | /// \see instCombineIntrinsic |
577 | Optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
578 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
579 | APInt &UndefElts2, APInt &UndefElts3, |
580 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
581 | SimplifyAndSetOp) const; |
582 | /// @} |
583 | |
584 | /// \name Scalar Target Information |
585 | /// @{ |
586 | |
587 | /// Flags indicating the kind of support for population count. |
588 | /// |
589 | /// Compared to the SW implementation, HW support is supposed to |
590 | /// significantly boost the performance when the population is dense, and it |
  /// may or may not degrade performance if the population is sparse. HW
  /// support is considered "Fast" if it can outperform, or is on a par with,
  /// the SW implementation when the population is sparse; otherwise, it is
594 | /// considered as "Slow". |
595 | enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware }; |
596 | |
597 | /// Return true if the specified immediate is legal add immediate, that |
598 | /// is the target has add instructions which can add a register with the |
599 | /// immediate without having to materialize the immediate into a register. |
600 | bool isLegalAddImmediate(int64_t Imm) const; |
601 | |
602 | /// Return true if the specified immediate is legal icmp immediate, |
603 | /// that is the target has icmp instructions which can compare a register |
604 | /// against the immediate without having to materialize the immediate into a |
605 | /// register. |
606 | bool isLegalICmpImmediate(int64_t Imm) const; |
607 | |
608 | /// Return true if the addressing mode represented by AM is legal for |
609 | /// this target, for a load/store of the specified type. |
610 | /// The type may be VoidTy, in which case only return true if the addressing |
611 | /// mode is legal for a load/store of any legal type. |
612 | /// If target returns true in LSRWithInstrQueries(), I may be valid. |
613 | /// TODO: Handle pre/postinc as well. |
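  ///
  /// For example (illustrative; Ctx is an LLVMContext in scope), a pass could
  /// ask whether "base-register + 4 * index-register + 16" is legal for an
  /// i32 load in the default address space:
  /// \code
  ///   bool Legal = TTI.isLegalAddressingMode(Type::getInt32Ty(Ctx),
  ///                                          /*BaseGV=*/nullptr,
  ///                                          /*BaseOffset=*/16,
  ///                                          /*HasBaseReg=*/true,
  ///                                          /*Scale=*/4);
  /// \endcode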
614 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
615 | bool HasBaseReg, int64_t Scale, |
616 | unsigned AddrSpace = 0, |
617 | Instruction *I = nullptr) const; |
618 | |
  /// Return true if LSR cost of C1 is lower than C2.
620 | bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, |
621 | TargetTransformInfo::LSRCost &C2) const; |
622 | |
623 | /// Return true if LSR major cost is number of registers. Targets which |
624 | /// implement their own isLSRCostLess and unset number of registers as major |
625 | /// cost should return false, otherwise return true. |
626 | bool isNumRegsMajorCostOfLSR() const; |
627 | |
628 | /// \returns true if LSR should not optimize a chain that includes \p I. |
629 | bool isProfitableLSRChainElement(Instruction *I) const; |
630 | |
631 | /// Return true if the target can fuse a compare and branch. |
632 | /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost |
633 | /// calculation for the instructions in a loop. |
634 | bool canMacroFuseCmp() const; |
635 | |
636 | /// Return true if the target can save a compare for loop count, for example |
637 | /// hardware loop saves a compare. |
638 | bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, |
639 | DominatorTree *DT, AssumptionCache *AC, |
640 | TargetLibraryInfo *LibInfo) const; |
641 | |
642 | enum AddressingModeKind { |
643 | AMK_PreIndexed, |
644 | AMK_PostIndexed, |
645 | AMK_None |
646 | }; |
647 | |
648 | /// Return the preferred addressing mode LSR should make efforts to generate. |
649 | AddressingModeKind getPreferredAddressingMode(const Loop *L, |
650 | ScalarEvolution *SE) const; |
651 | |
652 | /// Return true if the target supports masked store. |
653 | bool isLegalMaskedStore(Type *DataType, Align Alignment) const; |
654 | /// Return true if the target supports masked load. |
655 | bool isLegalMaskedLoad(Type *DataType, Align Alignment) const; |
656 | |
657 | /// Return true if the target supports nontemporal store. |
658 | bool isLegalNTStore(Type *DataType, Align Alignment) const; |
659 | /// Return true if the target supports nontemporal load. |
660 | bool isLegalNTLoad(Type *DataType, Align Alignment) const; |
661 | |
662 | /// Return true if the target supports masked scatter. |
663 | bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; |
664 | /// Return true if the target supports masked gather. |
665 | bool isLegalMaskedGather(Type *DataType, Align Alignment) const; |
666 | /// Return true if the target forces scalarizing of llvm.masked.gather |
667 | /// intrinsics. |
668 | bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const; |
669 | /// Return true if the target forces scalarizing of llvm.masked.scatter |
670 | /// intrinsics. |
671 | bool forceScalarizeMaskedScatter(VectorType *Type, Align Alignment) const; |
672 | |
673 | /// Return true if the target supports masked compress store. |
674 | bool isLegalMaskedCompressStore(Type *DataType) const; |
675 | /// Return true if the target supports masked expand load. |
676 | bool isLegalMaskedExpandLoad(Type *DataType) const; |
677 | |
678 | /// Return true if we should be enabling ordered reductions for the target. |
679 | bool enableOrderedReductions() const; |
680 | |
681 | /// Return true if the target has a unified operation to calculate division |
682 | /// and remainder. If so, the additional implicit multiplication and |
683 | /// subtraction required to calculate a remainder from division are free. This |
684 | /// can enable more aggressive transformations for division and remainder than |
685 | /// would typically be allowed using throughput or size cost models. |
686 | bool hasDivRemOp(Type *DataType, bool IsSigned) const; |
687 | |
688 | /// Return true if the given instruction (assumed to be a memory access |
689 | /// instruction) has a volatile variant. If that's the case then we can avoid |
690 | /// addrspacecast to generic AS for volatile loads/stores. Default |
691 | /// implementation returns false, which prevents address space inference for |
692 | /// volatile loads/stores. |
693 | bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const; |
694 | |
695 | /// Return true if target doesn't mind addresses in vectors. |
696 | bool prefersVectorizedAddressing() const; |
697 | |
698 | /// Return the cost of the scaling factor used in the addressing |
699 | /// mode represented by AM for this target, for a load/store |
700 | /// of the specified type. |
701 | /// If the AM is supported, the return value must be >= 0. |
702 | /// If the AM is not supported, it returns a negative value. |
703 | /// TODO: Handle pre/postinc as well. |
704 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
705 | int64_t BaseOffset, bool HasBaseReg, |
706 | int64_t Scale, |
707 | unsigned AddrSpace = 0) const; |
708 | |
709 | /// Return true if the loop strength reduce pass should make |
710 | /// Instruction* based TTI queries to isLegalAddressingMode(). This is |
711 | /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned |
712 | /// immediate offset and no index register. |
713 | bool LSRWithInstrQueries() const; |
714 | |
715 | /// Return true if it's free to truncate a value of type Ty1 to type |
  /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16
717 | /// by referencing its sub-register AX. |
718 | bool isTruncateFree(Type *Ty1, Type *Ty2) const; |
719 | |
  /// Return true if it is profitable to hoist an instruction from the
  /// then/else blocks to before the if.
722 | bool isProfitableToHoist(Instruction *I) const; |
723 | |
724 | bool useAA() const; |
725 | |
726 | /// Return true if this type is legal. |
727 | bool isTypeLegal(Type *Ty) const; |
728 | |
729 | /// Returns the estimated number of registers required to represent \p Ty. |
730 | InstructionCost getRegUsageForType(Type *Ty) const; |
731 | |
732 | /// Return true if switches should be turned into lookup tables for the |
733 | /// target. |
734 | bool shouldBuildLookupTables() const; |
735 | |
736 | /// Return true if switches should be turned into lookup tables |
737 | /// containing this constant value for the target. |
738 | bool shouldBuildLookupTablesForConstant(Constant *C) const; |
739 | |
740 | /// Return true if lookup tables should be turned into relative lookup tables. |
741 | bool shouldBuildRelLookupTables() const; |
742 | |
  /// Return true if the input function, which is cold at all call sites,
  /// should use the coldcc calling convention.
745 | bool useColdCCForColdCall(Function &F) const; |
746 | |
747 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
748 | /// are set if the demanded result elements need to be inserted and/or |
749 | /// extracted from vectors. |
750 | InstructionCost getScalarizationOverhead(VectorType *Ty, |
751 | const APInt &DemandedElts, |
                                           bool Insert, bool Extract) const;
753 | |
  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
757 | InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
758 | ArrayRef<Type *> Tys) const; |
759 | |
760 | /// If target has efficient vector element load/store instructions, it can |
761 | /// return true here so that insertion/extraction costs are not added to |
762 | /// the scalarization cost of a load/store. |
763 | bool supportsEfficientVectorElementLoadStore() const; |
764 | |
765 | /// Don't restrict interleaved unrolling to small loops. |
766 | bool enableAggressiveInterleaving(bool LoopHasReductions) const; |
767 | |
768 | /// Returns options for expansion of memcmp. IsZeroCmp is |
  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
770 | struct MemCmpExpansionOptions { |
771 | // Return true if memcmp expansion is enabled. |
772 | operator bool() const { return MaxNumLoads > 0; } |
773 | |
774 | // Maximum number of load operations. |
775 | unsigned MaxNumLoads = 0; |
776 | |
777 | // The list of available load sizes (in bytes), sorted in decreasing order. |
778 | SmallVector<unsigned, 8> LoadSizes; |
779 | |
780 | // For memcmp expansion when the memcmp result is only compared equal or |
781 | // not-equal to 0, allow up to this number of load pairs per block. As an |
782 | // example, this may allow 'memcmp(a, b, 3) == 0' in a single block: |
783 | // a0 = load2bytes &a[0] |
784 | // b0 = load2bytes &b[0] |
785 | // a2 = load1byte &a[2] |
786 | // b2 = load1byte &b[2] |
787 | // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0 |
788 | unsigned NumLoadsPerBlock = 1; |
789 | |
790 | // Set to true to allow overlapping loads. For example, 7-byte compares can |
791 | // be done with two 4-byte compares instead of 4+2+1-byte compares. This |
792 | // requires all loads in LoadSizes to be doable in an unaligned way. |
793 | bool AllowOverlappingLoads = false; |
794 | }; |
795 | MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, |
796 | bool IsZeroCmp) const; |
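
  // Illustrative sketch only of what a target override of
  // enableMemCmpExpansion might return when expansion is desired; the values
  // are examples, not defaults taken from any in-tree target.
  //
  //   MemCmpExpansionOptions Options;
  //   Options.MaxNumLoads = 4;
  //   Options.LoadSizes = {8, 4, 2, 1}; // Prefer the widest legal loads.
  //   Options.AllowOverlappingLoads = true;
  //   return Options;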
797 | |
798 | /// Enable matching of interleaved access groups. |
799 | bool enableInterleavedAccessVectorization() const; |
800 | |
801 | /// Enable matching of interleaved access groups that contain predicated |
802 | /// accesses or gaps and therefore vectorized using masked |
803 | /// vector loads/stores. |
804 | bool enableMaskedInterleavedAccessVectorization() const; |
805 | |
806 | /// Indicate that it is potentially unsafe to automatically vectorize |
807 | /// floating-point operations because the semantics of vector and scalar |
808 | /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math |
809 | /// does not support IEEE-754 denormal numbers, while depending on the |
810 | /// platform, scalar floating-point math does. |
811 | /// This applies to floating-point math operations and calls, not memory |
812 | /// operations, shuffles, or casts. |
813 | bool isFPVectorizationPotentiallyUnsafe() const; |
814 | |
815 | /// Determine if the target supports unaligned memory accesses. |
816 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, |
817 | unsigned AddressSpace = 0, |
818 | Align Alignment = Align(1), |
819 | bool *Fast = nullptr) const; |
820 | |
821 | /// Return hardware support for population count. |
822 | PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; |
823 | |
824 | /// Return true if the hardware has a fast square-root instruction. |
825 | bool haveFastSqrt(Type *Ty) const; |
826 | |
827 | /// Return true if it is faster to check if a floating-point value is NaN |
828 | /// (or not-NaN) versus a comparison against a constant FP zero value. |
829 | /// Targets should override this if materializing a 0.0 for comparison is |
830 | /// generally as cheap as checking for ordered/unordered. |
831 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const; |
832 | |
833 | /// Return the expected cost of supporting the floating point operation |
834 | /// of the specified type. |
835 | InstructionCost getFPOpCost(Type *Ty) const; |
836 | |
837 | /// Return the expected cost of materializing for the given integer |
838 | /// immediate of the specified type. |
839 | InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, |
840 | TargetCostKind CostKind) const; |
841 | |
842 | /// Return the expected cost of materialization for the given integer |
843 | /// immediate of the specified type for a given instruction. The cost can be |
844 | /// zero if the immediate can be folded into the specified instruction. |
845 | InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx, |
846 | const APInt &Imm, Type *Ty, |
847 | TargetCostKind CostKind, |
848 | Instruction *Inst = nullptr) const; |
849 | InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
850 | const APInt &Imm, Type *Ty, |
851 | TargetCostKind CostKind) const; |
852 | |
853 | /// Return the expected cost for the given integer when optimising |
854 | /// for size. This is different than the other integer immediate cost |
855 | /// functions in that it is subtarget agnostic. This is useful when you e.g. |
856 | /// target one ISA such as Aarch32 but smaller encodings could be possible |
857 | /// with another such as Thumb. This return value is used as a penalty when |
858 | /// the total costs for a constant is calculated (the bigger the cost, the |
859 | /// more beneficial constant hoisting is). |
860 | InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, |
861 | const APInt &Imm, Type *Ty) const; |
862 | /// @} |
863 | |
864 | /// \name Vector Target Information |
865 | /// @{ |
866 | |
867 | /// The various kinds of shuffle patterns for vector queries. |
868 | enum ShuffleKind { |
869 | SK_Broadcast, ///< Broadcast element 0 to all other elements. |
870 | SK_Reverse, ///< Reverse the order of the vector. |
871 | SK_Select, ///< Selects elements from the corresponding lane of |
872 | ///< either source operand. This is equivalent to a |
873 | ///< vector select with a constant condition operand. |
874 | SK_Transpose, ///< Transpose two vectors. |
875 | SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. |
    SK_ExtractSubvector,  ///< ExtractSubvector Index indicates start offset.
877 | SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one |
878 | ///< with any shuffle mask. |
879 | SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any |
880 | ///< shuffle mask. |
881 | SK_Splice ///< Concatenates elements from the first input vector |
882 | ///< with elements of the second input vector. Returning |
883 | ///< a vector of the same type as the input vectors. |
884 | }; |
885 | |
886 | /// Additional information about an operand's possible values. |
887 | enum OperandValueKind { |
888 | OK_AnyValue, // Operand can have any value. |
889 | OK_UniformValue, // Operand is uniform (splat of a value). |
890 | OK_UniformConstantValue, // Operand is uniform constant. |
891 | OK_NonUniformConstantValue // Operand is a non uniform constant value. |
892 | }; |
893 | |
894 | /// Additional properties of an operand's values. |
895 | enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; |
896 | |
897 | /// \return the number of registers in the target-provided register class. |
898 | unsigned getNumberOfRegisters(unsigned ClassID) const; |
899 | |
900 | /// \return the target-provided register class ID for the provided type, |
901 | /// accounting for type promotion and other type-legalization techniques that |
902 | /// the target might apply. However, it specifically does not account for the |
903 | /// scalarization or splitting of vector types. Should a vector type require |
904 | /// scalarization or splitting into multiple underlying vector registers, that |
905 | /// type should be mapped to a register class containing no registers. |
906 | /// Specifically, this is designed to provide a simple, high-level view of the |
907 | /// register allocation later performed by the backend. These register classes |
908 | /// don't necessarily map onto the register classes used by the backend. |
909 | /// FIXME: It's not currently possible to determine how many registers |
910 | /// are used by the provided type. |
911 | unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; |
912 | |
913 | /// \return the target-provided register class name |
914 | const char *getRegisterClassName(unsigned ClassID) const; |
915 | |
916 | enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector }; |
917 | |
918 | /// \return The width of the largest scalar or vector register type. |
919 | TypeSize getRegisterBitWidth(RegisterKind K) const; |
920 | |
921 | /// \return The width of the smallest vector register type. |
922 | unsigned getMinVectorRegisterBitWidth() const; |
923 | |
924 | /// \return The maximum value of vscale if the target specifies an |
925 | /// architectural maximum vector length, and None otherwise. |
926 | Optional<unsigned> getMaxVScale() const; |
927 | |
928 | /// \return the value of vscale to tune the cost model for. |
929 | Optional<unsigned> getVScaleForTuning() const; |
930 | |
931 | /// \return True if the vectorization factor should be chosen to |
932 | /// make the vector of the smallest element type match the size of a |
933 | /// vector register. For wider element types, this could result in |
934 | /// creating vectors that span multiple vector registers. |
935 | /// If false, the vectorization factor will be chosen based on the |
936 | /// size of the widest element type. |
937 | bool shouldMaximizeVectorBandwidth() const; |
938 | |
939 | /// \return The minimum vectorization factor for types of given element |
940 | /// bit width, or 0 if there is no minimum VF. The returned value only |
941 | /// applies when shouldMaximizeVectorBandwidth returns true. |
942 | /// If IsScalable is true, the returned ElementCount must be a scalable VF. |
943 | ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; |
944 | |
945 | /// \return The maximum vectorization factor for types of given element |
946 | /// bit width and opcode, or 0 if there is no maximum VF. |
947 | /// Currently only used by the SLP vectorizer. |
948 | unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; |
949 | |
950 | /// \return True if it should be considered for address type promotion. |
951 | /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is |
952 | /// profitable without finding other extensions fed by the same input. |
953 | bool shouldConsiderAddressTypePromotion( |
      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
955 | |
956 | /// \return The size of a cache line in bytes. |
957 | unsigned getCacheLineSize() const; |
958 | |
959 | /// The possible cache levels |
960 | enum class CacheLevel { |
961 | L1D, // The L1 data cache |
962 | L2D, // The L2 data cache |
963 | |
964 | // We currently do not model L3 caches, as their sizes differ widely between |
965 | // microarchitectures. Also, we currently do not have a use for L3 cache |
966 | // size modeling yet. |
967 | }; |
968 | |
969 | /// \return The size of the cache level in bytes, if available. |
970 | Optional<unsigned> getCacheSize(CacheLevel Level) const; |
971 | |
972 | /// \return The associativity of the cache level, if available. |
973 | Optional<unsigned> getCacheAssociativity(CacheLevel Level) const; |
974 | |
975 | /// \return How much before a load we should place the prefetch |
976 | /// instruction. This is currently measured in number of |
977 | /// instructions. |
978 | unsigned getPrefetchDistance() const; |
979 | |
980 | /// Some HW prefetchers can handle accesses up to a certain constant stride. |
981 | /// Sometimes prefetching is beneficial even below the HW prefetcher limit, |
982 | /// and the arguments provided are meant to serve as a basis for deciding this |
983 | /// for a particular loop. |
984 | /// |
985 | /// \param NumMemAccesses Number of memory accesses in the loop. |
986 | /// \param NumStridedMemAccesses Number of the memory accesses that |
987 | /// ScalarEvolution could find a known stride |
988 | /// for. |
989 | /// \param NumPrefetches Number of software prefetches that will be |
990 | /// emitted as determined by the addresses |
991 | /// involved and the cache line size. |
992 | /// \param HasCall True if the loop contains a call. |
993 | /// |
994 | /// \return This is the minimum stride in bytes where it makes sense to start |
995 | /// adding SW prefetches. The default is 1, i.e. prefetch with any |
996 | /// stride. |
997 | unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
998 | unsigned NumStridedMemAccesses, |
999 | unsigned NumPrefetches, bool HasCall) const; |
1000 | |
1001 | /// \return The maximum number of iterations to prefetch ahead. If |
1002 | /// the required number of iterations is more than this number, no |
1003 | /// prefetching is performed. |
1004 | unsigned getMaxPrefetchIterationsAhead() const; |
1005 | |
1006 | /// \return True if prefetching should also be done for writes. |
1007 | bool enableWritePrefetching() const; |
1008 | |
1009 | /// \return The maximum interleave factor that any transform should try to |
1010 | /// perform for this target. This number depends on the level of parallelism |
1011 | /// and the number of execution units in the CPU. |
1012 | unsigned getMaxInterleaveFactor(unsigned VF) const; |
1013 | |
1014 | /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2. |
1015 | static OperandValueKind getOperandInfo(const Value *V, |
1016 | OperandValueProperties &OpProps); |
1017 | |
1018 | /// This is an approximation of reciprocal throughput of a math/logic op. |
1019 | /// A higher cost indicates less expected throughput. |
1020 | /// From Agner Fog's guides, reciprocal throughput is "the average number of |
1021 | /// clock cycles per instruction when the instructions are not part of a |
1022 | /// limiting dependency chain." |
1023 | /// Therefore, costs should be scaled to account for multiple execution units |
1024 | /// on the target that can process this type of instruction. For example, if |
1025 | /// there are 5 scalar integer units and 2 vector integer units that can |
1026 | /// calculate an 'add' in a single cycle, this model should indicate that the |
1027 | /// cost of the vector add instruction is 2.5 times the cost of the scalar |
1028 | /// add instruction. |
1029 | /// \p Args is an optional argument which holds the instruction operands |
1030 | /// values so the TTI can analyze those values searching for special |
1031 | /// cases or optimizations based on those values. |
1032 | /// \p CxtI is the optional original context instruction, if one exists, to |
1033 | /// provide even more information. |
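  ///
  /// For example (illustrative; Ctx is an LLVMContext in scope), the cost of
  /// a <4 x i32> add whose second operand is a uniform power-of-two constant
  /// could be queried as:
  /// \code
  ///   InstructionCost C = TTI.getArithmeticInstrCost(
  ///       Instruction::Add, FixedVectorType::get(Type::getInt32Ty(Ctx), 4),
  ///       TargetTransformInfo::TCK_RecipThroughput,
  ///       TargetTransformInfo::OK_AnyValue,
  ///       TargetTransformInfo::OK_UniformConstantValue,
  ///       TargetTransformInfo::OP_None, TargetTransformInfo::OP_PowerOf2);
  /// \endcode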
1034 | InstructionCost getArithmeticInstrCost( |
1035 | unsigned Opcode, Type *Ty, |
1036 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1037 | OperandValueKind Opd1Info = OK_AnyValue, |
1038 | OperandValueKind Opd2Info = OK_AnyValue, |
1039 | OperandValueProperties Opd1PropInfo = OP_None, |
1040 | OperandValueProperties Opd2PropInfo = OP_None, |
1041 | ArrayRef<const Value *> Args = ArrayRef<const Value *>(), |
1042 | const Instruction *CxtI = nullptr) const; |
1043 | |
1044 | /// \return The cost of a shuffle instruction of kind Kind and of type Tp. |
1045 | /// The exact mask may be passed as Mask, or else the array will be empty. |
1046 | /// The index and subtype parameters are used by the subvector insertion and |
1047 | /// extraction shuffle kinds to show the insert/extract point and the type of |
1048 | /// the subvector being inserted/extracted. |
1049 | /// NOTE: For subvector extractions Tp represents the source type. |
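  ///
  /// For example (illustrative; Ctx is an LLVMContext in scope), the cost of
  /// broadcasting lane 0 of a <4 x float> vector could be queried as:
  /// \code
  ///   InstructionCost C = TTI.getShuffleCost(
  ///       TargetTransformInfo::SK_Broadcast,
  ///       FixedVectorType::get(Type::getFloatTy(Ctx), 4));
  /// \endcode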
1050 | InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, |
1051 | ArrayRef<int> Mask = None, int Index = 0, |
1052 | VectorType *SubTp = nullptr) const; |
1053 | |
1054 | /// Represents a hint about the context in which a cast is used. |
1055 | /// |
1056 | /// For zext/sext, the context of the cast is the operand, which must be a |
  /// load of some kind. For trunc, the context of the cast is the single
1058 | /// user of the instruction, which must be a store of some kind. |
1059 | /// |
1060 | /// This enum allows the vectorizer to give getCastInstrCost an idea of the |
1061 | /// type of cast it's dealing with, as not every cast is equal. For instance, |
1062 | /// the zext of a load may be free, but the zext of an interleaving load can |
  /// be (very) expensive!
1064 | /// |
1065 | /// See \c getCastContextHint to compute a CastContextHint from a cast |
1066 | /// Instruction*. Callers can use it if they don't need to override the |
1067 | /// context and just want it to be calculated from the instruction. |
1068 | /// |
1069 | /// FIXME: This handles the types of load/store that the vectorizer can |
1070 | /// produce, which are the cases where the context instruction is most |
1071 | /// likely to be incorrect. There are other situations where that can happen |
1072 | /// too, which might be handled here but in the long run a more general |
  /// solution of costing multiple instructions at the same time may be better.
1074 | enum class CastContextHint : uint8_t { |
1075 | None, ///< The cast is not used with a load/store of any kind. |
1076 | Normal, ///< The cast is used with a normal load/store. |
1077 | Masked, ///< The cast is used with a masked load/store. |
1078 | GatherScatter, ///< The cast is used with a gather/scatter. |
1079 | Interleave, ///< The cast is used with an interleaved load/store. |
1080 | Reversed, ///< The cast is used with a reversed load/store. |
1081 | }; |
1082 | |
1083 | /// Calculates a CastContextHint from \p I. |
1084 | /// This should be used by callers of getCastInstrCost if they wish to |
1085 | /// determine the context from some instruction. |
1086 | /// \returns the CastContextHint for ZExt/SExt/Trunc, None if \p I is nullptr, |
1087 | /// or if it's another type of cast. |
1088 | static CastContextHint getCastContextHint(const Instruction *I); |
1089 | |
1090 | /// \return The expected cost of cast instructions, such as bitcast, trunc, |
1091 | /// zext, etc. If there is an existing instruction that holds Opcode, it |
1092 | /// may be passed in the 'I' parameter. |
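  ///
  /// For example (illustrative), given an existing zero-extend instruction
  /// *Ext from SrcTy to DstTy, its cost in the context it is actually used in
  /// could be queried as:
  /// \code
  ///   InstructionCost C = TTI.getCastInstrCost(
  ///       Instruction::ZExt, DstTy, SrcTy,
  ///       TargetTransformInfo::getCastContextHint(Ext),
  ///       TargetTransformInfo::TCK_RecipThroughput, Ext);
  /// \endcode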
1093 | InstructionCost |
1094 | getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
1095 | TTI::CastContextHint CCH, |
1096 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, |
1097 | const Instruction *I = nullptr) const; |
1098 | |
1099 | /// \return The expected cost of a sign- or zero-extended vector extract. Use |
1100 | /// -1 to indicate that there is no information about the index value. |
  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy,
                                           unsigned Index = -1) const;
1104 | |
1105 | /// \return The expected cost of control-flow related instructions such as |
1106 | /// Phi, Ret, Br, Switch. |
1107 | InstructionCost |
1108 | getCFInstrCost(unsigned Opcode, |
1109 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, |
1110 | const Instruction *I = nullptr) const; |
1111 | |
1112 | /// \returns The expected cost of compare and select instructions. If there |
1113 | /// is an existing instruction that holds Opcode, it may be passed in the |
1114 | /// 'I' parameter. The \p VecPred parameter can be used to indicate the select |
1115 | /// is using a compare with the specified predicate as condition. When vector |
1116 | /// types are passed, \p VecPred must be used for all lanes. |
1117 | InstructionCost |
1118 | getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
1119 | CmpInst::Predicate VecPred, |
1120 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1121 | const Instruction *I = nullptr) const; |
1122 | |
1123 | /// \return The expected cost of vector Insert and Extract. |
1124 | /// Use -1 to indicate that there is no information on the index value. |
1125 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
1126 | unsigned Index = -1) const; |
1127 | |
  /// \return The cost of a replication shuffle that repeats each of the \p VF
  /// elements of type \p EltTy \p ReplicationFactor times.
1130 | /// |
1131 | /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: |
1132 | /// <0,0,0,1,1,1,2,2,2,3,3,3> |
1133 | InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, |
1134 | int VF, |
1135 | const APInt &DemandedDstElts, |
1136 | TTI::TargetCostKind CostKind); |
1137 | |
1138 | /// \return The cost of Load and Store instructions. |
1139 | InstructionCost |
1140 | getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
1141 | unsigned AddressSpace, |
1142 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1143 | const Instruction *I = nullptr) const; |
1144 | |
1145 | /// \return The cost of VP Load and Store instructions. |
1146 | InstructionCost |
1147 | getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
1148 | unsigned AddressSpace, |
1149 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1150 | const Instruction *I = nullptr) const; |
1151 | |
1152 | /// \return The cost of masked Load and Store instructions. |
1153 | InstructionCost getMaskedMemoryOpCost( |
1154 | unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, |
1155 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1156 | |
  /// \return The cost of a Gather or Scatter operation.
  /// \p Opcode - the kind of memory access, Load or Store
  /// \p DataTy - a vector type of the data to be loaded or stored
  /// \p Ptr - pointer [or vector of pointers] - address[es] in memory
  /// \p VariableMask - true when the memory access is predicated with a mask
  ///                   that is not a compile-time constant
  /// \p Alignment - alignment of a single element
1164 | /// \p I - the optional original context instruction, if one exists, e.g. the |
1165 | /// load/store to transform or the call to the gather/scatter intrinsic |
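  ///
  /// For example, costing a gather of a vector value through a vector of
  /// pointers with a non-constant mask might look like (an illustrative
  /// sketch; \c TTI, \c DataTy and \c PtrVal are assumed to exist in the
  /// caller):
  /// \code
  ///   InstructionCost Cost = TTI.getGatherScatterOpCost(
  ///       Instruction::Load, DataTy, PtrVal, /*VariableMask=*/true, Align(4));
  /// \endcode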
1166 | InstructionCost getGatherScatterOpCost( |
1167 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
1168 | Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1169 | const Instruction *I = nullptr) const; |
1170 | |
1171 | /// \return The cost of the interleaved memory operation. |
1172 | /// \p Opcode is the memory operation code |
1173 | /// \p VecTy is the vector type of the interleaved access. |
1174 | /// \p Factor is the interleave factor |
1175 | /// \p Indices is the indices for interleaved load members (as interleaved |
1176 | /// load allows gaps) |
1177 | /// \p Alignment is the alignment of the memory operation |
  /// \p AddressSpace is the address space of the pointer.
1179 | /// \p UseMaskForCond indicates if the memory access is predicated. |
1180 | /// \p UseMaskForGaps indicates if gaps should be masked. |
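  ///
  /// For example, costing an interleaved load with factor 2 where both members
  /// of the group are used might look like (an illustrative sketch; \c TTI and
  /// \c WideVecTy are assumed to exist in the caller):
  /// \code
  ///   unsigned Indices[] = {0, 1};
  ///   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
  ///       Instruction::Load, WideVecTy, /*Factor=*/2, Indices, Align(4),
  ///       /*AddressSpace=*/0);
  /// \endcode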
1181 | InstructionCost getInterleavedMemoryOpCost( |
1182 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1183 | Align Alignment, unsigned AddressSpace, |
1184 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1185 | bool UseMaskForCond = false, bool UseMaskForGaps = false) const; |
1186 | |
  /// A helper function to determine whether a reduction with the given set of
  /// FastMathFlags \p FMF must be performed as an ordered reduction.
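  ///
  /// For example, an FP add reduction whose flags do not allow reassociation
  /// must be evaluated in lane order:
  /// \code
  ///   requiresOrderedReduction(FastMathFlags()); // true ('reassoc' not set)
  ///   requiresOrderedReduction(None);            // false (e.g. integer add)
  /// \endcode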
1189 | static bool requiresOrderedReduction(Optional<FastMathFlags> FMF) { |
1190 | return FMF != None && !(*FMF).allowReassoc(); |
1191 | } |
1192 | |
1193 | /// Calculate the cost of vector reduction intrinsics. |
1194 | /// |
1195 | /// This is the cost of reducing the vector value of type \p Ty to a scalar |
1196 | /// value using the operation denoted by \p Opcode. The FastMathFlags |
1197 | /// parameter \p FMF indicates what type of reduction we are performing: |
1198 | /// 1. Tree-wise. This is the typical 'fast' reduction performed that |
1199 | /// involves successively splitting a vector into half and doing the |
1200 | /// operation on the pair of halves until you have a scalar value. For |
1201 | /// example: |
1202 | /// (v0, v1, v2, v3) |
1203 | /// ((v0+v2), (v1+v3), undef, undef) |
1204 | /// ((v0+v2+v1+v3), undef, undef, undef) |
1205 | /// This is the default behaviour for integer operations, whereas for |
1206 | /// floating point we only do this if \p FMF indicates that |
1207 | /// reassociation is allowed. |
1208 | /// 2. Ordered. For a vector with N elements this involves performing N |
1209 | /// operations in lane order, starting with an initial scalar value, i.e. |
1210 | /// result = InitVal + v0 |
1211 | /// result = result + v1 |
1212 | /// result = result + v2 |
1213 | /// result = result + v3 |
1214 | /// This is only the case for FP operations and when reassociation is not |
1215 | /// allowed. |
1216 | /// |
1217 | InstructionCost getArithmeticReductionCost( |
1218 | unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF, |
1219 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1220 | |
  /// \returns The cost of a vector min/max reduction of type \p Ty, where
  /// \p CondTy is the type of the compare condition and \p IsUnsigned selects
  /// an unsigned rather than a signed integer min/max for integer elements.
  InstructionCost getMinMaxReductionCost(
      VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
1224 | |
1225 | /// Calculate the cost of an extended reduction pattern, similar to |
1226 | /// getArithmeticReductionCost of an Add reduction with an extension and |
  /// optional multiply. This is the cost of:
  /// ResTy vecreduce.add(ext(Ty A)), or if IsMLA flag is set then:
  /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B))). The reduction happens
1230 | /// on a VectorType with ResTy elements and Ty lanes. |
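  ///
  /// For example, the cost of summing the products of two sign-extended
  /// vectors of i16 into an i32 result could be queried as follows (an
  /// illustrative sketch; \c TTI and the LLVMContext \c Ctx are assumed to
  /// exist in the caller):
  /// \code
  ///   auto *VecTy = FixedVectorType::get(Type::getInt16Ty(Ctx), 8);
  ///   InstructionCost Cost = TTI.getExtendedAddReductionCost(
  ///       /*IsMLA=*/true, /*IsUnsigned=*/false,
  ///       /*ResTy=*/Type::getInt32Ty(Ctx), VecTy);
  /// \endcode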
1231 | InstructionCost getExtendedAddReductionCost( |
1232 | bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, |
1233 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1234 | |
1235 | /// \returns The cost of Intrinsic instructions. Analyses the real arguments. |
1236 | /// Three cases are handled: 1. scalar instruction 2. vector instruction |
1237 | /// 3. scalar instruction which is to be vectorized. |
1238 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1239 | TTI::TargetCostKind CostKind) const; |
1240 | |
1241 | /// \returns The cost of Call instructions. |
1242 | InstructionCost getCallInstrCost( |
1243 | Function *F, Type *RetTy, ArrayRef<Type *> Tys, |
1244 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const; |
1245 | |
1246 | /// \returns The number of pieces into which the provided type must be |
1247 | /// split during legalization. Zero is returned when the answer is unknown. |
1248 | unsigned getNumberOfParts(Type *Tp) const; |
1249 | |
1250 | /// \returns The cost of the address computation. For most targets this can be |
1251 | /// merged into the instruction indexing mode. Some targets might want to |
1252 | /// distinguish between address computation for memory operations on vector |
1253 | /// types and scalar types. Such targets should override this function. |
  /// The 'SE' parameter holds a pointer to the scalar evolution object, which
  /// is used to obtain the step value of the pointer in the case of a constant
  /// stride. The 'Ptr' parameter holds the SCEV of the access pointer.
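  ///
  /// For example (an illustrative sketch; \c TTI, \c AccessTy, \c SE and
  /// \c PtrSCEV are assumed to exist in the caller):
  /// \code
  ///   InstructionCost AddrCost =
  ///       TTI.getAddressComputationCost(AccessTy, &SE, PtrSCEV);
  /// \endcode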
1257 | InstructionCost getAddressComputationCost(Type *Ty, |
1258 | ScalarEvolution *SE = nullptr, |
1259 | const SCEV *Ptr = nullptr) const; |
1260 | |
1261 | /// \returns The cost, if any, of keeping values of the given types alive |
1262 | /// over a callsite. |
1263 | /// |
1264 | /// Some types may require the use of register classes that do not have |
1265 | /// any callee-saved registers, so would require a spill and fill. |
1266 | InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const; |
1267 | |
1268 | /// \returns True if the intrinsic is a supported memory intrinsic. Info |
  /// will contain additional information - whether the intrinsic may read
  /// or write memory, whether it is volatile, and the pointer it operates
  /// on. Info is undefined if false is returned.
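  ///
  /// For example (an illustrative sketch; \c TTI and the IntrinsicInst pointer
  /// \c II are assumed to exist in the caller):
  /// \code
  ///   MemIntrinsicInfo Info;
  ///   if (TTI.getTgtMemIntrinsic(II, Info) && Info.isUnordered()) {
  ///     // The intrinsic behaves like an unordered load/store of Info.PtrVal.
  ///   }
  /// \endcode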
1272 | bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; |
1273 | |
1274 | /// \returns The maximum element size, in bytes, for an element |
1275 | /// unordered-atomic memory intrinsic. |
1276 | unsigned getAtomicMemIntrinsicMaxElementSize() const; |
1277 | |
1278 | /// \returns A value which is the result of the given memory intrinsic. New |
1279 | /// instructions may be created to extract the result from the given intrinsic |
1280 | /// memory operation. Returns nullptr if the target cannot create a result |
1281 | /// from the given intrinsic. |
1282 | Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
1283 | Type *ExpectedType) const; |
1284 | |
1285 | /// \returns The type to use in a loop expansion of a memcpy call. |
1286 | Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, |
1287 | unsigned SrcAddrSpace, unsigned DestAddrSpace, |
1288 | unsigned SrcAlign, unsigned DestAlign) const; |
1289 | |
1290 | /// \param[out] OpsOut The operand types to copy RemainingBytes of memory. |
1291 | /// \param RemainingBytes The number of bytes to copy. |
1292 | /// |
1293 | /// Calculates the operand types to use when copying \p RemainingBytes of |
1294 | /// memory, where source and destination alignments are \p SrcAlign and |
1295 | /// \p DestAlign respectively. |
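  ///
  /// For example, a memcpy expansion might query the residual operand types
  /// left over after the main copy loop (an illustrative sketch; \c TTI and
  /// \c Ctx are assumed to exist and the numeric values are placeholders):
  /// \code
  ///   SmallVector<Type *, 4> ResidualTys;
  ///   TTI.getMemcpyLoopResidualLoweringType(ResidualTys, Ctx,
  ///                                         /*RemainingBytes=*/7,
  ///                                         /*SrcAddrSpace=*/0,
  ///                                         /*DestAddrSpace=*/0,
  ///                                         /*SrcAlign=*/4, /*DestAlign=*/4);
  /// \endcode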
1296 | void getMemcpyLoopResidualLoweringType( |
1297 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
1298 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
1299 | unsigned SrcAlign, unsigned DestAlign) const; |
1300 | |
1301 | /// \returns True if the two functions have compatible attributes for inlining |
1302 | /// purposes. |
1303 | bool areInlineCompatible(const Function *Caller, |
1304 | const Function *Callee) const; |
1305 | |
1306 | /// \returns True if the caller and callee agree on how \p Types will be |
  /// passed to or returned from the callee.
1309 | /// \param Types List of types to check. |
1310 | bool areTypesABICompatible(const Function *Caller, const Function *Callee, |
1311 | const ArrayRef<Type *> &Types) const; |
1312 | |
1313 | /// The type of load/store indexing. |
1314 | enum MemIndexedMode { |
1315 | MIM_Unindexed, ///< No indexing. |
1316 | MIM_PreInc, ///< Pre-incrementing. |
1317 | MIM_PreDec, ///< Pre-decrementing. |
1318 | MIM_PostInc, ///< Post-incrementing. |
1319 | MIM_PostDec ///< Post-decrementing. |
1320 | }; |
1321 | |
1322 | /// \returns True if the specified indexed load for the given type is legal. |
1323 | bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const; |
1324 | |
1325 | /// \returns True if the specified indexed store for the given type is legal. |
1326 | bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const; |
1327 | |
1328 | /// \returns The bitwidth of the largest vector type that should be used to |
1329 | /// load/store in the given address space. |
1330 | unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; |
1331 | |
1332 | /// \returns True if the load instruction is legal to vectorize. |
1333 | bool isLegalToVectorizeLoad(LoadInst *LI) const; |
1334 | |
1335 | /// \returns True if the store instruction is legal to vectorize. |
1336 | bool isLegalToVectorizeStore(StoreInst *SI) const; |
1337 | |
1338 | /// \returns True if it is legal to vectorize the given load chain. |
1339 | bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, |
1340 | unsigned AddrSpace) const; |
1341 | |
1342 | /// \returns True if it is legal to vectorize the given store chain. |
1343 | bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, |
1344 | unsigned AddrSpace) const; |
1345 | |
1346 | /// \returns True if it is legal to vectorize the given reduction kind. |
1347 | bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, |
1348 | ElementCount VF) const; |
1349 | |
1350 | /// \returns True if the given type is supported for scalable vectors |
1351 | bool isElementTypeLegalForScalableVector(Type *Ty) const; |
1352 | |
  /// \returns The new vector factor value if the target doesn't support \p
  /// ChainSizeInBytes loads or has a better vector factor.
1355 | unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
1356 | unsigned ChainSizeInBytes, |
1357 | VectorType *VecTy) const; |
1358 | |
  /// \returns The new vector factor value if the target doesn't support \p
  /// ChainSizeInBytes stores or has a better vector factor.
1361 | unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
1362 | unsigned ChainSizeInBytes, |
1363 | VectorType *VecTy) const; |
1364 | |
1365 | /// Flags describing the kind of vector reduction. |
1366 | struct ReductionFlags { |
1367 | ReductionFlags() = default; |
    bool IsMaxOp =
        false; ///< If the op is a min/max kind, true if it's a max operation.
1370 | bool IsSigned = false; ///< Whether the operation is a signed int reduction. |
1371 | bool NoNaN = |
1372 | false; ///< If op is an fp min/max, whether NaNs may be present. |
1373 | }; |
1374 | |
  /// \returns True if the target prefers in-loop reductions for the given
  /// \p Opcode and \p Ty.
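  ///
  /// For example (an illustrative sketch; \c TTI and \c Ty are assumed to
  /// exist in the caller):
  /// \code
  ///   TTI::ReductionFlags Flags;
  ///   Flags.IsSigned = true;
  ///   bool InLoop = TTI.preferInLoopReduction(Instruction::Add, Ty, Flags);
  /// \endcode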
1376 | bool preferInLoopReduction(unsigned Opcode, Type *Ty, |
1377 | ReductionFlags Flags) const; |
1378 | |
  /// \returns True if the target prefers the reduction select to be kept in
  /// the loop when tail folding, i.e.
1381 | /// loop: |
1382 | /// p = phi (0, s) |
1383 | /// a = add (p, x) |
1384 | /// s = select (mask, a, p) |
1385 | /// vecreduce.add(s) |
1386 | /// |
1387 | /// As opposed to the normal scheme of p = phi (0, a) which allows the select |
1388 | /// to be pulled out of the loop. If the select(.., add, ..) can be predicated |
1389 | /// by the target, this can lead to cleaner code generation. |
1390 | bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, |
1391 | ReductionFlags Flags) const; |
1392 | |
1393 | /// \returns True if the target wants to expand the given reduction intrinsic |
1394 | /// into a shuffle sequence. |
1395 | bool shouldExpandReduction(const IntrinsicInst *II) const; |
1396 | |
1397 | /// \returns the size cost of rematerializing a GlobalValue address relative |
1398 | /// to a stack reload. |
1399 | unsigned getGISelRematGlobalCost() const; |
1400 | |
1401 | /// \returns True if the target supports scalable vectors. |
1402 | bool supportsScalableVectors() const; |
1403 | |
1404 | /// \return true when scalable vectorization is preferred. |
1405 | bool enableScalableVectorization() const; |
1406 | |
1407 | /// \name Vector Predication Information |
1408 | /// @{ |
  /// Whether the target supports the %evl parameter of VP intrinsics
  /// efficiently in hardware, for the given opcode and type/alignment (see
  /// LLVM Language Reference - "Vector Predication Intrinsics").
1412 | /// Use of %evl is discouraged when that is not the case. |
1413 | bool hasActiveVectorLength(unsigned Opcode, Type *DataType, |
1414 | Align Alignment) const; |
1415 | |
1416 | struct VPLegalization { |
1417 | enum VPTransform { |
1418 | // keep the predicating parameter |
1419 | Legal = 0, |
1420 | // where legal, discard the predicate parameter |
1421 | Discard = 1, |
1422 | // transform into something else that is also predicating |
1423 | Convert = 2 |
1424 | }; |
1425 | |
1426 | // How to transform the EVL parameter. |
1427 | // Legal: keep the EVL parameter as it is. |
1428 | // Discard: Ignore the EVL parameter where it is safe to do so. |
1429 | // Convert: Fold the EVL into the mask parameter. |
1430 | VPTransform EVLParamStrategy; |
1431 | |
1432 | // How to transform the operator. |
1433 | // Legal: The target supports this operator. |
1434 | // Convert: Convert this to a non-VP operation. |
1435 | // The 'Discard' strategy is invalid. |
1436 | VPTransform OpStrategy; |
1437 | |
1438 | bool shouldDoNothing() const { |
1439 | return (EVLParamStrategy == Legal) && (OpStrategy == Legal); |
1440 | } |
1441 | VPLegalization(VPTransform EVLParamStrategy, VPTransform OpStrategy) |
1442 | : EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {} |
1443 | }; |
1444 | |
1445 | /// \returns How the target needs this vector-predicated operation to be |
1446 | /// transformed. |
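  ///
  /// For example, a VP expansion pass could consult the returned strategy to
  /// decide whether a vector-predicated intrinsic needs rewriting (an
  /// illustrative sketch; \c TTI and \c VPI are assumed to exist in the
  /// caller):
  /// \code
  ///   VPLegalization VPL = TTI.getVPLegalizationStrategy(VPI);
  ///   if (VPL.shouldDoNothing())
  ///     return; // Both the EVL parameter and the operation are legal as-is.
  /// \endcode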
1447 | VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const; |
1448 | /// @} |
1449 | |
1450 | /// @} |
1451 | |
1452 | private: |
  /// Estimate the latency of the specified instruction.
1454 | /// Returns 1 as the default value. |
1455 | InstructionCost getInstructionLatency(const Instruction *I) const; |
1456 | |
1457 | /// Returns the expected throughput cost of the instruction. |
1458 | /// Returns -1 if the cost is unknown. |
1459 | InstructionCost getInstructionThroughput(const Instruction *I) const; |
1460 | |
1461 | /// The abstract base class used to type erase specific TTI |
1462 | /// implementations. |
1463 | class Concept; |
1464 | |
1465 | /// The template model for the base class which wraps a concrete |
1466 | /// implementation in a type erased interface. |
1467 | template <typename T> class Model; |
1468 | |
1469 | std::unique_ptr<Concept> TTIImpl; |
1470 | }; |
1471 | |
1472 | class TargetTransformInfo::Concept { |
1473 | public: |
1474 | virtual ~Concept() = 0; |
1475 | virtual const DataLayout &getDataLayout() const = 0; |
1476 | virtual InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, |
1477 | ArrayRef<const Value *> Operands, |
1478 | TTI::TargetCostKind CostKind) = 0; |
1479 | virtual unsigned getInliningThresholdMultiplier() = 0; |
1480 | virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0; |
1481 | virtual int getInlinerVectorBonusPercent() = 0; |
1482 | virtual InstructionCost getMemcpyCost(const Instruction *I) = 0; |
1483 | virtual unsigned |
1484 | getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, |
1485 | ProfileSummaryInfo *PSI, |
1486 | BlockFrequencyInfo *BFI) = 0; |
1487 | virtual InstructionCost getUserCost(const User *U, |
1488 | ArrayRef<const Value *> Operands, |
1489 | TargetCostKind CostKind) = 0; |
1490 | virtual BranchProbability getPredictableBranchThreshold() = 0; |
1491 | virtual bool hasBranchDivergence() = 0; |
1492 | virtual bool useGPUDivergenceAnalysis() = 0; |
1493 | virtual bool isSourceOfDivergence(const Value *V) = 0; |
1494 | virtual bool isAlwaysUniform(const Value *V) = 0; |
1495 | virtual unsigned getFlatAddressSpace() = 0; |
1496 | virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
1497 | Intrinsic::ID IID) const = 0; |
1498 | virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0; |
1499 | virtual bool |
1500 | canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const = 0; |
1501 | virtual unsigned getAssumedAddrSpace(const Value *V) const = 0; |
1502 | virtual std::pair<const Value *, unsigned> |
1503 | getPredicatedAddrSpace(const Value *V) const = 0; |
1504 | virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, |
1505 | Value *OldV, |
1506 | Value *NewV) const = 0; |
1507 | virtual bool isLoweredToCall(const Function *F) = 0; |
  virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                       UnrollingPreferences &UP,
                                       OptimizationRemarkEmitter *ORE) = 0;
1511 | virtual void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1512 | PeelingPreferences &PP) = 0; |
1513 | virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
1514 | AssumptionCache &AC, |
1515 | TargetLibraryInfo *LibInfo, |
1516 | HardwareLoopInfo &HWLoopInfo) = 0; |
1517 | virtual bool |
1518 | preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, |
1519 | AssumptionCache &AC, TargetLibraryInfo *TLI, |
1520 | DominatorTree *DT, const LoopAccessInfo *LAI) = 0; |
1521 | virtual bool emitGetActiveLaneMask() = 0; |
1522 | virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
1523 | IntrinsicInst &II) = 0; |
1524 | virtual Optional<Value *> |
1525 | simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
1526 | APInt DemandedMask, KnownBits &Known, |
1527 | bool &KnownBitsComputed) = 0; |
1528 | virtual Optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
1529 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
1530 | APInt &UndefElts2, APInt &UndefElts3, |
1531 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
1532 | SimplifyAndSetOp) = 0; |
1533 | virtual bool isLegalAddImmediate(int64_t Imm) = 0; |
1534 | virtual bool isLegalICmpImmediate(int64_t Imm) = 0; |
1535 | virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, |
1536 | int64_t BaseOffset, bool HasBaseReg, |
1537 | int64_t Scale, unsigned AddrSpace, |
1538 | Instruction *I) = 0; |
1539 | virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, |
1540 | TargetTransformInfo::LSRCost &C2) = 0; |
1541 | virtual bool isNumRegsMajorCostOfLSR() = 0; |
1542 | virtual bool isProfitableLSRChainElement(Instruction *I) = 0; |
1543 | virtual bool canMacroFuseCmp() = 0; |
1544 | virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, |
1545 | LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, |
1546 | TargetLibraryInfo *LibInfo) = 0; |
1547 | virtual AddressingModeKind |
1548 | getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0; |
1549 | virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0; |
1550 | virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; |
1551 | virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; |
1552 | virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; |
1553 | virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; |
1554 | virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0; |
1555 | virtual bool forceScalarizeMaskedGather(VectorType *DataType, |
1556 | Align Alignment) = 0; |
1557 | virtual bool forceScalarizeMaskedScatter(VectorType *DataType, |
1558 | Align Alignment) = 0; |
1559 | virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; |
1560 | virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0; |
1561 | virtual bool enableOrderedReductions() = 0; |
1562 | virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0; |
1563 | virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0; |
1564 | virtual bool prefersVectorizedAddressing() = 0; |
1565 | virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
1566 | int64_t BaseOffset, |
1567 | bool HasBaseReg, int64_t Scale, |
1568 | unsigned AddrSpace) = 0; |
1569 | virtual bool LSRWithInstrQueries() = 0; |
1570 | virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0; |
1571 | virtual bool isProfitableToHoist(Instruction *I) = 0; |
1572 | virtual bool useAA() = 0; |
1573 | virtual bool isTypeLegal(Type *Ty) = 0; |
1574 | virtual InstructionCost getRegUsageForType(Type *Ty) = 0; |
1575 | virtual bool shouldBuildLookupTables() = 0; |
1576 | virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; |
1577 | virtual bool shouldBuildRelLookupTables() = 0; |
1578 | virtual bool useColdCCForColdCall(Function &F) = 0; |
  virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
                                                   const APInt &DemandedElts,
                                                   bool Insert,
                                                   bool Extract) = 0;
1583 | virtual InstructionCost |
1584 | getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
1585 | ArrayRef<Type *> Tys) = 0; |
1586 | virtual bool supportsEfficientVectorElementLoadStore() = 0; |
1587 | virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; |
1588 | virtual MemCmpExpansionOptions |
1589 | enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; |
1590 | virtual bool enableInterleavedAccessVectorization() = 0; |
1591 | virtual bool enableMaskedInterleavedAccessVectorization() = 0; |
1592 | virtual bool isFPVectorizationPotentiallyUnsafe() = 0; |
1593 | virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, |
1594 | unsigned BitWidth, |
1595 | unsigned AddressSpace, |
1596 | Align Alignment, |
1597 | bool *Fast) = 0; |
1598 | virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0; |
1599 | virtual bool haveFastSqrt(Type *Ty) = 0; |
1600 | virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0; |
1601 | virtual InstructionCost getFPOpCost(Type *Ty) = 0; |
1602 | virtual InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, |
1603 | const APInt &Imm, Type *Ty) = 0; |
1604 | virtual InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, |
1605 | TargetCostKind CostKind) = 0; |
1606 | virtual InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx, |
1607 | const APInt &Imm, Type *Ty, |
1608 | TargetCostKind CostKind, |
1609 | Instruction *Inst = nullptr) = 0; |
1610 | virtual InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
1611 | const APInt &Imm, Type *Ty, |
1612 | TargetCostKind CostKind) = 0; |
1613 | virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0; |
1614 | virtual unsigned getRegisterClassForType(bool Vector, |
1615 | Type *Ty = nullptr) const = 0; |
1616 | virtual const char *getRegisterClassName(unsigned ClassID) const = 0; |
1617 | virtual TypeSize getRegisterBitWidth(RegisterKind K) const = 0; |
1618 | virtual unsigned getMinVectorRegisterBitWidth() const = 0; |
1619 | virtual Optional<unsigned> getMaxVScale() const = 0; |
1620 | virtual Optional<unsigned> getVScaleForTuning() const = 0; |
1621 | virtual bool shouldMaximizeVectorBandwidth() const = 0; |
1622 | virtual ElementCount getMinimumVF(unsigned ElemWidth, |
1623 | bool IsScalable) const = 0; |
1624 | virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; |
  virtual bool shouldConsiderAddressTypePromotion(
      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
1627 | virtual unsigned getCacheLineSize() const = 0; |
1628 | virtual Optional<unsigned> getCacheSize(CacheLevel Level) const = 0; |
1629 | virtual Optional<unsigned> getCacheAssociativity(CacheLevel Level) const = 0; |
1630 | |
1631 | /// \return How much before a load we should place the prefetch |
1632 | /// instruction. This is currently measured in number of |
1633 | /// instructions. |
1634 | virtual unsigned getPrefetchDistance() const = 0; |
1635 | |
1636 | /// \return Some HW prefetchers can handle accesses up to a certain |
1637 | /// constant stride. This is the minimum stride in bytes where it |
1638 | /// makes sense to start adding SW prefetches. The default is 1, |
1639 | /// i.e. prefetch with any stride. Sometimes prefetching is beneficial |
1640 | /// even below the HW prefetcher limit, and the arguments provided are |
1641 | /// meant to serve as a basis for deciding this for a particular loop. |
1642 | virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
1643 | unsigned NumStridedMemAccesses, |
1644 | unsigned NumPrefetches, |
1645 | bool HasCall) const = 0; |
1646 | |
1647 | /// \return The maximum number of iterations to prefetch ahead. If |
1648 | /// the required number of iterations is more than this number, no |
1649 | /// prefetching is performed. |
1650 | virtual unsigned getMaxPrefetchIterationsAhead() const = 0; |
1651 | |
1652 | /// \return True if prefetching should also be done for writes. |
1653 | virtual bool enableWritePrefetching() const = 0; |
1654 | |
1655 | virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0; |
1656 | virtual InstructionCost getArithmeticInstrCost( |
1657 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
1658 | OperandValueKind Opd1Info, OperandValueKind Opd2Info, |
1659 | OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, |
1660 | ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0; |
1661 | virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, |
1662 | ArrayRef<int> Mask, int Index, |
1663 | VectorType *SubTp) = 0; |
1664 | virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, |
1665 | Type *Src, CastContextHint CCH, |
1666 | TTI::TargetCostKind CostKind, |
1667 | const Instruction *I) = 0; |
  virtual InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                                   VectorType *VecTy,
                                                   unsigned Index) = 0;
1671 | virtual InstructionCost getCFInstrCost(unsigned Opcode, |
1672 | TTI::TargetCostKind CostKind, |
1673 | const Instruction *I = nullptr) = 0; |
1674 | virtual InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
1675 | Type *CondTy, |
1676 | CmpInst::Predicate VecPred, |
1677 | TTI::TargetCostKind CostKind, |
1678 | const Instruction *I) = 0; |
1679 | virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
1680 | unsigned Index) = 0; |
1681 | |
1682 | virtual InstructionCost |
1683 | getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, |
1684 | const APInt &DemandedDstElts, |
1685 | TTI::TargetCostKind CostKind) = 0; |
1686 | |
1687 | virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, |
1688 | Align Alignment, |
1689 | unsigned AddressSpace, |
1690 | TTI::TargetCostKind CostKind, |
1691 | const Instruction *I) = 0; |
1692 | virtual InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, |
1693 | Align Alignment, |
1694 | unsigned AddressSpace, |
1695 | TTI::TargetCostKind CostKind, |
1696 | const Instruction *I) = 0; |
1697 | virtual InstructionCost |
1698 | getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
1699 | unsigned AddressSpace, |
1700 | TTI::TargetCostKind CostKind) = 0; |
1701 | virtual InstructionCost |
1702 | getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, |
1703 | bool VariableMask, Align Alignment, |
1704 | TTI::TargetCostKind CostKind, |
1705 | const Instruction *I = nullptr) = 0; |
1706 | |
1707 | virtual InstructionCost getInterleavedMemoryOpCost( |
1708 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1709 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
1710 | bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; |
1711 | virtual InstructionCost |
1712 | getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
1713 | Optional<FastMathFlags> FMF, |
1714 | TTI::TargetCostKind CostKind) = 0; |
1715 | virtual InstructionCost |
1716 | getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, |
1717 | TTI::TargetCostKind CostKind) = 0; |
1718 | virtual InstructionCost getExtendedAddReductionCost( |
1719 | bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, |
1720 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0; |
1721 | virtual InstructionCost |
1722 | getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
1723 | TTI::TargetCostKind CostKind) = 0; |
1724 | virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy, |
1725 | ArrayRef<Type *> Tys, |
1726 | TTI::TargetCostKind CostKind) = 0; |
1727 | virtual unsigned getNumberOfParts(Type *Tp) = 0; |
1728 | virtual InstructionCost |
1729 | getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0; |
1730 | virtual InstructionCost |
1731 | getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0; |
1732 | virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst, |
1733 | MemIntrinsicInfo &Info) = 0; |
1734 | virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0; |
1735 | virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
1736 | Type *ExpectedType) = 0; |
1737 | virtual Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, |
1738 | unsigned SrcAddrSpace, |
1739 | unsigned DestAddrSpace, |
1740 | unsigned SrcAlign, |
1741 | unsigned DestAlign) const = 0; |
1742 | virtual void getMemcpyLoopResidualLoweringType( |
1743 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
1744 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
1745 | unsigned SrcAlign, unsigned DestAlign) const = 0; |
1746 | virtual bool areInlineCompatible(const Function *Caller, |
1747 | const Function *Callee) const = 0; |
1748 | virtual bool areTypesABICompatible(const Function *Caller, |
1749 | const Function *Callee, |
1750 | const ArrayRef<Type *> &Types) const = 0; |
1751 | virtual bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const = 0; |
1752 | virtual bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const = 0; |
1753 | virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0; |
1754 | virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0; |
1755 | virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0; |
1756 | virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
1757 | Align Alignment, |
1758 | unsigned AddrSpace) const = 0; |
1759 | virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
1760 | Align Alignment, |
1761 | unsigned AddrSpace) const = 0; |
1762 | virtual bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, |
1763 | ElementCount VF) const = 0; |
1764 | virtual bool isElementTypeLegalForScalableVector(Type *Ty) const = 0; |
1765 | virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
1766 | unsigned ChainSizeInBytes, |
1767 | VectorType *VecTy) const = 0; |
1768 | virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
1769 | unsigned ChainSizeInBytes, |
1770 | VectorType *VecTy) const = 0; |
1771 | virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, |
1772 | ReductionFlags) const = 0; |
1773 | virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, |
1774 | ReductionFlags) const = 0; |
1775 | virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; |
1776 | virtual unsigned getGISelRematGlobalCost() const = 0; |
1777 | virtual bool enableScalableVectorization() const = 0; |
1778 | virtual bool supportsScalableVectors() const = 0; |
1779 | virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType, |
1780 | Align Alignment) const = 0; |
1781 | virtual InstructionCost getInstructionLatency(const Instruction *I) = 0; |
1782 | virtual VPLegalization |
1783 | getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; |
1784 | }; |
1785 | |
1786 | template <typename T> |
1787 | class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { |
1788 | T Impl; |
1789 | |
1790 | public: |
1791 | Model(T Impl) : Impl(std::move(Impl)) {} |
1792 | ~Model() override = default; |
1793 | |
1794 | const DataLayout &getDataLayout() const override { |
1795 | return Impl.getDataLayout(); |
1796 | } |
1797 | |
1798 | InstructionCost |
1799 | getGEPCost(Type *PointeeType, const Value *Ptr, |
1800 | ArrayRef<const Value *> Operands, |
1801 | TargetTransformInfo::TargetCostKind CostKind) override { |
1802 | return Impl.getGEPCost(PointeeType, Ptr, Operands, CostKind); |
1803 | } |
1804 | unsigned getInliningThresholdMultiplier() override { |
1805 | return Impl.getInliningThresholdMultiplier(); |
1806 | } |
1807 | unsigned adjustInliningThreshold(const CallBase *CB) override { |
1808 | return Impl.adjustInliningThreshold(CB); |
1809 | } |
1810 | int getInlinerVectorBonusPercent() override { |
1811 | return Impl.getInlinerVectorBonusPercent(); |
1812 | } |
1813 | InstructionCost getMemcpyCost(const Instruction *I) override { |
1814 | return Impl.getMemcpyCost(I); |
1815 | } |
1816 | InstructionCost getUserCost(const User *U, ArrayRef<const Value *> Operands, |
1817 | TargetCostKind CostKind) override { |
1818 | return Impl.getUserCost(U, Operands, CostKind); |
1819 | } |
1820 | BranchProbability getPredictableBranchThreshold() override { |
1821 | return Impl.getPredictableBranchThreshold(); |
1822 | } |
1823 | bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); } |
1824 | bool useGPUDivergenceAnalysis() override { |
1825 | return Impl.useGPUDivergenceAnalysis(); |
1826 | } |
1827 | bool isSourceOfDivergence(const Value *V) override { |
1828 | return Impl.isSourceOfDivergence(V); |
1829 | } |
1830 | |
1831 | bool isAlwaysUniform(const Value *V) override { |
1832 | return Impl.isAlwaysUniform(V); |
1833 | } |
1834 | |
1835 | unsigned getFlatAddressSpace() override { return Impl.getFlatAddressSpace(); } |
1836 | |
1837 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
1838 | Intrinsic::ID IID) const override { |
1839 | return Impl.collectFlatAddressOperands(OpIndexes, IID); |
1840 | } |
1841 | |
1842 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override { |
1843 | return Impl.isNoopAddrSpaceCast(FromAS, ToAS); |
1844 | } |
1845 | |
1846 | bool |
1847 | canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override { |
1848 | return Impl.canHaveNonUndefGlobalInitializerInAddressSpace(AS); |
1849 | } |
1850 | |
1851 | unsigned getAssumedAddrSpace(const Value *V) const override { |
1852 | return Impl.getAssumedAddrSpace(V); |
1853 | } |
1854 | |
1855 | std::pair<const Value *, unsigned> |
1856 | getPredicatedAddrSpace(const Value *V) const override { |
1857 | return Impl.getPredicatedAddrSpace(V); |
1858 | } |
1859 | |
1860 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, |
1861 | Value *NewV) const override { |
1862 | return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV); |
1863 | } |
1864 | |
1865 | bool isLoweredToCall(const Function *F) override { |
1866 | return Impl.isLoweredToCall(F); |
1867 | } |
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) override {
1871 | return Impl.getUnrollingPreferences(L, SE, UP, ORE); |
1872 | } |
1873 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1874 | PeelingPreferences &PP) override { |
1875 | return Impl.getPeelingPreferences(L, SE, PP); |
1876 | } |
1877 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
1878 | AssumptionCache &AC, TargetLibraryInfo *LibInfo, |
1879 | HardwareLoopInfo &HWLoopInfo) override { |
1880 | return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); |
1881 | } |
1882 | bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, |
1883 | AssumptionCache &AC, TargetLibraryInfo *TLI, |
1884 | DominatorTree *DT, |
1885 | const LoopAccessInfo *LAI) override { |
1886 | return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); |
1887 | } |
1888 | bool emitGetActiveLaneMask() override { |
1889 | return Impl.emitGetActiveLaneMask(); |
1890 | } |
1891 | Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, |
1892 | IntrinsicInst &II) override { |
1893 | return Impl.instCombineIntrinsic(IC, II); |
1894 | } |
1895 | Optional<Value *> |
1896 | simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
1897 | APInt DemandedMask, KnownBits &Known, |
1898 | bool &KnownBitsComputed) override { |
1899 | return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, |
1900 | KnownBitsComputed); |
1901 | } |
1902 | Optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
1903 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
1904 | APInt &UndefElts2, APInt &UndefElts3, |
1905 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
1906 | SimplifyAndSetOp) override { |
1907 | return Impl.simplifyDemandedVectorEltsIntrinsic( |
1908 | IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, |
1909 | SimplifyAndSetOp); |
1910 | } |
1911 | bool isLegalAddImmediate(int64_t Imm) override { |
1912 | return Impl.isLegalAddImmediate(Imm); |
1913 | } |
1914 | bool isLegalICmpImmediate(int64_t Imm) override { |
1915 | return Impl.isLegalICmpImmediate(Imm); |
1916 | } |
1917 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, |
1918 | bool HasBaseReg, int64_t Scale, unsigned AddrSpace, |
1919 | Instruction *I) override { |
1920 | return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, |
1921 | AddrSpace, I); |
1922 | } |
1923 | bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, |
1924 | TargetTransformInfo::LSRCost &C2) override { |
1925 | return Impl.isLSRCostLess(C1, C2); |
1926 | } |
1927 | bool isNumRegsMajorCostOfLSR() override { |
1928 | return Impl.isNumRegsMajorCostOfLSR(); |
1929 | } |
1930 | bool isProfitableLSRChainElement(Instruction *I) override { |
1931 | return Impl.isProfitableLSRChainElement(I); |
1932 | } |
1933 | bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); } |
1934 | bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, |
1935 | DominatorTree *DT, AssumptionCache *AC, |
1936 | TargetLibraryInfo *LibInfo) override { |
1937 | return Impl.canSaveCmp(L, BI, SE, LI, DT, AC, LibInfo); |
1938 | } |
1939 | AddressingModeKind |
1940 | getPreferredAddressingMode(const Loop *L, |
1941 | ScalarEvolution *SE) const override { |
1942 | return Impl.getPreferredAddressingMode(L, SE); |
1943 | } |
1944 | bool isLegalMaskedStore(Type *DataType, Align Alignment) override { |
1945 | return Impl.isLegalMaskedStore(DataType, Alignment); |
1946 | } |
1947 | bool isLegalMaskedLoad(Type *DataType, Align Alignment) override { |
1948 | return Impl.isLegalMaskedLoad(DataType, Alignment); |
1949 | } |
1950 | bool isLegalNTStore(Type *DataType, Align Alignment) override { |
1951 | return Impl.isLegalNTStore(DataType, Alignment); |
1952 | } |
1953 | bool isLegalNTLoad(Type *DataType, Align Alignment) override { |
1954 | return Impl.isLegalNTLoad(DataType, Alignment); |
1955 | } |
1956 | bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { |
1957 | return Impl.isLegalMaskedScatter(DataType, Alignment); |
1958 | } |
1959 | bool isLegalMaskedGather(Type *DataType, Align Alignment) override { |
1960 | return Impl.isLegalMaskedGather(DataType, Alignment); |
1961 | } |
1962 | bool forceScalarizeMaskedGather(VectorType *DataType, |
1963 | Align Alignment) override { |
1964 | return Impl.forceScalarizeMaskedGather(DataType, Alignment); |
1965 | } |
1966 | bool forceScalarizeMaskedScatter(VectorType *DataType, |
1967 | Align Alignment) override { |
1968 | return Impl.forceScalarizeMaskedScatter(DataType, Alignment); |
1969 | } |
1970 | bool isLegalMaskedCompressStore(Type *DataType) override { |
1971 | return Impl.isLegalMaskedCompressStore(DataType); |
1972 | } |
1973 | bool isLegalMaskedExpandLoad(Type *DataType) override { |
1974 | return Impl.isLegalMaskedExpandLoad(DataType); |
1975 | } |
1976 | bool enableOrderedReductions() override { |
1977 | return Impl.enableOrderedReductions(); |
1978 | } |
1979 | bool hasDivRemOp(Type *DataType, bool IsSigned) override { |
1980 | return Impl.hasDivRemOp(DataType, IsSigned); |
1981 | } |
1982 | bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override { |
1983 | return Impl.hasVolatileVariant(I, AddrSpace); |
1984 | } |
1985 | bool prefersVectorizedAddressing() override { |
1986 | return Impl.prefersVectorizedAddressing(); |
1987 | } |
1988 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
1989 | int64_t BaseOffset, bool HasBaseReg, |
1990 | int64_t Scale, |
1991 | unsigned AddrSpace) override { |
1992 | return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, |
1993 | AddrSpace); |
1994 | } |
1995 | bool LSRWithInstrQueries() override { return Impl.LSRWithInstrQueries(); } |
1996 | bool isTruncateFree(Type *Ty1, Type *Ty2) override { |
1997 | return Impl.isTruncateFree(Ty1, Ty2); |
1998 | } |
1999 | bool isProfitableToHoist(Instruction *I) override { |
2000 | return Impl.isProfitableToHoist(I); |
2001 | } |
2002 | bool useAA() override { return Impl.useAA(); } |
2003 | bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } |
2004 | InstructionCost getRegUsageForType(Type *Ty) override { |
2005 | return Impl.getRegUsageForType(Ty); |
2006 | } |
2007 | bool shouldBuildLookupTables() override { |
2008 | return Impl.shouldBuildLookupTables(); |
2009 | } |
2010 | bool shouldBuildLookupTablesForConstant(Constant *C) override { |
2011 | return Impl.shouldBuildLookupTablesForConstant(C); |
2012 | } |
2013 | bool shouldBuildRelLookupTables() override { |
2014 | return Impl.shouldBuildRelLookupTables(); |
2015 | } |
2016 | bool useColdCCForColdCall(Function &F) override { |
2017 | return Impl.useColdCCForColdCall(F); |
2018 | } |
2019 | |
2020 | InstructionCost getScalarizationOverhead(VectorType *Ty, |
2021 | const APInt &DemandedElts, |
                                           bool Insert, bool Extract) override {
2023 | return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); |
2024 | } |
2025 | InstructionCost |
2026 | getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
2027 | ArrayRef<Type *> Tys) override { |
2028 | return Impl.getOperandsScalarizationOverhead(Args, Tys); |
2029 | } |
2030 | |
2031 | bool supportsEfficientVectorElementLoadStore() override { |
2032 | return Impl.supportsEfficientVectorElementLoadStore(); |
2033 | } |
2034 | |
2035 | bool enableAggressiveInterleaving(bool LoopHasReductions) override { |
2036 | return Impl.enableAggressiveInterleaving(LoopHasReductions); |
2037 | } |
2038 | MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, |
2039 | bool IsZeroCmp) const override { |
2040 | return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp); |
2041 | } |
2042 | bool enableInterleavedAccessVectorization() override { |
2043 | return Impl.enableInterleavedAccessVectorization(); |
2044 | } |
2045 | bool enableMaskedInterleavedAccessVectorization() override { |
2046 | return Impl.enableMaskedInterleavedAccessVectorization(); |
2047 | } |
2048 | bool isFPVectorizationPotentiallyUnsafe() override { |
2049 | return Impl.isFPVectorizationPotentiallyUnsafe(); |
2050 | } |
2051 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, |
2052 | unsigned AddressSpace, Align Alignment, |
2053 | bool *Fast) override { |
2054 | return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace, |
2055 | Alignment, Fast); |
2056 | } |
2057 | PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override { |
2058 | return Impl.getPopcntSupport(IntTyWidthInBit); |
2059 | } |
2060 | bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); } |
2061 | |
2062 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override { |
2063 | return Impl.isFCmpOrdCheaperThanFCmpZero(Ty); |
2064 | } |
2065 | |
2066 | InstructionCost getFPOpCost(Type *Ty) override { |
2067 | return Impl.getFPOpCost(Ty); |
2068 | } |
2069 | |
2070 | InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, |
2071 | const APInt &Imm, Type *Ty) override { |
2072 | return Impl.getIntImmCodeSizeCost(Opc, Idx, Imm, Ty); |
2073 | } |
2074 | InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, |
2075 | TargetCostKind CostKind) override { |
2076 | return Impl.getIntImmCost(Imm, Ty, CostKind); |
2077 | } |
2078 | InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx, |
2079 | const APInt &Imm, Type *Ty, |
2080 | TargetCostKind CostKind, |
2081 | Instruction *Inst = nullptr) override { |
2082 | return Impl.getIntImmCostInst(Opc, Idx, Imm, Ty, CostKind, Inst); |
2083 | } |
2084 | InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
2085 | const APInt &Imm, Type *Ty, |
2086 | TargetCostKind CostKind) override { |
2087 | return Impl.getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind); |
2088 | } |
2089 | unsigned getNumberOfRegisters(unsigned ClassID) const override { |
2090 | return Impl.getNumberOfRegisters(ClassID); |
2091 | } |
2092 | unsigned getRegisterClassForType(bool Vector, |
2093 | Type *Ty = nullptr) const override { |
2094 | return Impl.getRegisterClassForType(Vector, Ty); |
2095 | } |
2096 | const char *getRegisterClassName(unsigned ClassID) const override { |
2097 | return Impl.getRegisterClassName(ClassID); |
2098 | } |
2099 | TypeSize getRegisterBitWidth(RegisterKind K) const override { |
2100 | return Impl.getRegisterBitWidth(K); |
2101 | } |
2102 | unsigned getMinVectorRegisterBitWidth() const override { |
2103 | return Impl.getMinVectorRegisterBitWidth(); |
2104 | } |
2105 | Optional<unsigned> getMaxVScale() const override { |
2106 | return Impl.getMaxVScale(); |
2107 | } |
2108 | Optional<unsigned> getVScaleForTuning() const override { |
2109 | return Impl.getVScaleForTuning(); |
2110 | } |
2111 | bool shouldMaximizeVectorBandwidth() const override { |
2112 | return Impl.shouldMaximizeVectorBandwidth(); |
2113 | } |
2114 | ElementCount getMinimumVF(unsigned ElemWidth, |
2115 | bool IsScalable) const override { |
2116 | return Impl.getMinimumVF(ElemWidth, IsScalable); |
2117 | } |
2118 | unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override { |
2119 | return Impl.getMaximumVF(ElemWidth, Opcode); |
2120 | } |
2121 | bool shouldConsiderAddressTypePromotion( |
      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
2123 | return Impl.shouldConsiderAddressTypePromotion( |
2124 | I, AllowPromotionWithoutCommonHeader); |
2125 | } |
2126 | unsigned getCacheLineSize() const override { return Impl.getCacheLineSize(); } |
2127 | Optional<unsigned> getCacheSize(CacheLevel Level) const override { |
2128 | return Impl.getCacheSize(Level); |
2129 | } |
2130 | Optional<unsigned> getCacheAssociativity(CacheLevel Level) const override { |
2131 | return Impl.getCacheAssociativity(Level); |
2132 | } |
2133 | |
2134 | /// Return the preferred prefetch distance in terms of instructions. |
2135 | /// |
2136 | unsigned getPrefetchDistance() const override { |
2137 | return Impl.getPrefetchDistance(); |
2138 | } |
2139 | |
2140 | /// Return the minimum stride necessary to trigger software |
2141 | /// prefetching. |
2142 | /// |
2143 | unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
2144 | unsigned NumStridedMemAccesses, |
2145 | unsigned NumPrefetches, |
2146 | bool HasCall) const override { |
2147 | return Impl.getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, |
2148 | NumPrefetches, HasCall); |
2149 | } |
2150 | |
2151 | /// Return the maximum prefetch distance in terms of loop |
2152 | /// iterations. |
2153 | /// |
2154 | unsigned getMaxPrefetchIterationsAhead() const override { |
2155 | return Impl.getMaxPrefetchIterationsAhead(); |
2156 | } |
2157 | |
2158 | /// \return True if prefetching should also be done for writes. |
2159 | bool enableWritePrefetching() const override { |
2160 | return Impl.enableWritePrefetching(); |
2161 | } |
2162 | |
2163 | unsigned getMaxInterleaveFactor(unsigned VF) override { |
2164 | return Impl.getMaxInterleaveFactor(VF); |
2165 | } |
2166 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, |
2167 | unsigned &JTSize, |
2168 | ProfileSummaryInfo *PSI, |
2169 | BlockFrequencyInfo *BFI) override { |
2170 | return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI); |
2171 | } |
2172 | InstructionCost getArithmeticInstrCost( |
2173 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
2174 | OperandValueKind Opd1Info, OperandValueKind Opd2Info, |
2175 | OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, |
2176 | ArrayRef<const Value *> Args, |
2177 | const Instruction *CxtI = nullptr) override { |
2178 | return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info, |
2179 | Opd1PropInfo, Opd2PropInfo, Args, CxtI); |
2180 | } |
2181 | InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, |
2182 | ArrayRef<int> Mask, int Index, |
2183 | VectorType *SubTp) override { |
2184 | return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp); |
2185 | } |
2186 | InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, |
2187 | CastContextHint CCH, |
2188 | TTI::TargetCostKind CostKind, |
2189 | const Instruction *I) override { |
2190 | return Impl.getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
2191 | } |
  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy,
                                           unsigned Index) override {
2195 | return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index); |
2196 | } |
2197 | InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, |
2198 | const Instruction *I = nullptr) override { |
2199 | return Impl.getCFInstrCost(Opcode, CostKind, I); |
2200 | } |
2201 | InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
2202 | CmpInst::Predicate VecPred, |
2203 | TTI::TargetCostKind CostKind, |
2204 | const Instruction *I) override { |
2205 | return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
2206 | } |
2207 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
2208 | unsigned Index) override { |
2209 | return Impl.getVectorInstrCost(Opcode, Val, Index); |
2210 | } |
2211 | InstructionCost |
2212 | getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, |
2213 | const APInt &DemandedDstElts, |
2214 | TTI::TargetCostKind CostKind) override { |
2215 | return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF, |
2216 | DemandedDstElts, CostKind); |
2217 | } |
2218 | InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
2219 | unsigned AddressSpace, |
2220 | TTI::TargetCostKind CostKind, |
2221 | const Instruction *I) override { |
2222 | return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
2223 | CostKind, I); |
2224 | } |
2225 | InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
2226 | unsigned AddressSpace, |
2227 | TTI::TargetCostKind CostKind, |
2228 | const Instruction *I) override { |
2229 | return Impl.getVPMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
2230 | CostKind, I); |
2231 | } |
2232 | InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
2233 | Align Alignment, unsigned AddressSpace, |
2234 | TTI::TargetCostKind CostKind) override { |
2235 | return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
2236 | CostKind); |
2237 | } |
2238 | InstructionCost |
2239 | getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, |
2240 | bool VariableMask, Align Alignment, |
2241 | TTI::TargetCostKind CostKind, |
2242 | const Instruction *I = nullptr) override { |
2243 | return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
2244 | Alignment, CostKind, I); |
2245 | } |
2246 | InstructionCost getInterleavedMemoryOpCost( |
2247 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
2248 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
2249 | bool UseMaskForCond, bool UseMaskForGaps) override { |
2250 | return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
2251 | Alignment, AddressSpace, CostKind, |
2252 | UseMaskForCond, UseMaskForGaps); |
2253 | } |
2254 | InstructionCost |
2255 | getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
2256 | Optional<FastMathFlags> FMF, |
2257 | TTI::TargetCostKind CostKind) override { |
2258 | return Impl.getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
2259 | } |
2260 | InstructionCost |
2261 | getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, |
2262 | TTI::TargetCostKind CostKind) override { |
2263 | return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); |
2264 | } |
2265 | InstructionCost getExtendedAddReductionCost( |
2266 | bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, |
2267 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override { |
2268 | return Impl.getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty, |
2269 | CostKind); |
2270 | } |
2271 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
2272 | TTI::TargetCostKind CostKind) override { |
2273 | return Impl.getIntrinsicInstrCost(ICA, CostKind); |
2274 | } |
2275 | InstructionCost getCallInstrCost(Function *F, Type *RetTy, |
2276 | ArrayRef<Type *> Tys, |
2277 | TTI::TargetCostKind CostKind) override { |
2278 | return Impl.getCallInstrCost(F, RetTy, Tys, CostKind); |
2279 | } |
2280 | unsigned getNumberOfParts(Type *Tp) override { |
2281 | return Impl.getNumberOfParts(Tp); |
2282 | } |
2283 | InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, |
2284 | const SCEV *Ptr) override { |
2285 | return Impl.getAddressComputationCost(Ty, SE, Ptr); |
2286 | } |
2287 | InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override { |
2288 | return Impl.getCostOfKeepingLiveOverCall(Tys); |
2289 | } |
2290 | bool getTgtMemIntrinsic(IntrinsicInst *Inst, |
2291 | MemIntrinsicInfo &Info) override { |
2292 | return Impl.getTgtMemIntrinsic(Inst, Info); |
2293 | } |
2294 | unsigned getAtomicMemIntrinsicMaxElementSize() const override { |
2295 | return Impl.getAtomicMemIntrinsicMaxElementSize(); |
2296 | } |
2297 | Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
2298 | Type *ExpectedType) override { |
2299 | return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType); |
2300 | } |
2301 | Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, |
2302 | unsigned SrcAddrSpace, unsigned DestAddrSpace, |
2303 | unsigned SrcAlign, |
2304 | unsigned DestAlign) const override { |
2305 | return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace, |
2306 | DestAddrSpace, SrcAlign, DestAlign); |
2307 | } |
2308 | void getMemcpyLoopResidualLoweringType( |
2309 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
2310 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
2311 | unsigned SrcAlign, unsigned DestAlign) const override { |
2312 | Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes, |
2313 | SrcAddrSpace, DestAddrSpace, |
2314 | SrcAlign, DestAlign); |
2315 | } |
2316 | bool areInlineCompatible(const Function *Caller, |
2317 | const Function *Callee) const override { |
2318 | return Impl.areInlineCompatible(Caller, Callee); |
2319 | } |
2320 | bool areTypesABICompatible(const Function *Caller, const Function *Callee, |
2321 | const ArrayRef<Type *> &Types) const override { |
2322 | return Impl.areTypesABICompatible(Caller, Callee, Types); |
2323 | } |
2324 | bool isIndexedLoadLegal(MemIndexedMode Mode, Type *Ty) const override { |
2325 | return Impl.isIndexedLoadLegal(Mode, Ty, getDataLayout()); |
2326 | } |
2327 | bool isIndexedStoreLegal(MemIndexedMode Mode, Type *Ty) const override { |
2328 | return Impl.isIndexedStoreLegal(Mode, Ty, getDataLayout()); |
2329 | } |
2330 | unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override { |
2331 | return Impl.getLoadStoreVecRegBitWidth(AddrSpace); |
2332 | } |
2333 | bool isLegalToVectorizeLoad(LoadInst *LI) const override { |
2334 | return Impl.isLegalToVectorizeLoad(LI); |
2335 | } |
2336 | bool isLegalToVectorizeStore(StoreInst *SI) const override { |
2337 | return Impl.isLegalToVectorizeStore(SI); |
2338 | } |
2339 | bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, |
2340 | unsigned AddrSpace) const override { |
2341 | return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, |
2342 | AddrSpace); |
2343 | } |
2344 | bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, |
2345 | unsigned AddrSpace) const override { |
2346 | return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment, |
2347 | AddrSpace); |
2348 | } |
2349 | bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, |
2350 | ElementCount VF) const override { |
2351 | return Impl.isLegalToVectorizeReduction(RdxDesc, VF); |
2352 | } |
2353 | bool isElementTypeLegalForScalableVector(Type *Ty) const override { |
2354 | return Impl.isElementTypeLegalForScalableVector(Ty); |
2355 | } |
2356 | unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
2357 | unsigned ChainSizeInBytes, |
2358 | VectorType *VecTy) const override { |
2359 | return Impl.getLoadVectorFactor(VF, LoadSize, ChainSizeInBytes, VecTy); |
2360 | } |
2361 | unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
2362 | unsigned ChainSizeInBytes, |
2363 | VectorType *VecTy) const override { |
2364 | return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); |
2365 | } |
2366 | bool preferInLoopReduction(unsigned Opcode, Type *Ty, |
2367 | ReductionFlags Flags) const override { |
2368 | return Impl.preferInLoopReduction(Opcode, Ty, Flags); |
2369 | } |
2370 | bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, |
2371 | ReductionFlags Flags) const override { |
2372 | return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags); |
2373 | } |
2374 | bool shouldExpandReduction(const IntrinsicInst *II) const override { |
2375 | return Impl.shouldExpandReduction(II); |
2376 | } |
2377 | |
2378 | unsigned getGISelRematGlobalCost() const override { |
2379 | return Impl.getGISelRematGlobalCost(); |
2380 | } |
2381 | |
2382 | bool supportsScalableVectors() const override { |
2383 | return Impl.supportsScalableVectors(); |
2384 | } |
2385 | |
2386 | bool enableScalableVectorization() const override { |
2387 | return Impl.enableScalableVectorization(); |
2388 | } |
2389 | |
2390 | bool hasActiveVectorLength(unsigned Opcode, Type *DataType, |
2391 | Align Alignment) const override { |
2392 | return Impl.hasActiveVectorLength(Opcode, DataType, Alignment); |
2393 | } |
2394 | |
2395 | InstructionCost getInstructionLatency(const Instruction *I) override { |
2396 | return Impl.getInstructionLatency(I); |
2397 | } |
2398 | |
2399 | VPLegalization |
2400 | getVPLegalizationStrategy(const VPIntrinsic &PI) const override { |
2401 | return Impl.getVPLegalizationStrategy(PI); |
2402 | } |
2403 | }; |
2404 | |
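// Out-of-line definition of the type-erasing constructor: the concrete target
// implementation is copied into a Model<T> and stored behind the polymorphic
// Concept interface held in TTIImpl.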
2405 | template <typename T> |
2406 | TargetTransformInfo::TargetTransformInfo(T Impl) |
2407 | : TTIImpl(new Model<T>(Impl)) {} |
2408 | |
2409 | /// Analysis pass providing the \c TargetTransformInfo. |
2410 | /// |
2411 | /// The core idea of the TargetIRAnalysis is to expose an interface through |
2412 | /// which LLVM targets can analyze and provide information about the middle |
2413 | /// end's target-independent IR. This supports use cases such as target-aware |
2414 | /// cost modeling of IR constructs. |
2415 | /// |
/// This is a function analysis because much of the cost modeling for targets
/// is done in a subtarget-specific way, and LLVM supports compiling different
/// functions targeting different subtargets in order to support runtime
/// dispatch according to the observed subtarget.
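///
/// A minimal sketch of how a new-pass-manager client typically obtains the
/// result (assuming a populated FunctionAnalysisManager FAM and a Function F;
/// the query made on the result is illustrative only):
/// \code
///   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
///   bool UseScalableVFs = TTI.supportsScalableVectors();
/// \endcode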
2420 | class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> { |
2421 | public: |
2422 | typedef TargetTransformInfo Result; |
2423 | |
2424 | /// Default construct a target IR analysis. |
2425 | /// |
2426 | /// This will use the module's datalayout to construct a baseline |
2427 | /// conservative TTI result. |
2428 | TargetIRAnalysis(); |
2429 | |
  /// Construct an IR analysis pass around a target-provided callback.
2431 | /// |
2432 | /// The callback will be called with a particular function for which the TTI |
2433 | /// is needed and must return a TTI object for that function. |
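  ///
  /// A minimal sketch of providing such a callback (MyTargetMachine and its
  /// getTargetTransformInfo hook are hypothetical stand-ins for a target's
  /// actual TargetMachine subclass):
  /// \code
  ///   MyTargetMachine &TM = ...;
  ///   TargetIRAnalysis TIRA(
  ///       [&TM](const Function &F) { return TM.getTargetTransformInfo(F); });
  /// \endcode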
2434 | TargetIRAnalysis(std::function<Result(const Function &)> TTICallback); |
2435 | |
2436 | // Value semantics. We spell out the constructors for MSVC. |
2437 | TargetIRAnalysis(const TargetIRAnalysis &Arg) |
2438 | : TTICallback(Arg.TTICallback) {} |
2439 | TargetIRAnalysis(TargetIRAnalysis &&Arg) |
2440 | : TTICallback(std::move(Arg.TTICallback)) {} |
2441 | TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) { |
2442 | TTICallback = RHS.TTICallback; |
2443 | return *this; |
2444 | } |
2445 | TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) { |
2446 | TTICallback = std::move(RHS.TTICallback); |
2447 | return *this; |
2448 | } |
2449 | |
2450 | Result run(const Function &F, FunctionAnalysisManager &); |
2451 | |
2452 | private: |
2453 | friend AnalysisInfoMixin<TargetIRAnalysis>; |
2454 | static AnalysisKey Key; |
2455 | |
2456 | /// The callback used to produce a result. |
2457 | /// |
2458 | /// We use a completely opaque callback so that targets can provide whatever |
2459 | /// mechanism they desire for constructing the TTI for a given function. |
2460 | /// |
2461 | /// FIXME: Should we really use std::function? It's relatively inefficient. |
2462 | /// It might be possible to arrange for even stateful callbacks to outlive |
2463 | /// the analysis and thus use a function_ref which would be lighter weight. |
2464 | /// This may also be less error prone as the callback is likely to reference |
2465 | /// the external TargetMachine, and that reference needs to never dangle. |
2466 | std::function<Result(const Function &)> TTICallback; |
2467 | |
2468 | /// Helper function used as the callback in the default constructor. |
2469 | static Result getDefaultTTI(const Function &F); |
2470 | }; |
2471 | |
2472 | /// Wrapper pass for TargetTransformInfo. |
2473 | /// |
/// This pass can be constructed from a TTI object, which it stores internally
/// and makes available to the passes that query it.
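///
/// A minimal sketch of how a legacy pass typically queries it, assuming the
/// pass declared the dependency in getAnalysisUsage via
/// AU.addRequired<TargetTransformInfoWrapperPass>():
/// \code
///   TargetTransformInfo &TTI =
///       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
/// \endcode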
2476 | class TargetTransformInfoWrapperPass : public ImmutablePass { |
2477 | TargetIRAnalysis TIRA; |
2478 | Optional<TargetTransformInfo> TTI; |
2479 | |
2480 | virtual void anchor(); |
2481 | |
2482 | public: |
2483 | static char ID; |
2484 | |
  /// We must provide a default constructor for the pass, but it should
  /// never be used.
2487 | /// |
2488 | /// Use the constructor below or call one of the creation routines. |
2489 | TargetTransformInfoWrapperPass(); |
2490 | |
2491 | explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA); |
2492 | |
2493 | TargetTransformInfo &getTTI(const Function &F); |
2494 | }; |
2495 | |
2496 | /// Create an analysis pass wrapper around a TTI object. |
2497 | /// |
2498 | /// This analysis pass just holds the TTI instance and makes it available to |
2499 | /// clients. |
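///
/// A minimal sketch of adding the wrapper to a legacy pipeline (assuming a
/// legacy::PassManager PM and a TargetMachine TM):
/// \code
///   PM.add(createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis()));
/// \endcode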
2500 | ImmutablePass *createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA); |
2501 | |
2502 | } // namespace llvm |
2503 | |
2504 | #endif |
2505 | |