1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include "BackendTestUtils.h" |
18 | #include "glow/Graph/Graph.h" |
19 | #include "glow/IR/IR.h" |
20 | #include "glow/IR/IRBuilder.h" |
21 | #include "glow/IR/IRUtils.h" |
22 | #include "glow/IR/Instrs.h" |
23 | #include "glow/Optimizer/IROptimizer/IRFunctionPassManager.h" |
24 | #include "glow/Optimizer/IROptimizer/IROptimizer.h" |
25 | |
26 | #include "llvm/Support/Casting.h" |
27 | |
28 | #include "gtest/gtest.h" |
29 | |
30 | #include <algorithm> |
31 | #include <cassert> |
32 | #include <cstddef> |
33 | #include <cstdint> |
34 | #include <iostream> |
35 | #include <string> |
36 | |
37 | using namespace glow; |
38 | using llvm::cast; |
39 | using llvm::dyn_cast; |
40 | using llvm::isa; |
41 | |
/// Basic test of DSE (Dead Store Elimination).
TEST(Optimizer, dseBasic) {
  Module mod;
  Function *F = mod.createFunction("DeadStoreElimination");
  IRFunction M(F);
  IRBuilder bb(&M);

  auto *input1 = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input1",
                                    WeightVar::MutabilityKind::Constant);
  auto *input2 = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input2",
                                    WeightVar::MutabilityKind::Constant);
  auto *input3 = bb.createWeightVar(glow::ElemKind::BoolTy, {1}, "input3",
                                    WeightVar::MutabilityKind::Constant);
  auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "output",
                                    WeightVar::MutabilityKind::Mutable);

  bb.createElementAddInst("elem_add1", output, input1, input1);
  bb.createElementSelectInst("select", output, input3, output, input2);
  bb.createElementAddInst("elem_add2", output, input2, input2);
61 | |
62 | optimize(M, MockBackend().shouldShareBuffers()); |
63 | |
  // Check that the first add and the select are eliminated: the select's
  // result is overwritten before it is read, and once the select is dead the
  // first add's store becomes dead too.
66 | EXPECT_EQ(M.getInstrs().size(), 1); |
67 | } |
68 | |
69 | /// Check that DSE does not remove the last write into a WeightVar. |
TEST(Optimizer, dseDoNotRemoveLastWriteIntoWeightVar) {
  Module mod;
  Function *F = mod.createFunction("DeadStoreElimination");
  IRFunction M(F);
  IRBuilder bb(&M);

  auto *input1 = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input1",
                                    WeightVar::MutabilityKind::Constant);
  auto *input2 = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input2",
                                    WeightVar::MutabilityKind::Constant);
  auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "output",
                                    WeightVar::MutabilityKind::Mutable);

  // Last write into a WeightVar should not be removed even if there is
  // no instruction that reads it, because it is an observable side-effect.
  bb.createElementAddInst("elem_add", output, input1, input2);
  bb.createTensorViewInst(
      "cast", output, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 1, 1})),
      {0});
89 | |
90 | optimize(M, MockBackend().shouldShareBuffers()); |
91 | |
  // Check that the dead tensor view is eliminated, while the add remains
  // because it is the last write into the externally visible output.
94 | EXPECT_EQ(M.getInstrs().size(), 1); |
95 | } |
96 | |
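/// Check that buffer sharing removes the dead stores and coalesces the
/// result activation with the output, eliminating the final copy.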
97 | TEST(Optimizer, shareBuffers) { |
98 | Module mod; |
99 | Function *F = mod.createFunction("ShareBuffers" ); |
100 | IRFunction M(F); |
101 | IRBuilder bb(&M); |
102 | |
103 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input" , |
104 | WeightVar::MutabilityKind::Constant); |
105 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "output" , |
106 | WeightVar::MutabilityKind::Mutable); |
107 | |
108 | auto *alloc1 = |
109 | bb.createAllocActivationInst("alloc1" , glow::ElemKind::FloatTy, 1); |
110 | auto *alloc2 = |
111 | bb.createAllocActivationInst("alloc2" , glow::ElemKind::FloatTy, 1); |
112 | auto *alloc3 = |
113 | bb.createAllocActivationInst("alloc3" , glow::ElemKind::FloatTy, 1); |
114 | bb.createSplatInst("splat1" , alloc1, 0.0); |
115 | bb.createSplatInst("splat2" , alloc2, 1.0); |
116 | bb.createElementAddInst("elem_add1" , alloc3, alloc1, input); |
117 | bb.createElementAddInst("elem_add2" , alloc2, input, input); |
118 | // alloc1 and alloc2 are not live after this instruction. |
119 | bb.createElementAddInst("elem_add3" , alloc1, alloc2, input); |
120 | bb.createCopyInst("copy" , output, alloc3); |
121 | bb.createDeallocActivationInst("dealloc3" , alloc3); |
122 | bb.createDeallocActivationInst("dealloc2" , alloc2); |
123 | bb.createDeallocActivationInst("dealloc1" , alloc1); |
124 | |
125 | optimize(M, MockBackend().shouldShareBuffers()); |
126 | |
127 | // Check that the first relu instruction and select are eliminated, because |
128 | // their outputs are never read. |
129 | EXPECT_EQ(M.getInstrs().size(), 2); |
130 | } |
131 | |
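/// Check that tensor views whose results are never used are deleted.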
132 | TEST(Optimizer, deleteDeadViews) { |
133 | Module mod; |
134 | Function *F = mod.createFunction("DeleteDeadViews" ); |
135 | IRFunction M(F); |
136 | IRBuilder bb(&M); |
137 | |
138 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input" , |
139 | WeightVar::MutabilityKind::Constant); |
140 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "output" , |
141 | WeightVar::MutabilityKind::Mutable); |
142 | |
143 | auto *tensorView1 = bb.createTensorViewInst( |
144 | "tensor_view1" , input, |
145 | mod.uniqueType(Type{glow::ElemKind::FloatTy, {1, 1}}), {0}); |
146 | |
147 | bb.createTensorViewInst("tensor_view2" , tensorView1, |
148 | mod.uniqueType(Type{glow::ElemKind::FloatTy, {1}}), |
149 | {0, 0}); |
150 | bb.createCopyInst("copy" , output, input); |
151 | |
152 | optimize(M, MockBackend().shouldShareBuffers()); |
153 | |
154 | // Check that all tensor_view instructions are eliminated, because they are |
155 | // never used. |
156 | EXPECT_EQ(M.getInstrs().size(), 1); |
157 | } |
158 | |
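/// Check that copy propagation removes a copy even when the copied buffer is
/// redefined while the copy is still live.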
159 | TEST(Optimizer, copyPropagation) { |
160 | Module mod; |
161 | Function *F = mod.createFunction("ShareBuffers" ); |
162 | IRFunction M(F); |
163 | IRBuilder bb(&M); |
164 | |
165 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input" , |
166 | WeightVar::MutabilityKind::Constant); |
167 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "output" , |
168 | WeightVar::MutabilityKind::Mutable); |
169 | |
170 | auto *alloc1 = |
171 | bb.createAllocActivationInst("alloc1" , glow::ElemKind::FloatTy, 1); |
172 | auto *alloc2 = |
173 | bb.createAllocActivationInst("alloc2" , glow::ElemKind::FloatTy, 1); |
174 | auto *alloc3 = |
175 | bb.createAllocActivationInst("alloc3" , glow::ElemKind::FloatTy, 1); |
176 | bb.createSplatInst("splat1" , alloc1, 1.0); |
177 | bb.createCopyInst("copy1" , alloc2, alloc1); |
178 | bb.createElementAddInst("elem_add1" , output, alloc2, input); |
179 | bb.createSplatInst("splat2" , alloc1, 0.0); |
180 | bb.createElementAddInst("elem_add2" , output, alloc2, alloc1); |
181 | bb.createDeallocActivationInst("dealloc3" , alloc3); |
182 | bb.createDeallocActivationInst("dealloc2" , alloc2); |
183 | bb.createDeallocActivationInst("dealloc1" , alloc1); |
184 | |
185 | optimize(M, MockBackend().shouldShareBuffers()); |
186 | |
187 | EXPECT_EQ(M.getInstrs().size(), 5); |
188 | |
189 | auto &instrs = M.getInstrs(); |
190 | EXPECT_TRUE(std::none_of( |
191 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
192 | return I.getKind() == Instruction::Kind::CopyInstKind; |
193 | })); |
194 | } |
195 | |
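/// Check that a copy of a splat into a temporary buffer is folded away along
/// with all allocs and deallocs, leaving just the splat and the add.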
196 | TEST(Optimizer, copyPropagationSimple) { |
197 | Module mod; |
  auto *F = mod.createFunction("CopyPropagationSimple");
  IRFunction M(F);
  IRBuilder bb(&M);

  auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "input",
                                   WeightVar::MutabilityKind::Constant);
  auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {1}, "output",
                                    WeightVar::MutabilityKind::Mutable);

  auto *alloc1 =
      bb.createAllocActivationInst("alloc1", glow::ElemKind::FloatTy, 1);
  auto *alloc2 =
      bb.createAllocActivationInst("alloc2", glow::ElemKind::FloatTy, 1);
  bb.createSplatInst("splat1", alloc1, 1.0);
  bb.createCopyInst("copy1", alloc2, alloc1);
  bb.createElementAddInst("elem_add1", output, alloc2, input);
  bb.createDeallocActivationInst("dealloc2", alloc2);
  bb.createDeallocActivationInst("dealloc1", alloc1);
216 | |
217 | optimize(M, MockBackend().shouldShareBuffers()); |
218 | |
219 | EXPECT_EQ(M.getInstrs().size(), 2); |
220 | |
221 | auto &instrs = M.getInstrs(); |
222 | EXPECT_TRUE(std::none_of( |
223 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
224 | return isa<AllocActivationInst>(&I) || isa<DeallocActivationInst>(&I) || |
225 | isa<CopyInst>(&I); |
226 | })); |
227 | } |
228 | |
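/// Check that a transpose feeding only element-wise uses of a splat is
/// eliminated, along with all allocs and deallocs.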
229 | TEST(Optimizer, copyPropagationTranspose) { |
230 | Module mod; |
231 | Function *F = mod.createFunction("ShareBuffers" ); |
232 | IRFunction M(F); |
233 | IRBuilder bb(&M); |
234 | |
235 | auto *output1 = |
236 | bb.createWeightVar(glow::ElemKind::FloatTy, {3, 1, 1}, "output1" , |
237 | WeightVar::MutabilityKind::Mutable); |
238 | auto *output2 = |
239 | bb.createWeightVar(glow::ElemKind::FloatTy, {1, 1, 3}, "output2" , |
240 | WeightVar::MutabilityKind::Mutable); |
241 | |
242 | auto *alloc1 = bb.createAllocActivationInst("alloc1" , glow::ElemKind::FloatTy, |
243 | {1, 1, 3}); |
244 | auto *alloc2 = bb.createAllocActivationInst("alloc2" , glow::ElemKind::FloatTy, |
245 | {3, 1, 1}); |
246 | bb.createSplatInst("splat1" , alloc1, 1.0); |
  bb.createTransposeInst("transpose", alloc2, alloc1, {2, 0, 1});
  bb.createElementAddInst("elem_add2", output1, alloc2, alloc2);
  bb.createElementAddInst("elem_add3", output2, alloc1, alloc1);
  bb.createDeallocActivationInst("dealloc2", alloc2);
  bb.createDeallocActivationInst("dealloc1", alloc1);
252 | |
253 | optimize(M, MockBackend().shouldShareBuffers()); |
254 | |
255 | EXPECT_EQ(M.getInstrs().size(), 5); |
256 | |
257 | auto &instrs = M.getInstrs(); |
258 | EXPECT_TRUE(std::none_of( |
259 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
260 | return isa<TransposeInst>(&I) || isa<AllocActivationInst>(&I) || |
261 | isa<DeallocActivationInst>(&I); |
262 | })); |
263 | } |
264 | |
265 | /// Test the isSliceContiguous utility function. |
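/// A slice is contiguous when every dimension after the outermost dimension
/// with a slice size greater than one spans the full source dimension.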
266 | TEST(Optimizer, isSliceContiguous) { |
267 | EXPECT_EQ(isSliceContiguous({1, 1, 1}, {3, 3, 3}), true); |
268 | EXPECT_EQ(isSliceContiguous({1, 1, 2}, {3, 3, 3}), true); |
269 | EXPECT_EQ(isSliceContiguous({1, 1, 3}, {3, 3, 3}), true); |
270 | EXPECT_EQ(isSliceContiguous({1, 2, 1}, {3, 3, 3}), false); |
271 | EXPECT_EQ(isSliceContiguous({1, 2, 2}, {3, 3, 3}), false); |
272 | EXPECT_EQ(isSliceContiguous({1, 2, 3}, {3, 3, 3}), true); |
273 | EXPECT_EQ(isSliceContiguous({1, 3, 1}, {3, 3, 3}), false); |
274 | EXPECT_EQ(isSliceContiguous({1, 3, 2}, {3, 3, 3}), false); |
275 | EXPECT_EQ(isSliceContiguous({1, 3, 3}, {3, 3, 3}), true); |
276 | EXPECT_EQ(isSliceContiguous({2, 1, 1}, {3, 3, 3}), false); |
277 | EXPECT_EQ(isSliceContiguous({2, 1, 2}, {3, 3, 3}), false); |
278 | EXPECT_EQ(isSliceContiguous({2, 1, 3}, {3, 3, 3}), false); |
279 | EXPECT_EQ(isSliceContiguous({2, 2, 1}, {3, 3, 3}), false); |
280 | EXPECT_EQ(isSliceContiguous({2, 2, 2}, {3, 3, 3}), false); |
281 | EXPECT_EQ(isSliceContiguous({2, 2, 3}, {3, 3, 3}), false); |
282 | EXPECT_EQ(isSliceContiguous({2, 3, 1}, {3, 3, 3}), false); |
283 | EXPECT_EQ(isSliceContiguous({2, 3, 2}, {3, 3, 3}), false); |
284 | EXPECT_EQ(isSliceContiguous({2, 3, 3}, {3, 3, 3}), true); |
285 | EXPECT_EQ(isSliceContiguous({3, 1, 1}, {3, 3, 3}), false); |
286 | EXPECT_EQ(isSliceContiguous({3, 1, 2}, {3, 3, 3}), false); |
287 | EXPECT_EQ(isSliceContiguous({3, 1, 3}, {3, 3, 3}), false); |
288 | EXPECT_EQ(isSliceContiguous({3, 2, 1}, {3, 3, 3}), false); |
289 | EXPECT_EQ(isSliceContiguous({3, 2, 2}, {3, 3, 3}), false); |
290 | EXPECT_EQ(isSliceContiguous({3, 2, 3}, {3, 3, 3}), false); |
291 | EXPECT_EQ(isSliceContiguous({3, 3, 1}, {3, 3, 3}), false); |
292 | EXPECT_EQ(isSliceContiguous({3, 3, 2}, {3, 3, 3}), false); |
293 | EXPECT_EQ(isSliceContiguous({3, 3, 3}, {3, 3, 3}), true); |
294 | } |
295 | |
/// Utility function for testing the optimization of an InsertTensorInst into
/// a TensorViewInst when the inserted tensor (slice) is contiguous.
298 | static void testInsertOptimizer(llvm::ArrayRef<dim_t> srcShape, |
299 | llvm::ArrayRef<dim_t> destShape, |
300 | llvm::ArrayRef<dim_t> offsets) { |
301 | Module mod; |
302 | Function *F = mod.createFunction("InsertOptimizer" ); |
303 | IRFunction M(F); |
304 | IRBuilder bb(&M); |
305 | |
306 | auto *dest = bb.createWeightVar(glow::ElemKind::FloatTy, destShape, "dest" , |
307 | WeightVar::MutabilityKind::Mutable); |
308 | auto *srcAlloc = bb.createAllocActivationInst( |
309 | "srcAlloc" , glow::ElemKind::FloatTy, srcShape); |
310 | bb.createSplatInst("srcSplat" , srcAlloc, 1.0); |
311 | bb.createSplatInst("destSplat" , dest, 2.0); |
312 | bb.createInsertTensorInst("insert" , dest, srcAlloc, offsets, 1, 0); |
313 | bb.createDeallocActivationInst("deallocSrc" , srcAlloc); |
314 | |
315 | optimize(M, MockBackend().shouldShareBuffers()); |
316 | |
317 | auto &instrs = M.getInstrs(); |
  if (srcShape == destShape) {
    // If the slice was fully inserted then we should be left with only the
    // source Splat.
    EXPECT_EQ(instrs.size(), 1);
    EXPECT_EQ(instrs.begin()->getName().str(), std::string("srcSplat"));
323 | } else if (isSliceContiguous(srcShape, destShape)) { |
324 | // If the slice is contiguous then we should be left with 2 Splats and a |
325 | // TensorView. The Insert, Alloc and Dealloc should be gone. |
326 | EXPECT_EQ(instrs.size(), 3); |
327 | EXPECT_TRUE(std::all_of( |
328 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
329 | return isa<SplatInst>(&I) || isa<TensorViewInst>(&I); |
330 | })); |
331 | } else { |
332 | // If the slice is not contiguous, we should be left with the original |
333 | // instructions: Alloc, 2 Splats, Insert, Dealloc. |
334 | EXPECT_EQ(instrs.size(), 5); |
335 | } |
336 | } |
337 | |
338 | /// Simple test where a single Insert is replaced by a TensorView with offsets. |
339 | TEST(Optimizer, insertOptimizer) { |
340 | testInsertOptimizer({1, 1, 1}, {3, 3, 3}, {0, 0, 0}); |
341 | testInsertOptimizer({1, 1, 2}, {3, 3, 3}, {1, 1, 1}); |
342 | testInsertOptimizer({1, 1, 3}, {3, 3, 3}, {2, 2, 0}); |
343 | testInsertOptimizer({1, 2, 1}, {3, 3, 3}, {0, 0, 1}); |
344 | testInsertOptimizer({1, 2, 2}, {3, 3, 3}, {1, 1, 0}); |
345 | testInsertOptimizer({1, 2, 3}, {3, 3, 3}, {2, 0, 0}); |
346 | testInsertOptimizer({1, 3, 1}, {3, 3, 3}, {0, 0, 0}); |
347 | testInsertOptimizer({1, 3, 2}, {3, 3, 3}, {1, 0, 1}); |
348 | testInsertOptimizer({1, 3, 3}, {3, 3, 3}, {2, 0, 0}); |
349 | testInsertOptimizer({2, 1, 1}, {3, 3, 3}, {0, 0, 1}); |
350 | testInsertOptimizer({2, 1, 2}, {3, 3, 3}, {1, 1, 0}); |
351 | testInsertOptimizer({2, 1, 3}, {3, 3, 3}, {0, 2, 0}); |
352 | testInsertOptimizer({2, 2, 1}, {3, 3, 3}, {1, 0, 0}); |
353 | testInsertOptimizer({2, 2, 2}, {3, 3, 3}, {0, 1, 1}); |
354 | testInsertOptimizer({2, 2, 3}, {3, 3, 3}, {1, 0, 0}); |
355 | testInsertOptimizer({2, 3, 1}, {3, 3, 3}, {0, 0, 1}); |
356 | testInsertOptimizer({2, 3, 2}, {3, 3, 3}, {1, 0, 0}); |
357 | testInsertOptimizer({2, 3, 3}, {3, 3, 3}, {0, 0, 0}); |
358 | testInsertOptimizer({3, 1, 1}, {3, 3, 3}, {0, 0, 0}); |
359 | testInsertOptimizer({3, 1, 2}, {3, 3, 3}, {0, 1, 1}); |
360 | testInsertOptimizer({3, 1, 3}, {3, 3, 3}, {0, 2, 0}); |
361 | testInsertOptimizer({3, 2, 1}, {3, 3, 3}, {0, 0, 1}); |
362 | testInsertOptimizer({3, 2, 2}, {3, 3, 3}, {0, 1, 0}); |
363 | testInsertOptimizer({3, 2, 3}, {3, 3, 3}, {0, 0, 0}); |
364 | testInsertOptimizer({3, 3, 1}, {3, 3, 3}, {0, 0, 0}); |
365 | testInsertOptimizer({3, 3, 2}, {3, 3, 3}, {0, 0, 1}); |
366 | testInsertOptimizer({3, 3, 3}, {3, 3, 3}, {0, 0, 0}); |
367 | } |
368 | |
/// Utility function for testing the optimization of an ExtractTensorInst into
/// a TensorViewInst when the extracted tensor (slice) is contiguous.
static void testExtractOptimizer(llvm::ArrayRef<dim_t> destShape,
                                 llvm::ArrayRef<dim_t> srcShape,
                                 llvm::ArrayRef<dim_t> offsets) {
  Module mod;
  Function *F = mod.createFunction("ExtractOptimizer");
  IRFunction M(F);
  IRBuilder bb(&M);

  auto *src = bb.createWeightVar(glow::ElemKind::FloatTy, srcShape, "src",
                                 WeightVar::MutabilityKind::Mutable);
  auto *dest = bb.createWeightVar(glow::ElemKind::FloatTy, destShape, "dest",
                                  WeightVar::MutabilityKind::Mutable);
  bb.createSplatInst("srcSplat", src, 1.0);
  auto *destAlloc = bb.createAllocActivationInst(
      "destAlloc", glow::ElemKind::FloatTy, destShape);
  bb.createExtractTensorInst("extract", destAlloc, src, offsets);
  bb.createCopyInst("save", dest, destAlloc);
  bb.createDeallocActivationInst("deallocDest", destAlloc);
389 | |
390 | optimize(M, MockBackend().shouldShareBuffers()); |
391 | |
392 | auto &instrs = M.getInstrs(); |
393 | if (destShape == srcShape) { |
394 | // If the slice was fully extracted then we should be left with a Splat |
395 | // and a Copy. The Alloc, Extract and Dealloc should be gone. |
396 | EXPECT_EQ(instrs.size(), 2); |
397 | EXPECT_TRUE(std::all_of(instrs.begin(), instrs.end(), |
398 | [](const Instruction &I) -> bool { |
399 | return isa<SplatInst>(&I) || isa<CopyInst>(&I); |
400 | })); |
401 | } else if (isSliceContiguous(destShape, srcShape)) { |
402 | // If the extracted slice is contiguous then we should be left with a Splat, |
403 | // a TensorView and a Copy. The Extract, Alloc and Dealloc should be gone. |
404 | EXPECT_EQ(instrs.size(), 3); |
405 | EXPECT_TRUE(std::all_of( |
406 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
407 | return isa<SplatInst>(&I) || isa<TensorViewInst>(&I) || |
408 | isa<CopyInst>(&I); |
409 | })); |
410 | } else { |
411 | // If the slice is not contiguous, we should be left with a Splat and an |
412 | // Extract. The Alloc, Copy and Dealloc should be gone. |
413 | EXPECT_EQ(instrs.size(), 2); |
414 | EXPECT_TRUE(std::all_of( |
415 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
416 | return isa<SplatInst>(&I) || isa<ExtractTensorInst>(&I); |
417 | })); |
418 | } |
419 | } |
420 | |
421 | /// Simple test where a single Extract is replaced by a TensorView with offsets. |
422 | TEST(Optimizer, extractOptimizer) { |
423 | testExtractOptimizer({1, 1, 1}, {3, 3, 3}, {0, 0, 0}); |
424 | testExtractOptimizer({1, 1, 2}, {3, 3, 3}, {1, 1, 1}); |
425 | testExtractOptimizer({1, 1, 3}, {3, 3, 3}, {2, 2, 0}); |
426 | testExtractOptimizer({1, 2, 1}, {3, 3, 3}, {0, 0, 1}); |
427 | testExtractOptimizer({1, 2, 2}, {3, 3, 3}, {1, 1, 0}); |
428 | testExtractOptimizer({1, 2, 3}, {3, 3, 3}, {2, 0, 0}); |
429 | testExtractOptimizer({1, 3, 1}, {3, 3, 3}, {0, 0, 0}); |
430 | testExtractOptimizer({1, 3, 2}, {3, 3, 3}, {1, 0, 1}); |
431 | testExtractOptimizer({1, 3, 3}, {3, 3, 3}, {2, 0, 0}); |
432 | testExtractOptimizer({2, 1, 1}, {3, 3, 3}, {0, 0, 1}); |
433 | testExtractOptimizer({2, 1, 2}, {3, 3, 3}, {1, 1, 0}); |
434 | testExtractOptimizer({2, 1, 3}, {3, 3, 3}, {0, 2, 0}); |
435 | testExtractOptimizer({2, 2, 1}, {3, 3, 3}, {1, 0, 0}); |
436 | testExtractOptimizer({2, 2, 2}, {3, 3, 3}, {0, 1, 1}); |
437 | testExtractOptimizer({2, 2, 3}, {3, 3, 3}, {1, 0, 0}); |
438 | testExtractOptimizer({2, 3, 1}, {3, 3, 3}, {0, 0, 1}); |
439 | testExtractOptimizer({2, 3, 2}, {3, 3, 3}, {1, 0, 0}); |
440 | testExtractOptimizer({2, 3, 3}, {3, 3, 3}, {0, 0, 0}); |
441 | testExtractOptimizer({3, 1, 1}, {3, 3, 3}, {0, 0, 0}); |
442 | testExtractOptimizer({3, 1, 2}, {3, 3, 3}, {0, 1, 1}); |
443 | testExtractOptimizer({3, 1, 3}, {3, 3, 3}, {0, 2, 0}); |
444 | testExtractOptimizer({3, 2, 1}, {3, 3, 3}, {0, 0, 1}); |
445 | testExtractOptimizer({3, 2, 2}, {3, 3, 3}, {0, 1, 0}); |
446 | testExtractOptimizer({3, 2, 3}, {3, 3, 3}, {0, 0, 0}); |
447 | testExtractOptimizer({3, 3, 1}, {3, 3, 3}, {0, 0, 0}); |
448 | testExtractOptimizer({3, 3, 2}, {3, 3, 3}, {0, 0, 1}); |
449 | testExtractOptimizer({3, 3, 3}, {3, 3, 3}, {0, 0, 0}); |
450 | } |
451 | |
452 | /// This is representative of what a ConcatNode is IRGen'd into: src1 and src2 |
453 | /// represent the two tensors that are being concatenated, and dest represents |
454 | /// the resulting concatenated tensor. |
455 | TEST(Optimizer, twoInsertsWithBuffersOptimizer) { |
456 | Module mod; |
457 | Function *F = mod.createFunction("InsertWithBufferOptimizer" ); |
458 | IRFunction M(F); |
459 | IRBuilder bb(&M); |
460 | |
461 | auto *output = |
462 | bb.createWeightVar(glow::ElemKind::FloatTy, {4, 4, 5}, "output" , |
463 | WeightVar::MutabilityKind::Mutable); |
464 | |
465 | auto *allocSrc1 = bb.createAllocActivationInst( |
466 | "allocSrc1" , glow::ElemKind::FloatTy, {2, 4, 5}); |
467 | auto *allocSrc2 = bb.createAllocActivationInst( |
468 | "allocSrc2" , glow::ElemKind::FloatTy, {2, 4, 5}); |
469 | auto *allocDest = bb.createAllocActivationInst( |
470 | "allocDest" , glow::ElemKind::FloatTy, {4, 4, 5}); |
471 | |
472 | bb.createSplatInst("splatSrc1" , allocSrc1, 1.0); |
473 | bb.createSplatInst("splatSrc2" , allocSrc2, 2.0); |
474 | bb.createSplatInst("splatDest" , allocDest, 3.0); |
475 | |
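  // The two inserts together cover allocDest completely along dimension 0
  // (rows 0-1 and 2-3), mirroring how a ConcatNode is lowered.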
476 | bb.createInsertTensorInst("insert1" , allocDest, allocSrc1, {0, 0, 0}, 1, 0); |
477 | bb.createInsertTensorInst("insert2" , allocDest, allocSrc2, {2, 0, 0}, 1, 0); |
478 | |
479 | bb.createCopyInst("copy" , output, allocDest); |
480 | |
481 | bb.createDeallocActivationInst("deallocDest" , allocDest); |
482 | bb.createDeallocActivationInst("deallocSrc2" , allocSrc2); |
483 | bb.createDeallocActivationInst("deallocSrc1" , allocSrc1); |
484 | |
485 | optimize(M, MockBackend().shouldShareBuffers()); |
486 | |
487 | // After optimization, should be left with three splats and two tensorviews; |
488 | // the inserts, allocs, and deallocs should be gone. |
489 | auto &instrs = M.getInstrs(); |
490 | EXPECT_EQ(instrs.size(), 5); |
491 | EXPECT_TRUE(std::all_of( |
492 | instrs.begin(), instrs.end(), [](const Instruction &I) -> bool { |
493 | return isa<SplatInst>(&I) || isa<TensorViewInst>(&I); |
494 | })); |
495 | } |
496 | |
497 | /// This is representative of what a SliceNode is IRGen'd into: src is the |
498 | /// original source tensor, and then two slices are created into dest1 and |
499 | /// dest2. |
500 | TEST(Optimizer, twoExtractsWithBuffersOptimizer) { |
501 | Module mod; |
502 | Function *F = mod.createFunction("ExtractWithBufferOptimizer" ); |
503 | IRFunction M(F); |
504 | IRBuilder bb(&M); |
505 | |
506 | auto *output1 = |
507 | bb.createWeightVar(glow::ElemKind::FloatTy, {2, 4, 5}, "output1" , |
508 | WeightVar::MutabilityKind::Mutable); |
509 | auto *output2 = |
510 | bb.createWeightVar(glow::ElemKind::FloatTy, {2, 4, 5}, "output2" , |
511 | WeightVar::MutabilityKind::Mutable); |
512 | |
513 | auto *allocSrc = bb.createAllocActivationInst( |
514 | "allocSrc" , glow::ElemKind::FloatTy, {4, 4, 5}); |
515 | auto *allocDest1 = bb.createAllocActivationInst( |
516 | "allocDest1" , glow::ElemKind::FloatTy, {2, 4, 5}); |
517 | auto *allocDest2 = bb.createAllocActivationInst( |
518 | "allocDest2" , glow::ElemKind::FloatTy, {2, 4, 5}); |
519 | |
520 | bb.createSplatInst("splatSrc" , allocSrc, 3.0); |
521 | |
522 | bb.createExtractTensorInst("extract1" , allocDest1, allocSrc, {0, 0, 0}); |
523 | bb.createExtractTensorInst("extract2" , allocDest2, allocSrc, {2, 0, 0}); |
524 | |
525 | bb.createCopyInst("copy" , output1, allocDest1); |
526 | bb.createCopyInst("copy" , output2, allocDest2); |
527 | |
528 | bb.createDeallocActivationInst("deallocSrc" , allocSrc); |
529 | bb.createDeallocActivationInst("deallocDest2" , allocDest2); |
530 | bb.createDeallocActivationInst("deallocDest1" , allocDest1); |
531 | |
532 | optimize(M, MockBackend().shouldShareBuffers()); |
533 | |
534 | // After optimization, the extracts should be gone, as well as both allocDests |
535 | // and their deallocs. Should be left with splatSrc, allocSrc, deallocSrc, two |
536 | // tensorviews, and two copies from the tensorviews into the outputs. |
537 | auto &instrs = M.getInstrs(); |
538 | EXPECT_EQ(instrs.size(), 7); |
539 | EXPECT_TRUE(std::none_of( |
540 | instrs.begin(), instrs.end(), |
541 | [](const Instruction &I) -> bool { return isa<ExtractTensorInst>(&I); })); |
542 | } |
543 | |
/// Check that we are able to coalesce a copy forward from the input.
/// This test consists of a copy from the input variable.
/// Its main characteristic is that this copy cannot be coalesced with
/// the output (otherwise it would be a backward chain of
/// copies from output).
/// The shareBuffers optimization works backward, so as long as
/// it manages to coalesce things with the output one by one, we
/// cannot see whether forward copies are properly handled.
552 | TEST(Optimizer, forwardCopy) { |
553 | Module mod; |
554 | Function *F = mod.createFunction("forwardCopy" ); |
555 | IRFunction M(F); |
556 | IRBuilder bb(&M); |
557 | |
558 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {64}, "input" , |
559 | WeightVar::MutabilityKind::Mutable); |
560 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "output" , |
561 | WeightVar::MutabilityKind::Mutable); |
562 | auto *tmp1 = |
563 | bb.createAllocActivationInst("tmp1" , glow::ElemKind::FloatTy, {64}); |
564 | bb.createCopyInst("copy1" , tmp1, input); |
565 | |
566 | auto *view = bb.createTensorViewInst( |
567 | "view" , tmp1, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 64})), |
568 | {0}); |
569 | bb.createInsertTensorInst("copyOutput" , output, view, {0, 0}, 1, 0); |
570 | |
571 | bb.createDeallocActivationInst("dealloc1" , tmp1); |
572 | |
573 | auto &instrs = M.getInstrs(); |
574 | auto nbInstrsBeforeOpt = instrs.size(); |
575 | optimize(M, MockBackend().shouldShareBuffers()); |
576 | |
  // After optimization, the copy should have been coalesced with input.
  // Expected count: nbInstrsBeforeOpt - 1 copy - 1 alloc - 1 dealloc.
  EXPECT_EQ(instrs.size(),
            nbInstrsBeforeOpt - 1 /*copy*/ - 1 /*alloc*/ - 1 /*dealloc*/);
581 | EXPECT_TRUE(std::none_of(instrs.begin(), instrs.end(), |
582 | [](const Instruction &I) -> bool { |
583 | return isa<AllocActivationInst>(&I); |
584 | })); |
585 | EXPECT_TRUE(std::none_of( |
586 | instrs.begin(), instrs.end(), |
587 | [](const Instruction &I) -> bool { return isa<CopyInst>(&I); })); |
588 | } |
589 | |
590 | /// Check that we are able to coalesce chain of copies |
591 | /// forward from the input. |
/// This test is similar to forwardCopy, except it uses a chain of copies
/// (more than one) instead of just one copy from input.
594 | TEST(Optimizer, chainOfTwoForwardCopies) { |
595 | Module mod; |
596 | Function *F = mod.createFunction("chainOfTwoForwardCopies" ); |
597 | IRFunction M(F); |
598 | IRBuilder bb(&M); |
599 | |
600 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {64}, "input" , |
601 | WeightVar::MutabilityKind::Mutable); |
602 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "output" , |
603 | WeightVar::MutabilityKind::Mutable); |
604 | auto *tmp1 = |
605 | bb.createAllocActivationInst("tmp1" , glow::ElemKind::FloatTy, {64}); |
606 | bb.createCopyInst("copy1" , tmp1, input); |
607 | |
608 | auto *tmp2 = |
609 | bb.createAllocActivationInst("tmp2" , glow::ElemKind::FloatTy, {64}); |
610 | bb.createCopyInst("copy2" , tmp2, tmp1); |
611 | auto *view = bb.createTensorViewInst( |
612 | "view" , tmp2, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 64})), |
613 | {0}); |
614 | bb.createInsertTensorInst("copyOutput" , output, view, {0, 0}, 1, 0); |
615 | |
616 | bb.createDeallocActivationInst("dealloc1" , tmp1); |
617 | bb.createDeallocActivationInst("dealloc2" , tmp2); |
618 | |
619 | auto &instrs = M.getInstrs(); |
620 | auto nbInstrsBeforeOpt = instrs.size(); |
621 | optimize(M, MockBackend().shouldShareBuffers()); |
622 | |
  // After optimization, the copies should have been coalesced with
  // input.
  // Ideally, we should get rid of the 2 copies and the related 2 alloc
  // activations and 2 deallocations.
  // Therefore the expected instruction count is
  // nbInstrsBeforeOpt - 2 copies - 2 allocs - 2 deallocs.
  EXPECT_EQ(instrs.size(),
            nbInstrsBeforeOpt - 2 /*copies*/ - 2 /*allocs*/ - 2 /*deallocs*/);
631 | EXPECT_TRUE(std::none_of(instrs.begin(), instrs.end(), |
632 | [](const Instruction &I) -> bool { |
633 | return isa<AllocActivationInst>(&I); |
634 | })); |
635 | EXPECT_TRUE(std::none_of( |
636 | instrs.begin(), instrs.end(), |
637 | [](const Instruction &I) -> bool { return isa<CopyInst>(&I); })); |
638 | } |
639 | |
640 | /// The idea of this test is to have live intervals looking like this: |
641 | /// A B |
642 | /// | <-copy | |
643 | /// inout | |
644 | /// | | |
645 | /// Because of the inout on A, A and B interfere. |
646 | /// Make sure we don't coalesce such buffers. |
647 | TEST(Optimizer, inoutCopy) { |
648 | Module mod; |
649 | Function *F = mod.createFunction("inoutCopy" ); |
650 | IRFunction M(F); |
651 | IRBuilder bb(&M); |
652 | |
653 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "input" , |
654 | WeightVar::MutabilityKind::Mutable); |
655 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {3, 64}, "output" , |
656 | WeightVar::MutabilityKind::Mutable); |
657 | auto *output2 = |
658 | bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "output2" , |
659 | WeightVar::MutabilityKind::Mutable); |
660 | // This copy cannot be eliminated because input must not be changed. |
661 | // Indeed, this is an observable variable plus it is used as a source |
662 | // for a copy to output2. |
663 | auto *tmp1 = |
664 | bb.createAllocActivationInst("tmp1" , glow::ElemKind::FloatTy, {2, 64}); |
665 | bb.createCopyInst("copy1" , tmp1, input); |
666 | |
667 | auto *tmp2 = |
668 | bb.createAllocActivationInst("tmp2" , glow::ElemKind::FloatTy, {64}); |
669 | bb.createSplatInst("splat" , tmp2, 3.0); |
670 | auto *view = |
671 | bb.createTensorView(ElemKind::FloatTy, {1, 64}, tmp2, "view" , {0}); |
672 | bb.createInsertTensorInst("insertTmp1" , tmp1, view, {0, 0}, 1, 0); |
673 | bb.createInsertTensorInst("insertOutput" , output, tmp1, {1, 0}, 1, 0); |
674 | bb.createCopyInst("copyOutput2" , output2, input); |
675 | |
676 | bb.createDeallocActivationInst("dealloc1" , tmp1); |
677 | bb.createDeallocActivationInst("dealloc2" , tmp2); |
678 | |
679 | optimize(M, MockBackend().shouldShareBuffers()); |
680 | |
681 | // After optimization, the copies shouldn't have been touched. |
682 | // tmp1 = copy input cannot be coalesced because tmp1 is inout. |
683 | // output2 = copy input cannot be coalesced because they are both |
684 | // externally visible. |
685 | EXPECT_EQ(input->getNumUsers(), 2); |
686 | EXPECT_TRUE( |
687 | std::all_of(input->getUsers().begin(), input->getUsers().end(), |
688 | [](const Use &I) -> bool { return isa<CopyInst>(I.get()); })); |
689 | const Value *expectedDest[] = {tmp1, output2}; |
690 | unsigned idx = 0; |
691 | for (const Use &use : input->getUsers()) { |
692 | if (idx == sizeof(expectedDest) / sizeof(expectedDest[0])) { |
693 | // If we end up here that means that input has too many users. |
694 | EXPECT_FALSE(true); |
695 | break; |
696 | } |
697 | EXPECT_EQ(use.get()->getOperand(0).first, expectedDest[idx++]); |
698 | } |
699 | } |
700 | |
701 | /// Check that we properly define a buffer when we extend its live-range |
702 | /// on a segment of the source that does not have any definition. |
703 | /// A source live-range without any definition can happen when this |
704 | /// is the first use of a WeightVar. |
705 | /// At the high level, this test looks like this: |
706 | /// WeightVar Buffer |
707 | /// | useA |
708 | /// | useB | def |
709 | /// | redef | save to output |
710 | /// - UseA is the first use of WeightVar and we want it to be replaced by |
711 | /// a use of Buffer. I.e., Buffer live-range is extended toward the top. |
712 | /// - UseB involves both WeightVar and Buffer. It exposes the buffer sharing |
713 | /// opportunity between these two variables. It must happen after useA |
714 | /// to expose the case of extending the live-range of a buffer toward |
715 | /// the top where no definition exists. |
/// - redef redefines WeightVar. It is necessary because otherwise useA and
///   useB could share the same buffer and thus, we would extend the live-range
///   of the buffer in useA downward (or the use of Buffer up to the
///   definition of the buffer in useA), which is not what we want to test.
/// - save to output is required to keep the def of Buffer alive. Moreover,
///   the save must be done in such a way that output and Buffer cannot
///   share the same buffer. Otherwise, the live-range of output would be
///   extended upward to useB and, given that output and WeightVar are both
///   externally observable, output could not be merged with WeightVar.
///   We would therefore not expose an extension of output up to useA
///   and would not test the case where the replaced buffer doesn't have any
///   definition.
729 | /// |
730 | /// The expected result at a high level looks like this: |
731 | /// WeightVar Buffer |
732 | /// | copy | <- Buffer gets WeightVar |
733 | /// useA | <- Buffer is used instead of WeightVar |
734 | /// useB | def <- ditto |
735 | /// | redef | save to output |
736 | TEST(Optimizer, bufferReuseWithoutDefs) { |
737 | Module mod; |
738 | Function *F = mod.createFunction("bufferReuseWithoutDefs" ); |
739 | IRFunction M(F); |
740 | IRBuilder bb(&M); |
741 | |
742 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {64}, "input" , |
743 | WeightVar::MutabilityKind::Mutable); |
744 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "output" , |
745 | WeightVar::MutabilityKind::Mutable); |
746 | auto *tmp1 = |
747 | bb.createAllocActivationInst("tmp1" , glow::ElemKind::FloatTy, {64}); |
748 | |
749 | auto *tmp2 = |
750 | bb.createAllocActivationInst("tmp2" , glow::ElemKind::FloatTy, {64}); |
751 | auto *tmp3 = |
752 | bb.createAllocActivationInst("tmp3" , glow::ElemKind::FloatTy, {64}); |
753 | |
754 | bb.createSplatInst("tmp2init" , tmp2, 1.0); |
755 | // use input for some stuff. |
756 | auto *useA = bb.createElementAddInst("useA" , tmp3, tmp2, input); |
757 | // Make the first user of input a dependency of the definition |
758 | // of tmp1 that way the scheduler cannot mess with the layout |
759 | // we want for the instructions ordering. |
760 | bb.createElementAddInst("useB" , tmp1, input, tmp3); |
761 | bb.createCopyInst("redef" , input, tmp3); |
762 | auto *view = bb.createTensorViewInst( |
763 | "view" , tmp1, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 64})), |
764 | {0}); |
765 | bb.createInsertTensorInst("save" , output, view, {0, 0}, 1, 0); |
766 | |
767 | bb.createDeallocActivationInst("dealloc1" , tmp1); |
768 | bb.createDeallocActivationInst("dealloc2" , tmp2); |
769 | bb.createDeallocActivationInst("dealloc2" , tmp3); |
770 | |
771 | optimize(M, MockBackend().shouldShareBuffers()); |
772 | |
  // Check that we manage to expose the problematic case we wanted:
  // tmp1 is extended upward and replaces the use of input.
775 | EXPECT_EQ(useA->getRHS(), tmp1); |
776 | // Check that tmp1 is properly defined before useA. |
777 | Instruction *instBeforeUseA = &*std::prev(useA->getIterator()); |
778 | |
779 | EXPECT_TRUE(isa<CopyInst>(instBeforeUseA)); |
780 | // The somewhat complicated check is to make sure we don't crash the test |
781 | // when instBeforeUseA is not a copy. |
782 | // I.e., this test was failing (instead of crashing) when the |
783 | // bug was present. |
784 | EXPECT_EQ(instBeforeUseA->getNumOperands() > 0 |
785 | ? instBeforeUseA->getOperand(0).first |
786 | : nullptr, |
787 | tmp1); |
788 | EXPECT_EQ(instBeforeUseA->getNumOperands() > 1 |
789 | ? instBeforeUseA->getOperand(1).first |
790 | : nullptr, |
791 | input); |
792 | } |
793 | |
794 | /// Same as bufferReuseWithoutDefs but with casts in the middle. |
795 | /// This makes sure that we properly set the types for whatever fixup |
796 | /// code we will insert. |
797 | /// The high level view of the test is: |
798 | /// WeightVar Buffer |
799 | /// | useA |
800 | /// | useB(cast)| def |
801 | /// | redef | save to output |
802 | /// |
803 | /// The expected result at a high level looks like this: |
804 | /// WeightVar Buffer |
805 | /// | copy(cast)| <- Buffer gets WeightVar |
806 | /// useA(cast)| <- Buffer is used instead of WeightVar |
807 | /// useB | def <- ditto |
808 | /// | redef | save to output |
809 | TEST(Optimizer, bufferReuseWithoutDefsPlusCasts) { |
810 | Module mod; |
811 | Function *F = mod.createFunction("bufferReuseWithoutDefsPlusCasts" ); |
812 | IRFunction M(F); |
813 | IRBuilder bb(&M); |
814 | |
815 | auto *input = bb.createWeightVar(glow::ElemKind::FloatTy, {1, 64}, "input" , |
816 | WeightVar::MutabilityKind::Mutable); |
817 | auto *output = bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "output" , |
818 | WeightVar::MutabilityKind::Mutable); |
819 | auto *tmp1 = |
820 | bb.createAllocActivationInst("tmp1" , glow::ElemKind::FloatTy, {64}); |
821 | |
822 | auto *tmp2 = |
823 | bb.createAllocActivationInst("tmp2" , glow::ElemKind::FloatTy, {1, 64}); |
824 | auto *tmp3 = |
825 | bb.createAllocActivationInst("tmp3" , glow::ElemKind::FloatTy, {1, 64}); |
826 | |
827 | bb.createSplatInst("tmp2init" , tmp2, 1.0); |
828 | auto *useA = bb.createElementAddInst("useA" , tmp3, tmp2, input); |
829 | auto *inputView = bb.createTensorViewInst( |
830 | "inputView" , input, mod.uniqueType(Type(glow::ElemKind::FloatTy, {64})), |
831 | {0, 0}); |
832 | auto *tmp3View = bb.createTensorViewInst( |
833 | "tmp3View" , tmp3, mod.uniqueType(Type(glow::ElemKind::FloatTy, {64})), |
834 | {0, 0}); |
835 | |
836 | bb.createElementAddInst("useB" , tmp1, inputView, tmp3View); |
837 | bb.createCopyInst("redef" , input, tmp3); |
838 | auto *view = bb.createTensorViewInst( |
839 | "view" , tmp1, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 64})), |
840 | {0}); |
841 | bb.createInsertTensorInst("save" , output, view, {0, 0}, 1, 0); |
842 | |
843 | bb.createDeallocActivationInst("dealloc1" , tmp1); |
844 | bb.createDeallocActivationInst("dealloc2" , tmp2); |
845 | bb.createDeallocActivationInst("dealloc2" , tmp3); |
846 | |
847 | optimize(M, MockBackend().shouldShareBuffers()); |
848 | |
  // Check that we manage to expose the problematic case we wanted:
  // tmp1 is extended upward and replaces the use of input.
851 | Value *useARHS = useA->getRHS(); |
852 | EXPECT_EQ(getOrigin(useARHS), tmp1); |
853 | Instruction *tmp1TensorView = dyn_cast<TensorViewInst>(useARHS); |
854 | EXPECT_TRUE(tmp1TensorView && tmp1TensorView->getOperand(0).first == tmp1); |
855 | // Check that tmp1 is properly defined before useA. |
856 | Instruction *tmp1Fixup = |
857 | tmp1TensorView ? &*std::prev(tmp1TensorView->getIterator()) : nullptr; |
858 | EXPECT_TRUE(tmp1Fixup && isa<CopyInst>(tmp1Fixup)); |
859 | // The somewhat complicated check is to make sure we don't crash the test |
860 | // when instBeforeUseA is not a copy. |
861 | EXPECT_EQ((tmp1Fixup && tmp1Fixup->getNumOperands() > 0) |
862 | ? getOrigin(tmp1Fixup->getOperand(0).first) |
863 | : nullptr, |
864 | tmp1); |
865 | // Now check that input feeds tmp1Fixup and was properly casted. |
866 | Instruction *inputCast = |
867 | (tmp1Fixup && tmp1Fixup->getNumOperands() > 1) |
868 | ? dyn_cast<TensorViewInst>(tmp1Fixup->getOperand(1).first) |
869 | : nullptr; |
870 | EXPECT_EQ(inputCast ? getOrigin(inputCast) : nullptr, input); |
871 | EXPECT_EQ(inputCast ? inputCast->getOperand(0).first : nullptr, input); |
872 | } |
873 | |
874 | /// Check that a copy from a buffer to itself is |
875 | /// detected when both src and dest are hidden under TensorView |
876 | /// instructions and eliminated if the linearized offsets of the src and dest |
877 | /// are equal. |
878 | TEST(Optimizer, copyEliminationTensorViewToTensorView) { |
879 | Module mod; |
880 | Function *F = mod.createFunction("copyEliminationTensorViewToTensorView" ); |
881 | IRFunction M(F); |
882 | IRBuilder bb(&M); |
883 | |
884 | // Test that a copy between tensorviews with identical offsets which have |
885 | // different src operands with different offsets into the same underlying |
  // buffer is not optimized away.
887 | |
888 | // Create a WeightVar for TensorViews to use as their source operand. |
  auto *A = bb.createWeightVar(glow::ElemKind::FloatTy, {4, 2}, "A",
                               WeightVar::MutabilityKind::Mutable);

  // Create a view into A.
  auto *view1 = bb.createTensorViewInst(
      "view1", A, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 2, 1})),
      {0, 0});

  // Create another view into A with the same shape as view1 but different
  // offsets.
  auto *view2 = bb.createTensorViewInst(
      "view2", A, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 2, 1})),
      {1, 1});
902 | |
903 | // Create views into view1 and view2 with identical offsets. |
  auto *view3 = bb.createTensorViewInst(
      "view3", view1, mod.uniqueType(Type(glow::ElemKind::FloatTy, {2, 1})),
      {0, 0, 0});

  auto *view4 = bb.createTensorViewInst(
      "view4", view2, mod.uniqueType(Type(glow::ElemKind::FloatTy, {2, 1})),
      {0, 0, 0});

  // Create a copy from view3 to view4. Although their own offsets are
  // identical, they point at different linearized offsets into A (0 and 3),
  // so this copy must not be optimized out.
  bb.createCopyInst("copyViewToView", view3, view4);
915 | |
916 | auto &instrs = M.getInstrs(); |
917 | optimize(M, MockBackend().shouldShareBuffers()); |
918 | |
  // All instructions should remain because the linearized offsets of the two
  // final tensorviews are not the same.
921 | EXPECT_EQ(instrs.size(), 5); |
922 | EXPECT_FALSE(std::none_of( |
923 | instrs.begin(), instrs.end(), |
924 | [](const Instruction &I) -> bool { return isa<CopyInst>(&I); })); |
925 | EXPECT_FALSE(std::none_of( |
926 | instrs.begin(), instrs.end(), |
927 | [](const Instruction &I) -> bool { return isa<TensorViewInst>(&I); })); |
928 | |
929 | // Reset state for next test. |
930 | M.clear(); |
931 | M.setGraph(F); |
932 | |
933 | // Test that a copy between tensorviews with different offsets which have |
934 | // different src operands with different offsets but have the same linearized |
935 | // offset into the same underlying buffer is optimized away. |
936 | |
937 | // Create a WeightVar for TensorViews to use as their source operand. |
  auto *D = bb.createWeightVar(glow::ElemKind::FloatTy, {4, 2}, "D",
                               WeightVar::MutabilityKind::Mutable);

  // Create another WeightVar. D will be inserted into this to avoid
  // optimizing all instructions away.
  auto *E = bb.createWeightVar(glow::ElemKind::FloatTy, {4, 2}, "E",
                               WeightVar::MutabilityKind::Mutable);

  // Create a view into D. The linearized offset of this TensorView is 0 and
  // the size is 8.
  auto *view7 = bb.createTensorViewInst(
      "view7", D, mod.uniqueType(Type(glow::ElemKind::FloatTy, {4, 2, 1})),
      {0, 0});

  // Create a view into view7. The linearized offset of this TensorView is
  // 4 and the size is 2.
  auto *view8 = bb.createTensorViewInst(
      "view8", view7, mod.uniqueType(Type(glow::ElemKind::FloatTy, {2})),
      {2, 0, 0});

  // Create a view into D. The linearized offset of this TensorView is 4 and
  // the size is 4.
  auto *view9 = bb.createTensorViewInst(
      "view9", D, mod.uniqueType(Type(glow::ElemKind::FloatTy, {4})), {2, 0});

  // Create a view into view9. The linearized offset of this TensorView is 4
  // and the size is 2.
  auto *view10 = bb.createTensorViewInst(
      "view10", view9, mod.uniqueType(Type(glow::ElemKind::FloatTy, {2})), {0});

  // Create a copy from view8 to view10. Since the linearized offsets and
  // types of the two views are identical, this copy should be optimized out.
  bb.createCopyInst("copyViewToView", view8, view10);

  // Insert D into E just to make sure the IR isn't empty after optimization.
  bb.createInsertTensorInst("copyOutput", E, D, /*Offsets=*/{0, 0},
                            /*Count=*/1, /*Axis=*/0);
975 | |
976 | optimize(M, MockBackend().shouldShareBuffers()); |
977 | |
978 | // Only one instruction (the InsertTensor) should remain. |
979 | EXPECT_EQ(instrs.size(), 1); |
980 | EXPECT_TRUE(std::none_of( |
981 | instrs.begin(), instrs.end(), |
982 | [](const Instruction &I) -> bool { return isa<CopyInst>(&I); })); |
983 | EXPECT_TRUE(std::none_of( |
984 | instrs.begin(), instrs.end(), |
985 | [](const Instruction &I) -> bool { return isa<TensorViewInst>(&I); })); |
986 | } |
987 | |
988 | /// Check that a copy from a buffer to itself is |
989 | /// detected when the src is hidden under a layer of TensorView instructions and |
990 | /// eliminated if the linearized offsets of the src and dest are equal. |
991 | TEST(Optimizer, copyEliminationTensorViewBuffer) { |
992 | Module mod; |
993 | Function *F = mod.createFunction("copyEliminationTensorViewToBuffer" ); |
994 | IRFunction M(F); |
995 | IRBuilder bb(&M); |
996 | |
997 | // Create a WeightVar for TensorViews to use as their source operand. |
998 | auto *B = bb.createWeightVar(glow::ElemKind::FloatTy, {4, 2}, "B" , |
999 | WeightVar::MutabilityKind::Mutable); |
1000 | |
1001 | // Create another WeightVar. B will be copied into this to avoid |
1002 | // optimizing all instructions away. |
1003 | auto *C = bb.createWeightVar(glow::ElemKind::FloatTy, {4, 2}, "C" , |
1004 | WeightVar::MutabilityKind::Mutable); |
1005 | |
1006 | // Create two stacked views into A. Two are required because a tensorview |
1007 | // that has the same type as its src is eliminated before copy elimination is |
1008 | // applied. |
1009 | auto *view1 = bb.createTensorViewInst( |
1010 | "view1" , B, mod.uniqueType(Type(glow::ElemKind::FloatTy, {1, 4, 2})), |
1011 | {0, 0}); |
1012 | |
1013 | auto *view2 = bb.createTensorViewInst( |
1014 | "view2" , view1, mod.uniqueType(Type(glow::ElemKind::FloatTy, {4, 2})), |
1015 | {0, 0, 0}); |
1016 | |
1017 | // Create a copy from view2 to B. This view points to the start of A and has |
1018 | // the same type, so this should be optimized out. |
1019 | bb.createCopyInst("copyViewToBuf" , view2, B); |
1020 | |
1021 | // Create a copy from B to view2. This should also be optimized out for the |
1022 | // same reason. |
1023 | bb.createCopyInst("copyBufToView" , B, view2); |
1024 | |
1025 | // Insert B into C. This exists just to make sure the optimised IR isn't |
1026 | // empty. |
1027 | bb.createInsertTensorInst("copyOutput" , C, B, /*Offsets=*/{0, 0}, |
1028 | /*Count=*/1, /*Axis=*/0); |
1029 | |
1030 | auto &instrs = M.getInstrs(); |
1031 | optimize(M, MockBackend().shouldShareBuffers()); |
1032 | |
1033 | // Only one instruction (the InsertTensor) should remain. |
1034 | EXPECT_EQ(instrs.size(), 1); |
1035 | EXPECT_TRUE(std::none_of( |
1036 | instrs.begin(), instrs.end(), |
1037 | [](const Instruction &I) -> bool { return isa<CopyInst>(&I); })); |
1038 | EXPECT_TRUE(std::none_of( |
1039 | instrs.begin(), instrs.end(), |
1040 | [](const Instruction &I) -> bool { return isa<TensorViewInst>(&I); })); |
1041 | } |
1042 | |
1043 | /// Check if dump functions work for Value and IRFunction. |
1044 | TEST(Optimizer, dumpDataStructure) { |
1045 | Module mod; |
1046 | Function *F = mod.createFunction("inoutCopy" ); |
1047 | IRFunction M(F); |
1048 | IRBuilder bb(&M); |
1049 | |
1050 | Value *input = bb.createWeightVar(glow::ElemKind::FloatTy, {2, 64}, "input" , |
1051 | WeightVar::MutabilityKind::Mutable); |
1052 | // Dump Value. |
1053 | std::string storageV1; |
1054 | llvm::raw_string_ostream osV1(storageV1); |
1055 | input->dump(osV1); |
1056 | std::string mesV = input->toString(); |
1057 | std::string expectMesV = R"(%input = WeightVar float<2 x 64> mutable)" ; |
1058 | EXPECT_EQ(mesV, expectMesV); |
1059 | EXPECT_EQ(mesV, osV1.str()); |
1060 | std::string storageV2; |
1061 | llvm::raw_string_ostream osV2(storageV2); |
1062 | osV2 << input; |
1063 | EXPECT_EQ(mesV, osV2.str()); |
1064 | // Dump IRFunction. |
1065 | std::string storageIRF1; |
1066 | llvm::raw_string_ostream osIRF1(storageIRF1); |
1067 | M.dump(osIRF1); |
1068 | std::string mesI = M.toString(); |
1069 | std::string expectMesI = R"(function inoutCopy |
1070 | declare { |
1071 | %input = WeightVar float<2 x 64> mutable // size: 512 |
1072 | |
1073 | ; size = 512 bytes |
1074 | } |
1075 | |
1076 | code { |
1077 | } |
1078 | )" ; |
1079 | EXPECT_EQ(mesI, expectMesI); |
1080 | EXPECT_EQ(mesI, osIRF1.str()); |
1081 | std::string storageIRF2; |
1082 | llvm::raw_string_ostream osIRF2(storageIRF2); |
1083 | osIRF2 << M; |
1084 | EXPECT_EQ(mesI, osIRF2.str()); |
1085 | } |
1086 | |