1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #include "BackendTestUtils.h" |
17 | |
18 | #include "glow/Base/TensorSerialization.h" |
19 | #include "glow/Converter/TypeAToTypeBFunctionConverter.h" |
20 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
21 | #include "glow/Exporter/ONNXModelWriter.h" |
22 | #include "glow/Flags/Flags.h" |
23 | #include "glow/Graph/Graph.h" |
24 | #include "glow/Partitioner/Partitioner.h" |
25 | #include "glow/Runtime/DeferredWeightLoader.h" |
26 | #include "glow/Runtime/HostManager/HostManager.h" |
27 | #include "lib/Onnxifi/Base.h" |
28 | |
29 | #include <algorithm> |
30 | #include <cmath> |
31 | #include <future> |
32 | #include <random> |
33 | |
34 | #include "gtest/gtest.h" |
35 | |
36 | #include "llvm/Support/CommandLine.h" |
37 | #include "llvm/Support/FileSystem.h" |
38 | |
39 | constexpr size_t MAX_MEMORY = 64e+9; |
40 | |
41 | using namespace glow; |
42 | |
43 | namespace { |
llvm::cl::OptionCategory recSysTestCat("RecSys Category");

llvm::cl::opt<bool> enableStaticPlaceholderOpt(
    "enable-static-placeholder", llvm::cl::desc("Enable Static Placeholder."),
    llvm::cl::Optional, llvm::cl::init(false), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> miniBatchOpt("mini-batch", llvm::cl::desc("Minibatch."),
                                     llvm::cl::Optional, llvm::cl::init(8),
                                     llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> concurrentReqestsOpt(
    "concurrent-count", llvm::cl::desc("Number of concurrent requests."),
    llvm::cl::Optional, llvm::cl::init(1), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned>
    repsOpt("reps", llvm::cl::desc("Number of benchmark repetitions."),
            llvm::cl::Optional, llvm::cl::init(1),
            llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> embeddingDimOpt("embedding-dim",
                                        llvm::cl::desc("Embedding dim."),
                                        llvm::cl::Optional, llvm::cl::init(64),
                                        llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> denseDimOpt("dense-dim", llvm::cl::desc("Dense dim."),
                                    llvm::cl::Optional, llvm::cl::init(800),
                                    llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> numHiddenBottomMLPLayersOpt(
    "num-hidden-bottom-mlp-layers",
    llvm::cl::desc("Number of hidden bottom MLP layers."), llvm::cl::Optional,
    llvm::cl::init(3), llvm::cl::cat(recSysTestCat));

llvm::cl::list<unsigned> bottomMLPIntermediateDimsOpt(
    "bottom-mlp-intermediate-dims",
    llvm::cl::desc(
        "Comma-separated list of intermediate dims for each of the bottom MLP "
        "hidden layers and output layer. Will wrap around to the start of the "
        "list and reuse dimensions if fewer are provided than the number of "
        "layers. If unprovided, the default is 1024."),
    llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated,
    llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned>
    numHiddenTopMLPLayersOpt("num-hidden-top-mlp-layers",
                             llvm::cl::desc("Number of hidden top MLP layers."),
                             llvm::cl::Optional, llvm::cl::init(3),
                             llvm::cl::cat(recSysTestCat));

llvm::cl::list<unsigned> topMLPIntermediateDimsOpt(
    "top-mlp-intermediate-dims",
    llvm::cl::desc(
        "Comma-separated list of intermediate dims for each of the top MLP "
        "hidden layers and output layer. Will wrap around to the start of the "
        "list and reuse dimensions if fewer are provided than the number of "
        "layers. If unprovided, the default is 1024."),
    llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated,
    llvm::cl::cat(recSysTestCat));

llvm::cl::list<unsigned> lengthsMinMaxOpt(
    "lengths-min-max",
    llvm::cl::desc("Comma-separated [min, max) values to be used when "
                   "generating random lengths inputs for SLS/SLWS. If left "
                   "unspecified, will use [90, 111)."),
    llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated,
    llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> randomSeedContentOpt(
    "random-seed-content",
    llvm::cl::desc(
        "Seed for the random data generation for indices and weights tensors"),
    llvm::cl::Optional, llvm::cl::init(2001), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> randomSeedLengthsOpt(
    "random-seed-lengths",
    llvm::cl::desc("Seed for the random data generation for lengths tensors"),
    llvm::cl::Optional, llvm::cl::init(2001), llvm::cl::cat(recSysTestCat));

llvm::cl::list<unsigned> tableSizesOpt(
    "embedding-table-sizes",
    llvm::cl::desc("Comma-separated list of embedding table sizes."),
    llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated,
    llvm::cl::cat(recSysTestCat));

llvm::cl::list<unsigned> tableCountsOpt(
    "embedding-table-counts",
    llvm::cl::desc("Comma-separated list of embedding table counts, "
                   "corresponding to a count for each size listed in "
                   "embedding-table-sizes."),
    llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated,
    llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> deviceMemCapacityOpt(
    "device-mem-capacity",
    llvm::cl::desc("Device memory capacity in kB. Default is dependent on the "
                   "test in order to potentially force partitioning."),
    llvm::cl::Optional, llvm::cl::init(0), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> numDevicesOpt(
    "num-devices", llvm::cl::desc("Number of devices to use for partitioning."),
    llvm::cl::Optional, llvm::cl::init(2), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<unsigned> partitioningNumDevicesOpt(
    "partitioning-num-devices",
    llvm::cl::desc(
        "Number of devices to override sparseNNPartitioningNumCards."),
    llvm::cl::Optional, llvm::cl::init(1), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<std::string> traceDir(
    "trace-dir",
    llvm::cl::desc("Directory used to store Glow trace event files. If not "
                   "set, tracing is not enabled."),
    llvm::cl::Optional, llvm::cl::cat(recSysTestCat));

llvm::cl::opt<bool> dumpBinaryResults(
    "dump-binary-results",
    llvm::cl::desc("Dump raw binary Tensor results after execution."),
    llvm::cl::init(false), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<bool> dumpModelInputs(
    "dump-model-inputs",
    llvm::cl::desc(
        "Dump the model and inputs into a format the repro binary can run."),
    llvm::cl::init(false), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<bool> dumpFinalGraph(
    "dump-final-graph",
    llvm::cl::desc(
        "Call dumpDag on each Function passed to the backend for compilation."),
    llvm::cl::init(false), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<bool> saturateHost("saturate-host",
                                 llvm::cl::desc("Enable host saturation."),
                                 llvm::cl::init(false),
                                 llvm::cl::cat(recSysTestCat));

llvm::cl::opt<bool> fuseScaleOffsetFp32Opt(
    "glow_global_fused_scale_offset_fp32",
    llvm::cl::desc("Enable converting scale/offset in SLS's input data from "
                   "fp16 to fp32."),
    llvm::cl::init(false), llvm::cl::cat(recSysTestCat));

llvm::cl::opt<bool> skipCorrectnessCheck(
    "skip_correctness_check",
    llvm::cl::desc("Skip correctness check with the Interpreter backend."),
    llvm::cl::Optional, llvm::cl::init(false), llvm::cl::cat(recSysTestCat));
190 | } // namespace |
191 | |
192 | class TestDeferredWeightLoader : public DeferredWeightLoader { |
193 | public: |
194 | Error loadNextWeight() override { |
195 | position_++; |
196 | return Error::success(); |
197 | } |
198 | Error setSrc(void *loaderObject) override { return Error::success(); } |
199 | |
200 | Tensor *addWeight(TypeRef ty) { |
202 | weights_.push_back(Tensor(ty)); |
203 | return &weights_.at(weights_.size() - 1); |
204 | } |
205 | |
206 | void addName(std::string name) { names_.push_back(name); } |
207 | void setTypeInfo(std::map<std::string, Type> info) override {} |
208 | |
209 | std::string getName() override { |
210 | if (position_ >= int(names_.size())) { |
211 | return "" ; |
212 | } |
213 | return names_[position_]; |
214 | } |
215 | |
216 | Tensor *getTensor() override { |
217 | if (position_ >= int(weights_.size())) { |
218 | return nullptr; |
219 | } |
220 | return &weights_[position_]; |
221 | } |
222 | |
223 | private: |
224 | std::vector<Tensor> weights_{}; |
225 | std::vector<std::string> names_{}; |
226 | int position_{-1}; |
227 | }; |
228 | |
229 | /// Fills the tensor \p H with some stable random data with the seed \p seed |
230 | /// and the range [-scale .. scale]. |
231 | static void fillStableRandomData(Handle<float> H, size_t seed, |
232 | float scale = 1) { |
233 | for (size_t i = 0, e = H.size(); i < e; i++) { |
234 | H.raw(i) = scale * (float((int(i * 1921 + seed) % 100) - 50) / 50); |
235 | } |
236 | } |
237 | |
/// Fills the tensor \p H with some stable random integers with the seed \p
/// seed and the range [min, max).
240 | template <typename T> |
241 | static void fillStableRandomIndex(Handle<T> H, size_t seed, size_t min = 0, |
242 | size_t max = 10) { |
243 | for (size_t i = 0, e = H.size(); i < e; i++) { |
244 | H.raw(i) = min + (int(i * 1921 + seed) % (max - min)); |
245 | } |
246 | } |
template void fillStableRandomIndex(Handle<int64_t> H, size_t seed, size_t min,
                                    size_t max);
template void fillStableRandomIndex(Handle<int32_t> H, size_t seed, size_t min,
                                    size_t max);
251 | |
252 | /// Sum of all elements in Tensor. |
253 | static size_t sumOfElements(Handle<int32_t> H) { |
254 | size_t sum = 0; |
255 | for (size_t i = 0, e = H.size(); i < e; i++) { |
256 | sum += H.raw(i); |
257 | } |
258 | return sum; |
259 | } |
260 | |
261 | /// Tests a simplified Recommendation System model. |
262 | /// |
263 | /// The RecSys model has four components: |
/// * An initial Multilayer Perceptron acting on the inputs.
/// * Some number of Sparse Features: SparseLengthsSum nodes acting on
///   embedding tables (see https://caffe2.ai/docs/sparse-operations.html).
/// * An interaction layer bringing together the output of the bottom MLP and
///   the sparse features.
269 | /// * A final MLP acting on the result of the interaction. |
270 | /// |
271 | /// The final result is a float indicating the strength of the recommendation. |
272 | /// |
273 | /// |
274 | /// +------+ |
275 | /// |Output| |
276 | /// +--^---+ |
277 | /// | |
278 | /// +---+---+ |
279 | /// | TOP | |
280 | /// | | |
281 | /// | MLP | |
282 | /// +---^---+ |
283 | /// | |
284 | /// | |
285 | /// +-------+--------+ |
286 | /// | Interaction <---------+ |
287 | /// +-----> <---+ | |
288 | /// | +--------^-----^-+ | | |
289 | /// | | | | | |
290 | /// +--+----+ +-+-+ +-+-+ +-+-+ +-+-+ |
291 | /// | Bottom| |SLS| |SLS| |SLS| |SLS| |
292 | /// | | +---+ +---+ +---+ +---+ |
293 | /// | MLP | Sparse Features |
294 | /// +---^---+ |
295 | /// | |
296 | /// +---+---+ |
297 | /// | Input | |
298 | /// +-------+ |
299 | /// |
300 | class RecommendationSystemTest : public BackendTest { |
301 | public: |
302 | RecommendationSystemTest() : BackendTest(/* deviceMemory */ MAX_MEMORY) {} |
303 | |
304 | protected: |
305 | ExecutionContext context_; |
306 | PlaceholderBindings *bindings_; |
307 | PrecisionConfiguration precConfig_; |
308 | PrecisionConfiguration precConfigForInterpreter_; |
309 | |
310 | // Test Config: |
311 | dim_t miniBatch; |
312 | dim_t embeddingDim; |
313 | dim_t denseDim; |
314 | std::vector<dim_t> tableSizes; |
315 | std::vector<dim_t> bottomMLPIntermediateDims; |
316 | std::vector<dim_t> topMLPIntermediateDims; |
317 | size_t lengthsMin; |
318 | size_t lengthsMax; |
319 | |
320 | // Used to configure correct precision settings: |
321 | bool quantizeSLWSData{false}; |
322 | bool quantizeFC{false}; |
323 | bool convertToFP16{false}; |
324 | bool useFP16SLWS{false}; |
325 | bool useFP16AccumSLWS{false}; |
326 | |
327 | bool convertFusedToFP16{false}; |
328 | bool convert4or8BitFusedToFP32{false}; |
329 | |
330 | // Used to enable static placeholder: |
331 | bool enableStaticPlaceholder{false}; |
332 | |
333 | // Whether to use SLWS with gather of weights, instead of SLS. |
334 | bool gatherWeights{false}; |
335 | |
336 | // Used to disable Interpreter deferred weight loading, because we run |
337 | // FBA and Interpreter tests sequentially. |
338 | bool isInterpreter{false}; |
339 | |
340 | // Partitioner config: |
341 | uint64_t deviceMemCapacity; |
342 | size_t numDevices; |
343 | bool useSparseNNPartitioning{false}; |
344 | bool sparseNNPartitioningAddSLSConcats{false}; |
345 | int32_t sparseNNPartitioningNumCards{1}; |
346 | int64_t sparseNNPartitioningSLSKbytes{1000}; |
347 | int32_t sparseNNPartitioningNumCoresSLS{1}; |
348 | int32_t sparseNNPartitioningNumCoresOther{1}; |
349 | |
350 | // Result from executing the unpartitioned model on the backend being tested. |
351 | Tensor *resultTensor{nullptr}; |
352 | |
353 | /// Helper that \returns intermediate dims given a provided list of dims \p |
354 | /// providedIntermediateDims and the number of layers needed \p numLayers. If |
355 | /// the provided list is empty then all dims will be set to |
356 | /// \p defaultIntermediateDim. If the size of \p providedIntermediateDims is |
357 | /// less than \p numLayers then it will wrap around and reuse |
358 | /// \p providedIntermediateDims until \p numLayers are added to the returned |
359 | /// vector. |
360 | static std::vector<dim_t> |
361 | getIntermediateDims(llvm::ArrayRef<unsigned> providedIntermediateDims, |
362 | unsigned numLayers, dim_t defaultIntermediateDim = 1024) { |
363 | std::vector<dim_t> destIntermediateDims; |
364 | std::vector<dim_t> dims(providedIntermediateDims.begin(), |
365 | providedIntermediateDims.end()); |
366 | if (dims.empty()) { |
367 | dims.push_back(defaultIntermediateDim); |
368 | } |
369 | const size_t numProvidedDimsTop = dims.size(); |
370 | // Note: Add one extra intermediate dim, which is used by the output layer |
371 | // of the MLP. The input layer is set based on its own input. |
372 | for (dim_t i = 0, e = numLayers + 1; i < e; i++) { |
373 | destIntermediateDims.push_back(dims[i % numProvidedDimsTop]); |
374 | } |
375 | return destIntermediateDims; |
376 | } |
377 | |
378 | void SetUp() override { |
379 | bindings_ = context_.getPlaceholderBindings(); |
380 | |
381 | /// Test configuration, tweak here: |
382 | miniBatch = miniBatchOpt; |
383 | embeddingDim = embeddingDimOpt; |
384 | denseDim = denseDimOpt; |
385 | lengthsMin = 90; |
386 | lengthsMax = 111; |
387 | |
388 | if (!tableSizesOpt.empty()) { |
389 | if (!tableCountsOpt.empty()) { |
390 | CHECK_EQ(tableSizesOpt.size(), tableCountsOpt.size()) |
391 | << "Embedding table sizes and counts must be same length." ; |
392 | for (size_t i = 0, e = tableSizesOpt.size(); i < e; i++) { |
393 | for (size_t j = 0, f = tableCountsOpt[i]; j < f; j++) { |
394 | tableSizes.push_back(tableSizesOpt[i]); |
395 | } |
396 | } |
397 | } else { |
398 | tableSizes = |
399 | std::vector<dim_t>(tableSizesOpt.begin(), tableSizesOpt.end()); |
400 | } |
401 | // Stable randomization of the order of the tables. |
402 | std::shuffle(tableSizes.begin(), tableSizes.end(), std::mt19937()); |
403 | } else { |
404 | tableSizes = {8000, 6000, 7000, 9000, 12000, |
405 | 8000, 6000, 7000, 9000, 12000}; |
406 | } |
407 | |
408 | // Set up the bottom and top MLP intermediate dimensions. |
409 | bottomMLPIntermediateDims = getIntermediateDims( |
410 | bottomMLPIntermediateDimsOpt, numHiddenBottomMLPLayersOpt); |
411 | topMLPIntermediateDims = getIntermediateDims(topMLPIntermediateDimsOpt, |
412 | numHiddenTopMLPLayersOpt); |
413 | |
414 | if (!lengthsMinMaxOpt.empty()) { |
      assert(lengthsMinMaxOpt.size() == 2 &&
             "If min and max are used, must be 2 values provided");
      lengthsMin = lengthsMinMaxOpt[0];
      lengthsMax = lengthsMinMaxOpt[1];
      assert(lengthsMinMaxOpt[0] < lengthsMinMaxOpt[1] && "Min must be < max");
420 | } |
421 | |
422 | // Create TraceContext if trace file path is provided. |
423 | if (!traceDir.empty()) { |
424 | context_.setTraceContext( |
425 | glow::make_unique<TraceContext>(TraceEvent::TraceLevel::STANDARD)); |
426 | } |
427 | |
428 | // If device memory capacity is unset via command line, use 32MB by default. |
429 | deviceMemCapacity = |
430 | (int64_t)1024 * |
431 | ((deviceMemCapacityOpt != 0) ? deviceMemCapacityOpt : 1024 * 32); |
432 | |
433 | numDevices = numDevicesOpt; |
434 | } |
435 | |
  // Dump inputs into an ONNX file that the repro binary can run.
437 | void dumpInputs() { |
438 | std::stringstream ss; |
439 | ss << "input_0.onnx" ; |
440 | std::ofstream of(ss.str(), std::ios::binary); |
441 | auto *resultPHBindings = context_.getPlaceholderBindings(); |
442 | ONNX_NAMESPACE::GraphProto inputG; |
443 | for (auto &pair : resultPHBindings->pairs()) { |
444 | auto *t = inputG.add_initializer(); |
445 | auto *PH = pair.first; |
446 | const auto &resultTensor = pair.second; |
447 | ONNXModelWriter::writeTensor(resultTensor, t, |
448 | /*useGlowCustomOps*/ true); |
449 | t->set_name(PH->getName().str()); |
450 | } |
451 | std::string buffer; |
452 | inputG.SerializeToString(&buffer); |
453 | of << buffer; |
454 | } |
455 | |
  // Dump outputs into an ONNX file that the repro binary can run.
457 | void dumpOutputs() { |
458 | std::stringstream ss; |
459 | ss << "output_0.onnx" ; |
460 | std::ofstream of(ss.str(), std::ios::binary); |
461 | ONNX_NAMESPACE::GraphProto inputG; |
462 | auto *t = inputG.add_initializer(); |
463 | ONNXModelWriter::writeTensor(*resultTensor, t, |
464 | /*useGlowCustomOps*/ true); |
    t->set_name("save");
466 | std::string buffer; |
467 | inputG.SerializeToString(&buffer); |
468 | of << buffer; |
469 | } |
470 | |
471 | void TearDown() override { |
472 | if (dumpBinaryResults) { |
      ASSERT_TRUE(resultTensor) << "Could not dump result tensor, was nullptr";
      llvm::SmallString<64> path;
      auto tempFileRes =
          llvm::sys::fs::createTemporaryFile("result", "bin", path);
      if (tempFileRes.value() != 0) {
        FAIL() << "Failed to create temp file to write into.";
479 | } |
480 | std::cout |
481 | << "Dumping binary results of " |
482 | << ::testing::UnitTest::GetInstance()->current_test_info()->name() |
483 | << " to " << path.data() << std::endl; |
484 | TensorSerializationOptions opts; |
485 | opts.withType = false; |
486 | dumpTensorToBinaryFile(*resultTensor, path, opts); |
487 | } |
488 | |
489 | if (dumpModelInputs) { |
490 | dumpInputs(); |
491 | } |
492 | |
493 | resultTensor = nullptr; |
494 | bindings_->clear(); |
495 | |
496 | auto *traceContext = context_.getTraceContext(); |
497 | |
498 | if (traceContext) { |
499 | // If traceContext exists, that means trace data was collected and needs |
500 | // to be dumped to a file. |
501 | |
502 | // Get the test case and test names. They will be used to name the file. |
503 | const ::testing::TestInfo *const testInfo = |
504 | ::testing::UnitTest::GetInstance()->current_test_info(); |
505 | std::string testName(testInfo->name()); |
506 | std::string testCaseName(testInfo->test_case_name()); |
507 | |
508 | // Replace all '/' in the test case and test names with '-' to preclude |
509 | // errors related to directories not existing. |
510 | for (auto &c : testName) { |
511 | if (c == '/') { |
512 | c = '-'; |
513 | } |
514 | } |
515 | |
516 | for (auto &c : testCaseName) { |
517 | if (c == '/') { |
518 | c = '-'; |
519 | } |
520 | } |
521 | |
522 | auto traceFileName = |
523 | strFormat("%s/%s-%s.json" , traceDir.getValue().c_str(), |
524 | testName.c_str(), testCaseName.c_str()); |
525 | traceContext->dump(traceFileName); |
526 | } |
527 | } |
528 | |
529 | /// Creates a Multi-layer perceptron network consisting of start & end FCs |
530 | /// with \p intermediateLayers hidden layers. |
531 | /// * All weights and biases are random. |
532 | /// * All internal activations are RELU. |
533 | /// * Parent node \p N_ has output dimension \p inputDim. |
  /// * Hidden layers have dimensions taken from \p intDims.
535 | /// * Output layer has output dimension \p outputDim. |
536 | static NodeValue createMLP(Module &mod, Function *F_, Node *N_, |
537 | dim_t inputDim, llvm::ArrayRef<dim_t> intDims, |
538 | dim_t outputDim, dim_t intermediateLayers) { |
539 | assert(intermediateLayers > 0); |
540 | |
541 | const dim_t firstIntDim = intDims[0]; |
542 | |
543 | // Type object for the internal layers. |
544 | // Note: dimension argument is a placeholder and will get filled out by each |
545 | // createRandomizedConstant invocation. |
546 | auto internalType = mod.uniqueType(ElemKind::FloatTy, {1}); |
547 | |
548 | /// Initial |
    auto *initial_bias = createRandomizedConstant(
        mod, internalType, {firstIntDim}, "initial_bias");
    auto *initial_weight = createRandomizedConstant(
        mod, internalType, {inputDim, firstIntDim}, "initial_weight");

    FullyConnectedNode *initial_layer = F_->createFullyConnected(
        "dense", N_, initial_weight,
        initial_bias); // Output is size {MB, intermediate dim}
    NodeValue last = F_->createRELU("relu1", initial_layer);
558 | |
559 | /// Intermediate |
560 | for (unsigned i = 0; i < intermediateLayers; ++i) { |
561 | // The current intermediate dimension is based on the previous FC's |
562 | // result's trailing dimension. Thus we set the current FC's trailing |
563 | // weight dim equal to the next FC's intermediate dimension. |
564 | const dim_t intDim = intDims[i + 1]; |
      auto *intermediate_bias = createRandomizedConstant(
          mod, internalType, {intDim}, "intermediate_bias");
      auto *intermediate_weight = createRandomizedConstant(
          mod, internalType, {last.dims()[1], intDim}, "intermediate_weight");

      FullyConnectedNode *intermediate_layer = F_->createFullyConnected(
          "dense", last, intermediate_weight,
          intermediate_bias); // Output is size {MB, intDims[i + 1]}
      last = F_->createRELU("relu2", intermediate_layer);
574 | } |
575 | |
576 | /// End |
    auto *end_bias =
        createRandomizedConstant(mod, internalType, {outputDim}, "end_bias");
    auto *end_weight = createRandomizedConstant(
        mod, internalType, {last.dims()[1], outputDim}, "end_weight");

    FullyConnectedNode *end_layer = F_->createFullyConnected(
        "dense", last, end_weight, end_bias); // Output is size {MB, outputDim}

    auto *RN = F_->createRELU("relu3", end_layer);
586 | |
587 | return RN->getResult(); |
588 | } |
589 | |
590 | /// Creates a rowwise quantized Multi-layer perceptron network consisting of |
591 | /// start & end FCs with \p intermediateLayers hidden layers. |
592 | /// * All weights and biases are random. Weights are Int8Q (rowwise), biases |
593 | /// are Int32. |
594 | /// * All internal activations are RELU, however the final layer has no |
595 | /// activation attached. |
  /// * Parent node \p N_ has output dimension \p inputDim in float.
  /// * Hidden layers have dimensions taken from \p intDims in Int8Q
  ///   (rowwise).
599 | /// * Output layer has output dimension \p outputDim in float. |
600 | /// |
601 | /// Quantized MLPs use RowwiseQuantizedFullyConnected Nodes, which expect: |
602 | /// * weights to be Float32 and convert to Int8 fused rowwise quantized |
603 | /// Tensors internally |
604 | /// * Biases are Int32 quantized. |
605 | static NodeValue createQuantizedMLP(Module &mod, Function *F_, NodeValue N_, |
606 | dim_t inputDim, |
607 | llvm::ArrayRef<dim_t> intDims, |
608 | dim_t outputDim, |
609 | dim_t intermediateLayers) { |
610 | // Must have intermediate layers. |
611 | assert(intermediateLayers > 0); |
612 | |
613 | const dim_t minibatchSize = N_.dims()[0]; |
614 | const dim_t firstIntDim = intDims[0]; |
615 | |
616 | // Type objects for the internal types. |
617 | // Note: dimension argument is a placeholder and will get filled out by each |
618 | // createRandomizedConstant invocation. |
619 | auto internalTypeF = mod.uniqueType(ElemKind::FloatTy, {1}); |
620 | auto internalTypeQ = mod.uniqueType(ElemKind::Int8QTy, {1}, 1, 0); |
621 | auto internalBiasType = mod.uniqueType(ElemKind::Int32QTy, {1}, 1e-11, 0); |
622 | |
    auto *start = F_->createQuantize(
        "mlp_quant", N_, mod.uniqueTypeWithNewShape(internalTypeQ, N_.dims()));

    /// Initial.
    auto *initial_bias = createRandomizedConstant(
        mod, internalBiasType, {firstIntDim}, "initial_bias");
    auto *initial_weight = createRandomizedConstant(
        mod, internalTypeF, {inputDim, firstIntDim}, "initial_weight");

    // Output is size {MB, firstIntDim}
    quantization::Schema rowwiseQuantSchema = useSymmetricRowwiseQuantFC
                                                  ? quantization::Symmetric
                                                  : quantization::Asymmetric;
    Node *initial_layer = F_->createRowwiseQuantizedFullyConnected(
        "dense", start, initial_weight, initial_bias,
        mod.uniqueTypeWithNewShape(internalTypeQ, {minibatchSize, firstIntDim}),
        rowwiseQuantSchema,
        /* transposeWeight */ true);

    NodeValue last = F_->createRELU("initial_relu", initial_layer);
643 | |
644 | /// Intermediate |
645 | for (unsigned i = 0; i < intermediateLayers; ++i) { |
646 | // The current intermediate dimension is based on the previous FC's |
647 | // result's trailing dimension. Thus we set the current FC's trailing |
648 | // weight dim equal to the next FC's intermediate dimension. |
649 | const dim_t intDim = intDims[i + 1]; |
      auto *intermediate_bias = createRandomizedConstant(
          mod, internalBiasType, {intDim}, "intermediate_bias");
      auto *intermediate_weight = createRandomizedConstant(
          mod, internalTypeF, {last.dims()[1], intDim}, "intermediate_weight");

      Node *intermediate_layer = F_->createRowwiseQuantizedFullyConnected(
          "dense", last, intermediate_weight, intermediate_bias,
          mod.uniqueType(ElemKind::Int8QTy, {minibatchSize, intDim}, 1.0, 0),
          rowwiseQuantSchema,
          /* transposeWeight */ true); // Output is size {MB, intDims[i + 1]}
      last = F_->createRELU("intermediate_relu", intermediate_layer);
661 | } |
662 | |
663 | /// End |
    auto *end_bias = createRandomizedConstant(mod, internalBiasType,
                                              {outputDim}, "end_bias");
    auto *end_weight = createRandomizedConstant(
        mod, internalTypeF, {last.dims()[1], outputDim}, "end_weight");

    // Output is size {MB, outputDim}
    auto *end_layer = F_->createRowwiseQuantizedFullyConnected(
        "dense", last, end_weight, end_bias,
        mod.uniqueTypeWithNewShape(internalTypeQ, {minibatchSize, outputDim}),
        rowwiseQuantSchema,
        /* transposeWeight */ true);

    auto *RN = F_->createRELU("relu", end_layer);
    auto *DQN = F_->createDequantize("mlp_dequant", RN, ElemKind::FloatTy);
678 | |
679 | return DQN->getResult(); |
680 | } |
681 | |
  /// Creates a number of Sparse tables (FP32 or Int8Q), the Indices lookup and
  /// the SparseLengthsSum Node tying it together.
684 | void createSparseEmbeddings(Module &mod, PlaceholderBindings &bindings_, |
685 | Function *F_, TestDeferredWeightLoader &loader, |
686 | llvm::ArrayRef<Placeholder *> lengths, |
687 | llvm::ArrayRef<dim_t> embSizes, dim_t embDim, |
688 | std::vector<NodeValue> &embeddings) { |
689 | auto internalTypeF = mod.uniqueType(ElemKind::FloatTy, {1}); |
690 | |
691 | for (unsigned int i = 0; i < lengths.size(); i++) { |
692 | fillStableRandomIndex( |
693 | bindings_.allocate(lengths[i])->getHandle<int32_t>(), |
694 | randomSeedLengthsOpt, lengthsMin, lengthsMax); |
695 | |
696 | dim_t sum = |
697 | sumOfElements(bindings_.get(lengths[i])->getHandle<int32_t>()); |
698 | auto *indices = mod.createPlaceholder( |
699 | ElemKind::Int64ITy, {sum}, "indices" + std::to_string(i), false); |
700 | fillStableRandomIndex(bindings_.allocate(indices)->getHandle<int64_t>(), |
701 | randomSeedContentOpt, 0, embSizes[i]); |
702 | |
703 | // output is size {MB, embDim} |
704 | if (quantizeSLWSData) { |
705 | Storage *data; |
706 | if (!isInterpreter && enableStaticPlaceholder) { |
707 | Placeholder *ph = createFusedRowwiseQuantizedPlaceholder( |
708 | mod, {embSizes[i], embDim}, "data" + std::to_string(i), |
709 | useFP16SLWS); |
710 | |
711 | ph->setStatic(true); |
712 | auto *tensor = loader.addWeight(ph->getType()); |
          // Generate float source data spanning the uint8 range; it is fused
          // rowwise-quantized into rwqData below.
          auto fData = Tensor(ElemKind::FloatTy, {embSizes[i], embDim});
          fData.getHandle<float>().randomize(UINT8_MIN, UINT8_MAX,
                                             mod.getPRNG());
716 | loader.addName("data" + std::to_string(i)); |
717 | |
718 | bindings_.allocate(ph); |
719 | updateInputPlaceholders(bindings_, {ph}, {tensor}); |
720 | |
721 | data = ph; |
722 | |
723 | Tensor rwqData(ElemKind::UInt8FusedQTy, |
724 | {embSizes[i], embDim + 2 * (dim_t)sizeof(float)}, |
725 | data->getType()->getScale(), |
726 | data->getType()->getOffset()); |
727 | |
728 | quantization::tensorFusedRowwiseQuantization<float>(fData, rwqData); |
729 | tensor->assign(&rwqData); |
730 | } else { |
731 | data = createRandomFusedRowwiseQuantizedConstant( |
732 | mod, {embSizes[i], embDim}, "data" + std::to_string(i), |
733 | useFP16SLWS); |
734 | } |
735 | |
736 | embeddings[i] = F_->createFusedRowwiseQuantizedSparseLengthsSum( |
737 | "RQSLWS" + std::to_string(i), data, indices, lengths[i], |
738 | useFP16AccumSLWS); |
739 | // Convert back to Float if we used Float16 here. Optimizer will |
740 | // eliminate if necessary. |
741 | if (useFP16SLWS) { |
742 | embeddings[i] = F_->createConvertTo( |
743 | "convert_" + embeddings[i].getNode()->getName().str(), |
744 | embeddings[i], ElemKind::FloatTy); |
745 | } |
746 | } else { |
747 | Storage *data; |
748 | if (!isInterpreter && enableStaticPlaceholder) { |
749 | Placeholder *ph = |
750 | mod.createPlaceholder(ElemKind::FloatTy, {embSizes[i], embDim}, |
751 | "data" + std::to_string(i), false); |
752 | ph->setStatic(true); |
753 | auto *tensor = loader.addWeight(ph->getType()); |
754 | tensor->getHandle<float>().initXavier(tensor->getType().size() * 2, |
755 | mod.getPRNG()); |
756 | loader.addName("data" + std::to_string(i)); |
757 | |
758 | bindings_.allocate(ph); |
759 | updateInputPlaceholders(bindings_, {ph}, {tensor}); |
760 | data = ph; |
761 | } else { |
762 | data = createRandomizedConstant(mod, internalTypeF, |
763 | {embSizes[i], embDim}, |
764 | "data" + std::to_string(i)); |
765 | } |
766 | |
767 | embeddings[i] = F_->createSparseLengthsSum("sls" + std::to_string(i), |
768 | data, indices, lengths[i]); |
769 | } |
770 | } |
771 | } |
772 | |
  /// Creates a number of Sparse tables (FP32 or Int8Q), the Indices lookup and
  /// the SparseLengthsSum Node tying it together.
775 | /// TODO: we need to quantize the data tensors for deferred weight loading. |
776 | void createSparseWeightedGatherEmbeddings( |
777 | Module &mod, PlaceholderBindings &bindings_, Function *F_, |
778 | TestDeferredWeightLoader &loader, llvm::ArrayRef<Placeholder *> lengths, |
779 | llvm::ArrayRef<dim_t> tableSizes, dim_t embeddingDim, |
780 | std::vector<NodeValue> &embeddings, uint32_t weightsSize = 1000) { |
781 | for (size_t i = 0; i < lengths.size(); i++) { |
782 | fillStableRandomIndex( |
783 | bindings_.allocate(lengths[i])->getHandle<int32_t>(), |
784 | randomSeedLengthsOpt, lengthsMin, lengthsMax); |
785 | |
786 | dim_t sum = |
787 | sumOfElements(bindings_.get(lengths[i])->getHandle<int32_t>()); |
788 | auto *indices = mod.createPlaceholder( |
789 | ElemKind::Int64ITy, {sum}, "indices" + std::to_string(i), false); |
790 | fillStableRandomIndex(bindings_.allocate(indices)->getHandle<int64_t>(), |
791 | randomSeedContentOpt, 0, tableSizes[i]); |
792 | |
      // TODO: Should be able to pass weights in; currently just a randomized
      // constant.
795 | Constant *weightsConst = createRandomizedConstant( |
796 | mod, mod.uniqueType(ElemKind::FloatTy, {weightsSize}), {weightsSize}, |
797 | "weights" + std::to_string(i)); |
798 | |
799 | auto *weightIndices = |
800 | mod.createPlaceholder(ElemKind::Int32ITy, {sum}, |
801 | "weight_indices" + std::to_string(i), false); |
802 | fillStableRandomIndex( |
803 | bindings_.allocate(weightIndices)->getHandle<int32_t>(), |
804 | randomSeedContentOpt, 0, weightsSize - 1); |
805 | |
806 | auto *weights = F_->createGather("weight_gather" + std::to_string(i), |
807 | weightsConst, weightIndices, 0); |
808 | |
809 | // output is size {MB, embeddingDim_} |
810 | if (quantizeSLWSData) { |
811 | Storage *data; |
812 | if (!isInterpreter && enableStaticPlaceholder) { |
813 | Placeholder *ph = createFusedRowwiseQuantizedPlaceholder( |
814 | mod, {tableSizes[i], embeddingDim}, "data" + std::to_string(i), |
815 | useFP16SLWS); |
816 | ph->setStatic(true); |
817 | auto *tensor = loader.addWeight(ph->getType()); |
818 | tensor->getHandle<uint8_t>().randomize(UINT8_MIN, UINT8_MAX, |
819 | mod.getPRNG()); |
820 | |
821 | loader.addName("data" + std::to_string(i)); |
822 | |
823 | bindings_.allocate(ph); |
824 | updateInputPlaceholders(bindings_, {ph}, {tensor}); |
825 | |
826 | data = ph; |
827 | } else { |
828 | data = createRandomFusedRowwiseQuantizedConstant( |
829 | mod, {tableSizes[i], embeddingDim}, "data" + std::to_string(i), |
830 | useFP16SLWS); |
831 | } |
832 | |
833 | embeddings[i] = F_->createFusedRowwiseQuantizedSparseLengthsWeightedSum( |
834 | "RQSLWS" + std::to_string(i), data, weights, indices, lengths[i], |
835 | useFP16AccumSLWS); |
836 | // Convert back to Float if we used Float16 here. Optimizer will |
837 | // eliminate if necessary. |
838 | if (useFP16SLWS) { |
839 | embeddings[i] = F_->createConvertTo( |
840 | "convert_" + embeddings[i].getNode()->getName().str(), |
841 | embeddings[i], ElemKind::FloatTy); |
842 | } |
843 | } else { |
844 | Storage *data; |
845 | if (!isInterpreter && enableStaticPlaceholder) { |
846 | Placeholder *ph = mod.createPlaceholder( |
847 | ElemKind::FloatTy, {tableSizes[i], embeddingDim}, |
848 | "data" + std::to_string(i), false); |
849 | ph->setStatic(true); |
850 | auto *tensor = loader.addWeight(ph->getType()); |
851 | tensor->getHandle<float>().initXavier(tensor->getType().size() * 2, |
852 | mod.getPRNG()); |
853 | loader.addName("data" + std::to_string(i)); |
854 | |
855 | bindings_.allocate(ph); |
856 | updateInputPlaceholders(bindings_, {ph}, {tensor}); |
857 | data = ph; |
858 | } else { |
859 | data = createRandomizedConstant( |
860 | mod, |
861 | mod.uniqueType(ElemKind::FloatTy, {tableSizes[i], embeddingDim}), |
862 | {tableSizes[i], embeddingDim}, "data" + std::to_string(i)); |
863 | } |
864 | |
865 | embeddings[i] = F_->createSparseLengthsWeightedSum( |
866 | "slws" + std::to_string(i), data, weights, indices, lengths[i]); |
867 | } |
868 | } |
869 | } |
870 | |
871 | /// Builds a simple graph, \returns the Tensor output of the graph. |
872 | Tensor *createSimpleRecSysGraph(Module &mod, PlaceholderBindings &bindings, |
873 | Function *F, TestDeferredWeightLoader &loader, |
874 | llvm::ArrayRef<dim_t> embSizes, |
875 | dim_t embDim) { |
876 | EXPECT_EQ(tableSizes.size(), embSizes.size()); |
877 | |
878 | // Create the tables. |
879 | std::vector<Placeholder *> lengths(tableSizes.size()); |
880 | for (unsigned int i = 0; i < lengths.size(); i++) { |
881 | lengths[i] = mod.createPlaceholder(ElemKind::Int32ITy, {miniBatch}, |
882 | "SL" + std::to_string(i), false); |
883 | } |
884 | |
    auto *denseData = mod.createPlaceholder(ElemKind::FloatTy,
                                            {miniBatch, denseDim}, "denseData",
                                            false); // denseDim can be anything
888 | |
889 | // First Dense embedding |
890 | fillStableRandomData(bindings.allocate(denseData)->getHandle(), |
891 | randomSeedContentOpt, 0.001); |
892 | NodeValue bottomMLP; |
893 | if (quantizeFC) { |
894 | bottomMLP = createQuantizedMLP(mod, F, denseData, denseData->dims()[1], |
895 | bottomMLPIntermediateDims, embDim, |
896 | numHiddenBottomMLPLayersOpt); |
897 | } else { |
898 | bottomMLP = createMLP(mod, F, denseData, denseData->dims()[1], |
899 | bottomMLPIntermediateDims, embDim, |
900 | numHiddenBottomMLPLayersOpt); |
901 | } |
902 | |
903 | // Sparse Embeddings |
904 | std::vector<NodeValue> embeddings(lengths.size()); |
905 | if (gatherWeights) { |
906 | createSparseWeightedGatherEmbeddings(mod, bindings, F, loader, lengths, |
907 | embSizes, embDim, embeddings); |
908 | } else { |
909 | createSparseEmbeddings(mod, bindings, F, loader, lengths, embSizes, |
910 | embDim, embeddings); |
911 | } |
912 | |
913 | // Interacting sparse and dense |
914 | embeddings.push_back(bottomMLP); |
915 | std::cout << "Number of embeddings concatenated: " << embeddings.size() |
916 | << std::endl; |
    auto *CN = F->createConcat("concat", embeddings,
                               1); // Output is size {MB, embDim*n}
    auto *reshaped =
        F->createReshape("reshape", CN,
                         {bottomMLP.dims()[0], (dim_t)embeddings.size(),
                          embDim}); // {MB, n, embDim}
    auto *transposed =
        F->createTranspose("transpose", reshaped, {0, 2, 1}); // {MB, embDim, n}
    auto *dot = F->createBatchMatMul("dot_products", reshaped,
                                     transposed); // {MB, n, n}
    auto *reshapeDot = F->createReshape(
        "reshapeDot", dot,
        {bottomMLP.dims()[0],
         (dim_t)(embeddings.size() * embeddings.size())}); // {MB, n^2}
    NodeValue interact = F->createConcat("interact", {reshapeDot, bottomMLP},
932 | 1); // {MB, n^2 + embDim} |
933 | |
934 | // MLP at the top |
935 | Node *topMLP; |
936 | if (quantizeFC) { |
937 | topMLP = createQuantizedMLP(mod, F, interact, interact.dims()[1], |
938 | topMLPIntermediateDims, |
939 | /* outputDim */ 1, numHiddenTopMLPLayersOpt); |
940 | } else { |
941 | topMLP = createMLP(mod, F, interact, interact.dims()[1], |
942 | topMLPIntermediateDims, |
943 | /* outputDim */ 1, numHiddenTopMLPLayersOpt); |
944 | } |
945 | |
946 | // Output |
    auto *save = F->createSave("save", topMLP);
948 | |
949 | return bindings.allocate(save->getPlaceholder()); |
950 | } |
951 | |
  /// Set up the precision configuration. This is used for all compilations
  /// whose results are compared against others (Interpreter/partitioned).
954 | void setupPrecisionConfig() { |
955 | if (convertToFP16) { |
956 | precConfig_.convertToFP16 = convertToFP16; |
957 | precConfig_.convertFusedToFP16 = convertFusedToFP16; |
958 | precConfig_.convert4BitFusedToFP32 = convert4or8BitFusedToFP32; |
959 | precConfig_.convert8BitFusedToFP32 = convert4or8BitFusedToFP32; |
      // Note: we never convert RWQ-SLWS here; the creator for
      // precisionForNonDataSLWS already built the node with the correct
      // precision.
963 | precConfig_.precisionModeKindSet.insert( |
964 | Kinded::Kind::FusedRowwiseQuantizedSparseLengthsWeightedSumNodeKind); |
965 | precConfig_.precisionModeKindSet.insert( |
966 | Kinded::Kind::RowwiseQuantizedFullyConnectedNodeKind); |
967 | } |
968 | if (fuseScaleOffsetFp32Opt) { |
969 | precConfig_.convert4BitFusedToFP32 = fuseScaleOffsetFp32Opt; |
970 | precConfig_.convert8BitFusedToFP32 = fuseScaleOffsetFp32Opt; |
971 | } |
972 | } |
973 | |
974 | /// Set up the precision configuration for Interpreter. |
975 | void setupPrecisionConfigforInterpreter() { |
976 | if (convertToFP16) { |
977 | precConfigForInterpreter_.convertToFP16 = convertToFP16; |
978 | precConfigForInterpreter_.convertFusedToFP16 = convertToFP16; |
      // Note: we never convert RWQ-SLWS here; the creator for
      // precisionForNonDataSLWS already built the node with the correct
      // precision.
982 | precConfigForInterpreter_.precisionModeKindSet.insert( |
983 | Kinded::Kind::FusedRowwiseQuantizedSparseLengthsWeightedSumNodeKind); |
984 | precConfigForInterpreter_.precisionModeKindSet.insert( |
985 | Kinded::Kind::RowwiseQuantizedFullyConnectedNodeKind); |
986 | } |
987 | if (fuseScaleOffsetFp32Opt) { |
988 | precConfigForInterpreter_.convert4BitFusedToFP32 = fuseScaleOffsetFp32Opt; |
989 | precConfigForInterpreter_.convert8BitFusedToFP32 = fuseScaleOffsetFp32Opt; |
990 | } |
991 | } |
992 | |
993 | void printPerfSummary(std::vector<double> times) { |
994 | std::cout << "_,benchName,concurrent-count,runtime,QPS" << std::endl; |
995 | for (auto t : times) { |
996 | auto qps = miniBatchOpt / t * concurrentReqestsOpt; |
997 | std::cout << "BenchResult,RecommendationSystemTest," |
998 | << (unsigned)concurrentReqestsOpt << "," |
999 | << t / concurrentReqestsOpt << "," << qps << std::endl; |
1000 | } |
1001 | double min = *(std::min_element(times.begin(), times.end())); |
1002 | dim_t midElt = times.size() / 2; |
1003 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
1004 | double median = times[midElt]; |
1005 | double medianRuntime = median / ((double)concurrentReqestsOpt); |
1006 | double minRuntime = min / ((double)concurrentReqestsOpt); |
1007 | std::cout << "_,benchName,reps,concurrent-count,medianRuntime,minRuntime," |
1008 | "medianQPS,maxQPS" |
1009 | << std::endl; |
1010 | std::cout << "BenchSummary,RecommendationSystemTest," << (unsigned)repsOpt |
1011 | << "," << (unsigned)concurrentReqestsOpt << "," << medianRuntime |
1012 | << "," << minRuntime << "," << miniBatchOpt / medianRuntime << "," |
1013 | << miniBatchOpt / minRuntime << std::endl; |
1014 | } |
1015 | |
1016 | void testRecSys(bool checkConcat = false) { |
    assert((!useFP16AccumSLWS || useFP16SLWS) &&
           "Can only use FP16 accumulation when using FP16 precision.");
1019 | isInterpreter = false; |
1020 | setupPrecisionConfig(); |
1021 | |
1022 | // Generate the network. |
1023 | std::unique_ptr<Module> mod(new Module); |
1024 | TestDeferredWeightLoader loader; |
1025 | |
    F_ = mod->createFunction("main");
1027 | resultTensor = createSimpleRecSysGraph(*mod.get(), *bindings_, F_, loader, |
1028 | tableSizes, embeddingDim); |
1029 | |
1030 | Placeholder *concatPH = nullptr; |
1031 | if (checkConcat) { |
1032 | // Add an observer node after concat. |
      auto *CN = F_->getNodeByName("concat");
      auto *saveConcat = F_->createSave("after_concat_data", CN);
1035 | concatPH = saveConcat->getPlaceholder(); |
1036 | } |
1037 | if (dumpModelInputs) { |
      // Dump the model into a zip file that the repro binary can run.
1039 | glow::onnxifi::saveOnnxifiModel(F_); |
1040 | } |
1041 | auto configs = |
1042 | runtime::generateDeviceConfigs(1, getBackendName(), MAX_MEMORY); |
1043 | std::unique_ptr<HostManager> hostManager( |
1044 | new HostManager(std::move(configs))); |
1045 | |
1046 | DeferredLoader()->registerLoader(&loader); |
1047 | |
1048 | CompilationContext cctx; |
1049 | if (enableStaticPlaceholder) { |
1050 | cctx.optimizationOpts.foldStaticPlaceholderConversions = true; |
1051 | } |
1052 | cctx.precisionConfig = precConfig_; |
1053 | cctx.deferredWeightLoader = &loader; |
1054 | cctx.dumpFinalGraph = dumpFinalGraph; |
1055 | EXIT_ON_ERR(hostManager->addNetwork(std::move(mod), cctx)); |
1056 | |
1057 | // Run graph |
1058 | std::vector<double> times(repsOpt); |
1059 | for (size_t i = 0; i < repsOpt; i++) { |
1060 | auto start = std::chrono::high_resolution_clock::now(); |
1061 | dispatchInference("main" , hostManager.get(), context_, |
1062 | concurrentReqestsOpt); |
1063 | auto end = std::chrono::high_resolution_clock::now(); |
1064 | auto duration = std::chrono::duration<double>(end - start).count(); |
1065 | times[i] = duration; |
1066 | } |
1067 | |
1068 | printPerfSummary(times); |
1069 | |
1070 | // NaNs are a sign of something gone wrong. Always verify there aren't any |
1071 | // in the result. |
1072 | auto resultTensorH = resultTensor->getHandle(); |
1073 | for (size_t i = 0, e = resultTensorH.size(); i < e; i++) { |
1074 | EXPECT_FALSE(std::isnan(resultTensorH.raw(i))); |
1075 | } |
1076 | |
1077 | if (checkConcat) { |
1078 | // Get result and verify. |
1079 | EXPECT_EQ(resultTensor->size(), miniBatch); |
1080 | |
1081 | auto *concatT = bindings_->get(concatPH); |
1082 | auto concatH = concatT->getHandle(); |
1083 | // Check that intermediate concat results didn't overflow. |
1084 | std::cout << "Intermediate concats" << std::endl; |
1085 | concatH.dump(); |
1086 | for (int i = 0, e = concatH.size(); i < e; ++i) { |
1087 | EXPECT_LE(fabs(concatH.raw(i)), 100); |
1088 | } |
1089 | |
1090 | std::cout << "Result of prediction" << std::endl; |
1091 | std::cout << resultTensorH.size() << std::endl; |
1092 | resultTensorH.dump(); |
1093 | for (int i = 0, e = resultTensorH.size(); i < e; ++i) { |
1094 | EXPECT_GE(resultTensorH.raw(i), 0.0); |
1095 | } |
1096 | } |
1097 | |
1098 | if (dumpModelInputs) { |
1099 | dumpOutputs(); |
1100 | } |
1101 | |
1102 | // Undeploy the network. |
    CHECK(!ERR_TO_BOOL(hostManager->removeNetwork("main")))
        << "Could not remove the network";
1105 | // Free memory. |
1106 | hostManager.reset(); |
1107 | mod.reset(); |
1108 | |
1109 | // Compare against interpreter if we're not executing already on it. |
    if (!skipCorrectnessCheck && getBackendName() != "Interpreter") {
1111 | compareAgainstInterpreter(); |
1112 | } else { |
1113 | std::cout << "Skip correctness check with Interpreter backend" |
1114 | << std::endl; |
1115 | } |
1116 | } |
1117 | |
1118 | /// Run on the Interpreter and compare the result to previous result. |
1119 | void compareAgainstInterpreter() { |
1120 | isInterpreter = true; |
1121 | setupPrecisionConfigforInterpreter(); |
1122 | |
1123 | ExecutionContext contextI; |
1124 | // Create a new module for the interpreter run. |
1125 | std::unique_ptr<Module> modI(new Module); |
1126 | TestDeferredWeightLoader loaderI; |
    auto *IF = modI->createFunction("main");
1128 | PlaceholderBindings *bindingsI = contextI.getPlaceholderBindings(); |
1129 | Tensor *resultIT = createSimpleRecSysGraph(*modI, *bindingsI, IF, loaderI, |
1130 | tableSizes, embeddingDim); |
1131 | bindingsI->allocate(modI->getPlaceholders()); |
1132 | |
1133 | // Set device memory to 64GB to prevent partitioning. We are using the |
1134 | // Interpreter's result just as a reference result to compare against. |
    auto configs = generateDeviceConfigs(1, "Interpreter", MAX_MEMORY);
1136 | std::unique_ptr<HostManager> hostManager( |
1137 | new HostManager(std::move(configs))); |
1138 | |
1139 | DeferredLoader()->registerLoader(&loaderI); |
1140 | |
1141 | // Use the same precision transformation for compilation. |
1142 | CompilationContext cctx; |
1143 | cctx.precisionConfig = precConfigForInterpreter_; |
1144 | cctx.deferredWeightLoader = &loaderI; |
1145 | EXIT_ON_ERR(hostManager->addNetwork(std::move(modI), cctx)); |
1146 | dispatchInference("main" , hostManager.get(), contextI, |
1147 | concurrentReqestsOpt); |
1148 | |
    assert(resultTensor && "Must run and set resultTensor before comparing "
                           "against the interpreter.");
1151 | EXPECT_TRUE(resultIT->isEqual(*resultTensor, 0.005)); |
1152 | } |
1153 | |
1154 | /// Create partitions to run and compare results. |
1155 | void testPartitionedRecSys(size_t numDevices, size_t memSize, |
1156 | ExecutionContext &context) { |
1157 | isInterpreter = false; |
1158 | // Result tensors are reused below, so create a local copy. |
1159 | Tensor referenceResultT = resultTensor->clone(); |
1160 | // Generate configs and create a new HostManager for testing partitioning. |
1161 | auto configs = generateDeviceConfigs(numDevices, getBackendName(), memSize); |
1162 | std::unique_ptr<HostManager> hostManager( |
1163 | new HostManager(std::move(configs))); |
1164 | |
1165 | // Create a new module and placeholderBindings to run on the partitioning |
1166 | // HostManager. |
1167 | PlaceholderBindings bindingsP; |
1168 | std::unique_ptr<Module> modP(new Module); |
1169 | TestDeferredWeightLoader loaderP; |
    // Since the HostManager consumes the unique_ptr, grab a raw pointer to the
    // module first so we can verify partitioning.
1172 | Module *rawModule = modP.get(); |
    auto *funcP = modP->createFunction("main");
1174 | createSimpleRecSysGraph(*modP, bindingsP, funcP, loaderP, tableSizes, |
1175 | embeddingDim); |
1176 | |
    assert(memSize > 0 && "Must set partitionerPerDeviceMemCapacity > 0.");
    assert(numDevices > 0 && "Must set partitionerNumDevices > 0.");
    std::cout << numDevices << " devices of size " << memSize << "\n";
1180 | |
1181 | DeferredLoader()->registerLoader(&loaderP); |
1182 | |
1183 | // Use the same precision transformation for compilation. |
1184 | CompilationContext cctx; |
1185 | if (enableStaticPlaceholder) { |
1186 | cctx.optimizationOpts.foldStaticPlaceholderConversions = true; |
1187 | } |
1188 | cctx.precisionConfig = precConfig_; |
1189 | cctx.deferredWeightLoader = &loaderP; |
1190 | cctx.optimizationOpts.useSparseNNPartitioningScheme = |
1191 | useSparseNNPartitioning; |
1192 | cctx.optimizationOpts.sparseNNPartitioningAddSLSConcats = |
1193 | sparseNNPartitioningAddSLSConcats; |
1194 | cctx.optimizationOpts.sparseNNPartitioningSchemeNumCards = |
1195 | sparseNNPartitioningNumCards; |
1196 | cctx.optimizationOpts.sparseNNPartitioningSchemeSLSTableKBytesPerCard = |
1197 | sparseNNPartitioningSLSKbytes; |
1198 | cctx.optimizationOpts.sparseNNPartitioningSchemeNumCoresSLS = |
1199 | sparseNNPartitioningNumCoresSLS; |
1200 | cctx.optimizationOpts.sparseNNPartitioningSchemeNumCoresOther = |
1201 | sparseNNPartitioningNumCoresOther; |
1202 | cctx.dumpFinalGraph = dumpFinalGraph; |
1203 | cctx.saturateHost = saturateHost; |
1204 | EXIT_ON_ERR(hostManager->addNetwork(std::move(modP), cctx)); |
1205 | std::cout << "Partitions = " << rawModule->getFunctions().size() |
1206 | << std::endl; |
1207 | |
1208 | // Run the partitioned graph and compare the results. |
1209 | auto &bindings = *context.getPlaceholderBindings(); |
1210 | bindings.clear(); |
1211 | bindings.allocate(rawModule->getPlaceholders()); |
1212 | bindingsP.allocate(rawModule->getPlaceholders()); |
1213 | for (const auto &PH : bindingsP.pairs()) { |
1214 | bindingsP.copyToTarget(PH.first->getName(), bindings); |
1215 | } |
1216 | |
1217 | dispatchInference("main" , hostManager.get(), context, concurrentReqestsOpt); |
1218 | |
1219 | Tensor *resultTensorP = |
        bindings.get(bindings.getPlaceholderByNameSlow("save"));
1221 | if (enableStaticPlaceholder) { |
1222 | EXPECT_TRUE(referenceResultT.isEqual(*resultTensorP, 0.005)); |
1223 | } else { |
1224 | EXPECT_TRUE(referenceResultT.isEqual(*resultTensorP)); |
1225 | } |
1226 | } |
1227 | |
1228 | /// Test SparseLengthsSum independently. |
1229 | void testSLSQuant() { |
1230 | isInterpreter = false; |
1231 | std::unique_ptr<Module> mod(new Module); |
1232 | TestDeferredWeightLoader loader; |
    F_ = mod->createFunction("main");
1234 | std::vector<Placeholder *> sparseLengths(1); |
    sparseLengths[0] =
        mod->createPlaceholder(ElemKind::Int32ITy, {miniBatch}, "SL0", false);
1237 | |
1238 | std::vector<NodeValue> embeddings(sparseLengths.size()); |
1239 | createSparseEmbeddings(*mod.get(), *bindings_, F_, loader, sparseLengths, |
1240 | tableSizes, embeddingDim, embeddings); |
1241 | |
    auto *save = F_->createSave("save", embeddings[0]);
1243 | Tensor *resultTensorLocal = bindings_->allocate(save->getPlaceholder()); |
1244 | |
1245 | DeferredLoader()->registerLoader(&loader); |
1246 | |
1247 | // Use the same precision transformation for compilation. |
1248 | CompilationContext cctx; |
1249 | if (enableStaticPlaceholder) { |
1250 | cctx.optimizationOpts.foldStaticPlaceholderConversions = true; |
1251 | } |
1252 | cctx.precisionConfig = precConfig_; |
1253 | cctx.deferredWeightLoader = &loader; |
1254 | auto configs = generateDeviceConfigs(1, getBackendName(), MAX_MEMORY); |
1255 | std::unique_ptr<HostManager> hostManager( |
1256 | new HostManager(std::move(configs))); |
1257 | EXIT_ON_ERR(hostManager->addNetwork(std::move(mod), cctx)); |
1258 | |
1259 | // Run graph. |
1260 | dispatchInference("main" , hostManager.get(), context_, |
1261 | concurrentReqestsOpt); |
1262 | |
1263 | // TODO: for now we only check the output dimension, contents are ignored |
1264 | EXPECT_EQ(resultTensorLocal->size(), miniBatch * embeddingDim); |
1265 | resultTensorLocal->getHandle().dump(); |
1266 | } |
1267 | }; |
1268 | |
1269 | /// Standard Tests |
/// These tests have the following options:
1271 | /// * quantizeSLWSData enables Int8 Fused Rowwise Quantization for the Sparse |
1272 | /// Embeddings (Int8 quantized values with float scale and offset). |
1273 | /// * quantizeFC enables Int8 Fused Rowwise Quantization for FC weights and |
1274 | /// activations inside the MLPs. |
1275 | /// * convertToFP16 walks the graph at the end of constructing the graph and |
1276 | /// converts all FP32 nodes & tensors to FP16, meaning the graph will use |
1277 | /// FP16 for internal weights, biases and activations (when not already Int8 |
1278 | /// quantized). Inputs and outputs are still FP32 but are immediately |
1279 | /// dropped to FP16 precision at the beginning of the graph. |
1280 | /// * useFP16SLWS represents whether to use Float16 for non-data |
1281 | /// inputs/outputs for SLWS and SLS Nodes, and for data per-row scale and |
1282 | /// offset. |
/// * useFP16AccumSLWS represents whether to use Float16 accumulation for SLWS
///   and SLS Nodes. Note this should only be used if useFP16SLWS is enabled.
1285 | |
1286 | /// Everything in FP32. |
1287 | TEST_P(RecommendationSystemTest, RecSys_FP32) { |
1288 | CHECK_IF_ENABLED(); |
1289 | |
1290 | quantizeSLWSData = false; |
1291 | useFP16SLWS = false; |
1292 | useFP16AccumSLWS = false; |
1293 | quantizeFC = false; |
1294 | convertToFP16 = false; |
1295 | |
1296 | testRecSys(); |
1297 | } |
1298 | |
1299 | // RecSys_FP32 with deferred weight loading. |
1300 | TEST_P(RecommendationSystemTest, RecSys_FP32_Deferred) { |
1301 | CHECK_IF_ENABLED(); |
1302 | |
1303 | quantizeSLWSData = false; |
1304 | useFP16SLWS = false; |
1305 | useFP16AccumSLWS = false; |
1306 | quantizeFC = false; |
1307 | convertToFP16 = true; |
1308 | enableStaticPlaceholder = true; |
1309 | convertFusedToFP16 = false; |
1310 | convert4or8BitFusedToFP32 = true; |
1311 | |
1312 | testRecSys(); |
1313 | } |
1314 | |
1315 | /// Rowwise quantize the SLWS and FC; everything else in FP32. |
1316 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS_FC) { |
1317 | CHECK_IF_ENABLED(); |
1318 | |
1319 | quantizeSLWSData = true; |
1320 | useFP16SLWS = false; |
1321 | useFP16AccumSLWS = false; |
1322 | quantizeFC = true; |
1323 | convertToFP16 = false; |
1324 | |
1325 | testRecSys(); |
1326 | } |
1327 | |
1328 | // RecSys_RWQuantized_SLWS_FC with deferred weight loading. |
1329 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS_FC_Deferred) { |
1330 | CHECK_IF_ENABLED(); |
1331 | |
1332 | quantizeSLWSData = true; |
1333 | useFP16SLWS = false; |
1334 | useFP16AccumSLWS = false; |
1335 | quantizeFC = true; |
1336 | |
1337 | enableStaticPlaceholder = true; |
1338 | convertToFP16 = true; |
1339 | convertFusedToFP16 = false; |
1340 | convert4or8BitFusedToFP32 = true; |
1341 | |
1342 | testRecSys(); |
1343 | } |
1344 | |
1345 | /// Rowwise quantize the SLWS; everything else in FP32. |
1346 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS) { |
1347 | CHECK_IF_ENABLED(); |
1348 | |
1349 | quantizeSLWSData = true; |
1350 | useFP16SLWS = false; |
1351 | useFP16AccumSLWS = false; |
1352 | quantizeFC = false; |
1353 | convertToFP16 = false; |
1354 | |
1355 | testRecSys(); |
1356 | } |
1357 | |
1358 | // RecSys_RWQuantized_SLWS with deferred weight loading. |
1359 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS_Deferred) { |
1360 | CHECK_IF_ENABLED(); |
1361 | |
1362 | quantizeSLWSData = true; |
1363 | useFP16SLWS = false; |
1364 | useFP16AccumSLWS = false; |
1365 | quantizeFC = false; |
1366 | |
1367 | enableStaticPlaceholder = true; |
1368 | convertToFP16 = true; |
1369 | convertFusedToFP16 = false; |
1370 | convert4or8BitFusedToFP32 = true; |
1371 | |
1372 | testRecSys(); |
1373 | } |
1374 | |
1375 | /// Rowwise quantize the SLWS and FC; everything else in FP16. |
1376 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS_FC_FP16) { |
1377 | CHECK_IF_ENABLED(); |
1378 | |
1379 | quantizeSLWSData = true; |
1380 | useFP16SLWS = false; |
1381 | useFP16AccumSLWS = false; |
1382 | quantizeFC = true; |
1383 | convertToFP16 = true; |
1384 | convertFusedToFP16 = true; |
1385 | |
1386 | testRecSys(); |
1387 | } |
1388 | |
1389 | /// Rowwise quantize the SLWS; everything else in FP16. |
1390 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS_FP16) { |
1391 | CHECK_IF_ENABLED(); |
1392 | |
1393 | quantizeSLWSData = true; |
1394 | useFP16SLWS = false; |
1395 | useFP16AccumSLWS = false; |
1396 | quantizeFC = false; |
1397 | convertToFP16 = true; |
1398 | convertFusedToFP16 = true; |
1399 | |
1400 | testRecSys(); |
1401 | } |
1402 | |
1403 | // RecSys_RWQuantized_SLWS_FP16 with deferred weight loading. |
1404 | TEST_P(RecommendationSystemTest, RecSys_RWQuantized_SLWS_FP16_Deferred) { |
1405 | CHECK_IF_ENABLED(); |
1406 | |
1407 | quantizeSLWSData = true; |
1408 | useFP16SLWS = false; |
1409 | useFP16AccumSLWS = false; |
1410 | quantizeFC = false; |
1411 | |
1412 | enableStaticPlaceholder = true; |
1413 | convertToFP16 = true; |
1414 | convertFusedToFP16 = false; |
1415 | convert4or8BitFusedToFP32 = true; |
1416 | |
1417 | testRecSys(); |
1418 | } |
1419 | |
1420 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1421 | /// inputs/outputs in FP16. Everything else in FP32. |
1422 | TEST_P(RecommendationSystemTest, RecSys_RWQuantizedFP16_SLWS) { |
1423 | CHECK_IF_ENABLED(); |
1424 | |
1425 | quantizeSLWSData = true; |
1426 | useFP16SLWS = true; |
1427 | useFP16AccumSLWS = false; |
1428 | quantizeFC = false; |
1429 | convertToFP16 = false; |
1430 | |
1431 | testRecSys(); |
1432 | } |
1433 | |
1434 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1435 | /// inputs/outputs in FP16, and use FP16 accumulation. Everything else in FP32. |
1436 | TEST_P(RecommendationSystemTest, RecSys_RWQuantizedFP16AccumFP16_SLWS) { |
1437 | CHECK_IF_ENABLED(); |
1438 | |
1439 | quantizeSLWSData = true; |
1440 | useFP16SLWS = true; |
1441 | useFP16AccumSLWS = true; |
1442 | quantizeFC = false; |
1443 | convertToFP16 = false; |
1444 | |
1445 | testRecSys(); |
1446 | } |
1447 | |
1448 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1449 | /// inputs/outputs in FP16. Everything else in FP16. |
1450 | TEST_P(RecommendationSystemTest, RecSys_RWQuantizedFP16_SLWS_FP16) { |
1451 | CHECK_IF_ENABLED(); |
1452 | |
1453 | quantizeSLWSData = true; |
1454 | useFP16SLWS = true; |
1455 | useFP16AccumSLWS = false; |
1456 | quantizeFC = false; |
1457 | convertToFP16 = true; |
1458 | convertFusedToFP16 = true; |
1459 | |
1460 | testRecSys(); |
1461 | } |
1462 | |
1463 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1464 | /// inputs/outputs in FP16, and use FP16 accumulation. Everything else in FP16. |
1465 | TEST_P(RecommendationSystemTest, RecSys_RWQuantizedFP16AccumFP16_SLWS_FP16) { |
1466 | CHECK_IF_ENABLED(); |
1467 | |
1468 | quantizeSLWSData = true; |
1469 | useFP16SLWS = true; |
1470 | useFP16AccumSLWS = true; |
1471 | quantizeFC = false; |
1472 | convertToFP16 = true; |
1473 | convertFusedToFP16 = true; |
1474 | |
1475 | testRecSys(); |
1476 | } |
1477 | |
/// Partitioning Tests
/// These tests have the same options as the above, but additionally partition
/// the created graph into segments and walk the resulting DAG. Each test then
/// compares the outputs of the partitioned and unpartitioned runs. A sketch
/// of the underlying partitioning call follows.
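///
/// A minimal sketch of the kind of partitioning call exercised here,
/// illustrative only: it assumes `module` is the Module under test, and uses
/// the Partitioner and DeviceInfo APIs directly rather than the exact helper
/// invoked by testPartitionedRecSys().
/// \code
///   std::vector<DeviceInfo> devices(numDevices);
///   for (auto &dev : devices) {
///     dev.availableMemory = deviceMemCapacity;
///   }
///   Partitioner partitioner(&module, devices);
///   CompilationContext cctx;
///   auto dagList = EXIT_ON_ERR(partitioner.partition(cctx));
/// \endcode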
1482 | |
1483 | TEST_P(RecommendationSystemTest, RecSys_FP32_Partitioned) { |
1484 | CHECK_IF_ENABLED(); |
1485 | |
1486 | quantizeSLWSData = false; |
1487 | useFP16SLWS = false; |
1488 | useFP16AccumSLWS = false; |
1489 | quantizeFC = false; |
1490 | convertToFP16 = false; |
1491 | |
1492 | testRecSys(); |
1493 | |
1494 | // If the memory capacity was not set on the command line, then double the |
1495 | // default value for this test. |
1496 | if (deviceMemCapacityOpt == 0) { |
1497 | deviceMemCapacity *= 2; // Double memory for this test |
1498 | } |
1499 | |
1500 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1501 | } |
1502 | |
1503 | // RecSys_FP32_Partitioned with deferred weight loading. |
1504 | TEST_P(RecommendationSystemTest, RecSys_FP32_Partitioned_Deferred) { |
1505 | CHECK_IF_ENABLED(); |
1506 | |
1507 | quantizeSLWSData = false; |
1508 | useFP16SLWS = false; |
1509 | useFP16AccumSLWS = false; |
1510 | quantizeFC = false; |
1511 | |
1512 | enableStaticPlaceholder = true; |
1513 | convertToFP16 = true; |
1514 | convertFusedToFP16 = false; |
1515 | convert4or8BitFusedToFP32 = true; |
1516 | |
1517 | testRecSys(); |
1518 | |
1519 | // If the memory capacity was not set on the command line, then double the |
1520 | // default value for this test. |
1521 | if (deviceMemCapacityOpt == 0) { |
1522 | deviceMemCapacity *= 2; // Double memory for this test |
1523 | } |
1524 | |
1525 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1526 | } |
1527 | |
1528 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantized_SLWS) { |
1529 | CHECK_IF_ENABLED(); |
1530 | |
1531 | quantizeSLWSData = true; |
1532 | useFP16SLWS = false; |
1533 | useFP16AccumSLWS = false; |
1534 | quantizeFC = false; |
1535 | convertToFP16 = false; |
1536 | |
1537 | testRecSys(); |
1538 | |
1539 | // If the memory capacity was not set on the command line, then double the |
1540 | // default value for this test. |
1541 | if (deviceMemCapacityOpt == 0) { |
1542 | deviceMemCapacity *= 2; // Double memory for this test |
1543 | } |
1544 | |
1545 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1546 | } |
1547 | |
1548 | // RecSys_Partitioned_RWQuantized_SLWS with deferred weight loading. |
1549 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantized_SLWS_Deferred) { |
1550 | CHECK_IF_ENABLED(); |
1551 | |
1552 | quantizeSLWSData = true; |
1553 | useFP16SLWS = false; |
1554 | useFP16AccumSLWS = false; |
1555 | quantizeFC = false; |
1556 | |
1557 | enableStaticPlaceholder = true; |
1558 | convertToFP16 = true; |
1559 | convertFusedToFP16 = false; |
1560 | convert4or8BitFusedToFP32 = true; |
1561 | |
1562 | testRecSys(); |
1563 | |
1564 | // If the memory capacity was not set on the command line, then double the |
1565 | // default value for this test. |
1566 | if (deviceMemCapacityOpt == 0) { |
1567 | deviceMemCapacity *= 2; // Double memory for this test |
1568 | } |
1569 | |
1570 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1571 | } |
1572 | |
1573 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantized_SLWS_FC) { |
1574 | CHECK_IF_ENABLED(); |
1575 | |
1576 | quantizeSLWSData = true; |
1577 | useFP16SLWS = false; |
1578 | useFP16AccumSLWS = false; |
1579 | quantizeFC = true; |
1580 | convertToFP16 = false; |
1581 | |
1582 | testRecSys(); |
1583 | |
1584 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1585 | } |
1586 | |
1587 | // RecSys_Partitioned_RWQuantized_SLWS_FC with deferred weight loading. |
1588 | TEST_P(RecommendationSystemTest, |
1589 | RecSys_Partitioned_RWQuantized_SLWS_FC_Deferred) { |
1590 | CHECK_IF_ENABLED(); |
1591 | |
1592 | quantizeSLWSData = true; |
1593 | useFP16SLWS = false; |
1594 | useFP16AccumSLWS = false; |
1595 | quantizeFC = true; |
1596 | |
1597 | enableStaticPlaceholder = true; |
1598 | convertToFP16 = true; |
1599 | convertFusedToFP16 = false; |
1600 | convert4or8BitFusedToFP32 = true; |
1601 | |
1602 | testRecSys(); |
1603 | |
1604 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1605 | } |
1606 | |
1607 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantized_SLWS_FP16) { |
1608 | CHECK_IF_ENABLED(); |
1609 | |
1610 | quantizeSLWSData = true; |
1611 | useFP16SLWS = false; |
1612 | useFP16AccumSLWS = false; |
1613 | quantizeFC = false; |
1614 | convertToFP16 = true; |
1615 | convertFusedToFP16 = true; |
1616 | |
1617 | testRecSys(); |
1618 | |
1619 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1620 | } |
1621 | |
1622 | // RecSys_Partitioned_RWQuantized_SLWS_FP16 with deferred weight loading. |
1623 | TEST_P(RecommendationSystemTest, |
1624 | RecSys_Partitioned_RWQuantized_SLWS_FP16_Deferred) { |
1625 | CHECK_IF_ENABLED(); |
1626 | |
1627 | quantizeSLWSData = true; |
1628 | useFP16SLWS = false; |
1629 | useFP16AccumSLWS = false; |
1630 | quantizeFC = false; |
1631 | |
1632 | enableStaticPlaceholder = true; |
1633 | convertToFP16 = true; |
1634 | convertFusedToFP16 = false; |
1635 | convert4or8BitFusedToFP32 = true; |
1636 | |
1637 | testRecSys(); |
1638 | |
1639 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1640 | } |
1641 | |
1642 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantized_SLWS_FC_FP16) { |
1643 | CHECK_IF_ENABLED(); |
1644 | |
1645 | quantizeSLWSData = true; |
1646 | useFP16SLWS = false; |
1647 | useFP16AccumSLWS = false; |
1648 | quantizeFC = true; |
1649 | convertToFP16 = true; |
1650 | convertFusedToFP16 = true; |
1651 | |
1652 | testRecSys(); |
1653 | |
1654 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1655 | } |
1656 | |
1657 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1658 | /// inputs/outputs in FP16. Everything else in FP32. Also run partitioned and |
1659 | /// compare results. |
1660 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantizedFP16_SLWS) { |
1661 | CHECK_IF_ENABLED(); |
1662 | |
1663 | quantizeSLWSData = true; |
1664 | useFP16SLWS = true; |
1665 | useFP16AccumSLWS = false; |
1666 | quantizeFC = false; |
1667 | convertToFP16 = false; |
1668 | |
1669 | testRecSys(); |
1670 | |
1671 | // If the memory capacity was not set on the command line, then double the |
1672 | // default value for this test. |
1673 | if (deviceMemCapacityOpt == 0) { |
1674 | deviceMemCapacity *= 2; // Double memory for this test |
1675 | } |
1676 | |
1677 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1678 | } |
1679 | |
1680 | // RecSys_Partitioned_RWQuantizedFP16_SLWS with deferred weight loading. |
1681 | TEST_P(RecommendationSystemTest, |
1682 | RecSys_Partitioned_RWQuantizedFP16_SLWS_Deferred) { |
1683 | CHECK_IF_ENABLED(); |
1684 | |
1685 | quantizeSLWSData = true; |
1686 | useFP16SLWS = true; |
1687 | useFP16AccumSLWS = false; |
1688 | quantizeFC = false; |
1689 | |
1690 | enableStaticPlaceholder = true; |
1691 | convertToFP16 = true; |
1692 | convertFusedToFP16 = false; |
1693 | convert4or8BitFusedToFP32 = true; |
1694 | |
1695 | testRecSys(); |
1696 | |
1697 | // If the memory capacity was not set on the command line, then double the |
1698 | // default value for this test. |
1699 | if (deviceMemCapacityOpt == 0) { |
1700 | deviceMemCapacity *= 2; // Double memory for this test |
1701 | } |
1702 | |
1703 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1704 | } |
1705 | |
1706 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1707 | /// inputs/outputs in FP16, and use FP16 accumulation. Everything else in FP32. |
1708 | /// Also run partitioned and compare results. |
1709 | TEST_P(RecommendationSystemTest, |
1710 | RecSys_Partitioned_RWQuantizedFP16AccumFP16_SLWS) { |
1711 | CHECK_IF_ENABLED(); |
1712 | |
1713 | quantizeSLWSData = true; |
1714 | useFP16SLWS = true; |
1715 | useFP16AccumSLWS = true; |
1716 | quantizeFC = false; |
1717 | convertToFP16 = false; |
1718 | |
1719 | testRecSys(); |
1720 | |
1721 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1722 | } |
1723 | |
1724 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1725 | /// inputs/outputs in FP16. Everything else in FP16. Also run partitioned and |
1726 | /// compare results. |
1727 | TEST_P(RecommendationSystemTest, RecSys_Partitioned_RWQuantizedFP16_SLWS_FP16) { |
1728 | CHECK_IF_ENABLED(); |
1729 | |
1730 | quantizeSLWSData = true; |
1731 | useFP16SLWS = true; |
1732 | useFP16AccumSLWS = false; |
1733 | quantizeFC = false; |
1734 | convertToFP16 = true; |
1735 | convertFusedToFP16 = true; |
1736 | |
1737 | testRecSys(); |
1738 | |
1739 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1740 | } |
1741 | |
1742 | /// Rowwise quantize the SLWS, with FP16 for scales/bias, and other |
1743 | /// inputs/outputs in FP16, and use FP16 accumulation. Everything else in FP16. |
1744 | /// Also run partitioned and compare results. |
1745 | TEST_P(RecommendationSystemTest, |
1746 | RecSys_Partitioned_RWQuantizedFP16AccumFP16_SLWS_FP16) { |
1747 | CHECK_IF_ENABLED(); |
1748 | |
1749 | quantizeSLWSData = true; |
1750 | useFP16SLWS = true; |
1751 | useFP16AccumSLWS = true; |
1752 | quantizeFC = false; |
1753 | convertToFP16 = true; |
1754 | convertFusedToFP16 = true; |
1755 | |
1756 | testRecSys(); |
1757 | |
1758 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1759 | } |
1760 | |
/// Rowwise quantize the SLWS, with FP16 for scales/bias, and other
/// inputs/outputs in FP16, and use FP16 accumulation. Everything else in FP16.
/// Also run partitioned using SparseNN partitioning and compare results (a
/// sketch of the SparseNN partitioning options follows this test).
1764 | TEST_P(RecommendationSystemTest, |
1765 | RecSys_Partitioned_RWQuantizedFP16AccumFP16_SLWS_FP16_SNN_Partitioning) { |
1766 | CHECK_IF_ENABLED(); |
1767 | |
1768 | quantizeSLWSData = true; |
1769 | useFP16SLWS = true; |
1770 | useFP16AccumSLWS = true; |
1771 | quantizeFC = false; |
1772 | convertToFP16 = true; |
1773 | convertFusedToFP16 = true; |
1774 | |
1775 | // Options for SparseNN Partitioning |
1776 | useSparseNNPartitioning = true; |
1777 | sparseNNPartitioningAddSLSConcats = true; |
1778 | sparseNNPartitioningNumCards = partitioningNumDevicesOpt; |
1779 | sparseNNPartitioningSLSKbytes = 1000000; |
1780 | sparseNNPartitioningNumCoresSLS = 6; |
1781 | sparseNNPartitioningNumCoresOther = 4; |
1782 | |
1783 | testRecSys(); |
1784 | |
1785 | testPartitionedRecSys(numDevices, deviceMemCapacity, context_); |
1786 | } |
1787 | |
1788 | /// Test SLS independently, with no other layers being run. |
1789 | TEST_P(RecommendationSystemTest, RecSys_SLS_Only) { |
1790 | CHECK_IF_ENABLED(); |
1791 | |
1792 | quantizeSLWSData = true; |
1793 | |
1794 | // Normally called in testRecSys(), but we're bypassing it here. |
1795 | setupPrecisionConfig(); |
1796 | |
1797 | testSLSQuant(); |
1798 | } |
1799 | |
1800 | // RecSys_SLS_Only with deferred weight loading. |
1801 | TEST_P(RecommendationSystemTest, RecSys_SLS_Only_Deferred) { |
1802 | CHECK_IF_ENABLED(); |
1803 | |
1804 | quantizeSLWSData = true; |
1805 | |
1806 | enableStaticPlaceholder = true; |
1807 | convertFusedToFP16 = false; |
1808 | convert4or8BitFusedToFP32 = true; |
1809 | |
1810 | // Normally called in testRecSys(), but we're bypassing it here. |
1811 | setupPrecisionConfig(); |
1812 | |
1813 | testSLSQuant(); |
1814 | } |
1815 | |
1816 | /// Test gathering weights for SLWS. |
1817 | TEST_P(RecommendationSystemTest, RecSys_FP32_Gather_Weights) { |
1818 | CHECK_IF_ENABLED(); |
1819 | |
1820 | quantizeSLWSData = false; |
1821 | useFP16SLWS = false; |
1822 | useFP16AccumSLWS = false; |
1823 | quantizeFC = false; |
1824 | convertToFP16 = false; |
1825 | |
1826 | gatherWeights = true; |
1827 | |
1828 | testRecSys(); |
1829 | } |
1830 | |
1831 | INSTANTIATE_BACKEND_TEST(RecommendationSystemTest); |
1832 | |