1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef NGEN_INTERFACE_HPP |
18 | #define NGEN_INTERFACE_HPP |
19 | |
20 | |
21 | #include "ngen_core.hpp" |
22 | #include <sstream> |
23 | |
24 | |
25 | namespace ngen { |
26 | |
27 | template <HW hw> class OpenCLCodeGenerator; |
28 | template <HW hw> class L0CodeGenerator; |
29 | |
30 | // Exceptions. |
31 | #ifdef NGEN_SAFE |
32 | class unknown_argument_exception : public std::runtime_error { |
33 | public: |
34 | unknown_argument_exception() : std::runtime_error("Argument not found" ) {} |
35 | }; |
36 | |
37 | class bad_argument_type_exception : public std::runtime_error { |
38 | public: |
39 | bad_argument_type_exception() : std::runtime_error("Bad argument type" ) {} |
40 | }; |
41 | |
42 | class interface_not_finalized : public std::runtime_error { |
43 | public: |
44 | interface_not_finalized() : std::runtime_error("Interface has not been finalized" ) {} |
45 | }; |
46 | |
47 | class use_simd1_local_id_exception : public std::runtime_error { |
48 | public: |
49 | use_simd1_local_id_exception() : std::runtime_error("Use getSIMD1LocalID for SIMD1 kernels" ) {} |
50 | }; |
51 | #endif |
52 | |
53 | enum class ExternalArgumentType { Scalar, GlobalPtr, LocalPtr, Hidden }; |
54 | enum class GlobalAccessType { None = 0, Stateless = 1, Surface = 2, All = 3 }; |
55 | |
56 | static inline GlobalAccessType operator|(GlobalAccessType access1, GlobalAccessType access2) |
57 | { |
58 | return static_cast<GlobalAccessType>(static_cast<int>(access1) | static_cast<int>(access2)); |
59 | } |
60 | |
61 | class InterfaceHandler |
62 | { |
63 | template <HW hw> friend class OpenCLCodeGenerator; |
64 | template <HW hw> friend class L0CodeGenerator; |
65 | |
66 | public: |
67 | InterfaceHandler(HW hw_) : hw(hw_), simd(GRF::bytes(hw_) >> 2) |
68 | , inlineGRFs(defaultInlineGRFs(hw)) |
69 | {} |
70 | |
71 | inline void externalName(const std::string &name) { kernelName = name; } |
72 | |
73 | template <typename DT> |
74 | inline void newArgument(std::string name) { newArgument(name, getDataType<DT>()); } |
75 | inline void newArgument(std::string name, DataType type, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::All); |
76 | inline void newArgument(std::string name, ExternalArgumentType exttype, GlobalAccessType access = GlobalAccessType::All); |
77 | |
78 | inline Subregister getArgument(const std::string &name) const; |
79 | inline Subregister getArgumentIfExists(const std::string &name) const; |
80 | inline int getArgumentSurface(const std::string &name) const; |
81 | inline int getArgumentSurfaceIfExists(const std::string &name) const; |
82 | inline GRF getLocalID(int dim) const; |
83 | inline RegData getSIMD1LocalID(int dim) const; |
84 | inline Subregister getLocalSize(int dim) const; |
85 | |
86 | const std::string &getExternalName() const { return kernelName; } |
87 | int getSIMD() const { return simd; } |
88 | int getGRFCount() const { return needGRF; } |
89 | size_t getSLMSize() const { return slmSize; } |
90 | |
91 | void require32BitBuffers() { allow64BitBuffers = false; } |
92 | void requireBarrier() { barrierCount = 1; } |
93 | void requireBarriers(int nBarriers) { barrierCount = nBarriers; } |
94 | void requireDPAS() { needDPAS = true; } |
95 | void requireGlobalAtomics() { needGlobalAtomics = true; } |
96 | void requireGRF(int grfs) { needGRF = grfs; } |
97 | void requireLocalID(int dimensions) { needLocalID = dimensions; } |
98 | void requireLocalSize() { needLocalSize = true; } |
99 | void requireNonuniformWGs() { needNonuniformWGs = true; } |
100 | void requireNoPreemption() { needNoPreemption = true; } |
101 | void requireScratch(size_t bytes = 1) { scratchSize = bytes; } |
102 | void requireSIMD(int simd_) { simd = simd_; } |
103 | void requireSLM(size_t bytes) { slmSize = bytes; } |
104 | void requireStatelessWrites(bool req = true) { needStatelessWrites = req; } |
105 | inline void requireType(DataType type); |
106 | template <typename T> void requireType() { requireType(getDataType<T>()); } |
107 | void requireWalkOrder(int o1, int o2) { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = -1; } |
108 | void requireWalkOrder(int o1, int o2, int o3) { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = o3; } |
109 | void requireWorkgroup(size_t x, size_t y = 1, |
110 | size_t z = 1) { wg[0] = x; wg[1] = y; wg[2] = z; } |
111 | |
112 | void setInlineGRFCount(int grfs) { inlineGRFs = grfs; } |
113 | void setSkipPerThreadOffset(int32_t offset) { offsetSkipPerThread = offset; } |
114 | void setSkipCrossThreadOffset(int32_t offset) { offsetSkipCrossThread = offset; } |
115 | |
116 | inline GRF getCrossthreadBase(bool effective = true) const; |
117 | inline GRF getArgLoadBase() const; |
118 | |
119 | inline void finalize(); |
120 | |
121 | template <typename CodeGenerator> |
122 | inline void generatePrologue(CodeGenerator &generator, const GRF &temp = GRF(127)) const; |
123 | |
124 | inline void generateDummyCL(std::ostream &stream) const; |
125 | inline std::string generateZeInfo() const; |
126 | |
127 | #ifdef NGEN_ASM |
128 | inline void dumpAssignments(std::ostream &stream) const; |
129 | #endif |
130 | |
131 | static constexpr int noSurface = 0x80; // Returned by getArgumentSurfaceIfExists in case of no surface assignment |
132 | |
133 | protected: |
134 | struct Assignment { |
135 | std::string name; |
136 | DataType type; |
137 | ExternalArgumentType exttype; |
138 | GlobalAccessType access; |
139 | Subregister reg; |
140 | int surface; |
141 | int index; |
142 | |
143 | bool globalSurfaceAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Surface)); } |
144 | bool globalStatelessAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Stateless)); } |
145 | }; |
146 | |
147 | HW hw; |
148 | |
149 | std::vector<Assignment> assignments; |
150 | std::string kernelName = "default_kernel" ; |
151 | |
152 | int nextArgIndex = 0; |
153 | bool finalized = false; |
154 | |
155 | bool allow64BitBuffers = 0; |
156 | int barrierCount = 0; |
157 | bool needDPAS = false; |
158 | bool needGlobalAtomics = false; |
159 | int32_t needGRF = 128; |
160 | int needLocalID = 0; |
161 | bool needLocalSize = false; |
162 | bool needNonuniformWGs = false; |
163 | bool needNoPreemption = false; |
164 | bool needHalf = false; |
165 | bool needDouble = false; |
166 | bool needStatelessWrites = true; |
167 | int32_t offsetSkipPerThread = 0; |
168 | int32_t offsetSkipCrossThread = 0; |
169 | size_t scratchSize = 0; |
170 | int simd = 8; |
171 | size_t slmSize = 0; |
172 | int walkOrder[3] = {-1, -1, -1}; |
173 | size_t wg[3] = {0, 0, 0}; |
174 | |
175 | int crossthreadBytes = 0; |
176 | int crossthreadGRFs = 0; |
177 | int inlineGRFs = 0; |
178 | inline int getCrossthreadGRFs() const; |
179 | inline int getCrossthreadBytes() const; |
180 | int grfsPerLID() const { return (simd > 16 && GRF::bytes(hw) < 64) ? 2 : 1; } |
181 | |
182 | static inline int defaultInlineGRFs(HW hw); |
183 | }; |
184 | |
185 | using NEOInterfaceHandler = InterfaceHandler; |
186 | |
187 | void InterfaceHandler::newArgument(std::string name, DataType type, ExternalArgumentType exttype, GlobalAccessType access) |
188 | { |
189 | if (exttype != ExternalArgumentType::GlobalPtr) |
190 | access = GlobalAccessType::None; |
191 | assignments.push_back({name, type, exttype, access, Subregister{}, noSurface, nextArgIndex++}); |
192 | } |
193 | |
194 | void InterfaceHandler::newArgument(std::string name, ExternalArgumentType exttype, GlobalAccessType access) |
195 | { |
196 | DataType type = DataType::invalid; |
197 | |
198 | switch (exttype) { |
199 | case ExternalArgumentType::GlobalPtr: type = DataType::uq; break; |
200 | case ExternalArgumentType::LocalPtr: type = DataType::ud; break; |
201 | default: |
202 | #ifdef NGEN_SAFE |
203 | throw bad_argument_type_exception(); |
204 | #else |
205 | break; |
206 | #endif |
207 | } |
208 | |
209 | newArgument(name, type, exttype, access); |
210 | } |
211 | |
212 | Subregister InterfaceHandler::getArgumentIfExists(const std::string &name) const |
213 | { |
214 | for (auto &assignment : assignments) { |
215 | if (assignment.name == name) |
216 | return assignment.reg; |
217 | } |
218 | |
219 | return Subregister{}; |
220 | } |
221 | |
222 | Subregister InterfaceHandler::getArgument(const std::string &name) const |
223 | { |
224 | Subregister arg = getArgumentIfExists(name); |
225 | |
226 | #ifdef NGEN_SAFE |
227 | if (arg.isInvalid()) |
228 | throw unknown_argument_exception(); |
229 | #endif |
230 | |
231 | return arg; |
232 | } |
233 | |
234 | int InterfaceHandler::getArgumentSurfaceIfExists(const std::string &name) const |
235 | { |
236 | for (auto &assignment : assignments) |
237 | if (assignment.name == name) |
238 | return assignment.surface; |
239 | return noSurface; |
240 | } |
241 | |
242 | int InterfaceHandler::getArgumentSurface(const std::string &name) const |
243 | { |
244 | int surface = getArgumentSurfaceIfExists(name); |
245 | |
246 | #ifdef NGEN_SAFE |
247 | if (surface == noSurface) |
248 | throw unknown_argument_exception(); |
249 | #endif |
250 | return surface; |
251 | } |
252 | |
253 | RegData InterfaceHandler::getSIMD1LocalID(int dim) const |
254 | { |
255 | #ifdef NGEN_SAFE |
256 | if (dim > needLocalID || simd != 1) throw unknown_argument_exception(); |
257 | #endif |
258 | |
259 | return GRF(1).uw(dim); |
260 | } |
261 | |
262 | GRF InterfaceHandler::getLocalID(int dim) const |
263 | { |
264 | #ifdef NGEN_SAFE |
265 | if (dim > needLocalID) throw unknown_argument_exception(); |
266 | if (simd == 1) throw use_simd1_local_id_exception(); |
267 | #endif |
268 | |
269 | if (simd == 1) |
270 | return GRF(1).uw(); |
271 | else |
272 | return GRF(1 + dim * grfsPerLID()).uw(); |
273 | } |
274 | |
275 | void InterfaceHandler::requireType(DataType type) |
276 | { |
277 | switch (type) { |
278 | case DataType::hf: needHalf = true; break; |
279 | case DataType::df: needDouble = true; break; |
280 | default: break; |
281 | } |
282 | } |
283 | |
284 | static inline const char *getCLDataType(DataType type) |
285 | { |
286 | static const char *names[16] = {"uint" , "int" , "ushort" , "short" , "uchar" , "char" , "double" , "float" , "ulong" , "long" , "half" , "ushort" , "INVALID" , "INVALID" , "INVALID" , "INVALID" }; |
287 | return names[static_cast<uint8_t>(type) & 0xF]; |
288 | } |
289 | |
290 | void InterfaceHandler::generateDummyCL(std::ostream &stream) const |
291 | { |
292 | #ifdef NGEN_SAFE |
293 | if (!finalized) throw interface_not_finalized(); |
294 | #endif |
295 | const char *dpasDummy = " int __builtin_IB_sub_group_idpas_s8_s8_8_1(int, int, int8) __attribute__((const));\n" |
296 | " int z = __builtin_IB_sub_group_idpas_s8_s8_8_1(0, ____[0], 1);\n" |
297 | " for (int i = 0; i < z; i++) (void) ____[0];\n" ; |
298 | |
299 | if (needHalf) stream << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" ; |
300 | if (needDouble) stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" ; |
301 | |
302 | if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0) |
303 | stream << "__attribute__((reqd_work_group_size(" << wg[0] << ',' << wg[1] << ',' << wg[2] << ")))\n" ; |
304 | if (walkOrder[0] >= 0) { |
305 | stream << "__attribute__((intel_reqd_workgroup_walk_order(" << walkOrder[0] << ',' << walkOrder[1]; |
306 | if (walkOrder[2] >= 0) |
307 | stream << ',' << walkOrder[2]; |
308 | stream << ")))\n" ; |
309 | } |
310 | stream << "__attribute__((intel_reqd_sub_group_size(" << simd << ")))\n" ; |
311 | stream << "kernel void " << kernelName << '('; |
312 | |
313 | bool firstArg = true; |
314 | for (const auto &assignment : assignments) { |
315 | if (assignment.exttype == ExternalArgumentType::Hidden) continue; |
316 | |
317 | if (!firstArg) stream << ", " ; |
318 | |
319 | switch (assignment.exttype) { |
320 | case ExternalArgumentType::GlobalPtr: stream << "global void *" ; break; |
321 | case ExternalArgumentType::LocalPtr: stream << "local void *" ; break; |
322 | case ExternalArgumentType::Scalar: stream << getCLDataType(assignment.type) << ' '; break; |
323 | default: break; |
324 | } |
325 | |
326 | stream << assignment.name; |
327 | firstArg = false; |
328 | } |
329 | stream << ") {\n" ; |
330 | stream << " global volatile int *____;\n" ; |
331 | |
332 | if (needLocalID) stream << " (void) ____[get_local_id(0)];\n" ; |
333 | if (needLocalSize) stream << " (void) ____[get_enqueued_local_size(0)];\n" ; |
334 | if (barrierCount > 0) stream << " __asm__ volatile(\"barrier\");\n" ; |
335 | for (int i = 1; i < barrierCount; i++) { |
336 | stream << " local NamedBarrier_t *bar" << i << ";\n" |
337 | " bar" << i << " = named_barrier_init(1);\n" |
338 | " work_group_named_barrier(bar" << i << ", 0);\n" ; |
339 | } |
340 | if (needDPAS) stream << dpasDummy; |
341 | if (needGlobalAtomics) stream << " atomic_inc(____);\n" ; |
342 | if (scratchSize > 0) stream << " volatile char scratch[" << scratchSize << "] = {0};\n" ; |
343 | if (slmSize > 0) stream << " volatile local char slm[" << slmSize << "]; slm[0]++;\n" ; |
344 | if (needNoPreemption) { |
345 | if (hw == HW::Gen9) |
346 | stream << " volatile double *__df; *__df = 1.1 / *__df;\n" ; // IEEE macro causes IGC to disable MTP. |
347 | /* To do: Gen11 */ |
348 | } |
349 | |
350 | if (hw >= HW::XeHP) for (const auto &assignment : assignments) { |
351 | // Force IGC to assume stateless accesses could occur if necessary. |
352 | if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalStatelessAccess()) |
353 | stream << " __asm__ volatile(\"\" :: \"rw.u\"(" << assignment.name << "));\n" ; |
354 | } |
355 | |
356 | for (const auto &assignment : assignments) { |
357 | // Force IGC to assume surface accesses could occur if necessary. |
358 | if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalSurfaceAccess()) |
359 | stream << " { volatile uchar __load = ((global uchar *) " << assignment.name << ")[get_local_id(0)];}\n" ; |
360 | } |
361 | |
362 | stream << "}\n" ; |
363 | } |
364 | |
365 | inline Subregister InterfaceHandler::getLocalSize(int dim) const |
366 | { |
367 | static const std::string localSizeArgs[3] = {"__local_size0" , "__local_size1" , "__local_size2" }; |
368 | return getArgument(localSizeArgs[dim]); |
369 | } |
370 | |
371 | void InterfaceHandler::finalize() |
372 | { |
373 | // Make assignments, following NEO rules: |
374 | // - all inputs are naturally aligned |
375 | // - all sub-DWord inputs are DWord-aligned |
376 | // - first register is |
377 | // r3 (no local IDs) |
378 | // r5 (SIMD8/16, local IDs) |
379 | // r8 (SIMD32, local IDs) |
380 | // [- assign local ptr arguments left-to-right? not checked] |
381 | // - assign global pointer arguments left-to-right |
382 | // - assign scalar arguments left-to-right |
383 | // - assign surface indices left-to-right for global pointers |
384 | // - no arguments can cross a GRF boundary. Arrays like work size count |
385 | // as 1 argument for this rule. |
386 | |
387 | static const std::string localSizeArgs[3] = {"__local_size0" , "__local_size1" , "__local_size2" }; |
388 | static const std::string scratchSizeArg = "__scratch_size" ; |
389 | |
390 | GRF base = getCrossthreadBase(); |
391 | int offset = 32; |
392 | int nextSurface = 0; |
393 | const int grfSize = GRF::bytes(hw); |
394 | |
395 | auto assignArgsOfType = [&](ExternalArgumentType exttype) { |
396 | for (auto &assignment : assignments) { |
397 | if (assignment.exttype != exttype) continue; |
398 | |
399 | auto bytes = getBytes(assignment.type); |
400 | auto size = getDwords(assignment.type) << 2; |
401 | |
402 | if (assignment.name == localSizeArgs[0]) { |
403 | // Move to next GRF if local size arguments won't fit in this one. |
404 | if (offset > grfSize - (3 * 4)) { |
405 | offset = 0; |
406 | base++; |
407 | } |
408 | } |
409 | |
410 | offset = (offset + size - 1) & -size; |
411 | if (offset >= grfSize) { |
412 | offset = 0; |
413 | base++; |
414 | } |
415 | |
416 | assignment.reg = base.sub(offset / bytes, assignment.type); |
417 | |
418 | if (assignment.exttype == ExternalArgumentType::GlobalPtr) { |
419 | if (!assignment.globalStatelessAccess()) |
420 | assignment.reg = Subregister{}; |
421 | if (assignment.globalSurfaceAccess()) |
422 | assignment.surface = nextSurface; |
423 | nextSurface++; |
424 | } |
425 | else if (assignment.exttype == ExternalArgumentType::Scalar) |
426 | requireType(assignment.type); |
427 | |
428 | offset += size; |
429 | } |
430 | }; |
431 | |
432 | assignArgsOfType(ExternalArgumentType::LocalPtr); |
433 | assignArgsOfType(ExternalArgumentType::GlobalPtr); |
434 | assignArgsOfType(ExternalArgumentType::Scalar); |
435 | |
436 | // Add private memory size arguments. |
437 | if (scratchSize > 0) |
438 | newArgument(scratchSizeArg, DataType::uq, ExternalArgumentType::Hidden); |
439 | |
440 | // Add enqueued local size arguments. |
441 | if (needLocalSize && needNonuniformWGs) |
442 | for (int dim = 0; dim < 3; dim++) |
443 | newArgument(localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden); |
444 | |
445 | assignArgsOfType(ExternalArgumentType::Hidden); |
446 | |
447 | crossthreadBytes = (base.getBase() - getCrossthreadBase().getBase()) * GRF::bytes(hw) |
448 | + ((offset + 31) & -32); |
449 | crossthreadGRFs = GRF::bytesToGRFs(hw, crossthreadBytes); |
450 | |
451 | // Manually add regular local size arguments. |
452 | if (needLocalSize && !needNonuniformWGs) |
453 | for (int dim = 0; dim < 3; dim++) |
454 | assignments.push_back({localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden, |
455 | GlobalAccessType::None, GRF(getCrossthreadBase()).ud(dim + 3), noSurface, -1}); |
456 | |
457 | finalized = true; |
458 | } |
459 | |
460 | GRF InterfaceHandler::getCrossthreadBase(bool effective) const |
461 | { |
462 | if (!needLocalID) |
463 | return GRF((!effective || (hw >= HW::XeHP)) ? 1 : 2); |
464 | else if (simd == 1) |
465 | return GRF(2); |
466 | else |
467 | return GRF(1 + 3 * grfsPerLID()); |
468 | } |
469 | |
470 | GRF InterfaceHandler::getArgLoadBase() const |
471 | { |
472 | return getCrossthreadBase().advance(inlineGRFs); |
473 | } |
474 | |
475 | int InterfaceHandler::getCrossthreadBytes() const |
476 | { |
477 | #ifdef NGEN_SAFE |
478 | if (!finalized) throw interface_not_finalized(); |
479 | #endif |
480 | return crossthreadBytes; |
481 | } |
482 | |
483 | int InterfaceHandler::getCrossthreadGRFs() const |
484 | { |
485 | #ifdef NGEN_SAFE |
486 | if (!finalized) throw interface_not_finalized(); |
487 | #endif |
488 | return crossthreadGRFs; |
489 | } |
490 | |
491 | int InterfaceHandler::defaultInlineGRFs(HW hw) |
492 | { |
493 | if (hw == HW::XeHP) return 1; |
494 | if (hw == HW::XeHPG) return 1; |
495 | return 0; |
496 | } |
497 | |
498 | template <typename CodeGenerator> |
499 | void InterfaceHandler::generatePrologue(CodeGenerator &generator, const GRF &temp) const |
500 | { |
501 | #ifdef NGEN_INTERFACE_OLD_PROLOGUE |
502 | if (needLocalID) |
503 | generator.loadlid(getCrossthreadBytes(), needLocalID, simd, temp, 8*16); |
504 | if (getCrossthreadGRFs() > 1) |
505 | generator.loadargs(getCrossthreadBase(), getCrossthreadGRFs(), temp); |
506 | #else |
507 | if (needLocalID) |
508 | generator.loadlid(getCrossthreadBytes(), needLocalID, simd, temp, 12*16); |
509 | if (getCrossthreadGRFs() > inlineGRFs) |
510 | generator.loadargs(getArgLoadBase(), getCrossthreadGRFs() - inlineGRFs, temp); |
511 | #endif |
512 | } |
513 | |
514 | std::string InterfaceHandler::generateZeInfo() const |
515 | { |
516 | #ifdef NGEN_SAFE |
517 | if (!finalized) throw interface_not_finalized(); |
518 | #endif |
519 | |
520 | std::stringstream md; |
521 | |
522 | md << "version: 1.8\n" |
523 | "kernels: \n" |
524 | " - name: \"" << kernelName << "\"\n" |
525 | " execution_env: \n" |
526 | " grf_count: " << needGRF << "\n" |
527 | " simd_size: " << simd << "\n" ; |
528 | if (simd > 1) |
529 | md << " required_sub_group_size: " << simd << "\n" ; |
530 | if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0) { |
531 | md << " required_work_group_size:\n" |
532 | << " - " << wg[0] << "\n" |
533 | << " - " << wg[1] << "\n" |
534 | << " - " << wg[2] << "\n" ; |
535 | } |
536 | if (walkOrder[0] >= 0) { |
537 | md << " work_group_walk_order_dimensions:\n" |
538 | << " - " << walkOrder[0] << "\n" |
539 | << " - " << walkOrder[1] << "\n" |
540 | << " - " << std::max(walkOrder[2], 0) << "\n" ; |
541 | } |
542 | if (offsetSkipPerThread > 0) |
543 | md << " offset_to_skip_per_thread_data_load: " << offsetSkipPerThread << '\n'; |
544 | if (barrierCount > 0) |
545 | md << " barrier_count: " << barrierCount << '\n'; |
546 | if (allow64BitBuffers) |
547 | md << " has_4gb_buffers: true\n" ; |
548 | if (needDPAS) |
549 | md << " has_dpas: true\n" ; |
550 | if (needGlobalAtomics) |
551 | md << " has_global_atomics: true\n" ; |
552 | if (slmSize > 0) |
553 | md << " slm_size: " << slmSize << '\n'; |
554 | if (!needStatelessWrites) |
555 | md << " has_no_stateless_write: true\n" ; |
556 | if (needNoPreemption) |
557 | md << " disable_mid_thread_preemption: true\n" ; |
558 | if (inlineGRFs > 0) |
559 | md << " inline_data_payload_size: " << inlineGRFs * GRF::bytes(hw) << "\n" ; |
560 | if (!assignments.empty()) { |
561 | md << "\n" |
562 | " payload_arguments: \n" ; |
563 | } |
564 | for (auto &assignment : assignments) { |
565 | uint32_t size = 0; |
566 | bool skipArg = false; |
567 | bool explicitArg = true; |
568 | |
569 | if (assignment.globalSurfaceAccess()) { |
570 | md << " - arg_type: arg_bypointer\n" |
571 | " arg_index: " << assignment.index << "\n" |
572 | " offset: 0\n" |
573 | " size: 0\n" |
574 | " addrmode: stateful\n" |
575 | " addrspace: global\n" |
576 | " access_type: readwrite\n" |
577 | "\n" ; |
578 | } |
579 | |
580 | switch (assignment.exttype) { |
581 | case ExternalArgumentType::Scalar: |
582 | md << " - arg_type: arg_byvalue\n" ; |
583 | size = (assignment.reg.getDwords() << 2); |
584 | break; |
585 | case ExternalArgumentType::GlobalPtr: |
586 | if (!assignment.globalStatelessAccess()) { |
587 | skipArg = true; |
588 | break; |
589 | } |
590 | // fall through |
591 | case ExternalArgumentType::LocalPtr: |
592 | md << " - arg_type: arg_bypointer\n" ; |
593 | size = (assignment.reg.getDwords() << 2); |
594 | break; |
595 | case ExternalArgumentType::Hidden: { |
596 | explicitArg = false; |
597 | if (assignment.name == "__local_size0" ) { |
598 | // from Zebin spec : local_size Argument size : int32x3 |
599 | // may need refining to allow |
600 | // either int32x1, int32x2, int32x3 (x, xy, xyz) |
601 | // or fine grain : local_size_x, local_size_y, local_size_z |
602 | md << " - arg_type: " |
603 | << (needNonuniformWGs ? "enqueued_local_size\n" : "local_size\n" ); |
604 | size = (assignment.reg.getDwords() << 2) * 3; |
605 | } else |
606 | skipArg = true; |
607 | break; |
608 | } |
609 | } |
610 | if (skipArg) |
611 | continue; |
612 | |
613 | auto offset = (assignment.reg.getBase() - getCrossthreadBase().getBase()) * GRF::bytes(hw) + assignment.reg.getByteOffset(); |
614 | if (explicitArg) |
615 | md << " arg_index: " << assignment.index << "\n" ; |
616 | md << " offset: " << offset << "\n" |
617 | " size: " << size << '\n'; |
618 | |
619 | if (assignment.globalStatelessAccess()) { |
620 | md << " addrmode: stateless\n" |
621 | " addrspace: global\n" |
622 | " access_type: readwrite\n" ; |
623 | } else if (assignment.exttype == ExternalArgumentType::LocalPtr) { |
624 | md << " addrmode: slm\n" |
625 | " addrspace: local\n" |
626 | " access_type: readwrite\n" ; |
627 | } |
628 | md << "\n" ; |
629 | } |
630 | |
631 | bool firstSurface = true; |
632 | for (auto &assignment : assignments) { |
633 | if (assignment.globalSurfaceAccess()) { |
634 | if (firstSurface) { |
635 | md << "\n" |
636 | " binding_table_indices: \n" ; |
637 | firstSurface = false; |
638 | } |
639 | md << " - bti_value: " << assignment.surface << "\n" |
640 | " arg_index: " << assignment.index << "\n" |
641 | " \n" ; |
642 | } |
643 | } |
644 | |
645 | if (needLocalID) { |
646 | md << "\n" |
647 | " per_thread_payload_arguments: \n" ; |
648 | |
649 | if (simd == 1) { |
650 | md << " - arg_type: packed_local_ids\n" |
651 | " offset: 0\n" |
652 | " size: 6\n" |
653 | " \n" ; |
654 | } else { |
655 | auto localIDBytes = grfsPerLID() * GRF::bytes(hw); |
656 | localIDBytes *= 3; // runtime currently supports 0 or 3 localId channels in per thread data |
657 | md << " - arg_type: local_id\n" |
658 | " offset: 0\n" |
659 | " size: " << localIDBytes << "\n" |
660 | " \n" ; |
661 | } |
662 | } |
663 | |
664 | md << "\n" ; // ensure file ends with newline |
665 | |
666 | #ifdef NGEN_DUMP_ZE_INFO |
667 | std::cerr << md.str(); |
668 | #endif |
669 | |
670 | return md.str(); |
671 | } |
672 | |
673 | #ifdef NGEN_ASM |
674 | void InterfaceHandler::dumpAssignments(std::ostream &stream) const |
675 | { |
676 | LabelManager manager; |
677 | |
678 | for (auto &assignment : assignments) { |
679 | stream << "// " ; |
680 | if (assignment.reg.isValid()) |
681 | assignment.reg.outputText(stream, PrintDetail::sub, manager); |
682 | else |
683 | stream << "(none)" ; |
684 | stream << '\t' << assignment.name; |
685 | if (assignment.surface != noSurface) |
686 | stream << "\t(BTI " << assignment.surface << ')'; |
687 | stream << std::endl; |
688 | } |
689 | } |
690 | #endif |
691 | |
692 | } /* namespace ngen */ |
693 | |
694 | #endif /* header guard */ |
695 | |