1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *ocl_types_header = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#ifndef GPU_OCL_OCL_TYPES_H )==""\n"
21R"==(#define GPU_OCL_OCL_TYPES_H )==""\n"
22R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
23R"==(#define unroll_for __attribute__((opencl_unroll_hint)) for )==""\n"
24R"==(#define for_ for )==""\n"
25R"==(#define CONCAt2(a, b) a##b )==""\n"
26R"==(#define CONCAT2(a, b) CONCAt2(a, b) )==""\n"
27R"==(#define CONCAT3(a, b, c) CONCAT2(CONCAT2(a, b), c) )==""\n"
28R"==(#if defined(DT_F16) || defined(SRC_DT_F16) || defined(SRC0_DT_F16) \ )==""\n"
29R"==(|| defined(SRC1_DT_F16) || defined(DST_DT_F16) || defined(WEI_DT_F16) \ )==""\n"
30R"==(|| defined(BIA_DT_F16) || defined(ACC_DT_F16) )==""\n"
31R"==(#pragma OPENCL EXTENSION cl_khr_fp16 : enable )==""\n"
32R"==(#endif )==""\n"
33R"==(#if DT_F64 )==""\n"
34R"==(#pragma OPENCL EXTENSION cl_khr_fp64 : enable )==""\n"
35R"==(#endif )==""\n"
36R"==(#if DT_F32 == 1 )==""\n"
37R"==(#define DATA_T float )==""\n"
38R"==(#define DATA2_T float2 )==""\n"
39R"==(#define DATA4_T float4 )==""\n"
40R"==(#define DATA8_T float8 )==""\n"
41R"==(#define DATA16_T float16 )==""\n"
42R"==(#define DATA_MAX FLT_MAX )==""\n"
43R"==(#define DATA_MIN -DATA_MAX )==""\n"
44R"==(#define DATA_ZERO 0.0f )==""\n"
45R"==(#define DATA_ONE 1.0f )==""\n"
46R"==(#define DEF_ACC_DATA_T float )==""\n"
47R"==(#define DEF_ACC_DATA2_T float2 )==""\n"
48R"==(#define DEF_ACC_DATA4_T float4 )==""\n"
49R"==(#define DEF_ACC_DATA8_T float8 )==""\n"
50R"==(#define POST_OP_DATA_T float )==""\n"
51R"==(#define TO_DATA_T(v) (float)(v) )==""\n"
52R"==(#define TO_DEF_ACC_DATA_T convert_float )==""\n"
53R"==(#define TO_DEF_ACC_DATA2_T convert_float2 )==""\n"
54R"==(#define TO_DEF_ACC_DATA4_T convert_float4 )==""\n"
55R"==(#define TO_DEF_ACC_DATA8_T convert_float8 )==""\n"
56R"==(#define DATA_TO_REF convert_float )==""\n"
57R"==(#define CONVERT_DATA_T convert_float )==""\n"
58R"==(#define CONVERT_DATA2_T convert_float2 )==""\n"
59R"==(#define CONVERT_DATA4_T convert_float4 )==""\n"
60R"==(#define CONVERT_DATA8_T convert_float8 )==""\n"
61R"==(#define CONVERT_FLOAT_T convert_float )==""\n"
62R"==(#define CONVERT_FLOAT2_T convert_float2 )==""\n"
63R"==(#define CONVERT_FLOAT4_T convert_float4 )==""\n"
64R"==(#define CONVERT_FLOAT8_T convert_float8 )==""\n"
65R"==(#define BLOCK_READ intel_sub_group_block_read )==""\n"
66R"==(#define BLOCK_WRITE intel_sub_group_block_write )==""\n"
67R"==(#define BLOCK_READ2 intel_sub_group_block_read2 )==""\n"
68R"==(#define BLOCK_READ4 intel_sub_group_block_read4 )==""\n"
69R"==(#define BLOCK_READ8 intel_sub_group_block_read8 )==""\n"
70R"==(#define BLOCK_WRITE2 intel_sub_group_block_write2 )==""\n"
71R"==(#define BLOCK_WRITE4 intel_sub_group_block_write4 )==""\n"
72R"==(#define BLOCK_WRITE8 intel_sub_group_block_write8 )==""\n"
73R"==(#define AS_DATA_T as_float )==""\n"
74R"==(#define AS_DATA2_T as_float2 )==""\n"
75R"==(#define AS_DATA4_T as_float4 )==""\n"
76R"==(#define AS_DATA8_T as_float8 )==""\n"
77R"==(#define AS_UINT_T as_uint )==""\n"
78R"==(#define AS_UINT2_T as_uint2 )==""\n"
79R"==(#define AS_UINT4_T as_uint4 )==""\n"
80R"==(#define AS_UINT8_T as_uint8 )==""\n"
81R"==(#define BLOCK_DATA_T uint )==""\n"
82R"==(#define BLOCK_DATA2_T uint2 )==""\n"
83R"==(#define BLOCK_DATA4_T uint4 )==""\n"
84R"==(#define BLOCK_DATA8_T uint8 )==""\n"
85R"==(#define AS_BLOCK_DATA_T as_uint )==""\n"
86R"==(#define AS_BLOCK_DATA2_T as_uint2 )==""\n"
87R"==(#define AS_BLOCK_DATA4_T as_uint4 )==""\n"
88R"==(#define AS_BLOCK_DATA8_T as_uint8 )==""\n"
89R"==(#define MMAD_DATA_T uint )==""\n"
90R"==(#define MMAD_DATA4_T uint4 )==""\n"
91R"==(#define MMAD_DATA8_T uint8 )==""\n"
92R"==(#define MMAD_ACC_DATA4_T float4 )==""\n"
93R"==(#define MMAD_ACC_DATA8_T float8 )==""\n"
94R"==(#elif DT_F64 == 1 )==""\n"
95R"==(#define DATA_T double )==""\n"
96R"==(#define DATA2_T double2 )==""\n"
97R"==(#define DATA4_T double4 )==""\n"
98R"==(#define DATA8_T double8 )==""\n"
99R"==(#define DATA16_T double16 )==""\n"
100R"==(#define DATA_MAX DBL_MAX )==""\n"
101R"==(#define DATA_MIN -DATA_MAX )==""\n"
102R"==(#define DATA_ZERO 0.0d )==""\n"
103R"==(#define DATA_ONE 1.0d )==""\n"
104R"==(#define DEF_ACC_DATA_T double )==""\n"
105R"==(#define DEF_ACC_DATA2_T double2 )==""\n"
106R"==(#define DEF_ACC_DATA4_T double4 )==""\n"
107R"==(#define DEF_ACC_DATA8_T double8 )==""\n"
108R"==(#define POST_OP_DATA_T double )==""\n"
109R"==(#define TO_DATA_T(v) (double)(v) )==""\n"
110R"==(#define TO_DEF_ACC_DATA_T(v) (double)(v) )==""\n"
111R"==(#define DATA_TO_REF convert_float )==""\n"
112R"==(#define CONVERT_DATA_T convert_double )==""\n"
113R"==(#define CONVERT_DATA2_T convert_double2 )==""\n"
114R"==(#define CONVERT_DATA4_T convert_double4 )==""\n"
115R"==(#define CONVERT_DATA8_T convert_double8 )==""\n"
116R"==(#define CONVERT_FLOAT_T convert_float )==""\n"
117R"==(#define CONVERT_FLOAT2_T convert_float2 )==""\n"
118R"==(#define CONVERT_FLOAT4_T convert_float4 )==""\n"
119R"==(#define CONVERT_FLOAT8_T convert_float8 )==""\n"
120R"==(#define AS_DATA_T as_double )==""\n"
121R"==(#define AS_DATA2_T as_double2 )==""\n"
122R"==(#define AS_DATA4_T as_double4 )==""\n"
123R"==(#define AS_DATA8_T as_double8 )==""\n"
124R"==(#elif DT_F16 == 1 )==""\n"
125R"==(#define DATA_T half )==""\n"
126R"==(#define DATA2_T half2 )==""\n"
127R"==(#define DATA4_T half4 )==""\n"
128R"==(#define DATA8_T half8 )==""\n"
129R"==(#define DATA16_T half16 )==""\n"
130R"==(#define AS_DATA2_T as_half2 )==""\n"
131R"==(#define DATA_MAX HALF_MAX )==""\n"
132R"==(#define DATA_MIN -DATA_MAX )==""\n"
133R"==(#define DATA_ZERO 0.0h )==""\n"
134R"==(#define DATA_ONE 1.0h )==""\n"
135R"==(#define DEF_ACC_DATA_T float )==""\n"
136R"==(#define DEF_ACC_DATA2_T float2 )==""\n"
137R"==(#define DEF_ACC_DATA4_T float4 )==""\n"
138R"==(#define DEF_ACC_DATA8_T float8 )==""\n"
139R"==(#define POST_OP_DATA_T float )==""\n"
140R"==(#define TO_DATA_T convert_half )==""\n"
141R"==(#define TO_DEF_ACC_DATA_T convert_float )==""\n"
142R"==(#define TO_DEF_ACC_DATA2_T convert_float2 )==""\n"
143R"==(#define TO_DEF_ACC_DATA4_T convert_float4 )==""\n"
144R"==(#define TO_DEF_ACC_DATA8_T convert_float8 )==""\n"
145R"==(#define DATA_TO_REF convert_half )==""\n"
146R"==(#define CONVERT_DATA_T convert_half )==""\n"
147R"==(#define CONVERT_DATA2_T convert_half2 )==""\n"
148R"==(#define CONVERT_DATA4_T convert_half4 )==""\n"
149R"==(#define CONVERT_DATA8_T convert_half8 )==""\n"
150R"==(#define CONVERT_FLOAT_T convert_float )==""\n"
151R"==(#define CONVERT_FLOAT2_T convert_float2 )==""\n"
152R"==(#define CONVERT_FLOAT4_T convert_float4 )==""\n"
153R"==(#define CONVERT_FLOAT8_T convert_float8 )==""\n"
154R"==(#define BLOCK_READ intel_sub_group_block_read_us )==""\n"
155R"==(#define BLOCK_WRITE intel_sub_group_block_write_us )==""\n"
156R"==(#define BLOCK_READ2 intel_sub_group_block_read_us2 )==""\n"
157R"==(#define BLOCK_READ4 intel_sub_group_block_read_us4 )==""\n"
158R"==(#define BLOCK_READ8 intel_sub_group_block_read_us8 )==""\n"
159R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_us2 )==""\n"
160R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_us4 )==""\n"
161R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_us8 )==""\n"
162R"==(#define AS_DATA_T as_half )==""\n"
163R"==(#define AS_DATA2_T as_half2 )==""\n"
164R"==(#define AS_DATA4_T as_half4 )==""\n"
165R"==(#define AS_DATA8_T as_half8 )==""\n"
166R"==(#define AS_UINT_T as_ushort )==""\n"
167R"==(#define AS_UINT2_T as_ushort2 )==""\n"
168R"==(#define AS_UINT4_T as_ushort4 )==""\n"
169R"==(#define AS_UINT8_T as_ushort8 )==""\n"
170R"==(#define BLOCK_DATA_T ushort )==""\n"
171R"==(#define BLOCK_DATA2_T ushort2 )==""\n"
172R"==(#define BLOCK_DATA4_T ushort4 )==""\n"
173R"==(#define BLOCK_DATA8_T ushort8 )==""\n"
174R"==(#define AS_BLOCK_DATA_T as_ushort )==""\n"
175R"==(#define AS_BLOCK_DATA2_T as_ushort2 )==""\n"
176R"==(#define AS_BLOCK_DATA4_T as_ushort4 )==""\n"
177R"==(#define AS_BLOCK_DATA8_T as_ushort8 )==""\n"
178R"==(#define MMAD_DATA_T uint )==""\n"
179R"==(#define MMAD_DATA4_T uint4 )==""\n"
180R"==(#define MMAD_DATA8_T uint8 )==""\n"
181R"==(#define MMAD_ACC_DATA4_T float4 )==""\n"
182R"==(#define MMAD_ACC_DATA8_T float8 )==""\n"
183R"==(#elif DT_BF16 == 1 )==""\n"
184R"==(#define DATA_T ushort )==""\n"
185R"==(#define DATA2_T ushort2 )==""\n"
186R"==(#define POST_OP_DATA_T float )==""\n"
187R"==(#define DATA2_T ushort2 )==""\n"
188R"==(#define DATA4_T ushort4 )==""\n"
189R"==(#define DATA8_T ushort8 )==""\n"
190R"==(#define DATA16_T ushort16 )==""\n"
191R"==(#define DATA_MAX (ushort)0x7F7F )==""\n"
192R"==(#define DATA_MIN (ushort)0xFF7F )==""\n"
193R"==(#define DATA_ZERO (ushort)0x0000 )==""\n"
194R"==(#define DATA_ONE (ushort)0x3F80 )==""\n"
195R"==(#define DEF_ACC_DATA_T float )==""\n"
196R"==(#define DEF_ACC_DATA2_T float2 )==""\n"
197R"==(#define DEF_ACC_DATA4_T float4 )==""\n"
198R"==(#define DEF_ACC_DATA8_T float8 )==""\n"
199R"==(#define TO_DATA_T cvt_f32_to_bf16 )==""\n"
200R"==(#define TO_DEF_ACC_DATA_T cvt_bf16_to_f32 )==""\n"
201R"==(#define TO_DEF_ACC_DATA2_T cvt_bf16_to_f32 )==""\n"
202R"==(#define TO_DEF_ACC_DATA4_T cvt_bf16_to_f32 )==""\n"
203R"==(#define TO_DEF_ACC_DATA8_T cvt_bf16_to_f32 )==""\n"
204R"==(#define DATA_TO_REF cvt_bf16_to_f32 )==""\n"
205R"==(#define CONVERT_DATA_T(v) cvt_f32_to_bf16(convert_float(v)) )==""\n"
206R"==(#define CONVERT_DATA2_T(v) cvt_f32_to_bf16(convert_float2(v)) )==""\n"
207R"==(#define CONVERT_DATA4_T(v) cvt_f32_to_bf16(convert_float4(v)) )==""\n"
208R"==(#define CONVERT_DATA8_T(v) cvt_f32_to_bf16(convert_float8(v)) )==""\n"
209R"==(#define CONVERT_FLOAT_T cvt_bf16_to_f32 )==""\n"
210R"==(#define CONVERT_FLOAT2_T cvt_bf16_to_f32 )==""\n"
211R"==(#define CONVERT_FLOAT4_T cvt_bf16_to_f32 )==""\n"
212R"==(#define CONVERT_FLOAT8_T cvt_bf16_to_f32 )==""\n"
213R"==(#define BLOCK_READ intel_sub_group_block_read_us )==""\n"
214R"==(#define BLOCK_WRITE intel_sub_group_block_write_us )==""\n"
215R"==(#define BLOCK_READ2 intel_sub_group_block_read_us2 )==""\n"
216R"==(#define BLOCK_READ4 intel_sub_group_block_read_us4 )==""\n"
217R"==(#define BLOCK_READ8 intel_sub_group_block_read_us8 )==""\n"
218R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_us2 )==""\n"
219R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_us4 )==""\n"
220R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_us8 )==""\n"
221R"==(#define AS_DATA_T as_ushort )==""\n"
222R"==(#define AS_DATA2_T as_ushort2 )==""\n"
223R"==(#define AS_DATA4_T as_ushort4 )==""\n"
224R"==(#define AS_DATA8_T as_ushort8 )==""\n"
225R"==(#define AS_UINT_T as_ushort )==""\n"
226R"==(#define AS_UINT2_T as_ushort2 )==""\n"
227R"==(#define AS_UINT4_T as_ushort4 )==""\n"
228R"==(#define AS_UINT8_T as_ushort8 )==""\n"
229R"==(#define BLOCK_DATA_T ushort )==""\n"
230R"==(#define BLOCK_DATA2_T ushort2 )==""\n"
231R"==(#define BLOCK_DATA4_T ushort4 )==""\n"
232R"==(#define BLOCK_DATA8_T ushort8 )==""\n"
233R"==(#define AS_BLOCK_DATA_T as_ushort )==""\n"
234R"==(#define AS_BLOCK_DATA2_T as_ushort2 )==""\n"
235R"==(#define AS_BLOCK_DATA4_T as_ushort4 )==""\n"
236R"==(#define AS_BLOCK_DATA8_T as_ushort8 )==""\n"
237R"==(#define MMAD_DATA_T uint )==""\n"
238R"==(#define MMAD_DATA4_T uint4 )==""\n"
239R"==(#define MMAD_DATA8_T uint8 )==""\n"
240R"==(#define MMAD_ACC_DATA4_T float4 )==""\n"
241R"==(#define MMAD_ACC_DATA8_T float8 )==""\n"
242R"==(#elif DT_S8 == 1 )==""\n"
243R"==(#define DATA_T char )==""\n"
244R"==(#define DATA2_T char2 )==""\n"
245R"==(#define DATA4_T char4 )==""\n"
246R"==(#define DATA8_T char8 )==""\n"
247R"==(#define DATA16_T char16 )==""\n"
248R"==(#define DATA_MAX CHAR_MAX )==""\n"
249R"==(#define DATA_MIN CHAR_MIN )==""\n"
250R"==(#define DATA_ZERO 0 )==""\n"
251R"==(#define DATA_ONE 1 )==""\n"
252R"==(#define INT8_T int8 )==""\n"
253R"==(#define DEF_ACC_DATA_T int )==""\n"
254R"==(#define DEF_ACC_DATA2_T int2 )==""\n"
255R"==(#define DEF_ACC_DATA4_T int4 )==""\n"
256R"==(#define DEF_ACC_DATA8_T int8 )==""\n"
257R"==(#define POST_OP_DATA_T float )==""\n"
258R"==(#define TO_DATA_T(v) convert_char_sat_rte(v) )==""\n"
259R"==(#define TO_DEF_ACC_DATA_T convert_int_sat_rte )==""\n"
260R"==(#define TO_DEF_ACC_DATA2_T convert_int2_sat_rte )==""\n"
261R"==(#define TO_DEF_ACC_DATA4_T convert_int4_sat_rte )==""\n"
262R"==(#define TO_DEF_ACC_DATA8_T convert_int8_sat_rte )==""\n"
263R"==(#define DATA_TO_REF convert_float )==""\n"
264R"==(#define CONVERT_DATA_T convert_char_sat_rte )==""\n"
265R"==(#define CONVERT_DATA2_T convert_char2_sat_rte )==""\n"
266R"==(#define CONVERT_DATA4_T convert_char4_sat_rte )==""\n"
267R"==(#define CONVERT_DATA8_T convert_char8_sat_rte )==""\n"
268R"==(#define CONVERT_FLOAT_T convert_float )==""\n"
269R"==(#define CONVERT_FLOAT2_T convert_float2 )==""\n"
270R"==(#define CONVERT_FLOAT4_T convert_float4 )==""\n"
271R"==(#define CONVERT_FLOAT8_T convert_float8 )==""\n"
272R"==(#define BLOCK_READ intel_sub_group_block_read_uc )==""\n"
273R"==(#define BLOCK_WRITE intel_sub_group_block_write_uc )==""\n"
274R"==(#define BLOCK_READ2 intel_sub_group_block_read_uc2 )==""\n"
275R"==(#define BLOCK_READ4 intel_sub_group_block_read_uc4 )==""\n"
276R"==(#define BLOCK_READ8 intel_sub_group_block_read_uc8 )==""\n"
277R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_uc2 )==""\n"
278R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_uc4 )==""\n"
279R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_uc8 )==""\n"
280R"==(#define AS_DATA_T as_char )==""\n"
281R"==(#define AS_DATA2_T as_char2 )==""\n"
282R"==(#define AS_DATA4_T as_char4 )==""\n"
283R"==(#define AS_DATA8_T as_char8 )==""\n"
284R"==(#define AS_DATA16_T as_char16 )==""\n"
285R"==(#define AS_UINT_T as_uchar )==""\n"
286R"==(#define AS_UINT2_T as_uchar2 )==""\n"
287R"==(#define AS_UINT4_T as_uchar4 )==""\n"
288R"==(#define AS_UINT8_T as_uchar8 )==""\n"
289R"==(#define AS_INT8_T as_int8 )==""\n"
290R"==(#define BLOCK_DATA_T uchar )==""\n"
291R"==(#define BLOCK_DATA2_T uchar2 )==""\n"
292R"==(#define BLOCK_DATA4_T uchar4 )==""\n"
293R"==(#define BLOCK_DATA8_T uchar8 )==""\n"
294R"==(#define AS_BLOCK_DATA_T as_uchar )==""\n"
295R"==(#define AS_BLOCK_DATA2_T as_uchar2 )==""\n"
296R"==(#define AS_BLOCK_DATA4_T as_uchar4 )==""\n"
297R"==(#define AS_BLOCK_DATA8_T as_uchar8 )==""\n"
298R"==(#define MMAD_DATA_T int )==""\n"
299R"==(#define MMAD_DATA4_T int4 )==""\n"
300R"==(#define MMAD_DATA8_T int8 )==""\n"
301R"==(#define MMAD_ACC_DATA4_T int4 )==""\n"
302R"==(#define MMAD_ACC_DATA8_T int8 )==""\n"
303R"==(#elif DT_U8 == 1 )==""\n"
304R"==(#define DATA_T uchar )==""\n"
305R"==(#define DATA2_T uchar2 )==""\n"
306R"==(#define DATA4_T uchar4 )==""\n"
307R"==(#define DATA8_T uchar8 )==""\n"
308R"==(#define DATA16_T uchar16 )==""\n"
309R"==(#define DATA_MAX UCHAR_MAX )==""\n"
310R"==(#define DATA_MIN 0 )==""\n"
311R"==(#define DATA_ZERO 0 )==""\n"
312R"==(#define DATA_ONE 1 )==""\n"
313R"==(#define INT8_T uint8 )==""\n"
314R"==(#define DEF_ACC_DATA_T int )==""\n"
315R"==(#define DEF_ACC_DATA2_T int2 )==""\n"
316R"==(#define DEF_ACC_DATA4_T int4 )==""\n"
317R"==(#define DEF_ACC_DATA8_T int8 )==""\n"
318R"==(#define POST_OP_DATA_T float )==""\n"
319R"==(#define TO_DATA_T(v) convert_uchar_sat_rte(v) )==""\n"
320R"==(#define TO_DEF_ACC_DATA_T convert_int_sat_rte )==""\n"
321R"==(#define TO_DEF_ACC_DATA2_T convert_int2_sat_rte )==""\n"
322R"==(#define TO_DEF_ACC_DATA4_T convert_int4_sat_rte )==""\n"
323R"==(#define TO_DEF_ACC_DATA8_T convert_int8_sat_rte )==""\n"
324R"==(#define DATA_TO_REF convert_float )==""\n"
325R"==(#define CONVERT_DATA_T convert_uchar_sat_rte )==""\n"
326R"==(#define CONVERT_DATA2_T convert_uchar2_sat_rte )==""\n"
327R"==(#define CONVERT_DATA4_T convert_uchar4_sat_rte )==""\n"
328R"==(#define CONVERT_DATA8_T convert_uchar8_sat_rte )==""\n"
329R"==(#define CONVERT_FLOAT_T convert_float )==""\n"
330R"==(#define CONVERT_FLOAT2_T convert_float2 )==""\n"
331R"==(#define CONVERT_FLOAT4_T convert_float4 )==""\n"
332R"==(#define CONVERT_FLOAT8_T convert_float8 )==""\n"
333R"==(#define BLOCK_READ intel_sub_group_block_read_uc )==""\n"
334R"==(#define BLOCK_WRITE intel_sub_group_block_write_uc )==""\n"
335R"==(#define BLOCK_READ2 intel_sub_group_block_read_uc2 )==""\n"
336R"==(#define BLOCK_READ4 intel_sub_group_block_read_uc4 )==""\n"
337R"==(#define BLOCK_READ8 intel_sub_group_block_read_uc8 )==""\n"
338R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_uc2 )==""\n"
339R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_uc4 )==""\n"
340R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_uc8 )==""\n"
341R"==(#define AS_DATA_T as_uchar )==""\n"
342R"==(#define AS_DATA2_T as_uchar2 )==""\n"
343R"==(#define AS_DATA4_T as_uchar4 )==""\n"
344R"==(#define AS_DATA8_T as_uchar8 )==""\n"
345R"==(#define AS_DATA16_T as_uchar16 )==""\n"
346R"==(#define AS_UINT_T as_uchar )==""\n"
347R"==(#define AS_UINT2_T as_uchar2 )==""\n"
348R"==(#define AS_UINT4_T as_uchar4 )==""\n"
349R"==(#define AS_UINT8_T as_uchar8 )==""\n"
350R"==(#define AS_INT8_T as_uint8 )==""\n"
351R"==(#define BLOCK_DATA_T uchar )==""\n"
352R"==(#define BLOCK_DATA2_T uchar2 )==""\n"
353R"==(#define BLOCK_DATA4_T uchar4 )==""\n"
354R"==(#define BLOCK_DATA8_T uchar8 )==""\n"
355R"==(#define AS_BLOCK_DATA_T as_uchar )==""\n"
356R"==(#define AS_BLOCK_DATA2_T as_uchar2 )==""\n"
357R"==(#define AS_BLOCK_DATA4_T as_uchar4 )==""\n"
358R"==(#define AS_BLOCK_DATA8_T as_uchar8 )==""\n"
359R"==(#define MMAD_DATA_T uint )==""\n"
360R"==(#define MMAD_DATA4_T uint4 )==""\n"
361R"==(#define MMAD_DATA8_T uint8 )==""\n"
362R"==(#define MMAD_ACC_DATA4_T int4 )==""\n"
363R"==(#define MMAD_ACC_DATA8_T int8 )==""\n"
364R"==(#elif DT_S32 == 1 )==""\n"
365R"==(#define MMAD_DATA_T uint )==""\n"
366R"==(#define MMAD_DATA4_T uint4 )==""\n"
367R"==(#define MMAD_DATA8_T uint8 )==""\n"
368R"==(#define DATA_T int )==""\n"
369R"==(#define DATA2_T int2 )==""\n"
370R"==(#define DATA4_T int4 )==""\n"
371R"==(#define DATA8_T int8 )==""\n"
372R"==(#define DATA16_T int16 )==""\n"
373R"==(#define DATA_MAX INT_MAX )==""\n"
374R"==(#define DATA_MIN INT_MIN )==""\n"
375R"==(#define DATA_ZERO 0 )==""\n"
376R"==(#define DATA_ONE 1 )==""\n"
377R"==(#define DATA_TO_REF convert_float )==""\n"
378R"==(#define TO_DATA_T(v) convert_int_sat_rte )==""\n"
379R"==(#define TO_DATA8_T(v) convert_int8_sat_rte )==""\n"
380R"==(#define CONVERT_DATA_T convert_int_sat_rte )==""\n"
381R"==(#define CONVERT_DATA2_T convert_int2_sat_rte )==""\n"
382R"==(#define CONVERT_DATA4_T convert_int4_sat_rte )==""\n"
383R"==(#define CONVERT_DATA8_T convert_int8_sat_rte )==""\n"
384R"==(#define CONVERT_FLOAT_T convert_float )==""\n"
385R"==(#define CONVERT_FLOAT2_T convert_float2 )==""\n"
386R"==(#define CONVERT_FLOAT4_T convert_float4 )==""\n"
387R"==(#define CONVERT_FLOAT8_T convert_float8 )==""\n"
388R"==(#define DEF_ACC_DATA_T int )==""\n"
389R"==(#define DEF_ACC_DATA2_T int2 )==""\n"
390R"==(#define DEF_ACC_DATA4_T int4 )==""\n"
391R"==(#define DEF_ACC_DATA8_T int8 )==""\n"
392R"==(#define TO_DATA_T(v) convert_int_sat_rte(v) )==""\n"
393R"==(#define TO_DEF_ACC_DATA_T convert_int_sat_rte )==""\n"
394R"==(#define TO_DEF_ACC_DATA2_T convert_int2_sat_rte )==""\n"
395R"==(#define TO_DEF_ACC_DATA4_T convert_int4_sat_rte )==""\n"
396R"==(#define TO_DEF_ACC_DATA8_T convert_int8_sat_rte )==""\n"
397R"==(#define POST_OP_DATA_T float )==""\n"
398R"==(#define DATA_MIN INT_MIN )==""\n"
399R"==(#define DATA_MAX INT_MAX )==""\n"
400R"==(#define DATA_ZERO 0 )==""\n"
401R"==(#define ROUND )==""\n"
402R"==(#define BLOCK_READ intel_sub_group_block_read )==""\n"
403R"==(#define BLOCK_WRITE intel_sub_group_block_write )==""\n"
404R"==(#define BLOCK_READ2 intel_sub_group_block_read2 )==""\n"
405R"==(#define BLOCK_READ4 intel_sub_group_block_read4 )==""\n"
406R"==(#define BLOCK_READ8 intel_sub_group_block_read8 )==""\n"
407R"==(#define BLOCK_WRITE2 intel_sub_group_block_write2 )==""\n"
408R"==(#define BLOCK_WRITE4 intel_sub_group_block_write4 )==""\n"
409R"==(#define BLOCK_WRITE8 intel_sub_group_block_write8 )==""\n"
410R"==(#define AS_DATA_T as_int )==""\n"
411R"==(#define AS_DATA2_T as_int2 )==""\n"
412R"==(#define AS_DATA4_T as_int4 )==""\n"
413R"==(#define AS_DATA8_T as_int8 )==""\n"
414R"==(#define AS_UINT_T as_uint )==""\n"
415R"==(#define AS_UINT2_T as_uint2 )==""\n"
416R"==(#define AS_UINT4_T as_uint4 )==""\n"
417R"==(#define AS_UINT8_T as_uint8 )==""\n"
418R"==(#define BLOCK_DATA_T uint )==""\n"
419R"==(#define BLOCK_DATA2_T uint2 )==""\n"
420R"==(#define BLOCK_DATA4_T uint4 )==""\n"
421R"==(#define BLOCK_DATA8_T uint8 )==""\n"
422R"==(#define AS_BLOCK_DATA_T as_uint )==""\n"
423R"==(#define AS_BLOCK_DATA2_T as_uint2 )==""\n"
424R"==(#define AS_BLOCK_DATA4_T as_uint4 )==""\n"
425R"==(#define AS_BLOCK_DATA8_T as_uint8 )==""\n"
426R"==(#elif !defined(DT_UNDEF) )==""\n"
427R"==(#error "Unexpected data type" )==""\n"
428R"==(#endif )==""\n"
429R"==(#if VECT_DT_N == 1 )==""\n"
430R"==(#define VECT_DATA_T DATA_T )==""\n"
431R"==(#define VECT_DEF_ACC_DATA_T DEF_ACC_DATA_T )==""\n"
432R"==(#define AS_VECT_DEF_ACC_DATA_T TO_DEF_ACC_DATA_T )==""\n"
433R"==(#define AS_VECT_DATA_T AS_DATA_T )==""\n"
434R"==(#define VECT_BLOCK_READ BLOCK_READ )==""\n"
435R"==(#define VECT_BLOCK_WRITE BLOCK_WRITE )==""\n"
436R"==(#define VECT_UINT_READ intel_sub_group_block_read )==""\n"
437R"==(#define VECT_UINT_WRITE intel_sub_group_block_write )==""\n"
438R"==(#define VECT_UCHAR_READ intel_sub_group_block_read_uc )==""\n"
439R"==(#define VECT_UCHAR_WRITE intel_sub_group_block_write_uc )==""\n"
440R"==(#define VECT_BLOCK_DATA_T BLOCK_DATA_T )==""\n"
441R"==(#define AS_VECT_BLOCK_DATA_T AS_BLOCK_DATA_T )==""\n"
442R"==(#define CONVERT_VECT_FLOAT_T CONVERT_FLOAT_T )==""\n"
443R"==(#define CONVERT_VECTOR_DATA_T CONVERT_DATA_T )==""\n"
444R"==(#define CONVERT_VECT_CHAR_T convert_char )==""\n"
445R"==(#define CONVERT_VECT_INT_T convert_int )==""\n"
446R"==(#define VECT_INT_T int )==""\n"
447R"==(#define VECT_UINT_T uint )==""\n"
448R"==(#define VECT_FLOAT_T float )==""\n"
449R"==(#define VECT_CHAR_T char )==""\n"
450R"==(#define AS_VECT_INT_T as_int )==""\n"
451R"==(#define AS_VECT_UINT_T as_uint )==""\n"
452R"==(#define AS_VECT_FLOAT_T as_float )==""\n"
453R"==(#define AS_VECT_CHAR_T as_char )==""\n"
454R"==(#define AS_VECT_UCHAR_T as_uchar )==""\n"
455R"==(#elif VECT_DT_N == 2 )==""\n"
456R"==(#define VECT_DATA_T DATA2_T )==""\n"
457R"==(#define VECT_DEF_ACC_DATA_T DEF_ACC_DATA2_T )==""\n"
458R"==(#define AS_VECT_DEF_ACC_DATA_T TO_DEF_ACC_DATA2_T )==""\n"
459R"==(#define AS_VECT_DATA_T AS_DATA2_T )==""\n"
460R"==(#define VECT_BLOCK_READ BLOCK_READ2 )==""\n"
461R"==(#define VECT_BLOCK_WRITE BLOCK_WRITE2 )==""\n"
462R"==(#define VECT_UINT_READ intel_sub_group_block_read2 )==""\n"
463R"==(#define VECT_UINT_WRITE intel_sub_group_block_write2 )==""\n"
464R"==(#define VECT_UCHAR_READ intel_sub_group_block_read_uc2 )==""\n"
465R"==(#define VECT_UCHAR_WRITE intel_sub_group_block_write_uc2 )==""\n"
466R"==(#define VECT_BLOCK_DATA_T BLOCK_DATA2_T )==""\n"
467R"==(#define AS_VECT_BLOCK_DATA_T AS_BLOCK_DATA2_T )==""\n"
468R"==(#define CONVERT_VECT_FLOAT_T CONVERT_FLOAT2_T )==""\n"
469R"==(#define CONVERT_VECTOR_DATA_T CONVERT_DATA2_T )==""\n"
470R"==(#define CONVERT_VECT_CHAR_T convert_char2 )==""\n"
471R"==(#define CONVERT_VECT_INT_T convert_int2 )==""\n"
472R"==(#define VECT_INT_T int2 )==""\n"
473R"==(#define VECT_UINT_T uint2 )==""\n"
474R"==(#define VECT_FLOAT_T float2 )==""\n"
475R"==(#define VECT_CHAR_T char2 )==""\n"
476R"==(#define AS_VECT_INT_T as_int2 )==""\n"
477R"==(#define AS_VECT_UINT_T as_uint2 )==""\n"
478R"==(#define AS_VECT_FLOAT_T as_float2 )==""\n"
479R"==(#define AS_VECT_CHAR_T as_char2 )==""\n"
480R"==(#define AS_VECT_UCHAR_T as_uchar2 )==""\n"
481R"==(#elif VECT_DT_N == 4 )==""\n"
482R"==(#define VECT_DATA_T DATA4_T )==""\n"
483R"==(#define VECT_DEF_ACC_DATA_T DEF_ACC_DATA4_T )==""\n"
484R"==(#define AS_VECT_DEF_ACC_DATA_T TO_DEF_ACC_DATA4_T )==""\n"
485R"==(#define AS_VECT_DATA_T AS_DATA4_T )==""\n"
486R"==(#define VECT_BLOCK_READ BLOCK_READ4 )==""\n"
487R"==(#define VECT_BLOCK_WRITE BLOCK_WRITE4 )==""\n"
488R"==(#define VECT_UINT_READ intel_sub_group_block_read4 )==""\n"
489R"==(#define VECT_UINT_WRITE intel_sub_group_block_write4 )==""\n"
490R"==(#define VECT_UCHAR_READ intel_sub_group_block_read_uc4 )==""\n"
491R"==(#define VECT_UCHAR_WRITE intel_sub_group_block_write_uc4 )==""\n"
492R"==(#define VECT_BLOCK_DATA_T BLOCK_DATA4_T )==""\n"
493R"==(#define AS_VECT_BLOCK_DATA_T AS_BLOCK_DATA4_T )==""\n"
494R"==(#define CONVERT_VECT_FLOAT_T CONVERT_FLOAT4_T )==""\n"
495R"==(#define CONVERT_VECTOR_DATA_T CONVERT_DATA4_T )==""\n"
496R"==(#define CONVERT_VECT_CHAR_T convert_char4 )==""\n"
497R"==(#define CONVERT_VECT_INT_T convert_int4 )==""\n"
498R"==(#define VECT_INT_T int4 )==""\n"
499R"==(#define VECT_UINT_T uint4 )==""\n"
500R"==(#define VECT_FLOAT_T float4 )==""\n"
501R"==(#define VECT_CHAR_T char4 )==""\n"
502R"==(#define AS_VECT_INT_T as_int4 )==""\n"
503R"==(#define AS_VECT_UINT_T as_uint4 )==""\n"
504R"==(#define AS_VECT_FLOAT_T as_float4 )==""\n"
505R"==(#define AS_VECT_CHAR_T as_char4 )==""\n"
506R"==(#define AS_VECT_UCHAR_T as_uchar4 )==""\n"
507R"==(#elif VECT_DT_N == 8 )==""\n"
508R"==(#define VECT_DATA_T DATA8_T )==""\n"
509R"==(#define VECT_DEF_ACC_DATA_T DEF_ACC_DATA8_T )==""\n"
510R"==(#define AS_VECT_DEF_ACC_DATA_T TO_DEF_ACC_DATA8_T )==""\n"
511R"==(#define AS_VECT_DATA_T AS_DATA8_T )==""\n"
512R"==(#define VECT_BLOCK_READ BLOCK_READ8 )==""\n"
513R"==(#define VECT_BLOCK_WRITE BLOCK_WRITE8 )==""\n"
514R"==(#define VECT_UINT_READ intel_sub_group_block_read8 )==""\n"
515R"==(#define VECT_UINT_WRITE intel_sub_group_block_write8 )==""\n"
516R"==(#define VECT_UCHAR_READ intel_sub_group_block_read_uc8 )==""\n"
517R"==(#define VECT_UCHAR_WRITE intel_sub_group_block_write_uc8 )==""\n"
518R"==(#define VECT_BLOCK_DATA_T BLOCK_DATA8_T )==""\n"
519R"==(#define AS_VECT_BLOCK_DATA_T AS_BLOCK_DATA8_T )==""\n"
520R"==(#define CONVERT_VECT_FLOAT_T CONVERT_FLOAT8_T )==""\n"
521R"==(#define CONVERT_VECTOR_DATA_T CONVERT_DATA8_T )==""\n"
522R"==(#define CONVERT_VECT_CHAR_T convert_char8 )==""\n"
523R"==(#define CONVERT_VECT_INT_T convert_int8 )==""\n"
524R"==(#define VECT_INT_T int8 )==""\n"
525R"==(#define VECT_UINT_T uint8 )==""\n"
526R"==(#define VECT_FLOAT_T float8 )==""\n"
527R"==(#define VECT_CHAR_T char8 )==""\n"
528R"==(#define AS_VECT_INT_T as_int8 )==""\n"
529R"==(#define AS_VECT_UINT_T as_uint8 )==""\n"
530R"==(#define AS_VECT_FLOAT_T as_float8 )==""\n"
531R"==(#define AS_VECT_CHAR_T as_char8 )==""\n"
532R"==(#define AS_VECT_UCHAR_T as_uchar8 )==""\n"
533R"==(#endif )==""\n"
534R"==(#define AS_MMAD_DATA_T CONCAT2(as_, MMAD_DATA_T) )==""\n"
535R"==(#define AS_MMAD_DATA4_T CONCAT2(as_, MMAD_DATA4_T) )==""\n"
536R"==(#define AS_MMAD_DATA8_T CONCAT2(as_, MMAD_DATA8_T) )==""\n"
537R"==(#ifdef SRC_DATA_T )==""\n"
538R"==(#define SRC_DATA2_T CONCAT2(SRC_DATA_T, 2) )==""\n"
539R"==(#define SRC_DATA4_T CONCAT2(SRC_DATA_T, 4) )==""\n"
540R"==(#define SRC_DATA8_T CONCAT2(SRC_DATA_T, 8) )==""\n"
541R"==(#define SRC_DATA16_T CONCAT2(SRC_DATA_T, 16) )==""\n"
542R"==(#ifdef SRC_DT_U8 )==""\n"
543R"==(#define SRC_MMAD_DATA_T uint )==""\n"
544R"==(#define SRC_MMAD_DATA4_T uint4 )==""\n"
545R"==(#define SRC_MMAD_DATA8_T uint8 )==""\n"
546R"==(#elif SRC_DT_S8 )==""\n"
547R"==(#define SRC_MMAD_DATA_T int )==""\n"
548R"==(#define SRC_MMAD_DATA4_T int4 )==""\n"
549R"==(#define SRC_MMAD_DATA8_T int8 )==""\n"
550R"==(#elif SRC_DT_F16 || SRC_DT_BF16 )==""\n"
551R"==(#define SRC_MMAD_DATA_T uint )==""\n"
552R"==(#define SRC_MMAD_DATA4_T uint4 )==""\n"
553R"==(#define SRC_MMAD_DATA8_T uint8 )==""\n"
554R"==(#endif )==""\n"
555R"==(#if defined(SRC_DT_U8) || defined(SRC_DT_S8) )==""\n"
556R"==(#define SRC_MMAD_ACC_DATA4_T int4 )==""\n"
557R"==(#define SRC_MMAD_ACC_DATA8_T int8 )==""\n"
558R"==(#else )==""\n"
559R"==(#define SRC_MMAD_ACC_DATA4_T float4 )==""\n"
560R"==(#define SRC_MMAD_ACC_DATA8_T float8 )==""\n"
561R"==(#endif )==""\n"
562R"==(#define AS_SRC_DATA2_T CONCAT2(as_, SRC_DATA2_T) )==""\n"
563R"==(#define AS_SRC_DATA4_T CONCAT2(as_, SRC_DATA4_T) )==""\n"
564R"==(#define AS_SRC_DATA8_T CONCAT2(as_, SRC_DATA8_T) )==""\n"
565R"==(#define AS_SRC_DATA16_T CONCAT2(as_, SRC_DATA16_T) )==""\n"
566R"==(#define AS_SRC_MMAD_DATA_T CONCAT2(as_, SRC_MMAD_DATA_T) )==""\n"
567R"==(#define AS_SRC_MMAD_DATA4_T CONCAT2(as_, SRC_MMAD_DATA4_T) )==""\n"
568R"==(#define AS_SRC_MMAD_DATA8_T CONCAT2(as_, SRC_MMAD_DATA8_T) )==""\n"
569R"==(#if SRC_DT_BF16 )==""\n"
570R"==(#define SRC_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
571R"==(#define SRC_TO_REF8(x) cvt_bf16_to_f32(x) )==""\n"
572R"==(#define REF_TO_SRC(x) cvt_f32_to_bf16(x) )==""\n"
573R"==(#else )==""\n"
574R"==(#define SRC_TO_REF(x) (x) )==""\n"
575R"==(#define SRC_TO_REF8(x) (x) )==""\n"
576R"==(#define REF_TO_SRC(x) (x) )==""\n"
577R"==(#endif )==""\n"
578R"==(#if SRC_DT_BF16 )==""\n"
579R"==(#define TO_SRC(x) cvt_f32_to_bf16(x) )==""\n"
580R"==(#elif SRC_DT_U8 )==""\n"
581R"==(#define TO_SRC(x) convert_uchar_sat_rte(x) )==""\n"
582R"==(#elif SRC_DT_S8 )==""\n"
583R"==(#define TO_SRC(x) convert_char_sat_rte(x) )==""\n"
584R"==(#elif SRC_DT_S32 )==""\n"
585R"==(#define TO_SRC(x) convert_int_sat_rte(x) )==""\n"
586R"==(#else )==""\n"
587R"==(#define TO_SRC(x) (x) )==""\n"
588R"==(#endif )==""\n"
589R"==(#endif )==""\n"
590R"==(#ifdef A_DATA_T )==""\n"
591R"==(#define A_DATA8_T CONCAT2(A_DATA_T, 8) )==""\n"
592R"==(#if A_DT_BF16 )==""\n"
593R"==(#define A_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
594R"==(#define A_TO_REF8(x) cvt_bf16_to_f32(x) )==""\n"
595R"==(#define REF_TO_A(x) cvt_f32_to_bf16(x) )==""\n"
596R"==(#else )==""\n"
597R"==(#define A_TO_REF(x) (x) )==""\n"
598R"==(#define A_TO_REF8(x) (x) )==""\n"
599R"==(#define REF_TO_A(x) (x) )==""\n"
600R"==(#endif )==""\n"
601R"==(#if A_DT_BF16 )==""\n"
602R"==(#define TO_A(x) cvt_f32_to_bf16(x) )==""\n"
603R"==(#elif A_DT_U8 )==""\n"
604R"==(#define TO_A(x) convert_uchar_sat_rte(x) )==""\n"
605R"==(#elif A_DT_S8 )==""\n"
606R"==(#define TO_A(x) convert_char_sat_rte(x) )==""\n"
607R"==(#elif A_DT_S32 )==""\n"
608R"==(#define TO_A(x) convert_int_sat_rte(x) )==""\n"
609R"==(#else )==""\n"
610R"==(#define TO_A(x) (x) )==""\n"
611R"==(#endif )==""\n"
612R"==(#endif )==""\n"
613R"==(#ifdef WEI_DATA_T )==""\n"
614R"==(#if WEI_DT_BF16 )==""\n"
615R"==(#define WEI_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
616R"==(#define REF_TO_WEI(x) cvt_f32_to_bf16(x) )==""\n"
617R"==(#else )==""\n"
618R"==(#define WEI_TO_REF(x) (x) )==""\n"
619R"==(#define REF_TO_WEI(x) (x) )==""\n"
620R"==(#endif )==""\n"
621R"==(#if WEI_DT_BF16 )==""\n"
622R"==(#define TO_WEI(x) cvt_f32_to_bf16(x) )==""\n"
623R"==(#elif WEI_DT_U8 )==""\n"
624R"==(#define TO_WEI(x) convert_uchar_sat_rte(x) )==""\n"
625R"==(#elif WEI_DT_S8 )==""\n"
626R"==(#define TO_WEI(x) convert_char_sat_rte(x) )==""\n"
627R"==(#elif WEI_DT_S32 )==""\n"
628R"==(#define TO_WEI(x) convert_int_sat_rte(x) )==""\n"
629R"==(#else )==""\n"
630R"==(#define TO_WEI(x) (x) )==""\n"
631R"==(#endif )==""\n"
632R"==(#endif )==""\n"
633R"==(#ifdef DIFF_WEI_DATA_T )==""\n"
634R"==(#if DIFF_WEI_DT_BF16 )==""\n"
635R"==(#define DIFF_WEI_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
636R"==(#define REF_TO_DIFF_WEI(x) cvt_f32_to_bf16(x) )==""\n"
637R"==(#else )==""\n"
638R"==(#define DIFF_WEI_TO_REF(x) (x) )==""\n"
639R"==(#define REF_TO_DIFF_WEI(x) (x) )==""\n"
640R"==(#endif )==""\n"
641R"==(#if DIFF_WEI_DT_BF16 )==""\n"
642R"==(#define TO_DIFF_WEI(x) cvt_f32_to_bf16(x) )==""\n"
643R"==(#elif DIFF_WEI_DT_U8 )==""\n"
644R"==(#define TO_DIFF_WEI(x) convert_uchar_sat_rte(x) )==""\n"
645R"==(#elif DIFF_WEI_DT_S8 )==""\n"
646R"==(#define TO_DIFF_WEI(x) convert_char_sat_rte(x) )==""\n"
647R"==(#elif DIFF_WEI_DT_S32 )==""\n"
648R"==(#define TO_DIFF_WEI(x) convert_int_sat_rte(x) )==""\n"
649R"==(#else )==""\n"
650R"==(#define TO_DIFF_WEI(x) (x) )==""\n"
651R"==(#endif )==""\n"
652R"==(#endif )==""\n"
653R"==(#ifdef B_DATA_T )==""\n"
654R"==(#if B_DT_BF16 )==""\n"
655R"==(#define B_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
656R"==(#define REF_TO_B(x) cvt_f32_to_bf16(x) )==""\n"
657R"==(#else )==""\n"
658R"==(#define B_TO_REF(x) (x) )==""\n"
659R"==(#define REF_TO_B(x) (x) )==""\n"
660R"==(#endif )==""\n"
661R"==(#if B_DT_BF16 )==""\n"
662R"==(#define TO_B(x) cvt_f32_to_bf16(x) )==""\n"
663R"==(#elif B_DT_U8 )==""\n"
664R"==(#define TO_B(x) convert_uchar_sat_rte(x) )==""\n"
665R"==(#elif B_DT_S8 )==""\n"
666R"==(#define TO_B(x) convert_char_sat_rte(x) )==""\n"
667R"==(#elif B_DT_S32 )==""\n"
668R"==(#define TO_B(x) convert_int_sat_rte(x) )==""\n"
669R"==(#else )==""\n"
670R"==(#define TO_B(x) (x) )==""\n"
671R"==(#endif )==""\n"
672R"==(#endif )==""\n"
673R"==(#ifdef BIA_DATA_T )==""\n"
674R"==(#define BIA_DATA2_T CONCAT2(BIA_DATA_T, 2) )==""\n"
675R"==(#if BIA_DT_BF16 )==""\n"
676R"==(#define BIA_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
677R"==(#define REF_TO_BIA(x) cvt_f32_to_bf16(x) )==""\n"
678R"==(#else )==""\n"
679R"==(#define BIA_TO_REF(x) (x) )==""\n"
680R"==(#define REF_TO_BIA(x) (x) )==""\n"
681R"==(#endif )==""\n"
682R"==(#if BIA_DT_BF16 )==""\n"
683R"==(#define TO_BIA(x) cvt_f32_to_bf16(x) )==""\n"
684R"==(#elif BIA_DT_U8 )==""\n"
685R"==(#define TO_BIA(x) convert_uchar_sat_rte(x) )==""\n"
686R"==(#elif BIA_DT_S8 )==""\n"
687R"==(#define TO_BIA(x) convert_char_sat_rte(x) )==""\n"
688R"==(#elif BIA_DT_S32 )==""\n"
689R"==(#define TO_BIA(x) convert_int_sat_rte(x) )==""\n"
690R"==(#else )==""\n"
691R"==(#define TO_BIA(x) (x) )==""\n"
692R"==(#endif )==""\n"
693R"==(#endif )==""\n"
694R"==(#ifdef DST_DATA_T )==""\n"
695R"==(#define DST_DATA2_T CONCAT2(DST_DATA_T, 2) )==""\n"
696R"==(#define DST_DATA4_T CONCAT2(DST_DATA_T, 4) )==""\n"
697R"==(#define DST_DATA8_T CONCAT2(DST_DATA_T, 8) )==""\n"
698R"==(#define DST_DATA16_T CONCAT2(DST_DATA_T, 16) )==""\n"
699R"==(#define AS_DST_DATA_T CONCAT2(as_, DST_DATA_T) )==""\n"
700R"==(#define AS_DST_DATA2_T CONCAT2(as_, DST_DATA2_T) )==""\n"
701R"==(#define AS_DST_DATA4_T CONCAT2(as_, DST_DATA4_T) )==""\n"
702R"==(#define AS_DST_DATA8_T CONCAT2(as_, DST_DATA8_T) )==""\n"
703R"==(#define AS_DST_DATA16_T CONCAT2(as_, DST_DATA16_T) )==""\n"
704R"==(#if DST_DT_F32 || DST_DT_F16 )==""\n"
705R"==(#define CONVERT_DST_DATA_T CONCAT2(convert_, DST_DATA_T) )==""\n"
706R"==(#define CONVERT_DST_DATA2_T CONCAT2(convert_, DST_DATA2_T) )==""\n"
707R"==(#define CONVERT_DST_DATA4_T CONCAT2(convert_, DST_DATA4_T) )==""\n"
708R"==(#define CONVERT_DST_DATA8_T CONCAT2(convert_, DST_DATA8_T) )==""\n"
709R"==(#define CONVERT_DST_DATA16_T CONCAT2(convert_, DST_DATA16_T) )==""\n"
710R"==(#else )==""\n"
711R"==(#define CONVERT_DST_DATA_T CONCAT3(convert_, DST_DATA_T, _sat_rte) )==""\n"
712R"==(#define CONVERT_DST_DATA2_T CONCAT3(convert_, DST_DATA2_T, _sat_rte) )==""\n"
713R"==(#define CONVERT_DST_DATA4_T CONCAT3(convert_, DST_DATA4_T, _sat_rte) )==""\n"
714R"==(#define CONVERT_DST_DATA8_T CONCAT3(convert_, DST_DATA8_T, _sat_rte) )==""\n"
715R"==(#define CONVERT_DST_DATA16_T CONCAT3(convert_, DST_DATA16_T, _sat_rte) )==""\n"
716R"==(#endif )==""\n"
717R"==(#if DST_DT_U8 )==""\n"
718R"==(#define MMAD_DATA_T uint )==""\n"
719R"==(#define MMAD_DATA4_T uint4 )==""\n"
720R"==(#define MMAD_DATA8_T uint8 )==""\n"
721R"==(#elif DST_DT_S8 )==""\n"
722R"==(#define MMAD_DATA_T int )==""\n"
723R"==(#define MMAD_DATA4_T int4 )==""\n"
724R"==(#define MMAD_DATA8_T int8 )==""\n"
725R"==(#endif )==""\n"
726R"==(#if DST_DT_U8 || DST_DT_S8 )==""\n"
727R"==(#define BLOCK_READ_DST2(ptr) \ )==""\n"
728R"==(AS_DST_DATA2_T(intel_sub_group_block_read_uc2((__global uchar *)ptr)) )==""\n"
729R"==(#define BLOCK_WRITE_DST2(ptr, v) \ )==""\n"
730R"==(intel_sub_group_block_write_uc2((__global uchar *)ptr, as_uchar2(v)) )==""\n"
731R"==(#define BLOCK_READ_DST(ptr) \ )==""\n"
732R"==(AS_DST_DATA_T(intel_sub_group_block_read_uc((__global uchar *)ptr)) )==""\n"
733R"==(#define BLOCK_WRITE_DST(ptr, v) \ )==""\n"
734R"==(intel_sub_group_block_write_uc((__global uchar *)ptr, as_uchar(v)) )==""\n"
735R"==(#define BLOCK_READ_DST2(ptr) \ )==""\n"
736R"==(AS_DST_DATA2_T(intel_sub_group_block_read_uc2((__global uchar *)ptr)) )==""\n"
737R"==(#define BLOCK_WRITE_DST2(ptr, v) \ )==""\n"
738R"==(intel_sub_group_block_write_uc2((__global uchar *)ptr, as_uchar2(v)) )==""\n"
739R"==(#define BLOCK_READ_DST4(ptr) \ )==""\n"
740R"==(AS_DST_DATA4_T(intel_sub_group_block_read_uc4((__global uchar *)ptr)) )==""\n"
741R"==(#define BLOCK_WRITE_DST4(ptr, v) \ )==""\n"
742R"==(intel_sub_group_block_write_uc4((__global uchar *)ptr, as_uchar4(v)) )==""\n"
743R"==(#define BLOCK_READ_DST8(ptr) \ )==""\n"
744R"==(AS_DST_DATA8_T(intel_sub_group_block_read_uc8((__global uchar *)ptr)) )==""\n"
745R"==(#define BLOCK_WRITE_DST8(ptr, v) \ )==""\n"
746R"==(intel_sub_group_block_write_uc8((__global uchar *)ptr, as_uchar8(v)) )==""\n"
747R"==(#define BLOCK_READ_DST16(ptr) \ )==""\n"
748R"==(AS_DST_DATA16_T(intel_sub_group_block_read_uc16((__global uchar *)ptr)) )==""\n"
749R"==(#define BLOCK_WRITE_DST16(ptr, v) \ )==""\n"
750R"==(intel_sub_group_block_write_uc16((__global uchar *)ptr, as_uchar16(v)) )==""\n"
751R"==(#elif DST_DT_F16 || DST_DT_BF16 )==""\n"
752R"==(#define BLOCK_READ_DST(ptr) \ )==""\n"
753R"==(AS_DST_DATA_T(intel_sub_group_block_read_us((__global ushort *)ptr)) )==""\n"
754R"==(#define BLOCK_WRITE_DST(ptr, v) \ )==""\n"
755R"==(intel_sub_group_block_write_us((__global ushort *)ptr, as_ushort(v)) )==""\n"
756R"==(#define BLOCK_READ_DST2(ptr) \ )==""\n"
757R"==(AS_DST_DATA2_T(intel_sub_group_block_read_us2((__global ushort *)ptr)) )==""\n"
758R"==(#define BLOCK_WRITE_DST2(ptr, v) \ )==""\n"
759R"==(intel_sub_group_block_write_us2((__global ushort *)ptr, as_ushort2(v)) )==""\n"
760R"==(#define BLOCK_READ_DST4(ptr) \ )==""\n"
761R"==(AS_DST_DATA4_T(intel_sub_group_block_read_us4((__global ushort *)ptr)) )==""\n"
762R"==(#define BLOCK_WRITE_DST4(ptr, v) \ )==""\n"
763R"==(intel_sub_group_block_write_us4((__global ushort *)ptr, as_ushort4(v)) )==""\n"
764R"==(#define BLOCK_READ_DST8(ptr) \ )==""\n"
765R"==(AS_DST_DATA8_T(intel_sub_group_block_read_us8((__global ushort *)ptr)) )==""\n"
766R"==(#define BLOCK_WRITE_DST8(ptr, v) \ )==""\n"
767R"==(intel_sub_group_block_write_us8((__global ushort *)ptr, as_ushort8(v)) )==""\n"
768R"==(#define BLOCK_READ_DST16(ptr) \ )==""\n"
769R"==((DST_DATA16_T)( \ )==""\n"
770R"==(BLOCK_READ_DST8(ptr), BLOCK_READ_DST8(ptr + 8 * SUB_GROUP_SIZE)) )==""\n"
771R"==(#define BLOCK_WRITE_DST16(ptr, v) \ )==""\n"
772R"==(do { \ )==""\n"
773R"==(BLOCK_WRITE_DST8(ptr, (v).s01234567); \ )==""\n"
774R"==(BLOCK_WRITE_DST8(ptr + 8 * SUB_GROUP_SIZE, (v).s89abcdef); \ )==""\n"
775R"==(} while (0) )==""\n"
776R"==(#elif DST_DT_S32 || DST_DT_F32 )==""\n"
777R"==(#define BLOCK_READ_DST(ptr) \ )==""\n"
778R"==(AS_DST_DATA_T(intel_sub_group_block_read((__global uint *)ptr)) )==""\n"
779R"==(#define BLOCK_WRITE_DST(ptr, v) \ )==""\n"
780R"==(intel_sub_group_block_write((__global uint *)ptr, as_uint(v)) )==""\n"
781R"==(#define BLOCK_READ_DST2(ptr) \ )==""\n"
782R"==(AS_DST_DATA2_T(intel_sub_group_block_read2((__global uint *)ptr)) )==""\n"
783R"==(#define BLOCK_WRITE_DST2(ptr, v) \ )==""\n"
784R"==(intel_sub_group_block_write2((__global uint *)ptr, as_uint2(v)) )==""\n"
785R"==(#define BLOCK_READ_DST4(ptr) \ )==""\n"
786R"==(AS_DST_DATA4_T(intel_sub_group_block_read4((__global uint *)ptr)) )==""\n"
787R"==(#define BLOCK_WRITE_DST4(ptr, v) \ )==""\n"
788R"==(intel_sub_group_block_write4((__global uint *)ptr, as_uint4(v)) )==""\n"
789R"==(#define BLOCK_READ_DST8(ptr) \ )==""\n"
790R"==(AS_DST_DATA8_T(intel_sub_group_block_read8((__global uint *)ptr)) )==""\n"
791R"==(#define BLOCK_WRITE_DST8(ptr, v) \ )==""\n"
792R"==(intel_sub_group_block_write8((__global uint *)ptr, as_uint8(v)) )==""\n"
793R"==(#define BLOCK_READ_DST16(ptr) \ )==""\n"
794R"==((DST_DATA16_T)( \ )==""\n"
795R"==(BLOCK_READ_DST8(ptr), BLOCK_READ_DST8(ptr + 8 * SUB_GROUP_SIZE)) )==""\n"
796R"==(#define BLOCK_WRITE_DST16(ptr, v) \ )==""\n"
797R"==(do { \ )==""\n"
798R"==(BLOCK_WRITE_DST8(ptr, (v).s01234567); \ )==""\n"
799R"==(BLOCK_WRITE_DST8(ptr + 8 * SUB_GROUP_SIZE, (v).s89abcdef); \ )==""\n"
800R"==(} while (0) )==""\n"
801R"==(#elif DST_DT_F16 || DST_DT_BF16 )==""\n"
802R"==(#define BLOCK_READ_DST(ptr) \ )==""\n"
803R"==(AS_DST_DATA_T(intel_sub_group_block_read_us((__global ushort *)ptr)) )==""\n"
804R"==(#define BLOCK_WRITE_DST(ptr, v) \ )==""\n"
805R"==(intel_sub_group_block_write_us((__global ushort *)ptr, as_ushort(v)) )==""\n"
806R"==(#define BLOCK_READ_DST2(ptr) \ )==""\n"
807R"==(AS_DST_DATA2_T(intel_sub_group_block_read_us2((__global ushort *)ptr)) )==""\n"
808R"==(#define BLOCK_WRITE_DST2(ptr, v) \ )==""\n"
809R"==(intel_sub_group_block_write_us2((__global ushort *)ptr, as_short2(v)) )==""\n"
810R"==(#define BLOCK_READ_DST4(ptr) \ )==""\n"
811R"==(AS_DST_DATA4_T(intel_sub_group_block_read_us4((__global ushort *)ptr)) )==""\n"
812R"==(#define BLOCK_WRITE_DST4(ptr, v) \ )==""\n"
813R"==(intel_sub_group_block_write_us4((__global ushort *)ptr, as_ushort4(v)) )==""\n"
814R"==(#define BLOCK_READ_DST8(ptr) \ )==""\n"
815R"==(AS_DST_DATA8_T(intel_sub_group_block_read_us8((__global ushort *)ptr)) )==""\n"
816R"==(#define BLOCK_WRITE_DST8(ptr, v) \ )==""\n"
817R"==(intel_sub_group_block_write_us8((__global ushort *)ptr, as_ushort8(v)) )==""\n"
818R"==(#define BLOCK_READ_DST16(ptr) \ )==""\n"
819R"==((DST_DATA16_T)( \ )==""\n"
820R"==(BLOCK_READ_DST8(ptr), BLOCK_READ_DST8(ptr + 8 * SUB_GROUP_SIZE)) )==""\n"
821R"==(#define BLOCK_WRITE_DST16(ptr, v) \ )==""\n"
822R"==(do { \ )==""\n"
823R"==(BLOCK_WRITE_DST8(ptr, (v).s01234567); \ )==""\n"
824R"==(BLOCK_WRITE_DST8(ptr + 8 * SUB_GROUP_SIZE, (v).s89abcdef); \ )==""\n"
825R"==(} while (0) )==""\n"
826R"==(#endif )==""\n"
827R"==(#if DST_DT_BF16 )==""\n"
828R"==(#define DST_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
829R"==(#define DST_TO_REF2(x) cvt_bf16_to_f32(x) )==""\n"
830R"==(#define DST_TO_REF8(x) cvt_bf16_to_f32(x) )==""\n"
831R"==(#define REF_TO_DST(x) cvt_f32_to_bf16(x) )==""\n"
832R"==(#define REF_TO_DST8(x) cvt_f32_to_bf16(convert_float8(x)) )==""\n"
833R"==(#elif DST_DT_F16 )==""\n"
834R"==(#define REF_TO_DST(x) convert_half(x) )==""\n"
835R"==(#define DST_TO_REF(x) convert_float(x) )==""\n"
836R"==(#define DST_TO_REF2(x) convert_float2(x) )==""\n"
837R"==(#define DST_TO_REF8(x) convert_float8(x) )==""\n"
838R"==(#elif DST_DT_U8 )==""\n"
839R"==(#define DST_TO_REF(x) (x) )==""\n"
840R"==(#define DST_TO_REF2(x) (x) )==""\n"
841R"==(#define DST_TO_REF8(x) (x) )==""\n"
842R"==(#define REF_TO_DST(x) convert_uchar(x) )==""\n"
843R"==(#define REF_TO_DST8(x) convert_uchar8(x) )==""\n"
844R"==(#elif DST_DT_S8 )==""\n"
845R"==(#define DST_TO_REF(x) (x) )==""\n"
846R"==(#define DST_TO_REF2(x) (x) )==""\n"
847R"==(#define DST_TO_REF8(x) (x) )==""\n"
848R"==(#define REF_TO_DST(x) convert_char(x) )==""\n"
849R"==(#define REF_TO_DST8(x) convert_char8(x) )==""\n"
850R"==(#else )==""\n"
851R"==(#define DST_TO_REF(x) (x) )==""\n"
852R"==(#define DST_TO_REF2(x) (x) )==""\n"
853R"==(#define DST_TO_REF8(x) (x) )==""\n"
854R"==(#define REF_TO_DST(x) (x) )==""\n"
855R"==(#define REF_TO_DST8(x) (x) )==""\n"
856R"==(#endif )==""\n"
857R"==(#if DST_DT_BF16 )==""\n"
858R"==(#define TO_DST(x) cvt_f32_to_bf16(x) )==""\n"
859R"==(#define TO_DST2(x) cvt_f32_to_bf16(convert_float2(x)) )==""\n"
860R"==(#define TO_DST4(x) cvt_f32_to_bf16(convert_float4(x)) )==""\n"
861R"==(#define TO_DST8(x) cvt_f32_to_bf16(convert_float8(x)) )==""\n"
862R"==(#elif DST_DT_F16 )==""\n"
863R"==(#define TO_DST(x) convert_half(x) )==""\n"
864R"==(#define TO_DST2(x) convert_half2(x) )==""\n"
865R"==(#define TO_DST4(x) convert_half4(x) )==""\n"
866R"==(#define TO_DST8(x) convert_half8(x) )==""\n"
867R"==(#elif DST_DT_U8 )==""\n"
868R"==(#define TO_DST(x) convert_uchar_sat_rte(x) )==""\n"
869R"==(#define TO_DST2(x) convert_uchar2_sat_rte(x) )==""\n"
870R"==(#define TO_DST4(x) convert_uchar4_sat_rte(x) )==""\n"
871R"==(#define TO_DST8(x) convert_uchar8_sat_rte(x) )==""\n"
872R"==(#define TO_DST16(x) convert_uchar16_sat_rte(x) )==""\n"
873R"==(#elif DST_DT_S8 )==""\n"
874R"==(#define TO_DST(x) convert_char_sat_rte(x) )==""\n"
875R"==(#define TO_DST2(x) convert_char2_sat_rte(x) )==""\n"
876R"==(#define TO_DST4(x) convert_char4_sat_rte(x) )==""\n"
877R"==(#define TO_DST8(x) convert_char8_sat_rte(x) )==""\n"
878R"==(#define TO_DST16(x) convert_char16_sat_rte(x) )==""\n"
879R"==(#elif DST_DT_S32 )==""\n"
880R"==(#define TO_DST(x) convert_int_sat_rte(x) )==""\n"
881R"==(#define TO_DST2(x) convert_int2_sat_rte(x) )==""\n"
882R"==(#define TO_DST4(x) convert_int4_sat_rte(x) )==""\n"
883R"==(#define TO_DST8(x) convert_int8_sat_rte(x) )==""\n"
884R"==(#elif DST_DT_F32 )==""\n"
885R"==(#define TO_DST(x) convert_float(x) )==""\n"
886R"==(#define TO_DST2(x) convert_float2(x) )==""\n"
887R"==(#define TO_DST4(x) convert_float4(x) )==""\n"
888R"==(#define TO_DST8(x) convert_float8(x) )==""\n"
889R"==(#elif DST_DT_F64 )==""\n"
890R"==(#define TO_DST(x) convert_double(x) )==""\n"
891R"==(#define TO_DST2(x) convert_double2(x) )==""\n"
892R"==(#define TO_DST4(x) convert_double4(x) )==""\n"
893R"==(#define TO_DST8(x) convert_double8(x) )==""\n"
894R"==(#else )==""\n"
895R"==(#error "Not expected" )==""\n"
896R"==(#endif )==""\n"
897R"==(#endif )==""\n"
898R"==(#ifdef C_DATA_T )==""\n"
899R"==(#define C_DATA8_T CONCAT2(C_DATA_T, 8) )==""\n"
900R"==(#if C_DT_BF16 )==""\n"
901R"==(#define C_TO_REF(x) cvt_bf16_to_f32(x) )==""\n"
902R"==(#define C_TO_REF8(x) cvt_bf16_to_f32(x) )==""\n"
903R"==(#define REF_TO_C(x) cvt_f32_to_bf16(x) )==""\n"
904R"==(#define REF_TO_C8(x) cvt_f32_to_bf16(convert_float8(x)) )==""\n"
905R"==(#else )==""\n"
906R"==(#define C_TO_REF(x) (x) )==""\n"
907R"==(#define C_TO_REF8(x) (x) )==""\n"
908R"==(#define REF_TO_C(x) (x) )==""\n"
909R"==(#define REF_TO_C8(x) (x) )==""\n"
910R"==(#endif )==""\n"
911R"==(#if C_DT_BF16 )==""\n"
912R"==(#define TO_C(x) cvt_f32_to_bf16(x) )==""\n"
913R"==(#define TO_C8(x) cvt_f32_to_bf16(convert_float8(x)) )==""\n"
914R"==(#elif C_DT_F16 )==""\n"
915R"==(#define TO_C(x) convert_half(x) )==""\n"
916R"==(#define TO_C8(x) convert_half8(x) )==""\n"
917R"==(#elif C_DT_U8 )==""\n"
918R"==(#define TO_C(x) convert_uchar_sat_rte(x) )==""\n"
919R"==(#define TO_C8(x) convert_uchar8_sat_rte(x) )==""\n"
920R"==(#elif C_DT_S8 )==""\n"
921R"==(#define TO_C(x) convert_char_sat_rte(x) )==""\n"
922R"==(#define TO_C8(x) convert_char8_sat_rte(x) )==""\n"
923R"==(#elif C_DT_S32 )==""\n"
924R"==(#define TO_C(x) convert_int_sat_rte(x) )==""\n"
925R"==(#define TO_C8(x) convert_int8_sat_rte(x) )==""\n"
926R"==(#elif C_DT_F32 )==""\n"
927R"==(#define TO_C(x) convert_float(x) )==""\n"
928R"==(#define TO_C8(x) convert_float8(x) )==""\n"
929R"==(#elif C_DT_F64 )==""\n"
930R"==(#define TO_C(x) convert_double(x) )==""\n"
931R"==(#define TO_C8(x) convert_double8(x) )==""\n"
932R"==(#else )==""\n"
933R"==(#error "Not expected" )==""\n"
934R"==(#endif )==""\n"
935R"==(#endif )==""\n"
936R"==(#ifdef ACC_DATA_T )==""\n"
937R"==(#if ACC_DT_F16 )==""\n"
938R"==(#define TO_ACC(x) convert_half(x) )==""\n"
939R"==(#elif ACC_DT_F32 )==""\n"
940R"==(#define TO_ACC(x) convert_float(x) )==""\n"
941R"==(#elif ACC_DT_F64 )==""\n"
942R"==(#define TO_ACC(x) convert_double(x) )==""\n"
943R"==(#elif ACC_DT_S32 )==""\n"
944R"==(#define TO_ACC(x) convert_int(x) )==""\n"
945R"==(#else )==""\n"
946R"==(#error "Unexpected accumulation data type" )==""\n"
947R"==(#endif )==""\n"
948R"==(#endif )==""\n"
949R"==(#ifdef SUM_DATA_T )==""\n"
950R"==(#define SUM_DATA2_T CONCAT2(SUM_DATA_T, 2) )==""\n"
951R"==(#define SUM_DATA4_T CONCAT2(SUM_DATA_T, 4) )==""\n"
952R"==(#define SUM_DATA8_T CONCAT2(SUM_DATA_T, 8) )==""\n"
953R"==(#define SUM_DATA16_T CONCAT2(SUM_DATA_T, 16) )==""\n"
954R"==(#define AS_SUM_DATA_T CONCAT2(as_, SUM_DATA_T) )==""\n"
955R"==(#define AS_SUM_DATA2_T CONCAT2(as_, SUM_DATA2_T) )==""\n"
956R"==(#define AS_SUM_DATA4_T CONCAT2(as_, SUM_DATA4_T) )==""\n"
957R"==(#define AS_SUM_DATA8_T CONCAT2(as_, SUM_DATA8_T) )==""\n"
958R"==(#define AS_SUM_DATA16_T CONCAT2(as_, SUM_DATA16_T) )==""\n"
959R"==(#if SUM_DT_BF16 )==""\n"
960R"==(#define SUM_TO_REF cvt_bf16_to_f32 )==""\n"
961R"==(#else )==""\n"
962R"==(#define SUM_TO_REF )==""\n"
963R"==(#endif )==""\n"
964R"==(#endif )==""\n"
965R"==(#define OFF_MD_2(prefix, x0, x1, x2, x3, x4, x5) \ )==""\n"
966R"==(((((x0) / CONCAT2(prefix, _B0_2)) / CONCAT2(prefix, _B0_1) \ )==""\n"
967R"==(* CONCAT2(prefix, _S0_0)) \ )==""\n"
968R"==(+ (((x0) / CONCAT2(prefix, _B0_2)) % CONCAT2(prefix, _B0_1) \ )==""\n"
969R"==(* CONCAT2(prefix, _S0_1)) \ )==""\n"
970R"==(+ (((x0) % CONCAT2(prefix, _B0_2)) * CONCAT2(prefix, _S0_2)) \ )==""\n"
971R"==(+ (((x1) / CONCAT2(prefix, _B1_2)) / CONCAT2(prefix, _B1_1) \ )==""\n"
972R"==(* CONCAT2(prefix, _S1_0)) \ )==""\n"
973R"==(+ (((x1) / CONCAT2(prefix, _B1_2)) % CONCAT2(prefix, _B1_1) \ )==""\n"
974R"==(* CONCAT2(prefix, _S1_1)) \ )==""\n"
975R"==(+ (((x1) % CONCAT2(prefix, _B1_2)) * CONCAT2(prefix, _S1_2)) \ )==""\n"
976R"==(+ (((x2) / CONCAT2(prefix, _B2_2)) / CONCAT2(prefix, _B2_1) \ )==""\n"
977R"==(* CONCAT2(prefix, _S2_0)) \ )==""\n"
978R"==(+ (((x2) / CONCAT2(prefix, _B2_2)) % CONCAT2(prefix, _B2_1) \ )==""\n"
979R"==(* CONCAT2(prefix, _S2_1)) \ )==""\n"
980R"==(+ (((x2) % CONCAT2(prefix, _B2_2)) * CONCAT2(prefix, _S2_2)) \ )==""\n"
981R"==(+ (((x3) / CONCAT2(prefix, _B3_2)) / CONCAT2(prefix, _B3_1) \ )==""\n"
982R"==(* CONCAT2(prefix, _S3_0)) \ )==""\n"
983R"==(+ (((x3) / CONCAT2(prefix, _B3_2)) % CONCAT2(prefix, _B3_1) \ )==""\n"
984R"==(* CONCAT2(prefix, _S3_1)) \ )==""\n"
985R"==(+ (((x3) % CONCAT2(prefix, _B3_2)) * CONCAT2(prefix, _S3_2)) \ )==""\n"
986R"==(+ (((x4) / CONCAT2(prefix, _B4_2)) / CONCAT2(prefix, _B4_1) \ )==""\n"
987R"==(* CONCAT2(prefix, _S4_0)) \ )==""\n"
988R"==(+ (((x4) / CONCAT2(prefix, _B4_2)) % CONCAT2(prefix, _B4_1) \ )==""\n"
989R"==(* CONCAT2(prefix, _S4_1)) \ )==""\n"
990R"==(+ (((x4) % CONCAT2(prefix, _B4_2)) * CONCAT2(prefix, _S4_2)) \ )==""\n"
991R"==(+ (((x5) / CONCAT2(prefix, _B5_2)) / CONCAT2(prefix, _B5_1) \ )==""\n"
992R"==(* CONCAT2(prefix, _S5_0)) \ )==""\n"
993R"==(+ (((x5) / CONCAT2(prefix, _B5_2)) % CONCAT2(prefix, _B5_1) \ )==""\n"
994R"==(* CONCAT2(prefix, _S5_1)) \ )==""\n"
995R"==(+ (((x5) % CONCAT2(prefix, _B5_2)) * CONCAT2(prefix, _S5_2))) )==""\n"
996R"==(#define OFF_MD_3(prefix, x0, x1, x2, x3, x4, x5) \ )==""\n"
997R"==(((((((x0) / CONCAT2(prefix, _B0_3)) / CONCAT2(prefix, _B0_2)) \ )==""\n"
998R"==(/ CONCAT2(prefix, _B0_1)) \ )==""\n"
999R"==(* CONCAT2(prefix, _S0_0)) \ )==""\n"
1000R"==(+ (((((x0) / CONCAT2(prefix, _B0_3)) / CONCAT2(prefix, _B0_2)) \ )==""\n"
1001R"==(% CONCAT2(prefix, _B0_1)) \ )==""\n"
1002R"==(* CONCAT2(prefix, _S0_1)) \ )==""\n"
1003R"==(+ ((((x0) / CONCAT2(prefix, _B0_3)) % CONCAT2(prefix, _B0_2)) \ )==""\n"
1004R"==(* CONCAT2(prefix, _S0_2)) \ )==""\n"
1005R"==(+ (((x0) % CONCAT2(prefix, _B0_3)) * CONCAT2(prefix, _S0_3)) \ )==""\n"
1006R"==(+ (((((x1) / CONCAT2(prefix, _B1_3)) / CONCAT2(prefix, _B1_2)) \ )==""\n"
1007R"==(/ CONCAT2(prefix, _B1_1)) \ )==""\n"
1008R"==(* CONCAT2(prefix, _S1_0)) \ )==""\n"
1009R"==(+ (((((x1) / CONCAT2(prefix, _B1_3)) / CONCAT2(prefix, _B1_2)) \ )==""\n"
1010R"==(% CONCAT2(prefix, _B1_1)) \ )==""\n"
1011R"==(* CONCAT2(prefix, _S1_1)) \ )==""\n"
1012R"==(+ ((((x1) / CONCAT2(prefix, _B1_3)) % CONCAT2(prefix, _B1_2)) \ )==""\n"
1013R"==(* CONCAT2(prefix, _S1_2)) \ )==""\n"
1014R"==(+ (((x1) % CONCAT2(prefix, _B1_3)) * CONCAT2(prefix, _S1_3)) \ )==""\n"
1015R"==(+ (((((x2) / CONCAT2(prefix, _B2_3)) / CONCAT2(prefix, _B2_2)) \ )==""\n"
1016R"==(/ CONCAT2(prefix, _B2_1)) \ )==""\n"
1017R"==(* CONCAT2(prefix, _S2_0)) \ )==""\n"
1018R"==(+ (((((x2) / CONCAT2(prefix, _B2_3)) / CONCAT2(prefix, _B2_2)) \ )==""\n"
1019R"==(% CONCAT2(prefix, _B2_1)) \ )==""\n"
1020R"==(* CONCAT2(prefix, _S2_1)) \ )==""\n"
1021R"==(+ ((((x2) / CONCAT2(prefix, _B2_3)) % CONCAT2(prefix, _B2_2)) \ )==""\n"
1022R"==(* CONCAT2(prefix, _S2_2)) \ )==""\n"
1023R"==(+ (((x2) % CONCAT2(prefix, _B2_3)) * CONCAT2(prefix, _S2_3)) \ )==""\n"
1024R"==(+ (((((x3) / CONCAT2(prefix, _B3_3)) / CONCAT2(prefix, _B3_2)) \ )==""\n"
1025R"==(/ CONCAT2(prefix, _B3_1)) \ )==""\n"
1026R"==(* CONCAT2(prefix, _S3_0)) \ )==""\n"
1027R"==(+ (((((x3) / CONCAT2(prefix, _B3_3)) / CONCAT2(prefix, _B3_2)) \ )==""\n"
1028R"==(% CONCAT2(prefix, _B3_1)) \ )==""\n"
1029R"==(* CONCAT2(prefix, _S3_1)) \ )==""\n"
1030R"==(+ ((((x3) / CONCAT2(prefix, _B3_3)) % CONCAT2(prefix, _B3_2)) \ )==""\n"
1031R"==(* CONCAT2(prefix, _S3_2)) \ )==""\n"
1032R"==(+ (((x3) % CONCAT2(prefix, _B3_3)) * CONCAT2(prefix, _S3_3)) \ )==""\n"
1033R"==(+ (((((x4) / CONCAT2(prefix, _B4_3)) / CONCAT2(prefix, _B4_2)) \ )==""\n"
1034R"==(/ CONCAT2(prefix, _B4_1)) \ )==""\n"
1035R"==(* CONCAT2(prefix, _S4_0)) \ )==""\n"
1036R"==(+ (((((x4) / CONCAT2(prefix, _B4_3)) / CONCAT2(prefix, _B4_2)) \ )==""\n"
1037R"==(% CONCAT2(prefix, _B4_1)) \ )==""\n"
1038R"==(* CONCAT2(prefix, _S4_1)) \ )==""\n"
1039R"==(+ ((((x4) / CONCAT2(prefix, _B4_3)) % CONCAT2(prefix, _B4_2)) \ )==""\n"
1040R"==(* CONCAT2(prefix, _S4_2)) \ )==""\n"
1041R"==(+ (((x4) % CONCAT2(prefix, _B4_3)) * CONCAT2(prefix, _S4_3)) \ )==""\n"
1042R"==(+ (((((x5) / CONCAT2(prefix, _B5_3)) / CONCAT2(prefix, _B5_2)) \ )==""\n"
1043R"==(/ CONCAT2(prefix, _B5_1)) \ )==""\n"
1044R"==(* CONCAT2(prefix, _S5_0)) \ )==""\n"
1045R"==(+ (((((x5) / CONCAT2(prefix, _B5_3)) / CONCAT2(prefix, _B5_2)) \ )==""\n"
1046R"==(% CONCAT2(prefix, _B5_1)) \ )==""\n"
1047R"==(* CONCAT2(prefix, _S5_1)) \ )==""\n"
1048R"==(+ ((((x5) / CONCAT2(prefix, _B5_3)) % CONCAT2(prefix, _B5_2)) \ )==""\n"
1049R"==(* CONCAT2(prefix, _S5_2)) \ )==""\n"
1050R"==(+ (((x5) % CONCAT2(prefix, _B5_3)) * CONCAT2(prefix, _S5_3))) )==""\n"
1051R"==(#define OFF_MD(prefix, x0, x1, x2, x3, x4, x5) \ )==""\n"
1052R"==(CONCAT2(OFF_MD_, CONCAT2(prefix, _NLEVELS))(prefix, x0, x1, x2, x3, x4, x5) )==""\n"
1053R"==(#if SRC_NDIMS == 3 )==""\n"
1054R"==(#define CONV_SRC_OFF(n, c, d, h, w) OFF_MD(SRC, n, c, w, 0, 0, 0) )==""\n"
1055R"==(#elif SRC_NDIMS == 4 )==""\n"
1056R"==(#define CONV_SRC_OFF(n, c, d, h, w) OFF_MD(SRC, n, c, h, w, 0, 0) )==""\n"
1057R"==(#elif SRC_NDIMS == 5 )==""\n"
1058R"==(#define CONV_SRC_OFF(n, c, d, h, w) OFF_MD(SRC, n, c, d, h, w, 0) )==""\n"
1059R"==(#endif )==""\n"
1060R"==(#if WEI_NDIMS == 3 )==""\n"
1061R"==(#define CONV_WEI_OFF(g, o, i, d, h, w) OFF_MD(WEI, o, i, w, 0, 0, 0) )==""\n"
1062R"==(#elif WEI_NDIMS == 4 )==""\n"
1063R"==(#if WITH_GROUPS == 0 )==""\n"
1064R"==(#define CONV_WEI_OFF(g, o, i, d, h, w) OFF_MD(WEI, o, i, h, w, 0, 0) )==""\n"
1065R"==(#else )==""\n"
1066R"==(#define CONV_WEI_OFF(g, o, i, d, h, w) OFF_MD(WEI, g, o, i, w, 0, 0) )==""\n"
1067R"==(#endif )==""\n"
1068R"==(#elif WEI_NDIMS == 5 )==""\n"
1069R"==(#if WITH_GROUPS == 0 )==""\n"
1070R"==(#define CONV_WEI_OFF(g, o, i, d, h, w) OFF_MD(WEI, o, i, d, h, w, 0) )==""\n"
1071R"==(#else )==""\n"
1072R"==(#define CONV_WEI_OFF(g, o, i, d, h, w) OFF_MD(WEI, g, o, i, h, w, 0) )==""\n"
1073R"==(#endif )==""\n"
1074R"==(#elif WEI_NDIMS == 6 )==""\n"
1075R"==(#define CONV_WEI_OFF(g, o, i, d, h, w) OFF_MD(WEI, g, o, i, d, h, w) )==""\n"
1076R"==(#endif )==""\n"
1077R"==(#if DST_NDIMS == 3 )==""\n"
1078R"==(#define CONV_DST_OFF(n, c, d, h, w) OFF_MD(DST, n, c, w, 0, 0, 0) )==""\n"
1079R"==(#elif DST_NDIMS == 4 )==""\n"
1080R"==(#define CONV_DST_OFF(n, c, d, h, w) OFF_MD(DST, n, c, h, w, 0, 0) )==""\n"
1081R"==(#elif DST_NDIMS == 5 )==""\n"
1082R"==(#define CONV_DST_OFF(n, c, d, h, w) OFF_MD(DST, n, c, d, h, w, 0) )==""\n"
1083R"==(#endif )==""\n"
1084R"==(#if NDIMS == 2 )==""\n"
1085R"==(#define SRC_OFF(x0, x1, d, h, w) \ )==""\n"
1086R"==((((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ )==""\n"
1087R"==(+ ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1) )==""\n"
1088R"==(#if WITH_GROUPS == 1 )==""\n"
1089R"==(#define WEI_OFF(x0, x1, x2, d, h, w) \ )==""\n"
1090R"==((((x0) % WEI_B0) * WEI_SB0 + ((x0) / WEI_B0) * WEI_S0 \ )==""\n"
1091R"==(+ ((x1) % WEI_B1) * WEI_SB1 + ((x1) / WEI_B1) * WEI_S1 \ )==""\n"
1092R"==(+ ((x2) % WEI_B2) * WEI_SB2 + ((x2) / WEI_B2) * WEI_S2) )==""\n"
1093R"==(#else )==""\n"
1094R"==(#define WEI_OFF(g, x0, x1, d, h, w) \ )==""\n"
1095R"==((((x0) % WEI_B0) * WEI_SB0 + ((x0) / WEI_B0) * WEI_S0 \ )==""\n"
1096R"==(+ ((x1) % WEI_B1) * WEI_SB1 + ((x1) / WEI_B1) * WEI_S1) )==""\n"
1097R"==(#endif )==""\n"
1098R"==(#define DST_OFF(x0, x1, d, h, w) \ )==""\n"
1099R"==((((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ )==""\n"
1100R"==(+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1) )==""\n"
1101R"==(#elif NDIMS == 3 )==""\n"
1102R"==(#define SRC_OFF(x0, x1, d, h, x2) \ )==""\n"
1103R"==((((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ )==""\n"
1104R"==(+ ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ )==""\n"
1105R"==(+ ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2) )==""\n"
1106R"==(#if WITH_GROUPS == 1 )==""\n"
1107R"==(#define WEI_OFF(x0, x1, x2, d, h, x3) \ )==""\n"
1108R"==((((x0) % WEI_B0) * WEI_SB0 + ((x0) / WEI_B0) * WEI_S0 \ )==""\n"
1109R"==(+ ((x1) % WEI_B1) * WEI_SB1 + ((x1) / WEI_B1) * WEI_S1 \ )==""\n"
1110R"==(+ ((x2) % WEI_B2) * WEI_SB2 + ((x2) / WEI_B2) * WEI_S2 \ )==""\n"
1111R"==(+ ((x3) % WEI_B3) * WEI_SB3 + ((x3) / WEI_B3) * WEI_S3) )==""\n"
1112R"==(#else )==""\n"
1113R"==(#define WEI_OFF(g, x0, x1, d, h, x2) \ )==""\n"
1114R"==((((x0) % WEI_B0) * WEI_SB0 + ((x0) / WEI_B0) * WEI_S0 \ )==""\n"
1115R"==(+ ((x1) % WEI_B1) * WEI_SB1 + ((x1) / WEI_B1) * WEI_S1 \ )==""\n"
1116R"==(+ ((x2) % WEI_B2) * WEI_SB2 + ((x2) / WEI_B2) * WEI_S2) )==""\n"
1117R"==(#endif )==""\n"
1118R"==(#define DST_OFF(x0, x1, d, h, x2) \ )==""\n"
1119R"==((((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ )==""\n"
1120R"==(+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ )==""\n"
1121R"==(+ ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2) )==""\n"
1122R"==(#elif NDIMS == 4 )==""\n"
1123R"==(#define SRC_OFF(x0, x1, d, x2, x3) \ )==""\n"
1124R"==((((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ )==""\n"
1125R"==(+ ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ )==""\n"
1126R"==(+ ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2 \ )==""\n"
1127R"==(+ ((x3) % SRC_B3) * SRC_SB3 + ((x3) / SRC_B3) * SRC_S3) )==""\n"
1128R"==(#if WITH_GROUPS == 1 )==""\n"
1129R"==(#define WEI_OFF(x0, x1, x2, d, x3, x4) \ )==""\n"
1130R"==((((x0) % WEI_B0) * WEI_SB0 + ((x0) / WEI_B0) * WEI_S0 \ )==""\n"
1131R"==(+ ((x1) % WEI_B1) * WEI_SB1 + ((x1) / WEI_B1) * WEI_S1 \ )==""\n"
1132R"==(+ ((x2) % WEI_B2) * WEI_SB2 + ((x2) / WEI_B2) * WEI_S2 \ )==""\n"
1133R"==(+ ((x3) % WEI_B3) * WEI_SB3 + ((x3) / WEI_B3) * WEI_S3 \ )==""\n"
1134R"==(+ ((x4) % WEI_B4) * WEI_SB4 + ((x4) / WEI_B4) * WEI_S4) )==""\n"
1135R"==(#else )==""\n"
1136R"==(#define WEI_OFF(g, x1, x2, d, x3, x4) \ )==""\n"
1137R"==((((x1) % WEI_B0) * WEI_SB0 + ((x1) / WEI_B0) * WEI_S0 \ )==""\n"
1138R"==(+ ((x2) % WEI_B1) * WEI_SB1 + ((x2) / WEI_B1) * WEI_S1 \ )==""\n"
1139R"==(+ ((x3) % WEI_B2) * WEI_SB2 + ((x3) / WEI_B2) * WEI_S2 \ )==""\n"
1140R"==(+ ((x4) % WEI_B3) * WEI_SB3 + ((x4) / WEI_B3) * WEI_S3) )==""\n"
1141R"==(#endif )==""\n"
1142R"==(#define DST_OFF(x0, x1, d, x2, x3) \ )==""\n"
1143R"==((((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ )==""\n"
1144R"==(+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ )==""\n"
1145R"==(+ ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2 \ )==""\n"
1146R"==(+ ((x3) % DST_B3) * DST_SB3 + ((x3) / DST_B3) * DST_S3) )==""\n"
1147R"==(#elif NDIMS == 5 )==""\n"
1148R"==(#define SRC_OFF(x0, x1, x2, x3, x4) \ )==""\n"
1149R"==((((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ )==""\n"
1150R"==(+ ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ )==""\n"
1151R"==(+ ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2 \ )==""\n"
1152R"==(+ ((x3) % SRC_B3) * SRC_SB3 + ((x3) / SRC_B3) * SRC_S3 \ )==""\n"
1153R"==(+ ((x4) % SRC_B4) * SRC_SB4 + ((x4) / SRC_B4) * SRC_S4) )==""\n"
1154R"==(#if WITH_GROUPS == 1 )==""\n"
1155R"==(#define WEI_OFF(x0, x1, x2, x3, x4, x5) \ )==""\n"
1156R"==((((x0) % WEI_B0) * WEI_SB0 + ((x0) / WEI_B0) * WEI_S0 \ )==""\n"
1157R"==(+ ((x1) % WEI_B1) * WEI_SB1 + ((x1) / WEI_B1) * WEI_S1 \ )==""\n"
1158R"==(+ ((x2) % WEI_B2) * WEI_SB2 + ((x2) / WEI_B2) * WEI_S2 \ )==""\n"
1159R"==(+ ((x3) % WEI_B3) * WEI_SB3 + ((x3) / WEI_B3) * WEI_S3 \ )==""\n"
1160R"==(+ ((x4) % WEI_B4) * WEI_SB4 + ((x4) / WEI_B4) * WEI_S4 \ )==""\n"
1161R"==(+ ((x5) % WEI_B5) * WEI_SB5 + ((x5) / WEI_B5) * WEI_S5) )==""\n"
1162R"==(#else )==""\n"
1163R"==(#define WEI_OFF(g, x1, x2, x3, x4, x5) \ )==""\n"
1164R"==((((x1) % WEI_B0) * WEI_SB0 + ((x1) / WEI_B0) * WEI_S0 \ )==""\n"
1165R"==(+ ((x2) % WEI_B1) * WEI_SB1 + ((x2) / WEI_B1) * WEI_S1 \ )==""\n"
1166R"==(+ ((x3) % WEI_B2) * WEI_SB2 + ((x3) / WEI_B2) * WEI_S2 \ )==""\n"
1167R"==(+ ((x4) % WEI_B3) * WEI_SB3 + ((x4) / WEI_B3) * WEI_S3 \ )==""\n"
1168R"==(+ ((x5) % WEI_B4) * WEI_SB4 + ((x5) / WEI_B4) * WEI_S4) )==""\n"
1169R"==(#endif )==""\n"
1170R"==(#define DST_OFF(x0, x1, x2, x3, x4) \ )==""\n"
1171R"==((((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ )==""\n"
1172R"==(+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ )==""\n"
1173R"==(+ ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2 \ )==""\n"
1174R"==(+ ((x3) % DST_B3) * DST_SB3 + ((x3) / DST_B3) * DST_S3 \ )==""\n"
1175R"==(+ ((x4) % DST_B4) * DST_SB4 + ((x4) / DST_B4) * DST_S4) )==""\n"
1176R"==(#endif )==""\n"
1177R"==(#define GWS_OP_ZERO(x, y) 0 )==""\n"
1178R"==(#define GWS_OP_FIRST(x, y) (x) )==""\n"
1179R"==(#define GWS_OP_MOD(x, y) ((x) % (y)) )==""\n"
1180R"==(#define ROUND_UP(a,b) (((a) + (b) - 1) / (b)) )==""\n"
1181R"==(#define GWS0_GET_ID0() GWS0_OP0((get_global_id(GWS0_IDX0) / GWS0_STRIDE0), ROUND_UP(GWS0_DIM0, GWS0_BLOCK0)) / GWS0_VEC_SIZE0 * GWS0_VEC_SIZE0 * GWS0_BLOCK0 )==""\n"
1182R"==(#define GWS0_GET_ID1() GWS0_OP1((get_global_id(GWS0_IDX1) / GWS0_STRIDE1), ROUND_UP(GWS0_DIM1, GWS0_BLOCK1)) / GWS0_VEC_SIZE1 * GWS0_VEC_SIZE1 * GWS0_BLOCK1 )==""\n"
1183R"==(#define GWS0_GET_ID2() GWS0_OP2((get_global_id(GWS0_IDX2) / GWS0_STRIDE2), ROUND_UP(GWS0_DIM2, GWS0_BLOCK2)) / GWS0_VEC_SIZE2 * GWS0_VEC_SIZE2 * GWS0_BLOCK2 )==""\n"
1184R"==(#define GWS0_GET_ID3() GWS0_OP3((get_global_id(GWS0_IDX3) / GWS0_STRIDE3), ROUND_UP(GWS0_DIM3, GWS0_BLOCK3)) / GWS0_VEC_SIZE3 * GWS0_VEC_SIZE3 * GWS0_BLOCK3 )==""\n"
1185R"==(#define GWS0_GET_ID4() GWS0_OP4((get_global_id(GWS0_IDX4) / GWS0_STRIDE4), ROUND_UP(GWS0_DIM4, GWS0_BLOCK4)) / GWS0_VEC_SIZE4 * GWS0_VEC_SIZE4 * GWS0_BLOCK4 )==""\n"
1186R"==(#define GWS0_GET_ID5() GWS0_OP5((get_global_id(GWS0_IDX5) / GWS0_STRIDE5), ROUND_UP(GWS0_DIM5, GWS0_BLOCK5)) / GWS0_VEC_SIZE5 * GWS0_VEC_SIZE5 * GWS0_BLOCK5 )==""\n"
1187R"==(#define GWS0_GET_BLOCK0() GWS0_BLOCK0 )==""\n"
1188R"==(#define GWS0_GET_BLOCK1() GWS0_BLOCK1 )==""\n"
1189R"==(#define GWS0_GET_BLOCK2() GWS0_BLOCK2 )==""\n"
1190R"==(#define GWS0_GET_BLOCK3() GWS0_BLOCK3 )==""\n"
1191R"==(#define GWS0_GET_BLOCK4() GWS0_BLOCK4 )==""\n"
1192R"==(#define GWS0_GET_BLOCK5() GWS0_BLOCK5 )==""\n"
1193R"==(#define GWS1_GET_ID0() GWS1_OP0((get_global_id(GWS1_IDX0) / GWS1_STRIDE0), ROUND_UP(GWS1_DIM0, GWS1_BLOCK0)) / GWS1_VEC_SIZE0 * GWS1_VEC_SIZE0 * GWS1_BLOCK0 )==""\n"
1194R"==(#define GWS1_GET_ID1() GWS1_OP1((get_global_id(GWS1_IDX1) / GWS1_STRIDE1), ROUND_UP(GWS1_DIM1, GWS1_BLOCK1)) / GWS1_VEC_SIZE1 * GWS1_VEC_SIZE1 * GWS1_BLOCK1 )==""\n"
1195R"==(#define GWS1_GET_ID2() GWS1_OP2((get_global_id(GWS1_IDX2) / GWS1_STRIDE2), ROUND_UP(GWS1_DIM2, GWS1_BLOCK2)) / GWS1_VEC_SIZE2 * GWS1_VEC_SIZE2 * GWS1_BLOCK2 )==""\n"
1196R"==(#define GWS1_GET_ID3() GWS1_OP3((get_global_id(GWS1_IDX3) / GWS1_STRIDE3), ROUND_UP(GWS1_DIM3, GWS1_BLOCK3)) / GWS1_VEC_SIZE3 * GWS1_VEC_SIZE3 * GWS1_BLOCK3 )==""\n"
1197R"==(#define GWS1_GET_ID4() GWS1_OP4((get_global_id(GWS1_IDX4) / GWS1_STRIDE4), ROUND_UP(GWS1_DIM4, GWS1_BLOCK4)) / GWS1_VEC_SIZE4 * GWS1_VEC_SIZE4 * GWS1_BLOCK4 )==""\n"
1198R"==(#define GWS1_GET_ID5() GWS1_OP5((get_global_id(GWS1_IDX5) / GWS1_STRIDE5), ROUND_UP(GWS1_DIM5, GWS1_BLOCK5)) / GWS1_VEC_SIZE5 * GWS1_VEC_SIZE5 * GWS1_BLOCK5 )==""\n"
1199R"==(#define GWS1_GET_BLOCK0() GWS1_BLOCK0 )==""\n"
1200R"==(#define GWS1_GET_BLOCK1() GWS1_BLOCK1 )==""\n"
1201R"==(#define GWS1_GET_BLOCK2() GWS1_BLOCK2 )==""\n"
1202R"==(#define GWS1_GET_BLOCK3() GWS1_BLOCK3 )==""\n"
1203R"==(#define GWS1_GET_BLOCK4() GWS1_BLOCK4 )==""\n"
1204R"==(#define GWS1_GET_BLOCK5() GWS1_BLOCK5 )==""\n"
1205R"==(#define GWS2_GET_ID0() GWS2_OP0((get_global_id(GWS2_IDX0) / GWS2_STRIDE0), ROUND_UP(GWS2_DIM0, GWS2_BLOCK0)) / GWS2_VEC_SIZE0 * GWS2_VEC_SIZE0 * GWS2_BLOCK0 )==""\n"
1206R"==(#define GWS2_GET_ID1() GWS2_OP1((get_global_id(GWS2_IDX1) / GWS2_STRIDE1), ROUND_UP(GWS2_DIM1, GWS2_BLOCK1)) / GWS2_VEC_SIZE1 * GWS2_VEC_SIZE1 * GWS2_BLOCK1 )==""\n"
1207R"==(#define GWS2_GET_ID2() GWS2_OP2((get_global_id(GWS2_IDX2) / GWS2_STRIDE2), ROUND_UP(GWS2_DIM2, GWS2_BLOCK2)) / GWS2_VEC_SIZE2 * GWS2_VEC_SIZE2 * GWS2_BLOCK2 )==""\n"
1208R"==(#define GWS2_GET_ID3() GWS2_OP3((get_global_id(GWS2_IDX3) / GWS2_STRIDE3), ROUND_UP(GWS2_DIM3, GWS2_BLOCK3)) / GWS2_VEC_SIZE3 * GWS2_VEC_SIZE3 * GWS2_BLOCK3 )==""\n"
1209R"==(#define GWS2_GET_ID4() GWS2_OP4((get_global_id(GWS2_IDX4) / GWS2_STRIDE4), ROUND_UP(GWS2_DIM4, GWS2_BLOCK4)) / GWS2_VEC_SIZE4 * GWS2_VEC_SIZE4 * GWS2_BLOCK4 )==""\n"
1210R"==(#define GWS2_GET_ID5() GWS2_OP5((get_global_id(GWS2_IDX5) / GWS2_STRIDE5), ROUND_UP(GWS2_DIM5, GWS2_BLOCK5)) / GWS2_VEC_SIZE5 * GWS2_VEC_SIZE5 * GWS2_BLOCK5 )==""\n"
1211R"==(#define GWS2_GET_BLOCK0() GWS2_BLOCK0 )==""\n"
1212R"==(#define GWS2_GET_BLOCK1() GWS2_BLOCK1 )==""\n"
1213R"==(#define GWS2_GET_BLOCK2() GWS2_BLOCK2 )==""\n"
1214R"==(#define GWS2_GET_BLOCK3() GWS2_BLOCK3 )==""\n"
1215R"==(#define GWS2_GET_BLOCK4() GWS2_BLOCK4 )==""\n"
1216R"==(#define GWS2_GET_BLOCK5() GWS2_BLOCK5 )==""\n"
1217R"==(#define GWS3_GET_ID0() GWS3_OP0((get_global_id(GWS3_IDX0) / GWS3_STRIDE0), ROUND_UP(GWS3_DIM0, GWS3_BLOCK0)) / GWS3_VEC_SIZE0 * GWS3_VEC_SIZE0 * GWS3_BLOCK0 )==""\n"
1218R"==(#define GWS3_GET_ID1() GWS3_OP1((get_global_id(GWS3_IDX1) / GWS3_STRIDE1), ROUND_UP(GWS3_DIM1, GWS3_BLOCK1)) / GWS3_VEC_SIZE1 * GWS3_VEC_SIZE1 * GWS3_BLOCK1 )==""\n"
1219R"==(#define GWS3_GET_ID2() GWS3_OP2((get_global_id(GWS3_IDX2) / GWS3_STRIDE2), ROUND_UP(GWS3_DIM2, GWS3_BLOCK2)) / GWS3_VEC_SIZE2 * GWS3_VEC_SIZE2 * GWS3_BLOCK2 )==""\n"
1220R"==(#define GWS3_GET_ID3() GWS3_OP3((get_global_id(GWS3_IDX3) / GWS3_STRIDE3), ROUND_UP(GWS3_DIM3, GWS3_BLOCK3)) / GWS3_VEC_SIZE3 * GWS3_VEC_SIZE3 * GWS3_BLOCK3 )==""\n"
1221R"==(#define GWS3_GET_ID4() GWS3_OP4((get_global_id(GWS3_IDX4) / GWS3_STRIDE4), ROUND_UP(GWS3_DIM4, GWS3_BLOCK4)) / GWS3_VEC_SIZE4 * GWS3_VEC_SIZE4 * GWS3_BLOCK4 )==""\n"
1222R"==(#define GWS3_GET_ID5() GWS3_OP5((get_global_id(GWS3_IDX5) / GWS3_STRIDE5), ROUND_UP(GWS3_DIM5, GWS3_BLOCK5)) / GWS3_VEC_SIZE5 * GWS3_VEC_SIZE5 * GWS3_BLOCK5 )==""\n"
1223R"==(#define GWS3_GET_BLOCK0() GWS3_BLOCK0 )==""\n"
1224R"==(#define GWS3_GET_BLOCK1() GWS3_BLOCK1 )==""\n"
1225R"==(#define GWS3_GET_BLOCK2() GWS3_BLOCK2 )==""\n"
1226R"==(#define GWS3_GET_BLOCK3() GWS3_BLOCK3 )==""\n"
1227R"==(#define GWS3_GET_BLOCK4() GWS3_BLOCK4 )==""\n"
1228R"==(#define GWS3_GET_BLOCK5() GWS3_BLOCK5 )==""\n"
1229R"==(#define KERNEL_ATTR_SG0 \ )==""\n"
1230R"==(__attribute__((reqd_work_group_size( \ )==""\n"
1231R"==(GWS_LWS0_DEFAULT, GWS_LWS1_DEFAULT, GWS_LWS2_DEFAULT))) )==""\n"
1232R"==(#define KERNEL_ATTR_SG1 \ )==""\n"
1233R"==(KERNEL_ATTR_SG0 \ )==""\n"
1234R"==(__attribute__((intel_reqd_sub_group_size(GWS_SGS_DEFAULT))) )==""\n"
1235R"==(#define KERNEL_ATTR CONCAT2(KERNEL_ATTR_SG, GWS_WITH_SG_DEFAULT) )==""\n"
1236R"==(#define NAMED_KERNEL_ATTR_SG0(name) \ )==""\n"
1237R"==(__attribute__((reqd_work_group_size(CONCAT2(GWS_LWS0_, name), \ )==""\n"
1238R"==(CONCAT2(GWS_LWS1_, name), CONCAT2(GWS_LWS2_, name)))) )==""\n"
1239R"==(#define NAMED_KERNEL_ATTR_SG1(name) \ )==""\n"
1240R"==(NAMED_KERNEL_ATTR_SG0(name) \ )==""\n"
1241R"==(__attribute__((intel_reqd_sub_group_size(CONCAT2(GWS_SGS_, name)))) )==""\n"
1242R"==(#define NAMED_KERNEL_ATTR(name) \ )==""\n"
1243R"==(CONCAT2(NAMED_KERNEL_ATTR_SG, CONCAT2(GWS_WITH_SG_, name))(name) )==""\n"
1244R"==(#define MAYBE_SKIP_NON_UNIFORM_WG() \ )==""\n"
1245R"==(do { \ )==""\n"
1246R"==(if ((GWS_0 != GWS_ORIG_0) && (GWS_ORIG_0 % LWS_0 != 0) \ )==""\n"
1247R"==(&& (get_global_id(0) >= GWS_ORIG_0)) \ )==""\n"
1248R"==(return; \ )==""\n"
1249R"==(if ((GWS_1 != GWS_ORIG_1) && (GWS_ORIG_1 % LWS_1 != 0) \ )==""\n"
1250R"==(&& (get_global_id(1) >= GWS_ORIG_1)) \ )==""\n"
1251R"==(return; \ )==""\n"
1252R"==(if ((GWS_2 != GWS_ORIG_2) && (GWS_ORIG_2 % LWS_2 != 0) \ )==""\n"
1253R"==(&& (get_global_id(2) >= GWS_ORIG_2)) \ )==""\n"
1254R"==(return; \ )==""\n"
1255R"==(} while (0) )==""\n"
1256R"==(#endif )==""\n"
1257R"==(#if SRC_DT_U8 == 1 )==""\n"
1258R"==(#define SRC_DT_ALIAS UCHAR )==""\n"
1259R"==(#elif SRC_DT_S8 == 1 )==""\n"
1260R"==(#define SRC_DT_ALIAS CHAR )==""\n"
1261R"==(#elif SRC_DT_F16 == 1 )==""\n"
1262R"==(#define SRC_DT_ALIAS HALF )==""\n"
1263R"==(#elif SRC_DT_BF16 == 1 )==""\n"
1264R"==(#define SRC_DT_ALIAS BFLOAT )==""\n"
1265R"==(#elif SRC_DT_F32 == 1 )==""\n"
1266R"==(#define SRC_DT_ALIAS FLOAT )==""\n"
1267R"==(#endif )==""\n"
1268R"==(#if DST_DT_U8 == 1 )==""\n"
1269R"==(#define DST_DT_ALIAS UCHAR )==""\n"
1270R"==(#elif DST_DT_S8 == 1 )==""\n"
1271R"==(#define DST_DT_ALIAS CHAR )==""\n"
1272R"==(#elif DST_DT_F16 == 1 )==""\n"
1273R"==(#define DST_DT_ALIAS HALF )==""\n"
1274R"==(#elif DST_DT_BF16 == 1 )==""\n"
1275R"==(#define DST_DT_ALIAS BFLOAT )==""\n"
1276R"==(#elif DST_DT_F32 == 1 )==""\n"
1277R"==(#define DST_DT_ALIAS FLOAT )==""\n"
1278R"==(#endif )==""\n"
1279R"==(#define ALIAS(prefix) CONCAT2(prefix, _DT_ALIAS) )==""\n"
1280R"==(#define BLOCK1_T uchar )==""\n"
1281R"==(#define BLOCK2_T ushort )==""\n"
1282R"==(#define BLOCK4_T uint )==""\n"
1283R"==(#define BLOCK8_T ulong )==""\n"
1284R"==(#define BLOCK_T(alias) CONCAT3(BLOCK, SIZEOF(alias), _T) )==""\n"
1285R"==(#define BLOCK1_ALIAS UCHAR )==""\n"
1286R"==(#define BLOCK2_ALIAS USHORT )==""\n"
1287R"==(#define BLOCK4_ALIAS UINT )==""\n"
1288R"==(#define BLOCK_ALIAS(prefix) CONCAT3(BLOCK, SIZEOF(ALIAS(prefix)), _ALIAS) )==""\n"
1289R"==(#define SIZEOF_UCHAR 1 )==""\n"
1290R"==(#define SIZEOF_CHAR 1 )==""\n"
1291R"==(#define SIZEOF_BFLOAT 2 )==""\n"
1292R"==(#define SIZEOF_HALF 2 )==""\n"
1293R"==(#define SIZEOF_FLOAT 4 )==""\n"
1294R"==(#define SIZEOF(alias) CONCAT2(SIZEOF_, alias) )==""\n"
1295R"==(#define READ_UCHAR8 intel_sub_group_block_read_uc8 )==""\n"
1296R"==(#define READ_USHORT8 intel_sub_group_block_read_us8 )==""\n"
1297R"==(#define READ_UINT8 intel_sub_group_block_read8 )==""\n"
1298R"==(#define READ_BLOCK8(prefix, ptr) READ_BLOCK_N(prefix, 8)(ptr) )==""\n"
1299R"==(#define READ_BLOCK_N(prefix, n) CONCAT3(READ_, BLOCK_ALIAS(prefix), n) )==""\n"
1300R"==(#define WRITE_UCHAR8 intel_sub_group_block_write_uc8 )==""\n"
1301R"==(#define WRITE_USHORT8 intel_sub_group_block_write_us8 )==""\n"
1302R"==(#define WRITE_UINT8 intel_sub_group_block_write8 )==""\n"
1303R"==(#define WRITE_BLOCK8(prefix, ptr, val) WRITE_BLOCK_N(prefix, 8)(ptr, val) )==""\n"
1304R"==(#define WRITE_BLOCK_N(prefix, n) CONCAT3(WRITE_, BLOCK_ALIAS(prefix), n) )==""\n"
1305R"==(#define AS_UCHAR8 as_uchar8 )==""\n"
1306R"==(#define AS_CHAR8 as_char8 )==""\n"
1307R"==(#define AS_HALF8 as_half8 )==""\n"
1308R"==(#define AS_USHORT8 as_ushort8 )==""\n"
1309R"==(#define AS_BFLOAT8 as_ushort8 )==""\n"
1310R"==(#define AS_FLOAT8 as_float8 )==""\n"
1311R"==(#define AS_INT8 as_int8 )==""\n"
1312R"==(#define AS_UINT8 as_uint8 )==""\n"
1313R"==(#define BLOCK_TO_DATA8(prefix, val) BLOCK_TO_DATA_N(prefix, 8)(val) )==""\n"
1314R"==(#define BLOCK_TO_DATA_N(prefix, n) CONCAT3(AS_, ALIAS(prefix), n) )==""\n"
1315R"==(#define DATA_TO_BLOCK8(prefix, val) DATA_TO_BLOCK_N(prefix, 8)(val) )==""\n"
1316R"==(#define DATA_TO_BLOCK_N(prefix, n) CONCAT3(AS_, BLOCK_ALIAS(prefix), n) )==""\n"
1317R"==(#define UCHAR_TO_FLOAT1 convert_float )==""\n"
1318R"==(#define UCHAR_TO_FLOAT8 convert_float8 )==""\n"
1319R"==(#define FLOAT_TO_UCHAR1 convert_uchar_sat_rte )==""\n"
1320R"==(#define FLOAT_TO_UCHAR8 convert_uchar8_sat_rte )==""\n"
1321R"==(#define CHAR_TO_FLOAT1 convert_float )==""\n"
1322R"==(#define CHAR_TO_FLOAT8 convert_float8 )==""\n"
1323R"==(#define FLOAT_TO_CHAR1 convert_char_sat_rte )==""\n"
1324R"==(#define FLOAT_TO_CHAR8 convert_char8_sat_rte )==""\n"
1325R"==(#define HALF_TO_FLOAT1 convert_float )==""\n"
1326R"==(#define HALF_TO_FLOAT8 convert_float8 )==""\n"
1327R"==(#define FLOAT_TO_HALF1 convert_half )==""\n"
1328R"==(#define FLOAT_TO_HALF8 convert_half8 )==""\n"
1329R"==(#define BFLOAT_TO_FLOAT1 cvt_bf16_to_f32 )==""\n"
1330R"==(#define BFLOAT_TO_FLOAT8 cvt_bf16_to_f32 )==""\n"
1331R"==(#define FLOAT_TO_BFLOAT1 cvt_f32_to_bf16 )==""\n"
1332R"==(#define FLOAT_TO_BFLOAT8 cvt_f32_to_bf16 )==""\n"
1333R"==(#define FLOAT_TO_FLOAT1 convert_float )==""\n"
1334R"==(#define FLOAT_TO_FLOAT8 convert_float8 )==""\n"
1335R"==(#define DATA_TO_FLOAT(prefix, val) DATA_TO_FLOAT_N(prefix, 1)(val) )==""\n"
1336R"==(#define DATA_TO_FLOAT8(prefix, val) DATA_TO_FLOAT_N(prefix, 8)(val) )==""\n"
1337R"==(#define DATA_TO_FLOAT_N(prefix, n) CONCAT3(ALIAS(prefix), _TO_FLOAT, n) )==""\n"
1338R"==(#define FLOAT_TO_DATA(prefix, val) FLOAT_TO_DATA_N(prefix, 1)(val) )==""\n"
1339R"==(#define FLOAT_TO_DATA8(prefix, val) FLOAT_TO_DATA_N(prefix, 8)(val) )==""\n"
1340R"==(#define FLOAT_TO_DATA_N(prefix, n) CONCAT3(FLOAT_TO_, ALIAS(prefix), n) )==""\n"
1341R"==()==";
1342}
1343}
1344}
1345}