namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {

// OpenCL C source text of the "binary types" header (include guard
// GPU_OCL_BINARY_TYPES_H), embedded as a NUL-terminated C string.
//
// The text is assembled from adjacent raw string literals, one per line of
// the embedded header, each followed by an explicit "\n" literal; the
// compiler concatenates them into a single string. The embedded header
// provides, in order:
//   - DST_OFF / SRC0_OFF / SRC1_OFF offset macros built on OFF_MD;
//   - SRC0_/SRC1_BLOCK_READ{,2,4,8} macros, selected by SRC{0,1}_DT_* flags;
//   - DST_BLOCK_READ{,2,4,8} and DST_BLOCK_WRITE{,2,4,8} macros, selected by
//     DST_DT_* flags;
//   - ELEM_DATA_T (float, float2, float4 or float8, chosen from NVECT /
//     IS_PLAIN_LAYOUT) and get_eltwise_op(), whose operation is picked by the
//     IS_ADD / IS_MUL / ... / IS_NE flags;
//   - READ_DATA and WRITE_DATA macros that move `size` elements in chunks of
//     8, 4, 2 and 1.
//
// Every embedded line ends with a space before its newline, so macro
// continuation backslashes are followed by a space in the resulting text.
// NOTE(review): this relies on the consuming OpenCL compiler accepting
// backslash-space-newline as a line continuation -- confirm before changing
// how the text is generated. The string contents must stay byte-identical;
// edit the original header instead of this literal.
const char *binary_types_header = R"==(/******************************************************************************* )==""\n"
R"==(* Copyright 2022 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
R"==(* http: )==""\n"
R"==(* )==""\n"
R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
R"==(* See the License for the specific language governing permissions and )==""\n"
R"==(* limitations under the License. )==""\n"
R"==(*******************************************************************************/ )==""\n"
// Include guard, dependencies and offset macros.
R"==(#ifndef GPU_OCL_BINARY_TYPES_H )==""\n"
R"==(#define GPU_OCL_BINARY_TYPES_H )==""\n"
R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
R"==(#undef DST_OFF )==""\n"
R"==(#define DST_OFF(x0, x1, x2, x3, x4, x5) OFF_MD(DST, x0, x1, x2, x3, x4, x5) )==""\n"
R"==(#define SRC0_OFF(x0, x1, x2, x3, x4, x5) OFF_MD(SRC0, x0, x1, x2, x3, x4, x5) )==""\n"
R"==(#define SRC1_OFF(x0, x1, x2, x3, x4, x5) OFF_MD(SRC1, x0, x1, x2, x3, x4, x5) )==""\n"
// SRC0 / SRC1 block reads: s8.
R"==(#if SRC0_DT_S8 )==""\n"
R"==(#define SRC0_BLOCK_READ(src) \ )==""\n"
R"==(as_char(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ2(src) \ )==""\n"
R"==(as_char2(intel_sub_group_block_read_uc2((const __global uchar *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ4(src) \ )==""\n"
R"==(as_char4(intel_sub_group_block_read_uc4((const __global uchar *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ8(src) \ )==""\n"
R"==(as_char8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC1_DT_S8 )==""\n"
R"==(#define SRC1_BLOCK_READ(src) \ )==""\n"
R"==(as_char(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ2(src) \ )==""\n"
R"==(as_char2(intel_sub_group_block_read_uc2((const __global uchar *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ4(src) \ )==""\n"
R"==(as_char4(intel_sub_group_block_read_uc4((const __global uchar *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ8(src) \ )==""\n"
R"==(as_char8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#endif )==""\n"
// SRC0 / SRC1 block reads: u8.
R"==(#if SRC0_DT_U8 )==""\n"
R"==(#define SRC0_BLOCK_READ(src) \ )==""\n"
R"==(as_uchar(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ2(src) \ )==""\n"
R"==(as_uchar2(intel_sub_group_block_read_uc2((const __global uchar *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ4(src) \ )==""\n"
R"==(as_uchar4(intel_sub_group_block_read_uc4((const __global uchar *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ8(src) \ )==""\n"
R"==(as_uchar8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC1_DT_U8 )==""\n"
R"==(#define SRC1_BLOCK_READ(src) \ )==""\n"
R"==(as_uchar(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ2(src) \ )==""\n"
R"==(as_uchar2(intel_sub_group_block_read_uc2((const __global uchar *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ4(src) \ )==""\n"
R"==(as_uchar4(intel_sub_group_block_read_uc4((const __global uchar *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ8(src) \ )==""\n"
R"==(as_uchar8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#endif )==""\n"
// SRC0 / SRC1 block reads: f16.
R"==(#if SRC0_DT_F16 )==""\n"
R"==(#define SRC0_BLOCK_READ(src) \ )==""\n"
R"==(as_half(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ2(src) \ )==""\n"
R"==(as_half2(intel_sub_group_block_read_us2((const __global ushort *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ4(src) \ )==""\n"
R"==(as_half4(intel_sub_group_block_read_us4((const __global ushort *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ8(src) \ )==""\n"
R"==(as_half8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC1_DT_F16 )==""\n"
R"==(#define SRC1_BLOCK_READ(src) \ )==""\n"
R"==(as_half(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ2(src) \ )==""\n"
R"==(as_half2(intel_sub_group_block_read_us2((const __global ushort *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ4(src) \ )==""\n"
R"==(as_half4(intel_sub_group_block_read_us4((const __global ushort *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ8(src) \ )==""\n"
R"==(as_half8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#endif )==""\n"
// SRC0 / SRC1 block reads: s32.
R"==(#if SRC0_DT_S32 )==""\n"
R"==(#define SRC0_BLOCK_READ(src) \ )==""\n"
R"==(as_int(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ2(src) \ )==""\n"
R"==(as_int2(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ4(src) \ )==""\n"
R"==(as_int4(intel_sub_group_block_read4((const __global uint *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ8(src) \ )==""\n"
R"==(as_int8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC1_DT_S32 )==""\n"
R"==(#define SRC1_BLOCK_READ(src) \ )==""\n"
R"==(as_int(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ2(src) \ )==""\n"
R"==(as_int2(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ4(src) \ )==""\n"
R"==(as_int4(intel_sub_group_block_read4((const __global uint *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ8(src) \ )==""\n"
R"==(as_int8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#endif )==""\n"
// SRC0 / SRC1 block reads: f32.
R"==(#if SRC0_DT_F32 )==""\n"
R"==(#define SRC0_BLOCK_READ(src) \ )==""\n"
R"==(as_float(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ2(src) \ )==""\n"
R"==(as_float2(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ4(src) \ )==""\n"
R"==(as_float4(intel_sub_group_block_read4((const __global uint *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ8(src) \ )==""\n"
R"==(as_float8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC1_DT_F32 )==""\n"
R"==(#define SRC1_BLOCK_READ(src) \ )==""\n"
R"==(as_float(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ2(src) \ )==""\n"
R"==(as_float2(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ4(src) \ )==""\n"
R"==(as_float4(intel_sub_group_block_read4((const __global uint *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ8(src) \ )==""\n"
R"==(as_float8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#endif )==""\n"
// SRC0 / SRC1 block reads: bf16 (kept as raw ushort bits).
R"==(#if SRC0_DT_BF16 )==""\n"
R"==(#define SRC0_BLOCK_READ(src) \ )==""\n"
R"==(as_ushort(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ2(src) \ )==""\n"
R"==(as_ushort2(intel_sub_group_block_read_us2((const __global ushort *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ4(src) \ )==""\n"
R"==(as_ushort4(intel_sub_group_block_read_us4((const __global ushort *)(src))) )==""\n"
R"==(#define SRC0_BLOCK_READ8(src) \ )==""\n"
R"==(as_ushort8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC1_DT_BF16 )==""\n"
R"==(#define SRC1_BLOCK_READ(src) \ )==""\n"
R"==(as_ushort(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ2(src) \ )==""\n"
R"==(as_ushort2(intel_sub_group_block_read_us2((const __global ushort *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ4(src) \ )==""\n"
R"==(as_ushort4(intel_sub_group_block_read_us4((const __global ushort *)(src))) )==""\n"
R"==(#define SRC1_BLOCK_READ8(src) \ )==""\n"
R"==(as_ushort8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#endif )==""\n"
// DST block reads and writes: s8.
R"==(#if DST_DT_S8 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_char(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ2(src) \ )==""\n"
R"==(as_char2(intel_sub_group_block_read_uc2((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ4(src) \ )==""\n"
R"==(as_char4(intel_sub_group_block_read_uc4((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_char8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc((__global uchar *)(dst), as_uchar(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE2(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc2((__global uchar *)(dst), as_uchar2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE4(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc4((__global uchar *)(dst), as_uchar4(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
// DST block reads and writes: u8.
R"==(#if DST_DT_U8 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_uchar(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ2(src) \ )==""\n"
R"==(as_uchar2(intel_sub_group_block_read_uc2((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ4(src) \ )==""\n"
R"==(as_uchar4(intel_sub_group_block_read_uc4((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_uchar8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc((__global uchar *)(dst), as_uchar(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE2(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc2((__global uchar *)(dst), as_uchar2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE4(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc4((__global uchar *)(dst), as_uchar4(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
// DST block reads and writes: f16.
R"==(#if DST_DT_F16 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_half(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ2(src) \ )==""\n"
R"==(as_half2(intel_sub_group_block_read_us2((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ4(src) \ )==""\n"
R"==(as_half4(intel_sub_group_block_read_us4((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_half8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us((__global ushort *)(dst), as_ushort(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE2(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us2((__global ushort *)(dst), as_ushort2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE4(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us4((__global ushort *)(dst), as_ushort4(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
// DST block reads and writes: s32.
R"==(#if DST_DT_S32 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_int(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ2(src) \ )==""\n"
R"==(as_int2(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ4(src) \ )==""\n"
R"==(as_int4(intel_sub_group_block_read4((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_int8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write((__global uint *)(dst), as_uint(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE2(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write2((__global uint *)(dst), as_uint2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE4(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write4((__global uint *)(dst), as_uint4(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
// DST block reads and writes: f32.
R"==(#if DST_DT_F32 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_float(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ2(src) \ )==""\n"
R"==(as_float2(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ4(src) \ )==""\n"
R"==(as_float4(intel_sub_group_block_read4((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_float8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write((__global uint *)(dst), as_uint(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE2(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write2((__global uint *)(dst), as_uint2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE4(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write4((__global uint *)(dst), as_uint4(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
// DST block reads and writes: bf16 (kept as raw ushort bits).
R"==(#if DST_DT_BF16 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_ushort(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ2(src) \ )==""\n"
R"==(as_ushort2(intel_sub_group_block_read_us2((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ4(src) \ )==""\n"
R"==(as_ushort4(intel_sub_group_block_read_us4((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_ushort8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us((__global ushort *)(dst), as_ushort(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE2(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us2((__global ushort *)(dst), as_ushort2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE4(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us4((__global ushort *)(dst), as_ushort4(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
// ELEM_DATA_T selection; no definition is emitted when none of the branches
// below matches.
R"==(#if NVECT == 1 || IS_PLAIN_LAYOUT )==""\n"
R"==(#define ELEM_DATA_T float )==""\n"
R"==(#elif NVECT == 2 )==""\n"
R"==(#define ELEM_DATA_T float2 )==""\n"
R"==(#elif NVECT == 4 )==""\n"
R"==(#define ELEM_DATA_T float4 )==""\n"
R"==(#elif NVECT == 8 )==""\n"
R"==(#define ELEM_DATA_T float8 )==""\n"
R"==(#endif )==""\n"
// get_eltwise_op(): applies the operation chosen by the IS_* flags; returns
// the zero-initialised value when no flag is set.
R"==(ELEM_DATA_T get_eltwise_op(ELEM_DATA_T src0, ELEM_DATA_T src1) { )==""\n"
R"==(ELEM_DATA_T d = 0; )==""\n"
R"==(#if IS_ADD )==""\n"
R"==(d = src0 + src1; )==""\n"
R"==(#elif IS_MUL )==""\n"
R"==(d = src0 * src1; )==""\n"
R"==(#elif IS_MAX )==""\n"
R"==(d = max(src0, src1); )==""\n"
R"==(#elif IS_MIN )==""\n"
R"==(d = min(src0, src1); )==""\n"
R"==(#elif IS_DIV )==""\n"
R"==(d = src0 / src1; )==""\n"
R"==(#elif IS_SUB )==""\n"
R"==(d = src0 - src1; )==""\n"
R"==(#elif IS_GE )==""\n"
R"==(d = (src0 >= src1) ? 1.0f : 0.0f; )==""\n"
R"==(#elif IS_GT )==""\n"
R"==(d = (src0 > src1) ? 1.0f : 0.0f; )==""\n"
R"==(#elif IS_LE )==""\n"
R"==(d = (src0 <= src1) ? 1.0f : 0.0f; )==""\n"
R"==(#elif IS_LT )==""\n"
R"==(d = (src0 < src1) ? 1.0f : 0.0f; )==""\n"
R"==(#elif IS_EQ )==""\n"
R"==(d = (src0 == src1) ? 1.0f : 0.0f; )==""\n"
R"==(#elif IS_NE )==""\n"
R"==(d = (src0 != src1) ? 1.0f : 0.0f; )==""\n"
R"==(#endif )==""\n"
R"==(return d; )==""\n"
R"==(} )==""\n"
// READ_DATA: block-reads `size` elements in chunks of 8/4/2/1, converts them
// to float and multiplies by `scale` while storing into dest_ptr.
R"==(#define READ_DATA(size, name, source_ptr, dest_ptr, scale) \ )==""\n"
R"==({ \ )==""\n"
R"==(unsigned offset = 0; \ )==""\n"
R"==(unroll_for(unsigned j8 = 0; j8 < size / 8; ++j8) { \ )==""\n"
R"==(*((float8 *)(dest_ptr + offset)) = scale \ )==""\n"
R"==(* CONVERT_FLOAT8_T(CONCAT2(name, _BLOCK_READ8)( \ )==""\n"
R"==((source_ptr + offset * SUB_GROUP_SIZE))); \ )==""\n"
R"==(offset += 8; \ )==""\n"
R"==(} \ )==""\n"
R"==(if ((size % 8) / 4) { \ )==""\n"
R"==(*((float4 *)(dest_ptr + offset)) = scale \ )==""\n"
R"==(* CONVERT_FLOAT4_T(CONCAT2(name, _BLOCK_READ4)( \ )==""\n"
R"==((source_ptr + offset * SUB_GROUP_SIZE))); \ )==""\n"
R"==(offset += 4; \ )==""\n"
R"==(} \ )==""\n"
R"==(if ((size % 4) / 2) { \ )==""\n"
R"==(*((float2 *)(dest_ptr + offset)) = scale \ )==""\n"
R"==(* CONVERT_FLOAT2_T(CONCAT2(name, _BLOCK_READ2)( \ )==""\n"
R"==((source_ptr + offset * SUB_GROUP_SIZE))); \ )==""\n"
R"==(offset += 2; \ )==""\n"
R"==(} \ )==""\n"
R"==(if ((size % 2)) { \ )==""\n"
R"==(*((float *)(dest_ptr + offset)) = scale \ )==""\n"
R"==(* CONVERT_FLOAT_T(CONCAT2(name, _BLOCK_READ)( \ )==""\n"
R"==((source_ptr + offset * SUB_GROUP_SIZE))); \ )==""\n"
R"==(} \ )==""\n"
R"==(} )==""\n"
// WRITE_DATA: converts floats from source_ptr with TO_DST* and block-writes
// `size` elements in chunks of 8/4/2/1.
R"==(#define WRITE_DATA(size, name, source_ptr, dest_ptr) \ )==""\n"
R"==({ \ )==""\n"
R"==(unsigned offset = 0; \ )==""\n"
R"==(unroll_for(unsigned j8 = 0; j8 < size / 8; ++j8) { \ )==""\n"
R"==(CONCAT2(name, _BLOCK_WRITE8) \ )==""\n"
R"==(((dest_ptr + offset * SUB_GROUP_SIZE), \ )==""\n"
R"==(TO_DST8(*((float8 *)(source_ptr + offset)))); \ )==""\n"
R"==(offset += 8; \ )==""\n"
R"==(} \ )==""\n"
R"==(if ((size % 8) / 4) { \ )==""\n"
R"==(CONCAT2(name, _BLOCK_WRITE4) \ )==""\n"
R"==(((dest_ptr + offset * SUB_GROUP_SIZE), \ )==""\n"
R"==(TO_DST4(*((float4 *)(source_ptr + offset)))); \ )==""\n"
R"==(offset += 4; \ )==""\n"
R"==(} \ )==""\n"
R"==(if ((size % 4) / 2) { \ )==""\n"
R"==(CONCAT2(name, _BLOCK_WRITE2) \ )==""\n"
R"==(((dest_ptr + offset * SUB_GROUP_SIZE), \ )==""\n"
R"==(TO_DST2(*((float2 *)(source_ptr + offset)))); \ )==""\n"
R"==(offset += 2; \ )==""\n"
R"==(} \ )==""\n"
R"==(if ((size % 2)) { \ )==""\n"
R"==(CONCAT2(name, _BLOCK_WRITE) \ )==""\n"
R"==(((dest_ptr + offset * SUB_GROUP_SIZE), \ )==""\n"
R"==(TO_DST(*((float *)(source_ptr + offset)))); \ )==""\n"
R"==(} \ )==""\n"
R"==(} )==""\n"
R"==(#endif )==""\n"
R"==()==";

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl