namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
// OpenCL C text of the shared reorder helper header, stored as a single
// NUL-terminated string. Every line of the header is its own raw string
// literal followed by "\n"; adjacent literals are concatenated by the
// compiler, so the C++ comments placed between pieces below are NOT part of
// the resulting string.
const char *reorder_common_header = R"==(/******************************************************************************* )==""\n"
R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
R"==(* http: )==""\n"
R"==(* )==""\n"
R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
R"==(* See the License for the specific language governing permissions and )==""\n"
R"==(* limitations under the License. )==""\n"
R"==(*******************************************************************************/ )==""\n"
R"==(#undef cl_future_bf16_cvt )==""\n"
R"==(#define DT_UNDEF 1 )==""\n"
R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
// Enable the fp16 / fp64 extensions only when the source or destination data
// type requires them.
R"==(#if SRC_DT_F16 || DST_DT_F16 )==""\n"
R"==(#pragma OPENCL EXTENSION cl_khr_fp16 : enable )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC_DT_F64 || DST_DT_F64 )==""\n"
R"==(#pragma OPENCL EXTENSION cl_khr_fp64 : enable )==""\n"
R"==(#endif )==""\n"
// Offset helpers: redefine SRC_OFF / DST_OFF as six-coordinate wrappers around
// OFF_MD (defined outside this text). The *_G variants pass a group index as
// the first coordinate followed by five more coordinates.
R"==(#undef SRC_OFF )==""\n"
R"==(#undef DST_OFF )==""\n"
R"==(#define SRC_OFF(x0, x1, x2, x3, x4, x5) \ )==""\n"
R"==(OFF_MD(SRC, (x0), (x1), (x2), (x3), (x4), (x5)) )==""\n"
R"==(#define DST_OFF(x0, x1, x2, x3, x4, x5) \ )==""\n"
R"==(OFF_MD(DST, (x0), (x1), (x2), (x3), (x4), (x5)) )==""\n"
R"==(#define SRC_OFF_G(gr, x0, x1, x2, x3, x4) \ )==""\n"
R"==(OFF_MD(SRC, gr, (x0), (x1), (x2), (x3), (x4)) )==""\n"
R"==(#define DST_OFF_G(gr, x0, x1, x2, x3, x4) \ )==""\n"
R"==(OFF_MD(DST, gr, (x0), (x1), (x2), (x3), (x4)) )==""\n"
// SRC_BLOCK_READ[8] / SRC_BLOCK_WRITE[8]: sub-group block I/O wrappers, one set
// per source data type. Data is transferred as unsigned integers of matching
// width (uchar / ushort / uint) and reinterpreted with as_<type>().
R"==(#if SRC_DT_S8 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_char(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==(as_char8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc((__global uchar *)(dst), as_uchar(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC_DT_U8 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_uchar(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==(as_uchar8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc((__global uchar *)(dst), as_uchar(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC_DT_F16 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_half(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==(as_half8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us((__global ushort *)(dst), as_ushort(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC_DT_S32 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_int(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==(as_int8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write((__global uint *)(dst), as_uint(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if SRC_DT_F32 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_float(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==(as_float8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write((__global uint *)(dst), as_uint(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
// f64 is moved as pairs of uint. The 8-wide variants are split into two
// block_read8 / block_write8 calls: the low half at the given pointer and the
// high half at pointer + 8.
R"==(#if SRC_DT_F64 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_double(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==((double8)((as_double4(intel_sub_group_block_read8( \ )==""\n"
R"==((const __global uint *)(src)))), \ )==""\n"
R"==((as_double4(intel_sub_group_block_read8( \ )==""\n"
R"==((const __global uint *)(src + 8))))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write2((__global uint *)(dst), as_uint2(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(do { \ )==""\n"
R"==(intel_sub_group_block_write8( \ )==""\n"
R"==((__global uint *)(dst), as_uint8(val.lo)); \ )==""\n"
R"==(intel_sub_group_block_write8( \ )==""\n"
R"==((__global uint *)(dst + 8), as_uint8(val.hi)); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#endif )==""\n"
// bf16 values are kept as raw ushort bit patterns by these wrappers.
R"==(#if SRC_DT_BF16 )==""\n"
R"==(#define SRC_BLOCK_READ(src) \ )==""\n"
R"==(as_ushort(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define SRC_BLOCK_READ8(src) \ )==""\n"
R"==(as_ushort8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define SRC_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us((__global ushort *)(dst), as_ushort(val)) )==""\n"
R"==(#define SRC_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
// DST_BLOCK_READ[8] / DST_BLOCK_WRITE[8]: the same wrappers, selected by the
// destination data type.
R"==(#if DST_DT_S8 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_char(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_char8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc((__global uchar *)(dst), as_uchar(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_U8 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_uchar(intel_sub_group_block_read_uc((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_uchar8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc((__global uchar *)(dst), as_uchar(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_F16 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_half(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_half8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us((__global ushort *)(dst), as_ushort(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_S32 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_int(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_int8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write((__global uint *)(dst), as_uint(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_F32 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_float(intel_sub_group_block_read((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_float8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write((__global uint *)(dst), as_uint(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
// f64 destination: same two-call split as the source variant. The second read
// must start at src + 8 so that the high half is not a copy of the low half;
// this matches SRC_BLOCK_READ8 and the dst + 8 used by DST_BLOCK_WRITE8.
R"==(#if DST_DT_F64 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_double(intel_sub_group_block_read2((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==((double8)((as_double4(intel_sub_group_block_read8( \ )==""\n"
R"==((const __global uint *)(src)))), \ )==""\n"
R"==((as_double4(intel_sub_group_block_read8( \ )==""\n"
R"==((const __global uint *)(src + 8))))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write2((__global uint *)(dst), as_uint2(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(do { \ )==""\n"
R"==(intel_sub_group_block_write8( \ )==""\n"
R"==((__global uint *)(dst), as_uint8(val.lo)); \ )==""\n"
R"==(intel_sub_group_block_write8( \ )==""\n"
R"==((__global uint *)(dst + 8), as_uint8(val.hi)); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_BF16 )==""\n"
R"==(#define DST_BLOCK_READ(src) \ )==""\n"
R"==(as_ushort(intel_sub_group_block_read_us((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_ushort8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us((__global ushort *)(dst), as_ushort(val)) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
// SRC_TO_DST[8]: identity when source and destination share a data type,
// otherwise a conversion through SRC_TO_REF / TO_DST (defined outside this
// text).
R"==(#if (SRC_DT_S8 && DST_DT_S8) || (SRC_DT_U8 && DST_DT_U8) \ )==""\n"
R"==(|| (SRC_DT_BF16 && DST_DT_BF16) || (SRC_DT_F16 && DST_DT_F16) \ )==""\n"
R"==(|| (SRC_DT_F32 && DST_DT_F32) || (SRC_DT_S32 && DST_DT_S32) \ )==""\n"
R"==(|| (SRC_DT_F64 && DST_DT_F64) )==""\n"
R"==(#define SRC_TO_DST(x) (x) )==""\n"
R"==(#define SRC_TO_DST8(x) (x) )==""\n"
R"==(#else )==""\n"
R"==(#define SRC_TO_DST(x) TO_DST(SRC_TO_REF(x)) )==""\n"
R"==(#define SRC_TO_DST8(x) TO_DST8(SRC_TO_REF8(x)) )==""\n"
R"==(#endif )==""\n"
// Scale application: source and sum scales multiply, the destination scale
// divides, and each collapses to a pass-through when its WITH_*_SCALE flag is
// off.
R"==(#define SCALE_MUL(x, s) (x) * (s) )==""\n"
R"==(#define SCALE_DIV(x, s) (x) / (s) )==""\n"
R"==(#define SCALE_NONE(x, s) (x) )==""\n"
R"==(#if WITH_SRC_SCALE )==""\n"
R"==(#define SRC_SCALE SCALE_MUL )==""\n"
R"==(#else )==""\n"
R"==(#define SRC_SCALE SCALE_NONE )==""\n"
R"==(#endif )==""\n"
R"==(#if WITH_DST_SCALE )==""\n"
R"==(#define DST_SCALE SCALE_DIV )==""\n"
R"==(#else )==""\n"
R"==(#define DST_SCALE SCALE_NONE )==""\n"
R"==(#endif )==""\n"
R"==(#if WITH_SUM_SCALE )==""\n"
R"==(#define SUM_SCALE SCALE_MUL )==""\n"
R"==(#else )==""\n"
R"==(#define SUM_SCALE SCALE_NONE )==""\n"
R"==(#endif )==""\n"
// Zero-point handling: source and sum values have the zero point subtracted,
// destination values have it added. GET_SRC_ZP / GET_DST_ZP index element 0 of
// their argument, GET_SUM_ZP uses the argument itself, and all three evaluate
// to 0 when the matching WITH_*_ZPOINT flag is off.
R"==(#define ZP_SHIFT(x, x0) (x) - (float)(x0) )==""\n"
R"==(#define ZP_UNSHIFT(x, x0) (x) + (float)(x0) )==""\n"
R"==(#define ZP_NO_SHIFT(x, x0) (x) )==""\n"
R"==(#define ZP_READ_VAL(x) x )==""\n"
R"==(#define ZP_READ_PTR(x) x[0] )==""\n"
R"==(#define ZP_ZERO(x) 0 )==""\n"
R"==(#if WITH_SRC_ZPOINT )==""\n"
R"==(#define SRC_SHIFT ZP_SHIFT )==""\n"
R"==(#define GET_SRC_ZP ZP_READ_PTR )==""\n"
R"==(#else )==""\n"
R"==(#define SRC_SHIFT ZP_NO_SHIFT )==""\n"
R"==(#define GET_SRC_ZP ZP_ZERO )==""\n"
R"==(#endif )==""\n"
R"==(#if WITH_DST_ZPOINT )==""\n"
R"==(#define DST_SHIFT ZP_UNSHIFT )==""\n"
R"==(#define GET_DST_ZP ZP_READ_PTR )==""\n"
R"==(#else )==""\n"
R"==(#define DST_SHIFT ZP_NO_SHIFT )==""\n"
R"==(#define GET_DST_ZP ZP_ZERO )==""\n"
R"==(#endif )==""\n"
R"==(#if WITH_SUM_ZPOINT )==""\n"
R"==(#define SUM_SHIFT ZP_SHIFT )==""\n"
R"==(#define GET_SUM_ZP ZP_READ_VAL )==""\n"
R"==(#else )==""\n"
R"==(#define SUM_SHIFT ZP_NO_SHIFT )==""\n"
R"==(#define GET_SUM_ZP ZP_ZERO )==""\n"
R"==(#endif )==""\n"
// AXPY: the source term is shifted by x0 then scaled by a; the sum term is
// shifted by (y0 + z0) then scaled by c; their total is scaled by b (division)
// and finally shifted by y0. Each step applies only when the corresponding
// WITH_* flag selected a non-trivial macro above.
R"==(#define SRC_AXPY(a, x, x0) SRC_SCALE(SRC_SHIFT(x, x0), a) )==""\n"
R"==(#define DST_AXPY(a, x, x0) DST_SHIFT(DST_SCALE(x, a), x0) )==""\n"
R"==(#define SUM_AXPY(a, x, x0) SUM_SCALE(SUM_SHIFT(x, x0), a) )==""\n"
R"==(#define AXPY(src, dst, a, b, c, x0, y0, z0) \ )==""\n"
R"==(DST_AXPY(b, (SRC_AXPY(a, src, x0) + SUM_AXPY(c, dst, y0 + z0)), y0) )==""\n"
R"==(#if WITH_SRC_SCALE || WITH_SRC_ZPOINT )==""\n"
R"==(#define WITH_SRC_MOD 1 )==""\n"
R"==(#endif )==""\n"
R"==(#if WITH_DST_SCALE || WITH_DST_ZPOINT )==""\n"
R"==(#define WITH_DST_MOD 1 )==""\n"
R"==(#endif )==""\n"
R"==(#if WITH_SUM_SCALE || WITH_SUM_ZPOINT )==""\n"
R"==(#define WITH_SUM_MOD 1 )==""\n"
R"==(#endif )==""\n"
// REORDER / REORDER8 come in three flavours:
//   - WITH_SUM_MOD: the current _dst value is converted and fed into AXPY;
//   - only source/destination modifiers: AXPY is evaluated with 0.f as the
//     destination term;
//   - no modifiers: a plain SRC_TO_DST conversion.
R"==(#if WITH_SUM_MOD )==""\n"
R"==(#define REORDER(_dst, _src, _a, _b, _c, _x0, _y0, _z0) \ )==""\n"
R"==(do { \ )==""\n"
R"==(const float _x = SRC_TO_REF(_src); \ )==""\n"
R"==(const float _y = DST_TO_REF(_dst); \ )==""\n"
R"==(const float _s = AXPY(_x, _y, _a, _b, _c, _x0, _y0, _z0); \ )==""\n"
R"==(_dst = TO_DST(_s); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#define REORDER8(_dst, _src, _a, _b, _c, _x0, _y0, _z0) \ )==""\n"
R"==(do { \ )==""\n"
R"==(const float8 _x = convert_float8(SRC_TO_REF8(_src)); \ )==""\n"
R"==(const float8 _y = convert_float8(DST_TO_REF8(_dst)); \ )==""\n"
R"==(const float8 _s = AXPY(_x, _y, _a, _b, _c, _x0, _y0, _z0); \ )==""\n"
R"==(_dst = TO_DST8(_s); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#elif WITH_SRC_MOD || WITH_DST_MOD )==""\n"
R"==(#define REORDER(_dst, _src, _a, _b, _c, _x0, _y0, _z0) \ )==""\n"
R"==(do { \ )==""\n"
R"==(const float _x = SRC_TO_REF(_src); \ )==""\n"
R"==(const float _s = AXPY(_x, 0.f, _a, _b, _c, _x0, _y0, _z0); \ )==""\n"
R"==(_dst = TO_DST(_s); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#define REORDER8(_dst, _src, _a, _b, _c, _x0, _y0, _z0) \ )==""\n"
R"==(do { \ )==""\n"
R"==(const float8 _x = convert_float8(SRC_TO_REF8(_src)); \ )==""\n"
R"==(const float8 _s = AXPY(_x, 0.f, _a, _b, _c, _x0, _y0, _z0); \ )==""\n"
R"==(_dst = TO_DST8(_s); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#else )==""\n"
R"==(#define REORDER(_dst, _src, _a, _b, _c, _x0, _y0, _z0) \ )==""\n"
R"==(do { \ )==""\n"
R"==(_dst = SRC_TO_DST(_src); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#define REORDER8(_dst, _src, _a, _b, _c, _x0, _y0, _z0) \ )==""\n"
R"==(do { \ )==""\n"
R"==(_dst = SRC_TO_DST8(_src); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#endif )==""\n"
// SCALE_OFF: linear offset built from six coordinates. A dimension contributes
// only when its bit is set in <prefix>_SCALE_MASK; its stride is the product of
// the masked sizes (<prefix>_D<dim>, or 1 when unmasked) of all later
// dimensions.
R"==(#if WITH_SRC_SCALE || WITH_DST_SCALE )==""\n"
R"==(#define MASK_DIM(prefix, dim) ((CONCAT2(prefix, _SCALE_MASK) >> dim) & 1) )==""\n"
R"==(#define SCALE_DIM(prefix, dim) \ )==""\n"
R"==((MASK_DIM(prefix, dim) ? CONCAT3(prefix, _D, dim) : 1) )==""\n"
R"==(#define SCALE_S5(prefix) (1) )==""\n"
R"==(#define SCALE_S4(prefix) (SCALE_DIM(prefix, 5) * SCALE_S5(prefix)) )==""\n"
R"==(#define SCALE_S3(prefix) (SCALE_DIM(prefix, 4) * SCALE_S4(prefix)) )==""\n"
R"==(#define SCALE_S2(prefix) (SCALE_DIM(prefix, 3) * SCALE_S3(prefix)) )==""\n"
R"==(#define SCALE_S1(prefix) (SCALE_DIM(prefix, 2) * SCALE_S2(prefix)) )==""\n"
R"==(#define SCALE_S0(prefix) (SCALE_DIM(prefix, 1) * SCALE_S1(prefix)) )==""\n"
R"==(#define SCALE_STRIDE(prefix, dim) \ )==""\n"
R"==((CONCAT2(SCALE_S, dim)(prefix) * MASK_DIM(prefix, dim)) )==""\n"
R"==(#define SCALE_OFF(prefix, x0, x1, x2, x3, x4, x5) \ )==""\n"
R"==(((x0)*SCALE_STRIDE(prefix, 0) + (x1)*SCALE_STRIDE(prefix, 1) \ )==""\n"
R"==(+ (x2)*SCALE_STRIDE(prefix, 2) + (x3)*SCALE_STRIDE(prefix, 3) \ )==""\n"
R"==(+ (x4)*SCALE_STRIDE(prefix, 4) + (x5)*SCALE_STRIDE(prefix, 5)) )==""\n"
R"==(#endif )==""\n"
R"==()==";
}
}
}
}