1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *simple_concat_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2020-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#if DATA_TYPE_SIZE == 4 )==""\n"
21R"==(#define DATA_T uint )==""\n"
22R"==(#define DATA2_T uint2 )==""\n"
23R"==(#define DATA4_T uint4 )==""\n"
24R"==(#define DATA8_T uint8 )==""\n"
25R"==(#define BLOCK_READ intel_sub_group_block_read )==""\n"
26R"==(#define BLOCK_WRITE intel_sub_group_block_write )==""\n"
27R"==(#define BLOCK_READ2 intel_sub_group_block_read2 )==""\n"
28R"==(#define BLOCK_WRITE2 intel_sub_group_block_write2 )==""\n"
29R"==(#define BLOCK_READ4 intel_sub_group_block_read4 )==""\n"
30R"==(#define BLOCK_WRITE4 intel_sub_group_block_write4 )==""\n"
31R"==(#define BLOCK_READ8 intel_sub_group_block_read8 )==""\n"
32R"==(#define BLOCK_WRITE8 intel_sub_group_block_write8 )==""\n"
33R"==(#elif DATA_TYPE_SIZE == 2 )==""\n"
34R"==(#define DATA_T ushort )==""\n"
35R"==(#define DATA2_T ushort2 )==""\n"
36R"==(#define DATA4_T ushort4 )==""\n"
37R"==(#define DATA8_T ushort8 )==""\n"
38R"==(#define BLOCK_READ intel_sub_group_block_read_us )==""\n"
39R"==(#define BLOCK_WRITE intel_sub_group_block_write_us )==""\n"
40R"==(#define BLOCK_READ2 intel_sub_group_block_read_us2 )==""\n"
41R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_us2 )==""\n"
42R"==(#define BLOCK_READ4 intel_sub_group_block_read_us4 )==""\n"
43R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_us4 )==""\n"
44R"==(#define BLOCK_READ8 intel_sub_group_block_read_us8 )==""\n"
45R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_us8 )==""\n"
46R"==(#elif DATA_TYPE_SIZE == 1 )==""\n"
47R"==(#define DATA_T uchar )==""\n"
48R"==(#define DATA2_T uchar2 )==""\n"
49R"==(#define DATA4_T uchar4 )==""\n"
50R"==(#define DATA8_T uchar8 )==""\n"
51R"==(#define BLOCK_READ intel_sub_group_block_read_uc )==""\n"
52R"==(#define BLOCK_WRITE intel_sub_group_block_write_uc )==""\n"
53R"==(#define BLOCK_READ2 intel_sub_group_block_read_uc2 )==""\n"
54R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_uc2 )==""\n"
55R"==(#define BLOCK_READ4 intel_sub_group_block_read_uc4 )==""\n"
56R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_uc4 )==""\n"
57R"==(#define BLOCK_READ8 intel_sub_group_block_read_uc8 )==""\n"
58R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_uc8 )==""\n"
59R"==(#endif )==""\n"
60R"==(#define REDUCE_STAGE_1(cat, f) f(0) )==""\n"
61R"==(#define REDUCE_STAGE_2(cat, f) cat(REDUCE_STAGE_1(cat, f), f(1)) )==""\n"
62R"==(#define REDUCE_STAGE_3(cat, f) cat(REDUCE_STAGE_2(cat, f), f(2)) )==""\n"
63R"==(#define REDUCE_STAGE_4(cat, f) cat(REDUCE_STAGE_3(cat, f), f(3)) )==""\n"
64R"==(#define REDUCE_STAGE_5(cat, f) cat(REDUCE_STAGE_4(cat, f), f(4)) )==""\n"
65R"==(#define REDUCE_STAGE_6(cat, f) cat(REDUCE_STAGE_5(cat, f), f(5)) )==""\n"
66R"==(#define REDUCE_STAGE_7(cat, f) cat(REDUCE_STAGE_6(cat, f), f(6)) )==""\n"
67R"==(#define REDUCE_STAGE_8(cat, f) cat(REDUCE_STAGE_7(cat, f), f(7)) )==""\n"
68R"==(#define REDUCE_STAGE_9(cat, f) cat(REDUCE_STAGE_8(cat, f), f(8)) )==""\n"
69R"==(#define REDUCE_STAGE_10(cat, f) cat(REDUCE_STAGE_9(cat, f), f(9)) )==""\n"
70R"==(#define REDUCE_STAGE_11(cat, f) cat(REDUCE_STAGE_10(cat, f), f(10)) )==""\n"
71R"==(#define REDUCE_STAGE_12(cat, f) cat(REDUCE_STAGE_11(cat, f), f(11)) )==""\n"
72R"==(#define REDUCE_STAGE_13(cat, f) cat(REDUCE_STAGE_12(cat, f), f(12)) )==""\n"
73R"==(#define REDUCE_STAGE_14(cat, f) cat(REDUCE_STAGE_13(cat, f), f(13)) )==""\n"
74R"==(#define REDUCE_STAGE_15(cat, f) cat(REDUCE_STAGE_14(cat, f), f(14)) )==""\n"
75R"==(#define REDUCE_STAGE_16(cat, f) cat(REDUCE_STAGE_15(cat, f), f(15)) )==""\n"
76R"==(#define REDUCE_STAGE_17(cat, f) cat(REDUCE_STAGE_16(cat, f), f(16)) )==""\n"
77R"==(#define REDUCE_STAGE_18(cat, f) cat(REDUCE_STAGE_17(cat, f), f(17)) )==""\n"
78R"==(#define REDUCE_STAGE_19(cat, f) cat(REDUCE_STAGE_18(cat, f), f(18)) )==""\n"
79R"==(#define REDUCE_STAGE_20(cat, f) cat(REDUCE_STAGE_19(cat, f), f(19)) )==""\n"
80R"==(#define REDUCE_STAGE_21(cat, f) cat(REDUCE_STAGE_20(cat, f), f(20)) )==""\n"
81R"==(#define REDUCE_STAGE_22(cat, f) cat(REDUCE_STAGE_21(cat, f), f(21)) )==""\n"
82R"==(#define REDUCE_STAGE_23(cat, f) cat(REDUCE_STAGE_22(cat, f), f(22)) )==""\n"
83R"==(#define REDUCE_STAGE_24(cat, f) cat(REDUCE_STAGE_23(cat, f), f(23)) )==""\n"
84R"==(#define REDUCE_STAGE_25(cat, f) cat(REDUCE_STAGE_24(cat, f), f(24)) )==""\n"
85R"==(#define REDUCE_STAGE_26(cat, f) cat(REDUCE_STAGE_25(cat, f), f(25)) )==""\n"
86R"==(#define REDUCE_STAGE_27(cat, f) cat(REDUCE_STAGE_26(cat, f), f(26)) )==""\n"
87R"==(#define REDUCE_STAGE_28(cat, f) cat(REDUCE_STAGE_27(cat, f), f(27)) )==""\n"
88R"==(#define REDUCE_STAGE_29(cat, f) cat(REDUCE_STAGE_28(cat, f), f(28)) )==""\n"
89R"==(#define REDUCE_STAGE_30(cat, f) cat(REDUCE_STAGE_29(cat, f), f(29)) )==""\n"
90R"==(#define REDUCE_STAGE_31(cat, f) cat(REDUCE_STAGE_30(cat, f), f(30)) )==""\n"
91R"==(#define REDUCE_STAGE_32(cat, f) cat(REDUCE_STAGE_31(cat, f), f(31)) )==""\n"
92R"==(#define REDUCE_STAGE_33(cat, f) cat(REDUCE_STAGE_32(cat, f), f(32)) )==""\n"
93R"==(#define REDUCE_STAGE_34(cat, f) cat(REDUCE_STAGE_33(cat, f), f(33)) )==""\n"
94R"==(#define REDUCE_STAGE_35(cat, f) cat(REDUCE_STAGE_34(cat, f), f(34)) )==""\n"
95R"==(#define REDUCE_STAGE_36(cat, f) cat(REDUCE_STAGE_35(cat, f), f(35)) )==""\n"
96R"==(#define REDUCE_STAGE_37(cat, f) cat(REDUCE_STAGE_36(cat, f), f(36)) )==""\n"
97R"==(#define REDUCE_STAGE_38(cat, f) cat(REDUCE_STAGE_37(cat, f), f(37)) )==""\n"
98R"==(#define REDUCE_STAGE_39(cat, f) cat(REDUCE_STAGE_38(cat, f), f(38)) )==""\n"
99R"==(#define REDUCE_STAGE_40(cat, f) cat(REDUCE_STAGE_39(cat, f), f(39)) )==""\n"
100R"==(#define REDUCE_STAGE_41(cat, f) cat(REDUCE_STAGE_40(cat, f), f(40)) )==""\n"
101R"==(#define REDUCE_STAGE_42(cat, f) cat(REDUCE_STAGE_41(cat, f), f(41)) )==""\n"
102R"==(#define REDUCE_STAGE_43(cat, f) cat(REDUCE_STAGE_42(cat, f), f(42)) )==""\n"
103R"==(#define REDUCE_STAGE_44(cat, f) cat(REDUCE_STAGE_43(cat, f), f(43)) )==""\n"
104R"==(#define REDUCE_STAGE_45(cat, f) cat(REDUCE_STAGE_44(cat, f), f(44)) )==""\n"
105R"==(#define REDUCE_STAGE_46(cat, f) cat(REDUCE_STAGE_45(cat, f), f(45)) )==""\n"
106R"==(#define REDUCE_STAGE_47(cat, f) cat(REDUCE_STAGE_46(cat, f), f(46)) )==""\n"
107R"==(#define REDUCE_STAGE_48(cat, f) cat(REDUCE_STAGE_47(cat, f), f(47)) )==""\n"
108R"==(#define REDUCE_STAGE_49(cat, f) cat(REDUCE_STAGE_48(cat, f), f(48)) )==""\n"
109R"==(#define REDUCE_STAGE_50(cat, f) cat(REDUCE_STAGE_49(cat, f), f(49)) )==""\n"
110R"==(#define REDUCE_STAGE_51(cat, f) cat(REDUCE_STAGE_50(cat, f), f(50)) )==""\n"
111R"==(#define REDUCE_STAGE_52(cat, f) cat(REDUCE_STAGE_51(cat, f), f(51)) )==""\n"
112R"==(#define REDUCE_STAGE_53(cat, f) cat(REDUCE_STAGE_52(cat, f), f(52)) )==""\n"
113R"==(#define REDUCE_STAGE_54(cat, f) cat(REDUCE_STAGE_53(cat, f), f(53)) )==""\n"
114R"==(#define REDUCE_STAGE_55(cat, f) cat(REDUCE_STAGE_54(cat, f), f(54)) )==""\n"
115R"==(#define REDUCE_STAGE_56(cat, f) cat(REDUCE_STAGE_55(cat, f), f(55)) )==""\n"
116R"==(#define REDUCE_STAGE_57(cat, f) cat(REDUCE_STAGE_56(cat, f), f(56)) )==""\n"
117R"==(#define REDUCE_STAGE_58(cat, f) cat(REDUCE_STAGE_57(cat, f), f(57)) )==""\n"
118R"==(#define REDUCE_STAGE_59(cat, f) cat(REDUCE_STAGE_58(cat, f), f(58)) )==""\n"
119R"==(#define REDUCE_STAGE_60(cat, f) cat(REDUCE_STAGE_59(cat, f), f(59)) )==""\n"
120R"==(#define REDUCE_STAGE_61(cat, f) cat(REDUCE_STAGE_60(cat, f), f(60)) )==""\n"
121R"==(#define REDUCE_STAGE_62(cat, f) cat(REDUCE_STAGE_61(cat, f), f(61)) )==""\n"
122R"==(#define REDUCE_STAGE_63(cat, f) cat(REDUCE_STAGE_62(cat, f), f(62)) )==""\n"
123R"==(#define REDUCE_STAGE_64(cat, f) cat(REDUCE_STAGE_63(cat, f), f(63)) )==""\n"
124R"==(#define REDUCE2(n, cat, f) REDUCE_STAGE_##n(cat, f) )==""\n"
125R"==(#define REDUCE(n, cat, f) REDUCE2(n, cat, f) )==""\n"
126R"==(#define JOIN_COMMA(x, y) x, y )==""\n"
127R"==(#define SRC_PTR(n) __global const DATA_T *src##n )==""\n"
128R"==(#define SRC_PTRS REDUCE(N_INPUTS, JOIN_COMMA, SRC_PTR) )==""\n"
129R"==(#define JOIN_ELSE(x, y) y else x )==""\n"
130R"==(#define CHECK_AND_GET(n) \ )==""\n"
131R"==(if (get_global_id(2) >= OFFSET##n) \ )==""\n"
132R"==(src = src##n + get_global_id(1) * SRC##n##_EXT_OFFSET + x \ )==""\n"
133R"==(- OFFSET##n * INNER_OFFSET; )==""\n"
134R"==(#define SET_SRC REDUCE(N_INPUTS, JOIN_ELSE, CHECK_AND_GET) )==""\n"
135R"==(#if BLOCK != 1 )==""\n"
136R"==(__attribute__((intel_reqd_sub_group_size(SIMD))) )==""\n"
137R"==(#endif )==""\n"
138R"==(__kernel void )==""\n"
139R"==(simple_concat(__global DATA_T *dst, long dst_offset0, SRC_PTRS) { )==""\n"
140R"==(DATA8_T A0, A1, A2, A3; )==""\n"
141R"==(DATA_T B; )==""\n"
142R"==(DATA2_T C; )==""\n"
143R"==(DATA4_T D; )==""\n"
144R"==(const size_t x = (get_global_id(0) / SIMD) * BLOCK )==""\n"
145R"==(+ get_global_id(2) * INNER_OFFSET; )==""\n"
146R"==(__global const DATA_T *src; )==""\n"
147R"==(SET_SRC; )==""\n"
148R"==(#if BLOCK == 1 )==""\n"
149R"==(B = src[0]; )==""\n"
150R"==(#elif BLOCK == SIMD )==""\n"
151R"==(B = BLOCK_READ(src); )==""\n"
152R"==(#elif BLOCK == 2 * SIMD )==""\n"
153R"==(C = BLOCK_READ2(src); )==""\n"
154R"==(#elif BLOCK == 3 * SIMD )==""\n"
155R"==(C = BLOCK_READ2(src); )==""\n"
156R"==(B = BLOCK_READ(&src[2 * SIMD]); )==""\n"
157R"==(#elif BLOCK == 4 * SIMD )==""\n"
158R"==(D = BLOCK_READ4(src); )==""\n"
159R"==(#elif BLOCK == 5 * SIMD )==""\n"
160R"==(D = BLOCK_READ4(src); )==""\n"
161R"==(B = BLOCK_READ(&src[4 * SIMD]); )==""\n"
162R"==(#elif BLOCK == 6 * SIMD )==""\n"
163R"==(D = BLOCK_READ4(src); )==""\n"
164R"==(C = BLOCK_READ2(&src[4 * SIMD]); )==""\n"
165R"==(#elif BLOCK == 7 * SIMD )==""\n"
166R"==(B = BLOCK_READ(src); )==""\n"
167R"==(C = BLOCK_READ2(&src[SIMD]); )==""\n"
168R"==(D = BLOCK_READ4(&src[3 * SIMD]); )==""\n"
169R"==(#elif BLOCK >= 8 * SIMD )==""\n"
170R"==(A0 = BLOCK_READ8(src); )==""\n"
171R"==(#elif BLOCK >= 16 * SIMD )==""\n"
172R"==(A1 = BLOCK_READ8(&src[8 * SIMD]); )==""\n"
173R"==(#elif BLOCK >= 24 * SIMD )==""\n"
174R"==(A2 = BLOCK_READ8(&src[16 * SIMD]); )==""\n"
175R"==(#elif BLOCK >= 32 * SIMD )==""\n"
176R"==(A3 = BLOCK_READ8(&src[24 * SIMD]); )==""\n"
177R"==(#endif )==""\n"
178R"==(dst += dst_offset0 + get_global_id(1) * DST_EXT_OFFSET + x; )==""\n"
179R"==(#if BLOCK == 1 )==""\n"
180R"==(dst[0] = B; )==""\n"
181R"==(#elif BLOCK == SIMD )==""\n"
182R"==(BLOCK_WRITE(dst, B); )==""\n"
183R"==(#elif BLOCK == 2 * SIMD )==""\n"
184R"==(BLOCK_WRITE2(dst, C); )==""\n"
185R"==(#elif BLOCK == 3 * SIMD )==""\n"
186R"==(BLOCK_WRITE2(dst, C); )==""\n"
187R"==(BLOCK_WRITE(&dst[2 * SIMD], B); )==""\n"
188R"==(#elif BLOCK == 4 * SIMD )==""\n"
189R"==(BLOCK_WRITE4(dst, D); )==""\n"
190R"==(#elif BLOCK == 5 * SIMD )==""\n"
191R"==(BLOCK_WRITE4(dst, D); )==""\n"
192R"==(BLOCK_WRITE(&dst[4 * SIMD], B); )==""\n"
193R"==(#elif BLOCK == 6 * SIMD )==""\n"
194R"==(BLOCK_WRITE4(dst, D); )==""\n"
195R"==(BLOCK_WRITE2(&dst[4 * SIMD], C); )==""\n"
196R"==(#elif BLOCK == 7 * SIMD )==""\n"
197R"==(BLOCK_WRITE(dst, B); )==""\n"
198R"==(BLOCK_WRITE2(&dst[SIMD], C); )==""\n"
199R"==(BLOCK_WRITE4(&dst[3 * SIMD], D); )==""\n"
200R"==(#elif BLOCK >= 8 * SIMD )==""\n"
201R"==(BLOCK_WRITE8(dst, A0); )==""\n"
202R"==(#elif BLOCK >= 16 * SIMD )==""\n"
203R"==(BLOCK_WRITE8(&dst[8 * SIMD], A1); )==""\n"
204R"==(#elif BLOCK >= 24 * SIMD )==""\n"
205R"==(BLOCK_WRITE8(&dst[16 * SIMD], A2); )==""\n"
206R"==(#elif BLOCK >= 32 * SIMD )==""\n"
207R"==(BLOCK_WRITE8(&dst[24 * SIMD], A3); )==""\n"
208R"==(#endif )==""\n"
209R"==(} )==""\n"
210R"==()==";
211}
212}
213}
214}