1 | namespace dnnl { |
2 | namespace impl { |
3 | namespace gpu { |
4 | namespace ocl { |
5 | const char *simple_concat_kernel = R"==(/******************************************************************************* )==" "\n" |
6 | R"==(* Copyright 2020-2022 Intel Corporation )==" "\n" |
7 | R"==(* )==" "\n" |
8 | R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==" "\n" |
9 | R"==(* you may not use this file except in compliance with the License. )==" "\n" |
10 | R"==(* You may obtain a copy of the License at )==" "\n" |
11 | R"==(* )==" "\n" |
12 | R"==(* http: )==" "\n" |
13 | R"==(* )==" "\n" |
14 | R"==(* Unless required by applicable law or agreed to in writing, software )==" "\n" |
15 | R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==" "\n" |
16 | R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==" "\n" |
17 | R"==(* See the License for the specific language governing permissions and )==" "\n" |
18 | R"==(* limitations under the License. )==" "\n" |
19 | R"==(*******************************************************************************/ )==" "\n" |
20 | R"==(#if DATA_TYPE_SIZE == 4 )==" "\n" |
21 | R"==(#define DATA_T uint )==" "\n" |
22 | R"==(#define DATA2_T uint2 )==" "\n" |
23 | R"==(#define DATA4_T uint4 )==" "\n" |
24 | R"==(#define DATA8_T uint8 )==" "\n" |
25 | R"==(#define BLOCK_READ intel_sub_group_block_read )==" "\n" |
26 | R"==(#define BLOCK_WRITE intel_sub_group_block_write )==" "\n" |
27 | R"==(#define BLOCK_READ2 intel_sub_group_block_read2 )==" "\n" |
28 | R"==(#define BLOCK_WRITE2 intel_sub_group_block_write2 )==" "\n" |
29 | R"==(#define BLOCK_READ4 intel_sub_group_block_read4 )==" "\n" |
30 | R"==(#define BLOCK_WRITE4 intel_sub_group_block_write4 )==" "\n" |
31 | R"==(#define BLOCK_READ8 intel_sub_group_block_read8 )==" "\n" |
32 | R"==(#define BLOCK_WRITE8 intel_sub_group_block_write8 )==" "\n" |
33 | R"==(#elif DATA_TYPE_SIZE == 2 )==" "\n" |
34 | R"==(#define DATA_T ushort )==" "\n" |
35 | R"==(#define DATA2_T ushort2 )==" "\n" |
36 | R"==(#define DATA4_T ushort4 )==" "\n" |
37 | R"==(#define DATA8_T ushort8 )==" "\n" |
38 | R"==(#define BLOCK_READ intel_sub_group_block_read_us )==" "\n" |
39 | R"==(#define BLOCK_WRITE intel_sub_group_block_write_us )==" "\n" |
40 | R"==(#define BLOCK_READ2 intel_sub_group_block_read_us2 )==" "\n" |
41 | R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_us2 )==" "\n" |
42 | R"==(#define BLOCK_READ4 intel_sub_group_block_read_us4 )==" "\n" |
43 | R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_us4 )==" "\n" |
44 | R"==(#define BLOCK_READ8 intel_sub_group_block_read_us8 )==" "\n" |
45 | R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_us8 )==" "\n" |
46 | R"==(#elif DATA_TYPE_SIZE == 1 )==" "\n" |
47 | R"==(#define DATA_T uchar )==" "\n" |
48 | R"==(#define DATA2_T uchar2 )==" "\n" |
49 | R"==(#define DATA4_T uchar4 )==" "\n" |
50 | R"==(#define DATA8_T uchar8 )==" "\n" |
51 | R"==(#define BLOCK_READ intel_sub_group_block_read_uc )==" "\n" |
52 | R"==(#define BLOCK_WRITE intel_sub_group_block_write_uc )==" "\n" |
53 | R"==(#define BLOCK_READ2 intel_sub_group_block_read_uc2 )==" "\n" |
54 | R"==(#define BLOCK_WRITE2 intel_sub_group_block_write_uc2 )==" "\n" |
55 | R"==(#define BLOCK_READ4 intel_sub_group_block_read_uc4 )==" "\n" |
56 | R"==(#define BLOCK_WRITE4 intel_sub_group_block_write_uc4 )==" "\n" |
57 | R"==(#define BLOCK_READ8 intel_sub_group_block_read_uc8 )==" "\n" |
58 | R"==(#define BLOCK_WRITE8 intel_sub_group_block_write_uc8 )==" "\n" |
59 | R"==(#endif )==" "\n" |
60 | R"==(#define REDUCE_STAGE_1(cat, f) f(0) )==" "\n" |
61 | R"==(#define REDUCE_STAGE_2(cat, f) cat(REDUCE_STAGE_1(cat, f), f(1)) )==" "\n" |
62 | R"==(#define REDUCE_STAGE_3(cat, f) cat(REDUCE_STAGE_2(cat, f), f(2)) )==" "\n" |
63 | R"==(#define REDUCE_STAGE_4(cat, f) cat(REDUCE_STAGE_3(cat, f), f(3)) )==" "\n" |
64 | R"==(#define REDUCE_STAGE_5(cat, f) cat(REDUCE_STAGE_4(cat, f), f(4)) )==" "\n" |
65 | R"==(#define REDUCE_STAGE_6(cat, f) cat(REDUCE_STAGE_5(cat, f), f(5)) )==" "\n" |
66 | R"==(#define REDUCE_STAGE_7(cat, f) cat(REDUCE_STAGE_6(cat, f), f(6)) )==" "\n" |
67 | R"==(#define REDUCE_STAGE_8(cat, f) cat(REDUCE_STAGE_7(cat, f), f(7)) )==" "\n" |
68 | R"==(#define REDUCE_STAGE_9(cat, f) cat(REDUCE_STAGE_8(cat, f), f(8)) )==" "\n" |
69 | R"==(#define REDUCE_STAGE_10(cat, f) cat(REDUCE_STAGE_9(cat, f), f(9)) )==" "\n" |
70 | R"==(#define REDUCE_STAGE_11(cat, f) cat(REDUCE_STAGE_10(cat, f), f(10)) )==" "\n" |
71 | R"==(#define REDUCE_STAGE_12(cat, f) cat(REDUCE_STAGE_11(cat, f), f(11)) )==" "\n" |
72 | R"==(#define REDUCE_STAGE_13(cat, f) cat(REDUCE_STAGE_12(cat, f), f(12)) )==" "\n" |
73 | R"==(#define REDUCE_STAGE_14(cat, f) cat(REDUCE_STAGE_13(cat, f), f(13)) )==" "\n" |
74 | R"==(#define REDUCE_STAGE_15(cat, f) cat(REDUCE_STAGE_14(cat, f), f(14)) )==" "\n" |
75 | R"==(#define REDUCE_STAGE_16(cat, f) cat(REDUCE_STAGE_15(cat, f), f(15)) )==" "\n" |
76 | R"==(#define REDUCE_STAGE_17(cat, f) cat(REDUCE_STAGE_16(cat, f), f(16)) )==" "\n" |
77 | R"==(#define REDUCE_STAGE_18(cat, f) cat(REDUCE_STAGE_17(cat, f), f(17)) )==" "\n" |
78 | R"==(#define REDUCE_STAGE_19(cat, f) cat(REDUCE_STAGE_18(cat, f), f(18)) )==" "\n" |
79 | R"==(#define REDUCE_STAGE_20(cat, f) cat(REDUCE_STAGE_19(cat, f), f(19)) )==" "\n" |
80 | R"==(#define REDUCE_STAGE_21(cat, f) cat(REDUCE_STAGE_20(cat, f), f(20)) )==" "\n" |
81 | R"==(#define REDUCE_STAGE_22(cat, f) cat(REDUCE_STAGE_21(cat, f), f(21)) )==" "\n" |
82 | R"==(#define REDUCE_STAGE_23(cat, f) cat(REDUCE_STAGE_22(cat, f), f(22)) )==" "\n" |
83 | R"==(#define REDUCE_STAGE_24(cat, f) cat(REDUCE_STAGE_23(cat, f), f(23)) )==" "\n" |
84 | R"==(#define REDUCE_STAGE_25(cat, f) cat(REDUCE_STAGE_24(cat, f), f(24)) )==" "\n" |
85 | R"==(#define REDUCE_STAGE_26(cat, f) cat(REDUCE_STAGE_25(cat, f), f(25)) )==" "\n" |
86 | R"==(#define REDUCE_STAGE_27(cat, f) cat(REDUCE_STAGE_26(cat, f), f(26)) )==" "\n" |
87 | R"==(#define REDUCE_STAGE_28(cat, f) cat(REDUCE_STAGE_27(cat, f), f(27)) )==" "\n" |
88 | R"==(#define REDUCE_STAGE_29(cat, f) cat(REDUCE_STAGE_28(cat, f), f(28)) )==" "\n" |
89 | R"==(#define REDUCE_STAGE_30(cat, f) cat(REDUCE_STAGE_29(cat, f), f(29)) )==" "\n" |
90 | R"==(#define REDUCE_STAGE_31(cat, f) cat(REDUCE_STAGE_30(cat, f), f(30)) )==" "\n" |
91 | R"==(#define REDUCE_STAGE_32(cat, f) cat(REDUCE_STAGE_31(cat, f), f(31)) )==" "\n" |
92 | R"==(#define REDUCE_STAGE_33(cat, f) cat(REDUCE_STAGE_32(cat, f), f(32)) )==" "\n" |
93 | R"==(#define REDUCE_STAGE_34(cat, f) cat(REDUCE_STAGE_33(cat, f), f(33)) )==" "\n" |
94 | R"==(#define REDUCE_STAGE_35(cat, f) cat(REDUCE_STAGE_34(cat, f), f(34)) )==" "\n" |
95 | R"==(#define REDUCE_STAGE_36(cat, f) cat(REDUCE_STAGE_35(cat, f), f(35)) )==" "\n" |
96 | R"==(#define REDUCE_STAGE_37(cat, f) cat(REDUCE_STAGE_36(cat, f), f(36)) )==" "\n" |
97 | R"==(#define REDUCE_STAGE_38(cat, f) cat(REDUCE_STAGE_37(cat, f), f(37)) )==" "\n" |
98 | R"==(#define REDUCE_STAGE_39(cat, f) cat(REDUCE_STAGE_38(cat, f), f(38)) )==" "\n" |
99 | R"==(#define REDUCE_STAGE_40(cat, f) cat(REDUCE_STAGE_39(cat, f), f(39)) )==" "\n" |
100 | R"==(#define REDUCE_STAGE_41(cat, f) cat(REDUCE_STAGE_40(cat, f), f(40)) )==" "\n" |
101 | R"==(#define REDUCE_STAGE_42(cat, f) cat(REDUCE_STAGE_41(cat, f), f(41)) )==" "\n" |
102 | R"==(#define REDUCE_STAGE_43(cat, f) cat(REDUCE_STAGE_42(cat, f), f(42)) )==" "\n" |
103 | R"==(#define REDUCE_STAGE_44(cat, f) cat(REDUCE_STAGE_43(cat, f), f(43)) )==" "\n" |
104 | R"==(#define REDUCE_STAGE_45(cat, f) cat(REDUCE_STAGE_44(cat, f), f(44)) )==" "\n" |
105 | R"==(#define REDUCE_STAGE_46(cat, f) cat(REDUCE_STAGE_45(cat, f), f(45)) )==" "\n" |
106 | R"==(#define REDUCE_STAGE_47(cat, f) cat(REDUCE_STAGE_46(cat, f), f(46)) )==" "\n" |
107 | R"==(#define REDUCE_STAGE_48(cat, f) cat(REDUCE_STAGE_47(cat, f), f(47)) )==" "\n" |
108 | R"==(#define REDUCE_STAGE_49(cat, f) cat(REDUCE_STAGE_48(cat, f), f(48)) )==" "\n" |
109 | R"==(#define REDUCE_STAGE_50(cat, f) cat(REDUCE_STAGE_49(cat, f), f(49)) )==" "\n" |
110 | R"==(#define REDUCE_STAGE_51(cat, f) cat(REDUCE_STAGE_50(cat, f), f(50)) )==" "\n" |
111 | R"==(#define REDUCE_STAGE_52(cat, f) cat(REDUCE_STAGE_51(cat, f), f(51)) )==" "\n" |
112 | R"==(#define REDUCE_STAGE_53(cat, f) cat(REDUCE_STAGE_52(cat, f), f(52)) )==" "\n" |
113 | R"==(#define REDUCE_STAGE_54(cat, f) cat(REDUCE_STAGE_53(cat, f), f(53)) )==" "\n" |
114 | R"==(#define REDUCE_STAGE_55(cat, f) cat(REDUCE_STAGE_54(cat, f), f(54)) )==" "\n" |
115 | R"==(#define REDUCE_STAGE_56(cat, f) cat(REDUCE_STAGE_55(cat, f), f(55)) )==" "\n" |
116 | R"==(#define REDUCE_STAGE_57(cat, f) cat(REDUCE_STAGE_56(cat, f), f(56)) )==" "\n" |
117 | R"==(#define REDUCE_STAGE_58(cat, f) cat(REDUCE_STAGE_57(cat, f), f(57)) )==" "\n" |
118 | R"==(#define REDUCE_STAGE_59(cat, f) cat(REDUCE_STAGE_58(cat, f), f(58)) )==" "\n" |
119 | R"==(#define REDUCE_STAGE_60(cat, f) cat(REDUCE_STAGE_59(cat, f), f(59)) )==" "\n" |
120 | R"==(#define REDUCE_STAGE_61(cat, f) cat(REDUCE_STAGE_60(cat, f), f(60)) )==" "\n" |
121 | R"==(#define REDUCE_STAGE_62(cat, f) cat(REDUCE_STAGE_61(cat, f), f(61)) )==" "\n" |
122 | R"==(#define REDUCE_STAGE_63(cat, f) cat(REDUCE_STAGE_62(cat, f), f(62)) )==" "\n" |
123 | R"==(#define REDUCE_STAGE_64(cat, f) cat(REDUCE_STAGE_63(cat, f), f(63)) )==" "\n" |
124 | R"==(#define REDUCE2(n, cat, f) REDUCE_STAGE_##n(cat, f) )==" "\n" |
125 | R"==(#define REDUCE(n, cat, f) REDUCE2(n, cat, f) )==" "\n" |
126 | R"==(#define JOIN_COMMA(x, y) x, y )==" "\n" |
127 | R"==(#define SRC_PTR(n) __global const DATA_T *src##n )==" "\n" |
128 | R"==(#define SRC_PTRS REDUCE(N_INPUTS, JOIN_COMMA, SRC_PTR) )==" "\n" |
129 | R"==(#define JOIN_ELSE(x, y) y else x )==" "\n" |
130 | R"==(#define CHECK_AND_GET(n) \ )==" "\n" |
131 | R"==(if (get_global_id(2) >= OFFSET##n) \ )==" "\n" |
132 | R"==(src = src##n + get_global_id(1) * SRC##n##_EXT_OFFSET + x \ )==" "\n" |
133 | R"==(- OFFSET##n * INNER_OFFSET; )==" "\n" |
134 | R"==(#define SET_SRC REDUCE(N_INPUTS, JOIN_ELSE, CHECK_AND_GET) )==" "\n" |
135 | R"==(#if BLOCK != 1 )==" "\n" |
136 | R"==(__attribute__((intel_reqd_sub_group_size(SIMD))) )==" "\n" |
137 | R"==(#endif )==" "\n" |
138 | R"==(__kernel void )==" "\n" |
139 | R"==(simple_concat(__global DATA_T *dst, long dst_offset0, SRC_PTRS) { )==" "\n" |
140 | R"==(DATA8_T A0, A1, A2, A3; )==" "\n" |
141 | R"==(DATA_T B; )==" "\n" |
142 | R"==(DATA2_T C; )==" "\n" |
143 | R"==(DATA4_T D; )==" "\n" |
144 | R"==(const size_t x = (get_global_id(0) / SIMD) * BLOCK )==" "\n" |
145 | R"==(+ get_global_id(2) * INNER_OFFSET; )==" "\n" |
146 | R"==(__global const DATA_T *src; )==" "\n" |
147 | R"==(SET_SRC; )==" "\n" |
148 | R"==(#if BLOCK == 1 )==" "\n" |
149 | R"==(B = src[0]; )==" "\n" |
150 | R"==(#elif BLOCK == SIMD )==" "\n" |
151 | R"==(B = BLOCK_READ(src); )==" "\n" |
152 | R"==(#elif BLOCK == 2 * SIMD )==" "\n" |
153 | R"==(C = BLOCK_READ2(src); )==" "\n" |
154 | R"==(#elif BLOCK == 3 * SIMD )==" "\n" |
155 | R"==(C = BLOCK_READ2(src); )==" "\n" |
156 | R"==(B = BLOCK_READ(&src[2 * SIMD]); )==" "\n" |
157 | R"==(#elif BLOCK == 4 * SIMD )==" "\n" |
158 | R"==(D = BLOCK_READ4(src); )==" "\n" |
159 | R"==(#elif BLOCK == 5 * SIMD )==" "\n" |
160 | R"==(D = BLOCK_READ4(src); )==" "\n" |
161 | R"==(B = BLOCK_READ(&src[4 * SIMD]); )==" "\n" |
162 | R"==(#elif BLOCK == 6 * SIMD )==" "\n" |
163 | R"==(D = BLOCK_READ4(src); )==" "\n" |
164 | R"==(C = BLOCK_READ2(&src[4 * SIMD]); )==" "\n" |
165 | R"==(#elif BLOCK == 7 * SIMD )==" "\n" |
166 | R"==(B = BLOCK_READ(src); )==" "\n" |
167 | R"==(C = BLOCK_READ2(&src[SIMD]); )==" "\n" |
168 | R"==(D = BLOCK_READ4(&src[3 * SIMD]); )==" "\n" |
169 | R"==(#elif BLOCK >= 8 * SIMD )==" "\n" |
170 | R"==(A0 = BLOCK_READ8(src); )==" "\n" |
171 | R"==(#elif BLOCK >= 16 * SIMD )==" "\n" |
172 | R"==(A1 = BLOCK_READ8(&src[8 * SIMD]); )==" "\n" |
173 | R"==(#elif BLOCK >= 24 * SIMD )==" "\n" |
174 | R"==(A2 = BLOCK_READ8(&src[16 * SIMD]); )==" "\n" |
175 | R"==(#elif BLOCK >= 32 * SIMD )==" "\n" |
176 | R"==(A3 = BLOCK_READ8(&src[24 * SIMD]); )==" "\n" |
177 | R"==(#endif )==" "\n" |
178 | R"==(dst += dst_offset0 + get_global_id(1) * DST_EXT_OFFSET + x; )==" "\n" |
179 | R"==(#if BLOCK == 1 )==" "\n" |
180 | R"==(dst[0] = B; )==" "\n" |
181 | R"==(#elif BLOCK == SIMD )==" "\n" |
182 | R"==(BLOCK_WRITE(dst, B); )==" "\n" |
183 | R"==(#elif BLOCK == 2 * SIMD )==" "\n" |
184 | R"==(BLOCK_WRITE2(dst, C); )==" "\n" |
185 | R"==(#elif BLOCK == 3 * SIMD )==" "\n" |
186 | R"==(BLOCK_WRITE2(dst, C); )==" "\n" |
187 | R"==(BLOCK_WRITE(&dst[2 * SIMD], B); )==" "\n" |
188 | R"==(#elif BLOCK == 4 * SIMD )==" "\n" |
189 | R"==(BLOCK_WRITE4(dst, D); )==" "\n" |
190 | R"==(#elif BLOCK == 5 * SIMD )==" "\n" |
191 | R"==(BLOCK_WRITE4(dst, D); )==" "\n" |
192 | R"==(BLOCK_WRITE(&dst[4 * SIMD], B); )==" "\n" |
193 | R"==(#elif BLOCK == 6 * SIMD )==" "\n" |
194 | R"==(BLOCK_WRITE4(dst, D); )==" "\n" |
195 | R"==(BLOCK_WRITE2(&dst[4 * SIMD], C); )==" "\n" |
196 | R"==(#elif BLOCK == 7 * SIMD )==" "\n" |
197 | R"==(BLOCK_WRITE(dst, B); )==" "\n" |
198 | R"==(BLOCK_WRITE2(&dst[SIMD], C); )==" "\n" |
199 | R"==(BLOCK_WRITE4(&dst[3 * SIMD], D); )==" "\n" |
200 | R"==(#elif BLOCK >= 8 * SIMD )==" "\n" |
201 | R"==(BLOCK_WRITE8(dst, A0); )==" "\n" |
202 | R"==(#elif BLOCK >= 16 * SIMD )==" "\n" |
203 | R"==(BLOCK_WRITE8(&dst[8 * SIMD], A1); )==" "\n" |
204 | R"==(#elif BLOCK >= 24 * SIMD )==" "\n" |
205 | R"==(BLOCK_WRITE8(&dst[16 * SIMD], A2); )==" "\n" |
206 | R"==(#elif BLOCK >= 32 * SIMD )==" "\n" |
207 | R"==(BLOCK_WRITE8(&dst[24 * SIMD], A3); )==" "\n" |
208 | R"==(#endif )==" "\n" |
209 | R"==(} )==" "\n" |
210 | R"==()==" ; |
211 | } |
212 | } |
213 | } |
214 | } |