1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *gen9_concat_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2021-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
21R"==(#define IS_IN_PART(x) (dst_dims[CONCAT_AXIS] < CONCAT3(SRC, x, _END)) )==""\n"
22R"==(#define SET_DIMS(x, y) \ )==""\n"
23R"==({ \ )==""\n"
24R"==(part = y; \ )==""\n"
25R"==(if (y > 0) { \ )==""\n"
26R"==(src_dims[CONCAT_AXIS] \ )==""\n"
27R"==(= dst_dims[CONCAT_AXIS] - CONCAT3(SRC, x, _END); \ )==""\n"
28R"==(} \ )==""\n"
29R"==(src_off = OFF_MD(CONCAT2(SRC, y), src_dims[0], src_dims[1], \ )==""\n"
30R"==(src_dims[2], src_dims[3], src_dims[4], src_dims[5]); \ )==""\n"
31R"==(src = CONCAT2(src, y); \ )==""\n"
32R"==(} )==""\n"
33R"==(#define SRC_DATA_T SRC0_DATA_T )==""\n"
34R"==(#define DD(i) CONCAt2(DST_D, i) )==""\n"
35R"==(#define NEEDS_PADDING(dim0, dim1, dim2, dim3, dim4, dim5) \ )==""\n"
36R"==(dim0 >= DD(0) || dim1 >= DD(1) || dim2 >= DD(2) || dim3 >= DD(3) \ )==""\n"
37R"==(|| dim4 >= DD(4) || dim5 >= DD(5) )==""\n"
38R"==(KERNEL_ATTR )==""\n"
39R"==(__kernel void gen9_concat(__global DST_DATA_T *dst, long dst_offset0, )==""\n"
40R"==(__global const SRC_DATA_T *src0, __global const SRC_DATA_T *src1, )==""\n"
41R"==(__global const SRC_DATA_T *src2, __global const SRC_DATA_T *src3, )==""\n"
42R"==(__global const SRC_DATA_T *src4, __global const SRC_DATA_T *src5, )==""\n"
43R"==(__global const SRC_DATA_T *src6, __global const SRC_DATA_T *src7, )==""\n"
44R"==(__global const SRC_DATA_T *src8, __global const SRC_DATA_T *src9, )==""\n"
45R"==(__global const SRC_DATA_T *src10, __global const SRC_DATA_T *src11, )==""\n"
46R"==(__global const SRC_DATA_T *src12, __global const SRC_DATA_T *src13, )==""\n"
47R"==(__global const SRC_DATA_T *src14, __global const SRC_DATA_T *src15) { )==""\n"
48R"==(dst += dst_offset0; )==""\n"
49R"==(int dst_dims[6], src_dims[6]; )==""\n"
50R"==(src_dims[0] = dst_dims[0] = GWS_GET_D0(); )==""\n"
51R"==(src_dims[1] = dst_dims[1] = GWS_GET_D1(); )==""\n"
52R"==(src_dims[2] = dst_dims[2] = GWS_GET_D2(); )==""\n"
53R"==(src_dims[3] = dst_dims[3] = GWS_GET_D3(); )==""\n"
54R"==(src_dims[4] = dst_dims[4] = GWS_GET_D4(); )==""\n"
55R"==(src_dims[5] = dst_dims[5] = GWS_GET_D5(); )==""\n"
56R"==(const int iter_dim_end = min( )==""\n"
57R"==(dst_dims[ITER_DIM_IDX] + ITER_DIM_CHUNK, ITER_DIM_PADDED_SIZE); )==""\n"
58R"==(if (NEEDS_PADDING(dst_dims[0], dst_dims[1], dst_dims[2], dst_dims[3], )==""\n"
59R"==(dst_dims[4], dst_dims[5])) { )==""\n"
60R"==(for (; dst_dims[ITER_DIM_IDX] < iter_dim_end; )==""\n"
61R"==(dst_dims[ITER_DIM_IDX]++) { )==""\n"
62R"==(const int dst_off = OFF_MD(DST, dst_dims[0], dst_dims[1], )==""\n"
63R"==(dst_dims[2], dst_dims[3], dst_dims[4], dst_dims[5]); )==""\n"
64R"==(#if SUB_GROUP_SIZE > 1 )==""\n"
65R"==(BLOCK_WRITE_DST(&dst[dst_off], TO_DST(DATA_ZERO)); )==""\n"
66R"==(#else )==""\n"
67R"==(dst[dst_off] = TO_DST(DATA_ZERO); )==""\n"
68R"==(#endif )==""\n"
69R"==(} )==""\n"
70R"==(return; )==""\n"
71R"==(} )==""\n"
72R"==(for (; dst_dims[ITER_DIM_IDX] < min(DD(ITER_DIM_IDX), iter_dim_end); )==""\n"
73R"==(dst_dims[ITER_DIM_IDX]++, src_dims[ITER_DIM_IDX]++) { )==""\n"
74R"==(int part; )==""\n"
75R"==(int src_off; )==""\n"
76R"==(__global SRC_DATA_T *src; )==""\n"
77R"==(if (IS_IN_PART(0)) SET_DIMS(0, 0) )==""\n"
78R"==(#if NUM_INPUTS >= 2 )==""\n"
79R"==(else if (IS_IN_PART(1)) )==""\n"
80R"==(SET_DIMS(0, 1) )==""\n"
81R"==(#endif )==""\n"
82R"==(#if NUM_INPUTS >= 3 )==""\n"
83R"==(else if (IS_IN_PART(2)) )==""\n"
84R"==(SET_DIMS(1, 2) )==""\n"
85R"==(#endif )==""\n"
86R"==(#if NUM_INPUTS >= 4 )==""\n"
87R"==(else if (IS_IN_PART(3)) )==""\n"
88R"==(SET_DIMS(2, 3) )==""\n"
89R"==(#endif )==""\n"
90R"==(#if NUM_INPUTS >= 5 )==""\n"
91R"==(else if (IS_IN_PART(4)) )==""\n"
92R"==(SET_DIMS(3, 4) )==""\n"
93R"==(#endif )==""\n"
94R"==(#if NUM_INPUTS >= 6 )==""\n"
95R"==(else if (IS_IN_PART(5)) )==""\n"
96R"==(SET_DIMS(4, 5) )==""\n"
97R"==(#endif )==""\n"
98R"==(#if NUM_INPUTS >= 7 )==""\n"
99R"==(else if (IS_IN_PART(6)) )==""\n"
100R"==(SET_DIMS(5, 6) )==""\n"
101R"==(#endif )==""\n"
102R"==(#if NUM_INPUTS >= 8 )==""\n"
103R"==(else if (IS_IN_PART(7)) )==""\n"
104R"==(SET_DIMS(6, 7) )==""\n"
105R"==(#endif )==""\n"
106R"==(#if NUM_INPUTS >= 9 )==""\n"
107R"==(else if (IS_IN_PART(8)) )==""\n"
108R"==(SET_DIMS(7, 8) )==""\n"
109R"==(#endif )==""\n"
110R"==(#if NUM_INPUTS >= 10 )==""\n"
111R"==(else if (IS_IN_PART(9)) )==""\n"
112R"==(SET_DIMS(8, 9) )==""\n"
113R"==(#endif )==""\n"
114R"==(#if NUM_INPUTS >= 11 )==""\n"
115R"==(else if (IS_IN_PART(10)) )==""\n"
116R"==(SET_DIMS(9, 10) )==""\n"
117R"==(#endif )==""\n"
118R"==(#if NUM_INPUTS >= 12 )==""\n"
119R"==(else if (IS_IN_PART(11)) )==""\n"
120R"==(SET_DIMS(10, 11) )==""\n"
121R"==(#endif )==""\n"
122R"==(#if NUM_INPUTS >= 13 )==""\n"
123R"==(else if (IS_IN_PART(12)) )==""\n"
124R"==(SET_DIMS(11, 12) )==""\n"
125R"==(#endif )==""\n"
126R"==(#if NUM_INPUTS >= 14 )==""\n"
127R"==(else if (IS_IN_PART(13)) )==""\n"
128R"==(SET_DIMS(12, 13) )==""\n"
129R"==(#endif )==""\n"
130R"==(#if NUM_INPUTS >= 15 )==""\n"
131R"==(else if (IS_IN_PART(14)) )==""\n"
132R"==(SET_DIMS(13, 14) )==""\n"
133R"==(#endif )==""\n"
134R"==(#if NUM_INPUTS >= 16 )==""\n"
135R"==(else if (IS_IN_PART(15)) )==""\n"
136R"==(SET_DIMS(14, 15) )==""\n"
137R"==(#endif )==""\n"
138R"==(const int dst_off = OFF_MD(DST, dst_dims[0], dst_dims[1], dst_dims[2], )==""\n"
139R"==(dst_dims[3], dst_dims[4], dst_dims[5]); )==""\n"
140R"==(#if SUB_GROUP_SIZE > 1 )==""\n"
141R"==(#if DT_BF16 == 1 )==""\n"
142R"==(float src_val = DATA_TO_REF(AS_DATA_T( )==""\n"
143R"==(BLOCK_READ((const __global BLOCK_DATA_T *)&src[src_off]))); )==""\n"
144R"==(#else )==""\n"
145R"==(SRC_DATA_T src_val = AS_DATA_T( )==""\n"
146R"==(BLOCK_READ((const __global BLOCK_DATA_T *)&src[src_off])); )==""\n"
147R"==(#endif )==""\n"
148R"==(BLOCK_WRITE_DST(&dst[dst_off], TO_DST(src_val)); )==""\n"
149R"==(#else )==""\n"
150R"==(#if DT_BF16 == 1 )==""\n"
151R"==(float src_val = DATA_TO_REF(src[src_off]); )==""\n"
152R"==(#else )==""\n"
153R"==(SRC_DATA_T src_val = src[src_off]; )==""\n"
154R"==(#endif )==""\n"
155R"==(dst[dst_off] = TO_DST(src_val); )==""\n"
156R"==(#endif )==""\n"
157R"==(} )==""\n"
158R"==(for (; dst_dims[ITER_DIM_IDX] < iter_dim_end; dst_dims[ITER_DIM_IDX]++) { )==""\n"
159R"==(const int dst_off = OFF_MD(DST, dst_dims[0], dst_dims[1], dst_dims[2], )==""\n"
160R"==(dst_dims[3], dst_dims[4], dst_dims[5]); )==""\n"
161R"==(#if SUB_GROUP_SIZE > 1 )==""\n"
162R"==(BLOCK_WRITE_DST(&dst[dst_off], TO_DST(DATA_ZERO)); )==""\n"
163R"==(#else )==""\n"
164R"==(dst[dst_off] = TO_DST(DATA_ZERO); )==""\n"
165R"==(#endif )==""\n"
166R"==(} )==""\n"
167R"==(} )==""\n"
168R"==()==";
169}
170}
171}
172}