namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
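// OpenCL C source for the RNN weights reorder kernel (f32 -> s8 quantization
// with compensation), embedded as a concatenated raw string literal.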
const char *rnn_reorder_kernel = R"==(/******************************************************************************* )==""\n"
R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#define DT_UNDEF )==""\n"
21R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
22R"==(#if IN_TYPE_F16 || OUT_TYPE_F16 )==""\n"
23R"==(#pragma OPENCL EXTENSION cl_khr_fp16 : enable )==""\n"
24R"==(#endif )==""\n"
25R"==(#define IN_OFF(x0, x1, x2, x3, x4, x5) \ )==""\n"
26R"==((((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ )==""\n"
27R"==(+ ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ )==""\n"
28R"==(+ ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2 \ )==""\n"
29R"==(+ ((x3) % SRC_B3) * SRC_SB3 + ((x3) / SRC_B3) * SRC_S3 \ )==""\n"
30R"==(+ ((x4) % SRC_B4) * SRC_SB4 + ((x4) / SRC_B4) * SRC_S4 \ )==""\n"
31R"==(+ ((x5) % SRC_B5) * SRC_SB5 + ((x5) / SRC_B5) * SRC_S5) )==""\n"
32R"==(#define OUT_OFF(x0, x1, x2, x3, x4, x5) \ )==""\n"
33R"==((((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ )==""\n"
34R"==(+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ )==""\n"
35R"==(+ ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2 \ )==""\n"
36R"==(+ ((x3) % DST_B3) * DST_SB3 + ((x3) / DST_B3) * DST_S3 \ )==""\n"
37R"==(+ ((x4) % DST_B4) * DST_SB4 + ((x4) / DST_B4) * DST_S4 \ )==""\n"
38R"==(+ ((x5) % DST_B5) * DST_SB5 + ((x5) / DST_B5) * DST_S5) )==""\n"
39R"==(#if IN_TYPE_F32 )==""\n"
40R"==(#define DT_IN float )==""\n"
41R"==(#else )==""\n"
42R"==(#error Unimplemented )==""\n"
43R"==(#endif )==""\n"
44R"==(#if OUT_TYPE_S8 )==""\n"
45R"==(#define DT_OUT char )==""\n"
46R"==(#else )==""\n"
47R"==(#error Unimplemented )==""\n"
48R"==(#endif )==""\n"
49R"==(#if OUT_TYPE_S8 )==""\n"
50R"==(#define CONVERT_F32_TO_OUT convert_char_sat_rte )==""\n"
51R"==(#define CONVERT_F32_TO_OUT8 convert_char8_sat_rte )==""\n"
52R"==(#else )==""\n"
53R"==(#error Unimplemented )==""\n"
54R"==(#endif )==""\n"
55R"==(#define CONVERT_IN_TO_OUT(x) CONVERT_F32_TO_OUT(x) )==""\n"
56R"==(#define QZ_B0(v, scale) CONVERT_F32_TO_OUT(v *scale) )==""\n"
57R"==(#define REORDER(_out, _in, _s) \ )==""\n"
58R"==(do { \ )==""\n"
59R"==(_out = QZ_B0(_in, _s); \ )==""\n"
60R"==(} while (0) )==""\n"
61R"==(#define COMP_DT float )==""\n"
62R"==(#define COMP_DST_OFFSET_EL (DST_D0 * DST_S0) )==""\n"
63R"==(#define COMP_OFF(i0, i1, i2, i3) \ )==""\n"
64R"==(((((i0) * (DST_D1) + (i1)) * (DST_D3) + (i2)) * (DST_D4) + (i3)) )==""\n"
65R"==(KERNEL_ATTR )==""\n"
66R"==(__kernel void wei_reorder(__global DT_IN *input, __global DT_IN *scales, )==""\n"
67R"==(__global DT_OUT *output) { )==""\n"
68R"==(__global char *temp = (__global char *)(output + COMP_DST_OFFSET_EL); )==""\n"
69R"==(__global COMP_DT *comp )==""\n"
70R"==(= (__global COMP_DT *)(((unsigned long)temp + (sizeof(COMP_DT) - 1)) )==""\n"
71R"==(& -sizeof(COMP_DT)); )==""\n"
72R"==(#if REF_REORDER )==""\n"
73R"==(const int d0 = GWS_GET_D0(); )==""\n"
74R"==(const int d1 = GWS_GET_D1(); )==""\n"
75R"==(const int d3 = GWS_GET_D3(); )==""\n"
76R"==(const int d4 = GWS_GET_D4(); )==""\n"
77R"==(#if MASK )==""\n"
78R"==(float s = scales[d3 * SRC_D4 + d4]; )==""\n"
79R"==(#else )==""\n"
80R"==(float s = scales[0]; )==""\n"
81R"==(#endif )==""\n"
82R"==(int reduction = 0; )==""\n"
83R"==(for (int d2 = 0; d2 < SRC_D2; ++d2) { )==""\n"
84R"==(const int in_off = IN_OFF(d0, d1, d2, d3, d4, 0); )==""\n"
85R"==(const int out_off = OUT_OFF(d0, d1, d2, d3, d4, 0); )==""\n"
86R"==(REORDER(output[out_off], input[in_off], s); )==""\n"
87R"==(reduction += convert_int(QZ_B0(input[in_off], s)); )==""\n"
88R"==(} )==""\n"
89R"==(comp[COMP_OFF(d0, d1, d3, d4)] = convert_float(reduction); )==""\n"
90R"==(#else )==""\n"
91R"==(#error Unimplemented )==""\n"
92R"==(#endif )==""\n"
93R"==(} )==""\n"
94R"==()==";
} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl