1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *ref_reorder_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/reorder_common.h" )==""\n"
21R"==(KERNEL_ATTR )==""\n"
22R"==(__kernel void ref_reorder(__global SRC_DATA_T *restrict src, )==""\n"
23R"==(__global DST_DATA_T *restrict dst, __global float *restrict src_scales, )==""\n"
24R"==(__global int *restrict src_zps, __global float *restrict dst_scales, )==""\n"
25R"==(__global int *dst_zps, float sum_scale, int sum_zp) { )==""\n"
26R"==(const int src_zp = GET_SRC_ZP(src_zps); )==""\n"
27R"==(const int dst_zp = GET_DST_ZP(dst_zps); )==""\n"
28R"==(float src_scale = 1.0f; )==""\n"
29R"==(float dst_scale = 1.0f; )==""\n"
30R"==(src += SRC_OFFSET0; )==""\n"
31R"==(dst += DST_OFFSET0; )==""\n"
32R"==(const int d0_blk_start = GWS_GET_D0(); )==""\n"
33R"==(const int d1_blk_start = GWS_GET_D1(); )==""\n"
34R"==(const int d2_blk_start = GWS_GET_D2(); )==""\n"
35R"==(const int d3_blk_start = GWS_GET_D3(); )==""\n"
36R"==(const int d4_blk_start = GWS_GET_D4(); )==""\n"
37R"==(const int d5_blk_start = GWS_GET_D5(); )==""\n"
38R"==(const int d0_blk_end = d0_blk_start + GWS_GET_D0_BLOCK(); )==""\n"
39R"==(const int d1_blk_end = d1_blk_start + GWS_GET_D1_BLOCK(); )==""\n"
40R"==(const int d2_blk_end = d2_blk_start + GWS_GET_D2_BLOCK(); )==""\n"
41R"==(const int d3_blk_end = d3_blk_start + GWS_GET_D3_BLOCK(); )==""\n"
42R"==(const int d4_blk_end = d4_blk_start + GWS_GET_D4_BLOCK(); )==""\n"
43R"==(const int d5_blk_end = d5_blk_start + GWS_GET_D5_BLOCK(); )==""\n"
44R"==(for_(int d0 = d0_blk_start; d0 < d0_blk_end; ++d0) )==""\n"
45R"==(for_(int d1 = d1_blk_start; d1 < d1_blk_end; ++d1) )==""\n"
46R"==(for_(int d2 = d2_blk_start; d2 < d2_blk_end; ++d2) )==""\n"
47R"==(for_(int d3 = d3_blk_start; d3 < d3_blk_end; ++d3) )==""\n"
48R"==(for_(int d4 = d4_blk_start; d4 < d4_blk_end; ++d4) )==""\n"
49R"==(for (int d5 = d5_blk_start; d5 < d5_blk_end; ++d5) { )==""\n"
50R"==(const int src_off = SRC_OFF(d0, d1, d2, d3, d4, d5); )==""\n"
51R"==(const int dst_off = DST_OFF(d0, d1, d2, d3, d4, d5); )==""\n"
52R"==(#if PAD_FILL_ZERO == 1 )==""\n"
53R"==(int pad_d0 = d0 >= SRC_D0; )==""\n"
54R"==(int pad_d1 = NDIMS > 1 && d1 >= SRC_D1; )==""\n"
55R"==(int pad_d2 = NDIMS > 2 && d2 >= SRC_D2; )==""\n"
56R"==(int pad_d3 = NDIMS > 3 && d3 >= SRC_D3; )==""\n"
57R"==(int pad_d4 = NDIMS > 4 && d4 >= SRC_D4; )==""\n"
58R"==(int pad_d5 = NDIMS > 5 && d5 >= SRC_D5; )==""\n"
59R"==(if (pad_d0 || pad_d1 || pad_d2 || pad_d3 || pad_d4 || pad_d5) { )==""\n"
60R"==(dst[dst_off] = 0; )==""\n"
61R"==(continue; )==""\n"
62R"==(} )==""\n"
63R"==(#endif )==""\n"
64R"==(#if WITH_SRC_SCALE )==""\n"
65R"==(src_scale = src_scales[SCALE_OFF(SRC, d0, d1, d2, d3, d4, d5)]; )==""\n"
66R"==(#endif )==""\n"
67R"==(#if WITH_DST_SCALE )==""\n"
68R"==(dst_scale = dst_scales[SCALE_OFF(DST, d0, d1, d2, d3, d4, d5)]; )==""\n"
69R"==(#endif )==""\n"
70R"==(REORDER(dst[dst_off], src[src_off], src_scale, dst_scale, sum_scale, )==""\n"
71R"==(src_zp, dst_zp, sum_zp); )==""\n"
72R"==(} )==""\n"
73R"==(} )==""\n"
74R"==()==";
75}
76}
77}
78}