namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
const char *gen9_eltwise_kernel = R"==(/******************************************************************************* )==""\n"
R"==(* Copyright 2020-2022 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
R"==(*     http://www.apache.org/licenses/LICENSE-2.0 )==""\n"
R"==(* )==""\n"
R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
R"==(* See the License for the specific language governing permissions and )==""\n"
R"==(* limitations under the License. )==""\n"
R"==(*******************************************************************************/ )==""\n"
R"==(#include "gpu/ocl/ocl_eltwise.h" )==""\n"
R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
R"==(#define SIMD GWS_SGS_DEFAULT )==""\n"
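R"==(// Forward eltwise kernel: each work-item applies fwd_eltwise to VECT_DT_N )==""\n"
R"==(// elements, using SIMD-wide sub-group block reads/writes where possible. )==""\n"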
R"==(KERNEL_ATTR )==""\n"
R"==(__kernel void gen9_eltwise_fwd(__global DATA_T *src, __global DATA_T *dst, )==""\n"
R"==(        int nelems, float alpha, float beta) { )==""\n"
R"==(    const uint grsize = get_local_size(0); )==""\n"
R"==(    const uint grid = get_group_id(0); )==""\n"
R"==(    const uint sgid = get_sub_group_id(); )==""\n"
R"==(    const uint lid = get_sub_group_local_id(); )==""\n"
R"==(    const uint gid = get_global_id(0); )==""\n"
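R"==(    // Offset of the first element handled by this sub-group; every )==""\n"
R"==(    // sub-group covers SIMD * VECT_DT_N consecutive elements. )==""\n"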
R"==(    ptrdiff_t offset )==""\n"
R"==(            = (grid * grsize + sgid * get_max_sub_group_size()) * VECT_DT_N; )==""\n"
R"==(    __global BLOCK_DATA_T *read_pos = (__global BLOCK_DATA_T *)src + offset; )==""\n"
R"==(    __global BLOCK_DATA_T *write_pos = (__global BLOCK_DATA_T *)dst + offset; )==""\n"
R"==(    VECT_DATA_T val; )==""\n"
R"==(    const uint nel_per_read = SIMD * VECT_DT_N; )==""\n"
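R"==(    // Fast path: a full vector block read; tail sub-groups fall back to )==""\n"
R"==(    // scalar loads guarded against reading past nelems. )==""\n"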
R"==(    if (!NELEMS_OVERFLOW || offset + nel_per_read < nelems) { )==""\n"
R"==(        val = AS_VECT_DATA_T(VECT_BLOCK_READ(read_pos)); )==""\n"
R"==(    } else { )==""\n"
R"==(        uint pos = offset + lid; )==""\n"
R"==(        for (int i = 0; i < VECT_DT_N && pos < nelems; ++i) { )==""\n"
R"==(            val[i] = src[pos]; )==""\n"
R"==(            pos += SIMD; )==""\n"
R"==(        } )==""\n"
R"==(    } )==""\n"
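R"==(    // Apply the element-wise operation to every lane of the vector. )==""\n"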
R"==(    for (int i = 0; i < VECT_DT_N; ++i) { )==""\n"
R"==(        val[i] = CONVERT_DATA_T( )==""\n"
R"==(                fwd_eltwise(DATA_TO_REF(val[i]), alpha, beta, 1.0f)); )==""\n"
R"==(    } )==""\n"
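R"==(    // Store the results: vector block write on the fast path, scalar )==""\n"
R"==(    // stores for the tail. )==""\n"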
R"==(    if (!NELEMS_OVERFLOW || offset + nel_per_read < nelems) { )==""\n"
R"==(        VECT_BLOCK_WRITE(write_pos, AS_VECT_BLOCK_DATA_T(val)); )==""\n"
R"==(    } else { )==""\n"
R"==(        uint pos = offset + lid; )==""\n"
R"==(        for (int i = 0; i < VECT_DT_N && pos < nelems; ++i) { )==""\n"
R"==(            dst[pos] = val[i]; )==""\n"
R"==(            pos += SIMD; )==""\n"
R"==(        } )==""\n"
R"==(    } )==""\n"
R"==(} )==""\n"
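R"==(// Backward eltwise kernel: computes diff_src from src and diff_dst with the )==""\n"
R"==(// same vectorized read/compute/write structure as the forward kernel. )==""\n"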
R"==(KERNEL_ATTR )==""\n"
R"==(__kernel void gen9_eltwise_bwd(__global DATA_T *src, __global DATA_T *diff_src, )==""\n"
R"==(        __global DATA_T *diff_dst, int nelems, float alpha, float beta) { )==""\n"
R"==(    const uint grsize = get_local_size(0); )==""\n"
R"==(    const uint grid = get_group_id(0); )==""\n"
R"==(    const uint sgid = get_sub_group_id(); )==""\n"
R"==(    const uint lid = get_sub_group_local_id(); )==""\n"
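R"==(    // Offset of the first element handled by this sub-group. )==""\n"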
R"==(    ptrdiff_t offset = (grid * grsize + sgid * SIMD) * VECT_DT_N; )==""\n"
R"==(    __global BLOCK_DATA_T *src_pos = (__global BLOCK_DATA_T *)src + offset; )==""\n"
R"==(    __global BLOCK_DATA_T *diff_pos )==""\n"
R"==(            = (__global BLOCK_DATA_T *)diff_dst + offset; )==""\n"
R"==(    __global BLOCK_DATA_T *write_pos )==""\n"
R"==(            = (__global BLOCK_DATA_T *)diff_src + offset; )==""\n"
R"==(    VECT_DATA_T val_dd; )==""\n"
R"==(    VECT_DATA_T val_src; )==""\n"
R"==(    const uint nel_per_read = SIMD * VECT_DT_N; )==""\n"
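R"==(    // Read src and diff_dst: vector block reads on the fast path, scalar )==""\n"
R"==(    // loads guarded against reading past nelems for the tail. )==""\n"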
R"==(    if (!NELEMS_OVERFLOW || offset + nel_per_read < nelems) { )==""\n"
R"==(        val_src = AS_VECT_DATA_T(VECT_BLOCK_READ(src_pos)); )==""\n"
R"==(        val_dd = AS_VECT_DATA_T(VECT_BLOCK_READ(diff_pos)); )==""\n"
R"==(    } else { )==""\n"
R"==(        uint pos = offset + lid; )==""\n"
R"==(        for (int i = 0; i < VECT_DT_N && pos < nelems; ++i) { )==""\n"
R"==(            val_dd[i] = diff_dst[pos]; )==""\n"
R"==(            val_src[i] = src[pos]; )==""\n"
R"==(            pos += SIMD; )==""\n"
R"==(        } )==""\n"
R"==(    } )==""\n"
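R"==(    // Compute the gradient for every lane of the vector. )==""\n"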
R"==(    for (int i = 0; i < VECT_DT_N; ++i) { )==""\n"
R"==(        val_dd[i] = CONVERT_DATA_T(bwd_eltwise( )==""\n"
R"==(                DATA_TO_REF(val_dd[i]), DATA_TO_REF(val_src[i]), alpha, beta)); )==""\n"
R"==(    } )==""\n"
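R"==(    // Store diff_src: vector block write on the fast path, scalar stores )==""\n"
R"==(    // for the tail. )==""\n"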
R"==(    if (!NELEMS_OVERFLOW || offset + nel_per_read < nelems) { )==""\n"
R"==(        VECT_BLOCK_WRITE(write_pos, AS_VECT_BLOCK_DATA_T(val_dd)); )==""\n"
R"==(    } else { )==""\n"
R"==(        uint pos = offset + lid; )==""\n"
R"==(        for (int i = 0; i < VECT_DT_N && pos < nelems; ++i) { )==""\n"
R"==(            diff_src[pos] = val_dd[i]; )==""\n"
R"==(            pos += SIMD; )==""\n"
R"==(        } )==""\n"
R"==(    } )==""\n"
R"==(} )==""\n"
R"==()==";
}
}
}
}