gen9_sum_kernel.cpp source code [oneDNN/build/src/gpu/ocl/gen9_sum_kernel.cpp]

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
// OpenCL C source text of the `gen9_sum` kernel, stored as one C string that
// is assembled from adjacent raw string literals: one literal per line of
// kernel source, each followed by an explicit "\n".
//
// The kernel text uses several macros it does not define itself (SRC_DATA_T,
// DST_DATA_T, BLOCK_DATA_T, VECT_DT_N, N_ELEMS, N_INPUTS, BLOCK_READ8,
// AS_DATA8_T, CONVERT_FLOAT_T, CONVERT_FLOAT8_T, TO_DST, TO_DST8 and the
// DST_DT_* switches); presumably they come from "gpu/ocl/ocl_types.h" or from
// compiler options -- TODO confirm against the code that builds the kernel.
//
// The variable is a pointer to const char because a string literal can only
// initialize a pointer or an array, never a single `const char` object. It is
// deliberately not `const char *const`: a const-qualified namespace-scope
// variable has internal linkage, and this symbol is presumably referenced
// from another translation unit -- verify before tightening it.
//
// NOTE(review): inside the kernel text every macro continuation backslash is
// followed by a space before the newline. A strict preprocessor does not treat
// that as a line splice -- confirm the target OpenCL compiler accepts it.
const char *gen9_sum_kernel = R"==(/****************************************************************************** )==""\n"
R"==(* Copyright 2020-2021 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
R"==(* http: )==""\n"
R"==(* )==""\n"
R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
R"==(* See the License for the specific language governing permissions and )==""\n"
R"==(* limitations under the License. )==""\n"
R"==(*******************************************************************************/ )==""\n"
// DST_BLOCK_READ8 / DST_BLOCK_WRITE8: one definition per DST_DT_* switch. Each
// wraps an intel_sub_group_block_read*/write* call on an 8-element vector and
// reinterprets the vector with the matching as_<type>8 built-in.
R"==(#if DST_DT_S8 )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_char8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_U8 )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_uchar8(intel_sub_group_block_read_uc8((const __global uchar *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_uc8((__global uchar *)(dst), as_uchar8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_F16 )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_half8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_S32 )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_int8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_F32 )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_float8(intel_sub_group_block_read8((const __global uint *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write8((__global uint *)(dst), as_uint8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#if DST_DT_BF16 )==""\n"
R"==(#define DST_BLOCK_READ8(src) \ )==""\n"
R"==(as_ushort8(intel_sub_group_block_read_us8((const __global ushort *)(src))) )==""\n"
R"==(#define DST_BLOCK_WRITE8(dst, val) \ )==""\n"
R"==(intel_sub_group_block_write_us8((__global ushort *)(dst), as_ushort8(val)) )==""\n"
R"==(#endif )==""\n"
R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
// get_values(src, offset): returns eight values converted to float.
//  - If offset + VECT_DT_N * max_sub_group_size < N_ELEMS, the values come
//    from a single BLOCK_READ8 at src + offset.
//  - Otherwise each work-item reads src[pos] one element at a time, starting
//    at offset + sub_group_local_id and stepping by max_sub_group_size, while
//    pos < N_ELEMS. Lanes of `val` the loop does not reach stay unassigned.
// `read_pos` must be a pointer: it is built by pointer arithmetic on `src`.
R"==(float8 get_values(__global SRC_DATA_T *src, ptrdiff_t offset) { )==""\n"
R"==(float8 val; )==""\n"
R"==(const uint max_sub_group_size = get_max_sub_group_size(); )==""\n"
R"==(__global BLOCK_DATA_T *read_pos = (__global BLOCK_DATA_T *)src + offset; )==""\n"
R"==(if (offset + VECT_DT_N * max_sub_group_size < N_ELEMS) { )==""\n"
R"==(val = CONVERT_FLOAT8_T(AS_DATA8_T(BLOCK_READ8(read_pos))); )==""\n"
R"==(} else { )==""\n"
R"==(const uint sub_group_local_id = get_sub_group_local_id(); )==""\n"
R"==(uint pos = offset + sub_group_local_id; )==""\n"
R"==(for (uint i = 0; pos < N_ELEMS && i < VECT_DT_N; i++) { )==""\n"
R"==(val[i] = CONVERT_FLOAT_T(src[pos]); )==""\n"
R"==(pos += max_sub_group_size; )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(return val; )==""\n"
R"==(} )==""\n"
// gen9_sum kernel: sixteen source pointers, one destination pointer and one
// pointer to float scales. All of them are pointers: the body passes inputN
// to get_values() and indexes scales[...] and output[...].
R"==(__kernel void gen9_sum(__global SRC_DATA_T *input0, __global SRC_DATA_T *input1, )==""\n"
R"==(__global SRC_DATA_T *input2, __global SRC_DATA_T *input3, )==""\n"
R"==(__global SRC_DATA_T *input4, __global SRC_DATA_T *input5, )==""\n"
R"==(__global SRC_DATA_T *input6, __global SRC_DATA_T *input7, )==""\n"
R"==(__global SRC_DATA_T *input8, __global SRC_DATA_T *input9, )==""\n"
R"==(__global SRC_DATA_T *input10, __global SRC_DATA_T *input11, )==""\n"
R"==(__global SRC_DATA_T *input12, __global SRC_DATA_T *input13, )==""\n"
R"==(__global SRC_DATA_T *input14, __global SRC_DATA_T *input15, )==""\n"
R"==(__global DST_DATA_T *output, __global float *scales) { )==""\n"
R"==(const uint group_id = get_group_id(0); )==""\n"
R"==(const uint group_size = get_local_size(0); )==""\n"
R"==(const uint sub_group_id = get_sub_group_id(); )==""\n"
R"==(const uint max_sub_group_size = get_max_sub_group_size(); )==""\n"
R"==(const uint sub_group_local_id = get_sub_group_local_id(); )==""\n"
// Element offset handled by this sub-group, computed from the work-group id,
// the work-group size and the sub-group id, scaled by VECT_DT_N.
R"==(ptrdiff_t offset )==""\n"
R"==(= (group_id * group_size + sub_group_id * max_sub_group_size) )==""\n"
R"==(* VECT_DT_N; )==""\n"
R"==(__global BLOCK_DATA_T *write_pos = (__global BLOCK_DATA_T *)output + offset; )==""\n"
// Accumulate get_values(inputK, offset) * scales[id] for as many inputs as
// N_INPUTS allows; `id` only advances when a term is actually added.
R"==(int id = 0; )==""\n"
R"==(float8 sum = 0; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input0, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input1, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input2, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input3, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input4, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input5, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input6, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input7, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input8, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input9, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input10, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input11, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input12, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input13, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input14, offset) * scales[id++]; )==""\n"
R"==(if (id < N_INPUTS) sum += get_values(input15, offset) * scales[id++]; )==""\n"
// Store the result using the same bounds test as the read in get_values():
// one DST_BLOCK_WRITE8 when the test passes, otherwise an element-by-element
// write of output[pos] limited by pos < N_ELEMS.
R"==(if (offset + VECT_DT_N * max_sub_group_size < N_ELEMS) { )==""\n"
R"==(DST_BLOCK_WRITE8(write_pos, TO_DST8(sum)); )==""\n"
R"==(} else { )==""\n"
R"==(uint pos = offset + sub_group_local_id; )==""\n"
R"==(for (uint i = 0; pos < N_ELEMS && i < VECT_DT_N; i++) { )==""\n"
R"==(output[pos] = TO_DST(sum[i]); )==""\n"
R"==(pos += max_sub_group_size; )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==()==";
}
}
}
}

Browse the source code of oneDNN/build/src/gpu/ocl/gen9_sum_kernel.cpp