// Defines dnnl::impl::gpu::ocl::ocl_zero_points_header, a C string holding
// OpenCL C source text for zero-point helper functions.
//
// The string is assembled from adjacent raw string literals (delimiter ==),
// one per embedded source line, each followed by a separate newline literal;
// the compiler concatenates all pieces into a single multi-line text.
// Comments placed between the pieces (like the ones below) are C++ comments
// and are NOT part of the embedded text.
//
// NOTE(review): this layout looks machine-generated from a standalone OpenCL
// header; if so, changes belong in the generator input, not here -- confirm.
1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
// Embedded text starts with the license banner (an OpenCL block comment).
5const char *ocl_zero_points_header = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
// NOTE(review): the license URL on the next line is cut off after the scheme;
// presumably a comment-stripping step dropped everything from the double
// slash onwards -- confirm against the generator. Harmless to the kernel
// build since the line sits inside the embedded block comment.
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
// Include guard of the embedded header.
20R"==(#ifndef GPU_OCL_OCL_ZERO_POINTS_H )==""\n"
21R"==(#define GPU_OCL_OCL_ZERO_POINTS_H )==""\n"
// ---- Source zero points (only when WITH_SRC_ZPOINTS is non-zero) ----
22R"==(#if WITH_SRC_ZPOINTS )==""\n"
23R"==(#if WITH_SRC_ZPOINTS_PER_IC )==""\n"
// read_src_zero_points_32c(ptr, ic): returns an int4 loaded from ptr + ic.
// Fast path: one intel_sub_group_block_read4 when IC % 32 == 0 or the
// 32-wide window starting at ic does not run past IC.
// Tail path (ic + 32 > IC): each 8-element chunk that fits entirely is read
// with intel_sub_group_block_read; the partial chunk is read per lane, with
// lanes >= IC % 8 producing 0; all later components are set to 0.
// NOTE(review): the 8-element chunking presumably matches a sub-group size
// of 8 -- confirm against the kernels that include this header.
24R"==(int4 read_src_zero_points_32c(const __global int *ptr, const int ic) { )==""\n"
25R"==(int4 z; )==""\n"
26R"==(ptr += ic; )==""\n"
27R"==(#if IC % 32 != 0 )==""\n"
28R"==(if (ic + 32 > IC) { )==""\n"
29R"==(const int max_local_id = IC % 8; )==""\n"
30R"==(const int local_id = get_sub_group_local_id(); )==""\n"
31R"==(if (ic + 8 > IC) { )==""\n"
32R"==(z.s0 = local_id < max_local_id ? ptr[0 + local_id] : 0; )==""\n"
33R"==(z.s1 = 0; )==""\n"
34R"==(z.s2 = 0; )==""\n"
35R"==(z.s3 = 0; )==""\n"
36R"==(} else if (ic + 16 > IC) { )==""\n"
37R"==(z.s0 = as_int(intel_sub_group_block_read( )==""\n"
38R"==((const __global uint *)(ptr + 0))); )==""\n"
39R"==(z.s1 = local_id < max_local_id ? ptr[8 + local_id] : 0; )==""\n"
40R"==(z.s2 = 0; )==""\n"
41R"==(z.s3 = 0; )==""\n"
42R"==(} else if (ic + 24 > IC) { )==""\n"
43R"==(z.s0 = as_int(intel_sub_group_block_read( )==""\n"
44R"==((const __global uint *)(ptr + 0))); )==""\n"
45R"==(z.s1 = as_int(intel_sub_group_block_read( )==""\n"
46R"==((const __global uint *)(ptr + 8))); )==""\n"
47R"==(z.s2 = local_id < max_local_id ? ptr[16 + local_id] : 0; )==""\n"
48R"==(z.s3 = 0; )==""\n"
49R"==(} else { )==""\n"
50R"==(z.s0 = as_int(intel_sub_group_block_read( )==""\n"
51R"==((const __global uint *)(ptr + 0))); )==""\n"
52R"==(z.s1 = as_int(intel_sub_group_block_read( )==""\n"
53R"==((const __global uint *)(ptr + 8))); )==""\n"
54R"==(z.s2 = as_int(intel_sub_group_block_read( )==""\n"
55R"==((const __global uint *)(ptr + 16))); )==""\n"
56R"==(z.s3 = local_id < max_local_id ? ptr[24 + local_id] : 0; )==""\n"
57R"==(} )==""\n"
// The dangling else below pairs with the braced block after the embedded
// #endif, so the fast path is the only path when IC % 32 == 0.
58R"==(} else )==""\n"
59R"==(#endif )==""\n"
60R"==({ )==""\n"
61R"==(z = as_int4(intel_sub_group_block_read4((const __global uint *)ptr)); )==""\n"
62R"==(} )==""\n"
63R"==(return z; )==""\n"
64R"==(} )==""\n"
// read_src_zero_points_32g(ptr, g): same scheme as the 32c reader above, but
// bounded by G, returning an int2, and working in 16-element chunks
// (lane mask G % 16, fast path intel_sub_group_block_read2).
// NOTE(review): presumably intended for a sub-group size of 16 -- confirm.
65R"==(int2 read_src_zero_points_32g(const __global int *ptr, const int g) { )==""\n"
66R"==(int2 z; )==""\n"
67R"==(ptr += g; )==""\n"
68R"==(#if G % 32 != 0 )==""\n"
69R"==(if (g + 32 > G) { )==""\n"
70R"==(const int max_local_id = G % 16; )==""\n"
71R"==(const int local_id = get_sub_group_local_id(); )==""\n"
72R"==(if (g + 16 > G) { )==""\n"
73R"==(z.s0 = local_id < max_local_id ? ptr[0 + local_id] : 0; )==""\n"
74R"==(z.s1 = 0; )==""\n"
75R"==(} else { )==""\n"
76R"==(z.s0 = as_int( )==""\n"
77R"==(intel_sub_group_block_read((const __global uint *)ptr)); )==""\n"
78R"==(z.s1 = local_id < max_local_id ? ptr[16 + local_id] : 0; )==""\n"
79R"==(} )==""\n"
80R"==(} else )==""\n"
81R"==(#endif )==""\n"
82R"==({ )==""\n"
83R"==(z = as_int2(intel_sub_group_block_read2((const __global uint *)ptr)); )==""\n"
84R"==(} )==""\n"
85R"==(return z; )==""\n"
86R"==(} )==""\n"
// calc_src_compensation_x32(z, wei): returns the sum, over i in 0..7 and
// j in 0..3, of sub_group_broadcast(z[i >> 1], (i & 1) * 4 + j) multiplied
// by the j-th char of wei[i] (each int of wei is reinterpreted as char4).
87R"==(int calc_src_compensation_x32(int4 z, int8 wei) { )==""\n"
88R"==(int sum = 0; )==""\n"
89R"==(__attribute__((opencl_unroll_hint)) for (uint i = 0; i < 8; ++i) { )==""\n"
90R"==(char4 w = as_char4(wei[i]); )==""\n"
91R"==(sum += sub_group_broadcast(z[i >> 1], (i & 1) * 4 + 0) * w[0]; )==""\n"
92R"==(sum += sub_group_broadcast(z[i >> 1], (i & 1) * 4 + 1) * w[1]; )==""\n"
93R"==(sum += sub_group_broadcast(z[i >> 1], (i & 1) * 4 + 2) * w[2]; )==""\n"
94R"==(sum += sub_group_broadcast(z[i >> 1], (i & 1) * 4 + 3) * w[3]; )==""\n"
95R"==(} )==""\n"
96R"==(return sum; )==""\n"
97R"==(} )==""\n"
// Without WITH_SRC_ZPOINTS_PER_IC only a single-value reader is provided:
// read_src_zero_point(ptr) returns ptr[0].
98R"==(#else )==""\n"
99R"==(int read_src_zero_point(const __global int *ptr) { )==""\n"
100R"==(const int z = ptr[0]; )==""\n"
101R"==(return z; )==""\n"
102R"==(} )==""\n"
103R"==(#endif )==""\n"
// calc_src_compensation_x4(z, wei): defined for either WITH_SRC_ZPOINTS_PER_IC
// setting. Reinterprets wei as char4 and returns the component-wise dot
// product with z.
104R"==(int calc_src_compensation_x4(int4 z, int wei) { )==""\n"
105R"==(int sum = 0; )==""\n"
106R"==({ )==""\n"
107R"==(char4 w = as_char4(wei); )==""\n"
108R"==(sum += z.s0 * w.s0; )==""\n"
109R"==(sum += z.s1 * w.s1; )==""\n"
110R"==(sum += z.s2 * w.s2; )==""\n"
111R"==(sum += z.s3 * w.s3; )==""\n"
112R"==(} )==""\n"
113R"==(return sum; )==""\n"
114R"==(} )==""\n"
115R"==(#endif )==""\n"
// ---- Destination zero points (only when WITH_DST_ZPOINTS is non-zero) ----
116R"==(#if WITH_DST_ZPOINTS )==""\n"
// read_dst_zero_points_32c(ptr, oc): with WITH_DST_ZPOINTS_PER_OC, mirrors
// read_src_zero_points_32c using oc / OC (8-element chunks, lane mask OC % 8).
// Otherwise the int4 result is initialized from the single value ptr[0] and
// oc is unused.
117R"==(int4 read_dst_zero_points_32c(const __global int *ptr, const int oc) { )==""\n"
118R"==(#if WITH_DST_ZPOINTS_PER_OC )==""\n"
119R"==(int4 z; )==""\n"
120R"==(ptr += oc; )==""\n"
121R"==(#if OC % 32 != 0 )==""\n"
122R"==(if (oc + 32 > OC) { )==""\n"
123R"==(const int max_local_id = OC % 8; )==""\n"
124R"==(const int local_id = get_sub_group_local_id(); )==""\n"
125R"==(if (oc + 8 > OC) { )==""\n"
126R"==(z.s0 = local_id < max_local_id ? ptr[0 + local_id] : 0; )==""\n"
127R"==(z.s1 = 0; )==""\n"
128R"==(z.s2 = 0; )==""\n"
129R"==(z.s3 = 0; )==""\n"
130R"==(} else if (oc + 16 > OC) { )==""\n"
131R"==(z.s0 = as_int(intel_sub_group_block_read( )==""\n"
132R"==((const __global uint *)(ptr + 0))); )==""\n"
133R"==(z.s1 = local_id < max_local_id ? ptr[8 + local_id] : 0; )==""\n"
134R"==(z.s2 = 0; )==""\n"
135R"==(z.s3 = 0; )==""\n"
136R"==(} else if (oc + 24 > OC) { )==""\n"
137R"==(z.s0 = as_int(intel_sub_group_block_read( )==""\n"
138R"==((const __global uint *)(ptr + 0))); )==""\n"
139R"==(z.s1 = as_int(intel_sub_group_block_read( )==""\n"
140R"==((const __global uint *)(ptr + 8))); )==""\n"
141R"==(z.s2 = local_id < max_local_id ? ptr[16 + local_id] : 0; )==""\n"
142R"==(z.s3 = 0; )==""\n"
143R"==(} else { )==""\n"
144R"==(z.s0 = as_int(intel_sub_group_block_read( )==""\n"
145R"==((const __global uint *)(ptr + 0))); )==""\n"
146R"==(z.s1 = as_int(intel_sub_group_block_read( )==""\n"
147R"==((const __global uint *)(ptr + 8))); )==""\n"
148R"==(z.s2 = as_int(intel_sub_group_block_read( )==""\n"
149R"==((const __global uint *)(ptr + 16))); )==""\n"
150R"==(z.s3 = local_id < max_local_id ? ptr[24 + local_id] : 0; )==""\n"
151R"==(} )==""\n"
152R"==(} else )==""\n"
153R"==(#endif )==""\n"
154R"==({ )==""\n"
155R"==(z = as_int4(intel_sub_group_block_read4((const __global uint *)ptr)); )==""\n"
156R"==(} )==""\n"
157R"==(#else )==""\n"
158R"==(const int4 z = ptr[0]; )==""\n"
159R"==(#endif )==""\n"
160R"==(return z; )==""\n"
161R"==(} )==""\n"
// read_dst_zero_points_32g(ptr, g): with WITH_DST_ZPOINTS_PER_OC, mirrors
// read_src_zero_points_32g (16-element chunks, lane mask G % 16). Otherwise
// both int2 components are set from ptr[0] and g is unused.
162R"==(int2 read_dst_zero_points_32g(const __global int *ptr, const int g) { )==""\n"
163R"==(#if WITH_DST_ZPOINTS_PER_OC )==""\n"
164R"==(int2 z; )==""\n"
165R"==(ptr += g; )==""\n"
166R"==(#if G % 32 != 0 )==""\n"
167R"==(if (g + 32 > G) { )==""\n"
168R"==(const int max_local_id = G % 16; )==""\n"
169R"==(const int local_id = get_sub_group_local_id(); )==""\n"
170R"==(if (g + 16 > G) { )==""\n"
171R"==(z.s0 = local_id < max_local_id ? ptr[0 + local_id] : 0; )==""\n"
172R"==(z.s1 = 0; )==""\n"
173R"==(} else { )==""\n"
174R"==(z.s0 = as_int( )==""\n"
175R"==(intel_sub_group_block_read((const __global uint *)ptr)); )==""\n"
176R"==(z.s1 = local_id < max_local_id ? ptr[16 + local_id] : 0; )==""\n"
177R"==(} )==""\n"
178R"==(} else )==""\n"
179R"==(#endif )==""\n"
180R"==({ )==""\n"
181R"==(z = as_int2(intel_sub_group_block_read2((const __global uint *)ptr)); )==""\n"
182R"==(} )==""\n"
183R"==(#else )==""\n"
184R"==(const int2 z = (int2)(ptr[0]); )==""\n"
185R"==(#endif )==""\n"
186R"==(return z; )==""\n"
187R"==(} )==""\n"
188R"==(#endif )==""\n"
// ---- Shared helper (when either source or destination zero points are on) ----
// zero_pad_dst_32c(dst, oc): returns dst with the components that fall past OC
// cleared. Only active when OC % 32 != 0 and oc + 32 > OC: in the partial
// 8-element chunk, lanes >= OC % 8 are zeroed; every later component is
// zeroed for all lanes; earlier components are left untouched. In all other
// cases dst is returned unchanged.
189R"==(#if WITH_SRC_ZPOINTS || WITH_DST_ZPOINTS )==""\n"
190R"==(float4 zero_pad_dst_32c(float4 dst, const int oc) { )==""\n"
191R"==(#if OC % 32 != 0 )==""\n"
192R"==(if (oc + 32 > OC) { )==""\n"
193R"==(const int max_local_id = OC % 8; )==""\n"
194R"==(const int local_id = get_sub_group_local_id(); )==""\n"
195R"==(if (oc + 8 > OC) { )==""\n"
196R"==(dst.s0 = local_id < max_local_id ? dst.s0 : 0; )==""\n"
197R"==(dst.s1 = 0; )==""\n"
198R"==(dst.s2 = 0; )==""\n"
199R"==(dst.s3 = 0; )==""\n"
200R"==(} else if (oc + 16 > OC) { )==""\n"
201R"==(dst.s1 = local_id < max_local_id ? dst.s1 : 0; )==""\n"
202R"==(dst.s2 = 0; )==""\n"
203R"==(dst.s3 = 0; )==""\n"
204R"==(} else if (oc + 24 > OC) { )==""\n"
205R"==(dst.s2 = local_id < max_local_id ? dst.s2 : 0; )==""\n"
206R"==(dst.s3 = 0; )==""\n"
207R"==(} else { )==""\n"
208R"==(dst.s3 = local_id < max_local_id ? dst.s3 : 0; )==""\n"
209R"==(} )==""\n"
210R"==(} )==""\n"
211R"==(#endif )==""\n"
212R"==(return dst; )==""\n"
213R"==(} )==""\n"
214R"==(#endif )==""\n"
// Closes the GPU_OCL_OCL_ZERO_POINTS_H include guard of the embedded header.
215R"==(#endif )==""\n"
// Empty final piece; it only terminates the concatenated literal.
216R"==()==";
217} // namespace ocl
218} // namespace gpu
219} // namespace impl
220} // namespace dnnl