/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_MATMUL_UTILS_HPP
#define CPU_MATMUL_UTILS_HPP

#include "common/memory_desc_wrapper.hpp"
#include "common/utils.hpp"

namespace dnnl {
namespace impl {
namespace cpu {

namespace matmul {

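// Helper that maps an (optionally batched) matmul problem, described by the
// src/weights/dst memory descriptors, onto the parameters of a GEMM call:
// transA/transB, the M/N/K sizes, and the lda/ldb/ldc leading dimensions.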
struct matmul_helper_t {
    using mdw_t = const memory_desc_wrapper;

    matmul_helper_t(mdw_t &src_md, mdw_t &weights_md, mdw_t &dst_md)
        : src_md_(src_md), weights_md_(weights_md), dst_md_(dst_md) {}

    int ndims() const { return dst_md_.ndims(); }
    bool batched() const { return ndims() > 2; }

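    // Product of all batch (outermost) dims, i.e. every dim except the
    // trailing two. E.g. for dst dims {2, 3, M, N}, batch() == 6.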
    dim_t batch() const {
        return utils::array_product(dst_md_.dims(), ndims() - 2);
    }
    dim_t src_batch() const {
        return utils::array_product(src_md_.dims(), ndims() - 2);
    }
    dim_t wei_batch() const {
        return utils::array_product(weights_md_.dims(), ndims() - 2);
    }

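    // GEMM sizes: dst = src * weights is (M x K) * (K x N) -> (M x N),
    // taken from the two innermost dims of the descriptors.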
    dim_t M() const { return dst_md_.dims()[ndims() - 2]; }
    dim_t N() const { return dst_md_.dims()[ndims() - 1]; }
    dim_t K() const { return src_md_.dims()[ndims() - 1]; }

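    // An operand is reported as non-transposed ('N') when its innermost
    // stride is 1, i.e. its rows are dense; a unit-size outer dim falls back
    // to 'T', for which the leading dimension below is always well-defined.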
    char transA() const {
        const auto *strides = &src_md_.blocking_desc().strides[ndims() - 2];
        return (strides[1] == 1 && src_md_.dims()[ndims() - 2] > 1) ? 'N' : 'T';
    }

    char transB() const {
        const auto *strides = &weights_md_.blocking_desc().strides[ndims() - 2];
        return (strides[1] == 1 && weights_md_.dims()[ndims() - 2] > 1) ? 'N'
                                                                        : 'T';
    }

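    // Leading dimensions for the GEMM call: the stride between consecutive
    // rows of a non-transposed operand, or between consecutive columns of a
    // transposed one.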
    dim_t lda() const {
        const auto *strides = &src_md_.blocking_desc().strides[ndims() - 2];
        return strides[transA() == 'N' ? 0 : 1];
    }

    dim_t ldb() const {
        const auto *strides = &weights_md_.blocking_desc().strides[ndims() - 2];
        return strides[transB() == 'N' ? 0 : 1];
    }

    dim_t ldc() const { return dst_md_.blocking_desc().strides[ndims() - 2]; }

    // TODO: a similar optimization is also possible for wei batch fusion.
    bool can_fuse_src_batch_dims() const {
        /* Note:
            We can fuse the src batch dims so that a single GeMM can be used
            iff
            1. src is not transposed,
            2. wei batch dims are all 1's,
            3. the strides in the batch dims are trivial (allowing
               permutations), and
            4. src and dst use the identical batch-dim permutation:
                src layout: {batch dim_idx permutation} x M x K
                dst layout: {identical batch dim_idx permutation} x M x N

            For example,
                src layout: a x d x c x b x m x k
                wei layout: 1 x 1 x 1 x 1 x k x n (or 1 x 1 x 1 x 1 x n x k)
                dst layout: a x d x c x b x m x n

            can be computed with a single GeMM call with M = a*d*c*b*m.
        */
        // Note 1:
        if (transA() == 'T') return false;

        const int n_dims = ndims();
        const int batch_ndims = n_dims - 2;
        if (batch_ndims == 0) return true;

        // Note 2:
        if (utils::array_product(weights_md_.dims(), batch_ndims) != 1)
            return false;

        // Determine the batch dims layout.
        dims_t src_strides;
        utils::array_copy(
                src_strides, src_md_.blocking_desc().strides, batch_ndims);

        // Compute the outer dims (number of blocks per dim); these are
        // required to recover the correct perm below.
        dims_t blocks = {0};
        src_md_.compute_blocks(blocks);
        dims_t ou_dims;
        for (int i = 0; i < batch_ndims; ++i)
            ou_dims[i] = src_md_.padded_dims()[i] / blocks[i];

        dims_t perm;
        for (int i = 0; i < batch_ndims; ++i)
            perm[i] = i;

        // Permute the batch dim indices by sorting on the strides
        // (ascending, so the innermost batch dim comes first).
        utils::simultaneous_sort(src_strides, ou_dims, perm, batch_ndims,
                [](stride_t a, stride_t b) { return a - b; });
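        // E.g. for a plain row-major src with batch dims {2, 3}, the batch
        // strides are {3*M*K, M*K}, so the sort yields perm = {1, 0}: the
        // loop below then visits dim 1 (stride M*K) first and dim 0
        // (stride 3*M*K) second.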

        dim_t src_stride = M() * lda();
        dim_t dst_stride = M() * ldc();

        // Notes 3 and 4: walk the batch dims from innermost to outermost and
        // check that both src and dst are dense along them with the same
        // permutation.
        for (int i = 0; i < batch_ndims; ++i) {
            const int dim_idx = perm[i];
            if (src_md_.blocking_desc().strides[dim_idx] != src_stride
                    || dst_md_.blocking_desc().strides[dim_idx] != dst_stride)
                return false;
            src_stride = src_stride * src_md_.dims()[dim_idx];
            dst_stride = dst_stride * dst_md_.dims()[dim_idx];
        }

        return true;
    }

private:
    mdw_t src_md_;
    mdw_t weights_md_;
    mdw_t dst_md_;
};
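
// Usage sketch (illustrative only, not part of this header): assuming a
// hypothetical BLAS-like entry point
// my_gemm(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc),
// a matmul whose src batch dims can be fused might be lowered to a single
// call along these lines:
//
//   matmul_helper_t helper(src_d, weights_d, dst_d);
//   dim_t M = helper.M(), N = helper.N(), K = helper.K();
//   if (helper.can_fuse_src_batch_dims()) M *= helper.batch();
//   my_gemm(helper.transA(), helper.transB(), M, N, K, 1.0f, src,
//           helper.lda(), weights, helper.ldb(), 0.0f, dst, helper.ldc());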

} // namespace matmul
} // namespace cpu
} // namespace impl
} // namespace dnnl
#endif