superblock.cc source code [triton/python/src/superblock.cc]

1	#include <iostream>
2	#include <pybind11/numpy.h>
3	#include <pybind11/pybind11.h>
4	#include <pybind11/stl.h>
5	#include <string>
6	#include <tuple>
7	#include <vector>
8	#ifdef _OPENMP
9	#include <omp.h>
10	#endif
11
12	// row-major 3d tensor
13	class tensor_3d {
14	public:
15	tensor_3d(int size_0, int size_1, int size_2, int data = nullptr) : data_(size_0 size_1 * size_2, `0`) {
16	if (data)
17	std::copy(data, data + data_.size(), data_.begin());
18	stride_0_ = size_1 * size_2;
19	stride_1_ = size_2;
20	stride_2_ = `1`;
21	}
22
23	int &operator()(int i, int j, int k) {
24	return data_[i * stride_0_ + j * stride_1_ + k];
25	}
26
27	private:
28	std::vector<int> data_;
29	int stride_0_;
30	int stride_1_;
31	int stride_2_;
32	};
33
34	std::vector<int> segment_blocks(tensor_3d &layout, tensor_3d &idx, int max_width, int H, int M, int N) {
35	tensor_3d tmp(H, M, N);
36	std::vector<int> current(H, `0`);
37	int num = `0`;
38	std::vector<int> lut(H * M * N * `4`);
39	for (size_t h = `0`; h < H; h++) {
40	// surrounding indices
41	std::vector<int> ii_left(max_width, -`1`);
42	std::vector<std::vector<int>> ii_top(max_width, std::vector<int>(N, -`1`));
43	// start the dynamic programming algorithm
44	for (size_t m = `0`; m < M; m++) {
45	for (size_t n = `0`; n < N; n++) {
46	int v = layout (h, m, n);
47	if (v == `0`)
48	continue;
49	int n_left = ii_left [max_width - `1`];
50	int m_top = ii_top [max_width - `1`][n];
51	int top = (m_top >= `0`) ? tmp (h, m_top, n) : `0`;
52	int left = (n_left >= `0`) ? tmp (h, m, n_left) : `0`;
53	int topleft = (m_top >= `0` && n_left >= `0`) ? tmp (h, m_top, n_left) : `0`;
54	int width = std::min(left, std::min(top, topleft)) + `1`;
55	// reset width if blocks cannot be
56	// packed together (i.e., there's a 1 "in the middle")
57	for (int nn = n_left + `1`; nn < n; nn++)
58	if (ii_top [max_width - `1`][nn] > ii_top [max_width - `1`][n])
59	width = `1`;
60	tmp (h, m, n) = width;
61	// update n_left ring buffer
62	for (int k = `0`; k < max_width - `1`; k++)
63	ii_left [k] = ii_left [k + `1`];
64	ii_left [max_width - `1`] = n;
65	// update ii_top ring buffer
66	for (int k = `0`; k < max_width - `1`; k++)
67	ii_top [k][n] = ii_top [k + `1`][n];
68	ii_top [max_width - `1`][n] = m;
69	// block is too small -- skip
70	if (width != max_width)
71	continue;
72	// retained blocks are set to zeros
73	for (size_t km = `0`; km < max_width; km++)
74	for (size_t kn = `0`; kn < max_width; kn++) {
75	int mm = ii_top [km][n];
76	int nn = ii_left [kn];
77	if (mm < `0` \|\| nn < `0`)
78	continue;
79	layout (h, mm, nn) = `0`;
80	tmp (h, mm, nn) = `0`;
81	lut [num++] = (int)h;
82	lut [num++] = (int)mm;
83	lut [num++] = (int)nn;
84	lut [num++] = idx (h, mm, nn);
85	}
86	}
87	}
88	}
89	lut.resize(num);
90	return lut;
91	}
92
93	typedef std::pair<int, pybind11::array_t<int>> lut_t;
94
95	std::vector<lut_t> superblock(uintptr_t LAYOUT, int H, int M, int N, int start_width) {
96	std::vector<lut_t> ret;
97	int current = `0`;
98	tensor_3d layout(H, M, N, (int *)LAYOUT);
99	tensor_3d idx(H, M, N);
100	for (int64_t h = `0`; h < H; h++)
101	for (int64_t m = `0`; m < M; m++)
102	for (int64_t n = `0`; n < N; n++) {
103	if (layout (h, m, n) == `0`)
104	continue;
105	idx (h, m, n) = current++;
106	}
107	// create lut
108	for (int max_width = start_width; max_width > `0`; max_width /= `2`) {
109	auto lut = segment_blocks(layout, idx, max_width, H, M, N);
110	if (lut.size() == `0`)
111	continue;
112	ret.push_back(std::make_pair(max_width, pybind11::array_t<int>(lut.size(), lut.data())));
113	}
114	return ret;
115	}
116
117	void init_superblocking(pybind11::module &m) {
118	m.def("superblock", &superblock, "super-blocking for block-sparse matrix multiplication");
119	}

Browse the source code of triton/python/src/superblock.cc