1 | #include <iostream> |
2 | #include <pybind11/numpy.h> |
3 | #include <pybind11/pybind11.h> |
4 | #include <pybind11/stl.h> |
5 | #include <string> |
6 | #include <tuple> |
7 | #include <vector> |
8 | #ifdef _OPENMP |
9 | #include <omp.h> |
10 | #endif |
11 | |
12 | // row-major 3d tensor |
13 | class tensor_3d { |
14 | public: |
15 | tensor_3d(int size_0, int size_1, int size_2, int *data = nullptr) : data_(size_0 * size_1 * size_2, 0) { |
16 | if (data) |
17 | std::copy(data, data + data_.size(), data_.begin()); |
18 | stride_0_ = size_1 * size_2; |
19 | stride_1_ = size_2; |
20 | stride_2_ = 1; |
21 | } |
22 | |
23 | int &operator()(int i, int j, int k) { |
24 | return data_[i * stride_0_ + j * stride_1_ + k]; |
25 | } |
26 | |
27 | private: |
28 | std::vector<int> data_; |
29 | int stride_0_; |
30 | int stride_1_; |
31 | int stride_2_; |
32 | }; |
33 | |
34 | std::vector<int> segment_blocks(tensor_3d &layout, tensor_3d &idx, int max_width, int H, int M, int N) { |
35 | tensor_3d tmp(H, M, N); |
36 | std::vector<int> current(H, 0); |
37 | int num = 0; |
38 | std::vector<int> lut(H * M * N * 4); |
39 | for (size_t h = 0; h < H; h++) { |
40 | // surrounding indices |
41 | std::vector<int> ii_left(max_width, -1); |
42 | std::vector<std::vector<int>> ii_top(max_width, std::vector<int>(N, -1)); |
43 | // start the dynamic programming algorithm |
44 | for (size_t m = 0; m < M; m++) { |
45 | for (size_t n = 0; n < N; n++) { |
46 | int v = layout(h, m, n); |
47 | if (v == 0) |
48 | continue; |
49 | int n_left = ii_left[max_width - 1]; |
50 | int m_top = ii_top[max_width - 1][n]; |
51 | int top = (m_top >= 0) ? tmp(h, m_top, n) : 0; |
52 | int left = (n_left >= 0) ? tmp(h, m, n_left) : 0; |
53 | int topleft = (m_top >= 0 && n_left >= 0) ? tmp(h, m_top, n_left) : 0; |
54 | int width = std::min(left, std::min(top, topleft)) + 1; |
55 | // reset width if blocks cannot be |
56 | // packed together (i.e., there's a 1 "in the middle") |
57 | for (int nn = n_left + 1; nn < n; nn++) |
58 | if (ii_top[max_width - 1][nn] > ii_top[max_width - 1][n]) |
59 | width = 1; |
60 | tmp(h, m, n) = width; |
61 | // update n_left ring buffer |
62 | for (int k = 0; k < max_width - 1; k++) |
63 | ii_left[k] = ii_left[k + 1]; |
64 | ii_left[max_width - 1] = n; |
65 | // update ii_top ring buffer |
66 | for (int k = 0; k < max_width - 1; k++) |
67 | ii_top[k][n] = ii_top[k + 1][n]; |
68 | ii_top[max_width - 1][n] = m; |
69 | // block is too small -- skip |
70 | if (width != max_width) |
71 | continue; |
72 | // retained blocks are set to zeros |
73 | for (size_t km = 0; km < max_width; km++) |
74 | for (size_t kn = 0; kn < max_width; kn++) { |
75 | int mm = ii_top[km][n]; |
76 | int nn = ii_left[kn]; |
77 | if (mm < 0 || nn < 0) |
78 | continue; |
79 | layout(h, mm, nn) = 0; |
80 | tmp(h, mm, nn) = 0; |
81 | lut[num++] = (int)h; |
82 | lut[num++] = (int)mm; |
83 | lut[num++] = (int)nn; |
84 | lut[num++] = idx(h, mm, nn); |
85 | } |
86 | } |
87 | } |
88 | } |
89 | lut.resize(num); |
90 | return lut; |
91 | } |
92 | |
93 | typedef std::pair<int, pybind11::array_t<int>> lut_t; |
94 | |
95 | std::vector<lut_t> superblock(uintptr_t LAYOUT, int H, int M, int N, int start_width) { |
96 | std::vector<lut_t> ret; |
97 | int current = 0; |
98 | tensor_3d layout(H, M, N, (int *)LAYOUT); |
99 | tensor_3d idx(H, M, N); |
100 | for (int64_t h = 0; h < H; h++) |
101 | for (int64_t m = 0; m < M; m++) |
102 | for (int64_t n = 0; n < N; n++) { |
103 | if (layout(h, m, n) == 0) |
104 | continue; |
105 | idx(h, m, n) = current++; |
106 | } |
107 | // create lut |
108 | for (int max_width = start_width; max_width > 0; max_width /= 2) { |
109 | auto lut = segment_blocks(layout, idx, max_width, H, M, N); |
110 | if (lut.size() == 0) |
111 | continue; |
112 | ret.push_back(std::make_pair(max_width, pybind11::array_t<int>(lut.size(), lut.data()))); |
113 | } |
114 | return ret; |
115 | } |
116 | |
117 | void init_superblocking(pybind11::module &m) { |
118 | m.def("superblock" , &superblock, "super-blocking for block-sparse matrix multiplication" ); |
119 | } |