#include "taichi/program/sparse_matrix.h"

#include <cstdint>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_map>
#include <utility>
#include <vector>

#include "Eigen/Dense"
#include "Eigen/SparseLU"
11 | |
// Expands to a statement block that rebuilds `matrix_` from the triplet list
// pointed to by `triplets_adr`, interpreted as a
// std::vector<Eigen::Triplet<float{TYPE}>>* (e.g. BUILD(32) -> float32).
#define BUILD(TYPE)                                                          \
  {                                                                          \
    using T = Eigen::Triplet<float##TYPE>;                                   \
    std::vector<T> *triplets = static_cast<std::vector<T> *>(triplets_adr);  \
    matrix_.setFromTriplets(triplets->begin(), triplets->end());             \
  }
18 | |
// Expands to one {key, factory} entry of the make_sparse_matrix() dispatch
// map: key is ("f32"/"f64", "ColMajor"/"RowMajor"), value is a function that
// creates the matching EigenSparseMatrix instantiation.
#define MAKE_MATRIX(TYPE, STORAGE)                                           \
  {                                                                          \
    Pair("f" #TYPE, #STORAGE),                                               \
        [](int rows, int cols, DataType dt) -> std::unique_ptr<SparseMatrix> { \
          using FC = Eigen::SparseMatrix<float##TYPE, Eigen::STORAGE>;       \
          return std::make_unique<EigenSparseMatrix<FC>>(rows, cols, dt);    \
        }                                                                    \
  }
27 | |
// Emits an explicit instantiation of EigenSparseMatrix<...>::spmv for the
// given scalar type and storage order (used at the bottom of this file).
#define INSTANTIATE_SPMV(type, storage)                                  \
  template void                                                          \
  EigenSparseMatrix<Eigen::SparseMatrix<type, Eigen::storage>>::spmv(    \
      Program *prog, const Ndarray &x, const Ndarray &y);
32 | |
33 | namespace { |
34 | using Pair = std::pair<std::string, std::string>; |
// Hash functor for the (data-type name, storage format) string pairs used as
// keys of the sparse-matrix factory map in make_sparse_matrix().
struct key_hash {
  std::size_t operator()(
      const std::pair<std::string, std::string> &k) const {
    const auto h1 = std::hash<std::string>{}(k.first);
    const auto h2 = std::hash<std::string>{}(k.second);
    // Combine the two hashes boost::hash_combine-style. The previous plain
    // XOR was symmetric (hash(a,b) == hash(b,a)) and collapsed to 0 whenever
    // both components were equal, needlessly increasing collisions.
    return h1 ^ (h2 + 0x9e3779b97f4a7c15ULL + (h1 << 6) + (h1 >> 2));
  }
};
42 | |
43 | template <typename T, typename T1, typename T2> |
44 | void print_triplets_from_csr(int64_t n_rows, |
45 | int n_cols, |
46 | T *row, |
47 | T1 *col, |
48 | T2 *value, |
49 | std::ostringstream &ostr) { |
50 | using Triplets = Eigen::Triplet<T2>; |
51 | std::vector<Triplets> trips; |
52 | for (int64_t i = 1; i <= n_rows; ++i) { |
53 | auto n_i = row[i] - row[i - 1]; |
54 | for (auto j = 0; j < n_i; ++j) { |
55 | trips.push_back({static_cast<int>(i - 1), |
56 | static_cast<int>(col[row[i - 1] + j]), |
57 | static_cast<float>(value[row[i - 1] + j])}); |
58 | } |
59 | } |
60 | Eigen::SparseMatrix<float> m(n_rows, n_cols); |
61 | m.setFromTriplets(trips.begin(), trips.end()); |
62 | Eigen::IOFormat clean_fmt(4, 0, ", " , "\n" , "[" , "]" ); |
63 | ostr << Eigen::MatrixXf(m.cast<float>()).format(clean_fmt); |
64 | } |
65 | |
66 | template <typename T, typename T1, typename T2> |
67 | T2 get_element_from_csr(int row, |
68 | int col, |
69 | T *row_data, |
70 | T1 *col_data, |
71 | T2 *value) { |
72 | for (T i = row_data[row]; i < row_data[row + 1]; ++i) { |
73 | if (col == col_data[i]) |
74 | return value[i]; |
75 | } |
76 | // zero entry |
77 | return 0; |
78 | } |
79 | |
80 | } // namespace |
81 | |
82 | namespace taichi::lang { |
83 | |
// Allocates the builder's backing ndarray for accumulating COO triplets.
// Buffer layout: [count, r0, c0, v0, r1, c1, v1, ...] — one counter slot
// followed by up to `max_num_triplets` (row, col, value) triples.
SparseMatrixBuilder::SparseMatrixBuilder(int rows,
                                         int cols,
                                         int max_num_triplets,
                                         DataType dtype,
                                         const std::string &storage_format,
                                         Program *prog)
    : rows_(rows),
      cols_(cols),
      max_num_triplets_(max_num_triplets),
      dtype_(dtype),
      storage_format_(storage_format),
      prog_(prog) {
  auto element_size = data_type_size(dtype);
  // Only 4-byte (f32) and 8-byte (f64) scalar types are supported.
  TI_ASSERT((element_size == 4 || element_size == 8));
  ndarray_data_base_ptr_ = std::make_unique<Ndarray>(
      prog_, dtype_, std::vector<int>{3 * (int)max_num_triplets_ + 1});
}
101 | |
102 | template <typename T, typename G> |
103 | void SparseMatrixBuilder::print_triplets_template() { |
104 | auto ptr = get_ndarray_data_ptr(); |
105 | G *data = reinterpret_cast<G *>(ptr); |
106 | num_triplets_ = data[0]; |
107 | fmt::print("n={}, m={}, num_triplets={} (max={})\n" , rows_, cols_, |
108 | num_triplets_, max_num_triplets_); |
109 | data += 1; |
110 | for (int i = 0; i < num_triplets_; i++) { |
111 | fmt::print("[{}, {}] = {}\n" , data[i * 3], data[i * 3 + 1], |
112 | taichi_union_cast<T>(data[i * 3 + 2])); |
113 | } |
114 | } |
115 | |
116 | void SparseMatrixBuilder::print_triplets_eigen() { |
117 | auto element_size = data_type_size(dtype_); |
118 | switch (element_size) { |
119 | case 4: |
120 | print_triplets_template<float32, int32>(); |
121 | break; |
122 | case 8: |
123 | print_triplets_template<float64, int64>(); |
124 | break; |
125 | default: |
126 | TI_ERROR("Unsupported sparse matrix data type!" ); |
127 | break; |
128 | } |
129 | } |
130 | |
// Prints the builder's triplets when the buffer lives in CUDA device memory.
// No-op when built without CUDA support.
void SparseMatrixBuilder::print_triplets_cuda() {
#ifdef TI_WITH_CUDA
  // The first word of the device buffer is the triplet count.
  CUDADriver::get_instance().memcpy_device_to_host(
      &num_triplets_, (void *)get_ndarray_data_ptr(), sizeof(int));
  fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_,
             num_triplets_, max_num_triplets_);
  // Copy the whole [count, r0, c0, v0, ...] buffer. Row/col indices were
  // written as int bit patterns into float slots, hence the union casts.
  auto len = 3 * num_triplets_ + 1;
  std::vector<float32> trips(len);
  CUDADriver::get_instance().memcpy_device_to_host(
      (void *)trips.data(), (void *)get_ndarray_data_ptr(),
      len * sizeof(float32));
  for (auto i = 0; i < num_triplets_; i++) {
    // Triplet i occupies slots [3i+1 .. 3i+3] (slot 0 is the counter).
    int row = taichi_union_cast<int>(trips[3 * i + 1]);
    int col = taichi_union_cast<int>(trips[3 * i + 2]);
    auto val = trips[i * 3 + 3];
    fmt::print("[{}, {}] = {}\n", row, col, val);
  }
#endif
}
150 | |
// Returns the address of the builder's triplet buffer as an integer, as
// provided by the Program that owns the backing ndarray.
intptr_t SparseMatrixBuilder::get_ndarray_data_ptr() const {
  return prog_->get_ndarray_data_ptr_as_int(ndarray_data_base_ptr_.get());
}
154 | |
155 | template <typename T, typename G> |
156 | void SparseMatrixBuilder::build_template(std::unique_ptr<SparseMatrix> &m) { |
157 | using V = Eigen::Triplet<T>; |
158 | std::vector<V> triplets; |
159 | auto ptr = get_ndarray_data_ptr(); |
160 | G *data = reinterpret_cast<G *>(ptr); |
161 | num_triplets_ = data[0]; |
162 | data += 1; |
163 | for (int i = 0; i < num_triplets_; i++) { |
164 | triplets.push_back( |
165 | V(data[i * 3], data[i * 3 + 1], taichi_union_cast<T>(data[i * 3 + 2]))); |
166 | } |
167 | m->build_triplets(static_cast<void *>(&triplets)); |
168 | clear(); |
169 | } |
170 | |
171 | std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build() { |
172 | TI_ASSERT(built_ == false); |
173 | built_ = true; |
174 | auto sm = make_sparse_matrix(rows_, cols_, dtype_, storage_format_); |
175 | auto element_size = data_type_size(dtype_); |
176 | switch (element_size) { |
177 | case 4: |
178 | build_template<float32, int32>(sm); |
179 | break; |
180 | case 8: |
181 | build_template<float64, int64>(sm); |
182 | break; |
183 | default: |
184 | TI_ERROR("Unsupported sparse matrix data type!" ); |
185 | break; |
186 | } |
187 | return sm; |
188 | } |
189 | |
190 | std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build_cuda() { |
191 | TI_ASSERT(built_ == false); |
192 | built_ = true; |
193 | auto sm = make_cu_sparse_matrix(rows_, cols_, dtype_); |
194 | #ifdef TI_WITH_CUDA |
195 | CUDADriver::get_instance().memcpy_device_to_host( |
196 | &num_triplets_, (void *)get_ndarray_data_ptr(), sizeof(int)); |
197 | auto len = 3 * num_triplets_ + 1; |
198 | std::vector<float32> trips(len); |
199 | CUDADriver::get_instance().memcpy_device_to_host( |
200 | (void *)trips.data(), (void *)get_ndarray_data_ptr(), |
201 | len * sizeof(float32)); |
202 | std::unordered_map<int, std::tuple<int, int, float32>> entries; |
203 | for (auto i = 0; i < num_triplets_; i++) { |
204 | int row = taichi_union_cast<int>(trips[3 * i + 1]); |
205 | int col = taichi_union_cast<int>(trips[3 * i + 2]); |
206 | auto val = trips[i * 3 + 3]; |
207 | auto e_idx = row * cols_ + col; |
208 | if (entries.find(e_idx) == entries.end()) { |
209 | entries[e_idx] = std::make_tuple(row, col, val); |
210 | } else { |
211 | auto [r, c, v] = entries[e_idx]; |
212 | entries[e_idx] = std::make_tuple(r, c, v + val); |
213 | } |
214 | } |
215 | auto entry_size = entries.size(); |
216 | int *row_host = (int *)malloc(sizeof(int) * entry_size); |
217 | int *col_host = (int *)malloc(sizeof(int) * entry_size); |
218 | float32 *value_host = (float32 *)malloc(sizeof(float32) * entry_size); |
219 | int count = 0; |
220 | for (auto entry : entries) { |
221 | auto [row, col, value] = entry.second; |
222 | row_host[count] = row; |
223 | col_host[count] = col; |
224 | value_host[count] = value; |
225 | count++; |
226 | } |
227 | void *row_device = nullptr, *col_device = nullptr, *value_device = nullptr; |
228 | CUDADriver::get_instance().malloc(&row_device, entry_size * sizeof(int)); |
229 | CUDADriver::get_instance().malloc(&col_device, entry_size * sizeof(int)); |
230 | CUDADriver::get_instance().malloc(&value_device, |
231 | entry_size * sizeof(float32)); |
232 | CUDADriver::get_instance().memcpy_host_to_device(row_device, (void *)row_host, |
233 | entry_size * sizeof(int)); |
234 | CUDADriver::get_instance().memcpy_host_to_device(col_device, (void *)col_host, |
235 | entry_size * sizeof(int)); |
236 | CUDADriver::get_instance().memcpy_host_to_device( |
237 | value_device, (void *)value_host, entry_size * sizeof(float32)); |
238 | sm->build_csr_from_coo(row_device, col_device, value_device, entry_size); |
239 | clear(); |
240 | free(row_host); |
241 | free(col_host); |
242 | free(value_host); |
243 | #endif |
244 | return sm; |
245 | } |
246 | |
// Resets the builder for a fresh accumulation cycle: zeroes the on-buffer
// triplet counter (slot 0) and clears the built flag.
void SparseMatrixBuilder::clear() {
  built_ = false;
  ndarray_data_base_ptr_->write_int(std::vector<int>{0}, 0);
  num_triplets_ = 0;
}
252 | |
253 | template <class EigenMatrix> |
254 | const std::string EigenSparseMatrix<EigenMatrix>::to_string() const { |
255 | Eigen::IOFormat clean_fmt(4, 0, ", " , "\n" , "[" , "]" ); |
256 | // Note that the code below first converts the sparse matrix into a dense one. |
257 | // https://stackoverflow.com/questions/38553335/how-can-i-print-in-console-a-formatted-sparse-matrix-with-eigen |
258 | std::ostringstream ostr; |
259 | ostr << Eigen::MatrixXf(matrix_.template cast<float>()).format(clean_fmt); |
260 | return ostr.str(); |
261 | } |
262 | |
// Populates `matrix_` from a type-erased triplet list: `triplets_adr` must
// point at a std::vector<Eigen::Triplet<T>> whose T matches this matrix's
// dtype (f32 or f64). The BUILD macro performs the cast and setFromTriplets.
template <class EigenMatrix>
void EigenSparseMatrix<EigenMatrix>::build_triplets(void *triplets_adr) {
  std::string sdtype = taichi::lang::data_type_name(dtype_);
  if (sdtype == "f32") {
    BUILD(32)
  } else if (sdtype == "f64") {
    BUILD(64)
  } else {
    TI_ERROR("Unsupported sparse matrix data type {}!", sdtype);
  }
}
274 | |
275 | template <class EigenMatrix> |
276 | void EigenSparseMatrix<EigenMatrix>::spmv(Program *prog, |
277 | const Ndarray &x, |
278 | const Ndarray &y) { |
279 | size_t dX = prog->get_ndarray_data_ptr_as_int(&x); |
280 | size_t dY = prog->get_ndarray_data_ptr_as_int(&y); |
281 | std::string sdtype = taichi::lang::data_type_name(dtype_); |
282 | if (sdtype == "f32" ) { |
283 | Eigen::Map<Eigen::VectorXf>((float *)dY, cols_) = |
284 | matrix_.template cast<float>() * |
285 | Eigen::Map<Eigen::VectorXf>((float *)dX, cols_); |
286 | } else if (sdtype == "f64" ) { |
287 | Eigen::Map<Eigen::VectorXd>((double *)dY, cols_) = |
288 | matrix_.template cast<double>() * |
289 | Eigen::Map<Eigen::VectorXd>((double *)dX, cols_); |
290 | } else { |
291 | TI_ERROR("Unsupported sparse matrix data type {}!" , sdtype); |
292 | } |
293 | } |
294 | |
// Explicit instantiations of spmv for every supported (scalar, storage)
// combination, keeping the template definitions in this translation unit.
INSTANTIATE_SPMV(float32, ColMajor)
INSTANTIATE_SPMV(float32, RowMajor)
INSTANTIATE_SPMV(float64, ColMajor)
INSTANTIATE_SPMV(float64, RowMajor)
299 | |
300 | std::unique_ptr<SparseMatrix> make_sparse_matrix( |
301 | int rows, |
302 | int cols, |
303 | DataType dt, |
304 | const std::string &storage_format = "col_major" ) { |
305 | using func_type = std::unique_ptr<SparseMatrix> (*)(int, int, DataType); |
306 | static const std::unordered_map<Pair, func_type, key_hash> map = { |
307 | MAKE_MATRIX(32, ColMajor), MAKE_MATRIX(32, RowMajor), |
308 | MAKE_MATRIX(64, ColMajor), MAKE_MATRIX(64, RowMajor)}; |
309 | std::unordered_map<std::string, std::string> format_map = { |
310 | {"col_major" , "ColMajor" }, {"row_major" , "RowMajor" }}; |
311 | std::string tdt = taichi::lang::data_type_name(dt); |
312 | Pair key = std::make_pair(tdt, format_map.at(storage_format)); |
313 | auto it = map.find(key); |
314 | if (it != map.end()) { |
315 | auto func = map.at(key); |
316 | return func(rows, cols, dt); |
317 | } else |
318 | TI_ERROR("Unsupported sparse matrix data type: {}, storage format: {}" , tdt, |
319 | storage_format); |
320 | } |
321 | |
322 | std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows, |
323 | int cols, |
324 | DataType dt) { |
325 | return std::unique_ptr<SparseMatrix>( |
326 | std::make_unique<CuSparseMatrix>(rows, cols, dt)); |
327 | } |
328 | |
329 | std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(cusparseSpMatDescr_t mat, |
330 | int rows, |
331 | int cols, |
332 | DataType dt, |
333 | void *csr_row_ptr, |
334 | void *csr_col_ind, |
335 | void *csr_val_, |
336 | int nnz) { |
337 | return std::unique_ptr<SparseMatrix>(std::make_unique<CuSparseMatrix>( |
338 | mat, rows, cols, dt, csr_row_ptr, csr_col_ind, csr_val_, nnz)); |
339 | } |
340 | |
341 | template <typename T> |
342 | void build_ndarray_template(SparseMatrix &sm, |
343 | intptr_t data_ptr, |
344 | size_t num_triplets) { |
345 | using V = Eigen::Triplet<T>; |
346 | std::vector<V> triplets; |
347 | T *data = reinterpret_cast<T *>(data_ptr); |
348 | for (int i = 0; i < num_triplets; i++) { |
349 | triplets.push_back( |
350 | V(data[i * 3], data[i * 3 + 1], taichi_union_cast<T>(data[i * 3 + 2]))); |
351 | } |
352 | sm.build_triplets(static_cast<void *>(&triplets)); |
353 | } |
354 | |
// Fills `sm` from an ndarray whose contents are consecutive
// (row, col, value) triples matching sm's dtype.
void make_sparse_matrix_from_ndarray(Program *prog,
                                     SparseMatrix &sm,
                                     const Ndarray &ndarray) {
  std::string sdtype = taichi::lang::data_type_name(sm.get_data_type());
  auto data_ptr = prog->get_ndarray_data_ptr_as_int(&ndarray);
  // NOTE(review): this multiplies an element count by an element byte size
  // and divides by 3 — presumably intended as (total scalar count) / 3, which
  // only matches if get_element_size() returns scalars per element, not
  // bytes. Confirm against Ndarray's accessor semantics.
  auto num_triplets = ndarray.get_nelement() * ndarray.get_element_size() / 3;
  if (sdtype == "f32") {
    build_ndarray_template<float32>(sm, data_ptr, num_triplets);
  } else if (sdtype == "f64") {
    build_ndarray_template<float64>(sm, data_ptr, num_triplets);
  } else {
    TI_ERROR("Unsupported sparse matrix data type {}!", sdtype);
  }
}
369 | |
// Builds this matrix's CSR representation from unsorted device-side COO
// arrays. Takes ownership of the input buffers: coo_row_ptr is freed here
// (replaced by the computed CSR row offsets), while coo_col_ptr and
// coo_values_ptr are retained as the CSR column-index and value arrays.
// No-op when built without CUDA support.
void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
                                        void *coo_col_ptr,
                                        void *coo_values_ptr,
                                        int nnz) {
#if defined(TI_WITH_CUDA)
  // Step 1: Sort coo first
  cusparseHandle_t cusparse_handle = nullptr;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
  cusparseSpVecDescr_t vec_permutation;
  cusparseDnVecDescr_t vec_values;
  void *d_permutation = nullptr, *d_values_sorted = nullptr;
  CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int));
  CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float));
  // Sparse vector (permutation, sorted values) used to gather the values into
  // the row-sorted order produced by XcoosortByRow.
  CUSPARSEDriver::get_instance().cpCreateSpVec(
      &vec_permutation, nnz, nnz, d_permutation, d_values_sorted,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_values, nnz, coo_values_ptr,
                                               CUDA_R_32F);
  size_t bufferSize = 0;
  CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt(
      cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr,
      &bufferSize);
  void *dbuffer = nullptr;
  if (bufferSize > 0)
    CUDADriver::get_instance().malloc(&dbuffer, bufferSize);
  // Setup permutation vector to identity
  CUSPARSEDriver::get_instance().cpCreateIdentityPermutation(
      cusparse_handle, nnz, d_permutation);
  // Sorts (row, col) in place and records the applied permutation.
  CUSPARSEDriver::get_instance().cpXcoosortByRow(cusparse_handle, rows_, cols_,
                                                 nnz, coo_row_ptr, coo_col_ptr,
                                                 d_permutation, dbuffer);
  // Apply the same permutation to the values, then copy them back.
  CUSPARSEDriver::get_instance().cpGather(cusparse_handle, vec_values,
                                          vec_permutation);
  CUDADriver::get_instance().memcpy_device_to_device(
      coo_values_ptr, d_values_sorted, nnz * sizeof(float));
  // Step 2: coo to csr
  void *csr_row_offset_ptr = nullptr;
  CUDADriver::get_instance().malloc(&csr_row_offset_ptr,
                                    sizeof(int) * (rows_ + 1));
  CUSPARSEDriver::get_instance().cpCoo2Csr(
      cusparse_handle, (void *)coo_row_ptr, nnz, rows_,
      (void *)csr_row_offset_ptr, CUSPARSE_INDEX_BASE_ZERO);

  CUSPARSEDriver::get_instance().cpCreateCsr(
      &matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr,
      coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  // Release the temporaries; the sorted COO row array is no longer needed
  // once the CSR row offsets exist.
  if (vec_permutation)
    CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
  if (vec_values)
    CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
  if (cusparse_handle)
    CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  if (coo_row_ptr)
    CUDADriver::get_instance().mem_free(coo_row_ptr);
  if (d_values_sorted)
    CUDADriver::get_instance().mem_free(d_values_sorted);
  if (d_permutation)
    CUDADriver::get_instance().mem_free(d_permutation);
  if (dbuffer)
    CUDADriver::get_instance().mem_free(dbuffer);
  // The matrix now owns the CSR arrays; freed in the destructor.
  csr_row_ptr_ = csr_row_offset_ptr;
  csr_col_ind_ = coo_col_ptr;
  csr_val_ = coo_values_ptr;
  nnz_ = nnz;
#endif
}
437 | |
// Destroys the cuSPARSE descriptor and frees the owned CSR device arrays
// (set either by build_csr_from_coo or by the owning-constructor factory).
CuSparseMatrix::~CuSparseMatrix() {
#if defined(TI_WITH_CUDA)
  if (matrix_)
    CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
  if (csr_row_ptr_)
    CUDADriver::get_instance().mem_free(csr_row_ptr_);
  if (csr_col_ind_)
    CUDADriver::get_instance().mem_free(csr_col_ind_);
  if (csr_val_)
    CUDADriver::get_instance().mem_free(csr_val_);
#endif
}
450 | |
451 | // Reference::https://docs.nvidia.com/cuda/cusparse/index.html#csrgeam2 |
// Computes alpha * A + beta * B with cuSPARSE csrgeam2, where A is this
// matrix and B is `other`. Returns a new f32 CSR matrix owning its device
// arrays. The call sequence (bufferSize -> Nnz -> geam2) is order-critical.
std::unique_ptr<SparseMatrix> CuSparseMatrix::addition(
    const CuSparseMatrix &other,
    const float alpha,
    const float beta) const {
#if defined(TI_WITH_CUDA)
  // Get information of this matrix: A
  size_t nrows_A = 0, ncols_A = 0, nnz_A = 0;
  void *drow_offsets_A = nullptr, *dcol_indices_A = nullptr,
       *dvalues_A = nullptr;
  cusparseIndexType_t csrRowOffsetsType_A, csrColIndType_A;
  cusparseIndexBase_t idxBase_A;
  cudaDataType valueType_A;
  TI_ASSERT(matrix_ != nullptr);

  CUSPARSEDriver::get_instance().cpCsrGet(
      matrix_, &nrows_A, &ncols_A, &nnz_A, &drow_offsets_A, &dcol_indices_A,
      &dvalues_A, &csrRowOffsetsType_A, &csrColIndType_A, &idxBase_A,
      &valueType_A);
  // Get information of other matrix: B
  size_t nrows_B = 0, ncols_B = 0, nnz_B = 0;
  void *drow_offsets_B = nullptr, *dcol_indices_B = nullptr,
       *dvalues_B = nullptr;
  cusparseIndexType_t csrRowOffsetsType_B, csrColIndType_B;
  cusparseIndexBase_t idxBase_B;
  cudaDataType valueType_B;
  CUSPARSEDriver::get_instance().cpCsrGet(
      other.matrix_, &nrows_B, &ncols_B, &nnz_B, &drow_offsets_B,
      &dcol_indices_B, &dvalues_B, &csrRowOffsetsType_B, &csrColIndType_B,
      &idxBase_B, &valueType_B);

  // Create sparse matrix: C
  int *drow_offsets_C = nullptr;
  int *dcol_indices_C = nullptr;
  float *dvalues_C = nullptr;
  // The legacy csrgeam2 API needs cusparseMatDescr_t (general, zero-based)
  // for all three operands.
  cusparseMatDescr_t descrA = nullptr, descrB = nullptr, descrC = nullptr;
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrA);
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrB);
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrC);
  CUSPARSEDriver::get_instance().cpSetMatType(descrA,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatType(descrB,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatType(descrC,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrC,
                                                   CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrA,
                                                   CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrB,
                                                   CUSPARSE_INDEX_BASE_ZERO);

  // Start to do addition
  cusparseHandle_t cusparse_handle;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
  // alpha, nnzTotalDevHostPtr points to host memory
  size_t BufferSizeInBytes;
  char *buffer = nullptr;
  int nnzC;
  int *nnzTotalDevHostPtr = &nnzC;
  CUSPARSEDriver::get_instance().cpSetPointerMode(cusparse_handle,
                                                  CUSPARSE_POINTER_MODE_HOST);
  CUDADriver::get_instance().malloc((void **)(&drow_offsets_C),
                                    sizeof(int) * (nrows_A + 1));
  // Prepare buffer
  CUSPARSEDriver::get_instance().cpScsrgeam2_bufferSizeExt(
      cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A,
      dvalues_A, drow_offsets_A, dcol_indices_A, (void *)&beta, descrB, nnz_B,
      dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C,
      drow_offsets_C, dcol_indices_C, &BufferSizeInBytes);

  if (BufferSizeInBytes > 0)
    CUDADriver::get_instance().malloc((void **)(&buffer), BufferSizeInBytes);

  // Determine drow_offsets_C and the total number of nonzero elements.
  CUSPARSEDriver::get_instance().cpXcsrgeam2Nnz(
      cusparse_handle, nrows_A, ncols_A, descrA, nnz_A, drow_offsets_A,
      dcol_indices_A, descrB, nnz_B, drow_offsets_B, dcol_indices_B, descrC,
      drow_offsets_C, nnzTotalDevHostPtr, buffer);

  // In host pointer mode nnzC is written directly; otherwise derive it from
  // the device row-offset array (last offset minus base).
  int baseC;
  if (nullptr != nnzTotalDevHostPtr) {
    nnzC = *nnzTotalDevHostPtr;
  } else {
    CUDADriver::get_instance().memcpy_device_to_host(
        (void *)(&nnzC), (void *)(drow_offsets_C + nrows_A), sizeof(int));
    CUDADriver::get_instance().memcpy_device_to_host(
        (void *)(&baseC), (void *)(drow_offsets_C), sizeof(int));
    nnzC -= baseC;
  }

  CUDADriver::get_instance().malloc((void **)&dcol_indices_C,
                                    sizeof(int) * nnzC);
  CUDADriver::get_instance().malloc((void **)&dvalues_C, sizeof(float) * nnzC);

  CUSPARSEDriver::get_instance().cpScsrgeam2(
      cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A,
      dvalues_A, drow_offsets_A, dcol_indices_A, (void *)(&beta), descrB, nnz_B,
      dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C,
      drow_offsets_C, dcol_indices_C, buffer);

  // Wrap the result in a generic (SpMat) descriptor for the rest of taichi.
  cusparseSpMatDescr_t matrix_C;
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &matrix_C, rows_, cols_, nnzC, drow_offsets_C, dcol_indices_C, dvalues_C,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      CUDA_R_32F);

  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA);
  CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrB);
  CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrC);
  CUDADriver::get_instance().mem_free(buffer);
  // The returned matrix takes ownership of the C arrays.
  return make_cu_sparse_matrix(matrix_C, rows_, cols_, PrimitiveType::f32,
                               drow_offsets_C, dcol_indices_C, dvalues_C, nnzC);
  ;
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
571 | |
// Matrix-matrix product this * other, implemented as gemm with alpha=1,
// beta=0.
std::unique_ptr<SparseMatrix> CuSparseMatrix::matmul(
    const CuSparseMatrix &other) const {
#if defined(TI_WITH_CUDA)
  return gemm(other, 1.0f, 0.0f);
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
581 | |
582 | // Reference: |
583 | // https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/spgemm |
// Sparse-sparse product C = alpha * A * B via the cuSPARSE SpGEMM API, where
// A is this matrix and B is `other` (beta applies to the initial C, which is
// empty here). Follows the canonical two-phase workEstimation/compute flow.
std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
                                                   const float alpha,
                                                   const float beta) const {
#if defined(TI_WITH_CUDA)
  cusparseHandle_t handle = nullptr;
  CUSPARSEDriver::get_instance().cpCreate(&handle);
  cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE;
  cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE;

  size_t nrows_A = rows_;
  size_t ncols_B = other.cols_;
  auto mat_A = matrix_;
  auto mat_B = other.matrix_;

  // 1. create resulting matrix `C`
  cusparseSpMatDescr_t mat_C;
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &mat_C, nrows_A, ncols_B, 0, nullptr, nullptr, nullptr,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      CUDA_R_32F);

  // 2. create gemm descr
  cusparseSpGEMMDescr_t spgemm_desc;
  CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc);

  // 3. ask buffer_size1 bytes for external memory
  void *d_buffer1 = nullptr;
  size_t buffer_size1 = 0;
  CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
      handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
      CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1, nullptr);
  CUDADriver::get_instance().malloc((void **)&d_buffer1, buffer_size1);
  // 4. inspect the matrices A and B to understand the memory requirement for
  // the next step
  CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
      handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
      CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1,
      d_buffer1);

  // 5. ask buffer_size2 bytes for external memory
  size_t buffer_size2 = 0;
  CUSPARSEDriver::get_instance().cpSpGEMM_compute(
      handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
      CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr);
  void *d_buffer2 = nullptr;
  CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2);

  // 6. compute the intermediate product of A * B
  CUSPARSEDriver::get_instance().cpSpGEMM_compute(
      handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
      CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, d_buffer2);

  // 7. get info of matrix C
  size_t nrows_C, cols_C, nnz_C;
  CUSPARSEDriver::get_instance().cpGetSize(mat_C, &nrows_C, &cols_C, &nnz_C);

  // 8. allocate matric C
  int *d_csr_row_ptr_C, *d_csr_col_ind_C;
  float *d_values_C;
  CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_C,
                                    (nrows_A + 1) * sizeof(int));
  CUDADriver::get_instance().malloc((void **)&d_csr_col_ind_C,
                                    nnz_C * sizeof(int));
  CUDADriver::get_instance().malloc((void **)&d_values_C,
                                    nnz_C * sizeof(float));

  // 9. update matrix C with new pointers
  CUSPARSEDriver::get_instance().cpCsrSetPointers(mat_C, d_csr_row_ptr_C,
                                                  d_csr_col_ind_C, d_values_C);

  // 10. copy the final products of C.
  CUSPARSEDriver::get_instance().cpSpGEMM_copy(
      handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
      CUSPARSE_SPGEMM_DEFAULT, spgemm_desc);

  CUDADriver::get_instance().mem_free(d_buffer1);
  CUDADriver::get_instance().mem_free(d_buffer2);
  CUSPARSEDriver::get_instance().cpDestroy(handle);
  CUSPARSEDriver::get_instance().cpDestroySpGEMM(spgemm_desc);

  // The returned matrix takes ownership of the C device arrays.
  return make_cu_sparse_matrix(mat_C, nrows_A, ncols_B, PrimitiveType::f32,
                               d_csr_row_ptr_C, d_csr_col_ind_C, d_values_C,
                               nnz_C);
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
672 | |
673 | // Convert CSR to CSC format using routine `Csr2cscEx2` |
674 | // to implement transpose. |
675 | // Reference |
676 | // https://stackoverflow.com/questions/57368010/how-to-transpose-a-sparse-matrix-in-cusparse |
// Returns a new matrix holding the transpose of this one, computed by
// converting CSR -> CSC with cusparseCsr2cscEx2 (a CSC of A is the CSR of
// A^T).
std::unique_ptr<SparseMatrix> CuSparseMatrix::transpose() const {
#if defined(TI_WITH_CUDA)
  cusparseHandle_t handle;
  CUSPARSEDriver::get_instance().cpCreate(&handle);
  size_t nrows_A, ncols_A, nnz;
  void *d_csr_val = nullptr, *d_csr_val_AT = nullptr;
  int *d_csr_row_ptr = nullptr, *d_csr_col_ind = nullptr;
  int *d_csr_row_ptr_AT = nullptr, *d_csr_col_ptr_AT = nullptr;
  cusparseIndexType_t csr_row_otr_type, csr_col_otr_type;
  cusparseIndexBase_t idx_base_type;
  cudaDataType value_type;
  size_t buffer_size;

  // 1. get pointers of A
  CUSPARSEDriver::get_instance().cpCsrGet(
      matrix_, &nrows_A, &ncols_A, &nnz, (void **)&d_csr_row_ptr,
      (void **)&d_csr_col_ind, (void **)&d_csr_val, &csr_row_otr_type,
      &csr_col_otr_type, &idx_base_type, &value_type);

  // 2. ask bufer size for Csr2cscEx2
  // NOTE(review): this call passes the *addresses of* the pointer variables
  // (&d_csr_val etc.) where the execute call below passes the pointers
  // themselves — verify against the cusparseCsr2cscEx2_bufferSize signature;
  // both calls should take the same array arguments.
  CUSPARSEDriver::get_instance().cpCsr2cscEx2_bufferSize(
      handle, nrows_A, ncols_A, nnz, (void *)&d_csr_val, (int *)&d_csr_row_ptr,
      (int *)&d_csr_col_ind, (void *)&d_csr_val_AT, (int *)&d_csr_row_ptr_AT,
      (int *)&d_csr_col_ptr_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
      CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_size);
  void *buffer = nullptr;
  CUDADriver::get_instance().malloc((void **)&buffer, buffer_size);

  // Allocate the transposed arrays: A^T has ncols_A rows, same nnz.
  CUDADriver::get_instance().malloc((void **)&d_csr_val_AT,
                                    nnz * sizeof(float));
  CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_AT,
                                    (ncols_A + 1) * sizeof(int));
  CUDADriver::get_instance().malloc((void **)&d_csr_col_ptr_AT,
                                    nnz * sizeof(int));

  // 3. execute Csr2cscEx2
  CUSPARSEDriver::get_instance().cpCsr2cscEx2(
      handle, nrows_A, ncols_A, nnz, d_csr_val, d_csr_row_ptr, d_csr_col_ind,
      d_csr_val_AT, d_csr_row_ptr_AT, d_csr_col_ptr_AT, CUDA_R_32F,
      CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
      buffer);

  // 4. create AT.
  cusparseSpMatDescr_t mat_AT;
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &mat_AT, ncols_A, nrows_A, nnz, (void *)d_csr_row_ptr_AT,
      (void *)d_csr_col_ptr_AT, (void *)d_csr_val_AT, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  CUDADriver::get_instance().mem_free(buffer);
  CUSPARSEDriver::get_instance().cpDestroy(handle);
  // The returned matrix takes ownership of the A^T device arrays.
  return make_cu_sparse_matrix(mat_AT, ncols_A, nrows_A, PrimitiveType::f32,
                               d_csr_row_ptr_AT, d_csr_col_ptr_AT, d_csr_val_AT,
                               nnz);
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
735 | |
// Computes y = A * x on the device via cusparseSpMV (f32, non-transposed):
// x must have cols_ entries and y rows_ entries. No-op without CUDA support.
void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, const Ndarray &y) {
#if defined(TI_WITH_CUDA)
  size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
  size_t dY = prog->get_ndarray_data_ptr_as_int(&y);

  cusparseDnVecDescr_t vecX, vecY;
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecX, cols_, (void *)dX,
                                               CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, rows_, (void *)dY,
                                               CUDA_R_32F);

  cusparseHandle_t cusparse_handle;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
  // y = 1.0 * A * x + 0.0 * y
  float alpha = 1.0f, beta = 0.0f;
  size_t bufferSize = 0;
  CUSPARSEDriver::get_instance().cpSpMV_bufferSize(
      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);

  void *dBuffer = nullptr;
  if (bufferSize > 0)
    CUDADriver::get_instance().malloc(&dBuffer, bufferSize);
  CUSPARSEDriver::get_instance().cpSpMV(
      cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
      &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);

  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecX);
  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  // dBuffer may still be nullptr here (bufferSize == 0).
  CUDADriver::get_instance().mem_free(dBuffer);
#endif
}
768 | |
769 | const std::string CuSparseMatrix::to_string() const { |
770 | std::ostringstream ostr; |
771 | #ifdef TI_WITH_CUDA |
772 | size_t rows, cols, nnz; |
773 | float *dR; |
774 | int *dC, *dV; |
775 | cusparseIndexType_t row_type, column_type; |
776 | cusparseIndexBase_t idx_base; |
777 | cudaDataType value_type; |
778 | CUSPARSEDriver::get_instance().cpCsrGet( |
779 | matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV, |
780 | &row_type, &column_type, &idx_base, &value_type); |
781 | |
782 | auto *hR = new int[rows + 1]; |
783 | auto *hC = new int[nnz]; |
784 | auto *hV = new float[nnz]; |
785 | |
786 | CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR, |
787 | (rows + 1) * sizeof(int)); |
788 | CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC, |
789 | (nnz) * sizeof(int)); |
790 | CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV, |
791 | (nnz) * sizeof(float)); |
792 | |
793 | print_triplets_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr); |
794 | delete[] hR; |
795 | delete[] hC; |
796 | delete[] hV; |
797 | #endif |
798 | return ostr.str(); |
799 | } |
800 | |
801 | float CuSparseMatrix::get_element(int row, int col) const { |
802 | float res = 0.0f; |
803 | #ifdef TI_WITH_CUDA |
804 | size_t rows, cols, nnz; |
805 | float *dR; |
806 | int *dC, *dV; |
807 | cusparseIndexType_t row_type, column_type; |
808 | cusparseIndexBase_t idx_base; |
809 | cudaDataType value_type; |
810 | CUSPARSEDriver::get_instance().cpCsrGet( |
811 | matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV, |
812 | &row_type, &column_type, &idx_base, &value_type); |
813 | |
814 | TI_ASSERT(row < rows); |
815 | TI_ASSERT(col < cols); |
816 | |
817 | auto *hR = new int[rows + 1]; |
818 | auto *hC = new int[nnz]; |
819 | auto *hV = new float[nnz]; |
820 | |
821 | CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR, |
822 | (rows + 1) * sizeof(int)); |
823 | CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC, |
824 | (nnz) * sizeof(int)); |
825 | CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV, |
826 | (nnz) * sizeof(float)); |
827 | |
828 | res = get_element_from_csr<int, int, float>(row, col, hR, hC, hV); |
829 | |
830 | delete[] hR; |
831 | delete[] hC; |
832 | delete[] hV; |
833 | #endif // TI_WITH_CUDA |
834 | return res; |
835 | } |
836 | |
837 | } // namespace taichi::lang |
838 | |