1#include "taichi/program/sparse_matrix.h"
2
3#include <unordered_map>
4#include <sstream>
5#include <string>
6#include <unordered_map>
7#include <utility>
8
9#include "Eigen/Dense"
10#include "Eigen/SparseLU"
11
// Populates the member `matrix_` from a type-erased pointer to a
// std::vector<Eigen::Triplet<floatTYPE>> (TYPE is 32 or 64).
// Used by EigenSparseMatrix::build_triplets below.
#define BUILD(TYPE)                                                         \
  {                                                                         \
    using T = Eigen::Triplet<float##TYPE>;                                  \
    std::vector<T> *triplets = static_cast<std::vector<T> *>(triplets_adr); \
    matrix_.setFromTriplets(triplets->begin(), triplets->end());            \
  }
18
// Expands to one entry of the factory map in make_sparse_matrix():
// key ("fTYPE", "STORAGE") -> lambda creating the matching
// EigenSparseMatrix<Eigen::SparseMatrix<floatTYPE, Eigen::STORAGE>>.
#define MAKE_MATRIX(TYPE, STORAGE)                                         \
  {                                                                        \
    Pair("f" #TYPE, #STORAGE),                                             \
        [](int rows, int cols, DataType dt) -> std::unique_ptr<SparseMatrix> { \
      using FC = Eigen::SparseMatrix<float##TYPE, Eigen::STORAGE>;         \
      return std::make_unique<EigenSparseMatrix<FC>>(rows, cols, dt);      \
    }                                                                      \
  }
27
// Explicitly instantiates EigenSparseMatrix<...>::spmv for a scalar type and
// storage order so the template definition in this TU is available to other
// translation units at link time.
#define INSTANTIATE_SPMV(type, storage)                                  \
  template void                                                          \
  EigenSparseMatrix<Eigen::SparseMatrix<type, Eigen::storage>>::spmv(    \
      Program *prog, const Ndarray &x, const Ndarray &y);
32
33namespace {
using Pair = std::pair<std::string, std::string>;

// Hash functor for (dtype-name, storage-format) keys used by the factory map
// in make_sparse_matrix().
struct key_hash {
  std::size_t operator()(const Pair &k) const {
    const std::size_t h1 = std::hash<std::string>{}(k.first);
    const std::size_t h2 = std::hash<std::string>{}(k.second);
    // boost::hash_combine-style mixing. A plain `h1 ^ h2` is symmetric
    // (collides ("a","b") with ("b","a")) and yields 0 whenever the two
    // strings hash equally; the asymmetric combine avoids both issues.
    return h1 ^ (h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2));
  }
};
42
43template <typename T, typename T1, typename T2>
44void print_triplets_from_csr(int64_t n_rows,
45 int n_cols,
46 T *row,
47 T1 *col,
48 T2 *value,
49 std::ostringstream &ostr) {
50 using Triplets = Eigen::Triplet<T2>;
51 std::vector<Triplets> trips;
52 for (int64_t i = 1; i <= n_rows; ++i) {
53 auto n_i = row[i] - row[i - 1];
54 for (auto j = 0; j < n_i; ++j) {
55 trips.push_back({static_cast<int>(i - 1),
56 static_cast<int>(col[row[i - 1] + j]),
57 static_cast<float>(value[row[i - 1] + j])});
58 }
59 }
60 Eigen::SparseMatrix<float> m(n_rows, n_cols);
61 m.setFromTriplets(trips.begin(), trips.end());
62 Eigen::IOFormat clean_fmt(4, 0, ", ", "\n", "[", "]");
63 ostr << Eigen::MatrixXf(m.cast<float>()).format(clean_fmt);
64}
65
// Looks up element (row, col) in a host-side CSR matrix by scanning the
// stored column indices of that row; entries absent from the structure are
// implicit zeros.
template <typename T, typename T1, typename T2>
T2 get_element_from_csr(int row,
                        int col,
                        T *row_data,
                        T1 *col_data,
                        T2 *value) {
  const T begin = row_data[row];
  const T end = row_data[row + 1];
  for (T idx = begin; idx < end; ++idx) {
    if (col_data[idx] == col)
      return value[idx];
  }
  // Not stored in this row -> structural zero.
  return 0;
}
79
80} // namespace
81
82namespace taichi::lang {
83
// Builds a triplet-accumulation buffer for assembling a rows x cols sparse
// matrix holding at most `max_num_triplets` (row, col, value) entries.
// The backing ndarray stores one leading counter element followed by
// 3 * max_num_triplets slots (row, col, value packed per triplet).
SparseMatrixBuilder::SparseMatrixBuilder(int rows,
                                         int cols,
                                         int max_num_triplets,
                                         DataType dtype,
                                         const std::string &storage_format,
                                         Program *prog)
    : rows_(rows),
      cols_(cols),
      max_num_triplets_(max_num_triplets),
      dtype_(dtype),
      storage_format_(storage_format),
      prog_(prog) {
  auto element_size = data_type_size(dtype);
  // Only f32 (4-byte) and f64 (8-byte) element types are supported.
  TI_ASSERT((element_size == 4 || element_size == 8));
  ndarray_data_base_ptr_ = std::make_unique<Ndarray>(
      prog_, dtype_, std::vector<int>{3 * (int)max_num_triplets_ + 1});
}
101
// Prints all accumulated triplets to stdout.
// T is the value type (f32/f64) and G the matching integer type used to
// read the raw buffer; data[0] is the triplet counter, and each triplet's
// value slot carries T's bit pattern inside G storage (hence the union cast).
template <typename T, typename G>
void SparseMatrixBuilder::print_triplets_template() {
  auto ptr = get_ndarray_data_ptr();
  G *data = reinterpret_cast<G *>(ptr);
  num_triplets_ = data[0];
  fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_,
             num_triplets_, max_num_triplets_);
  // Skip the leading counter element; triplets follow.
  data += 1;
  for (int i = 0; i < num_triplets_; i++) {
    fmt::print("[{}, {}] = {}\n", data[i * 3], data[i * 3 + 1],
               taichi_union_cast<T>(data[i * 3 + 2]));
  }
}
115
116void SparseMatrixBuilder::print_triplets_eigen() {
117 auto element_size = data_type_size(dtype_);
118 switch (element_size) {
119 case 4:
120 print_triplets_template<float32, int32>();
121 break;
122 case 8:
123 print_triplets_template<float64, int64>();
124 break;
125 default:
126 TI_ERROR("Unsupported sparse matrix data type!");
127 break;
128 }
129}
130
// Prints the accumulated triplets that live in CUDA device memory.
// Buffer layout mirrors the Eigen path: element 0 is the triplet counter,
// followed by (row, col, value) packed per triplet as float32 storage.
// No-op when built without CUDA support.
void SparseMatrixBuilder::print_triplets_cuda() {
#ifdef TI_WITH_CUDA
  // First element of the device buffer is the triplet count.
  CUDADriver::get_instance().memcpy_device_to_host(
      &num_triplets_, (void *)get_ndarray_data_ptr(), sizeof(int));
  fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_,
             num_triplets_, max_num_triplets_);
  auto len = 3 * num_triplets_ + 1;
  std::vector<float32> trips(len);
  CUDADriver::get_instance().memcpy_device_to_host(
      (void *)trips.data(), (void *)get_ndarray_data_ptr(),
      len * sizeof(float32));
  for (auto i = 0; i < num_triplets_; i++) {
    // Row/col were stored as raw int bits inside float32 slots; the value
    // slot is a genuine float32.
    int row = taichi_union_cast<int>(trips[3 * i + 1]);
    int col = taichi_union_cast<int>(trips[3 * i + 2]);
    auto val = trips[i * 3 + 3];
    fmt::print("[{}, {}] = {}\n", row, col, val);
  }
#endif
}
150
// Returns the address of the builder's backing triplet buffer as an integer
// (device pointer under CUDA, host pointer otherwise).
intptr_t SparseMatrixBuilder::get_ndarray_data_ptr() const {
  return prog_->get_ndarray_data_ptr_as_int(ndarray_data_base_ptr_.get());
}
154
155template <typename T, typename G>
156void SparseMatrixBuilder::build_template(std::unique_ptr<SparseMatrix> &m) {
157 using V = Eigen::Triplet<T>;
158 std::vector<V> triplets;
159 auto ptr = get_ndarray_data_ptr();
160 G *data = reinterpret_cast<G *>(ptr);
161 num_triplets_ = data[0];
162 data += 1;
163 for (int i = 0; i < num_triplets_; i++) {
164 triplets.push_back(
165 V(data[i * 3], data[i * 3 + 1], taichi_union_cast<T>(data[i * 3 + 2])));
166 }
167 m->build_triplets(static_cast<void *>(&triplets));
168 clear();
169}
170
171std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build() {
172 TI_ASSERT(built_ == false);
173 built_ = true;
174 auto sm = make_sparse_matrix(rows_, cols_, dtype_, storage_format_);
175 auto element_size = data_type_size(dtype_);
176 switch (element_size) {
177 case 4:
178 build_template<float32, int32>(sm);
179 break;
180 case 8:
181 build_template<float64, int64>(sm);
182 break;
183 default:
184 TI_ERROR("Unsupported sparse matrix data type!");
185 break;
186 }
187 return sm;
188}
189
190std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build_cuda() {
191 TI_ASSERT(built_ == false);
192 built_ = true;
193 auto sm = make_cu_sparse_matrix(rows_, cols_, dtype_);
194#ifdef TI_WITH_CUDA
195 CUDADriver::get_instance().memcpy_device_to_host(
196 &num_triplets_, (void *)get_ndarray_data_ptr(), sizeof(int));
197 auto len = 3 * num_triplets_ + 1;
198 std::vector<float32> trips(len);
199 CUDADriver::get_instance().memcpy_device_to_host(
200 (void *)trips.data(), (void *)get_ndarray_data_ptr(),
201 len * sizeof(float32));
202 std::unordered_map<int, std::tuple<int, int, float32>> entries;
203 for (auto i = 0; i < num_triplets_; i++) {
204 int row = taichi_union_cast<int>(trips[3 * i + 1]);
205 int col = taichi_union_cast<int>(trips[3 * i + 2]);
206 auto val = trips[i * 3 + 3];
207 auto e_idx = row * cols_ + col;
208 if (entries.find(e_idx) == entries.end()) {
209 entries[e_idx] = std::make_tuple(row, col, val);
210 } else {
211 auto [r, c, v] = entries[e_idx];
212 entries[e_idx] = std::make_tuple(r, c, v + val);
213 }
214 }
215 auto entry_size = entries.size();
216 int *row_host = (int *)malloc(sizeof(int) * entry_size);
217 int *col_host = (int *)malloc(sizeof(int) * entry_size);
218 float32 *value_host = (float32 *)malloc(sizeof(float32) * entry_size);
219 int count = 0;
220 for (auto entry : entries) {
221 auto [row, col, value] = entry.second;
222 row_host[count] = row;
223 col_host[count] = col;
224 value_host[count] = value;
225 count++;
226 }
227 void *row_device = nullptr, *col_device = nullptr, *value_device = nullptr;
228 CUDADriver::get_instance().malloc(&row_device, entry_size * sizeof(int));
229 CUDADriver::get_instance().malloc(&col_device, entry_size * sizeof(int));
230 CUDADriver::get_instance().malloc(&value_device,
231 entry_size * sizeof(float32));
232 CUDADriver::get_instance().memcpy_host_to_device(row_device, (void *)row_host,
233 entry_size * sizeof(int));
234 CUDADriver::get_instance().memcpy_host_to_device(col_device, (void *)col_host,
235 entry_size * sizeof(int));
236 CUDADriver::get_instance().memcpy_host_to_device(
237 value_device, (void *)value_host, entry_size * sizeof(float32));
238 sm->build_csr_from_coo(row_device, col_device, value_device, entry_size);
239 clear();
240 free(row_host);
241 free(col_host);
242 free(value_host);
243#endif
244 return sm;
245}
246
// Resets the builder so it can accumulate a fresh set of triplets:
// zeroes the on-device/on-host triplet counter and clears the built flag.
void SparseMatrixBuilder::clear() {
  built_ = false;
  ndarray_data_base_ptr_->write_int(std::vector<int>{0}, 0);
  num_triplets_ = 0;
}
252
253template <class EigenMatrix>
254const std::string EigenSparseMatrix<EigenMatrix>::to_string() const {
255 Eigen::IOFormat clean_fmt(4, 0, ", ", "\n", "[", "]");
256 // Note that the code below first converts the sparse matrix into a dense one.
257 // https://stackoverflow.com/questions/38553335/how-can-i-print-in-console-a-formatted-sparse-matrix-with-eigen
258 std::ostringstream ostr;
259 ostr << Eigen::MatrixXf(matrix_.template cast<float>()).format(clean_fmt);
260 return ostr.str();
261}
262
// Fills matrix_ from a type-erased triplet vector.
// `triplets_adr` must point to a std::vector<Eigen::Triplet<float32/float64>>
// matching the matrix's dtype; the BUILD macro performs the cast and the
// Eigen setFromTriplets call.
template <class EigenMatrix>
void EigenSparseMatrix<EigenMatrix>::build_triplets(void *triplets_adr) {
  std::string sdtype = taichi::lang::data_type_name(dtype_);
  if (sdtype == "f32") {
    BUILD(32)
  } else if (sdtype == "f64") {
    BUILD(64)
  } else {
    TI_ERROR("Unsupported sparse matrix data type {}!", sdtype);
  }
}
274
275template <class EigenMatrix>
276void EigenSparseMatrix<EigenMatrix>::spmv(Program *prog,
277 const Ndarray &x,
278 const Ndarray &y) {
279 size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
280 size_t dY = prog->get_ndarray_data_ptr_as_int(&y);
281 std::string sdtype = taichi::lang::data_type_name(dtype_);
282 if (sdtype == "f32") {
283 Eigen::Map<Eigen::VectorXf>((float *)dY, cols_) =
284 matrix_.template cast<float>() *
285 Eigen::Map<Eigen::VectorXf>((float *)dX, cols_);
286 } else if (sdtype == "f64") {
287 Eigen::Map<Eigen::VectorXd>((double *)dY, cols_) =
288 matrix_.template cast<double>() *
289 Eigen::Map<Eigen::VectorXd>((double *)dX, cols_);
290 } else {
291 TI_ERROR("Unsupported sparse matrix data type {}!", sdtype);
292 }
293}
294
295INSTANTIATE_SPMV(float32, ColMajor)
296INSTANTIATE_SPMV(float32, RowMajor)
297INSTANTIATE_SPMV(float64, ColMajor)
298INSTANTIATE_SPMV(float64, RowMajor)
299
300std::unique_ptr<SparseMatrix> make_sparse_matrix(
301 int rows,
302 int cols,
303 DataType dt,
304 const std::string &storage_format = "col_major") {
305 using func_type = std::unique_ptr<SparseMatrix> (*)(int, int, DataType);
306 static const std::unordered_map<Pair, func_type, key_hash> map = {
307 MAKE_MATRIX(32, ColMajor), MAKE_MATRIX(32, RowMajor),
308 MAKE_MATRIX(64, ColMajor), MAKE_MATRIX(64, RowMajor)};
309 std::unordered_map<std::string, std::string> format_map = {
310 {"col_major", "ColMajor"}, {"row_major", "RowMajor"}};
311 std::string tdt = taichi::lang::data_type_name(dt);
312 Pair key = std::make_pair(tdt, format_map.at(storage_format));
313 auto it = map.find(key);
314 if (it != map.end()) {
315 auto func = map.at(key);
316 return func(rows, cols, dt);
317 } else
318 TI_ERROR("Unsupported sparse matrix data type: {}, storage format: {}", tdt,
319 storage_format);
320}
321
322std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(int rows,
323 int cols,
324 DataType dt) {
325 return std::unique_ptr<SparseMatrix>(
326 std::make_unique<CuSparseMatrix>(rows, cols, dt));
327}
328
329std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(cusparseSpMatDescr_t mat,
330 int rows,
331 int cols,
332 DataType dt,
333 void *csr_row_ptr,
334 void *csr_col_ind,
335 void *csr_val_,
336 int nnz) {
337 return std::unique_ptr<SparseMatrix>(std::make_unique<CuSparseMatrix>(
338 mat, rows, cols, dt, csr_row_ptr, csr_col_ind, csr_val_, nnz));
339}
340
341template <typename T>
342void build_ndarray_template(SparseMatrix &sm,
343 intptr_t data_ptr,
344 size_t num_triplets) {
345 using V = Eigen::Triplet<T>;
346 std::vector<V> triplets;
347 T *data = reinterpret_cast<T *>(data_ptr);
348 for (int i = 0; i < num_triplets; i++) {
349 triplets.push_back(
350 V(data[i * 3], data[i * 3 + 1], taichi_union_cast<T>(data[i * 3 + 2])));
351 }
352 sm.build_triplets(static_cast<void *>(&triplets));
353}
354
// Fills `sm` from an ndarray holding packed (row, col, value) triplets,
// dispatching on the matrix's dtype (f32/f64).
void make_sparse_matrix_from_ndarray(Program *prog,
                                     SparseMatrix &sm,
                                     const Ndarray &ndarray) {
  std::string sdtype = taichi::lang::data_type_name(sm.get_data_type());
  auto data_ptr = prog->get_ndarray_data_ptr_as_int(&ndarray);
  // NOTE(review): the triplet count multiplies nelement by the element size
  // in bytes, while a packed triplet buffer would suggest nelement / 3 —
  // presumably get_nelement() counts something other than scalar slots here;
  // verify against Ndarray::get_nelement()'s definition.
  auto num_triplets = ndarray.get_nelement() * ndarray.get_element_size() / 3;
  if (sdtype == "f32") {
    build_ndarray_template<float32>(sm, data_ptr, num_triplets);
  } else if (sdtype == "f64") {
    build_ndarray_template<float64>(sm, data_ptr, num_triplets);
  } else {
    TI_ERROR("Unsupported sparse matrix data type {}!", sdtype);
  }
}
369
// Converts device-resident COO data (row/col/value arrays of length nnz)
// into CSR form and stores the result in matrix_.
// Ownership: takes ownership of all three input buffers. coo_row_ptr is
// freed here once converted to row offsets; coo_col_ptr and coo_values_ptr
// are retained as the CSR column-index and value arrays and released in the
// destructor. No-op when built without CUDA support.
void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
                                        void *coo_col_ptr,
                                        void *coo_values_ptr,
                                        int nnz) {
#if defined(TI_WITH_CUDA)
  // Step 1: Sort coo first
  cusparseHandle_t cusparse_handle = nullptr;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
  cusparseSpVecDescr_t vec_permutation;
  cusparseDnVecDescr_t vec_values;
  void *d_permutation = nullptr, *d_values_sorted = nullptr;
  CUDADriver::get_instance().malloc(&d_permutation, nnz * sizeof(int));
  CUDADriver::get_instance().malloc(&d_values_sorted, nnz * sizeof(float));
  // Sparse vector (d_permutation indices -> d_values_sorted) used as the
  // gather destination; dense vector wraps the unsorted values.
  CUSPARSEDriver::get_instance().cpCreateSpVec(
      &vec_permutation, nnz, nnz, d_permutation, d_values_sorted,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_values, nnz, coo_values_ptr,
                                               CUDA_R_32F);
  size_t bufferSize = 0;
  CUSPARSEDriver::get_instance().cpXcoosort_bufferSizeExt(
      cusparse_handle, rows_, cols_, nnz, coo_row_ptr, coo_col_ptr,
      &bufferSize);
  void *dbuffer = nullptr;
  if (bufferSize > 0)
    CUDADriver::get_instance().malloc(&dbuffer, bufferSize);
  // Setup permutation vector to identity
  CUSPARSEDriver::get_instance().cpCreateIdentityPermutation(
      cusparse_handle, nnz, d_permutation);
  // Sort (row, col) index pairs by row in place; d_permutation records the
  // applied reordering.
  CUSPARSEDriver::get_instance().cpXcoosortByRow(cusparse_handle, rows_, cols_,
                                                 nnz, coo_row_ptr, coo_col_ptr,
                                                 d_permutation, dbuffer);
  // Apply the same reordering to the values, then copy them back in place.
  CUSPARSEDriver::get_instance().cpGather(cusparse_handle, vec_values,
                                          vec_permutation);
  CUDADriver::get_instance().memcpy_device_to_device(
      coo_values_ptr, d_values_sorted, nnz * sizeof(float));
  // Step 2: coo to csr
  void *csr_row_offset_ptr = nullptr;
  CUDADriver::get_instance().malloc(&csr_row_offset_ptr,
                                    sizeof(int) * (rows_ + 1));
  CUSPARSEDriver::get_instance().cpCoo2Csr(
      cusparse_handle, (void *)coo_row_ptr, nnz, rows_,
      (void *)csr_row_offset_ptr, CUSPARSE_INDEX_BASE_ZERO);

  CUSPARSEDriver::get_instance().cpCreateCsr(
      &matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr,
      coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  // Release the sort temporaries and the now-converted row array.
  if (vec_permutation)
    CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
  if (vec_values)
    CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
  if (cusparse_handle)
    CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  if (coo_row_ptr)
    CUDADriver::get_instance().mem_free(coo_row_ptr);
  if (d_values_sorted)
    CUDADriver::get_instance().mem_free(d_values_sorted);
  if (d_permutation)
    CUDADriver::get_instance().mem_free(d_permutation);
  if (dbuffer)
    CUDADriver::get_instance().mem_free(dbuffer);
  // Keep the CSR arrays; ~CuSparseMatrix frees them.
  csr_row_ptr_ = csr_row_offset_ptr;
  csr_col_ind_ = coo_col_ptr;
  csr_val_ = coo_values_ptr;
  nnz_ = nnz;
#endif
}
437
// Releases the cuSPARSE descriptor and the owned CSR device arrays
// (set by build_csr_from_coo or the descriptor-wrapping constructor).
CuSparseMatrix::~CuSparseMatrix() {
#if defined(TI_WITH_CUDA)
  if (matrix_)
    CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
  if (csr_row_ptr_)
    CUDADriver::get_instance().mem_free(csr_row_ptr_);
  if (csr_col_ind_)
    CUDADriver::get_instance().mem_free(csr_col_ind_);
  if (csr_val_)
    CUDADriver::get_instance().mem_free(csr_val_);
#endif
}
450
451// Reference::https://docs.nvidia.com/cuda/cusparse/index.html#csrgeam2
// Computes C = alpha * A + beta * B on the device via cusparseScsrgeam2,
// where A is this matrix and B is `other`. Returns C as a new CuSparseMatrix
// that owns the freshly allocated CSR arrays.
std::unique_ptr<SparseMatrix> CuSparseMatrix::addition(
    const CuSparseMatrix &other,
    const float alpha,
    const float beta) const {
#if defined(TI_WITH_CUDA)
  // Get information of this matrix: A
  size_t nrows_A = 0, ncols_A = 0, nnz_A = 0;
  void *drow_offsets_A = nullptr, *dcol_indices_A = nullptr,
       *dvalues_A = nullptr;
  cusparseIndexType_t csrRowOffsetsType_A, csrColIndType_A;
  cusparseIndexBase_t idxBase_A;
  cudaDataType valueType_A;
  TI_ASSERT(matrix_ != nullptr);

  CUSPARSEDriver::get_instance().cpCsrGet(
      matrix_, &nrows_A, &ncols_A, &nnz_A, &drow_offsets_A, &dcol_indices_A,
      &dvalues_A, &csrRowOffsetsType_A, &csrColIndType_A, &idxBase_A,
      &valueType_A);
  // Get information of other matrix: B
  size_t nrows_B = 0, ncols_B = 0, nnz_B = 0;
  void *drow_offsets_B = nullptr, *dcol_indices_B = nullptr,
       *dvalues_B = nullptr;
  cusparseIndexType_t csrRowOffsetsType_B, csrColIndType_B;
  cusparseIndexBase_t idxBase_B;
  cudaDataType valueType_B;
  CUSPARSEDriver::get_instance().cpCsrGet(
      other.matrix_, &nrows_B, &ncols_B, &nnz_B, &drow_offsets_B,
      &dcol_indices_B, &dvalues_B, &csrRowOffsetsType_B, &csrColIndType_B,
      &idxBase_B, &valueType_B);

  // Create sparse matrix: C
  int *drow_offsets_C = nullptr;
  int *dcol_indices_C = nullptr;
  float *dvalues_C = nullptr;
  // The legacy csrgeam2 API works with cusparseMatDescr_t descriptors, not
  // the generic cusparseSpMatDescr_t, so plain general/zero-based
  // descriptors are created for all three operands.
  cusparseMatDescr_t descrA = nullptr, descrB = nullptr, descrC = nullptr;
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrA);
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrB);
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descrC);
  CUSPARSEDriver::get_instance().cpSetMatType(descrA,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatType(descrB,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatType(descrC,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrC,
                                                   CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrA,
                                                   CUSPARSE_INDEX_BASE_ZERO);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descrB,
                                                   CUSPARSE_INDEX_BASE_ZERO);

  // Start to do addition
  cusparseHandle_t cusparse_handle;
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
  // alpha, nnzTotalDevHostPtr points to host memory
  size_t BufferSizeInBytes;
  char *buffer = nullptr;
  int nnzC;
  int *nnzTotalDevHostPtr = &nnzC;
  CUSPARSEDriver::get_instance().cpSetPointerMode(cusparse_handle,
                                                  CUSPARSE_POINTER_MODE_HOST);
  // C's row-offset array has a fixed size (nrows + 1) and can be allocated
  // before nnz(C) is known.
  CUDADriver::get_instance().malloc((void **)(&drow_offsets_C),
                                    sizeof(int) * (nrows_A + 1));
  // Prepare buffer
  CUSPARSEDriver::get_instance().cpScsrgeam2_bufferSizeExt(
      cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A,
      dvalues_A, drow_offsets_A, dcol_indices_A, (void *)&beta, descrB, nnz_B,
      dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C,
      drow_offsets_C, dcol_indices_C, &BufferSizeInBytes);

  if (BufferSizeInBytes > 0)
    CUDADriver::get_instance().malloc((void **)(&buffer), BufferSizeInBytes);

  // Determine drow_offsets_C and the total number of nonzero elements.
  CUSPARSEDriver::get_instance().cpXcsrgeam2Nnz(
      cusparse_handle, nrows_A, ncols_A, descrA, nnz_A, drow_offsets_A,
      dcol_indices_A, descrB, nnz_B, drow_offsets_B, dcol_indices_B, descrC,
      drow_offsets_C, nnzTotalDevHostPtr, buffer);

  // In host pointer mode nnzC is returned directly; otherwise derive it from
  // C's device row-offset array (last offset minus the index base).
  int baseC;
  if (nullptr != nnzTotalDevHostPtr) {
    nnzC = *nnzTotalDevHostPtr;
  } else {
    CUDADriver::get_instance().memcpy_device_to_host(
        (void *)(&nnzC), (void *)(drow_offsets_C + nrows_A), sizeof(int));
    CUDADriver::get_instance().memcpy_device_to_host(
        (void *)(&baseC), (void *)(drow_offsets_C), sizeof(int));
    nnzC -= baseC;
  }

  CUDADriver::get_instance().malloc((void **)&dcol_indices_C,
                                    sizeof(int) * nnzC);
  CUDADriver::get_instance().malloc((void **)&dvalues_C, sizeof(float) * nnzC);

  CUSPARSEDriver::get_instance().cpScsrgeam2(
      cusparse_handle, nrows_A, ncols_A, (void *)(&alpha), descrA, nnz_A,
      dvalues_A, drow_offsets_A, dcol_indices_A, (void *)(&beta), descrB, nnz_B,
      dvalues_B, drow_offsets_B, dcol_indices_B, descrC, dvalues_C,
      drow_offsets_C, dcol_indices_C, buffer);

  // Wrap the result in a generic descriptor; the returned CuSparseMatrix
  // takes ownership of the three C arrays.
  cusparseSpMatDescr_t matrix_C;
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &matrix_C, rows_, cols_, nnzC, drow_offsets_C, dcol_indices_C, dvalues_C,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      CUDA_R_32F);

  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
  CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA);
  CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrB);
  CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrC);
  CUDADriver::get_instance().mem_free(buffer);
  return make_cu_sparse_matrix(matrix_C, rows_, cols_, PrimitiveType::f32,
                               drow_offsets_C, dcol_indices_C, dvalues_C, nnzC);
  ;
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
571
// Matrix-matrix product C = A * B, implemented as gemm with alpha = 1,
// beta = 0.
std::unique_ptr<SparseMatrix> CuSparseMatrix::matmul(
    const CuSparseMatrix &other) const {
#if defined(TI_WITH_CUDA)
  return gemm(other, 1.0f, 0.0f);
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
581
582// Reference:
583// https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/spgemm
// General sparse-sparse product via the cuSPARSE SpGEMM API.
// Follows the documented two-phase protocol (work estimation, compute,
// then copy); the result matrix C owns its freshly allocated CSR arrays.
// NOTE(review): `beta` is forwarded to the SpGEMM calls, but C starts empty
// (nnz = 0), so the effective result is alpha * A * B — confirm callers
// never rely on a nonzero beta term.
std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
                                                   const float alpha,
                                                   const float beta) const {
#if defined(TI_WITH_CUDA)
  cusparseHandle_t handle = nullptr;
  CUSPARSEDriver::get_instance().cpCreate(&handle);
  cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE;
  cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE;

  size_t nrows_A = rows_;
  size_t ncols_B = other.cols_;
  auto mat_A = matrix_;
  auto mat_B = other.matrix_;

  // 1. create resulting matrix `C`
  cusparseSpMatDescr_t mat_C;
  CUSPARSEDriver::get_instance().cpCreateCsr(
      &mat_C, nrows_A, ncols_B, 0, nullptr, nullptr, nullptr,
      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      CUDA_R_32F);

  // 2. create gemm descr
  cusparseSpGEMMDescr_t spgemm_desc;
  CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc);

  // 3. ask buffer_size1 bytes for external memory
  void *d_buffer1 = nullptr;
  size_t buffer_size1 = 0;
  CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
      handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
      CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1, nullptr);
  CUDADriver::get_instance().malloc((void **)&d_buffer1, buffer_size1);
  // 4. inspect the matrices A and B to understand the memory requirement for
  // the next step
  CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
      handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
      CUDA_R_32F, CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size1,
      d_buffer1);

  // 5. ask buffer_size2 bytes for external memory
  size_t buffer_size2 = 0;
  CUSPARSEDriver::get_instance().cpSpGEMM_compute(
      handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
      CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr);
  void *d_buffer2 = nullptr;
  CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2);

  // 6. compute the intermediate product of A * B
  CUSPARSEDriver::get_instance().cpSpGEMM_compute(
      handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
      CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, d_buffer2);

  // 7. get info of matrix C
  size_t nrows_C, cols_C, nnz_C;
  CUSPARSEDriver::get_instance().cpGetSize(mat_C, &nrows_C, &cols_C, &nnz_C);

  // 8. allocate matric C
  int *d_csr_row_ptr_C, *d_csr_col_ind_C;
  float *d_values_C;
  CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_C,
                                    (nrows_A + 1) * sizeof(int));
  CUDADriver::get_instance().malloc((void **)&d_csr_col_ind_C,
                                    nnz_C * sizeof(int));
  CUDADriver::get_instance().malloc((void **)&d_values_C,
                                    nnz_C * sizeof(float));

  // 9. update matrix C with new pointers
  CUSPARSEDriver::get_instance().cpCsrSetPointers(mat_C, d_csr_row_ptr_C,
                                                  d_csr_col_ind_C, d_values_C);

  // 10. copy the final products of C.
  CUSPARSEDriver::get_instance().cpSpGEMM_copy(
      handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
      CUSPARSE_SPGEMM_DEFAULT, spgemm_desc);

  CUDADriver::get_instance().mem_free(d_buffer1);
  CUDADriver::get_instance().mem_free(d_buffer2);
  CUSPARSEDriver::get_instance().cpDestroy(handle);
  CUSPARSEDriver::get_instance().cpDestroySpGEMM(spgemm_desc);

  // The returned matrix takes ownership of C's CSR arrays.
  return make_cu_sparse_matrix(mat_C, nrows_A, ncols_B, PrimitiveType::f32,
                               d_csr_row_ptr_C, d_csr_col_ind_C, d_values_C,
                               nnz_C);
#else
  TI_NOT_IMPLEMENTED;
  return std::unique_ptr<SparseMatrix>();
#endif
}
672
673// Convert CSR to CSC format using routine `Csr2cscEx2`
674// to implement transpose.
675// Reference
676// https://stackoverflow.com/questions/57368010/how-to-transpose-a-sparse-matrix-in-cusparse
677std::unique_ptr<SparseMatrix> CuSparseMatrix::transpose() const {
678#if defined(TI_WITH_CUDA)
679 cusparseHandle_t handle;
680 CUSPARSEDriver::get_instance().cpCreate(&handle);
681 size_t nrows_A, ncols_A, nnz;
682 void *d_csr_val = nullptr, *d_csr_val_AT = nullptr;
683 int *d_csr_row_ptr = nullptr, *d_csr_col_ind = nullptr;
684 int *d_csr_row_ptr_AT = nullptr, *d_csr_col_ptr_AT = nullptr;
685 cusparseIndexType_t csr_row_otr_type, csr_col_otr_type;
686 cusparseIndexBase_t idx_base_type;
687 cudaDataType value_type;
688 size_t buffer_size;
689
690 // 1. get pointers of A
691 CUSPARSEDriver::get_instance().cpCsrGet(
692 matrix_, &nrows_A, &ncols_A, &nnz, (void **)&d_csr_row_ptr,
693 (void **)&d_csr_col_ind, (void **)&d_csr_val, &csr_row_otr_type,
694 &csr_col_otr_type, &idx_base_type, &value_type);
695
696 // 2. ask bufer size for Csr2cscEx2
697 CUSPARSEDriver::get_instance().cpCsr2cscEx2_bufferSize(
698 handle, nrows_A, ncols_A, nnz, (void *)&d_csr_val, (int *)&d_csr_row_ptr,
699 (int *)&d_csr_col_ind, (void *)&d_csr_val_AT, (int *)&d_csr_row_ptr_AT,
700 (int *)&d_csr_col_ptr_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
701 CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_size);
702 void *buffer = nullptr;
703 CUDADriver::get_instance().malloc((void **)&buffer, buffer_size);
704
705 CUDADriver::get_instance().malloc((void **)&d_csr_val_AT,
706 nnz * sizeof(float));
707 CUDADriver::get_instance().malloc((void **)&d_csr_row_ptr_AT,
708 (ncols_A + 1) * sizeof(int));
709 CUDADriver::get_instance().malloc((void **)&d_csr_col_ptr_AT,
710 nnz * sizeof(int));
711
712 // 3. execute Csr2cscEx2
713 CUSPARSEDriver::get_instance().cpCsr2cscEx2(
714 handle, nrows_A, ncols_A, nnz, d_csr_val, d_csr_row_ptr, d_csr_col_ind,
715 d_csr_val_AT, d_csr_row_ptr_AT, d_csr_col_ptr_AT, CUDA_R_32F,
716 CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
717 buffer);
718
719 // 4. create AT.
720 cusparseSpMatDescr_t mat_AT;
721 CUSPARSEDriver::get_instance().cpCreateCsr(
722 &mat_AT, ncols_A, nrows_A, nnz, (void *)d_csr_row_ptr_AT,
723 (void *)d_csr_col_ptr_AT, (void *)d_csr_val_AT, CUSPARSE_INDEX_32I,
724 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
725 CUDADriver::get_instance().mem_free(buffer);
726 CUSPARSEDriver::get_instance().cpDestroy(handle);
727 return make_cu_sparse_matrix(mat_AT, ncols_A, nrows_A, PrimitiveType::f32,
728 d_csr_row_ptr_AT, d_csr_col_ptr_AT, d_csr_val_AT,
729 nnz);
730#else
731 TI_NOT_IMPLEMENTED;
732 return std::unique_ptr<SparseMatrix>();
733#endif
734}
735
736void CuSparseMatrix::spmv(Program *prog, const Ndarray &x, const Ndarray &y) {
737#if defined(TI_WITH_CUDA)
738 size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
739 size_t dY = prog->get_ndarray_data_ptr_as_int(&y);
740
741 cusparseDnVecDescr_t vecX, vecY;
742 CUSPARSEDriver::get_instance().cpCreateDnVec(&vecX, cols_, (void *)dX,
743 CUDA_R_32F);
744 CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, rows_, (void *)dY,
745 CUDA_R_32F);
746
747 cusparseHandle_t cusparse_handle;
748 CUSPARSEDriver::get_instance().cpCreate(&cusparse_handle);
749 float alpha = 1.0f, beta = 0.0f;
750 size_t bufferSize = 0;
751 CUSPARSEDriver::get_instance().cpSpMV_bufferSize(
752 cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
753 &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);
754
755 void *dBuffer = nullptr;
756 if (bufferSize > 0)
757 CUDADriver::get_instance().malloc(&dBuffer, bufferSize);
758 CUSPARSEDriver::get_instance().cpSpMV(
759 cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matrix_, vecX,
760 &beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);
761
762 CUSPARSEDriver::get_instance().cpDestroyDnVec(vecX);
763 CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
764 CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
765 CUDADriver::get_instance().mem_free(dBuffer);
766#endif
767}
768
769const std::string CuSparseMatrix::to_string() const {
770 std::ostringstream ostr;
771#ifdef TI_WITH_CUDA
772 size_t rows, cols, nnz;
773 float *dR;
774 int *dC, *dV;
775 cusparseIndexType_t row_type, column_type;
776 cusparseIndexBase_t idx_base;
777 cudaDataType value_type;
778 CUSPARSEDriver::get_instance().cpCsrGet(
779 matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV,
780 &row_type, &column_type, &idx_base, &value_type);
781
782 auto *hR = new int[rows + 1];
783 auto *hC = new int[nnz];
784 auto *hV = new float[nnz];
785
786 CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR,
787 (rows + 1) * sizeof(int));
788 CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC,
789 (nnz) * sizeof(int));
790 CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
791 (nnz) * sizeof(float));
792
793 print_triplets_from_csr<int, int, float>(rows, cols, hR, hC, hV, ostr);
794 delete[] hR;
795 delete[] hC;
796 delete[] hV;
797#endif
798 return ostr.str();
799}
800
801float CuSparseMatrix::get_element(int row, int col) const {
802 float res = 0.0f;
803#ifdef TI_WITH_CUDA
804 size_t rows, cols, nnz;
805 float *dR;
806 int *dC, *dV;
807 cusparseIndexType_t row_type, column_type;
808 cusparseIndexBase_t idx_base;
809 cudaDataType value_type;
810 CUSPARSEDriver::get_instance().cpCsrGet(
811 matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV,
812 &row_type, &column_type, &idx_base, &value_type);
813
814 TI_ASSERT(row < rows);
815 TI_ASSERT(col < cols);
816
817 auto *hR = new int[rows + 1];
818 auto *hC = new int[nnz];
819 auto *hV = new float[nnz];
820
821 CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR,
822 (rows + 1) * sizeof(int));
823 CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC,
824 (nnz) * sizeof(int));
825 CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
826 (nnz) * sizeof(float));
827
828 res = get_element_from_csr<int, int, float>(row, col, hR, hC, hV);
829
830 delete[] hR;
831 delete[] hC;
832 delete[] hV;
833#endif // TI_WITH_CUDA
834 return res;
835}
836
837} // namespace taichi::lang
838