1 | #include "taichi/ir/type_utils.h" |
2 | |
3 | #include "sparse_solver.h" |
4 | |
5 | #include <unordered_map> |
6 | |
namespace taichi::lang {
// Explicitly instantiates EigenSparseSolver for a simplicial Cholesky
// decomposition (Eigen::SimplicialLLT / SimplicialLDLT) with scalar type `dt`
// and the fill-reducing ordering `order` (AMD or COLAMD).
#define EIGEN_LLT_SOLVER_INSTANTIATION(dt, type, order) \
  template class EigenSparseSolver< \
      Eigen::Simplicial##type<Eigen::SparseMatrix<dt>, Eigen::Lower, \
                              Eigen::order##Ordering<int>>, \
      Eigen::SparseMatrix<dt>>;
// Same as above, but for Eigen::SparseLU (no Eigen::Lower parameter).
#define EIGEN_LU_SOLVER_INSTANTIATION(dt, type, order) \
  template class EigenSparseSolver< \
      Eigen::Sparse##type<Eigen::SparseMatrix<dt>, \
                          Eigen::order##Ordering<int>>, \
      Eigen::SparseMatrix<dt>>;
// Explicit instantiation of EigenSparseSolver for every supported combination
// of scalar type, decomposition and ordering.
EIGEN_LLT_SOLVER_INSTANTIATION(float32, LLT, AMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float32, LLT, COLAMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float32, LDLT, AMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float32, LDLT, COLAMD);
EIGEN_LU_SOLVER_INSTANTIATION(float32, LU, AMD);
EIGEN_LU_SOLVER_INSTANTIATION(float32, LU, COLAMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float64, LLT, AMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float64, LLT, COLAMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float64, LDLT, AMD);
EIGEN_LLT_SOLVER_INSTANTIATION(float64, LDLT, COLAMD);
EIGEN_LU_SOLVER_INSTANTIATION(float64, LU, AMD);
EIGEN_LU_SOLVER_INSTANTIATION(float64, LU, COLAMD);
}  // namespace taichi::lang
32 | |
// Explicit instantiation of the template class EigenSparseSolver::solve
// Each expansion first defines type aliases (T##dt for the dense vector type,
// S##dt##type##order for the solver type) and then instantiates solve() for
// them. The `df` argument selects the Eigen vector suffix (f or d).
#define EIGEN_LLT_SOLVE_INSTANTIATION(dt, type, order, df) \
  using T##dt = Eigen::VectorX##df; \
  using S##dt##type##order = \
      Eigen::Simplicial##type<Eigen::SparseMatrix<dt>, Eigen::Lower, \
                              Eigen::order##Ordering<int>>; \
  template T##dt \
  EigenSparseSolver<S##dt##type##order, Eigen::SparseMatrix<dt>>::solve( \
      const T##dt &b);
// LU counterpart of the macro above; the LUT/LUS alias prefixes avoid
// clashing with the aliases introduced by the LLT macro.
#define EIGEN_LU_SOLVE_INSTANTIATION(dt, type, order, df) \
  using LUT##dt = Eigen::VectorX##df; \
  using LUS##dt##type##order = \
      Eigen::Sparse##type<Eigen::SparseMatrix<dt>, \
                          Eigen::order##Ordering<int>>; \
  template LUT##dt \
  EigenSparseSolver<LUS##dt##type##order, Eigen::SparseMatrix<dt>>::solve( \
      const LUT##dt &b);
50 | |
// Explicit instantiation of the template class EigenSparseSolver::solve_rf
// (the Ndarray-based solve). `df` is the dense Eigen vector type used as the
// Map target; `dt` doubles as the scalar type of the mapped buffers.
#define INSTANTIATE_LLT_SOLVE_RF(dt, type, order, df) \
  using llt##dt##type##order = \
      Eigen::Simplicial##type<Eigen::SparseMatrix<dt>, Eigen::Lower, \
                              Eigen::order##Ordering<int>>; \
  template void EigenSparseSolver<llt##dt##type##order, \
                                  Eigen::SparseMatrix<dt>>::solve_rf<df, \
                                                                     dt>( \
      Program * prog, const SparseMatrix &sm, const Ndarray &b, \
      const Ndarray &x);

// LU counterpart of the macro above.
#define INSTANTIATE_LU_SOLVE_RF(dt, type, order, df) \
  using lu##dt##type##order = \
      Eigen::Sparse##type<Eigen::SparseMatrix<dt>, \
                          Eigen::order##Ordering<int>>; \
  template void EigenSparseSolver<lu##dt##type##order, \
                                  Eigen::SparseMatrix<dt>>::solve_rf<df, \
                                                                     dt>( \
      Program * prog, const SparseMatrix &sm, const Ndarray &b, \
      const Ndarray &x);
71 | |
// Constructs the concrete solver whose alias is formed by token-pasting
// dt/type/order (e.g. EigenSparseSolverfloat32LLTAMD — presumably declared in
// sparse_solver.h; confirm there).
#define MAKE_EIGEN_SOLVER(dt, type, order) \
  std::make_unique<EigenSparseSolver##dt##type##order>()

// Expands to one initializer-list entry for the solver factory map below:
// a {"dt","type","order"} key (stringized) paired with a factory lambda.
#define MAKE_SOLVER(dt, type, order) \
  { \
    {#dt, #type, #order}, []() -> std::unique_ptr<SparseSolver> { \
      return MAKE_EIGEN_SOLVER(dt, type, order); \
    } \
  }
81 | |
using Triplets = std::tuple<std::string, std::string, std::string>;
namespace {
// Hash functor for Triplets keys of the solver-factory maps.
//
// The previous implementation XORed the three component hashes, which is
// symmetric: any permutation of the components ("a","b","c") vs ("b","a","c")
// collided, and equal components cancelled to zero. Use boost-style
// hash_combine mixing instead so component order contributes to the hash.
struct key_hash {
  std::size_t operator()(const Triplets &k) const {
    std::size_t seed = std::hash<std::string>{}(std::get<0>(k));
    // 0x9e3779b9 is the 32-bit golden-ratio constant used by hash_combine.
    seed ^= std::hash<std::string>{}(std::get<1>(k)) + 0x9e3779b9 +
            (seed << 6) + (seed >> 2);
    seed ^= std::hash<std::string>{}(std::get<2>(k)) + 0x9e3779b9 +
            (seed << 6) + (seed >> 2);
    return seed;
  }
};
}  // namespace
93 | |
94 | namespace taichi::lang { |
95 | |
// Extracts the backing Eigen matrix from a taichi SparseMatrix wrapper into a
// local `mat` pointer (C-style cast of the opaque get_matrix() pointer).
#define GET_EM(sm) \
  const EigenMatrix *mat = (const EigenMatrix *)(sm.get_matrix());
98 | |
99 | template <class EigenSolver, class EigenMatrix> |
100 | bool EigenSparseSolver<EigenSolver, EigenMatrix>::compute( |
101 | const SparseMatrix &sm) { |
102 | if (!is_initialized_) { |
103 | SparseSolver::init_solver(sm.num_rows(), sm.num_cols(), sm.get_data_type()); |
104 | } |
105 | GET_EM(sm); |
106 | solver_.compute(*mat); |
107 | if (solver_.info() != Eigen::Success) { |
108 | return false; |
109 | } else |
110 | return true; |
111 | } |
template <class EigenSolver, class EigenMatrix>
void EigenSparseSolver<EigenSolver, EigenMatrix>::analyze_pattern(
    const SparseMatrix &sm) {
  // Symbolic stage: forwards the matrix to Eigen's analyzePattern(), which
  // inspects the sparsity structure only. Pair with factorize() below.
  if (!is_initialized_) {
    SparseSolver::init_solver(sm.num_rows(), sm.num_cols(), sm.get_data_type());
  }
  GET_EM(sm);
  solver_.analyzePattern(*mat);
}
121 | |
template <class EigenSolver, class EigenMatrix>
void EigenSparseSolver<EigenSolver, EigenMatrix>::factorize(
    const SparseMatrix &sm) {
  // Numeric stage: factorizes `sm` using the structure computed by a prior
  // analyze_pattern() call (Eigen's analyzePattern/factorize split).
  GET_EM(sm);
  solver_.factorize(*mat);
}
128 | |
template <class EigenSolver, class EigenMatrix>
template <typename T>
T EigenSparseSolver<EigenSolver, EigenMatrix>::solve(const T &b) {
  // Solves for x in A*x = b using the previously computed factorization.
  // T is a dense Eigen vector type (see the explicit instantiations below).
  return solver_.solve(b);
}
134 | |
// Instantiate solve() for every supported scalar type (float32/float64),
// decomposition (LLT/LDLT/LU) and ordering (AMD/COLAMD) combination.
EIGEN_LLT_SOLVE_INSTANTIATION(float32, LLT, AMD, f);
EIGEN_LLT_SOLVE_INSTANTIATION(float32, LLT, COLAMD, f);
EIGEN_LLT_SOLVE_INSTANTIATION(float32, LDLT, AMD, f);
EIGEN_LLT_SOLVE_INSTANTIATION(float32, LDLT, COLAMD, f);
EIGEN_LU_SOLVE_INSTANTIATION(float32, LU, AMD, f);
EIGEN_LU_SOLVE_INSTANTIATION(float32, LU, COLAMD, f);
EIGEN_LLT_SOLVE_INSTANTIATION(float64, LLT, AMD, d);
EIGEN_LLT_SOLVE_INSTANTIATION(float64, LLT, COLAMD, d);
EIGEN_LLT_SOLVE_INSTANTIATION(float64, LDLT, AMD, d);
EIGEN_LLT_SOLVE_INSTANTIATION(float64, LDLT, COLAMD, d);
EIGEN_LU_SOLVE_INSTANTIATION(float64, LU, AMD, d);
EIGEN_LU_SOLVE_INSTANTIATION(float64, LU, COLAMD, d);
147 | |
148 | template <class EigenSolver, class EigenMatrix> |
149 | bool EigenSparseSolver<EigenSolver, EigenMatrix>::info() { |
150 | return solver_.info() == Eigen::Success; |
151 | } |
152 | |
template <class EigenSolver, class EigenMatrix>
template <typename T, typename V>
void EigenSparseSolver<EigenSolver, EigenMatrix>::solve_rf(
    Program *prog,
    const SparseMatrix &sm,
    const Ndarray &b,
    const Ndarray &x) {
  // Solves the factorized system with b/x stored in Taichi Ndarrays: the
  // ndarray data pointers are reinterpreted as raw V* buffers and wrapped in
  // Eigen::Map views so no copies are made.
  size_t db = prog->get_ndarray_data_ptr_as_int(&b);
  size_t dX = prog->get_ndarray_data_ptr_as_int(&x);
  // NOTE(review): x is mapped with rows_ and b with cols_; for a non-square
  // system these look swapped (b should have rows_ entries, x cols_). This is
  // harmless when rows_ == cols_ -- confirm whether non-square is supported.
  Eigen::Map<T>((V *)dX, rows_) = solver_.solve(Eigen::Map<T>((V *)db, cols_));
}
164 | |
// Instantiate solve_rf() for every supported scalar/decomposition/ordering
// combination (mirrors the solve() instantiations above).
INSTANTIATE_LLT_SOLVE_RF(float32, LLT, COLAMD, Eigen::VectorXf)
INSTANTIATE_LLT_SOLVE_RF(float32, LDLT, COLAMD, Eigen::VectorXf)
INSTANTIATE_LLT_SOLVE_RF(float32, LLT, AMD, Eigen::VectorXf)
INSTANTIATE_LLT_SOLVE_RF(float32, LDLT, AMD, Eigen::VectorXf)
INSTANTIATE_LU_SOLVE_RF(float32, LU, AMD, Eigen::VectorXf)
INSTANTIATE_LU_SOLVE_RF(float32, LU, COLAMD, Eigen::VectorXf)
INSTANTIATE_LLT_SOLVE_RF(float64, LLT, COLAMD, Eigen::VectorXd)
INSTANTIATE_LLT_SOLVE_RF(float64, LDLT, COLAMD, Eigen::VectorXd)
INSTANTIATE_LLT_SOLVE_RF(float64, LLT, AMD, Eigen::VectorXd)
INSTANTIATE_LLT_SOLVE_RF(float64, LDLT, AMD, Eigen::VectorXd)
INSTANTIATE_LU_SOLVE_RF(float64, LU, AMD, Eigen::VectorXd)
INSTANTIATE_LU_SOLVE_RF(float64, LU, COLAMD, Eigen::VectorXd)
177 | |
CuSparseSolver::CuSparseSolver() {
  // Ensure the cusparse/cusolver dynamic libraries are loaded up front.
  init_solver();
}
181 | |
182 | void CuSparseSolver::init_solver() { |
183 | #if defined(TI_WITH_CUDA) |
184 | if (!CUSPARSEDriver::get_instance().is_loaded()) { |
185 | bool load_success = CUSPARSEDriver::get_instance().load_cusparse(); |
186 | if (!load_success) { |
187 | TI_ERROR("Failed to load cusparse library!" ); |
188 | } |
189 | } |
190 | if (!CUSOLVERDriver::get_instance().is_loaded()) { |
191 | bool load_success = CUSOLVERDriver::get_instance().load_cusolver(); |
192 | if (!load_success) { |
193 | TI_ERROR("Failed to load cusolver library!" ); |
194 | } |
195 | } |
196 | #endif |
197 | } |
// Computes a fill-reducing permutation Q of A (csrsymamdHost) and builds the
// permuted matrix B in CSR form, keeping both host copies (h_csr*B_) and
// device copies (d_csr*B_) for the later factorize/solve stages.
void CuSparseSolver::reorder(const CuSparseMatrix &A) {
#if defined(TI_WITH_CUDA)
  size_t rowsA = A.num_rows();
  size_t colsA = A.num_cols();
  size_t nnzA = A.get_nnz();
  void *d_csrRowPtrA = A.get_row_ptr();
  void *d_csrColIndA = A.get_col_ind();
  void *d_csrValA = A.get_val_ptr();
  // Create the cuSOLVER/cuSPARSE handles and a general, zero-based CSR
  // matrix descriptor (all stored as members for reuse in later stages).
  CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_);
  CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_);
  CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_);
  CUSPARSEDriver::get_instance().cpSetMatType(descr_,
                                              CUSPARSE_MATRIX_TYPE_GENERAL);
  CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_,
                                                   CUSPARSE_INDEX_BASE_ZERO);
  // Host staging buffers; the member ones are freed in the destructor.
  // NOTE(review): calling reorder() a second time overwrites these pointers
  // without freeing the previous allocations -- confirm whether repeated
  // analyze_pattern() on one solver instance is supported.
  float *h_csrValA = nullptr;
  h_Q_ = (int *)malloc(sizeof(int) * colsA);
  h_csrRowPtrB_ = (int *)malloc(sizeof(int) * (rowsA + 1));
  h_csrColIndB_ = (int *)malloc(sizeof(int) * nnzA);
  h_csrValB_ = (float *)malloc(sizeof(float) * nnzA);
  h_csrValA = (float *)malloc(sizeof(float) * nnzA);
  h_mapBfromA_ = (int *)malloc(sizeof(int) * nnzA);
  assert(nullptr != h_Q_);
  assert(nullptr != h_csrRowPtrB_);
  assert(nullptr != h_csrColIndB_);
  assert(nullptr != h_csrValB_);
  assert(nullptr != h_mapBfromA_);

  // B starts out as a host copy of A's CSR structure and values.
  CUDADriver::get_instance().memcpy_device_to_host(h_csrRowPtrB_, d_csrRowPtrA,
                                                   sizeof(int) * (rowsA + 1));
  CUDADriver::get_instance().memcpy_device_to_host(h_csrColIndB_, d_csrColIndA,
                                                   sizeof(int) * nnzA);
  CUDADriver::get_instance().memcpy_device_to_host(h_csrValA, d_csrValA,
                                                   sizeof(float) * nnzA);

  // compute h_Q_ (symamd permutation) and upload it to the device as d_Q_
  CUSOLVERDriver::get_instance().csSpXcsrsymamdHost(cusolver_handle_, rowsA,
                                                    nnzA, descr_, h_csrRowPtrB_,
                                                    h_csrColIndB_, h_Q_);
  CUDADriver::get_instance().malloc((void **)&d_Q_, sizeof(int) * colsA);
  CUDADriver::get_instance().memcpy_host_to_device((void *)d_Q_, (void *)h_Q_,
                                                   sizeof(int) * (colsA));
  // Query workspace size, then permute the CSR pattern in place.
  // h_mapBfromA_ starts as the identity; csrpermHost rewrites it so that
  // entry j of B corresponds to entry h_mapBfromA_[j] of A.
  size_t size_perm = 0;
  CUSOLVERDriver::get_instance().csSpXcsrperm_bufferSizeHost(
      cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB_,
      h_csrColIndB_, h_Q_, h_Q_, &size_perm);
  void *buffer_cpu = (void *)malloc(sizeof(char) * size_perm);
  assert(nullptr != buffer_cpu);
  for (int j = 0; j < nnzA; j++) {
    h_mapBfromA_[j] = j;
  }
  CUSOLVERDriver::get_instance().csSpXcsrpermHost(
      cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB_,
      h_csrColIndB_, h_Q_, h_Q_, h_mapBfromA_, buffer_cpu);
  // B = A( mapBfromA )
  for (int j = 0; j < nnzA; j++) {
    h_csrValB_[j] = h_csrValA[h_mapBfromA_[j]];
  }
  // Upload the permuted matrix B to the device for the Cholesky path.
  CUDADriver::get_instance().malloc((void **)&d_csrRowPtrB_,
                                    sizeof(int) * (rowsA + 1));
  CUDADriver::get_instance().malloc((void **)&d_csrColIndB_,
                                    sizeof(int) * nnzA);
  CUDADriver::get_instance().malloc((void **)&d_csrValB_, sizeof(float) * nnzA);
  CUDADriver::get_instance().memcpy_host_to_device(
      (void *)d_csrRowPtrB_, (void *)h_csrRowPtrB_, sizeof(int) * (rowsA + 1));
  CUDADriver::get_instance().memcpy_host_to_device(
      (void *)d_csrColIndB_, (void *)h_csrColIndB_, sizeof(int) * nnzA);
  CUDADriver::get_instance().memcpy_host_to_device(
      (void *)d_csrValB_, (void *)h_csrValB_, sizeof(float) * nnzA);
  free(h_csrValA);
  free(buffer_cpu);
#endif
}
271 | |
272 | // Reference: |
273 | // https://github.com/NVIDIA/cuda-samples/blob/master/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp |
274 | void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { |
275 | switch (solver_type_) { |
276 | case SolverType::Cholesky: |
277 | analyze_pattern_cholesky(sm); |
278 | break; |
279 | case SolverType::LU: |
280 | analyze_pattern_lu(sm); |
281 | break; |
282 | default: |
283 | TI_NOT_IMPLEMENTED |
284 | } |
285 | } |
// Symbolic stage of the Cholesky path: reorders the matrix, then analyzes
// the sparsity structure of its Cholesky factor on the device.
void CuSparseSolver::analyze_pattern_cholesky(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
  // Retrieve the info of the sparse matrix (const_cast: downstream driver
  // calls take non-const views of the wrapper).
  SparseMatrix &sm_no_cv = const_cast<SparseMatrix &>(sm);
  CuSparseMatrix &A = static_cast<CuSparseMatrix &>(sm_no_cv);

  // step 1: reorder the sparse matrix (fills d_csrRowPtrB_/d_csrColIndB_)
  reorder(A);

  // step 2: create opaque info structure
  CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_);

  // step 3: analyze chol(A) to know structure of L
  size_t rowsA = A.num_rows();
  size_t nnzA = A.get_nnz();
  CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis(
      cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrB_, d_csrColIndB_,
      info_);
  // Lets solve_cholesky() skip re-analysis on subsequent solves.
  is_analyzed_ = true;
#else
  TI_NOT_IMPLEMENTED
#endif
}
// Symbolic stage of the LU path: reorders the matrix, then analyzes the
// LU structure using cuSOLVER's host-side csrlu API (note: host buffers,
// unlike the device-side Cholesky analysis above).
void CuSparseSolver::analyze_pattern_lu(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
  // Retrieve the info of the sparse matrix
  SparseMatrix &sm_no_cv = const_cast<SparseMatrix &>(sm);
  CuSparseMatrix &A = static_cast<CuSparseMatrix &>(sm_no_cv);

  // step 1: reorder the sparse matrix (fills h_csrRowPtrB_/h_csrColIndB_)
  reorder(A);

  // step 2: create opaque info structure
  CUSOLVERDriver::get_instance().csSpCreateCsrluInfoHost(&lu_info_);

  // step 3: analyze LU(B) to know structure of Q and R, and upper bound for
  // nnz(L+U)
  size_t rowsA = A.num_rows();
  size_t nnzA = A.get_nnz();
  CUSOLVERDriver::get_instance().csSpXcsrluAnalysisHost(
      cusolver_handle_, rowsA, nnzA, descr_, h_csrRowPtrB_, h_csrColIndB_,
      lu_info_);
  // Lets solve_lu() skip re-analysis on subsequent solves.
  is_analyzed_ = true;
#else
  TI_NOT_IMPLEMENTED
#endif
}
333 | void CuSparseSolver::factorize(const SparseMatrix &sm) { |
334 | switch (solver_type_) { |
335 | case SolverType::Cholesky: |
336 | factorize_cholesky(sm); |
337 | break; |
338 | case SolverType::LU: |
339 | factorize_lu(sm); |
340 | break; |
341 | default: |
342 | TI_NOT_IMPLEMENTED |
343 | } |
344 | } |
// Numeric Cholesky factorization of the reordered matrix (device-side),
// using the structure computed by analyze_pattern_cholesky().
void CuSparseSolver::factorize_cholesky(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
  // Retrieve the info of the sparse matrix
  SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
  CuSparseMatrix *A = static_cast<CuSparseMatrix *>(sm_no_cv);
  size_t rowsA = A->num_rows();
  size_t nnzA = A->get_nnz();

  size_t size_internal = 0;
  size_t size_chol = 0;  // size of working space for csrlu
  // step 1: workspace for chol(A)
  CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo(
      cusolver_handle_, rowsA, nnzA, descr_, d_csrValB_, d_csrRowPtrB_,
      d_csrColIndB_, info_, &size_internal, &size_chol);

  // NOTE(review): gpu_buffer_ is allocated without freeing any previous
  // buffer, so factorizing twice on one solver would leak -- confirm
  // intended usage. Freed in the destructor.
  if (size_chol > 0)
    CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol);

  // step 2: compute A = L*L^T
  CUSOLVERDriver::get_instance().csSpScsrcholFactor(
      cusolver_handle_, rowsA, nnzA, descr_, d_csrValB_, d_csrRowPtrB_,
      d_csrColIndB_, info_, gpu_buffer_);
  // step 3: check if the matrix is singular
  const float tol = 1.e-14;
  int singularity = 0;
  CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_,
                                                       tol, &singularity);
  // -1 presumably means no near-zero pivot was found (see cuSOLVER docs).
  TI_ASSERT(singularity == -1);
  is_factorized_ = true;
#else
  TI_NOT_IMPLEMENTED
#endif
}
// Numeric LU factorization of the reordered matrix, via cuSOLVER's host-side
// csrlu API on the h_csr*B_ buffers filled by reorder().
void CuSparseSolver::factorize_lu(const SparseMatrix &sm) {
#if defined(TI_WITH_CUDA)
  // Retrieve the info of the sparse matrix
  SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
  CuSparseMatrix *A = static_cast<CuSparseMatrix *>(sm_no_cv);
  size_t rowsA = A->num_rows();
  size_t nnzA = A->get_nnz();
  // step 4: workspace for LU(B)
  size_t size_lu = 0;
  size_t buffer_size = 0;
  CUSOLVERDriver::get_instance().csSpScsrluBufferInfoHost(
      cusolver_handle_, rowsA, nnzA, descr_, h_csrValB_, h_csrRowPtrB_,
      h_csrColIndB_, lu_info_, &buffer_size, &size_lu);

  // Drop any workspace left by a previous factorization before reallocating.
  if (cpu_buffer_)
    free(cpu_buffer_);
  cpu_buffer_ = (void *)malloc(sizeof(char) * size_lu);
  assert(nullptr != cpu_buffer_);

  // step 5: compute Ppivot * B = L * U
  // (1.0f is presumably the pivoting threshold -- see cuSOLVER csrlu docs)
  CUSOLVERDriver::get_instance().csSpScsrluFactorHost(
      cusolver_handle_, rowsA, nnzA, descr_, h_csrValB_, h_csrRowPtrB_,
      h_csrColIndB_, lu_info_, 1.0f, cpu_buffer_);

  // step 6: check singularity by tol
  int singularity = 0;
  const float tol = 1.e-6;
  CUSOLVERDriver::get_instance().csSpScsrluZeroPivotHost(
      cusolver_handle_, lu_info_, tol, &singularity);
  // -1 presumably means no near-zero pivot was found (see cuSOLVER docs).
  TI_ASSERT(singularity == -1);
  is_factorized_ = true;
#else
  TI_NOT_IMPLEMENTED
#endif
}
413 | void CuSparseSolver::solve_rf(Program *prog, |
414 | const SparseMatrix &sm, |
415 | const Ndarray &b, |
416 | const Ndarray &x) { |
417 | switch (solver_type_) { |
418 | case SolverType::Cholesky: |
419 | solve_cholesky(prog, sm, b, x); |
420 | break; |
421 | case SolverType::LU: |
422 | solve_lu(prog, sm, b, x); |
423 | break; |
424 | default: |
425 | TI_NOT_IMPLEMENTED |
426 | } |
427 | } |
428 | |
// Device-side Cholesky solve: permutes b, back-substitutes through the
// factorization, then scatters the solution back through the permutation.
void CuSparseSolver::solve_cholesky(Program *prog,
                                    const SparseMatrix &sm,
                                    const Ndarray &b,
                                    const Ndarray &x) {
#if defined(TI_WITH_CUDA)
  // Lazily run the symbolic/numeric stages if the caller skipped them.
  if (is_analyzed_ == false) {
    analyze_pattern(sm);
  }
  if (is_factorized_ == false) {
    factorize(sm);
  }
  // Retrieve the info of the sparse matrix
  SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
  CuSparseMatrix *A = static_cast<CuSparseMatrix *>(sm_no_cv);
  size_t rowsA = A->num_rows();
  size_t colsA = A->num_cols();
  size_t d_b = prog->get_ndarray_data_ptr_as_int(&b);
  size_t d_x = prog->get_ndarray_data_ptr_as_int(&x);

  // step 1: d_Qb = Q * b (gather b through the d_Q_ permutation indices)
  void *d_Qb = nullptr;
  CUDADriver::get_instance().malloc(&d_Qb, sizeof(float) * rowsA);
  cusparseDnVecDescr_t vec_b;
  cusparseSpVecDescr_t vec_Qb;
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_b, (int)rowsA, (void *)d_b,
                                               CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateSpVec(
      &vec_Qb, (int)rowsA, (int)rowsA, d_Q_, d_Qb, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpGather(cusparse_handel_, vec_b, vec_Qb);

  // step 2: solve B*z = Q*b using cholesky solver
  void *d_z = nullptr;
  CUDADriver::get_instance().malloc(&d_z, sizeof(float) * colsA);
  CUSOLVERDriver::get_instance().csSpScsrcholSolve(
      cusolver_handle_, rowsA, (void *)d_Qb, (void *)d_z, info_, gpu_buffer_);

  // step 3: Q*x = z (scatter z back through d_Q_ into x)
  cusparseSpVecDescr_t vecX;
  cusparseDnVecDescr_t vecY;
  CUSPARSEDriver::get_instance().cpCreateSpVec(
      &vecX, (int)colsA, (int)colsA, d_Q_, d_z, CUSPARSE_INDEX_32I,
      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, (int)colsA, (void *)d_x,
                                               CUDA_R_32F);
  CUSPARSEDriver::get_instance().cpScatter(cusparse_handel_, vecX, vecY);

  // Release temporary device buffers and cuSPARSE vector descriptors.
  if (d_Qb != nullptr)
    CUDADriver::get_instance().mem_free(d_Qb);
  if (d_z != nullptr)
    CUDADriver::get_instance().mem_free(d_z);
  CUSPARSEDriver::get_instance().cpDestroySpVec(vec_Qb);
  CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_b);
  CUSPARSEDriver::get_instance().cpDestroySpVec(vecX);
  CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY);
#else
  TI_NOT_IMPLEMENTED
#endif
}
488 | |
// Host-side LU solve: b and x are round-tripped device -> host -> device,
// permuted through h_Q_ around the csrluSolveHost call.
void CuSparseSolver::solve_lu(Program *prog,
                              const SparseMatrix &sm,
                              const Ndarray &b,
                              const Ndarray &x) {
#if defined(TI_WITH_CUDA)
  // Lazily run the symbolic/numeric stages if the caller skipped them.
  if (is_analyzed_ == false) {
    analyze_pattern(sm);
  }
  if (is_factorized_ == false) {
    factorize(sm);
  }

  // Retrieve the info of the sparse matrix
  SparseMatrix *sm_no_cv = const_cast<SparseMatrix *>(&sm);
  CuSparseMatrix *A = static_cast<CuSparseMatrix *>(sm_no_cv);
  size_t rowsA = A->num_rows();
  size_t colsA = A->num_cols();

  // step 7: solve L*U*x = b
  size_t d_b = prog->get_ndarray_data_ptr_as_int(&b);
  size_t d_x = prog->get_ndarray_data_ptr_as_int(&x);
  float *h_b = (float *)malloc(sizeof(float) * rowsA);
  float *h_b_hat = (float *)malloc(sizeof(float) * rowsA);
  float *h_x = (float *)malloc(sizeof(float) * colsA);
  float *h_x_hat = (float *)malloc(sizeof(float) * colsA);
  assert(nullptr != h_b);
  assert(nullptr != h_b_hat);
  assert(nullptr != h_x);
  assert(nullptr != h_x_hat);
  CUDADriver::get_instance().memcpy_device_to_host((void *)h_b, (void *)d_b,
                                                   sizeof(float) * rowsA);
  CUDADriver::get_instance().memcpy_device_to_host((void *)h_x, (void *)d_x,
                                                   sizeof(float) * colsA);
  // b_hat = permuted right-hand side (gather through h_Q_)
  for (int j = 0; j < rowsA; j++) {
    h_b_hat[j] = h_b[h_Q_[j]];
  }
  CUSOLVERDriver::get_instance().csSpScsrluSolveHost(
      cusolver_handle_, rowsA, h_b_hat, h_x_hat, lu_info_, cpu_buffer_);
  // Undo the permutation on the solution (scatter through h_Q_).
  for (int j = 0; j < colsA; j++) {
    h_x[h_Q_[j]] = h_x_hat[j];
  }
  CUDADriver::get_instance().memcpy_host_to_device((void *)d_x, (void *)h_x,
                                                   sizeof(float) * colsA);

  free(h_b);
  free(h_b_hat);
  free(h_x);
  free(h_x_hat);
#else
  TI_NOT_IMPLEMENTED
#endif
}
541 | |
542 | std::unique_ptr<SparseSolver> make_sparse_solver(DataType dt, |
543 | const std::string &solver_type, |
544 | const std::string &ordering) { |
545 | using key_type = Triplets; |
546 | using func_type = std::unique_ptr<SparseSolver> (*)(); |
547 | static const std::unordered_map<key_type, func_type, key_hash> |
548 | solver_factory = { |
549 | MAKE_SOLVER(float32, LLT, AMD), MAKE_SOLVER(float32, LLT, COLAMD), |
550 | MAKE_SOLVER(float32, LDLT, AMD), MAKE_SOLVER(float32, LDLT, COLAMD), |
551 | MAKE_SOLVER(float64, LLT, AMD), MAKE_SOLVER(float64, LLT, COLAMD), |
552 | MAKE_SOLVER(float64, LDLT, AMD), MAKE_SOLVER(float64, LDLT, COLAMD)}; |
553 | static const std::unordered_map<std::string, std::string> dt_map = { |
554 | {"f32" , "float32" }, {"f64" , "float64" }}; |
555 | auto it = dt_map.find(taichi::lang::data_type_name(dt)); |
556 | if (it == dt_map.end()) |
557 | TI_ERROR("Not supported sparse solver data type: {}" , |
558 | taichi::lang::data_type_name(dt)); |
559 | |
560 | Triplets solver_key = std::make_tuple(it->second, solver_type, ordering); |
561 | if (solver_factory.find(solver_key) != solver_factory.end()) { |
562 | auto solver_func = solver_factory.at(solver_key); |
563 | return solver_func(); |
564 | } else if (solver_type == "LU" ) { |
565 | if (it->first == "f32" ) { |
566 | using EigenMatrix = Eigen::SparseMatrix<float32>; |
567 | using LU = Eigen::SparseLU<EigenMatrix>; |
568 | return std::make_unique<EigenSparseSolver<LU, EigenMatrix>>(); |
569 | } else if (it->first == "f64" ) { |
570 | using EigenMatrix = Eigen::SparseMatrix<float64>; |
571 | using LU = Eigen::SparseLU<EigenMatrix>; |
572 | return std::make_unique<EigenSparseSolver<LU, EigenMatrix>>(); |
573 | } else { |
574 | TI_ERROR("Not supported sparse solver data type: {}" , it->second); |
575 | } |
576 | } else |
577 | TI_ERROR("Not supported sparse solver type: {}" , solver_type); |
578 | } |
579 | |
// Releases everything allocated across reorder()/analyze/factorize: host
// staging buffers, cuSOLVER/cuSPARSE opaque objects, and device buffers.
CuSparseSolver::~CuSparseSolver() {
#if defined(TI_WITH_CUDA)
  // Host buffers allocated with malloc() in reorder()/factorize_lu().
  if (h_Q_ != nullptr)
    free(h_Q_);
  if (h_csrRowPtrB_ != nullptr)
    free(h_csrRowPtrB_);
  if (h_csrColIndB_ != nullptr)
    free(h_csrColIndB_);
  if (h_csrValB_ != nullptr)
    free(h_csrValB_);
  if (h_mapBfromA_ != nullptr)
    free(h_mapBfromA_);
  if (cpu_buffer_ != nullptr)
    free(cpu_buffer_);
  // Opaque cuSOLVER/cuSPARSE objects (infos, handles, matrix descriptor).
  // "csSpDestory" is the driver wrapper's spelling, not a typo here.
  if (info_ != nullptr)
    CUSOLVERDriver::get_instance().csSpDestroyCsrcholInfo(info_);
  if (lu_info_ != nullptr)
    CUSOLVERDriver::get_instance().csSpDestroyCsrluInfoHost(lu_info_);
  if (cusolver_handle_ != nullptr)
    CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_);
  if (cusparse_handel_ != nullptr)
    CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_);
  if (descr_ != nullptr)
    CUSPARSEDriver::get_instance().cpDestroyMatDescr(descr_);
  // Device buffers allocated via CUDADriver::malloc().
  if (gpu_buffer_ != nullptr)
    CUDADriver::get_instance().mem_free(gpu_buffer_);
  if (d_Q_ != nullptr)
    CUDADriver::get_instance().mem_free(d_Q_);
  if (d_csrRowPtrB_ != nullptr)
    CUDADriver::get_instance().mem_free(d_csrRowPtrB_);
  if (d_csrColIndB_ != nullptr)
    CUDADriver::get_instance().mem_free(d_csrColIndB_);
  if (d_csrValB_ != nullptr)
    CUDADriver::get_instance().mem_free(d_csrValB_);
#endif
}
616 | std::unique_ptr<SparseSolver> make_cusparse_solver( |
617 | DataType dt, |
618 | const std::string &solver_type, |
619 | const std::string &ordering) { |
620 | if (solver_type == "LLT" || solver_type == "LDLT" ) { |
621 | return std::make_unique<CuSparseSolver>( |
622 | CuSparseSolver::SolverType::Cholesky); |
623 | } else if (solver_type == "LU" ) { |
624 | return std::make_unique<CuSparseSolver>(CuSparseSolver::SolverType::LU); |
625 | } else { |
626 | TI_ERROR("Not supported sparse solver type: {}" , solver_type); |
627 | } |
628 | } |
629 | } // namespace taichi::lang |
630 | |