Solving sparse definite positive linear systems in CUDA
We are having problems using the function cuSOLVER
cusolverSpScsrlsvchol
, possibly due to a misunderstanding of the library cuSOLVER
.
Motivation: We are solving Poisson's equation -divgrad x = b
on a rectangular grid. In dimensions 2
with 5
-stencil, the (1, 1, -4, 1, 1)
Laplacian on the grid provides a (rather sparse) matrix A
. Moreover, the charge distribution on the grid gives a (dense) vector b
. A
positive definite and symmetric.
We will now solve A * x = b
for x
using the new cuSOLVER
nvidia library that ships with CUDA 7.0. It provides a function cusolverSpScsrlsvchol
that should do Cholesky sparse factorization for floats.
Note: we can correctly solve the system using the alternative sparse QR factorization function cusolverSpScsrlsvqr
. For a grid 4 x 4
where all the elements are b
at the edge 1
and the rest 0
, we get for x
:
1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1
Our problems:
-
cusolverSpScsrlsvchol
returns incorrect results forx
:1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
-
(solved, see answer below) Converting the CSR matrix
A
to dense matrix and displaying the output gives strange numbers (10^-44
and the like). Corresponding data from CSR format is correct and verified with python numpy. -
(solved, see answer below) Alternative sparse
LU
and partial rotation withcusolverSpScsrlsvlu
cannot be found:$ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined
What are we doing wrong? Thank you for your help!
Our CUDA C ++ code:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>
// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
std::vector<float>& rhs, int Nrows, int Ncols) {
//nnz: 5 entries per row (node) for nodes in the interior
// 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
vals.resize(nnz);
row.resize(nnz);
col.resize(nnz);
rhs.resize(Nrows*Ncols);
int counter = 0;
for(int i = 0; i < Nrows; ++i) {
for (int j = 0; j < Ncols; ++j) {
int idx = j + Ncols*i;
if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
vals[counter] = 1.;
row[counter] = idx;
col[counter] = idx;
counter++;
rhs[idx] = 1.;
// if (i == 0) {
// rhs[idx] = 3.;
// }
} else { // -laplace stencil
// above
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-Ncols;
counter++;
// left
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-1;
counter++;
// center
vals[counter] = 4.;
row[counter] = idx;
col[counter] = idx;
counter++;
// right
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+1;
counter++;
// below
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+Ncols;
counter++;
rhs[idx] = 0;
}
}
}
assert(counter == nnz);
}
int main() {
// --- create library handles:
cusolverSpHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverSpCreate(&cusolver_handle);
std::cout << "status create cusolver handle: " << cusolver_status << std::endl;
cusparseHandle_t cusparse_handle;
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(&cusparse_handle);
std::cout << "status create cusparse handle: " << cusparse_status << std::endl;
// --- prepare matrix:
int Nrows = 4;
int Ncols = 4;
std::vector<float> csrVal;
std::vector<int> cooRow;
std::vector<int> csrColInd;
std::vector<float> b;
assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);
int nnz = csrVal.size();
int m = Nrows * Ncols;
std::vector<int> csrRowPtr(m+1);
// --- prepare solving and copy to GPU:
std::vector<float> x(m);
float tol = 1e-5;
int reorder = 0;
int singularity = 0;
float *db, *dcsrVal, *dx;
int *dcsrColInd, *dcsrRowPtr, *dcooRow;
cudaMalloc((void**)&db, m*sizeof(float));
cudaMalloc((void**)&dx, m*sizeof(float));
cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
cudaMalloc((void**)&dcooRow, nnz*sizeof(int));
cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);
cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;
cudaDeviceSynchronize(); // matrix format conversion has to be finished!
// --- everything ready for computation:
cusparseMatDescr_t descrA;
cusparse_status = cusparseCreateMatDescr(&descrA);
std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
// optional: print dense matrix that has been allocated on GPU
std::vector<float> A(m*m, 0);
float *dA;
cudaMalloc((void**)&dA, A.size()*sizeof(float));
cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, dA, m);
cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "A: \n";
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
std::cout << A[i*m + j] << " ";
}
std::cout << std::endl;
}
cudaFree(dA);
std::cout << "b: \n";
cudaMemcpy(b.data(), db, (m)*sizeof(int), cudaMemcpyDeviceToHost);
for (auto a : b) {
std::cout << a << ",";
}
std::cout << std::endl;
// --- solving!!!!
// cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
// dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
// &singularity);
cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
&singularity);
cudaDeviceSynchronize();
std::cout << "singularity (should be -1): " << singularity << std::endl;
std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;
cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);
// relocated these 2 lines from above to solve (2):
cusparse_status = cusparseDestroy(cusparse_handle);
std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;
cusolver_status = cusolverSpDestroy(cusolver_handle);
std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;
for (auto a : x) {
std::cout << a << " ";
}
std::cout << std::endl;
cudaFree(db);
cudaFree(dx);
cudaFree(dcsrVal);
cudaFree(dcsrColInd);
cudaFree(dcsrRowPtr);
cudaFree(dcooRow);
return 0;
}
source to share
1.cusolverSpScsrlsvchol returns incorrect results for x: 1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.6666667 1 1 1 1 1
You said:
A is positive definite and symmetric.
No, it is not. It is not symmetrical.
cusolverSpcsrlsvqr () does not require the matrix to A
be symmetric.
cusolverSpcsrlsvchol () has this requirement:
A is an m × m symmetric positive definite sparse matrix
This is the printout your code provides for matrix A:
A:
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 -1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 4 -1 0 0 -1 0 0 0 0 0 0
0 0 0 0 0 -1 4 0 0 0 -1 0 0 0 0 0
0 0 0 0 0 0 -1 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 -1 0 0 0 0 0 0
0 0 0 0 0 -1 0 0 0 4 -1 0 0 0 0 0
0 0 0 0 0 0 -1 0 0 -1 4 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 -1 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 -1 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 -1 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
If it were symmetrical, I would expect the second line:
0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0
to match the second column:
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
By the way, a suggestion about stack overflow. If you answer your own question, my suggestion is that you intend it to be a complete answer. Some people might answer a question and skip it. It might be better to edit content like this to your question, thereby focusing your question (I think) on one question. SO also doesn't work in my opinion when you ask multiple questions per question. This behavior makes the question unnecessarily difficult to answer, and I don't think it serves you well here.
source to share
Although the matrix arising from the Cartesian discretization of the Poisson equation is not positive definite, this question concerns the inversion of "strong" sparse positive definite linear systems.
As it cusolverSpScsrlsvchol
becomes available to the device channel, I think potentially interested users will find it useful to perform sparse positive definite linear systems inversions using the cuSPARSE library. Here's a complete example:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 0.4612f; h_A_dense[4] = -0.0006f; h_A_dense[8] = 0.3566f; h_A_dense[12] = 0.0f;
h_A_dense[1] = -0.0006f; h_A_dense[5] = 0.4640f; h_A_dense[9] = 0.0723f; h_A_dense[13] = 0.0f;
h_A_dense[2] = 0.3566f; h_A_dense[6] = 0.0723f; h_A_dense[10] = 0.7543f; h_A_dense[14] = 0.0f;
h_A_dense[3] = 0.f; h_A_dense[7] = 0.0f; h_A_dense[11] = 0.0f; h_A_dense[15] = 0.1f;
// --- Create device array and copy host array to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix in CSR format\n\n");
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(Nrows * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, Nrows * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, Nrows * sizeof(double), cudaMemcpyHostToDevice));
/******************************************/
/* STEP 1: CREATE DESCRIPTORS FOR L AND U */
/******************************************/
cusparseMatDescr_t descr_L = 0;
cusparseSafeCall(cusparseCreateMatDescr (&descr_L));
cusparseSafeCall(cusparseSetMatIndexBase (descr_L, CUSPARSE_INDEX_BASE_ONE));
cusparseSafeCall(cusparseSetMatType (descr_L, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatFillMode (descr_L, CUSPARSE_FILL_MODE_LOWER));
cusparseSafeCall(cusparseSetMatDiagType (descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT));
/********************************************************************************************************/
/* STEP 2: QUERY HOW MUCH MEMORY USED IN CHOLESKY FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS */
/********************************************************************************************************/
csric02Info_t info_A = 0; cusparseSafeCall(cusparseCreateCsric02Info(&info_A));
csrsv2Info_t info_L = 0; cusparseSafeCall(cusparseCreateCsrsv2Info (&info_L));
csrsv2Info_t info_Lt = 0; cusparseSafeCall(cusparseCreateCsrsv2Info (&info_Lt));
int pBufferSize_M, pBufferSize_L, pBufferSize_Lt;
cusparseSafeCall(cusparseDcsric02_bufferSize(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, &pBufferSize_M));
cusparseSafeCall(cusparseDcsrsv2_bufferSize (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, &pBufferSize_L));
cusparseSafeCall(cusparseDcsrsv2_bufferSize (handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, &pBufferSize_Lt));
int pBufferSize = max(pBufferSize_M, max(pBufferSize_L, pBufferSize_Lt));
void *pBuffer = 0; gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize));
/******************************************************************************************************/
/* STEP 3: ANALYZE THE THREE PROBLEMS: CHOLESKY FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS */
/******************************************************************************************************/
int structural_zero;
cusparseSafeCall(cusparseDcsric02_analysis(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
cusparseStatus_t status = cusparseXcsric02_zeroPivot(handle, info_A, &structural_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status){ printf("A(%d,%d) is missing\n", structural_zero, structural_zero); }
cusparseSafeCall(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
cusparseSafeCall(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer));
/*************************************/
/* STEP 4: FACTORIZATION: A = L * L' */
/*************************************/
int numerical_zero;
cusparseSafeCall(cusparseDcsric02(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
status = cusparseXcsric02_zeroPivot(handle, info_A, &numerical_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status){ printf("L(%d,%d) is zero\n", numerical_zero, numerical_zero); }
printf("\nNon-zero elements in Cholesky matrix\n\n");
gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
for (int k=0; k<nnz; k++) printf("%f\n", h_A[k]);
cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_A_dense, Nrows));
printf("\nCholesky matrix\n\n");
for(int i = 0; i < Nrows; i++) {
std::cout << "[ ";
for(int j = 0; j < Ncols; j++)
std::cout << h_A_dense[i * Ncols + j] << " ";
std::cout << "]\n";
}
/*********************/
/* STEP 5: L * z = x */
/*********************/
// --- Allocating the intermediate result vector
double *d_z; gpuErrchk(cudaMalloc(&d_z, N * sizeof(double)));
const double alpha = 1.;
cusparseSafeCall(cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, &alpha, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, d_x, d_z, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer));
/**********************/
/* STEP 5: L' * y = z */
/**********************/
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(Ncols * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, Ncols * sizeof(double)));
cusparseSafeCall(cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, &alpha, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, d_z, d_y, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer));
cudaMemcpy(h_x, d_y, N * sizeof(double), cudaMemcpyDeviceToHost);
printf("\n\nFinal result\n");
for (int k=0; k<N; k++) printf("x[%i] = %f\n", k, h_x[k]);
}
source to share
Regarding 2: we destroyed the cusparse handle too early (maybe too many micro tweaks to find sources of errors ....). Also, the dense format is the main column, so we need to transpose A
it to print correctly!
Regarding 3: cusolverSpScsrlsvlu
only exists on the host at the moment - it is written in the documentation in a surprisingly obvious way in point 6.2.1 of note 5 .... http://docs.nvidia.com/cuda/cusolver/index.html#cusolver-lt -t-gt-csrlsvlu
source to share
Another possibility of solving a sparse, positive definite linear system is to use a library cuSOLVER
and, in particular, a procedure cusolverSpDcsrlsvchol
. It works very similarly to the procedures cuSOLVER
used to solve general sparse linear systems in CUDA , but uses the Cholesky factorization A = G * G^H
, where G
is the Cholesky factor, the lower triangular matrix.
As for the subroutines in solving common sparse linear systems in CUDA and CUDA 10.0
, at the moment, only the host pipe is available. Note that the parameter reorder
has no effect, and singularity
is equal -1
if the matrix is A
positive definite.
Below is a fully worked out example:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cusparse.h>
#include <cusolverSp.h>
//https://www.physicsforums.com/threads/all-the-ways-to-build-positive-definite-matrices.561438/
//https://it.mathworks.com/matlabcentral/answers/101132-how-do-i-determine-if-a-matrix-is-positive-definite-using-matlab
/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if (CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
_cusolverGetErrorEnum(err)); \
assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs", __FILE__, __LINE__, err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 1.78; h_A_dense[4] = 0.0; h_A_dense[8] = 0.1736; h_A_dense[12] = 0.0;
h_A_dense[1] = 0.00; h_A_dense[5] = 3.1; h_A_dense[9] = 0.0; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.1736; h_A_dense[6] = 0.0; h_A_dense[10] = 5.0; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.00; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 2.349;
//create device array and copy host to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
// --- Allocating and defining dense host and device data vectors
double *h_y = (double *)malloc(Nrows * sizeof(double));
h_y[0] = 1.0; h_y[1] = 1.0; h_y[2] = 1.0; h_y[3] = 1.0;
double *d_y; gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_x = (double *)malloc(Ncols * sizeof(double));
double *d_x; gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));
// --- CUDA solver initialization
cusolverSpHandle_t solver_handle;
cusolverSpCreate(&solver_handle);
// --- Using Cholesky factorization
int singularity;
cusolveSafeCall(cusolverSpDcsrlsvcholHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
printf("Showing the results...\n");
for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);
}
source to share