Mass multiplication of a matrix matrix in CUDA using cuSPARSE
I am comparing sparse multiplication of a matrix matrix on an Nvidia K40 using a library cuSPARSE
. I am creating my own sparse matrix in a format CSR
and I am using a cusparseXcsrgemmNnz
library routine cuSPARSE
. However, as the size of the data grows, an error occurs on the call cusparseXcsrgemmNnz
, i.e. CUSPARSE_STATUS_SUCCESS
not refundable. Also crashes cudaMemcpy
and CUDA_SUCCESS
does not return. The code works fine for matrices 8 x 8
and 16 x 16
. However, from 32 x 32
on, this error is observed.
Edit: I am getting CUSPARSE_STATUS_EXECUTION_FAILED
from cusparseXcsrgemmNnz
for the third size of the matrix. Execution is correct for the first two dimensions of the matrix.
#include <cusparse_v2.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
double timerval()
{
struct timeval st;
gettimeofday(&st, NULL);
return (st.tv_sec+st.tv_usec*1e-6);
}
// perform sparse-matrix multiplication C=AxB
int main(){
double avg_time = 0, s_time, e_time;
cusparseStatus_t stat;
cusparseHandle_t hndl;
cusparseMatDescr_t descrA, descrB, descrC;
int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC,*pos;
float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
int nnzA, nnzB, nnzC;
int m=4,n,k,loop;
int i,j;
int iterations;
for (iterations=0;iterations<10;iterations++)
{
m *=2;
n = m;
k = m;
//density of the sparse matrix to be created. Assume 5% density.
double dense_const = 0.05;
int temp5, temp6,temp3,temp4;
int density=(m*n)*(dense_const);
nnzA = density;
nnzB = density;
h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
h_csrColIndA = (int *)malloc(density*sizeof(int));
h_csrColIndB = (int *)malloc(density*sizeof(int));
h_csrValA = (float *)malloc(density*sizeof(float));
h_csrValB = (float *)malloc(density*sizeof(float));
if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
{printf("malloc fail\n"); return -1;}
//position array for random initialisation of positions in input matrix
pos= (int *)calloc((m*n), sizeof(int));
int temp,temp1;
// printf("the density is %d\n",density);
// printf("check 1:\n");
//randomly initialise positions
for(i=0;i<density;i++)
{
temp1=rand()%(m*n);
pos[i]=temp1;
}
// printf("check 2:\n");
//sort the 'pos' array
for (i = 0 ; i < density; i++) {
int d = i;
int t;
while ( d > 0 && pos[d] < pos[d-1]) {
t = pos[d];
pos[d] = pos[d-1];
pos[d-1] = t;
d--;
}
}
// initialise with non zero elements and extract column and row ptr vector
j=1;
//ja[0]=1;
int p=0;
int f=0;
for(i = 0; i < density; i++)
{
temp=pos[i];
h_csrValA[f] = rand();
h_csrValB[f] = rand();
h_csrColIndA[f] = temp%m;
h_csrColIndB[f] = temp%m;
f++;
p++;
temp5= pos[i];
temp6=pos[i+1];
temp3=temp5-(temp5%m);
temp4=temp6-(temp6%m);
if(!(temp3== temp4))
{
if((temp3+m==temp6))
{}
else
{
h_csrRowPtrA[j]=p;
h_csrRowPtrB[j]=p;
j++;
}
}
}
// transfer data to device
cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
cudaMalloc(&csrColIndA, density*sizeof(int));
cudaMalloc(&csrColIndB, density*sizeof(int));
cudaMalloc(&csrValA, density*sizeof(float));
cudaMalloc(&csrValB, density*sizeof(float));
cudaCheckErrors("cudaMalloc fail");
cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndA, h_csrColIndA, density*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndB, h_csrColIndB, density*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrValA, h_csrValA, density*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(csrValB, h_csrValB, density*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
// set cusparse matrix types
CUSPARSE_CHECK(cusparseCreate(&hndl));
stat = cusparseCreateMatDescr(&descrA);
CUSPARSE_CHECK(stat);
stat = cusparseCreateMatDescr(&descrB);
CUSPARSE_CHECK(stat);
stat = cusparseCreateMatDescr(&descrC);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
// figure out size of C
int baseC;
// nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnzC;
stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
CUSPARSE_CHECK(stat);
cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
cudaCheckErrors("cudaMalloc fail");
s_time=timerval();
stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
descrA, nnzA, csrRowPtrA, csrColIndA,
descrB, nnzB, csrRowPtrB, csrColIndB,
descrC, csrRowPtrC, nnzTotalDevHostPtr );
CUSPARSE_CHECK(stat);
if (NULL != nnzTotalDevHostPtr){
nnzC = *nnzTotalDevHostPtr;}
else{
cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
nnzC -= baseC;}
cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
cudaCheckErrors("cudaMalloc fail");
// perform multiplication C = A*B
for(loop=0;loop<1000;loop++)
{
stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
descrA, nnzA,
csrValA, csrRowPtrA, csrColIndA,
descrB, nnzB,
csrValB, csrRowPtrB, csrColIndB,
descrC,
csrValC, csrRowPtrC, csrColIndC);
CUSPARSE_CHECK(stat);
}
e_time=timerval();
avg_time=avg_time/1000;
// copy result (C) back to host
h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
h_csrValC = (float *)malloc(nnzC *sizeof(float));
if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
{printf("malloc fail\n"); return -1;}
cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrColIndC, csrColIndC, nnzC*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
printf ("\n Input size: %d x %d ,Time: %lf and density is %d \n", m,n, avg_time, density);
cudaFree(csrRowPtrC);
cudaFree(csrColIndC);
cudaFree(csrValC);
cudaFree(csrRowPtrA);
cudaFree(csrColIndA);
cudaFree(csrValA);
cudaFree(csrRowPtrB);
cudaFree(csrColIndB);
cudaFree(csrValB);
free(h_csrRowPtrC);
free(h_csrColIndC);
free(h_csrValC);
free(h_csrRowPtrA);
free(h_csrColIndA);
free(h_csrValA);
free(h_csrRowPtrB);
free(h_csrColIndB);
free(h_csrValB);
}
return 0;
}
source to share
It seems that you have stripped some of this code from here
As stated in this post:
a glitch in
cusparseXcsrgemmNnz
may indicate an underlying problem in the formatting of the CSR matrix.
I'm sure this is the problem here. Your process for creating a properly formatted CSR matrix is broken.
To prove it, add the following code just before the said comment in your posted code:
printf("A RowPtrs: \n");
for (int i = 0; i < m+1; i++) printf("%d ", h_csrRowPtrA[i]);
printf("\nA ColInds: \n");
for (int i = 0; i < nnzA; i++) printf("%d ", h_csrColIndA[i]);
printf("\nB RowPtrs: \n");
for (int i = 0; i < n+1; i++) printf("%d ", h_csrRowPtrB[i]);
printf("\nB ColInds: \n");
for (int i = 0; i < nnzB; i++) printf("%d ", h_csrColIndB[i]);
printf("\n");
// add the above code before this comment:
// transfer data to device
When I do that, recompile and run, I get an output that looks like this:
$ ./t730
A RowPtrs:
0 1 2 3 0 0 0 0 0
A ColInds:
6 7 1
B RowPtrs:
0 1 2 3 0 0 0 0 0
B ColInds:
6 7 1
Input size: 8 x 8 ,Time: 0.000000 and density is 3
A RowPtrs:
0 1 2 3 4 5 6 8 9 12 959542853 1886614883 1702064737 1299346243 1918980205 1232301409 1766154068
A ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
B RowPtrs:
-1688500168 1 2 3 4 5 6 8 9 12 0 0 0 0 0 0 0
B ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
cusparse fail: 6, line: 193
We can see that the first set of CSR-formatted matrices A
and B
up to the line starting with Input size: 8 x 8...
seems to complete without errors, however the formatting is actually broken. Blank lines do not have a zero-based line pointer (line pointers cannot move backward), they have a line pointer starting at the last filled line (so the number of non-zero elements in a line is equal to the current line pointer minus the previous line pointer), and the last value in the sequence of string pointers points to one element after the last element in the matrix (i.e., the last value in the array of CSR pointers is nnz, the number of nonzero elements).
The next set of matrices A and B (corresponding to a 16x16 pass) is clearly broken. At a minimum, you have row pointers in A and B CSR matrices that are clearly out of range.
Your code for generating CSRs is just broken. I suggest you look into CSR matrices and create a tool to validate any matrices you intend to randomly generate this way. CUSP has matrix check functions and I'm sure there are other CSR matrix format check functions you could use.
source to share
Following Robert Rovella's answer, I want to provide a fully processed code that implements the allowed matrix matrix multiplication.
To avoid any ambiguity in the sparse matrix format, the code starts with dense matrices and uses cusparse<t>dense2csr
to convert the matrix format from dense to csr.
The two matrices involved in the code, A
and B
. Matrix B
is a permutation matrix. The code computes C = B * A
.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int N = 4; // --- Number of rows and columns
// --- Host side dense matrices
double *h_A_dense = (double*)malloc(N * N * sizeof(*h_A_dense));
double *h_B_dense = (double*)malloc(N * N * sizeof(*h_B_dense));
double *h_C_dense = (double*)malloc(N * N * sizeof(*h_C_dense));
// --- Column-major ordering
h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;
// --- Column-major ordering
h_B_dense[0] = 0.; h_B_dense[4] = 0.; h_B_dense[8] = 1.; h_B_dense[12] = 0.;
h_B_dense[1] = 1.; h_B_dense[5] = 0.; h_B_dense[9] = 0.; h_B_dense[13] = 0.;
h_B_dense[2] = 0.; h_B_dense[6] = 1.; h_B_dense[10] = 0.; h_B_dense[14] = 0.;
h_B_dense[3] = 0.; h_B_dense[7] = 0.; h_B_dense[11] = 0.; h_B_dense[15] = 1.;
// --- Create device arrays and copy host arrays to them
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(*d_A_dense)));
double *d_B_dense; gpuErrchk(cudaMalloc(&d_B_dense, N * N * sizeof(*d_B_dense)));
double *d_C_dense; gpuErrchk(cudaMalloc(&d_C_dense, N * N * sizeof(*d_C_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B_dense, h_B_dense, N * N * sizeof(*d_B_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
// --- Descriptor for sparse matrix B
cusparseMatDescr_t descrB; cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseSetMatType (descrB, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ONE));
// --- Descriptor for sparse matrix C
cusparseMatDescr_t descrC; cusparseSafeCall(cusparseCreateMatDescr(&descrC));
cusparseSafeCall(cusparseSetMatType (descrC, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ONE));
int nnzA = 0; // --- Number of nonzero elements in dense matrix A
int nnzB = 0; // --- Number of nonzero elements in dense matrix B
const int lda = N; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row of matrix A
int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));
// --- Device side number of nonzero elements per row of matrix B
int *d_nnzPerVectorB; gpuErrchk(cudaMalloc(&d_nnzPerVectorB, N * sizeof(*d_nnzPerVectorB)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, &nnzB));
// --- Host side number of nonzero elements per row of matrix A
int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));
// --- Host side number of nonzero elements per row of matrix B
int *h_nnzPerVectorB = (int *)malloc(N * sizeof(*h_nnzPerVectorB));
gpuErrchk(cudaMemcpy(h_nnzPerVectorB, d_nnzPerVectorB, N * sizeof(*h_nnzPerVectorB), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
printf("\n");
printf("Number of nonzero elements in dense matrix B = %i\n\n", nnzB);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorB[i]);
printf("\n");
// --- Device side sparse matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
double *d_B; gpuErrchk(cudaMalloc(&d_B, nnzB * sizeof(*d_B)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
int *d_B_RowIndices; gpuErrchk(cudaMalloc(&d_B_RowIndices, (N + 1) * sizeof(*d_B_RowIndices)));
int *d_C_RowIndices; gpuErrchk(cudaMalloc(&d_C_RowIndices, (N + 1) * sizeof(*d_C_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
int *d_B_ColIndices; gpuErrchk(cudaMalloc(&d_B_ColIndices, nnzB * sizeof(*d_B_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, d_B, d_B_RowIndices, d_B_ColIndices));
// --- Host side sparse matrices
double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
double *h_B = (double *)malloc(nnzB * sizeof(*h_B));
int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
int *h_B_RowIndices = (int *)malloc((N + 1) * sizeof(*h_B_RowIndices));
int *h_B_ColIndices = (int *)malloc(nnzB * sizeof(*h_B_ColIndices));
int *h_C_RowIndices = (int *)malloc((N + 1) * sizeof(*h_C_RowIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_B, d_B, nnzB * sizeof(*h_B), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_B_RowIndices, d_B_RowIndices, (N + 1) * sizeof(*h_B_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_B_ColIndices, d_B_ColIndices, nnzB * sizeof(*h_B_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix A in CSR format\n\n");
for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");
printf("\nOriginal matrix B in CSR format\n\n");
for (int i = 0; i < nnzB; ++i) printf("B[%i] = %f ", i, h_B[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_B_RowIndices[%i] = %i \n", i, h_B_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
printf("\n");
for (int i = 0; i < nnzB; ++i) printf("h_B_ColIndices[%i] = %i \n", i, h_B_ColIndices[i]);
// --- Performing the matrix - matrix multiplication
int baseC, nnzC = 0;
// nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnzC;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
cusparseSafeCall(cusparseXcsrgemmNnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB,
d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A_RowIndices, d_A_ColIndices, descrC, d_C_RowIndices,
nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) nnzC = *nnzTotalDevHostPtr;
else {
gpuErrchk(cudaMemcpy(&nnzC, d_C_RowIndices + N, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_C_RowIndices, sizeof(int), cudaMemcpyDeviceToHost));
nnzC -= baseC;
}
int *d_C_ColIndices; gpuErrchk(cudaMalloc(&d_C_ColIndices, nnzC * sizeof(int)));
double *d_C; gpuErrchk(cudaMalloc(&d_C, nnzC * sizeof(double)));
double *h_C = (double *)malloc(nnzC * sizeof(*h_C));
int *h_C_ColIndices = (int *)malloc(nnzC * sizeof(*h_C_ColIndices));
cusparseSafeCall(cusparseDcsrgemm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB,
d_B, d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A, d_A_RowIndices, d_A_ColIndices, descrC,
d_C, d_C_RowIndices, d_C_ColIndices));
cusparseSafeCall(cusparseDcsr2dense(handle, N, N, descrC, d_C, d_C_RowIndices, d_C_ColIndices, d_C_dense, N));
gpuErrchk(cudaMemcpy(h_C , d_C, nnzC * sizeof(*h_C), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_C_RowIndices, d_C_RowIndices, (N + 1) * sizeof(*h_C_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_C_ColIndices, d_C_ColIndices, nnzC * sizeof(*h_C_ColIndices), cudaMemcpyDeviceToHost));
printf("\nResult matrix C in CSR format\n\n");
for (int i = 0; i < nnzC; ++i) printf("C[%i] = %f ", i, h_C[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_C_RowIndices[%i] = %i \n", i, h_C_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzC; ++i) printf("h_C_ColIndices[%i] = %i \n", i, h_C_ColIndices[i]);
gpuErrchk(cudaMemcpy(h_C_dense, d_C_dense, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++)
printf("%f \t", h_C_dense[i * N + j]);
printf("\n");
}
}
source to share