Mass multiplication of a matrix matrix in CUDA using cuSPARSE

Question

Mass multiplication of a matrix matrix in CUDA using cuSPARSE

I am comparing sparse multiplication of a matrix matrix on an Nvidia K40 using a library cuSPARSE

. I am creating my own sparse matrix in a format CSR

and I am using a cusparseXcsrgemmNnz

library routine cuSPARSE

. However, as the size of the data grows, an error occurs on the call cusparseXcsrgemmNnz

, i.e. CUSPARSE_STATUS_SUCCESS

not refundable. Also crashes cudaMemcpy

and CUDA_SUCCESS

does not return. The code works fine for matrices 8 x 8

and 16 x 16

. However, from 32 x 32

on, this error is observed.

Edit: I am getting CUSPARSE_STATUS_EXECUTION_FAILED

from cusparseXcsrgemmNnz

for the third size of the matrix. Execution is correct for the first two dimensions of the matrix.

#include <cusparse_v2.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(__err), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } \
} while (0)


double timerval()
{
    struct timeval st;
    gettimeofday(&st, NULL);
    return (st.tv_sec+st.tv_usec*1e-6);
}

// perform sparse-matrix multiplication C=AxB
int main(){ 
double avg_time = 0, s_time, e_time;

cusparseStatus_t stat;
cusparseHandle_t hndl;
cusparseMatDescr_t descrA, descrB, descrC;
int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC,*pos;
float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
int nnzA, nnzB, nnzC;
int m=4,n,k,loop;
int i,j;
int iterations;
for (iterations=0;iterations<10;iterations++)
{
    m *=2;
    n = m;
    k = m;
    //density of the sparse matrix to be created. Assume 5% density.
    double dense_const = 0.05;
    int temp5, temp6,temp3,temp4;
    int density=(m*n)*(dense_const);
    nnzA = density;
    nnzB = density;
    h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
    h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
    h_csrColIndA = (int *)malloc(density*sizeof(int));
    h_csrColIndB = (int *)malloc(density*sizeof(int));
    h_csrValA  = (float *)malloc(density*sizeof(float));
    h_csrValB  = (float *)malloc(density*sizeof(float));
    if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
    {printf("malloc fail\n"); return -1;}

    //position array for random initialisation of positions in input matrix
    pos= (int *)calloc((m*n), sizeof(int));
    int temp,temp1;

    //  printf("the density is %d\n",density);
    //  printf("check 1:\n");

    //randomly initialise positions
    for(i=0;i<density;i++)
    {
        temp1=rand()%(m*n);
        pos[i]=temp1;

    }
    //  printf("check 2:\n");

    //sort the 'pos' array
    for (i = 0 ; i < density; i++) {
        int d = i;
        int t;

        while ( d > 0 && pos[d] < pos[d-1]) {
            t          = pos[d];
            pos[d]   = pos[d-1];
            pos[d-1] = t;        
            d--;
        }
    }
    // initialise with non zero elements and extract column and row ptr vector
    j=1;
    //ja[0]=1;

    int p=0;
    int f=0;

    for(i = 0; i < density; i++)
    {
        temp=pos[i];
         h_csrValA[f] = rand();
         h_csrValB[f] = rand();
         h_csrColIndA[f] = temp%m;
         h_csrColIndB[f] = temp%m;
        f++;
        p++;
        temp5= pos[i];
        temp6=pos[i+1];
        temp3=temp5-(temp5%m);
        temp4=temp6-(temp6%m);

        if(!(temp3== temp4))
        {   
            if((temp3+m==temp6))
            {}
            else    
            {   
                h_csrRowPtrA[j]=p;
                h_csrRowPtrB[j]=p;
                j++;
            }

        }       
    }

    // transfer data to device

    cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
    cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
    cudaMalloc(&csrColIndA, density*sizeof(int));
    cudaMalloc(&csrColIndB, density*sizeof(int));
    cudaMalloc(&csrValA, density*sizeof(float));
    cudaMalloc(&csrValB, density*sizeof(float));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndA, h_csrColIndA, density*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndB, h_csrColIndB, density*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrValA, h_csrValA, density*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrValB, h_csrValB, density*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    // set cusparse matrix types
    CUSPARSE_CHECK(cusparseCreate(&hndl));
    stat = cusparseCreateMatDescr(&descrA);
    CUSPARSE_CHECK(stat);
    stat = cusparseCreateMatDescr(&descrB);
    CUSPARSE_CHECK(stat);
    stat = cusparseCreateMatDescr(&descrC);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;

    // figure out size of C
    int baseC;
    // nnzTotalDevHostPtr points to host memory
    int *nnzTotalDevHostPtr = &nnzC;
    stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
    CUSPARSE_CHECK(stat);
    cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
    cudaCheckErrors("cudaMalloc fail");

    s_time=timerval();

    stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
    descrA, nnzA, csrRowPtrA, csrColIndA,
    descrB, nnzB, csrRowPtrB, csrColIndB,
    descrC, csrRowPtrC, nnzTotalDevHostPtr );
    CUSPARSE_CHECK(stat);
    if (NULL != nnzTotalDevHostPtr){
    nnzC = *nnzTotalDevHostPtr;}
    else{
    cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    nnzC -= baseC;}
    cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
    cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
    cudaCheckErrors("cudaMalloc fail");
    // perform multiplication C = A*B

    for(loop=0;loop<1000;loop++)
    {
        stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
        descrA, nnzA,
        csrValA, csrRowPtrA, csrColIndA,
        descrB, nnzB,
        csrValB, csrRowPtrB, csrColIndB,
        descrC,
        csrValC, csrRowPtrC, csrColIndC);
        CUSPARSE_CHECK(stat);
    }

    e_time=timerval();

    avg_time=avg_time/1000;
    // copy result (C) back to host
    h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
    h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
    h_csrValC  = (float *)malloc(nnzC *sizeof(float));
    if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
    {printf("malloc fail\n"); return -1;}
    cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrColIndC, csrColIndC,  nnzC*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

    printf ("\n Input size: %d x %d ,Time: %lf and density is %d \n", m,n, avg_time, density); 

    cudaFree(csrRowPtrC);
    cudaFree(csrColIndC);
    cudaFree(csrValC);

    cudaFree(csrRowPtrA);
    cudaFree(csrColIndA);
    cudaFree(csrValA);

    cudaFree(csrRowPtrB);
    cudaFree(csrColIndB);
    cudaFree(csrValB);

    free(h_csrRowPtrC);
    free(h_csrColIndC);
    free(h_csrValC);

    free(h_csrRowPtrA);
    free(h_csrColIndA);
    free(h_csrValA);

    free(h_csrRowPtrB);
    free(h_csrColIndB);
    free(h_csrValB);
}
return 0;

}

+3

sparse-matrix cuda nvidia gpu-programming

yath Apr 17 At 1:21

source to share

2 answers

Robert crovella · Answer 1 · 2015-04-17T05:14:13+0000

It seems that you have stripped some of this code from here

As stated in this post:

a glitch in cusparseXcsrgemmNnz

may indicate an underlying problem in the formatting of the CSR matrix.

I'm sure this is the problem here. Your process for creating a properly formatted CSR matrix is broken.

To prove it, add the following code just before the said comment in your posted code:

printf("A RowPtrs: \n");
for (int i = 0; i < m+1; i++) printf("%d ", h_csrRowPtrA[i]);
printf("\nA ColInds: \n");
for (int i = 0; i < nnzA; i++) printf("%d ", h_csrColIndA[i]);
printf("\nB RowPtrs: \n");
for (int i = 0; i < n+1; i++) printf("%d ", h_csrRowPtrB[i]);
printf("\nB ColInds: \n");
for (int i = 0; i < nnzB; i++) printf("%d ", h_csrColIndB[i]);
printf("\n");

    // add the above code before this comment:
    // transfer data to device

When I do that, recompile and run, I get an output that looks like this:

$ ./t730
A RowPtrs:
0 1 2 3 0 0 0 0 0
A ColInds:
6 7 1
B RowPtrs:
0 1 2 3 0 0 0 0 0
B ColInds:
6 7 1

 Input size: 8 x 8 ,Time: 0.000000 and density is 3
A RowPtrs:
0 1 2 3 4 5 6 8 9 12 959542853 1886614883 1702064737 1299346243 1918980205 1232301409 1766154068
A ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
B RowPtrs:
-1688500168 1 2 3 4 5 6 8 9 12 0 0 0 0 0 0 0
B ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
cusparse fail: 6, line: 193

We can see that the first set of CSR-formatted matrices A

and B

up to the line starting with Input size: 8 x 8...

seems to complete without errors, however the formatting is actually broken. Blank lines do not have a zero-based line pointer (line pointers cannot move backward), they have a line pointer starting at the last filled line (so the number of non-zero elements in a line is equal to the current line pointer minus the previous line pointer), and the last value in the sequence of string pointers points to one element after the last element in the matrix (i.e., the last value in the array of CSR pointers is nnz, the number of nonzero elements).

The next set of matrices A and B (corresponding to a 16x16 pass) is clearly broken. At a minimum, you have row pointers in A and B CSR matrices that are clearly out of range.

Your code for generating CSRs is just broken. I suggest you look into CSR matrices and create a tool to validate any matrices you intend to randomly generate this way. CUSP has matrix check functions and I'm sure there are other CSR matrix format check functions you could use.

JackOLantern · Answer 2 · 2015-10-23T17:24:03+0000

Following Robert Rovella's answer, I want to provide a fully processed code that implements the allowed matrix matrix multiplication.

To avoid any ambiguity in the sparse matrix format, the code starts with dense matrices and uses cusparse<t>dense2csr

to convert the matrix format from dense to csr.

The two matrices involved in the code, A

and B

. Matrix B

is a permutation matrix. The code computes C = B * A

.

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>

#include "Utilities.cuh"

#include <cuda_runtime.h>
#include <cusparse_v2.h>

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int N     = 4;                // --- Number of rows and columns

    // --- Host side dense matrices
    double *h_A_dense = (double*)malloc(N * N * sizeof(*h_A_dense));
    double *h_B_dense = (double*)malloc(N * N * sizeof(*h_B_dense));
    double *h_C_dense = (double*)malloc(N * N * sizeof(*h_C_dense));

    // --- Column-major ordering
    h_A_dense[0] = 0.4612;  h_A_dense[4] = -0.0006;     h_A_dense[8]  = 0.3566;     h_A_dense[12] = 0.0; 
    h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640;      h_A_dense[9]  = 0.0723;     h_A_dense[13] = 0.0; 
    h_A_dense[2] = 0.3566;  h_A_dense[6] = 0.0723;      h_A_dense[10] = 0.7543;     h_A_dense[14] = 0.0; 
    h_A_dense[3] = 0.;      h_A_dense[7] = 0.0;         h_A_dense[11] = 0.0;        h_A_dense[15] = 0.1; 

    // --- Column-major ordering
    h_B_dense[0] = 0.;      h_B_dense[4] = 0.;          h_B_dense[8]  = 1.;         h_B_dense[12] = 0.; 
    h_B_dense[1] = 1.;      h_B_dense[5] = 0.;          h_B_dense[9]  = 0.;         h_B_dense[13] = 0.; 
    h_B_dense[2] = 0.;      h_B_dense[6] = 1.;          h_B_dense[10] = 0.;         h_B_dense[14] = 0.; 
    h_B_dense[3] = 0.;      h_B_dense[7] = 0.;          h_B_dense[11] = 0.;         h_B_dense[15] = 1.; 

    // --- Create device arrays and copy host arrays to them
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(*d_A_dense)));
    double *d_B_dense;  gpuErrchk(cudaMalloc(&d_B_dense, N * N * sizeof(*d_B_dense)));
    double *d_C_dense;  gpuErrchk(cudaMalloc(&d_C_dense, N * N * sizeof(*d_C_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B_dense, h_B_dense, N * N * sizeof(*d_B_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;      cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));  

    // --- Descriptor for sparse matrix B
    cusparseMatDescr_t descrB;      cusparseSafeCall(cusparseCreateMatDescr(&descrB));
    cusparseSafeCall(cusparseSetMatType     (descrB, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ONE));  

    // --- Descriptor for sparse matrix C
    cusparseMatDescr_t descrC;      cusparseSafeCall(cusparseCreateMatDescr(&descrC));
    cusparseSafeCall(cusparseSetMatType     (descrC, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ONE));  

    int nnzA = 0;                           // --- Number of nonzero elements in dense matrix A
    int nnzB = 0;                           // --- Number of nonzero elements in dense matrix B

    const int lda = N;                      // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row of matrix A
    int *d_nnzPerVectorA;   gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));

    // --- Device side number of nonzero elements per row of matrix B
    int *d_nnzPerVectorB;   gpuErrchk(cudaMalloc(&d_nnzPerVectorB, N * sizeof(*d_nnzPerVectorB)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, &nnzB));

    // --- Host side number of nonzero elements per row of matrix A
    int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
    gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));

    // --- Host side number of nonzero elements per row of matrix B
    int *h_nnzPerVectorB = (int *)malloc(N * sizeof(*h_nnzPerVectorB));
    gpuErrchk(cudaMemcpy(h_nnzPerVectorB, d_nnzPerVectorB, N * sizeof(*h_nnzPerVectorB), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
    for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
    printf("\n");

    printf("Number of nonzero elements in dense matrix B = %i\n\n", nnzB);
    for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorB[i]);
    printf("\n");

    // --- Device side sparse matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
    double *d_B;            gpuErrchk(cudaMalloc(&d_B, nnzB * sizeof(*d_B)));

    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
    int *d_B_RowIndices;    gpuErrchk(cudaMalloc(&d_B_RowIndices, (N + 1) * sizeof(*d_B_RowIndices)));
    int *d_C_RowIndices;    gpuErrchk(cudaMalloc(&d_C_RowIndices, (N + 1) * sizeof(*d_C_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
    int *d_B_ColIndices;    gpuErrchk(cudaMalloc(&d_B_ColIndices, nnzB * sizeof(*d_B_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
    cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, d_B, d_B_RowIndices, d_B_ColIndices));

    // --- Host side sparse matrices
    double *h_A = (double *)malloc(nnzA * sizeof(*h_A));        
    double *h_B = (double *)malloc(nnzB * sizeof(*h_B));        
    int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
    int *h_B_RowIndices = (int *)malloc((N + 1) * sizeof(*h_B_RowIndices));
    int *h_B_ColIndices = (int *)malloc(nnzB * sizeof(*h_B_ColIndices));
    int *h_C_RowIndices = (int *)malloc((N + 1) * sizeof(*h_C_RowIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_B, d_B, nnzB * sizeof(*h_B), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_B_RowIndices, d_B_RowIndices, (N + 1) * sizeof(*h_B_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_B_ColIndices, d_B_ColIndices, nnzB * sizeof(*h_B_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix A in CSR format\n\n");
    for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");

    printf("\nOriginal matrix B in CSR format\n\n");
    for (int i = 0; i < nnzB; ++i) printf("B[%i] = %f ", i, h_B[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_B_RowIndices[%i] = %i \n", i, h_B_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);  

    printf("\n");
    for (int i = 0; i < nnzB; ++i) printf("h_B_ColIndices[%i] = %i \n", i, h_B_ColIndices[i]);  

    // --- Performing the matrix - matrix multiplication
    int baseC, nnzC = 0;
    // nnzTotalDevHostPtr points to host memory
    int *nnzTotalDevHostPtr = &nnzC;    

    cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

    cusparseSafeCall(cusparseXcsrgemmNnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB, 
                                         d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A_RowIndices, d_A_ColIndices, descrC, d_C_RowIndices, 
                                         nnzTotalDevHostPtr));
    if (NULL != nnzTotalDevHostPtr) nnzC = *nnzTotalDevHostPtr;
    else {
        gpuErrchk(cudaMemcpy(&nnzC,  d_C_RowIndices + N, sizeof(int), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(&baseC, d_C_RowIndices,     sizeof(int), cudaMemcpyDeviceToHost));
        nnzC -= baseC;
    }
    int *d_C_ColIndices;    gpuErrchk(cudaMalloc(&d_C_ColIndices, nnzC * sizeof(int)));
    double *d_C;            gpuErrchk(cudaMalloc(&d_C, nnzC * sizeof(double)));
    double *h_C = (double *)malloc(nnzC * sizeof(*h_C));        
    int *h_C_ColIndices = (int *)malloc(nnzC * sizeof(*h_C_ColIndices));
    cusparseSafeCall(cusparseDcsrgemm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB,
                                      d_B, d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A, d_A_RowIndices, d_A_ColIndices, descrC,
                                      d_C, d_C_RowIndices, d_C_ColIndices));

    cusparseSafeCall(cusparseDcsr2dense(handle, N, N, descrC, d_C, d_C_RowIndices, d_C_ColIndices, d_C_dense, N));

    gpuErrchk(cudaMemcpy(h_C ,           d_C,            nnzC * sizeof(*h_C), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_C_RowIndices, d_C_RowIndices, (N + 1) * sizeof(*h_C_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_C_ColIndices, d_C_ColIndices, nnzC * sizeof(*h_C_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nResult matrix C in CSR format\n\n");
    for (int i = 0; i < nnzC; ++i) printf("C[%i] = %f ", i, h_C[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_C_RowIndices[%i] = %i \n", i, h_C_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < nnzC; ++i) printf("h_C_ColIndices[%i] = %i \n", i, h_C_ColIndices[i]);  

    gpuErrchk(cudaMemcpy(h_C_dense, d_C_dense, N * N * sizeof(double), cudaMemcpyDeviceToHost));

    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) 
            printf("%f \t", h_C_dense[i * N + j]);
        printf("\n");
        }
}

Mass multiplication of a matrix matrix in CUDA using cuSPARSE

More articles: