cusparse<t>dense2csr for 2D array

Hi! all

I have a 2D array and I want store it as a sparse matrix and I have full information about cusparsedense2csr but I can’t apply it because it 2D and I don’t want to make it as 1D because memory is a very big issue. I have tried write my own code but it’s not optimal and sometimes not working(I don’t know why). Any kind of help is appreciated. Thanks in advance.

#include<stdio.h>
#include<cuda.h>
#include<math.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cusparse_v2.h>
#include <thrust/scan.h>

const int N = 6;
const int M = 6;

#define checkCuda(ans) { gpuAssert((ans), FILE, LINE); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,“GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

#define checkCusparse(call)
{
cusparseStatus_t err;
if ((err = (call)) != CUSPARSE_STATUS_SUCCESS)
{
fprintf(stderr, “Got error %d at %s:%d\n”, err, FILE, LINE);
cudaError_t cuda_err = cudaGetLastError();
if (cuda_err != cudaSuccess)
{
fprintf(stderr, " CUDA error "%s" also detected\n",
cudaGetErrorString(cuda_err));
}
exit(1);
}
}

global void findNnz1(double *Krproduct, size_t krpitch, int *Nnzrowsum)
{
int i = threadIdx.x; int count = 0;

         for(int j = 0; j< M; j++)
            {	                
             double temp = *((double*)((char*)Krproduct + i*krpitch)+ j);
             if(temp >0.0 || temp<0.0) 
               count++;	                	                 
	    }
	
	 Nnzrowsum[i] = count;  
	}

global void dense2csr(double *denceMat, size_t pitch, int *CsrRowPtr, double *CsrVal, int *CsrColInd)
{
int i = threadIdx.x; int id = CsrRowPtr[i]; int count = 0;

	 for(int j = 0; j<M; j++)
	     {
	      double temp = *((double*)((char*)denceMat + i*pitch)+ j);
              if(temp >0.0 || temp<0.0)
                {
                 CsrVal[id+count] = temp;
                 CsrColInd[id+count] = j;
                 count++;
                 } 	                     
	     }		
	}

int main()
{

double B[M][N] = {
		  {10.0, 0.0, 0.0, 0.0, -2.0, 0.0},
		  {3.0, 9.0, 0.0, 0.0, 0.0, 3.0},
		  {0.0, 7.0, 8.0, 7.0, 0.0, 0.0},
		  {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
		  {0.0, 8.0, 0.0, 9.0, 9.0, 13.0},
		  {0.0, 4.0, 0.0, 0.0, 2.0, -1.0}
		 }; 

double *d_B;  size_t pitchB;

checkCuda(cudaMallocPitch((void**)&d_B, &pitchB, M*sizeof(double), N));	              
    checkCuda(cudaMemcpy2D(d_B, pitchB, B,  M*sizeof(double), M*sizeof(double), N, cudaMemcpyHostToDevice)); 
    
    int *RowNonzero;	int totalNnz; 	int *CsrRowPtr;      int *CsrColInd;	double *CsrVal;	
		
checkCuda(cudaMalloc((void**) &RowNonzero, N*sizeof(int)));
    checkCuda(cudaMalloc((void**) &CsrRowPtr, (N+1)*sizeof(int)));
    
    findNnz1<<<1, N>>>(d_B, pitchB, RowNonzero);
checkCuda( cudaPeekAtLastError() );	 checkCuda(cudaDeviceSynchronize()); 
       
   int  U5[N+1]; 
checkCuda(cudaMemcpy(U5, RowNonzero, N*sizeof(int), cudaMemcpyDeviceToHost));
printf("nnzperrow\n"); 
for(int i = 0; i<N; i++)
        printf("%d ", U5[i]);
    printf("\n");
        
    totalNnz = U5[N-1]; 
       
    thrust::exclusive_scan(U5, U5+N , U5);
    
    totalNnz = U5[N-1]+totalNnz;
U5[N] = totalNnz;

printf(" rowptr \n");
for(int i = 0; i<N+1; i++)
        printf("%d  ", U5[i]);
    printf("\n");
           
    checkCuda(cudaMemcpy(CsrRowPtr, U5, (N+1)*sizeof(int), cudaMemcpyHostToDevice));
    checkCuda(cudaMalloc((void **)&CsrVal, sizeof(double) * totalNnz));
	  checkCuda(cudaMalloc((void **)&CsrColInd, sizeof(int) * totalNnz));
    
    dense2csr<<<1, N>>>(d_B, pitchB, CsrRowPtr, CsrVal, CsrColInd);
    
    int U6[totalNnz];   double U7[totalNnz];
    
    printf(" colInd \n");
    checkCuda(cudaMemcpy(U6, CsrColInd, totalNnz*sizeof(int), cudaMemcpyDeviceToHost));            
    for(int i = 0; i<totalNnz; i++)
        printf("%d  ", U6[i]);
    printf("\n");
        
    printf(" value \n");
    checkCuda(cudaMemcpy(U7, CsrVal, totalNnz*sizeof(double), cudaMemcpyDeviceToHost));           
    for(int i = 0; i<totalNnz; i++)
        printf("%lf  ", U7[i]);
    printf("\n");  

return 0;
}

I think cusparsedense2csr for 2D as follows:

#include<cuda_runtime.h>
#include<cusparse_v2.h>

#include <iostream>
#include <iomanip>

const int N = 6; 
const int M = 6;

 int main() {

   double B[M][N] = {
     { 10.0,  0.0,  0.0,  0.0, -2.0,  0.0 },
     {  3.0,  9.0,  0.0,  0.0,  0.0,  3.0 },
     {  0.0,  7.0,  8.0,  7.0,  0.0,  0.0 },
     {  0.0,  0.0,  0.0,  0.0,  0.0,  0.0 },
     {  0.0,  8.0,  0.0,  9.0,  9.0, 13.0 },
     {  0.0,  4.0,  0.0,  0.0,  2.0, -1.0 }
   }; 

  double *d_B;

  cudaMallocManaged( &d_B, M*N*sizeof(double));
  cudaMemcpy(d_B, B, M*N*sizeof(double), cudaMemcpyDefault);

  int *RowNonzero; 
  int *CsrRowPtr; 
  int *CsrColInd; 
  double *CsrVal; 
  int totalNnz; 

  cudaMallocManaged(&RowNonzero, N*sizeof(int));
  cudaMallocManaged(&CsrRowPtr, (N+1)*sizeof(int));

  cusparseHandle_t handle;
  cusparseCreate(&handle);

  cusparseMatDescr_t descr;
  cusparseCreateMatDescr(&descr);

  cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, d_B, N, RowNonzero, &totalNnz);
  cudaMallocManaged(&CsrColInd, totalNnz*sizeof(int));
  cudaMallocManaged(&CsrVal, totalNnz*sizeof(double));

  cusparseDdense2csr(handle, M, N, descr, d_B, N, RowNonzero, CsrVal, CsrRowPtr, CsrColInd); 
  cudaDeviceSynchronize();

  std::cout << "totalNnz : " << totalNnz << std::endl;
  std::cout << "Val:    ";
  for ( int i = 0; i < totalNnz; ++i ) {
    std::cout << std::setw(4) << CsrVal[i];
  }
  std::cout << "\nColInd: ";
  for ( int i = 0; i < totalNnz; ++i ) {
    std::cout << std::setw(4) << CsrColInd[i];
  }
  std::cout << "\nRowPtr: ";
  for ( int i = 0; i < N+1; ++i ) {
    std::cout << std::setw(4) << CsrRowPtr[i];
  }
  std::cout << std::endl;

  cusparseDestroyMatDescr(descr);
  cusparseDestroy(handle);
  cudaFree(d_B);
  cudaFree(CsrColInd);
  cudaFree(RowNonzero);
  cudaFree(CsrRowPtr);
  cudaFree(CsrVal);

  cudaDeviceReset();
}

I really appreciate your help (:

But let assume that the matrix d_B contains the Kronecker product of two matrix and since there is no inbuilt function(I suppose) for Kronecker product in CUDA, so I make my suboptimal algo to construct a Kronecker product and which stores the product in 2D array. I am unable to store it in a 1D array.

In this case who do I make d_B sparse ?

Thanks

Do you ask another problem: “How to make Kronecker product A*B from sparse A and B”?

yes, I am unable to do that, it would very helpful for me. Thanks in advance

I’ve tried. You’ll be able to port this to CUDA.

#include <iostream>
#include <utility>
#include <array>
#include <algorithm>

using namespace std;

struct coo {
    int    row;
    int    col;
    double val;
};

inline bool operator<(const coo& x, const coo& y) {
  return make_pair(x.row, x.col) < make_pair(y.row, y.col);
}

int main() {

  /*
      A = 1 2 3
          4 5 6
   */
  const int nnzA = 6;
  const int rowA = 2;
  const int colA = 3;

  const array<coo,nnzA> cooA = 
    {  0, 0, 1.0 ,
       0, 1, 2.0 ,
       0, 2, 3.0 ,
       1, 0, 4.0 ,
       1, 1, 5.0 ,
       1, 2, 6.0 ,
    };

  /*
     B = 1 1
         1 1
   */

  const int nnzB = 4;
  const int rowB = 2;
  const int colB = 2;
  const array<coo, nnzA> cooB =
    { 0, 0, 1.0 ,
      0, 1, 1.0 ,
      1, 0, 1.0 ,
      1, 1, 1.0
    };

  /*
    K : Kronecker product A * B
   */
  const int nnzK = nnzA * nnzB;
  const int rowK = rowA * rowB;
  const int colK = colA * colB;
  array<coo,nnzK> cooK; // result

  int count = 0;
  coo tmp;
  for (int iA = 0; iA < nnzA; ++iA) {
    for (int iB = 0; iB < nnzB; ++iB) {
      cooK[count].row = cooA[iA].row * rowB + cooB[iB].row;
      cooK[count].col = cooA[iA].col * colB + cooB[iB].col;
      cooK[count].val = cooA[iA].val * cooB[iB].val;
      ++count;
    }
  }

  // coo to dense K
  double K[rowK][colK];
  fill_n((double*)K, rowK*colK, 0.0);
  for (coo item : cooK) {
    K[item.row][item.col] = item.val;
  }

  // print K
  for (int r = 0; r < rowK; ++r) {
    for (int c = 0; c < colK; ++c) {
      cout << K[r][c] << ' ';
    }
    cout << endl;
  }
}

Thanks for your support through out
What about CSR format? Can you help me in this manner because I need use LU decomposition and as you know CuSparse preconditioners only accept CSR format, this is one of the bottle neck of my project.

In my project, I have two dense matrix A and B and the algorithm goes as follows

2D dense Kronecker product(A,B) ----> 1D (which take the same amount of memory to store it)----->cusparseDnnz and cusparseDdense2csr(to create sparse matrix)--------> LU decompotion.

But I want
2D Dence-----> CSR Sparse-------> LU decomotion

or

CSR Kron product --------> LU decomotion

It’s easy to convert COO to CSR.

and I’m afraid I have no idea for LUdecomp.
I think [url]http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-preconditioners-reference[/url] will help you.

Thank You very much Sir for your help. It’s my pleasure to interact with you. Have a nice day