Hi all,
I have a 2D array that I want to store as a sparse matrix. I know how cuSPARSE's dense-to-CSR routine (cusparseDdense2csr) works, but I can't apply it directly because my data is a 2D array, and I don't want to flatten it into a 1D array because memory is a very big issue. I have tried writing my own code, but it is not optimal and sometimes it does not work (I don't know why). Any kind of help is appreciated. Thanks in advance.
#include<stdio.h>
#include<cuda.h>
#include<math.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<cusparse_v2.h>
#include <thrust/scan.h>
// Row count as used by the device code: the pitched allocation is created with
// N rows and each kernel is launched with N threads (one thread per row).
const int N = 6;
// Column count as used by the device code: each row of the pitched allocation
// holds M doubles and every kernel thread scans M entries of its row.
const int M = 6;
// Wraps a CUDA runtime call and forwards its status to gpuAssert together with
// the call site's file name and line number (the predefined __FILE__/__LINE__).
#define checkCuda(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr and, by default, terminates.
//
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (supplied by checkCuda)
//   line  - source line of the call site (supplied by checkCuda)
//   abort - when true (the default) the process exits with `code` as its status;
//           when false the error is only printed and execution continues
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        // Plain ASCII quotes are required here; typographic quotes do not
        // delimit a string literal.
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Evaluates a cuSPARSE call exactly once and exits with status 1 if it does not
// return CUSPARSE_STATUS_SUCCESS. The numeric cuSPARSE status and the call site
// are printed, followed by the pending CUDA runtime error if there is one
// (cudaGetLastError also clears that error).
// Every line of the body must end with a backslash so that the whole block
// belongs to the macro; comments are kept out of the body for the same reason.
#define checkCusparse(call)                                                   \
{                                                                             \
    cusparseStatus_t err;                                                     \
    if ((err = (call)) != CUSPARSE_STATUS_SUCCESS)                            \
    {                                                                         \
        fprintf(stderr, "Got error %d at %s:%d\n", (int)err,                  \
                __FILE__, __LINE__);                                          \
        cudaError_t cuda_err = cudaGetLastError();                            \
        if (cuda_err != cudaSuccess)                                          \
        {                                                                     \
            fprintf(stderr, " CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(cuda_err));                            \
        }                                                                     \
        exit(1);                                                              \
    }                                                                         \
}
// Counts the nonzero entries of every row of a pitched dense matrix.
//
// Launch layout: 1D grid, one thread per row. Any grid/block shape whose total
// thread count is at least N works; surplus threads return immediately.
//
//   Krproduct - device pointer to the dense matrix (N rows, M doubles per row)
//   krpitch   - row pitch in bytes, as returned by cudaMallocPitch
//   Nnzrowsum - device output, N ints; entry i receives the nonzero count of row i
//
// An entry counts as nonzero when it is strictly greater or strictly less than
// 0.0, so NaN entries are not counted.
__global__ void findNnz1(double *Krproduct, size_t krpitch, int *Nnzrowsum)
{
    // Global row index; the blockIdx term makes multi-block launches valid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
        return;  // guard against launches that round the thread count up

    // Rows are krpitch BYTES apart, hence the char* arithmetic.
    const double *row = (const double*)((const char*)Krproduct + (size_t)i * krpitch);

    int count = 0;
    for (int j = 0; j < M; j++)
    {
        double temp = row[j];
        if (temp > 0.0 || temp < 0.0)
            count++;
    }
    Nnzrowsum[i] = count;
}
// Scatters the nonzero entries of a pitched dense matrix into CSR value and
// column-index arrays.
//
// Launch layout: 1D grid, one thread per row. Any grid/block shape whose total
// thread count is at least N works; surplus threads return immediately.
//
//   denceMat  - device pointer to the dense matrix (N rows, M doubles per row)
//   pitch     - row pitch in bytes, as returned by cudaMallocPitch
//   CsrRowPtr - device input; entry i is the offset in CsrVal/CsrColInd at which
//               row i's nonzeros start (only entries 0..N-1 are read here)
//   CsrVal    - device output; receives the nonzero values, row by row
//   CsrColInd - device output; receives the zero-based column of each value
//
// The nonzero test (strictly > 0.0 or < 0.0) must match the one used to build
// CsrRowPtr, otherwise rows would overlap or leave gaps; NaN entries are skipped.
__global__ void dense2csr(double *denceMat, size_t pitch, int *CsrRowPtr, double *CsrVal, int *CsrColInd)
{
    // Global row index; the blockIdx term makes multi-block launches valid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
        return;  // checked before CsrRowPtr[i] is read

    // Rows are pitch BYTES apart, hence the char* arithmetic.
    const double *row = (const double*)((const char*)denceMat + (size_t)i * pitch);

    int out = CsrRowPtr[i];  // next free slot for this row
    for (int j = 0; j < M; j++)
    {
        double temp = row[j];
        if (temp > 0.0 || temp < 0.0)
        {
            CsrVal[out] = temp;
            CsrColInd[out] = j;
            out++;
        }
    }
}
// Demo driver: converts a small dense matrix held in pitched device memory to
// CSR (row pointer, column index, value arrays) and prints each stage.
//
// Steps:
//   1. copy the dense matrix to a pitched device allocation;
//   2. count nonzeros per row on the device (findNnz1);
//   3. exclusive-scan the counts on the host to obtain the CSR row pointer;
//   4. allocate the value/column arrays and fill them on the device (dense2csr);
//   5. copy the results back, print them, and release all buffers.
//
// Returns 0 on success; any CUDA failure terminates the process via checkCuda.
int main()
{
    // N rows of M columns: this is the layout assumed by the pitched copy below
    // (width M doubles, height N) and by both kernels.
    double B[N][M] = {
        {10.0, 0.0, 0.0, 0.0, -2.0, 0.0},
        {3.0, 9.0, 0.0, 0.0, 0.0, 3.0},
        {0.0, 7.0, 8.0, 7.0, 0.0, 0.0},
        {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
        {0.0, 8.0, 0.0, 9.0, 9.0, 13.0},
        {0.0, 4.0, 0.0, 0.0, 2.0, -1.0}
    };

    // Dense matrix on the device, rows padded to pitchB bytes.
    double *d_B = NULL; size_t pitchB = 0;
    checkCuda(cudaMallocPitch((void**)&d_B, &pitchB, M*sizeof(double), N));
    checkCuda(cudaMemcpy2D(d_B, pitchB, B, M*sizeof(double), M*sizeof(double), N, cudaMemcpyHostToDevice));

    int *RowNonzero = NULL; int *CsrRowPtr = NULL; int *CsrColInd = NULL; double *CsrVal = NULL;
    checkCuda(cudaMalloc((void**) &RowNonzero, N*sizeof(int)));
    checkCuda(cudaMalloc((void**) &CsrRowPtr, (N+1)*sizeof(int)));

    // Pass 1: nonzero count of every row.
    findNnz1<<<1, N>>>(d_B, pitchB, RowNonzero);
    checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());

    int hostRowPtr[N+1];
    checkCuda(cudaMemcpy(hostRowPtr, RowNonzero, N*sizeof(int), cudaMemcpyDeviceToHost));
    printf("nnzperrow\n");
    for (int i = 0; i < N; i++)
        printf("%d ", hostRowPtr[i]);
    printf("\n");

    // The in-place exclusive scan overwrites the last row's count, so keep it:
    // total = (offset of last row) + (count of last row).
    int lastRowNnz = hostRowPtr[N-1];
    thrust::exclusive_scan(hostRowPtr, hostRowPtr + N, hostRowPtr);
    int totalNnz = hostRowPtr[N-1] + lastRowNnz;
    hostRowPtr[N] = totalNnz;

    printf(" rowptr \n");
    for (int i = 0; i < N+1; i++)
        printf("%d ", hostRowPtr[i]);
    printf("\n");

    checkCuda(cudaMemcpy(CsrRowPtr, hostRowPtr, (N+1)*sizeof(int), cudaMemcpyHostToDevice));
    checkCuda(cudaMalloc((void **)&CsrVal, sizeof(double) * totalNnz));
    checkCuda(cudaMalloc((void **)&CsrColInd, sizeof(int) * totalNnz));

    // Pass 2: scatter values and column indices into the CSR arrays.
    dense2csr<<<1, N>>>(d_B, pitchB, CsrRowPtr, CsrVal, CsrColInd);
    // Surface launch and execution errors here rather than at a later copy.
    checkCuda(cudaPeekAtLastError()); checkCuda(cudaDeviceSynchronize());

    // totalNnz is only known at run time, so the host copies live on the heap
    // (runtime-sized stack arrays are not standard C++).
    int *hostColInd = new int[totalNnz];
    double *hostVal = new double[totalNnz];

    printf(" colInd \n");
    checkCuda(cudaMemcpy(hostColInd, CsrColInd, totalNnz*sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < totalNnz; i++)
        printf("%d ", hostColInd[i]);
    printf("\n");

    printf(" value \n");
    checkCuda(cudaMemcpy(hostVal, CsrVal, totalNnz*sizeof(double), cudaMemcpyDeviceToHost));
    for (int i = 0; i < totalNnz; i++)
        printf("%lf ", hostVal[i]);
    printf("\n");

    // Release host and device buffers.
    delete[] hostColInd;
    delete[] hostVal;
    checkCuda(cudaFree(CsrVal));
    checkCuda(cudaFree(CsrColInd));
    checkCuda(cudaFree(CsrRowPtr));
    checkCuda(cudaFree(RowNonzero));
    checkCuda(cudaFree(d_B));
    return 0;
}