Hi, I’m using CUDA for tomography, so I need to use sparse matrices to avoid out-of-memory errors (tomography data needs a lot of memory for 512x512 images; e.g. with full matrices, 4GB of video memory would be needed just to store the initial data!).
Via Google I found (http://www.nada.kth.se/~tomaso/gpu08/sptest.cu) some code that tests the CUDPP library with sparse matrices, which is exactly what I need:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "cudpp.h"
/*
 * Kernel: set the first n elements of v to zero.
 *
 * Each thread starts at its global 1-D index and advances by the total
 * number of launched threads (a grid-stride loop), so any 1-D launch
 * configuration covers all n elements.
 *
 *   n - number of elements to clear
 *   v - array written by the kernel; it must be addressable from device
 *       code because it is dereferenced here
 */
__global__ static void zero(int n, float v[]) {
    const int pid = threadIdx.x + blockIdx.x * blockDim.x;
    const int np = blockDim.x * gridDim.x;
    for (int i = pid; i < n; i += np)
        v[i] = 0.0f;  /* float literal: avoids a double-precision constant */
}
/*
 * CPU reference sparse matrix-vector product y = A*x.
 *
 * The matrix is stored row by row: the nonzeros of row i occupy positions
 * first[i] .. first[i+1]-1 of A, and cidx[j] is the column of A[j]
 * (compressed row storage).
 *
 *   nrows - number of rows; first must hold nrows+1 entries
 *   ncols - number of columns (not used by the computation; kept for the
 *           existing call signature)
 *   first - row start offsets into cidx/A
 *   cidx  - column index of each stored value
 *   A     - stored nonzero values
 *   x     - input vector, indexed by column
 *   y     - output vector, one entry per row (overwritten)
 */
void spmul(int nrows, int ncols, const int first[], const int cidx[],
           const float A[], const float x[], float y[]) {
    (void) ncols;
    for (int i = 0; i < nrows; i++) {
        float sum = 0.0f;
        for (int j = first[i]; j < first[i + 1]; j++)
            sum += A[j] * x[cidx[j]];
        y[i] = sum;
    }
}
/*
 * Fill a random sparse nrows x ncols matrix with nnz entries, all equal
 * to 1.0, in row-wise compressed storage.
 *
 * Positions are visited in row-major order and each one is kept with
 * probability (entries still needed) / (positions still remaining)
 * (selection sampling), so exactly nnz positions are chosen when
 * nnz <= nrows*ncols.
 *
 *   first - output, nrows+1 entries: first[i] is the offset of row i's
 *           first entry, first[nrows] is the total number of entries
 *   cidx  - output, column index of each entry (needs room for nnz ints)
 *   A     - output, value of each entry (needs room for nnz floats)
 */
void sprand(int nrows, int ncols, int nnz, int first[],
            int cidx[], float A[]) {
    /* Computed in double so nrows*ncols cannot overflow an int. */
    const double total = (double) nrows * (double) ncols;
    int k = 0;
    for (int i = 0; i < nrows; i++) {
        first[i] = k;
        for (int j = 0; j < ncols; j++) {
            /*
             * r must lie in [0,1): with r == 1.0 a position could be
             * skipped even when every remaining position is required,
             * leaving fewer than nnz entries.
             */
            const double r = rand() / ((double) RAND_MAX + 1.0);
            const double remaining = total - ((double) i * ncols + j);
            if (r * remaining < (double) (nnz - k)) {
                cidx[k] = j;
                A[k] = 1.0f;
                k = k + 1;
            }
        }
    }
    first[nrows] = k;
}
/*
 * Current time in seconds as a double, read with gettimeofday()
 * (whole seconds plus the microsecond part scaled by 1e-6).
 */
double gettime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double micros = 1e-6 * now.tv_usec;
    return now.tv_sec + micros;
}
/* Abort with a readable message if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(1);
    }
}

/* malloc() that aborts instead of returning NULL. */
static void *checkedMalloc(size_t bytes) {
    void *p = malloc(bytes);
    if (p == NULL) {
        fprintf(stderr, "Out of host memory (%lu bytes requested)\n",
                (unsigned long) bytes);
        exit(1);
    }
    return p;
}

/*
 * Benchmark: builds a random n x n sparse matrix with 1% nonzeros, times
 * the CPU reference product (spmul), then times the CUDPP sparse
 * matrix-vector multiply and compares both results element by element.
 */
int main(void) {
    int n = 10000, nnz = n*n/100;
    int *first, *cidx;
    float *A, *x, *y1, *y2, *x_g, *y_g;
    int i, iter, niter = 100;

    first = (int *) checkedMalloc(sizeof(int) * (n+1));
    cidx = (int *) checkedMalloc(sizeof(int) * nnz);
    A = (float *) checkedMalloc(sizeof(float) * nnz);
    sprand(n, n, nnz, first, cidx, A);

    x = (float *) checkedMalloc(sizeof(float) * n);
    y1 = (float *) checkedMalloc(sizeof(float) * n);
    y2 = (float *) checkedMalloc(sizeof(float) * n);
    /* x is a random 0/1 vector. */
    for (i = 0; i < n; i++) {
        x[i] = (rand() / (float) RAND_MAX > 0.5f) ? 1.0f : 0.0f;
    }

    /* CPU reference timing. */
    double t = gettime();
    for (iter = 0; iter < 5; iter++)
        spmul(n, n, first, cidx, A, x, y1);
    t = gettime() - t;
    printf("(CPU) flops = %.3e, time per iteration = %.3fms\n",
           2.0*nnz*5/t, t/5*1e3);

    CUDPPConfiguration config;
    config.datatype = CUDPP_FLOAT;
    config.options = (CUDPPOption)0;
    config.algorithm = CUDPP_SPMVMULT;
    /* NOTE(review): op is presumably ignored for SPMVMULT; it is set only
       so the struct is not passed with an indeterminate field - confirm. */
    config.op = CUDPP_ADD;

    /* The matrix arrays handed to CUDPP here are the host arrays. */
    CUDPPHandle sparseMatrixHandle;
    CUDPPResult result = CUDPP_SUCCESS;
    result = cudppSparseMatrix(&sparseMatrixHandle, config, nnz, n,
                               (void *) A, (unsigned int *) first,
                               (unsigned int *) cidx);
    if (result != CUDPP_SUCCESS) {
        fprintf(stderr, "Error creating Sparse matrix object\n");
        /* Report the code and any pending CUDA error to aid diagnosis. */
        fprintf(stderr, "  CUDPPResult = %d, last CUDA error: %s\n",
                (int) result, cudaGetErrorString(cudaGetLastError()));
        free(y2);
        free(y1);
        free(x);
        free(A);
        free(cidx);
        free(first);
        return 1;
    }

    checkCuda(cudaMalloc((void **) &x_g, sizeof(float)*n), "cudaMalloc(x_g)");
    checkCuda(cudaMalloc((void **) &y_g, sizeof(float)*n), "cudaMalloc(y_g)");
    checkCuda(cudaMemcpy(x_g, x, sizeof(float)*n, cudaMemcpyHostToDevice),
              "copy of x to the device");

    // Run it once to avoid timing startup overhead
    zero<<<14*6,128>>>(n, y_g);
    checkCuda(cudaGetLastError(), "launch of zero kernel");
    result = cudppSparseMatrixVectorMultiply(sparseMatrixHandle, y_g, x_g);
    if (result != CUDPP_SUCCESS) {
        fprintf(stderr, "cudppSparseMatrixVectorMultiply failed (CUDPPResult = %d)\n",
                (int) result);
        exit(1);
    }
    /* cudaThreadSynchronize matches the CUDA 2.3 toolkit named in the
       post; newer toolkits call this cudaDeviceSynchronize. */
    checkCuda(cudaThreadSynchronize(), "warm-up run");

    double t0 = gettime();
    for (iter = 0; iter < niter; iter++) {
        zero<<<14*6,128>>>(n, y_g);
        cudppSparseMatrixVectorMultiply(sparseMatrixHandle, y_g, x_g);
    }
    /* Synchronize before stopping the clock: launches are asynchronous. */
    checkCuda(cudaThreadSynchronize(), "timed runs");
    double t1 = gettime();
    printf(" flops = %.3e, time per iteration = %.3fms\n",
           2.0*nnz*niter/(t1-t0), (t1-t0)/niter*1e3);

    checkCuda(cudaMemcpy(y2, y_g, sizeof(float)*n, cudaMemcpyDeviceToHost),
              "copy of y to the host");
    /* Exact comparison is intended: x holds only 0/1 values and sprand in
       this file stores only 1.0, so every row sum is a small integer. */
    for (i = 0; i < n; i++)
        if (y1[i] != y2[i])
            printf("Error: y1(%d)=%15.5e y2(%d)=%15.5e\n", i, y1[i], i, y2[i]);

    cudppDestroySparseMatrix(sparseMatrixHandle);
    cudaFree(y_g);
    cudaFree(x_g);
    free(y2);
    free(y1);
    free(x);
    free(A);
    free(cidx);
    free(first);
    return 0;
}[/codebox]
I tried to build and run it on Ubuntu 9.10 32-bit with Code::Blocks 8.02, a GeForce 8400GS, the 190.53 drivers and SDK 2.3.
I also tried to build and run the code on an HP Z800 workstation with a Tesla C1060 and the same setup as described above (this time, the 64-bit version).
Both systems seem to be correctly configured, since I have tested the SDK demos and other programs I wrote myself, and everything has always worked (including algorithm results and MEX file creation to use CUDA with MATLAB).
The build went fine, including the linking step. But when I run the code, I get the following output (on both the 32-bit and 64-bit workstations):
====================================
(CPU) flops = 5.276e+08, time per iteration = 3.791ms
Error creating Sparse matrix object
====================================
and the program terminates.
Can anyone help me?
Thanks in advance.