Problems allocating sparse Matrix

Hi, I’m using CUDA for tomography purposes so I need using sparse matrix to avoid out of memory errors (tomography data need very lot of memory for 512x512 images, e.g. using full matrices it would be needed 4GB of video memory just to store initial data!).

I’ve found by google (http://www.nada.kth.se/~tomaso/gpu08/sptest.cu) a code to test cudpp library and sparse matrix, that is what I need:

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <sys/time.h>

#include “cudpp.h”

global static void zero(int n,float v) {

const int pid = threadIdx.x+blockIdx.x*blockDim.x;

const int np = blockDim.x*gridDim.x;

int i;

for(i = pid; i<n; i+=np) v[i] = 0.0;

}

void spmul(int nrows,int ncols,int first,int cidx,float A,

   float x[],float y[]) {

int i,j;

for(i = 0; i<nrows; i++) {

y[i] = 0;

for(j = first[i]; j<first[i+1]; j++)

  y[i] = y[i] + A[j]*x[cidx[j]];

}

}

void sprand(int nrows,int ncols,int nnz,int first,

    int cidx[],float A[]) {

int i,j,k,n;

double r;

n = nrows*ncols;

k = 0;

for(i = 0; i<nrows; i++) {

first[i] = k;

for(j = 0; j<ncols; j++) {

  r = rand() / (double) RAND_MAX;

  if(r*(n-(i*ncols+j)) < (nnz-k)) {

cidx[k] = j;

A[k] = 1.0;

k = k+1;

  }

}

}

first[nrows] = k;

}

double gettime() {

struct timeval tv;

gettimeofday(&tv,NULL);

return tv.tv_sec + 1e-6*tv.tv_usec;

}

int main(void) {

int n = 10000,nnz = n*n/100;

int *first,*cidx;

float *A,*x,*y1,*y2,*x_g,*y_g;

int i,iter,niter = 100;

first = (int *) malloc(sizeof(int) * (n+1));

cidx = (int *) malloc(sizeof(int) * nnz);

A = (float *) malloc(sizeof(float) * nnz);

sprand(n,n,nnz,first,cidx,A);

x = (float *) malloc(sizeof(float) * n);

y1 = (float *) malloc(sizeof(float) * n);

y2 = (float *) malloc(sizeof(float) * n);

for(i = 0; i<n; i++) {

x[i] = (rand() / (float) RAND_MAX > 0.5);

}

double t = gettime();

for(iter = 0; iter<5; iter++)

spmul(n,n,first,cidx,A,x,y1);

t = gettime() - t;

printf("(CPU) flops = %.3e, time per iteration = %.3fms\n",

 2.0*nnz*5/t,t/5*1e3);

CUDPPConfiguration config;

config.datatype = CUDPP_FLOAT;

config.options = (CUDPPOption)0;

config.algorithm = CUDPP_SPMVMULT;

CUDPPHandle sparseMatrixHandle;

CUDPPResult result = CUDPP_SUCCESS;

result = cudppSparseMatrix(&sparseMatrixHandle, config, nnz, n,

		     (void *) A, (unsigned int *) first,

		     (unsigned int *) cidx);

if (result != CUDPP_SUCCESS) {

fprintf(stderr, "Error creating Sparse matrix object\n");

return 1;

}

cudaMalloc((void **) &x_g,sizeof(float)*n);

cudaMalloc((void **) &y_g,sizeof(float)*n);

cudaMemcpy(x_g,x,sizeof(float)*n,cudaMemcpyHostToDevice);

// Run it once to avoid timing startup overhead

zero<<<14*6,128>>>(n,y_g);

cudppSparseMatrixVectorMultiply(sparseMatrixHandle, y_g, x_g);

cudaThreadSynchronize();

double t0 = gettime();

for(iter = 0; iter<niter; iter++) {

zero<<<14*6,128>>>(n,y_g);

cudppSparseMatrixVectorMultiply(sparseMatrixHandle, y_g, x_g);

}

cudaThreadSynchronize();

double t1 = gettime();

printf(" flops = %.3e, time per iteration = %.3fms\n",

 2.0*nnz*niter/(t1-t0),(t1-t0)/niter*1e3);

cudaMemcpy(y2,y_g,sizeof(float)*n,cudaMemcpyDeviceToHost);

for(i = 0; i<n; i++)

if(y1[i] != y2[i])

  printf("Error: y1(%d)=%15.5e  y2(%d)=%15.5e\n",i,y1[i],i,y2[i]);

cudaFree(y_g);

cudaFree(x_g);

free(y2);

free(y1);

free(x);

free(A);

free(cidx);

free(first);

return 0;

}[/codebox]

I tried to build and run it with Ubuntu 9.10 32bit, CodeBlock 8.02, Geforce 8400GS, 190.53 drivers and SDK 2.3.

Moreover, I tried to build and run the code on a HP Z800 workstation with a TESLA c1060 and the same feature described above (this time, 64bit version).

Systems seem to be well configured, since I tested the SDK demos and other programs written by myself and everything has always gone well (including algorithm results and MEX file creation to use CUDA with MATLAB)

Building process has gone well, included linking step. But, when I run the code I get the following message (both on 32 and 64 bit workstations)

====================================

(CPU) flops = 5.276e+08, time per iteration = 3.791ms

Error creating Sparse matrix object

====================================

and the program terminate.

Can anyone help me?

Thanks in advance.

CUDPP is superb in many ways, but it’s SpMV perf isn’t. Have you considered these ones (if you google long enough, then you’ll find the associated source code). Matlab interfacing is not their strong point though…

@INPROCEEDINGS{Bell:2009:ISM,
author = {Nathan Bell and Michael Garland},
title = {Implementing sparse matrix-vector multiplication on throughput-oriented processors},
booktitle = {SC '09: Proceedings of the 2009 ACM/IEEE conference on Supercomputing},
year = {2009},
month = nov,
note = {Article No. 18},
doi = {10.1145/1654059.1654078}
}

@TECHREPORT{Bell:2008:ESM,
author = {Nathan Bell and Michael Garland},
title = {Efficient Sparse Matrix-Vector Multiplication using {CUDA}},
institution = {NVIDIA},
year = {2008},
number = {NVR-2008-4},
month = dec
}

@ARTICLE{Buatois:CNC:2009,
author = {Luc Buatois and Guillaume Caumon and Bruno L{'e}vy},
title = {Concurrent number cruncher – {A} {GPU} implementation of a general sparse linear solver},
journal = {International Journal of Parallel, Emergent and Distributed Systems},
year = {2009},
volume = {24},
pages = {205–223},
number = {9},
month = jun,
doi = {10.1080/17445760802337010}
}

@INPROCEEDINGS{Buatois:CNC:2007,
author = {Luc Buatois and Guillaume Caumon and Bruno L{'e}vy},
title = {Concurrent Number Cruncher: An Efficient Sparse Linear Solver on the {GPU}},
booktitle = {High Performance Computing and Communications},
year = {2007},
volume = {4782},
series = {Lecture Notes In Computer Science},
pages = {358–371},
month = sep,
publisher = {Springer},
doi = {10.1007/978-3-540-75444-2_37}
}

@TECHREPORT{Baskaran:2008:OSM,
author = {Muthu Manikandan Baskaran and Rajesh Bordawekar},
title = {Optimizing Sparse Matrix-Vector Multiplication on {GPUs}},
institution = {IBM Research},
year = {2008},
number = {RC24704},
month = dec
}

I followed your indication: I downloaded and compiled the IBM framework for sparse matrices.

I added this code:

[codebox]//FUNCTION ADDED TO READ A SPARSE MATRIX FROM indexes AND values ARRAYS

void createSparseMatrixFromData(SpMatrix m, unsigned int rIndexes, unsigned int* cIndexes, float* values, int sizeM, int sizeN, int nnz, int format)

{

//use standard C indexes in input indexes arrays!

m->numRows = sizeM;

m->numCols = sizeN;

m->numNZEntries = nnz;

m->nzentries = (NZEntry *) malloc(sizeof(NZEntry) * m->numNZEntries);

m->rowPtrs = (unsigned int *) malloc(sizeof(unsigned int) * m->numRows);

m->colPtrs = (unsigned int *) malloc(sizeof(unsigned int) * m->numCols);

NZEntry e;

unsigned int i;

for (i = 0; i < m->numNZEntries; i++)

{

    e.rowNum = rIndexes[i];

	e.colNum = cIndexes[i];

	e.val = values[i];

    (m->nzentries)[i]= e;

}

// sort into row-major order or column major order based on the format

if (format == 0) // ROW-MAJOR

    qsort(m->nzentries, m->numNZEntries, sizeof(NZEntry), cmpRow);

else if (format == 1) // COLUMN-MAJOR

{    qsort(m->nzentries, m->numNZEntries, sizeof(NZEntry), cmpCol);

}else { }

if (format == 0) {

	// set index of first elt in each row

	// relies on at least one item in each row

 	m->rowPtrs[0]=0;

	unsigned int row, prevrow=0;

    for (i = 1; i < m->numNZEntries; i++) {

        row = (m->nzentries)[i].rowNum;

        if (row != prevrow) {

	prevrow = row;

	m->rowPtrs[prevrow]=i;

    }

    }

}

if (format == 1) {

	// set index of first elt in each col

	// relies on at least one item in each col

 	m->colPtrs[0]=0;

	unsigned int col, prevcol=0;

    for (i = 1; i < m->numNZEntries; i++) {

        col = (m->nzentries)[i].colNum;

        if (col != prevcol) {

	prevcol = col;

	m->colPtrs[prevcol]=i;

    }

    }

}

}

[/codebox]

to be able creating a sparse matrix from three data arrays (rowIndexes, columnIndexes and matrixData). The code is a trivial modification of the original readSparseMatrix implemented in the library so I didn’t do any particular operation, just fill the sparse structure in a different way with respect to readSparseMatrix.

I also added few lines in comp_SpMV.cu file, to test the new method for the creation of the matrix

[codebox]

unsigned int *rIndexes, *cIndexes;

float *values;

unsigned int cost = 5;

unsigned int nnz = 10;

int sizeM =10;

int sizeN =10;

if(argc >1) //use MATRIX FILE INPUT, default in comp_SpMV.cu

{

	strcpy(spmfileName,argv[1]);

	if ((f = fopen(spmfileName, "r")) == NULL) {

	printf("Non-existent input matrix file\n");

		exit(-1);

	}

	else { fclose(f); }

	readSparseMatrix(&m, spmfileName, 0);

}else //use DATA ARRAYS INPUT

{

	rIndexes = (unsigned int*)malloc(sizeof(unsigned int)*nnz);

	cIndexes = (unsigned int*)malloc(sizeof(unsigned int)*nnz);

	values = (float*)malloc(sizeof(float)*nnz);

	for(unsigned int i = 0; i < nnz; i++)

	{

		rIndexes[i] = (unsigned int)rand()%sizeM;

		cIndexes[i] = (unsigned int)rand()%sizeN;

		values[i] = (unsigned int)rand()%100;

	}

	//print matrix

	printSparse(values, rIndexes, cIndexes, sizeM, sizeN, nnz);

	createSparseMatrixFromData(&m, rIndexes, cIndexes, values, sizeM, sizeN, nnz, 0);

}[/codebox]

Finally, I created a 1’s array changing the 80th row in the same original file

[codebox] h_y[i] = 1.0;//default is ------->rand() / (float)RAND_MAX;

[/codebox]

In this way I can manually check results.

Unfortunately I continue obtaining strange results (remember that this matrix is multiplied with 1-padded array).

This is an example of computation:

============================================

M: rows x columns x non-zero

M: 10 x 10 x 10

0.00 0.00 68.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 85.00 87.00

0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 44.00

0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

0.00 0.00 55.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

0.00 0.00 0.00 0.00 0.00 86.00 0.00 0.00 0.00 62.00

0.00 0.00 0.00 0.00 42.00 0.00 0.00 0.00 0.00 0.00

0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

0.00 0.00 0.00 0.00 83.00 0.00 0.00 0.00 0.00 0.00

GPU (ms) CPU (ms) GFLOPS

0.028300 0.003000 0.000707

Test PASSED

y[0]: 1.000000 ref[0]: 68.000000 x[0]: 68.000000

y[1]: 1.000000 ref[1]: 172.000000 x[1]: 172.000000

y[2]: 1.000000 ref[2]: 0.000000 x[2]: 0.000000

y[3]: 1.000000 ref[3]: 0.000000 x[3]: 0.000000

y[4]: 1.000000 ref[4]: 284.000000 x[4]: 284.000000

y[5]: 1.000000 ref[5]: 55.000000 x[5]: 55.000000

y[6]: 1.000000 ref[6]: 148.000000 x[6]: 148.000000

y[7]: 1.000000 ref[7]: 0.000000 x[7]: 0.000000

y[8]: 1.000000 ref[8]: 604.000000 x[8]: 604.000000

y[9]: 1.000000 ref[9]: 83.000000 x[9]: 83.000000

========================================

Any suggestion or example-code to use sparse matrix?

GIOVANNI

I discovered there is a bug in the IBM library and I will write a mail to the authors as soon as possible.

Strangely the library failed when I used a sparse matrix like this:

[font=“Courier New”]

4 4 3 <---------------- FIRST ROW OF THE FILE: number of rows, number of columns, non zero elements

1 1 1 <---------------- OTHERS ROWS OF THE FILE: index of row, index of column, element’s value

2 2 1

3 3 1

[/font]

More in general library fails when there is at least one row in the matrix without elements. If anyone need this library for tomography applications, takes this into account!

You can use this code to create matrix from data input array:

[codebox]//FUNCTION ADDED TO READ A SPARSE MATRIX FROM indexes AND values ARRAYS

void createSparseMatrixFromData(SpMatrix m, unsigned int rIndexes, unsigned int* cIndexes, float* values, unsigned int sizeM, unsigned int sizeN, unsigned int nnz, int format)

{

/***************************************************/

/***************************************************/

bool* existsRow = (bool*)calloc(sizeof(bool), sizeM);

//mark all the rows with at least one value

for (unsigned int i = 0; i < nnz; i++)

{	int current = rIndexes[i];

	printf("[%d]\n", current);

	existsRow[current] = true;

}

//count how many rows are without data

int cont = 0;

for(unsigned int i = 0; i < sizeM; i++)

	if(!existsRow[i])

		cont++;

printf("%i rows without values\n", cont);

//adjust the number of non-zero elements

nnz += cont;

/***************************************************/

/***************************************************/

//use standard C indexes for input indexes arrays!

m->numRows = sizeM;

m->numCols = sizeN;

m->numNZEntries = nnz;

          m->nzentries = (NZEntry *) malloc(sizeof(NZEntry) * m->numNZEntries);

          m->rowPtrs = (unsigned int *) malloc(sizeof(unsigned int) * m->numRows);

          m->colPtrs = (unsigned int *) malloc(sizeof(unsigned int) * m->numCols);

NZEntry e;

          unsigned int i;

          for (i = 0; i < m->numNZEntries-cont; i++)

          {

                   e.rowNum = rIndexes[i];

                   e.colNum = cIndexes[i];

                   e.val = values[i];

                   (m->nzentries)[i]= e;

          }

/***************************************************/

/***************************************************/

//when a row doesn't have elements, insert new non-zero element with explicit 0 value

int newElement = nnz-cont;

for(unsigned int index = 0; index < sizeM; index++)

	if(!existsRow[index])

	{	printf("\n Need to change row [%i]", index);

		e.rowNum = index;

		e.colNum = 0;

		e.val = 0;

		(m->nzentries)[newElement]= e;

		++newElement;

		printf(" row [%i] updated\n", index);

	}

/***************************************************/

/***************************************************/

// sort into row-major order or column major order based on the format

if (format == 0) // ROW-MAJOR

    qsort(m->nzentries, m->numNZEntries, sizeof(NZEntry), cmpRow);

else if (format == 1) // COLUMN-MAJOR

{    qsort(m->nzentries, m->numNZEntries, sizeof(NZEntry), cmpCol);

}else { }

if (format == 0) {

	// set index of first elt in each row

	// relies on at least one item in each row

 	m->rowPtrs[0]=0;

	unsigned int row, prevrow=0;

    for (i = 1; i < m->numNZEntries; i++) {

        row = (m->nzentries)[i].rowNum;

        if (row != prevrow) {

	prevrow = row;

	m->rowPtrs[prevrow]=i;

    }

    }

}

if (format == 1) {

	// set index of first elt in each col

	// relies on at least one item in each col

 	m->colPtrs[0]=0;

	unsigned int col, prevcol=0;

    for (i = 1; i < m->numNZEntries; i++) {

        col = (m->nzentries)[i].colNum;

        if (col != prevcol) {

	prevcol = col;

	m->colPtrs[prevcol]=i;

    }

    }

}

}[/codebox]

The code can be obviously optimized but anyway for the moment It is possible to work with the library.