Matrix-Matrix Product (CSR Format)

Hi mates,

I'm developing an algorithm to compute the matrix-matrix product of sparse matrices. I have to use the CSR format (it's a specification requirement), and I'm having trouble with one kernel. The kernel works fine for small matrices (up to 80k * 80k), giving me the correct values for the multiplication, but for larger matrices it freezes my machine and I have to restart every time.

After some tries I printed the memory usage, and I'm not running out of memory: for the examples that freeze my machine, at the moment the kernel executes I still have 230 MB free on my graphics card, and of course there is no problem with CPU memory.

Without going into the whole algorithm, the kernel is the following:

__global__ void calculateMatrixProduct(size_t* a_indices, size_t* a_ptr, float* a_data,
		size_t* b_indices, size_t* b_ptr, float* b_data,
		size_t* c_indices, size_t* c_ptr, float* c_data, size_t numRows){

	size_t id = GlobalIdx();

	if(id < numRows){
		float acum;
		//One thread per row of C: fill every stored entry of that row
		for(size_t c_row = c_ptr[id]; c_row < c_ptr[id+1]; c_row++){
			acum = 0;
			size_t col = c_indices[c_row];
			//Walk row 'id' of A and, for each A(id,k), scan row k of B for column 'col'
			for(size_t a_row = a_ptr[id]; a_row < a_ptr[id+1]; a_row++){
				for(size_t b_col = b_ptr[a_indices[a_row]]; b_col < b_ptr[a_indices[a_row]+1]; b_col++){
					if(b_indices[b_col] == col){
						acum += a_data[a_row] * b_data[b_col];
					}
				}
			}
			c_data[c_row] = acum;
		}
	}
}
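(I have left out GlobalIdx(); it is only the usual global thread index for a 1D launch, so assume something like this:)

__device__ size_t GlobalIdx(){
	//Global thread index for a 1D grid of 1D blocks
	return (size_t)blockIdx.x * blockDim.x + threadIdx.x;
}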

and the function from which I call this kernel is this one (also fairly simple):

void reconstructMatrix_GPU(const SparseMatrix_f& A, const SparseMatrix_f& B, SparseMatrix_f& C){

	size_t* a_indices;
	size_t* a_ptr;
	size_t* b_indices;
	size_t* b_ptr;
	size_t* c_indices;
	size_t* c_ptr;
	float* a_data;
	float* b_data;
	float* c_data;
	size_t free, total;

	//Allocate device resources
	CUDA_CHECK(cudaMalloc((void**) &a_indices, sizeof(size_t) * A.matrixNNZ));
	CUDA_CHECK(cudaMalloc((void**) &a_ptr, sizeof(size_t) * (A.matrixRows+1)));
	CUDA_CHECK(cudaMalloc((void**) &a_data, sizeof(float) * (A.matrixNNZ)));
	CUDA_CHECK(cudaMalloc((void**) &b_indices, sizeof(size_t) * B.matrixNNZ));
	CUDA_CHECK(cudaMalloc((void**) &b_ptr, sizeof(size_t) * (B.matrixRows+1)));
	CUDA_CHECK(cudaMalloc((void**) &b_data, sizeof(float) * (B.matrixNNZ)));
	CUDA_CHECK(cudaMalloc((void**) &c_indices, sizeof(size_t) * C.matrixNNZ));
	CUDA_CHECK(cudaMalloc((void**) &c_ptr, sizeof(size_t) * (C.matrixRows+1)));
	CUDA_CHECK(cudaMalloc((void**) &c_data, sizeof(float) * (C.matrixNNZ)));

	cuMemGetInfo(&free, &total);
	cout << "Memory available for the device: free(" << (((double)free)/1024)/1024 << " Mb), total(" << (((double)total)/1024)/1024 << " Mb)" << endl;

	//Copy data from host to device
	CUDA_CHECK(cudaMemcpy(a_indices, &A.indices[0], sizeof(size_t) * A.matrixNNZ, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(a_ptr, &A.ptr[0], sizeof(size_t) * (A.matrixRows+1), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(a_data, &A.data[0], sizeof(float) * (A.matrixNNZ), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(b_indices, &B.indices[0], sizeof(size_t) * B.matrixNNZ, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(b_ptr, &B.ptr[0], sizeof(size_t) * (B.matrixRows+1), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(b_data, &B.data[0], sizeof(float) * (B.matrixNNZ), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(c_indices, &C.indices[0], sizeof(size_t) * C.matrixNNZ, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(c_ptr, &C.ptr[0], sizeof(size_t) * (C.matrixRows+1), cudaMemcpyHostToDevice));

	cuMemGetInfo(&free, &total);
	cout << "Memory available for the device: free(" << (((double)free)/1024)/1024 << " Mb), total(" << (((double)total)/1024)/1024 << " Mb)" << endl;

	//Dimension: one thread per row of C
	dim3 Grid = Build_Grid(C.matrixRows, BLOCK_SIZE);

	//Kernel invocation
	calculateMatrixProduct <<<Grid, BLOCK_SIZE>>> (a_indices, a_ptr, a_data, b_indices, b_ptr, b_data, c_indices, c_ptr, c_data, C.matrixRows);

	//Copy results back to host
	CUDA_CHECK(cudaMemcpy(&C.data[0], c_data, sizeof(float) * C.matrixNNZ, cudaMemcpyDeviceToHost));

	//Free resources
	CUDA_CHECK(cudaFree(a_indices));
	CUDA_CHECK(cudaFree(a_ptr));
	CUDA_CHECK(cudaFree(a_data));
	CUDA_CHECK(cudaFree(b_indices));
	CUDA_CHECK(cudaFree(b_ptr));
	CUDA_CHECK(cudaFree(b_data));
	CUDA_CHECK(cudaFree(c_indices));
	CUDA_CHECK(cudaFree(c_ptr));
	CUDA_CHECK(cudaFree(c_data));
}
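(CUDA_CHECK and Build_Grid are just small helpers and not the interesting part; assume something along these lines:)

//Simple error-checking wrapper around CUDA runtime calls (needs <iostream> and <cstdlib>)
#define CUDA_CHECK(call)                                               \
	do {                                                               \
		cudaError_t err = (call);                                      \
		if (err != cudaSuccess) {                                      \
			cout << "CUDA error: " << cudaGetErrorString(err) << endl; \
			exit(EXIT_FAILURE);                                        \
		}                                                              \
	} while (0)

//Enough 1D blocks of blockSize threads to cover 'elements' work items
dim3 Build_Grid(size_t elements, size_t blockSize){
	return dim3((unsigned int)((elements + blockSize - 1) / blockSize));
}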

The last thing to show is the SparseMatrix struct; it is a simple one:

typedef struct {
	size_t matrixRows;
	size_t matrixCols;
	size_t matrixNNZ;
	size_t* ptr;
	vector<size_t> indices;
	float* data;
} SparseMatrix_f;
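To be explicit about the layout: ptr has matrixRows+1 entries, indices and data have matrixNNZ entries, and row i occupies positions ptr[i] to ptr[i+1]-1. A tiny hand-made example (not my real data) would look like this:

//3x3 example: row 0 = {10, 0, 20}, row 1 = all zeros, row 2 = {0, 30, 40}
SparseMatrix_f M;
M.matrixRows = 3;
M.matrixCols = 3;
M.matrixNNZ  = 4;
size_t ptr_ex[]  = {0, 2, 2, 4};          //row i spans [ptr[i], ptr[i+1])
float  data_ex[] = {10.f, 20.f, 30.f, 40.f};
M.ptr     = ptr_ex;                       //must outlive M in real code
M.indices = {0, 2, 1, 2};                 //column index of each stored value
M.data    = data_ex;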

Just a note: at the moment the function is called, matrix C (which must hold the product of matrices A and B) already has the indices of all its non-zero elements in its structure; all that remains is to compute the correct value for each of them.
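In other words, the numeric phase the kernel has to perform is equivalent to this sequential CPU loop (just a reference sketch of the intended result, not my actual code):

//Sequential reference: fill C.data given the already-computed C.ptr / C.indices
for(size_t row = 0; row < C.matrixRows; row++){
	for(size_t c_pos = C.ptr[row]; c_pos < C.ptr[row+1]; c_pos++){
		size_t col = C.indices[c_pos];
		float acum = 0;
		//Walk row 'row' of A; for each A(row,k), look for B(k,col)
		for(size_t a_pos = A.ptr[row]; a_pos < A.ptr[row+1]; a_pos++){
			size_t k = A.indices[a_pos];
			for(size_t b_pos = B.ptr[k]; b_pos < B.ptr[k+1]; b_pos++){
				if(B.indices[b_pos] == col)
					acum += A.data[a_pos] * B.data[b_pos];
			}
		}
		C.data[c_pos] = acum;
	}
}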

Thanks in advance for any help.