Stackoverflow pointed out the solution http://stackoverflow.com/questions/24932784/cusparse-illegal-memory-access-unless-i-increase-the-sparsity-of-the-sparse-matr/24938396#24938396
I am trying to make an existing piece of software that uses hand tuned sparse multiplication of special CSC matrices that have exactly k nonzero elements per column. I decided to use cusparse for the job, but unfortunately I encounter that the matrix multiplication takes over 7 seconds in some cases, which is much slower than the CPU version of the code. (largest sparse matrix concerned is 19871x1000 largest dense matrix concerned is 1000*150, nnz = 101000).
When trying to reproduce the problem in a self contained example, I always encounter an “illegal memory access error” when nnz != sparse_cols.
After some investigation turns out that if I increase the size of matrices 10fold the problem disappears. If I make the matrices small enough I don’t experience crashes. However with large matrices the sparse matrix has to not cross over some degree of denseness, otherwise multiplication produces a bunch of illegal memory accesses.
#include <cuda.h>
#include <cusparse.h>
#include <iostream>
#include <stdlib.h>
#define CALL_CUDA( err ) \
{ if (err != cudaSuccess) \
{std::cout<<"cuda Error "<< cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<<__LINE__<<"\n"; exit(EXIT_FAILURE); }\
}
int main(){
//cusparse status and handle
cusparseStatus_t status;
cusparseHandle_t handle = 0;
status = cusparseCreate(&handle);
if (status != CUSPARSE_STATUS_SUCCESS){
std::cout << "Error creating handle: " << status << std::endl;
}
//Set matrix description
cusparseMatDescr_t descr; //Describe the matrices
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
//Sparse matrix properties
int sparse_rows = 19871;
int sparse_cols = 1000;
int nnz_new = 101000;
//int nnz_new = 1000; //Works with that value
//Dense matrix properties
int bmat_rows = 1000;
int bmat_cols = 150;
//Generate a special type of sparse matrix that has exactly k nonzero elements in each column in CSC format
float * amat_vals;
CALL_CUDA(cudaMallocHost((void **)&amat_vals, nnz_new*sizeof(float)));
int * amat_idx;
CALL_CUDA(cudaMallocHost((void **)&amat_idx, nnz_new*sizeof(int)));
int * crccolptr;
CALL_CUDA(cudaMallocHost((void **)&crccolptr, (sparse_cols+1)*sizeof(int)));
//Fill in values with random values
for (int i = 0; i < nnz_new; i++){
amat_vals[i] = (float)rand()/(float)RAND_MAX;
}
//Generate indexes for those rows
for (int i = 0; i < nnz_new; i++){
amat_idx[i] = rand() % (sparse_rows - 1);
}
//generate crccolptr
int k = (int)(nnz_new/sparse_cols); //Number of elements per row
for (int i = 0; i < sparse_cols; i++){
crccolptr[i] = k*i;
}
crccolptr[sparse_cols] = nnz_new;
//Generate bmat_array with random floats
float * bmat_array;
CALL_CUDA(cudaMallocHost((void **)&bmat_array, bmat_rows*bmat_cols*sizeof(float)));
for (int i = 0; i < bmat_rows*bmat_cols; i++){
bmat_array[i] = (float)rand()/(float)RAND_MAX;
}
//generate array for output
float * outmatrix_test;
CALL_CUDA(cudaMallocHost((void **)&outmatrix_test, sparse_rows*bmat_cols*sizeof(float)));
//Allocate and copy device memory for sparse matrix
float * cudavals;
int * colIdx;
int * colPtr;
CALL_CUDA(cudaMalloc((void **)&colPtr, (sparse_cols + 1)*sizeof(int)));
CALL_CUDA(cudaMemcpy(colPtr, crccolptr, (sparse_cols + 1)*sizeof(int), cudaMemcpyHostToDevice));
CALL_CUDA(cudaMalloc((void **)&cudavals, nnz_new*sizeof(float)));
CALL_CUDA(cudaMalloc((void **)&colIdx, nnz_new*sizeof(int)));
CALL_CUDA(cudaMemcpy(cudavals, amat_vals, nnz_new*sizeof(float), cudaMemcpyHostToDevice));
CALL_CUDA(cudaMemcpy(colIdx, amat_idx, nnz_new*sizeof(int), cudaMemcpyHostToDevice));
//Allocate and copy device memory for dense matrix
float * B_gpumatrix;
CALL_CUDA(cudaMalloc((void **)&B_gpumatrix, bmat_rows*bmat_cols*sizeof(float)));
CALL_CUDA(cudaMemcpy(B_gpumatrix, bmat_array, bmat_rows*bmat_cols*sizeof(float), cudaMemcpyHostToDevice));
//Allocate output matrix
float * outmatrix_gpu;
CALL_CUDA(cudaMalloc((void **)&outmatrix_gpu, (sparse_rows*bmat_cols)*sizeof(float)));
//sparse_cols is passed as sparse_rows, because we're multiplying a CSC matrix instead of a CSR so we need
// to transpose it and invert the rows and columns.
const float alpha = 1.0;
const float beta = 0.0;
/*
float * outmatrix_gpu2;
CALL_CUDA(cudaMalloc((void **)&outmatrix_gpu2, (sparse_rows*sparse_cols)*sizeof(float)));
cusparseStatus_t mat_mul = cusparseScsc2dense(handle, sparse_rows, sparse_cols, descr, cudavals, colIdx, colPtr, outmatrix_gpu2, sparse_rows);
float * outmatrix_test2;
CALL_CUDA(cudaMallocHost((void **)&outmatrix_test2, sparse_rows*sparse_cols*sizeof(float)));
CALL_CUDA(cudaMemcpy(outmatrix_test2, outmatrix_gpu2, (sparse_rows*sparse_cols)*sizeof(float), cudaMemcpyDeviceToHost));
*/
cusparseStatus_t mat_mul = cusparseScsrmm(handle, //Cusparse handle
CUSPARSE_OPERATION_TRANSPOSE, //Transposing the matrix
sparse_cols, //Number of sparse rows. Since we're using CSC matrix it's the columns.
bmat_cols, //Number of columns of the dense matrix
sparse_rows, //Number of sparse cols, Since we're using CSC matrix it's the rows
nnz_new, //Non zero elements
&alpha, //Pointer to alpha (1.0)
descr, //Description of the matrix
cudavals, //The values vector
colPtr, //The column pointer
colIdx, //The indexes of the sparse matrix
B_gpumatrix, //Dense matrix array
bmat_rows, //ldb - the rows of the dense matrix
&beta, //Pointer to beta. It's 0
outmatrix_gpu, //Pointer to the output matrix
sparse_rows); //ldc - leading dimensions of the output matrix.
if (mat_mul != CUSPARSE_STATUS_SUCCESS){
std::cout << "MULTIPLICATION ERROR: " << mat_mul << std::endl;
}
cudaThreadSynchronize(); //Syncs before copy back to memory should not be necessary
cudaDeviceSynchronize();
//Copy matrix back to host
CALL_CUDA(cudaMemcpy(outmatrix_test, outmatrix_gpu, (sparse_rows*bmat_cols)*sizeof(float), cudaMemcpyDeviceToHost));
CALL_CUDA(cudaFree(outmatrix_gpu));
CALL_CUDA(cudaFree(cudavals));
CALL_CUDA(cudaFree(colPtr));
CALL_CUDA(cudaFree(colIdx));
CALL_CUDA(cudaFree(B_gpumatrix));
CALL_CUDA(cudaFreeHost(crccolptr));
CALL_CUDA(cudaFreeHost(amat_vals));
CALL_CUDA(cudaFreeHost(amat_idx));
CALL_CUDA(cudaFreeHost(bmat_array));
CALL_CUDA(cudaFreeHost(outmatrix_test));
return 1;
}
I believe i am generating a valid sparse matrix, because I can convert it to dense one using the appropriate cusparse function (lines 105 - 112) without triggering any invalid memory accesses.
If I uncomment line 32 and comment line 33 the code runs just fine. The block between 105 and 112 runs fine regardless of nnz.
When running the above code under cuda-memcheck you can see many illegal accesses from within the cusparseScsrmm. Running without cuda-memcheck you would see an error in the first cuda operation after the matrix multiplication.
Any ideas what I am doing wrong? I hope that if I can solve this problem, I would be able to diagnoze (or at least isolate) a self contained example that exhibits the painfully slow matrix multiplications.
EDIT:
Using smaller matrices I don’t experience the problem.
sparse matrix with 50200 works fine for NNZ until about 1000, but takes forever with NNZ = 5000 (I killed it after half a minute). Increasing matrix size to 200500 works performs instantaneously with NNZ = 5000… Strange
EDIT2:
Basically if I increase the size of the sparse and dense matrices by 10 times the problem disappears. I appear to have a problem multiplying large, relatively densely populated sparse matrices with large dense matrices.