I am relatively new to CUDA, and have been trying to get to grips with the cuSPARSE library for the problem I am working on. My sparse matrix-vector multiplication (cusparseScsrmv) returns with no errors, but the subsequent cudaMemcpy fails with error code 77.
Having dug into this using cuda-memcheck, I have found that there is an “Address out of bounds” error occurring within cusparseScsrmv, but I can’t work out where I am going wrong with it. I may have got something wrong in the CSR specification, but can’t see where.
A simplified version of the problem I am trying to solve is included below, and gives the same error that I am seeing in my actual code. Any suggestions/corrections would be gratefully received!
// includes, system
#include <iostream>
// includes, project
#include <cuda_runtime.h>
#include "cusparse_v2.h"
using namespace std;
int main()
{
// Define sparse matrix and dense vector, and expected output:
// | 0 2 0 0 | * 10 = 16
// | 0 0 0 1 | 8 4
// | 7 0 0 0 | 6 70
// | 0 6 5 0 | 4 78
// | 1 0 0 0 | 10
// | 0 9 0 2 | 80
// Initialise variables on host
const int nnz = 8;
const int m = 6;
const int n = 4;
float alpha = 1.0;
float beta = 0.0;
float csrVal[nnz] = { 2, 1, 7, 6, 5, 1, 9, 2 };
float csrColInd[nnz] = { 1, 3, 0, 1, 2, 0, 1, 3};
float csrRowPtr[m + 1] = { 0, 1, 2, 3, 5, 6, 8 };
float x[n] = { 10, 8, 6, 4 };
float y[m];
// Initialise device variables
float * x_GPU;
float * csrVal_GPU;
int * csrColInd_GPU;
int * csrRowPtr_GPU;
float * y_GPU;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
cout << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" << endl;
return 1;
}
// Allocate memory
if (cudaMalloc((void**)&csrVal_GPU, sizeof(float)*nnz) != cudaSuccess) {
cout << "Error allocating memory on GPU" << endl;
return 1;
}
if (cudaMalloc((void**)&csrColInd_GPU, sizeof(int)*nnz) != cudaSuccess) {
cout << "Error allocating memory on GPU" << endl;
return 1;
}
if (cudaMalloc((void**)&csrRowPtr_GPU, sizeof(int)*(m+1)) != cudaSuccess) {
cout << "Error allocating memory on GPU" << endl;
return 1;
}
if (cudaMalloc((void**)&x_GPU, sizeof(float)*(n)) != cudaSuccess) {
cout << "Error allocating memory on GPU" << endl;
return 1;
}
if (cudaMalloc((void**)&y_GPU, sizeof(float)*(m)) != cudaSuccess) {
cout << "Error allocating memory on GPU" << endl;
return 1;
}
// Copy data
if (cudaMemcpy(csrVal_GPU, csrVal, sizeof(float)*nnz, cudaMemcpyHostToDevice) != cudaSuccess) {
cout << "Error copying data to GPU (csrVal)" << endl;
return 1;
}
if (cudaMemcpy(csrColInd_GPU, csrColInd, sizeof(float)*nnz, cudaMemcpyHostToDevice) != cudaSuccess) {
cout << "Error copying data to GPU (csrColInd)" << endl;
return 1;
}
if (cudaMemcpy(csrRowPtr_GPU, csrRowPtr, sizeof(float)*(m+1), cudaMemcpyHostToDevice) != cudaSuccess) {
cout << "Error copying data to GPU (csrVal)" << endl;
return 1;
}
if (cudaMemcpy(x_GPU, x, sizeof(float)*n, cudaMemcpyHostToDevice) != cudaSuccess) {
cout << "Error copying data to GPU (x)" << endl;
return 1;
}
// Build cuSparse components
cusparseStatus_t status;
cusparseHandle_t h_sparseMult = 0;
cusparseMatDescr_t sparseDescr = 0;
status = cusparseCreate(&h_sparseMult);
if (status != CUSPARSE_STATUS_SUCCESS) {
cout << "CUSPARSE error: Library initialisation failed" << endl;
return 1;
}
status = cusparseCreateMatDescr(&sparseDescr);
if (status != CUSPARSE_STATUS_SUCCESS) {
cout << "CUSPARSE error: Sparse matrix description initialisation failed" << endl;
return 1;
}
cusparseSetMatType(sparseDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(sparseDescr, CUSPARSE_INDEX_BASE_ZERO);
// Try sparse multiplication
cusparseStatus_t stat1;
stat1 = cusparseScsrmv(h_sparseMult, CUSPARSE_OPERATION_NON_TRANSPOSE,
m, n, nnz, &alpha, sparseDescr,
csrVal_GPU, csrRowPtr_GPU, csrColInd_GPU,
x_GPU, &beta, y_GPU);
if (stat1 != CUSPARSE_STATUS_SUCCESS) {
cout << "cudaSparse error: Failed in cusparseScsrmv" << endl;
return 1;
}
// Copy output data back to the host
cudaStatus = cudaMemcpy(y, y_GPU, sizeof(float)*n, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
cout << "CUDA error code from cusparseScsrmv: " << stat1 << endl;
cout << "CUDA error code from cudaMemcpy: " << cudaStatus << endl;
cout << "Error copying data back from GPU (y)" << endl;
return 1;
}
// Write output data back to console to check results
cout << "Results of sparse multiplication: " << endl;
for (int ii = 0; ii < n; ii++) {
cout << y[ii] << endl;
}
return 0;
}