cuSparse multiplication "Address out of bounds" error

I am relatively new to CUDA, and have been trying to get to grips with the cuSparse library for the problem I am working on. I am in the situation where my sparse multiplication cusparseScsrmv exits with no errors, but my subsequent cudaMemcpy exits with error code 77.

Having dug into this using cuda-memcheck, I have found that there is an “Address out of bounds” error occurring within cusparseScsrmv, but I can’t work out where I am going wrong with it. I may have got something wrong in the CSR specification, but can’t see where.

A simplified version of the problem I am trying to solve is included below, and gives the same error that I am seeing in my actual code. Any suggestions/corrections would be gratefully received!

// includes, system
#include <iostream>

// includes, project
#include <cuda_runtime.h>
#include "cusparse_v2.h"

using namespace std;

int main()
{
	// Define sparse matrix and dense vector, and expected output:
	//		| 0	2 0 0 |	* 10  = 16
	//		| 0 0 0 1 |    8     4
	//		| 7 0 0 0 |    6	70
	//		| 0 6 5 0 |    4	78
	//		| 1 0 0 0 |			10
	//		| 0 9 0 2 |			80

	// Initialise variables on host
	const int nnz = 8;
	const int m = 6;
	const int n = 4;
	float alpha = 1.0;
	float beta = 0.0;
    float csrVal[nnz] = { 2, 1, 7, 6, 5, 1, 9, 2 };
	float csrColInd[nnz] = { 1, 3, 0, 1, 2, 0, 1, 3};
	float csrRowPtr[m + 1] = { 0, 1, 2, 3, 5, 6, 8 };
	float x[n] = { 10, 8, 6, 4 };
	float y[m];
    
	// Initialise device variables
	float * x_GPU;
	float * csrVal_GPU;
	int * csrColInd_GPU;
	int * csrRowPtr_GPU;
	float * y_GPU;

	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaError_t cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		cout << "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?" << endl;
		return 1;
	}

    // Allocate memory
	if (cudaMalloc((void**)&csrVal_GPU, sizeof(float)*nnz) != cudaSuccess) {
		cout << "Error allocating memory on GPU" << endl;
		return 1;
	}
	if (cudaMalloc((void**)&csrColInd_GPU, sizeof(int)*nnz) != cudaSuccess) {
		cout << "Error allocating memory on GPU" << endl;
		return 1;
	}
	if (cudaMalloc((void**)&csrRowPtr_GPU, sizeof(int)*(m+1)) != cudaSuccess) {
		cout << "Error allocating memory on GPU" << endl;
		return 1;
	}
	if (cudaMalloc((void**)&x_GPU, sizeof(float)*(n)) != cudaSuccess) {
		cout << "Error allocating memory on GPU" << endl;
		return 1;
	}
	if (cudaMalloc((void**)&y_GPU, sizeof(float)*(m)) != cudaSuccess) {
		cout << "Error allocating memory on GPU" << endl;
		return 1;
	}

	// Copy data
	if (cudaMemcpy(csrVal_GPU, csrVal, sizeof(float)*nnz, cudaMemcpyHostToDevice) != cudaSuccess) {
		cout << "Error copying data to GPU (csrVal)" << endl;
		return 1;
	}
	if (cudaMemcpy(csrColInd_GPU, csrColInd, sizeof(float)*nnz, cudaMemcpyHostToDevice) != cudaSuccess) {
		cout << "Error copying data to GPU (csrColInd)" << endl;
		return 1;
	}
	if (cudaMemcpy(csrRowPtr_GPU, csrRowPtr, sizeof(float)*(m+1), cudaMemcpyHostToDevice) != cudaSuccess) {
		cout << "Error copying data to GPU (csrVal)" << endl;
		return 1;
	}
	if (cudaMemcpy(x_GPU, x, sizeof(float)*n, cudaMemcpyHostToDevice) != cudaSuccess) {
		cout << "Error copying data to GPU (x)" << endl;
		return 1;
	}

	// Build cuSparse components
	cusparseStatus_t status;
	cusparseHandle_t h_sparseMult = 0;
	cusparseMatDescr_t sparseDescr = 0;
	status = cusparseCreate(&h_sparseMult);
	if (status != CUSPARSE_STATUS_SUCCESS) {
		cout << "CUSPARSE error: Library initialisation failed" << endl;
		return 1;
	}
	status = cusparseCreateMatDescr(&sparseDescr);
	if (status != CUSPARSE_STATUS_SUCCESS) {
		cout << "CUSPARSE error: Sparse matrix description initialisation failed" << endl;
		return 1;
	}
	cusparseSetMatType(sparseDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseSetMatIndexBase(sparseDescr, CUSPARSE_INDEX_BASE_ZERO);

	// Try sparse multiplication
	cusparseStatus_t stat1;
	stat1 = cusparseScsrmv(h_sparseMult, CUSPARSE_OPERATION_NON_TRANSPOSE, 
		m, n, nnz, &alpha, sparseDescr, 
		csrVal_GPU, csrRowPtr_GPU, csrColInd_GPU, 
		x_GPU, &beta, y_GPU);
	if (stat1 != CUSPARSE_STATUS_SUCCESS) {
		cout << "cudaSparse error: Failed in cusparseScsrmv" << endl;
		return 1;
	}

	// Copy output data back to the host
	cudaStatus = cudaMemcpy(y, y_GPU, sizeof(float)*n, cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		cout << "CUDA error code from cusparseScsrmv: " << stat1 << endl;
		cout << "CUDA error code from cudaMemcpy: " << cudaStatus << endl;
		cout << "Error copying data back from GPU (y)" << endl;
		return 1;
	}

	// Write output data back to console to check results
	cout << "Results of sparse multiplication: " << endl;
	for (int ii = 0; ii < n; ii++) {
		cout << y[ii] << endl;
	}

	return 0;
}

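This is not correct — cuSPARSE expects the CSR column-index and row-pointer arrays to be `int`, but here they are declared as `float`, so the device ends up reading float bit patterns as (invalid) indices: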

float csrColInd[nnz] = { 1, 3, 0, 1, 2, 0, 1, 3};
	float csrRowPtr[m + 1] = { 0, 1, 2, 3, 5, 6, 8 };

it should be:

int csrColInd[nnz] = { 1, 3, 0, 1, 2, 0, 1, 3};
	int csrRowPtr[m + 1] = { 0, 1, 2, 3, 5, 6, 8 };

Aha! That was a nice simple fix - I’m glad it wasn’t a more fundamental issue. Thanks for the very quick reply.