For my own learning, I’m trying to implement an optimized algorithm for sparse matrix-vector multiplication on a GPU with the sparse matrix stored in CSR format. I’m learning from the NVIDIA research paper here: http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf
The issue that I’m having (and I’m sure it’s simple) is that the compiler raises an “error: incomplete type is not allowed” message for the “__shared__ float vals[];” line when I try to compile the following kernel:
// CSR sparse matrix-vector product, "vector" variant: one 32-thread warp
// cooperates on each matrix row and the result is ACCUMULATED into y
// (y[row] += sum_j data[j] * x[indices[j]]), so y must be initialised by the caller.
//
// Parameters:
//   num_rows - number of matrix rows
//   ptr      - CSR row offsets; the kernel reads ptr[row] and ptr[row+1]
//   indices  - column index of each stored value
//   data     - stored (non-zero) values
//   x        - input vector, indexed by column
//   y        - output vector, indexed by row (accumulated into)
//
// Launch requirements:
//   * 1D grid and 1D block, with blockDim.x a multiple of 32 so that every
//     warp maps to exactly one row and `lane` matches the hardware lane.
//   * At least ceil(num_rows * 32 / blockDim.x) blocks.
//   * Dynamic shared memory of blockDim.x * sizeof(float) bytes, e.g.
//       spmv_csr_vector_kernel<<<blocks, threads, threads * sizeof(float)>>>(...);
//   * __syncwarp() requires CUDA 9 or newer.
__global__ void spmv_csr_vector_kernel (const int num_rows, const int *ptr, const int *indices, const float *data, const float *x , float *y) {
    // An unsized shared array is an incomplete type unless it is declared
    // `extern`; its size is then taken from the third kernel launch parameter.
    extern __shared__ float vals[];

    const int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
    const int warp_id   = thread_id / 32;                        // global warp index
    const int lane      = thread_id & (32 - 1);                  // thread index within the warp

    // one warp per row
    const int row = warp_id;

    // `row` is identical for all lanes of a warp, so the whole warp takes the
    // same side of this branch and the __syncwarp() calls below are safe.
    if (row < num_rows) {
        const int row_start = ptr[row];
        const int row_end   = ptr[row + 1];

        // compute running sum per thread (kept in a register, stored once)
        float sum = 0.0f;
        for (int jj = row_start + lane; jj < row_end; jj += 32)
            sum += data[jj] * x[indices[jj]];
        vals[threadIdx.x] = sum;
        // Lanes may leave the loop at different times; make every partial sum
        // visible before any lane reads a neighbour's slot.
        __syncwarp();

        // parallel reduction in shared memory; each step reads values written
        // by other lanes in the previous step, so a warp barrier separates them
        // (implicit warp lockstep is not guaranteed on Volta and later).
        if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16];
        __syncwarp();
        if (lane <  8) vals[threadIdx.x] += vals[threadIdx.x +  8];
        __syncwarp();
        if (lane <  4) vals[threadIdx.x] += vals[threadIdx.x +  4];
        __syncwarp();
        if (lane <  2) vals[threadIdx.x] += vals[threadIdx.x +  2];
        __syncwarp();
        if (lane <  1) vals[threadIdx.x] += vals[threadIdx.x +  1];

        // first thread writes the result (lane 0 reads only its own slot here)
        if (lane == 0)
            y[row] += vals[threadIdx.x];
    }
}
Note: this is the code from page 17 of the report.
Any idea how to fix this simple error?