I’m trying to implement the sparse matrix-vector multiplication routine described in “Efficient Sparse Matrix-Vector Multiplication on CUDA” by Bell and Garland (the spmv_csr_vector_kernel function). The source code is very close to that given in the article and it seems to work correctly, but it is painfully slow (more than 3 times slower than the simple scalar implementation). I wonder what the reason for this could be? I use a GeForce 8800 GT for testing.
the code:
#define WARP_SZ 32
// One scratch slot per thread for the warp-level reduction.
// `volatile` is required here: the tree reduction below runs with no
// barriers, relying on warp-synchronous execution, and without volatile
// the compiler is free to cache vals[] entries in registers so that lanes
// observe stale partial sums. The published Bell/Garland reference code
// declares this array volatile for exactly this reason.
__shared__ volatile float vals [THREAD_BLOCK_SIZE*THREAD_BLOCK_SIZE];
int thread_id = compute_thread_index(); // global thread index
int warp_id = thread_id / WARP_SZ; // global warp index
int lane = thread_id & (WARP_SZ - 1); // thread index within the warp
// CSR "vector" kernel: one full warp cooperates on one matrix row
int row = warp_id;
const int thread_block_id = threadIdx.x + THREAD_BLOCK_SIZE*threadIdx.y;
if ( row < size_vec)
{
// rowptr[row] packs the CSR index range as {row_start, row_end}
uint2 rowptr_bounds = rowptr[row];
const int row_start = rowptr_bounds.x;
const int row_end = rowptr_bounds.y;
// Accumulate the per-thread running sum in a register and store to
// shared memory once, instead of read-modify-writing (volatile) shared
// memory on every nonzero — that removes a shared-memory round trip
// from the inner loop and is the dominant cost of the original version.
float sum = 0.0f;
for ( int jj = row_start + lane; jj < row_end; jj += WARP_SZ){
const float xv = tex1Dfetch(texX, colind[jj]); // cached gather of x[col]
sum += matrix[jj] * xv;
}
vals[thread_block_id] = sum;
// Warp-synchronous parallel reduction in shared memory. No
// __syncthreads() is needed because all 32 lanes of a warp share the
// same `row`, so the whole warp takes this branch together.
// NOTE(review): this barrier-free pattern assumes implicit warp
// lockstep and is only valid pre-Volta (it matches the 8800 GT target);
// on CC 7.0+ insert __syncwarp() between steps or use __shfl_down_sync.
if ( lane < 16) vals[thread_block_id] += vals[thread_block_id + 16];
if ( lane < 8) vals[thread_block_id] += vals[thread_block_id + 8];
if ( lane < 4) vals[thread_block_id] += vals[thread_block_id + 4];
if ( lane < 2) vals[thread_block_id] += vals[thread_block_id + 2];
if ( lane < 1) vals[thread_block_id] += vals[thread_block_id + 1];
// lane 0 now holds the complete dot product for this row
if (lane == 0){
b[ row ] = vals[thread_block_id];
}
}