Optimization of a sparse matrix-vector multiplication kernel

Dear CUDA development community,

We are a group of university researchers working on a scientific library for the simulation of plasma turbulence (the full repository is accessible here: https://github.com/feltor-dev/feltor ). The performance of the library depends greatly on the execution time of a sparse matrix-vector multiplication (y ← a*M*x + b*y, where a and b are scalars, x and y are vectors, and M is a matrix in our own format). We chose a custom implementation instead of, e.g., a library CSR format since we effectively only need to store a small number of distinct rows (say 3) of the matrix.

Recent optimization efforts for the Intel Xeon Phi (Knights Landing) show that this operation can be performed almost as fast as the vector addition daxpby (y ← a*x + b*y); in fact, it is only about 30% slower. On the GPU, however, there is still a factor of 2 to 3 between the execution time of our matrix-vector kernel and the daxpby function (benchmarked consistently on various GTX and Tesla GPUs; if you want to try it yourself, just follow the instructions in the Quick Start guide). We believe the kernel could be faster, but we do not know how.

Since our own expertise is not sufficient to optimize the kernel further, we are asking for your help. We are grateful for any constructive comment and will of course acknowledge any contribution on our homepage and in our publications.

Best regards,
Matthias
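
For reference, the daxpby operation we compare against is conceptually just the following (a minimal sketch for illustration, not the exact FELTOR implementation; the kernel name is a placeholder). This is the kind of kernel whose execution time we use as the reference in the factor of 2 to 3 mentioned above:

//sketch of the daxpby baseline y <- a*x + b*y (for illustration only)
template<class value_type>
__global__ void daxpby_sketch_kernel( value_type a, value_type b,
        const value_type* __restrict__ x, value_type* __restrict__ y, const int size)
{
    const int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
    const int grid_size = gridDim.x*blockDim.x;
    //grid-stride loop: one element of y per iteration, two reads and one write
    for( int i = thread_id; i<size; i += grid_size)
        y[i] = a*x[i] + b*y[i];
}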

//kernel for a derivative in the y-direction (taken from the file
//path/to/feltor/inc/dg/backend/sparseblockmat_gpu_kernels.cuh )
//value_type is double
template<class value_type>
 __global__ void ell_multiply_kernel33(value_type alpha, value_type beta,
         const value_type* __restrict__  data, const int* __restrict__  cols_idx, const int* __restrict__  data_idx,
         const int num_rows, const int num_cols,
         const int size,
         const int right,
         const int* __restrict__  right_range,
         const value_type* __restrict__  x, value_type * __restrict__ y
         )
{
    const int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
    const int grid_size = gridDim.x*blockDim.x;
    const int right_ = right_range[1]-right_range[0];
    //grid-stride loop: every thread handles size/grid_size elements of y
    for( int row = thread_id; row<size; row += grid_size)
    {
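        //decompose the flat index: j = position within the rightmost (contiguous) dimension,
        //k = row within the 3x3 block, i = block row, s = outer (left) index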
        int rr = row/right_, rrn = rr/3;
        int s=rrn/num_rows,
            i = (rrn)%num_rows,
            k = (rr)%3,
            j=right_range[0]+row%right_;
        value_type temp=0;
        {
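            //block row i has 3 non-zero 3x3 blocks: data_idx selects the stored block,
            //cols_idx its block column; B and J are the resulting offsets into data and x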
            int B = (data_idx[i*3  ]*3+k)*3;
            int J = (s*num_cols+cols_idx[i*3  ])*3;
            temp +=data[ B  ]* x[(J  )*right+j];
            temp +=data[ B+1]* x[(J+1)*right+j];
            temp +=data[ B+2]* x[(J+2)*right+j];
            B = (data_idx[i*3+1]*3+k)*3;
            J = (s*num_cols+cols_idx[i*3+1])*3;
            temp +=data[ B  ]* x[(J  )*right+j];
            temp +=data[ B+1]* x[(J+1)*right+j];
            temp +=data[ B+2]* x[(J+2)*right+j];
            B = (data_idx[i*3+2]*3+k)*3;
            J = (s*num_cols+cols_idx[i*3+2])*3;
            temp +=data[ B  ]* x[(J  )*right+j];
            temp +=data[ B+1]* x[(J+1)*right+j];
            temp +=data[ B+2]* x[(J+2)*right+j];
        }
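        //write back y = alpha*M*x + beta*y for this element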
        int idx = ((s*num_rows+i)*3+k)*right+j;
        y[idx]=alpha*temp+beta*y[idx];
    }
}
//here are the relevant lines for calling this kernel (similar to how kernels are called in thrust)
const size_t BLOCK_SIZE = 256;
const size_t size = (left_size)*(right_range[1]-right_range[0])*num_rows*n; //total number of elements of y to compute
const size_t NUM_BLOCKS = std::min<size_t>((size-1)/BLOCK_SIZE+1, 65000);
ell_multiply_kernel33<value_type> <<<NUM_BLOCKS, BLOCK_SIZE>>> (alpha,beta, data_ptr, cols_ptr, block_ptr, num_rows, num_cols, size, right_size, right_range_ptr, x_ptr,y_ptr);
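
And in case it helps to reproduce the numbers, this is roughly how a single launch can be timed against the daxpby baseline with CUDA events (a sketch, not the exact benchmark code from the Quick Start guide; in practice one would average over many launches):

//sketch: timing one launch of the kernel with CUDA events (for illustration only)
cudaEvent_t start, stop;
cudaEventCreate( &start);
cudaEventCreate( &stop);
cudaEventRecord( start);
ell_multiply_kernel33<value_type> <<<NUM_BLOCKS, BLOCK_SIZE>>> (alpha,beta, data_ptr, cols_ptr, block_ptr, num_rows, num_cols, size, right_size, right_range_ptr, x_ptr,y_ptr);
cudaEventRecord( stop);
cudaEventSynchronize( stop);
float ms = 0.f;
cudaEventElapsedTime( &ms, start, stop);
//compare ms to the time of a daxpby launch on vectors of the same size
cudaEventDestroy( start);
cudaEventDestroy( stop);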