GEMV tiling kernel

Hello,

I am trying to implement a tiled version of GEMV that uses shared memory for both the matrix and the vector, for a fixed-size matrix (256x256). The kernel is launched with a fixed grid of 16x16 blocks and 16x16 threads per block, where each thread computes just one multiply (a single matrix element times a single vector element). The matrix, block, and thread dimensions are fixed because I need to feed this CUDA code to a tool called FCUDA, which transforms it into HLS/RTL for an FPGA.
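To make the intended mapping concrete, here is the computation I am trying to express, written as plain host loops. This is only a sketch of my intent, not the kernel itself; bx, by, tx, ty are just stand-in names for blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, and M, N, P are the host arrays from the test program below.

// Intended mapping, written as ordinary host loops (illustrative only).
// Block (bx, by) covers rows [bx*16, bx*16+16) and columns [by*16, by*16+16)
// of the matrix; thread (tx, ty) contributes one product to output row bx*16 + tx.
// (P is assumed to start at all zeros, as in the init loop of the test program.)
for (int bx = 0; bx < 16; bx++)
    for (int tx = 0; tx < 16; tx++)
        for (int by = 0; by < 16; by++)
            for (int ty = 0; ty < 16; ty++)
                P[bx*16 + tx] += M[(bx*16 + tx)*256 + (by*16 + ty)] * N[by*16 + ty];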

However, I am unable to find the bug in this implementation, and I am looking for some help debugging why this test fails.

#include <stdio.h>
#include <stdlib.h>   // for exit()

#define TILE_WIDTH 16
#define NUM_ROWS 256
#define NUM_COLS 256


__global__ void kernel(double * dA, double * dB, double * dC) {
  unsigned int bid = blockIdx.x;
  double Cval = 0.0;

  double * Asub;
  double * Bsub;
  double * Csub;

  unsigned int row = threadIdx.x;
  unsigned int col = threadIdx.y;

  __shared__ double A_shared[TILE_WIDTH * TILE_WIDTH];
  __shared__ double B_shared[TILE_WIDTH];

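  // Intent: Asub points at the 16x16 tile of dA owned by this block,
  // Bsub at the 16 entries of dB for this block's column range,
  // and Csub at the 16 output rows this block contributes to.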
  Asub = dA + bid * blockDim.x * NUM_COLS + blockIdx.y * blockDim.y;
  Bsub = dB + blockIdx.y * blockDim.y;
  Csub = dC + bid * TILE_WIDTH;

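  // Intended behaviour: each thread stages one tile element (and one vector entry)
  // in shared memory, forms a single product, and accumulates it into its output row.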
  if (row < TILE_WIDTH && col < TILE_WIDTH) {
    B_shared[row] = Bsub[row];
    A_shared[row * TILE_WIDTH + col] = Asub[row * NUM_COLS + col];
    __syncthreads();

    Cval = A_shared[row * TILE_WIDTH + col] * B_shared[row];
    __syncthreads();

    Csub[row] += Cval;
  }

}

#define cudaErrorCheck(call) \
  do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
      printf("##### error in cuda call at %s: %d. %s: %s #####\n", __FILE__, __LINE__, cudaGetErrorName(err), cudaGetErrorString(err)); \
      exit(1); \
    } \
  } while (0)

int main(){
    double M[NUM_ROWS*NUM_COLS], N[NUM_COLS], P[NUM_ROWS];

    //init
    for(int i=0; i<NUM_ROWS; i++){
        for(int j=0; j<NUM_COLS; j++)
            M[i*NUM_COLS + j] = i*NUM_COLS + j;
        P[i] = 0;
    }
    for(int j=0; j<NUM_COLS; j++)
        N[j] = j;



    //GPU calc
    double *m, *n, *p;

    cudaErrorCheck(cudaMalloc (( void **)&m , NUM_ROWS*NUM_COLS*sizeof(double)));
    cudaErrorCheck(cudaMalloc (( void **)&n , NUM_COLS*sizeof(double)));
    cudaErrorCheck(cudaMalloc (( void **)&p , NUM_ROWS*sizeof(double)));


    cudaErrorCheck(cudaMemcpy(m, M, NUM_ROWS*NUM_COLS*sizeof(double), cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemcpy(n, N, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice));
    cudaErrorCheck(cudaMemcpy(p, P, NUM_ROWS*sizeof(double), cudaMemcpyHostToDevice));

    dim3 dimGrid(16, 16);
    dim3 dimBlock(16, 16);

    kernel<<<dimGrid,dimBlock>>>(m, n, p);

    cudaErrorCheck(cudaGetLastError());      // catch kernel launch errors
    cudaErrorCheck(cudaDeviceSynchronize()); // wait for the kernel and catch runtime errors
    cudaErrorCheck(cudaMemcpy(P, p, NUM_ROWS*sizeof(double), cudaMemcpyDeviceToHost));

    //cpu check
    for(int i=0; i<NUM_ROWS; i++){
        double temp = 0;
        for(int j=0; j<NUM_COLS; j++)
            temp += M[i*NUM_COLS + j] * N[j];
        if(temp != P[i]){
            printf("error for element %d. CPU: %f, GPU: %f\n", i, temp, P[i]);
            return 0;
        }
    }
    printf("success!\n");
    
    return 0;
}