No improvement with tiled (vs untiled) 2D convolution using 3x3 filters

Hi

I wrote 2D convolution and when I run the benchmarks, I see the tiled version does not provide any performance gains the naive untiled version (at times it slower).

Naive

__global__ void gpu_conv2d_kernel(float const *d_N_ptr, float const *d_F_ptr, float *d_P_ptr, int const n_rows, int const n_cols)
{
    // Which output element this thread works on
    int const out_col = blockIdx.x*blockDim.x + threadIdx.x;
    int const out_row = blockIdx.y*blockDim.y + threadIdx.y;
    
    // Check if output element is valid
    if (out_row < n_rows && out_col < n_cols) 
    {
        // Result (in thread register)
        float p_val = 0.0f;
        
        // Loop over elements of the filter array
        #pragma unroll
        for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++) 
        {
            for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++) 
            {
                // Input element to filter element mapping
                // int in_row = out_row + (f_row - FILTER_RADIUS);
                // int in_col = out_col + (f_col - FILTER_RADIUS);
                        
                // Boundary check
                if ((out_row + (f_row - FILTER_RADIUS)) >= 0 && (out_row + (f_row - FILTER_RADIUS)) < n_rows && (out_col + (f_col - FILTER_RADIUS)) >= 0 && (out_col + (f_col - FILTER_RADIUS)) < n_cols) 
                    p_val += d_F_ptr[f_row*(2*FILTER_RADIUS+1) + f_col] * d_N_ptr[(out_row + (f_row - FILTER_RADIUS))*n_cols + (out_col + (f_col - FILTER_RADIUS))];
                    // p_val += d_F_ptr[f_row*(2*FILTER_RADIUS+1) + f_col] * d_N_ptr[in_row*n_cols + in_col];
            }
        }
        d_P_ptr[out_row*n_cols + out_col] = p_val;
    }
}

Tiled

#define FILTER_RADIUS 1
#define INPUT_TILE_DIM 16
#define OUTPUT_TILE_DIM (INPUT_TILE_DIM - 2*FILTER_RADIUS)

extern __constant__ float d_F[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];

__global__ void gpu_conv2d_tiled_kernel(float *d_N_ptr, float *d_P_ptr, int n_rows, int n_cols)
{
    // Which output element this thread works on
    int out_col = blockIdx.x*OUTPUT_TILE_DIM + threadIdx.x - FILTER_RADIUS;
    int out_row = blockIdx.y*OUTPUT_TILE_DIM + threadIdx.y - FILTER_RADIUS;

    // Allocate shared memory
    __shared__ float N_sh[INPUT_TILE_DIM][INPUT_TILE_DIM];

    // Checking for ghost cells and loading tiles into shared memory
    if (out_row >= 0 && out_row < n_rows && out_col >= 0 && out_col < n_cols)
        N_sh[threadIdx.y][threadIdx.x] = d_N_ptr[out_row*n_cols + out_col];
    else
        N_sh[threadIdx.y][threadIdx.x] = 0.0f;
    
    // Ensure all elements are loaded
    __syncthreads();

    // Computing output elements
    int tile_col = threadIdx.x - FILTER_RADIUS;
    int tile_row = threadIdx.y - FILTER_RADIUS;
    
    // Check if output element is valid
    if (out_row >= 0 && out_row < n_rows && out_col >= 0 && out_col < n_cols) 
    {
        // Checking for threads outside the tile bounds
        if (tile_row >= 0 && tile_row < OUTPUT_TILE_DIM && tile_col >= 0 && tile_col < OUTPUT_TILE_DIM) 
        {
            // Result (in thread register)
            float p_val = 0.0f;
            
            // Loop over elements of the filter array
            #pragma unroll
            for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++) 
            {
                for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++) 
                {
                    // Input element (in shared memory) to filter element mapping
                    int in_row = tile_row + f_row;
                    int in_col = tile_col + f_col;
                
                    p_val += d_F[f_row*(2*FILTER_RADIUS+1) + f_col] * N_sh[in_row][in_col];
                }
            }

            // Storing the final result
            d_P_ptr[out_row*n_cols + out_col] = p_val;
        }
    }
}

Benchmark

Naive

------------------------ 
GPU Benchmarking details 
------------------------ 
Time for GPU memory allocation (seconds): 0.000326656

Time for input data transfer (seconds): 0.00280998

Time for filter data transfer (seconds): 0.000384928

Time for kernel execution (seconds): 4.49434e-05

Time for output data transfer (seconds): 0.00618854

Time (total): 0.00975506
FPS (total): 102.511

Time (kernel): 4.49434e-05
FPS (kernel): 22250.2
GFLOPS (kernel): 1679.84
------------------------

Tiled

--------------------------------- 
GPU (Tiling) Benchmarking details 
--------------------------------- 
Time for GPU memory allocation (seconds): 0.000304704

Time for input data transfer (seconds): 0.00280374

Time for filter data transfer (seconds): 0.00014032

Time for kernel execution (seconds): 5.63507e-05

Time for output data transfer (seconds): 0.00619062

Time (total): 0.00949574
FPS (total): 105.31

Time (kernel): 5.63507e-05
FPS (kernel): 17746
GFLOPS (kernel): 1339.78
---------------------------------

Your tiles with 3x3 are too small to make any difference. The rest is done by the L1 cache. You are nearing the speed of a copy kernel as shown in the other thread.