Hi
I wrote 2D convolution and when I run the benchmarks, I see the tiled version does not provide any performance gains the naive untiled version (at times it slower).
Naive
__global__ void gpu_conv2d_kernel(float const *d_N_ptr, float const *d_F_ptr, float *d_P_ptr, int const n_rows, int const n_cols)
{
// Which output element this thread works on
int const out_col = blockIdx.x*blockDim.x + threadIdx.x;
int const out_row = blockIdx.y*blockDim.y + threadIdx.y;
// Check if output element is valid
if (out_row < n_rows && out_col < n_cols)
{
// Result (in thread register)
float p_val = 0.0f;
// Loop over elements of the filter array
#pragma unroll
for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++)
{
for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++)
{
// Input element to filter element mapping
// int in_row = out_row + (f_row - FILTER_RADIUS);
// int in_col = out_col + (f_col - FILTER_RADIUS);
// Boundary check
if ((out_row + (f_row - FILTER_RADIUS)) >= 0 && (out_row + (f_row - FILTER_RADIUS)) < n_rows && (out_col + (f_col - FILTER_RADIUS)) >= 0 && (out_col + (f_col - FILTER_RADIUS)) < n_cols)
p_val += d_F_ptr[f_row*(2*FILTER_RADIUS+1) + f_col] * d_N_ptr[(out_row + (f_row - FILTER_RADIUS))*n_cols + (out_col + (f_col - FILTER_RADIUS))];
// p_val += d_F_ptr[f_row*(2*FILTER_RADIUS+1) + f_col] * d_N_ptr[in_row*n_cols + in_col];
}
}
d_P_ptr[out_row*n_cols + out_col] = p_val;
}
}
Tiled
#define FILTER_RADIUS 1
#define INPUT_TILE_DIM 16
#define OUTPUT_TILE_DIM (INPUT_TILE_DIM - 2*FILTER_RADIUS)
extern __constant__ float d_F[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];
__global__ void gpu_conv2d_tiled_kernel(float *d_N_ptr, float *d_P_ptr, int n_rows, int n_cols)
{
// Which output element this thread works on
int out_col = blockIdx.x*OUTPUT_TILE_DIM + threadIdx.x - FILTER_RADIUS;
int out_row = blockIdx.y*OUTPUT_TILE_DIM + threadIdx.y - FILTER_RADIUS;
// Allocate shared memory
__shared__ float N_sh[INPUT_TILE_DIM][INPUT_TILE_DIM];
// Checking for ghost cells and loading tiles into shared memory
if (out_row >= 0 && out_row < n_rows && out_col >= 0 && out_col < n_cols)
N_sh[threadIdx.y][threadIdx.x] = d_N_ptr[out_row*n_cols + out_col];
else
N_sh[threadIdx.y][threadIdx.x] = 0.0f;
// Ensure all elements are loaded
__syncthreads();
// Computing output elements
int tile_col = threadIdx.x - FILTER_RADIUS;
int tile_row = threadIdx.y - FILTER_RADIUS;
// Check if output element is valid
if (out_row >= 0 && out_row < n_rows && out_col >= 0 && out_col < n_cols)
{
// Checking for threads outside the tile bounds
if (tile_row >= 0 && tile_row < OUTPUT_TILE_DIM && tile_col >= 0 && tile_col < OUTPUT_TILE_DIM)
{
// Result (in thread register)
float p_val = 0.0f;
// Loop over elements of the filter array
#pragma unroll
for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++)
{
for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++)
{
// Input element (in shared memory) to filter element mapping
int in_row = tile_row + f_row;
int in_col = tile_col + f_col;
p_val += d_F[f_row*(2*FILTER_RADIUS+1) + f_col] * N_sh[in_row][in_col];
}
}
// Storing the final result
d_P_ptr[out_row*n_cols + out_col] = p_val;
}
}
}
Benchmark
Naive
------------------------
GPU Benchmarking details
------------------------
Time for GPU memory allocation (seconds): 0.000326656
Time for input data transfer (seconds): 0.00280998
Time for filter data transfer (seconds): 0.000384928
Time for kernel execution (seconds): 4.49434e-05
Time for output data transfer (seconds): 0.00618854
Time (total): 0.00975506
FPS (total): 102.511
Time (kernel): 4.49434e-05
FPS (kernel): 22250.2
GFLOPS (kernel): 1679.84
------------------------
Tiled
---------------------------------
GPU (Tiling) Benchmarking details
---------------------------------
Time for GPU memory allocation (seconds): 0.000304704
Time for input data transfer (seconds): 0.00280374
Time for filter data transfer (seconds): 0.00014032
Time for kernel execution (seconds): 5.63507e-05
Time for output data transfer (seconds): 0.00619062
Time (total): 0.00949574
FPS (total): 105.31
Time (kernel): 5.63507e-05
FPS (kernel): 17746
GFLOPS (kernel): 1339.78
---------------------------------