How to use more efficiently the shared memory and 2D tiles

My question is similar to this one (c++ - 2D tiled convolution taking more time than untiled version - Stack Overflow).

I also am observing that Gauss 5x5 filter with tiles and using the shared memory has lower FPS than the non-tiled filter (using only the global memory). The explanation offered in the link above didn’t worked for me so I prefer to ask it here.

Tiles are using shared memory with first slicing the image in blocks with size (DX,DY), then copying the tiles and apron area around them in the block shared memory and then applying the filter kernel. This should be faster because shared memory is prefetched in the L1 cache and access should be better. The implementation is described here. I wonder if I am making something wrong.

Here is my Gauss 5x5 kernel filter with tiles:

__device__ __constant__ float kernel_gauss5x5[25] =
{
    0.00296902,      0.0133062,       0.0219382,       0.0133062,       0.00296902,
    0.0133062,       0.0596343,       0.0983203,       0.0596343,       0.0133062,
    0.0219382,       0.0983203,       0.162103,        0.0983203,       0.0219382,
    0.0133062,       0.0596343,       0.0983203,       0.0596343,       0.0133062,
    0.00296902,      0.0133062,       0.0219382,       0.0133062,       0.00296902
};

#define TILE_W 32
#define TILE_H 20
__global__ void gauss5x5_tiles_kernel(const float *in, float *out, int w, int h)
{   
    const int R = 2;
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    __shared__ int smem[BLOCK_W*BLOCK_H];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int offset = blockIdx.x * TILE_W + tx-R;
    int x = offset % w;
    int c = offset / w;
    int y = blockIdx.y * TILE_H + ty-R;

    // clamp to edge of image
    x = max(0, x);
    x = min(x, w-1);
    y = max(y, 0);
    y = min(y, h-1);
    //x = clamp(x, 0, w - 1);
    //y = clamp(y, 0, h - 1);

    unsigned int idx = y*w*4 + c*w + x;
    unsigned int bindex = threadIdx.y*BLOCK_W+threadIdx.x;
    
    // each thread copies its pixel of the block to shared memory
    smem[bindex] = in[idx];
    __syncthreads();

    float sum = 0;
#pragma unroll
    // only threads inside the apron will write results
    if (threadIdx.x >= R && threadIdx.x < (BLOCK_W-R) && threadIdx.y >= R && threadIdx.y < (BLOCK_H-R))
    {
        for(int i = -R; i <=R; i++)
        for(int j = -R; j <=R; j++)
        {
            sum += smem[bindex + (i*blockDim.x) + j] * kernel_gauss5x5[(i + R) * 5 + (j + R)];
        }

        out[idx] = sum;
    }
}

I am calling it like this:


extern "C" void gauss5x5_tiles_kernel(float* d_src, float* d_dest, unsigned char* d_result, int w, int h, int cycles)
{
    // sync host and start computation timer_kernel
    CHECK_CUDA(cudaDeviceSynchronize());

    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W+4, TILE_H+4);

    float* src = d_src;
    while (cycles--)
    {
        gauss5x5_tiles_kernel<< < dimGrid, dimBlock, 0 >> > (src, d_dest, w, h);
        src = d_dest;
    }
    // sync host and stop computation timer_kernel
    CHECK_CUDA(cudaDeviceSynchronize());   
}

And this is the more simple non-tiled variant:

__global__ void gauss5x5_kernel(const float *in, float *out, int w, int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >=  w * h * 3)
        return;

    unsigned int y = idx / (3*w);
    unsigned int offset = idx - y * 3 * w;
    unsigned int x = offset % w;
    unsigned int c = offset / w;
    
    unsigned int idx4 = y * w * 4 + c*w + x;

    if (x <= 1 || y <= 1 || x >= w-2 || y >= h-2)
    {
        out[idx4] = in[idx4];
        return;
    }

    float sum = 0;
#pragma unroll
    for(int i = -2; i <=2; i++)
    for(int j = -2; j <=2; j++)
    {
        sum += in[idx4 + j + 4*w*i] *kernel_gauss5x5[(i + 2) * 5 + (j + 2)];
    }

    out[idx4] = sum;
}

When I compare these two functions for 100 cycles I get following FPS:

gauss5x5_gpu filter radius: 2, threads: 101470212 7.00 [ms] 142.86
gauss5x5_tiles_gpu filter radius: 2, threads: 101470212 8.17 [ms] 122.40

These are the results for the larger gauss filter (7x7).
gauss7x7_gpu, filter radius: 3, threads: 101470212 10.33 [ms] 96.81
gauss7x7_tiles_gpu , filter radius: 3, threads: 101470212 12.47 [ms] 80.19

Similar are my results with smaller tile sizes:

#define TILE_W 16
#define TILE_H 16

gauss5x5_gpu, filter radius: 2, threads: 101470212 7.22 [ms] 138.50
gauss5x5_tiles_gpu, filter radius: 2, threads: 101470212 7.33 [ms] 136.43
gauss7x7_gpu, filter radius: 3, threads: 101470212 9.97 [ms] 100.30
gauss7x7_tiles_gpu, filter radius: 3, threads: 101470212 10.98 [ms] 91.07

For this test I was using laptop with video card RTX3060 and compute capability 8.6.

How to explain that the tiled functions are slower? Please recommend a better way to implement the tile algorithm.

It seems quite odd to declare a shared memory tile of type int when your incoming and outgoing data is of type float.

Your block has a dimension of (36,24). That is 864 threads, so you will only get 1 of those blocks resident on a SM in your cc8.6 GPU. I don’t know if your non-tiled kernel launches blocks of the same size, but it would be an odd choice in the non-tiled case. You might have better occupancy in the non-tiled case.

Of those 864 threads, all are being productively utilized here, which basically constitutes a single operation (per thread):

However of those 864 threads, you are only using (32,20) of them here:

which involves 25 operations per thread. 32x20 = 640. So 864-640 = 224 threads (about 25%) are idle during the preponderance of operations executed by your kernel.

For this reason, shared memory loading in such a 2D tile kernel is usually the operation “penalized” with extra work per thread, as opposed to penalizing the worker loop with some idle threads.

I will also point out that the source for your implementation dates from 2008, approximately 15 years ago. GPUs have changed considerably in that time. In 2008, there were no CUDA GPUs that had “ordinary” L1 or L2 caches. Therefore if you didn’t use shared memory for data reuse, the penalty could be large. With the introduction of L1 and L2 caches, depending on the data usage pattern, the cost for not performing a shared memory optimization is reduced. In fact, NVIDIA made some statements about this (" Enhanced L1 Data Cache and Shared Memory") starting with the L1 design in the Volta generation.

Beyond that, to make definitive statements about what you have, would require a testable case. I don’t wish to reverse engineer one around what you have shown, and you’ve chosen not to show a complete test case, so I’ll stop there. Others may be able to help also.

Hello, Robert.
Thanks, I prepared a full demo reproducing the two Gauss filter implementations - with and without tiles.

filter.cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>

#define TILE_W 16
#define TILE_H 16
#define BLOCK 512

inline int get_number_of_blocks(int array_size, int block_size)
{
    return array_size / block_size + ((array_size % block_size > 0) ? 1 : 0);
}

__device__ __constant__ float kernel_gauss5x5[25] =
{
    0.00296902,      0.0133062,       0.0219382,       0.0133062,       0.00296902,
    0.0133062,       0.0596343,       0.0983203,       0.0596343,       0.0133062,
    0.0219382,       0.0983203,       0.162103,        0.0983203,       0.0219382,
    0.0133062,       0.0596343,       0.0983203,       0.0596343,       0.0133062,
    0.00296902,      0.0133062,       0.0219382,       0.0133062,       0.00296902
};

__device__ __constant__ float kernel_gauss7x7[49] =
{
    0.00001965,	0.00023941,	0.00107296,	0.00176901,	0.00107296,	0.00023941,	0.00001965,
    0.00023941,	0.0029166,	0.01307131,	0.02155094,	0.01307131,	0.0029166,	0.00023941,
    0.00107296,	0.01307131,	0.05858154,	0.09658462,	0.05858154,	0.01307131,	0.00107296,
    0.00176901,	0.02155094,	0.09658462,	0.15924113,	0.09658462,	0.02155094,	0.00176901,
    0.00107296,	0.01307131,	0.05858154,	0.09658462,	0.05858154,	0.01307131,	0.00107296,
    0.00023941,	0.0029166,	0.01307131,	0.02155094,	0.01307131,	0.0029166,	0.00023941,
    0.00001965,	0.00023941,	0.00107296,	0.00176901,	0.00107296,	0.00023941,	0.00001965,
};


__global__ void gauss5x5_kernel(const float* __restrict__ in, float *out, int w, int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >=  w * h * 3)
        return;

    unsigned int y = idx / (3*w);
    unsigned int offset = idx - y * 3 * w;
    unsigned int x = offset % w;
    unsigned int c = offset / w;
    
    unsigned int idx4 = y * w * 4 + c*w + x;

    if (x <= 1 || y <= 1 || x >= w-2 || y >= h-2)
    {
        out[idx4] = in[idx4];
        return;
    }

    float sum = 0;
#pragma unroll
    for(int i = -2; i <=2; i++)
    for(int j = -2; j <=2; j++)
    {
        sum += in[idx4 + j + 4*w*i] *kernel_gauss5x5[(i + 2) * 5 + (j + 2)];
    }

    out[idx4] = sum;
}

__global__ void gauss5x5_tiles_kernel(const float* __restrict__ in, float *out, int w, int h)
{   
    const int R = 2;
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    __shared__ float smem[BLOCK_W*BLOCK_H];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int offset = blockIdx.x * TILE_W + tx-R;
    int x = offset % w;
    int c = offset / w;
    int y = blockIdx.y * TILE_H + ty-R;

    // clamp to edge of image
    x = max(0, x);
    x = min(x, w-1);
    y = max(y, 0);
    y = min(y, h-1);
    //x = clamp(x, 0, w - 1);
    //y = clamp(y, 0, h - 1);

    unsigned int idx = y*w*4 + c*w + x;
    unsigned int bindex = threadIdx.y*BLOCK_W+threadIdx.x;
    
    // each thread copies its pixel of the block to shared memory
    smem[bindex] = in[idx];
    __syncthreads();

    float sum = 0;

    // only threads inside the apron will write results
    if (threadIdx.x >= R && threadIdx.x < (BLOCK_W-R) && threadIdx.y >= R && threadIdx.y < (BLOCK_H-R))
    {
#pragma unroll
        for(int i = -R; i <=R; i++)
        for(int j = -R; j <=R; j++)
        {
            sum += smem[bindex + (i*blockDim.x) + j] * kernel_gauss5x5[(i + R) * 5 + (j + R)];
        }

        out[idx] = sum;
    }
}

__global__ void gauss7x7_tiles_kernel(const float* __restrict__ in, float *out, int w, int h)
{   
    const int R = 3;
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    __shared__ float smem[BLOCK_W*BLOCK_H];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int offset = blockIdx.x * TILE_W + tx-R;
    unsigned int x = offset % w;
    unsigned int c = offset / w;
    int y = blockIdx.y * TILE_H + ty-R;

    // clamp to edge of image
    x = max(0, x);
    x = min(x, w-1);
    y = max(y, 0);
    y = min(y, h-1);

    unsigned int idx = y*w*4 + c*w + x;
    unsigned int bindex = threadIdx.y*BLOCK_W+threadIdx.x;
    
    // each thread copies its pixel of the block to shared memory
    smem[bindex] = in[idx];
    __syncthreads();

    float sum = 0;

    // only threads inside the apron will write results
    if (threadIdx.x >= R && threadIdx.x < (BLOCK_W-R) && threadIdx.y >= R && threadIdx.y < (BLOCK_H-R))
    {
#pragma unroll
        for(int i = -R; i <=R; i++)
        for(int j = -R; j <=R; j++)
        {
            sum += smem[bindex + (i*blockDim.x) + j] *kernel_gauss7x7[(i + R) * 7 + (j + R)];
        }

        out[idx] = sum;
    }
}

__global__ void gauss7x7_kernel(const  float* __restrict__ in, float *out, int w, int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >=  w * h * 3)
        return;

    unsigned int y = idx / (3*w);
    unsigned int offset = idx - y * 3 * w;
    unsigned int x = offset % w;
    unsigned int c = offset / w;
    
    unsigned int idx4 = y * w * 4 + c*w + x;

    if (x <= 2 || y <= 2 || x >= w-3 || y >= h-3)
    {
        out[idx4] = in[idx4];
        return;
    }

    float sum = 0;
#pragma unroll
    for(int i = -3; i <=3; i++)
    for(int j = -3; j <=3; j++)
    {
        sum += in[idx4 + j + 4*w*i] *kernel_gauss7x7[(i + 3) * 7 + (j + 3)];
    }

    out[idx4] = sum;
}

extern "C" float* gauss5x5_gpu_tiles(float* d_src, float* d_dest, int w, int h, int cycles)
{
    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W+4, TILE_H+4);

    float* src = d_src,*dst = d_dest, *tmp = d_dest;
    while (cycles--)
    {
        gauss5x5_tiles_kernel << < dimGrid, dimBlock, 0 >> > (src, d_dest, w, h);
        tmp = dst;
        dst = src;
        src = tmp;
    }

    return tmp;
}

extern "C" float* gauss5x5_gpu(float* d_src, float* d_dest, int w, int h, int cycles)
{
    float* src = d_src,*dst = d_dest, *tmp = d_dest;
    while (cycles--)
    {
        gauss5x5_kernel << < get_number_of_blocks(w*h * 3, BLOCK), BLOCK, 0 >> > (src, d_dest, w, h);
        tmp = dst;
        dst = src;
        src = tmp;
    }

    return tmp;
}

extern "C" float* gauss7x7_gpu_tiles(float* d_src, float* d_dest, int w, int h, int cycles)
{
    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W+6, TILE_H+6);

    float* src = d_src,*dst = d_dest, *tmp = d_dest;
    while (cycles--)
    {
        gauss7x7_tiles_kernel << < dimGrid, dimBlock, 0 >> > (src, d_dest, w, h);
        tmp = dst;
        dst = src;
        src = tmp;
    }

    return tmp;
}

extern "C" float* gauss7x7_gpu(float* d_src, float* d_dest, int w, int h, int cycles)
{
    float* src = d_src,*dst = d_dest, *tmp = d_dest;
    while (cycles--)
    {
        gauss7x7_kernel << < get_number_of_blocks(w*h*3, BLOCK), BLOCK, 0 >> > (src, d_dest, w, h);
        tmp = dst;
        dst = src;
        src = tmp;
    }

    return tmp;
}

main.cpp

#include <stdlib.h>
#include <stdio.h>
#include <string>
#include <vector>    
#include <chrono>
#include <algorithm>
// CUDA utilities and system includes
#include <cuda_runtime.h>

void check_error(cudaError_t status)
{
    cudaError_t status2 = cudaGetLastError();
    if (status != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status);
        printf("\n CUDA Error: %s\n", s);
        getchar();
    }
    if (status2 != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status2);
        printf("\n CUDA Error Prev: %s\n", s);
        getchar();
    }
}

void printTime(const char* name, double time)
{
    float fps = 1000 / time;
    printf("%-#40s",name);
    char tmp[32];
    sprintf(tmp, "%0.2f [ms]", time);
    printf("%-#20s%0.2f\n", tmp, fps);
}

#define CHECK_CUDA(X) check_error((cudaError_t)X);

extern "C" float* gauss5x5_gpu_tiles(float* d_src, float* d_dest,  int w, int h, int cycles);
extern "C" float* gauss5x5_gpu(float* d_src, float* d_dest, int w, int h, int cycles);
extern "C" float* gauss7x7_gpu_tiles(float* d_src, float* d_dest, int w, int h, int cycles);
extern "C" float* gauss7x7_gpu(float* d_src, float* d_dest, int w, int h, int cycles);

int main(void)
{
    const int IMAGE_W = 5202 ; // pixels
    const int IMAGE_H = 6502 ;   
    const int N = 5202 * 6502 * 4;
    const int cycles = 100;

    // image is loaded as RGBA. fill with random values
    float* img_cpu = new float[N];
    for (int k = 0; k < N; k++)
        img_cpu[k] = std::rand() % 255;
  
    float* img_gpu = nullptr;
    CHECK_CUDA(cudaMalloc((void **) &img_gpu, (N * sizeof(float))));

    float* temp_gpu = nullptr;
    CHECK_CUDA(cudaMalloc((void **) &temp_gpu, (N * sizeof(float))));

    printf("image size: %d x %d\n", IMAGE_W, IMAGE_H);
    printf("%-#40s%-#20s%0s\n", "filter", "time", "FPS");
    printf("---------------------------------------------------------------------\n");

    // warmup
    gauss5x5_gpu_tiles(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, 1);
    CHECK_CUDA(cudaDeviceSynchronize());
    auto timeStart = std::chrono::system_clock::now();
    gauss5x5_gpu_tiles(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());  
    auto timeEnd = std::chrono::system_clock::now();
    double dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss5x5_gpu_tiles", dProcessingTime);

    // warmup
    gauss5x5_gpu(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, 1);
    CHECK_CUDA(cudaDeviceSynchronize());
    timeStart = std::chrono::system_clock::now();
    gauss5x5_gpu(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());  
    timeEnd = std::chrono::system_clock::now();
    dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss5x5_gpu", dProcessingTime);

    // warmup
    gauss7x7_gpu_tiles(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, 1);
    CHECK_CUDA(cudaDeviceSynchronize());
    timeStart = std::chrono::system_clock::now();
    gauss7x7_gpu_tiles(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());  
    timeEnd = std::chrono::system_clock::now();
    dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss7x7_gpu_tiles", dProcessingTime);

    // warmup
    gauss7x7_gpu(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, 1);
    CHECK_CUDA(cudaDeviceSynchronize());
    timeStart = std::chrono::system_clock::now();
    gauss7x7_gpu(img_gpu, temp_gpu, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());  
    timeEnd = std::chrono::system_clock::now();
    dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss7x7_gpu", dProcessingTime);


    delete img_cpu;
    cudaFree(img_gpu);
    cudaFree(temp_gpu);

    return 0;
}

On my PC this outputs:

image size: 5202 x 6502
filter                                  time                FPS
---------------------------------------------------------------------
gauss5x5_gpu_tiles                      6.85 [ms]           145.99
gauss5x5_gpu                            4.67 [ms]           214.13
gauss7x7_gpu_tiles                      10.63 [ms]          94.07
gauss7x7_gpu                            7.61 [ms]           131.41

Sorry about the wrong int type in the first code example, the float buffers actually contain pixel values (0-255). I also understood that not all threads from the block are used, only those inside the apron (about 80% of all threads), but this is also and how the original tiles filter (from the link) works. I don’t know about better way to implement the tiles.

About the “(” Enhanced L1 Data Cache and Shared Memory" reference, it may actually explain my results, seems that after cc7 the available shared memory is now used much better as data cache and then such tiles algorithms counting of shared memory are actually slower and inefficient, not the whole warp is used and there is one a call to __syncthreads(); but application will place what it needs in the L1 Data cache anyway. It is good to know this.

I gave a link under the word “usually” which indicates what I was suggesting. Size the threadblock to match the threads needed in the work loop. Then use those threads to load both the central region and apron region of shared mem.

I don’t believe your two implementations produce approximately the same result for the same input data, so I personally wouldn’t bother to do performance analysis on it.

Beyond that, it is puzzling to me how you are mixing per-pixel multipliers of 3 and 4.

Good luck!

I decided to write my own test based on your code, simplified, with corrections, and with output comparison for testing. I also modified it so the filter width can be easily changed.

The conclusion I reached is that for small filter sizes (e.g. 5x5) the L1 cache easily holds all values needed by the blocks resident on the SM, and shared memory gives no benefit - it’s actually worse because the loads are less efficient.

However if you increase the filter size (e.g. 17x17), you reach the point where the shared memory implementation looks better. Here is my test case:

$ cat t13.cu
#include <chrono>
#include <iostream>

#ifndef RF
#define RF 2
#endif
const int R = RF;
const int WF = 2*R+1;
const int LF = WF*WF;
const float FC = 1.0f/LF;
__constant__ float kernel_gauss5x5[LF] = { FC };

__global__ void gauss5x5_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    w = 3*w;
    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y+threadIdx.y;
    if ((idx >=  w) || (idy >= h))
        return;

    unsigned int id = idy * w + idx;

    if (idx <= R-1 || idy <= R-1 || idx >= w-R || idy >= h-R)
    {
        out[id] = in[id];
        return;
    }

    float sum = 0;
    for(int i = -R; i <=R; i++)
      for(int j = -R; j <=R; j++)
      {
        sum += in[id + j + w*i] *kernel_gauss5x5[(i + R) * WF + (j + R)];
      }

    out[id] = sum;
}
const int TILE_W = 32;
const int TILE_H = 16;
__global__ void gauss5x5_tiles_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    w = 3*w;
    __shared__ float smem[BLOCK_H][BLOCK_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y+threadIdx.y;
    // load central block
    if ((idx < w) && (idy < h)) smem[ty+R][tx+R] = in[idy*w+idx];
    // load left bar
    if ((idx > R-1) && (idx < w) && (tx < R) && (idy < h)) smem[ty+R][tx] = in[idy*w+idx-R];
    // load right bar
    if ((idx < w-R) && (tx >= blockDim.x-R) && (idy < h)) smem[ty+R][tx+2*R] = in[idy*w+idx+R];
    // load top bar
    if ((idx < w) && (idy > R-1) && (idy < h) && (ty < R)) smem[ty][tx+R] = in[(idy-R)*w+idx];
    // load bottom bar
    if ((idx < w) && (idy < h-R) && (ty >=blockDim.y-R)) smem[ty+2*R][tx+R] = in[(idy+R)*w+idx];
    // load UL corner
    if ((idx > R-1) && (idy > R-1) && (tx < R) && (ty < R) && (idy < h) && (idx < w)) smem[ty][tx] = in[(idy-R)*w+idx-R];
    // load UR corner
    if ((idx < w-R) && (tx >= blockDim.x-R) && (idy > R-1) && (ty < R) && (idy < h)) smem[ty][tx+2*R] = in[(idy-R)*w+idx+R];
    // load LL corner
    if ((idx > R-1) && (idy < h-R) && (tx < R) && (ty >= blockDim.y-R) && (idx < w)) smem[ty+2*R][tx] = in[(idy+R)*w+idx-R];
    // load LR corner
    if ((idx < w-R) && (idy < h-R) && (tx >=blockDim.x-R) && (ty >=blockDim.y-R)) smem[ty+2*R][tx+2*R] = in[(idy+R)*w+idx+R];
    __syncthreads();
    if (idx <= R-1 || idy <= R-1 || idx >= w-R || idy >= h-R)
    {
        out[idy*w+idx] = smem[ty+R][tx+R];
        return;
    }

    float sum = 0;

    for(int i = -R; i <=R; i++)
      for(int j = -R; j <=R; j++)
        {
            sum += smem[ty+R+i][tx+R+j] * kernel_gauss5x5[(i + R)*WF+(j + R)];
        }
    out[idy*w+idx] = sum;
}

void gauss5x5_gpu_tiles(float* d_src, float* d_dest, unsigned char* d_result, int w, int h, int cycles)
{
    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W, TILE_H);

    float* src = d_src;
    while (cycles--)
    {
        gauss5x5_tiles_kernel << < dimGrid, dimBlock>> > (src, d_dest, w, h);
        src = d_dest;
    }
}

void gauss5x5_gpu(float* d_src, float* d_dest, unsigned char* d_result, int w, int h, int cycles)
{
    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W, TILE_H);
    float* src = d_src;
    while (cycles--)
    {
        gauss5x5_kernel << < dimGrid, dimBlock>> > (src, d_dest, w, h);
        src = d_dest;
    }
}


void check_error(cudaError_t status)
{
    cudaError_t status2 = cudaGetLastError();
    if (status != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status);
        printf("\n CUDA Error: %s\n", s);
        getchar();
    }
    if (status2 != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status2);
        printf("\n CUDA Error Prev: %s\n", s);
        getchar();
    }
}

void printTime(const char* name, double time)
{
    float fps = 1000 / time;
    printf("%-#40s",name);
    char tmp[32];
    sprintf(tmp, "%0.2f [ms]", time);
    printf("%-#20s%0.2f\n", tmp, fps);
}

#define CHECK_CUDA(X) check_error((cudaError_t)X);


int main(void)
{
    const int IMAGE_W = 5202 ; // pixels
    const int IMAGE_H = 6502 ;
    const int N = 5202 * 6502 * 3;
    const int cycles = 1;
    std::cout << "Filter Radius: " << R << std::endl;
    // image is loaded as RGBA. fill with random values
    float* img_cpu = new float[N];
    float *t1 = new float[N];
    float *t2 = new float[N];
    for (int k = 0; k < N; k++)
        img_cpu[k] = std::rand() % 255;

    float* img_gpu = nullptr;
    CHECK_CUDA(cudaMalloc((void **) &img_gpu, (N * sizeof(float))));
    cudaMemcpy(img_gpu, img_cpu, (N*sizeof(float)), cudaMemcpyHostToDevice);
    float* temp_gpu = nullptr;
    CHECK_CUDA(cudaMalloc((void **) &temp_gpu, (N * sizeof(float))));

    printf("image size: %d x %d\n", IMAGE_W, IMAGE_H);
    printf("%-#40s%-#20s%0s\n", "filter", "time", "FPS");
    printf("---------------------------------------------------------------------\n");

    // warmup
    gauss5x5_gpu_tiles(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());
    cudaMemcpy(t1, temp_gpu, N*sizeof(float),cudaMemcpyDeviceToHost);
    auto timeStart = std::chrono::system_clock::now();
    gauss5x5_gpu_tiles(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());
    auto timeEnd = std::chrono::system_clock::now();
    double dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss5x5_gpu_tiles", dProcessingTime);

    // warmup
    gauss5x5_gpu(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());
    cudaMemcpy(t2, temp_gpu, N*sizeof(float),cudaMemcpyDeviceToHost);
    timeStart = std::chrono::system_clock::now();
    gauss5x5_gpu(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());
    timeEnd = std::chrono::system_clock::now();
    dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss5x5_gpu", dProcessingTime);
    for (int i = 0; i < N; i++) if (t1[i] != t2[i]) {std::cout << "mismatch at: " << i << " t1: " << t1[i] << " t2: " << t2[i] << std::endl; return 0;}

    delete img_cpu;
    cudaFree(img_gpu);
    cudaFree(temp_gpu);

    return 0;
}
$ nvcc -o t13 t13.cu -arch=sm_80
t13.cu: In function ‘void printTime(const char*, double)’:
t13.cu:132:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
  132 |     printf("%-#40s",name);
      |        ^~~~~~~~
t13.cu:135:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
  135 |     printf("%-#20s%0.2f\n", tmp, fps);
      |        ^~~~~~~~~~~~~~~
t13.cu: In function ‘int main()’:
t13.cu:162:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
  162 |     printf("%-#40s%-#20s%0s\n", "filter", "time", "FPS");
      |        ^~~~~~~~~~~~~~~~~~~
t13.cu:162:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
t13.cu:162:8: warning: '0' flag used with ‘%s’ gnu_printf format [-Wformat=]
$ ./t13
Filter Radius: 2
image size: 5202 x 6502
filter                                  time                FPS
---------------------------------------------------------------------
gauss5x5_gpu_tiles                      1.00 [ms]           1000.00
gauss5x5_gpu                            1.00 [ms]           1000.00
$ nvcc -o t13 t13.cu -arch=sm_80 -Xptxas=-v -DRF=8
ptxas info    : 0 bytes gmem, 1156 bytes cmem[3]
ptxas info    : Compiling entry function '_Z21gauss5x5_tiles_kernelPKfPfii' for 'sm_80'
ptxas info    : Function properties for _Z21gauss5x5_tiles_kernelPKfPfii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 23 registers, 6144 bytes smem, 376 bytes cmem[0]
ptxas info    : Compiling entry function '_Z15gauss5x5_kernelPKfPfii' for 'sm_80'
ptxas info    : Function properties for _Z15gauss5x5_kernelPKfPfii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 376 bytes cmem[0]
t13.cu: In function ‘void printTime(const char*, double)’:
t13.cu:132:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
  132 |     printf("%-#40s",name);
      |        ^~~~~~~~
t13.cu:135:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
  135 |     printf("%-#20s%0.2f\n", tmp, fps);
      |        ^~~~~~~~~~~~~~~
t13.cu: In function ‘int main()’:
t13.cu:162:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
  162 |     printf("%-#40s%-#20s%0s\n", "filter", "time", "FPS");
      |        ^~~~~~~~~~~~~~~~~~~
t13.cu:162:8: warning: '#' flag used with ‘%s’ gnu_printf format [-Wformat=]
t13.cu:162:8: warning: '0' flag used with ‘%s’ gnu_printf format [-Wformat=]
$ ./t13
Filter Radius: 8
image size: 5202 x 6502
filter                                  time                FPS
---------------------------------------------------------------------
gauss5x5_gpu_tiles                      8.00 [ms]           125.00
gauss5x5_gpu                            13.00 [ms]          76.92
$

This is on an A100. The A100 is fast enough that the measurement granularity becomes an issue at the smaller filter size. But profiling with nsight systems shows that the kernel duration of the tiled kernel is longer than the non-tiled kernel at the small filter size. But the situation reverses eventually when the filter becomes large enough. Because of my particular block size choice, I don’t think you can trivially increase the filter beyond a radius of 16.

I also don’t think your test case makes sense for cycles > 1. After the first cycle, the input and output buffers are the same. So that isn’t really the same test case. It also doesn’t make sense computationally. Because of the overlap of the border regions, you cannot rationally have the output overwrite the input, and expect to get sensible or predictable results.

Thanks for your help. But your code needs some changes to work for real images, anyway the changes do not alters much the performance results.

I also don’t think your test case makes sense for cycles

Ok, you are right, both buffers should not overlap, I fixed in the demo buffers to swap after each call, but I need the cycles because with only one call FPS varies too much.

it is puzzling to me how you are mixing per-pixel multipliers of 3 and 4.

This is because he original image was RGBA. I saw that you switched to RGB and removed some indexing but still you have to define and the color channels order this to work correctly. I recommend to use NCHW (also called channels-first) , image rows contains [RR…] [GG…] [BB…] instead of [RGB] [RGB] [RGB], the difference is that this simplifies the indexing and some boundary checks (but should not alters the performance results much).

For RTX3060 your test gives rather different results, the Tiled non-apron variant is also slower until the filter size is less than 15 x 15, then it becomes faster but only marginally, here are my results for tile size 32 x 16 and image size 5200 x 6500:
bulr_performance

So for 5x5 I have :

Filter: 5 x 5
image size: 5200 x 6500
filter                                  time                FPS
---------------------------------------------------------------------
gauss_gpu_tiles                         6.65 [ms]           150.38
gauss_gpu                               4.96 [ms]           201.61

And for 17x17 when the tiled filter is faster:

Filter: 17 x 17
image size: 5200 x 6500
filter                                  time                FPS
---------------------------------------------------------------------
gauss_gpu_tiles                         37.83 [ms]          26.43
gauss_gpu                               40.92 [ms]          24.44

So the difference is only 10% in favor of the tiles.

I also tested the apron and non-apron difference in performance. It can be further simplified but if using the NCHW format as I suggested, the non-apron variant performs about 20-25% better than the original apron variant. But as shown above it is still quite behind the non-tiled filter.

This is the full code of my NCHW demo:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <chrono>
#include <iostream>

#ifndef RF
#define RF 8
#endif
const int R = RF;
const int D = 2*R+1;
const int LF = D*D;
const float FC = 1.0f/LF;
const int TILE_W = 32;
const int TILE_H = 16;

__constant__ float kernel_blur[LF] ;

__global__ void gauss_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int x = idx % w;
    unsigned int c = idx / w;   
    unsigned int w3 = 3 * w;

    if ((idx >= w3) || (y >= h))
        return;

    unsigned int id = y * w3 + c*w + x;   // image index (NCHW)

    if (x <= R-1 || y <= R-1 || x >= w-R || y >= h-R)
    {
        out[id] = in[id];
        return;
    }
    
    float sum = 0;
    for(int i = -R; i <=R; i++)
    for(int j = -R; j <=R; j++)
    {
        sum += in[id + j + w3*i] *kernel_blur[(i + R) * D + (j + R)];
    }

    out[id] = sum;
}

__global__ void gauss_tiles_kernel(const float* __restrict__ in, float * __restrict__ out, int w, int h)
{
    const int BLOCK_W = (TILE_W + 2*R);
    const int BLOCK_H = (TILE_H + 2*R);
    __shared__ float smem[BLOCK_H][BLOCK_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int idy = blockIdx.y*blockDim.y+threadIdx.y;

    unsigned int x = idx % w;
    unsigned int c = idx / w;
    unsigned int y = idy;

    int w3 = w * 3;
    int idx_out = y * w3 + c * w + x;

    // load central block
    smem[ty+R][tx+R] = in[idx_out];  //if ((x < w) && (y < h)) 
    // load left bar
    if ((x >= R) && (tx < R)) smem[ty+R][tx] = in[idx_out-R];
    // load right bar
    if ((x < w-R) && (tx >= blockDim.x-R) ) smem[ty+R][tx+2*R] = in[idx_out+R];
    // load top bar
    if ((x < w) && (y > R-1) && (ty < R)) smem[ty][tx+R] = in[idx_out-R*w3];
    // load bottom bar
    if ((x < w) && (y < h-R) && (ty >=blockDim.y-R)) smem[ty+2*R][tx+R] = in[idx_out+R*w3];
    // load UL corner
    if ((x > R-1) && (y > R-1) && (tx < R) && (ty < R)) smem[ty][tx] = in[idx_out - R*w3 - R];
    // load UR corner
    if ((x < w-R) && (y > R-1) && (tx >= blockDim.x-R) && (ty < R) ) smem[ty][tx+2*R] = in[idx_out - R * w3 + R];
    // load LL corner
    if ((x > R-1) && (y < h-R) && (tx < R) && (ty >= blockDim.y-R) ) smem[ty+2*R][tx] = in[idx_out + R * w3 - R];
    // load LR corner
    if ((x < w-R) && (y < h-R) && (tx >=blockDim.x-R) && (ty >= blockDim.y-R)) smem[ty+2*R][tx+2*R] = in[idx_out + R * w3 + R];
    
    __syncthreads();

    if (x <= R-1 || y <= R-1 || x >= w-R || y >= h-R)
    {
        out[idx_out] = in[idx_out];
        return;
    }

    float sum = 0;
    for(int i = -R; i <=R; i++)
    for(int j = -R; j <=R; j++)
    {
        sum += smem[ty+R+i][tx+R+j] * kernel_blur[(i+R)*D+(j+R)];
    }
    out[idx_out] = sum;
}

void gauss_gpu_tiles(float* d_src, float* d_dest, unsigned char* d_result, int w, int h, int cycles)
{
    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W, TILE_H);

    float* src = d_src;
    while (cycles--)
    {
        gauss_tiles_kernel << < dimGrid, dimBlock>> > (src, dst, w, h);
        tmp = dst;
        dst = src;
        src = tmp;;
    }
}

void gauss_gpu(float* d_src, float* d_dest, unsigned char* d_result, int w, int h, int cycles)
{
    dim3 dimGrid ((w*3) / TILE_W, h / TILE_H);
    dim3 dimBlock(TILE_W, TILE_H);
    float* src = d_src,*dst = d_dest, *tmp = d_dest;
    while (cycles--)
    {
        gauss_kernel << < dimGrid, dimBlock>> > (src, dst, w, h);
        tmp = dst;
        dst = src;
        src = tmp;
    }
}

void check_error(cudaError_t status)
{
    cudaError_t status2 = cudaGetLastError();
    if (status != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status);
        printf("\n CUDA Error: %s\n", s);
        getchar();
    }
    if (status2 != cudaSuccess)
    {
        const char *s = cudaGetErrorString(status2);
        printf("\n CUDA Error Prev: %s\n", s);
        getchar();
    }
}

void printTime(const char* name, double time)
{
    float fps = 1000 / time;
    printf("%-#40s",name);
    char tmp[32];
    sprintf(tmp, "%0.2f [ms]", time);
    printf("%-#20s%0.2f\n", tmp, fps);
}

#define CHECK_CUDA(X) check_error((cudaError_t)X);

int main()
{
    const int IMAGE_W = 5200 ; // pixels
    const int IMAGE_H = 6500 ;
    const int N = IMAGE_W * IMAGE_H * 3;
    const int cycles = 100;
    std::cout << "Filter: " << 2*R + 1 << " x " << 2*R + 1 << std::endl;
    // image is loaded as RGBA. fill with random values
    float* img_cpu = new float[N];
    for (int k = 0; k < N; k++)
        img_cpu[k] = std::rand() % 255;

    float *t1 = new float[N];
    float *t2 = new float[N];

    float* kernel_cpu = new float[LF];
    for (int k = 0; k < LF; k++)
        kernel_cpu[k] = 1.0f/LF;

    unsigned char* kernel_gpu_adr;
    CHECK_CUDA(cudaGetSymbolAddress((void **)&kernel_gpu_adr, kernel_blur));
    cudaMemcpy(kernel_gpu_adr, kernel_cpu, LF*sizeof(float),cudaMemcpyHostToDevice);

    float* img_gpu = nullptr;
    CHECK_CUDA(cudaMalloc((void **) &img_gpu, (N * sizeof(float))));
    cudaMemcpy(img_gpu, img_cpu, (N*sizeof(float)), cudaMemcpyHostToDevice);
    float* temp_gpu = nullptr;
    CHECK_CUDA(cudaMalloc((void **) &temp_gpu, (N * sizeof(float))));

    printf("image size: %d x %d\n", IMAGE_W, IMAGE_H);
    printf("%-#40s%-#20s%0s\n", "filter", "time", "FPS");
    printf("---------------------------------------------------------------------\n");

    // warmup
    gauss_gpu_tiles(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, 1);
    CHECK_CUDA(cudaDeviceSynchronize());
    cudaMemcpy(t1, temp_gpu, N*sizeof(float),cudaMemcpyDeviceToHost);
    auto timeStart = std::chrono::system_clock::now();
    gauss_gpu_tiles(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());
    auto timeEnd = std::chrono::system_clock::now();
    double dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss_gpu_tiles", dProcessingTime);

    // warmup
   cudaMemcpy(img_gpu, img_cpu, (N*sizeof(float)), cudaMemcpyHostToDevice);
    gauss_gpu(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, 1);
    CHECK_CUDA(cudaDeviceSynchronize());
    cudaMemcpy(t2, temp_gpu, N*sizeof(float),cudaMemcpyDeviceToHost);
    timeStart = std::chrono::system_clock::now();
    gauss_gpu(img_gpu, temp_gpu, nullptr, IMAGE_W, IMAGE_H, cycles);
    CHECK_CUDA(cudaDeviceSynchronize());
    timeEnd = std::chrono::system_clock::now();
    dProcessingTime = (double)std::chrono::duration_cast<std::chrono::milliseconds>(timeEnd - timeStart).count() / cycles;
    printTime("gauss_gpu", dProcessingTime);
    for (int i = 0; i < N; i++)
    {
        if (t1[i] != t2[i])
        {
            std::cout << "mismatch at: " << i << " t1: " << t1[i] << " t2: " << t2[i] << std::endl; 
        }
    }

    delete img_cpu;
    delete t1;
    delete t2;
    cudaFree(img_gpu);
    cudaFree(temp_gpu);

    return 0;
}