Problems when using shared memory

Hi,

I recently started using cuda and I have some troubles using shared memory. I hope someone here can help me figure out the problems within my code. I employed two different methods for the calculation: a simple approach and a more complex one utilizing shared memory. However, I observed that the full utilization of shared memory led to degraded performance, resulting in longer computation time for the kernel function. I would appreciate it if someone could kindly provide an explanation. Thank you very much.

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>

const int Nx = 1024 * 2;
const int Ny = 1024 * 2;
const int blockSizeX = 32;
const int blockSizeY = 32;
const int TIMES = 10;

// The program calculates by dividing the data in the global memory into shared memory blocks. 
// Edge handling is done by extra processing in edge threads and synchronization.

__global__ void two_stencil(const int n, const double *__restrict__ in_xy, double *__restrict__ out_xy)
{
  int xindex = blockIdx.x * blockDim.x + threadIdx.x;
  int yindex = blockIdx.y * blockDim.y + threadIdx.y;
  // Declare shared memory
  extern __shared__ double tiled[];
  // Define shared memory index and use padding to avoid bank conflict
  int shared_index = (threadIdx.y + 1) * (blockDim.x + 2 + 1) + (threadIdx.x + 1);
  // Copy data to shared memory and handle edges
  if (xindex < Nx && yindex < Ny)
  {
    tiled[shared_index] = in_xy[yindex * Nx + xindex];

    if (threadIdx.x == 0)
      tiled[shared_index - 1] = (xindex > 0) ? in_xy[yindex * Nx + xindex - 1] : 0.0;
    if (threadIdx.x == blockDim.x - 1)
      tiled[shared_index + 1] = (xindex < Nx - 1) ? in_xy[yindex * Nx + xindex + 1] : 0.0;
    if (threadIdx.y == 0)
      tiled[shared_index - (blockDim.x + 2)] = (yindex > 0) ? in_xy[(yindex - 1) * Nx + xindex] : 0.0;
    if (threadIdx.y == blockDim.y - 1)
      tiled[shared_index + (blockDim.x + 2)] = (yindex < Ny - 1) ? in_xy[(yindex + 1) * Nx + xindex] : 0.0;
  }

  __syncthreads();
  
  // Weighted average calculation
  if ((0 < xindex && xindex < (Nx - 1)) && (0 < yindex && yindex < (Ny - 1)))
  {
    out_xy[yindex * Nx + xindex] = 0.2 * (tiled[shared_index] +
                                          tiled[shared_index - 1] +
                                          tiled[shared_index + 1] +
                                          tiled[shared_index - (blockDim.x + 2)] +
                                          tiled[shared_index + (blockDim.x + 2)]);
  }
}

void fill_array(const int n, double *array)
{
  double init = (rand() % 1000) * 0.2;
  for (int ii = 0; ii < n; ++ii)
  {
    *(array + ii) = init + ii * 0.00001;
  }
}

inline int64_t GetUsec()
{
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (tv.tv_sec * 1000000l) + tv.tv_usec;
}

int main()
{
  srand(202405);
  double *host_in_xy = new double[Nx * Ny];
  double *host_out_xy = new double[Nx * Ny];
  fill_array(Nx * Ny, host_in_xy);
  printf("host_in_xy[1000]=%.5f\n", host_in_xy[1000]);
  fflush(stdout);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  double *dev_in_xy = nullptr, *dev_out_xy = nullptr;
  cudaError_t result = cudaMalloc(&dev_in_xy, sizeof(double) * (size_t)Nx * (size_t)Ny);
  result = cudaMalloc(&dev_out_xy, sizeof(double) * (size_t)Nx * (size_t)Ny);
  printf("result=%d\n", result);
  cudaMemcpy(dev_in_xy, host_in_xy, sizeof(double) * (size_t)Nx * (size_t)Ny, cudaMemcpyHostToDevice);
  cudaMemset(dev_out_xy, 0, sizeof(double) * (size_t)Nx * (size_t)Ny);

  int numBlocksX = (Nx + blockSizeX - 1) / blockSizeX;
  int numBlocksY = (Ny + blockSizeY - 1) / blockSizeY;
  printf("numBlocks=%d", numBlocksX*numBlocksY);
  fflush(stdout);

  // warm up
  // Call block as a 2D grid
  two_stencil<<<dim3(numBlocksX, numBlocksY), dim3(blockSizeX, blockSizeY), sizeof(double) * (blockSizeX + 2) * (blockSizeY + 2)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
  cudaDeviceSynchronize();

  cudaEventRecord(start);
  int64_t ustart = GetUsec();
  for (int loop = 0; loop < TIMES; ++loop)
  {
    two_stencil<<<dim3(numBlocksX, numBlocksY), dim3(blockSizeX, blockSizeY), sizeof(double) * (blockSizeX + 2) * (blockSizeY + 2)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
  }
  cudaDeviceSynchronize();
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  int64_t ufinish = GetUsec();
  cudaMemcpy(host_out_xy, dev_out_xy, sizeof(double) * (size_t)Nx * (size_t)Ny, cudaMemcpyDeviceToHost);
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("kernel time=%.5f\n", ms / TIMES);
  fflush(stdout);
  printf("kernel usec=%ld\nhost_out_xy[10000]=%.5f, host_out_xy[Nx*Ny - Nx - 16]=%.5f\n", (ufinish - ustart) / TIMES, host_out_xy[10000], host_out_xy[Nx * Ny - Nx - 16]);
  fflush(stdout);

  cudaFree(dev_in_xy);
  dev_in_xy = nullptr;
  cudaFree(dev_out_xy);
  dev_out_xy = nullptr;
  delete[] host_in_xy;
  host_in_xy = nullptr;
  delete[] host_out_xy;
  host_out_xy = nullptr;
  return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>

const int Nx = 1024 * 2;
const int Ny = 1024 * 2;
const int blockSize = 256;  // Thread block size
const int TIMES = 10;

// Stencil computation kernel function optimized using shared memory
__global__ void two_stencil_optimized(const int n, const double * __restrict__ in_xy, double * __restrict__ out_xy) {
    extern __shared__ double tile[];
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int yindex = index / Nx;
    int xindex = index % Nx;

    // Load input data into shared memory
    tile[threadIdx.x] = in_xy[index];

    __syncthreads();

    // Ensure only internal data is calculated
    if ((0 < xindex && xindex < (Nx - 1)) && (0 < yindex && yindex < (Ny - 1))) {
        out_xy[index] = 0.2 * (tile[threadIdx.x] + in_xy[index - 1] + in_xy[index + 1] + in_xy[index - Nx] + in_xy[index + Nx]);
    }

    __syncthreads(); // Ensure all operations are completed before continuing
}

void fill_array(const int n, double *array) {
    double init = (rand() % 1000) * 0.2;
    for (int ii = 0; ii < n; ++ii) {
        *(array + ii) = init + ii * 0.00001;
    }
}

inline int64_t GetUsec() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (tv.tv_sec * 1000000l) + tv.tv_usec;
}

int main() {
    srand(202405);
    double *host_in_xy = new double[Nx * Ny];
    double *host_out_xy = new double[Nx * Ny];
    fill_array(Nx * Ny, host_in_xy);
    printf("host_in_xy[1000]=%.5f\n", host_in_xy[1000]);
    
    // CUDA event creation and timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    double *dev_in_xy = nullptr, *dev_out_xy = nullptr;
    cudaMalloc(&dev_in_xy, sizeof(double) * Nx * Ny);
    cudaMalloc(&dev_out_xy, sizeof(double) * Nx * Ny);
    cudaMemcpy(dev_in_xy, host_in_xy, sizeof(double) * Nx * Ny, cudaMemcpyHostToDevice);
    cudaMemset(dev_out_xy, 0, sizeof(double) * Nx * Ny);

    int numBlocks = (Nx * Ny + blockSize - 1) / blockSize;
    printf("numBlocks=%d\n", numBlocks);
    
    // warm up
    two_stencil_optimized<<<dim3(numBlocks, 1, 1), dim3(blockSize, 1, 1), blockSize * sizeof(double)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
    cudaDeviceSynchronize();

    cudaEventRecord(start);
    int64_t ustart = GetUsec();
    for (int loop = 0; loop < TIMES; ++loop) {
        two_stencil_optimized<<<dim3(numBlocks, 1, 1), dim3(blockSize, 1, 1), blockSize * sizeof(double)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
    }
    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    int64_t ufinish = GetUsec();
    cudaMemcpy(host_out_xy, dev_out_xy, sizeof(double) * Nx * Ny, cudaMemcpyDeviceToHost);
    
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel time=%.5f\n", ms / TIMES);
    printf("kernel usec=%ld, host_out_xy[10000]=%.5f, host_out_xy[Nx*Ny - Nx - 16]=%.5f\n", (ufinish - ustart) / TIMES, host_out_xy[10000], host_out_xy[Nx * Ny - Nx - 16]);
    
    cudaFree(dev_in_xy);
    cudaFree(dev_out_xy);
    delete[] host_in_xy;
    delete[] host_out_xy;

    return 0;
}

When I run your codes on a L4 GPU on CUDA 12.2 on linux, I get the following reports. From the first code you have posted, I get:

kernel time=0.28438

From the second code you have posted, I get:

kernel time=0.28106

So the difference is quite small, about 1.2%

Why didn’t the first version show better performance, since it uses shared memory “fully”?

There are probably 2 factors to consider:

  1. Not enough reuse of data. There is a cost/overhead to load shared memory, and the cost/overhead doesn’t have much to do with the stencil size, in many cases. In order to recover this cost and show a benefit, a certain level of reuse of data is required. A reuse factor of 5 is apparently not enough.

  2. Benefit of cache. Modern GPUs have larger L1 and L2 caches, and so the benefit we get from them may be greater. The caches make your “non-fully-shared” code not have as much of a performance loss as might be the case with smaller caches.