Problems when using shared memory

jibinghu23 · May 15, 2024, 2:38am

Hi,

I recently started using cuda and I have some troubles using shared memory. I hope someone here can help me figure out the problems within my code. I employed two different methods for the calculation: a simple approach and a more complex one utilizing shared memory. However, I observed that the full utilization of shared memory led to degraded performance, resulting in longer computation time for the kernel function. I would appreciate it if someone could kindly provide an explanation. Thank you very much.

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>

const int Nx = 1024 * 2;
const int Ny = 1024 * 2;
const int blockSizeX = 32;
const int blockSizeY = 32;
const int TIMES = 10;

// The program calculates by dividing the data in the global memory into shared memory blocks. 
// Edge handling is done by extra processing in edge threads and synchronization.

__global__ void two_stencil(const int n, const double *__restrict__ in_xy, double *__restrict__ out_xy)
{
  int xindex = blockIdx.x * blockDim.x + threadIdx.x;
  int yindex = blockIdx.y * blockDim.y + threadIdx.y;
  // Declare shared memory
  extern __shared__ double tiled[];
  // Define shared memory index and use padding to avoid bank conflict
  int shared_index = (threadIdx.y + 1) * (blockDim.x + 2 + 1) + (threadIdx.x + 1);
  // Copy data to shared memory and handle edges
  if (xindex < Nx && yindex < Ny)
  {
    tiled[shared_index] = in_xy[yindex * Nx + xindex];

    if (threadIdx.x == 0)
      tiled[shared_index - 1] = (xindex > 0) ? in_xy[yindex * Nx + xindex - 1] : 0.0;
    if (threadIdx.x == blockDim.x - 1)
      tiled[shared_index + 1] = (xindex < Nx - 1) ? in_xy[yindex * Nx + xindex + 1] : 0.0;
    if (threadIdx.y == 0)
      tiled[shared_index - (blockDim.x + 2)] = (yindex > 0) ? in_xy[(yindex - 1) * Nx + xindex] : 0.0;
    if (threadIdx.y == blockDim.y - 1)
      tiled[shared_index + (blockDim.x + 2)] = (yindex < Ny - 1) ? in_xy[(yindex + 1) * Nx + xindex] : 0.0;
  }

  __syncthreads();
  
  // Weighted average calculation
  if ((0 < xindex && xindex < (Nx - 1)) && (0 < yindex && yindex < (Ny - 1)))
  {
    out_xy[yindex * Nx + xindex] = 0.2 * (tiled[shared_index] +
                                          tiled[shared_index - 1] +
                                          tiled[shared_index + 1] +
                                          tiled[shared_index - (blockDim.x + 2)] +
                                          tiled[shared_index + (blockDim.x + 2)]);
  }
}

void fill_array(const int n, double *array)
{
  double init = (rand() % 1000) * 0.2;
  for (int ii = 0; ii < n; ++ii)
  {
    *(array + ii) = init + ii * 0.00001;
  }
}

inline int64_t GetUsec()
{
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (tv.tv_sec * 1000000l) + tv.tv_usec;
}

int main()
{
  srand(202405);
  double *host_in_xy = new double[Nx * Ny];
  double *host_out_xy = new double[Nx * Ny];
  fill_array(Nx * Ny, host_in_xy);
  printf("host_in_xy[1000]=%.5f\n", host_in_xy[1000]);
  fflush(stdout);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  double *dev_in_xy = nullptr, *dev_out_xy = nullptr;
  cudaError_t result = cudaMalloc(&dev_in_xy, sizeof(double) * (size_t)Nx * (size_t)Ny);
  result = cudaMalloc(&dev_out_xy, sizeof(double) * (size_t)Nx * (size_t)Ny);
  printf("result=%d\n", result);
  cudaMemcpy(dev_in_xy, host_in_xy, sizeof(double) * (size_t)Nx * (size_t)Ny, cudaMemcpyHostToDevice);
  cudaMemset(dev_out_xy, 0, sizeof(double) * (size_t)Nx * (size_t)Ny);

  int numBlocksX = (Nx + blockSizeX - 1) / blockSizeX;
  int numBlocksY = (Ny + blockSizeY - 1) / blockSizeY;
  printf("numBlocks=%d", numBlocksX*numBlocksY);
  fflush(stdout);

  // warm up
  // Call block as a 2D grid
  two_stencil<<<dim3(numBlocksX, numBlocksY), dim3(blockSizeX, blockSizeY), sizeof(double) * (blockSizeX + 2) * (blockSizeY + 2)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
  cudaDeviceSynchronize();

  cudaEventRecord(start);
  int64_t ustart = GetUsec();
  for (int loop = 0; loop < TIMES; ++loop)
  {
    two_stencil<<<dim3(numBlocksX, numBlocksY), dim3(blockSizeX, blockSizeY), sizeof(double) * (blockSizeX + 2) * (blockSizeY + 2)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
  }
  cudaDeviceSynchronize();
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  int64_t ufinish = GetUsec();
  cudaMemcpy(host_out_xy, dev_out_xy, sizeof(double) * (size_t)Nx * (size_t)Ny, cudaMemcpyDeviceToHost);
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("kernel time=%.5f\n", ms / TIMES);
  fflush(stdout);
  printf("kernel usec=%ld\nhost_out_xy[10000]=%.5f, host_out_xy[Nx*Ny - Nx - 16]=%.5f\n", (ufinish - ustart) / TIMES, host_out_xy[10000], host_out_xy[Nx * Ny - Nx - 16]);
  fflush(stdout);

  cudaFree(dev_in_xy);
  dev_in_xy = nullptr;
  cudaFree(dev_out_xy);
  dev_out_xy = nullptr;
  delete[] host_in_xy;
  host_in_xy = nullptr;
  delete[] host_out_xy;
  host_out_xy = nullptr;
  return 0;
}

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>

const int Nx = 1024 * 2;
const int Ny = 1024 * 2;
const int blockSize = 256;  // Thread block size
const int TIMES = 10;

// Stencil computation kernel function optimized using shared memory
__global__ void two_stencil_optimized(const int n, const double * __restrict__ in_xy, double * __restrict__ out_xy) {
    extern __shared__ double tile[];
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int yindex = index / Nx;
    int xindex = index % Nx;

    // Load input data into shared memory
    tile[threadIdx.x] = in_xy[index];

    __syncthreads();

    // Ensure only internal data is calculated
    if ((0 < xindex && xindex < (Nx - 1)) && (0 < yindex && yindex < (Ny - 1))) {
        out_xy[index] = 0.2 * (tile[threadIdx.x] + in_xy[index - 1] + in_xy[index + 1] + in_xy[index - Nx] + in_xy[index + Nx]);
    }

    __syncthreads(); // Ensure all operations are completed before continuing
}

void fill_array(const int n, double *array) {
    double init = (rand() % 1000) * 0.2;
    for (int ii = 0; ii < n; ++ii) {
        *(array + ii) = init + ii * 0.00001;
    }
}

inline int64_t GetUsec() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (tv.tv_sec * 1000000l) + tv.tv_usec;
}

int main() {
    srand(202405);
    double *host_in_xy = new double[Nx * Ny];
    double *host_out_xy = new double[Nx * Ny];
    fill_array(Nx * Ny, host_in_xy);
    printf("host_in_xy[1000]=%.5f\n", host_in_xy[1000]);
    
    // CUDA event creation and timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    double *dev_in_xy = nullptr, *dev_out_xy = nullptr;
    cudaMalloc(&dev_in_xy, sizeof(double) * Nx * Ny);
    cudaMalloc(&dev_out_xy, sizeof(double) * Nx * Ny);
    cudaMemcpy(dev_in_xy, host_in_xy, sizeof(double) * Nx * Ny, cudaMemcpyHostToDevice);
    cudaMemset(dev_out_xy, 0, sizeof(double) * Nx * Ny);

    int numBlocks = (Nx * Ny + blockSize - 1) / blockSize;
    printf("numBlocks=%d\n", numBlocks);
    
    // warm up
    two_stencil_optimized<<<dim3(numBlocks, 1, 1), dim3(blockSize, 1, 1), blockSize * sizeof(double)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
    cudaDeviceSynchronize();

    cudaEventRecord(start);
    int64_t ustart = GetUsec();
    for (int loop = 0; loop < TIMES; ++loop) {
        two_stencil_optimized<<<dim3(numBlocks, 1, 1), dim3(blockSize, 1, 1), blockSize * sizeof(double)>>>(Nx * Ny, dev_in_xy, dev_out_xy);
    }
    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    int64_t ufinish = GetUsec();
    cudaMemcpy(host_out_xy, dev_out_xy, sizeof(double) * Nx * Ny, cudaMemcpyDeviceToHost);
    
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel time=%.5f\n", ms / TIMES);
    printf("kernel usec=%ld, host_out_xy[10000]=%.5f, host_out_xy[Nx*Ny - Nx - 16]=%.5f\n", (ufinish - ustart) / TIMES, host_out_xy[10000], host_out_xy[Nx * Ny - Nx - 16]);
    
    cudaFree(dev_in_xy);
    cudaFree(dev_out_xy);
    delete[] host_in_xy;
    delete[] host_out_xy;

    return 0;
}

Robert_Crovella · May 15, 2024, 2:32pm

When I run your codes on a L4 GPU on CUDA 12.2 on linux, I get the following reports. From the first code you have posted, I get:

kernel time=0.28438

From the second code you have posted, I get:

kernel time=0.28106

So the difference is quite small, about 1.2%

Why didn’t the first version show better performance, since it uses shared memory “fully”?

There are probably 2 factors to consider:

Not enough reuse of data. There is a cost/overhead to load shared memory, and the cost/overhead doesn’t have much to do with the stencil size, in many cases. In order to recover this cost and show a benefit, a certain level of reuse of data is required. A reuse factor of 5 is apparently not enough.
Benefit of cache. Modern GPUs have larger L1 and L2 caches, and so the benefit we get from them may be greater. The caches make your “non-fully-shared” code not have as much of a performance loss as might be the case with smaller caches.

Topic		Replies	Views
Memory problem? ...incredible slowdown CUDA Programming and Performance	29	16295	January 30, 2011
Cuda Latency problems Slow Cuda CUDA Programming and Performance	15	13931	September 5, 2008
CUDA and Quadro FX 580 memory usage/corruption CUDA Programming and Performance	0	716	July 27, 2014
When i use CUDA, my program gets slower CUDA Programming and Performance	5	528	April 2, 2019
Why cannot run this program CUDA Programming and Performance	7	951	May 11, 2018
Using shared memory CUDA Programming and Performance	3	83	August 9, 2024
Serial vs Parallel Performance CUDA Programming and Performance	5	904	April 3, 2017
[SOLVED] Code his own shared memory with device memory! CUDA Programming and Performance	15	2559	October 7, 2015
Outrageous speedups over serial code: advice needed CUDA Programming and Performance	17	2801	November 18, 2012
Efficient use of shared memory CUDA Programming and Performance	29	4383	December 2, 2019

Problems when using shared memory

Related topics