Issue with a much larger grid than data

The following is an excercise code I worked out for the Nvidia CUDA C++ course. It doesn’t produce the right answer. The only difference between my code and the solution is how the grid is defined. My grid size is much larger than 100x200, being 256 x 256 = 65,536. I was thinking that the if statement in the kernel fuction should clearly tell the GPU not to use the redundant blocks and threads.

My grid definition is:

  dim3 threads_per_block(16, 16, 1);
  dim3 number_of_blocks(16, 16, 1);

The grid definition from the solution is more restrained and well calculated:

 dim3 tblocks(32, 16, 1);
 dim3 grid((nj/tblocks.x)+1, (ni/tblocks.y)+1, 1);

Could any one tell me why my code doesn’t work properly?

#include <stdio.h>
#include <math.h>

// Simple define to index into a 1D array from 2D space
#define I2D(num, c, r) ((r)*(num)+(c))

/*
 * `step_kernel_mod` is currently a direct copy of the CPU reference solution
 * `step_kernel_ref` below. Accelerate it to run as a CUDA kernel.
 */

__global__ void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
  int i00, im10, ip10, i0m1, i0p1;
  float d2tdx2, d2tdy2;

  int row = blockIdx.x * blockDim.x + threadIdx.x;
  int col = blockIdx.y * blockDim.y + threadIdx.y;

  int j = row;
  int i = col;

  // loop over all points in domain (except boundary)
  if (j >= 1 && j < nj -1 && i >=1 && i < ni - 1)
  {
      // find indices into linear memory
      // for central point and neighbours
      i00 = I2D(ni, i, j);
      im10 = I2D(ni, i-1, j);
      ip10 = I2D(ni, i+1, j);
      i0m1 = I2D(ni, i, j-1);
      i0p1 = I2D(ni, i, j+1);

      // evaluate derivatives
      d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
      d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];

      // update temperatures
      temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
      //printf("Thread (%d, %d): i00 = %d, temp_in[i00] = %f, temp_out[i00] = %f\n", i, j, i00, temp_in[i00], temp_out[i00]);
      // if (j == 1) printf("j = %d\n", j);
      // if (i == 1) printf("i = %d\n", i);
      // if (j == nj - 2) printf("nj - 2 = %d\n", nj - 2);
      // if (i == ni - 2) printf("ni - 2 = %d\n", ni - 2);
      // if (i == 198) printf("i = %d", i);
  }
}

void step_kernel_ref(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
  int i00, im10, ip10, i0m1, i0p1;
  float d2tdx2, d2tdy2;


  // loop over all points in domain (except boundary)
  for ( int j=1; j < nj-1; j++ ) {
    for ( int i=1; i < ni-1; i++ ) {
      // find indices into linear memory
      // for central point and neighbours
      i00 = I2D(ni, i, j);
      im10 = I2D(ni, i-1, j);
      ip10 = I2D(ni, i+1, j);
      i0m1 = I2D(ni, i, j-1);
      i0p1 = I2D(ni, i, j+1);

      // evaluate derivatives
      d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
      d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];

      // update temperatures
      temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
    }
  }
}

int main()
{
  int istep;
  int nstep = 200; // number of time steps

  // Specify our 2D dimensions
  const int ni = 200;
  const int nj = 100;
  float tfac = 8.418e-5; // thermal diffusivity of silver

  float *temp1_ref, *temp2_ref, *temp1, *temp2, *temp_tmp;

  const int size = ni * nj * sizeof(float);

  temp1_ref = (float*)malloc(size);
  temp2_ref = (float*)malloc(size);
  // temp1 = (float*)malloc(size);
  // temp2 = (float*)malloc(size);

  // allocate memory for both CPU and GPU
  cudaMallocManaged (&temp1, size);
  cudaMallocManaged (&temp2, size);

  // Initialize with random data
  for( int i = 0; i < ni*nj; ++i) {
    temp1_ref[i] = temp2_ref[i] = temp1[i] = temp2[i] = (float)rand()/(float)(RAND_MAX/100.0f);
    cudaDeviceSynchronize();
  }
  
  

  // Execute the CPU-only reference version
  for (istep=0; istep < nstep; istep++) {
    step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref);

    // swap the temperature pointers
    temp_tmp = temp1_ref;
    temp1_ref = temp2_ref;
    temp2_ref= temp_tmp;
  }

  // Execute the modified version using same data
  dim3 threads_per_block(16, 16, 1);
  dim3 number_of_blocks(16, 16, 1);

  
  for (istep=0; istep < nstep; istep++) {
    step_kernel_mod<<<number_of_blocks, threads_per_block>>>(ni, nj, tfac, temp1, temp2);

    // swap the temperature pointers
    temp_tmp = temp1;
    temp1 = temp2;
    temp2= temp_tmp;
  }

  float maxError = 0;
  // Output should always be stored in the temp1 and temp1_ref at this point
  for( int i = 0; i < ni*nj; ++i ) {
    if (abs(temp1[i]-temp1_ref[i]) > maxError) { maxError = abs(temp1[i]-temp1_ref[i]); }
  }

  // Check and see if our maxError is greater than an error bound
  if (maxError > 0.0005f)
    printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError);
  else
    printf("The Max Error of %.5f is within acceptable bounds.\n", maxError);

  free( temp1_ref );
  free( temp2_ref );
  cudaFree( temp1 );
  cudaFree( temp2 );

  return 0;
}

The issue is that you need a cudaDeviceSynchronize(); statement, after you are done with the kernel launch loop, but before you begin doing error checking.

Kernel launches are asynchronous. You must wait for the kernel activity to complete before you begin checking the results.

It’s possible that a very large grid “masks” the problem, although I haven’t studied that. However the issue is not the grid size, it is the lack of synchronization.

Thanks I think you are right. I did try to put sync in but may still have missed it due to some reasons.

Hi team i am trying to launch the kernel but i am facing an issue, i had implemented the function in cpp for blending the imges now i want to launch the kernel i had done it but i am getting the out put that src image is proper and the destination image i am getting stripes in that (simply i am getting the improper image )its actually 2d image. and the code is the kernal.cu file i am passing
" static global void overlayKernel(uchar4* dst, const uchar4* overlay, int dstRows, int dstCols, int overlayRows, int overlayCols, int locationX, int locationY) "
in that i am Calculate the current thread’s position(x = blockIdx.x * blockDim.x + threadIdx.x;)
and Checking bounds to ensure we are within the destination image and Getting the overlay pixel at the current position like this
uchar4 overlayPx = overlay[fY * overlayCols + fX];
uchar4 dstPx = dst[y * dstCols + x];

Computing the opacity from the overlay image’s alpha channel
float opacity = overlayPx.w / 255.0f;
based on that If the opacity is greater than zero, blend the overlay pixel with the destination pixel
if (opacity > 0.0f) {
uchar4 resultPx;
resultPx.x = dstPx.x * (1.0f - opacity) + overlayPx.x * opacity; this will be done
getting no error but in the output image i am getting stripes can anyone help me to solve


There are 2 images in it one we can see and the above one with stripes i am getting but without kernel launch i am getting it properly but in cuda kernal launch its getting like above image.
can anyone help me to solve it.

Just again i realized I’ve already added the cudaDeviceSynchronize() in but at a wrong place. I was in a rush debugging while worrying about the time limit for total online study hours… So the problem has been carried on and on…

Hi team i am trying to launch the kernel but i am facing an issue, i had implemented the function in cpp for blending the imges now i want to launch the kernel i had done it but i am getting the out put that src image is proper and the destination image i am getting stripes in that (simply i am getting the improper image )its actually 2d image. and the code is the kernal.cu file i am passing
" static global void overlayKernel(uchar4* dst, const uchar4* overlay, int dstRows, int dstCols, int overlayRows, int overlayCols, int locationX, int locationY) "
in that i am Calculate the current thread’s position(x = blockIdx.x * blockDim.x + threadIdx.x;)
and Checking bounds to ensure we are within the destination image and Getting the overlay pixel at the current position like this
uchar4 overlayPx = overlay[fY * overlayCols + fX];
uchar4 dstPx = dst[y * dstCols + x];

Computing the opacity from the overlay image’s alpha channel
float opacity = overlayPx.w / 255.0f;
based on that If the opacity is greater than zero, blend the overlay pixel with the destination pixel
if (opacity > 0.0f) {
uchar4 resultPx;
resultPx.x = dstPx.x * (1.0f - opacity) + overlayPx.x * opacity; this will be done
getting no error but in the output image i am getting stripes can anyone help me to solve

Please format the code.

***************this is the defination in the deffile.cu
void overlayImage(cv::cuda::GpuMat& srcGpu, const cv::cuda::GpuMat& overlayGpu, const cv::Point& location){
if (srcGpu.type() != CV_8UC4 || overlayGpu.type() != CV_8UC4){
throw std::runtime_error(“Images must be of type CV_8UC4.”);
}
int srcWidth = srcGpu.cols;
int srcHeight = srcGpu.rows;
int overlayWidth = overlayGpu.cols;
int overlayHeight = overlayGpu.rows;
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((srcWidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (srcHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);
overlayKernel<<<numBlocks, threadsPerBlock>>>(srcGpu.ptr(), overlayGpu.ptr(),
srcWidth, srcHeight, overlayWidth, overlayHeight,
location.x, location.y);
cudaDeviceSynchronize(); // Ensure the kernel has finished executing
}
**************the defination of the overlay kernel in kernel .cu file ****************************
global void overlayKernel(uchar4
src, const uchar4
overlay, int srcWidth, int srcHeight, int overlayWidth, int overlayHeight, int locationX, int locationY)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= srcWidth || y >= srcHeight)
return;
int overlayX = x - locationX;
int overlayY = y - locationY;
if (overlayX >= 0 && overlayX < overlayWidth && overlayY >= 0 && overlayY < overlayHeight)
{ uchar4 srcPixel = src[y * srcWidth + x];
uchar4 overlayPixel = overlay[overlayY * overlayWidth + overlayX];
float opacity = overlayPixel.w / 255.0f;
src[y * srcWidth + x].x = (unsigned char)((float)srcPixel.x * (1.0f - opacity) + (float)overlayPixel.x * opacity);
src[y * srcWidth + x].y = (unsigned char)((float)srcPixel.y * (1.0f - opacity) + (float)overlayPixel.y * opacity);
src[y * srcWidth + x].z = (unsigned char)((float)srcPixel.z * (1.0f - opacity) + (float)overlayPixel.z * opacity);
src[y * srcWidth + x].w = 255; // Alpha channel set to opaque
}
}

in the same function if we pass the gpu_images it was not working but if we pass the cpu_images then its working can you help me to figure it out what i am missing.

The code formatting works with the </> button or with ` or ```` delimiters.