The following is an excercise code I worked out for the Nvidia CUDA C++ course. It doesn’t produce the right answer. The only difference between my code and the solution is how the grid is defined. My grid size is much larger than 100x200, being 256 x 256 = 65,536. I was thinking that the if statement in the kernel fuction should clearly tell the GPU not to use the redundant blocks and threads.
My grid definition is:
dim3 threads_per_block(16, 16, 1);
dim3 number_of_blocks(16, 16, 1);
The grid definition from the solution is more restrained and well calculated:
dim3 tblocks(32, 16, 1);
dim3 grid((nj/tblocks.x)+1, (ni/tblocks.y)+1, 1);
Could any one tell me why my code doesn’t work properly?
#include <stdio.h>
#include <math.h>
// Simple define to index into a 1D array from 2D space
#define I2D(num, c, r) ((r)*(num)+(c))
/*
* `step_kernel_mod` is currently a direct copy of the CPU reference solution
* `step_kernel_ref` below. Accelerate it to run as a CUDA kernel.
*/
__global__ void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
int i00, im10, ip10, i0m1, i0p1;
float d2tdx2, d2tdy2;
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
int j = row;
int i = col;
// loop over all points in domain (except boundary)
if (j >= 1 && j < nj -1 && i >=1 && i < ni - 1)
{
// find indices into linear memory
// for central point and neighbours
i00 = I2D(ni, i, j);
im10 = I2D(ni, i-1, j);
ip10 = I2D(ni, i+1, j);
i0m1 = I2D(ni, i, j-1);
i0p1 = I2D(ni, i, j+1);
// evaluate derivatives
d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];
// update temperatures
temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
//printf("Thread (%d, %d): i00 = %d, temp_in[i00] = %f, temp_out[i00] = %f\n", i, j, i00, temp_in[i00], temp_out[i00]);
// if (j == 1) printf("j = %d\n", j);
// if (i == 1) printf("i = %d\n", i);
// if (j == nj - 2) printf("nj - 2 = %d\n", nj - 2);
// if (i == ni - 2) printf("ni - 2 = %d\n", ni - 2);
// if (i == 198) printf("i = %d", i);
}
}
void step_kernel_ref(int ni, int nj, float fact, float* temp_in, float* temp_out)
{
int i00, im10, ip10, i0m1, i0p1;
float d2tdx2, d2tdy2;
// loop over all points in domain (except boundary)
for ( int j=1; j < nj-1; j++ ) {
for ( int i=1; i < ni-1; i++ ) {
// find indices into linear memory
// for central point and neighbours
i00 = I2D(ni, i, j);
im10 = I2D(ni, i-1, j);
ip10 = I2D(ni, i+1, j);
i0m1 = I2D(ni, i, j-1);
i0p1 = I2D(ni, i, j+1);
// evaluate derivatives
d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10];
d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1];
// update temperatures
temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2);
}
}
}
int main()
{
int istep;
int nstep = 200; // number of time steps
// Specify our 2D dimensions
const int ni = 200;
const int nj = 100;
float tfac = 8.418e-5; // thermal diffusivity of silver
float *temp1_ref, *temp2_ref, *temp1, *temp2, *temp_tmp;
const int size = ni * nj * sizeof(float);
temp1_ref = (float*)malloc(size);
temp2_ref = (float*)malloc(size);
// temp1 = (float*)malloc(size);
// temp2 = (float*)malloc(size);
// allocate memory for both CPU and GPU
cudaMallocManaged (&temp1, size);
cudaMallocManaged (&temp2, size);
// Initialize with random data
for( int i = 0; i < ni*nj; ++i) {
temp1_ref[i] = temp2_ref[i] = temp1[i] = temp2[i] = (float)rand()/(float)(RAND_MAX/100.0f);
cudaDeviceSynchronize();
}
// Execute the CPU-only reference version
for (istep=0; istep < nstep; istep++) {
step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref);
// swap the temperature pointers
temp_tmp = temp1_ref;
temp1_ref = temp2_ref;
temp2_ref= temp_tmp;
}
// Execute the modified version using same data
dim3 threads_per_block(16, 16, 1);
dim3 number_of_blocks(16, 16, 1);
for (istep=0; istep < nstep; istep++) {
step_kernel_mod<<<number_of_blocks, threads_per_block>>>(ni, nj, tfac, temp1, temp2);
// swap the temperature pointers
temp_tmp = temp1;
temp1 = temp2;
temp2= temp_tmp;
}
float maxError = 0;
// Output should always be stored in the temp1 and temp1_ref at this point
for( int i = 0; i < ni*nj; ++i ) {
if (abs(temp1[i]-temp1_ref[i]) > maxError) { maxError = abs(temp1[i]-temp1_ref[i]); }
}
// Check and see if our maxError is greater than an error bound
if (maxError > 0.0005f)
printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError);
else
printf("The Max Error of %.5f is within acceptable bounds.\n", maxError);
free( temp1_ref );
free( temp2_ref );
cudaFree( temp1 );
cudaFree( temp2 );
return 0;
}