Hello,
I am working on a big kernel and I am getting some “weird” results.
I work with 2D arrays T(x,y), plotting T as a function of x, for each y.
The CPU reference code shows a smooth curve in Tecplot.
When I run the kernel with CUDA, some points clearly diverge, at regular intervals. I tried changing the block size, and it turns out the positions of the diverging points are multiples of the block size.
I searched the web and tried changing my kernel in every way I could imagine, but I could not find a solution to this problem.
I suspect it is not a very difficult problem, so I think I am missing a fundamental point.
Can someone explain the reason for the divergences I observe and what I did wrong?
I have isolated the problem, so I am including a small piece of code that exhibits the issue, hoping it will make my explanation clearer.
/* Allocate the GPU arrays and call the kernel */
void myFunc(int ni, int nj, int nit, float* ro1)
{
    float *d_ro1;
    float *d_dro1;
    float *d_flu1;
    size_t pitch_bytes_ro;
    int pitch_ro;
    int size_ro;

    size_ro = (ni+3) * sizeof(float);   /* row width in bytes */

    /**********************************/
    // Allocate pitched arrays on the device
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_ro1,  &pitch_bytes_ro, size_ro, nj+3));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_dro1, &pitch_bytes_ro, size_ro, nj+3));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&d_flu1, &pitch_bytes_ro, size_ro, nj+3));
    pitch_ro = pitch_bytes_ro / sizeof(float);   /* pitch in elements */

    // Fill arrays on the device (initialization)
    CUDA_SAFE_CALL(cudaMemcpy2D(d_ro1, pitch_bytes_ro, ro1, sizeof(float)*(ni+3),
                                sizeof(float)*(ni+3), nj+3, cudaMemcpyHostToDevice));

    /**********************************/
    /* Grid and block definition */
    int block_x = BLOCK_X;
    int block_y = BLOCK_Y;
    int grid_x = (ni+3-1)/block_x + 1;   /* round up to cover ni+3 columns */
    int grid_y = (nj+3-1)/block_y + 1;   /* round up to cover nj+3 rows    */
    dim3 dimGrid(grid_x, grid_y);
    dim3 dimBlock(block_x, block_y);
    printf("grid : %d %d %d\n", dimGrid.x, dimGrid.y, dimGrid.z);
    printf("block: %d %d %d\n", dimBlock.x, dimBlock.y, dimBlock.z);

    /* Kernel call */
    myKernel<<<dimGrid, dimBlock>>>(d_ro1, d_dro1, d_flu1,
                                    pitch_ro, ni, nj, nit);
    cutilCheckMsg("Kernel failed");
    CUDA_SAFE_CALL(cudaThreadSynchronize());

    /**********************************/
    // Copy the result back to the host
    CUDA_SAFE_CALL(cudaMemcpy2D(ro1, sizeof(float)*(ni+3), d_ro1, pitch_bytes_ro,
                                sizeof(float)*(ni+3), nj+3, cudaMemcpyDeviceToHost));

    /**********************************/
    // Cleanup
    CUDA_SAFE_CALL(cudaFree(d_ro1));
    CUDA_SAFE_CALL(cudaFree(d_dro1));
    CUDA_SAFE_CALL(cudaFree(d_flu1));
}
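For completeness, here is roughly how myFunc gets called. This driver is just a sketch for the example: the sizes ni, nj, nit and the linear initial profile are made up here, and BLOCK_X/BLOCK_Y are compile-time defines in my build.

#include <stdlib.h>

int main(void)
{
    const int ni = 256, nj = 64, nit = 100;   /* made-up sizes for this sketch */
    float* ro1 = (float*)malloc((ni+3) * (nj+3) * sizeof(float));

    /* linear initial profile in x, identical for every y */
    for (int j = 0; j < nj+3; j++)
        for (int i = 0; i < ni+3; i++)
            ro1[i + j*(ni+3)] = (float)i;

    myFunc(ni, nj, nit, ro1);

    /* ... here I dump ro1 for Tecplot: one curve T(x) per y ... */

    free(ro1);
    return 0;
}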
/****************************************************************/
The kernel is as follows (I put __syncthreads() between the steps to isolate them):
__global__ void myKernel(float* ro1, float* dro1, float* flu1,
                         int pitch_ro, int ni, int nj, int nit)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    float dt = 1.e-3f;
    int it;
    int index, index2;

    for (it = 0; it < nit; it++) {
        /* Step 1: fill the flux (one extra column in x for the i+1 difference) */
        if (i > 1 && i < ni+3 && j > 1 && j < nj+2) {
            index = i + j*pitch_ro;
            flu1[index] = i;
        }
        __syncthreads();

        /* Step 2: x-difference of the flux */
        if (i > 1 && i < ni+2 && j > 1 && j < nj+2) {
            index = i + j*pitch_ro;
            dro1[index] = dt*(flu1[index]
                             -flu1[index+1]);
        }
        __syncthreads();

        /* Step 3: overwrite the flux (one extra row in y for the j+1 difference);
           this is the step that triggers the problem, see below */
        if (i > 1 && i < ni+2 && j > 1 && j < nj+3) {
            index = i + j*pitch_ro;
            flu1[index] = 1;
        }
        __syncthreads();

        /* Step 4: y-difference of the flux */
        if (i > 1 && i < ni+2 && j > 1 && j < nj+2) {
            index  = i + j*pitch_ro;
            index2 = index + pitch_ro;
            dro1[index] += dt*(flu1[index]
                              -flu1[index2]);
        }
        __syncthreads();

        /* Step 5: update ro */
        if (i > 1 && i < ni+2 && j > 1 && j < nj+2) {
            index = i + j*pitch_ro;
            ro1[index] += dro1[index];
        }
        __syncthreads();
    }
}
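For comparison, my CPU reference performs the same five steps, but sequentially, so each step is completely finished before the next one starts. It is essentially the following (a simplified sketch, not my exact reference code; this is what gives the smooth curve):

#include <stdlib.h>

void cpuReference(int ni, int nj, int nit, float* ro1)
{
    int w = ni + 3;   /* row width; no pitch on the host */
    float dt = 1.e-3f;
    float* dro1 = (float*)calloc(w * (nj+3), sizeof(float));
    float* flu1 = (float*)calloc(w * (nj+3), sizeof(float));

    for (int it = 0; it < nit; it++) {
        /* Step 1: fill the flux */
        for (int j = 2; j < nj+2; j++)
            for (int i = 2; i < ni+3; i++)
                flu1[i + j*w] = i;
        /* Step 2: x-difference */
        for (int j = 2; j < nj+2; j++)
            for (int i = 2; i < ni+2; i++)
                dro1[i + j*w] = dt*(flu1[i + j*w] - flu1[i+1 + j*w]);
        /* Step 3: overwrite the flux */
        for (int j = 2; j < nj+3; j++)
            for (int i = 2; i < ni+2; i++)
                flu1[i + j*w] = 1;
        /* Step 4: y-difference */
        for (int j = 2; j < nj+2; j++)
            for (int i = 2; i < ni+2; i++)
                dro1[i + j*w] += dt*(flu1[i + j*w] - flu1[i + (j+1)*w]);
        /* Step 5: update ro */
        for (int j = 2; j < nj+2; j++)
            for (int i = 2; i < ni+2; i++)
                ro1[i + j*w] += dro1[i + j*w];
    }

    free(dro1);
    free(flu1);
}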
What is strange is that when

    if (i > 1 && i < ni+2 && j > 1 && j < nj+3) {
        index = i + j*pitch_ro;
        flu1[index] = 1;
    }

(step 3 above) is commented out, I get a perfect straight line.
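For reference, this is how I spot the diverging points: a plain point-by-point comparison of the GPU result against the CPU reference, something like the sketch below (eps is an arbitrary tolerance I chose for the example):

#include <stdio.h>
#include <math.h>

/* cpu and gpu are (ni+3) x (nj+3) host arrays, row width ni+3 */
void compareResults(int ni, int nj, const float* cpu, const float* gpu)
{
    const float eps = 1.e-5f;   /* arbitrary tolerance, sketch only */
    for (int j = 0; j < nj+3; j++) {
        for (int i = 0; i < ni+3; i++) {
            int index = i + j*(ni+3);
            if (fabsf(cpu[index] - gpu[index]) > eps)
                printf("diverging point: i=%d j=%d (i %% BLOCK_X = %d)\n",
                       i, j, i % BLOCK_X);
        }
    }
}

Printing i % BLOCK_X is how I noticed that the diverging i positions line up with the block size.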
Mat