I have a kernel that needs to be executed 3096 times. When I run this code in emulation mode, my results match those of the simple C program. When I run this loop and kernel on the device, my values get messed up. Has anyone seen this issue?
If everything looks correct in emulation, how can we determine what fails on the device? Is this a memory issue? How can I resolve it?
Is there a different way to allocate huge arrays? Each zonez array in this case has about 12.7 million values (64 x 64 x 3096 = 4096 x 3096 = 12,681,216).
[codebox]
// Host driver: builds two stacks of zone slices on the device (one kernel
// launch per slice and per image), times the launches, then copies both
// stacks back to pinned host memory and prints the second one.
//
// NOTE(review): the buffers are double. With toolchains of the cutil era,
// nvcc demotes double to float in device code unless the build targets
// compute capability 1.3+ (e.g. -arch=sm_13). That would make the host-side
// double reads garbage while emulation still looks correct -- confirm the
// build flags.

// Size of one slice (Nx x Ny) and number of slices (sizez).
int Nx = 64;
int Ny = 64;
int sizez = 3096;

// Element and byte counts computed once, in size_t, so the product cannot
// overflow int if the dimensions grow.
const size_t zonezCount = (size_t) Nx * Ny * sizez;
const size_t zonezBytes = zonezCount * sizeof(double);

// Report-and-remember error check: prints the failing call and clears
// zonezOk so later stages can be skipped. Every runtime call returns a
// cudaError_t; ignoring it is why device failures were invisible before.
int zonezOk = 1;
#define ZONEZ_CHECK(call)                                                   \
    do {                                                                    \
        cudaError_t zonezErr_ = (call);                                     \
        if (zonezErr_ != cudaSuccess) {                                     \
            printf("CUDA error at line %d in %s: %s\n", __LINE__, #call,    \
                   cudaGetErrorString(zonezErr_));                          \
            zonezOk = 0;                                                    \
        }                                                                   \
    } while (0)

double *zonez1 = 0, *zonez2 = 0;
ZONEZ_CHECK(cudaMalloc((void**) &zonez1, zonezBytes));
ZONEZ_CHECK(cudaMalloc((void**) &zonez2, zonezBytes));

// cudaMalloc does not initialise memory, and the kernel only writes the
// elements that fall inside each window. Zero the stacks so every element
// that the kernel skips has a defined value (all-zero bytes == 0.0).
if (zonezOk)
{
    ZONEZ_CHECK(cudaMemset(zonez1, 0, zonezBytes));
    ZONEZ_CHECK(cudaMemset(zonez2, 0, zonezBytes));
}

// 2 x 4 blocks of 32 x 16 threads = 64 x 64 threads, one per slice element.
dim3 block3(32,16);
dim3 grid3(2,4);
cutResetTimer(timer);
cutStartTimer(timer);
for(num = 0; zonezOk && num < sizez; ++num)
{
    zonez_create <<< grid3, block3 >>> (zonez1, im1_d, pix1, Nx, Ny, sizez, u1, u2, v1, v2, num);
    // Launches return nothing; a bad configuration only shows up here.
    ZONEZ_CHECK(cudaGetLastError());
    zonez_create <<< grid3, block3 >>> (zonez2, im2_d, pix1, Nx, Ny, sizez, u3, u4, v3, v4, num);
    ZONEZ_CHECK(cudaGetLastError());
}
// Launches are asynchronous: wait for them so the timer covers execution,
// and so faults raised inside the kernels are reported here.
// (cudaThreadSynchronize is superseded by cudaDeviceSynchronize in CUDA 4.0+.)
ZONEZ_CHECK(cudaThreadSynchronize());
cutStopTimer( timer ); // Stop timer
float zt1 = cutGetTimerValue(timer);
printf("zonez_create computation time: %0.3f ms\n\n", zt1);

// Test zonez
double *zonez1_h = 0, *zonez2_h = 0;
ZONEZ_CHECK(cudaMallocHost((void**) &zonez1_h, zonezBytes));
ZONEZ_CHECK(cudaMallocHost((void**) &zonez2_h, zonezBytes));
if (zonezOk)
{
    ZONEZ_CHECK(cudaMemcpy(zonez1_h, zonez1, zonezBytes, cudaMemcpyDeviceToHost));
    ZONEZ_CHECK(cudaMemcpy(zonez2_h, zonez2, zonezBytes, cudaMemcpyDeviceToHost));
}
if (zonezOk)
{
    for(i = 0; i < Nx * Ny * sizez; ++i)
    {
        printf("zonez2[%d] = %lf\n", i, zonez2_h[i]);
    }
}
else
{
    // Host buffers do not hold valid results; do not print them.
    printf("zonez_create: skipping result dump because a CUDA call failed\n");
}
#undef ZONEZ_CHECK
[/codebox]
=========================================
Kernel:
[codebox]
/*
 * zonez_create: copy one window of the image `im` into slice `num` of `zone`.
 *
 * Launch layout: a 2D grid of 2D blocks; thread (i, j) handles one element of
 * the Nx x Ny slice. Threads with i >= Nx or j >= Ny do nothing, so the grid
 * may be larger than the slice. No shared memory is used.
 *
 * zone            output stack; slice `num` starts at element num*Nx*Ny and
 *                 element (i, j) is stored at offset i*Ny + j within it.
 * im              source image, read at [z2*pix1 + z1] (pix1 is the row pitch
 *                 in elements).
 * u1, v1          per-slice window origin: z1 = u1[num] + i, z2 = v1[num] + j.
 * u2, v2          per-slice exclusive upper bounds on z1 and z2.
 * sizez           not used by the kernel body.
 *
 * Elements whose source coordinate falls outside the window are NOT written,
 * so the caller must initialise `zone` (e.g. cudaMemset) before launching.
 * All pointers are dereferenced on the device and must be device pointers.
 * NOTE(review): z1/z2 are not checked against 0 or the image extent --
 * presumably the u/v tables guarantee in-range coordinates; confirm.
 */
__global__ void zonez_create(double *zone, double *im, int pix1, int Nx, int Ny, int sizez, int *u1, int *u2, int *v1, int *v2, int num)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail: threads outside the slice have nothing to do.
    if (i >= Nx || j >= Ny)
    {
        return;
    }

    // Source coordinates of this element inside the image.
    int z1 = u1[num] + i;
    int z2 = v1[num] + j;

    if (z1 < u2[num] && z2 < v2[num])
    {
        // Slice offset in size_t so num*Nx*Ny cannot overflow int for large
        // stacks. Row stride is Ny (j spans 0..Ny-1); the previous i*Nx made
        // distinct (i, j) pairs collide whenever Nx != Ny. Identical layout
        // when Nx == Ny.
        size_t dst = (size_t) num * Nx * Ny + (size_t) i * Ny + j;
        zone[dst] = im[(size_t) z2 * pix1 + z1];
    }
}
[/codebox]