Hello everyone,
I have been struggling with this problem for a while now and am almost at my wits end. Please help!
Basically the code shown below is part of an imaging process, specifically it deals with accelerating the gridding section. It runs as expected in device emulation mode and returns the right answer but seems to stop before completing execution when run on the device. All I’ve done is to expand what was originally a nested for loop to run on nSamples number of blocks each with nChan threads.
I have a GeForce 8600GTS and am running Ubuntu 7.10 with the CUDA 2.0beta2 tooklit and the 177.13 driver.
Any help will be much appreciated!
nSamples = 1000;
nChan = 16;
// Copy data to GPU memory
.....
// run the code using 1000 grids of 16 threads each
dim3 dimGrid(nSamples, 1, 1);
dim3 dimBlock(nChan, 1, 1);
start = clock();
kernel<<< dimGrid, dimBlock >>>((float*)d_u, (float*)d_v, (float*)d_w, (cuComplex*)d_data, (cuComplex*)d_grid, (float*)d_freq, (int*)d_cOffset, (float*)d_C, (int*)d_ints, (float*)d_flts);
cudaThreadSynchronize();
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
CUDA_SAFE_CALL(cudaMemcpy(ints, d_ints, intsMemSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(flts, d_flts, fltsMemSize, cudaMemcpyDeviceToHost));
finish = clock();
printf(" Count = %d \n\n", ints[4]);
// Report on timings
printf(" Total weight = %e \n", flts[1]);
time = (double(finish)-double(start))/CLOCKS_PER_SEC;
printf(" Time %f(s) \n", time);
....
__global__ void
kernel(float* u, float* v, float* w, cuComplex* data, cuComplex* grid, float* freq, int* cOffset, float* C, int* ints, float* flts)
{
float cellSize = flts[0];
float sumwt = flts[1];
float sumviswt = flts[2];
int nChan = ints[0];
int overSample = ints[1];
int gSize = ints[2];
int support = ints[3];
int i = blockIdx.x;
int chan = threadIdx.x;
int find, coff, iu, fracu, iv, fracv, suppv, suppu, vind, gind;
float uScaled, vScaled, wt;
int cSize=2*(support+1)*overSample+1;
int cCenter=(cSize-1)/2;
find=i*nChan+chan;
coff=cOffset[find];
uScaled=freq[chan]*u[i]/cellSize;
iu=(int)(uScaled);
fracu=(int)(overSample*(uScaled-(float)(iu)));
iu+=gSize/2;
vScaled=freq[chan]*v[i]/cellSize;
iv=(int)(vScaled);
fracv=(int)(overSample*(vScaled-(float)(iv)));
iv+=gSize/2;
for (suppv=-support;suppv<+support;suppv++)
{
vind=cSize*(fracv+overSample*suppv+cCenter)+fracu+cCenter+coff;
gind=iu+gSize*(iv+suppv);
for (suppu=-support;suppu<+support;suppu++)
{
wt=C[vind+overSample*suppu];
grid[gind+suppu][0]+=wt*data[find][0];
sumwt+=wt;
}
}
flts[0] = cellSize;
flts[1] = sumwt;
flts[2] = sumviswt;
ints[4]++;
__syncthreads();
}