Dear All,
I am having a problem running a cuda kernel. It stucks for long time and I don't have a cluse as to what is causing it. The kernel is given below.
__global__ void Inside_Error_Calculations(
float *d_proj_cyls_data,
unsigned char *d_FGMaps, float *errors, int *sample_points){
float error = 0;
int samples=0;
int FGMap_stride = (640*480/8)+1;
int error_sample_index=blockIdx.x*blockDim.x + threadIdx.x;
int my_index;
int proj_data_stride = InError_Threads*InError_Blocks*8; // 64 blocks, 64 threads per block
int proj_data_size = 4096*10*8;
int cnt = proj_data_size/proj_data_stride;
for (int map = 0; map < 4; map++){
int FGMap_index = map*FGMap_stride;
my_index = blockIdx.x*8*blockDim.x + threadIdx.x*8 + map*proj_data_size;
for (int i = 0; i < cnt; i++){
Inside_Error(&d_proj_cyls_data[my_index], &d_FGMaps[FGMap_index], &error, &samples);
my_index += proj_data_stride;
errors[error_sample_index] = error;
sample_points[error_sample_index] = samples;
error=0; samples=0;
error_sample_index += InError_Threads*InError_Blocks; // 64 blocks, 64 threads per block
}
}
}
When I set cnt = 1 and run 64x64 threads, it runs ok, but when cnt is not one it just stucks even when it is 2. I am running the kernel on Fermi (GTX 480) with cuda run time 3.2. Can some one tell me as to what is going wrong.
Best regards