Hi,
I am new to CUDA programming (using 3.2 SDK on Linux 32 bits) and I’m having some troubles with a raytracing application in the following kernel:
__constant__ t_obj current_obj;
__global__void inter_sphere(cu_view *view, unsigned int *color, float *best_dist)
{
const int id = blockIdx.x * blockDim.x + threadIdx.x;
float dist;
cu_view tmp_v;
tmp_v.posx = view[id].posx - current_obj.posx;
tmp_v.posy = view[id].posy - current_obj.posy;
tmp_v.posz = view[id].posz - current_obj.posz;
tmp_v.vx = view[id].vx;
tmp_v.vy = view[id].vy;
tmp_v.vz = view[id].vz;
cal_dist_poly(&tmp_v, current_obj.rayon, &dist);
if (__saturatef(dist) && (dist < best_dist[id] || __saturatef(best_dist[id]) == 0))
{
best_dist[id] = dist;
color[id] = (current_obj.r << 16) + (current_obj.g << 8) + current_obj.b;
}
}
The two lines inside the if statement under cal_dist_poly() seems to be responsible for 99% of the running time of my kernel. If I comment them, the running time goes from 12 secs down to half a second.
I am calling this kernel with some 1 000 000 threads multiple times, view, color and best_dist are stored into global memory.
nThreads = 256;
nBlocks = (param->winx * param->winy) / nThreads;
while (tmp)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(current_obj, tmp, sizeof(cu_obj)));
if (tmp->type == SPHERE)
inter_sphere<<<nBlocks, nThreads>>>(view, d_color, d_dist);
cudaThreadSynchronize();
tmp = tmp->next;
}
Am I doing something wrong?