Hello everyone,
Already this is my first post so hopefully I perfect spot at the right post, and sorry for my English but I’m French … External Image
So here’s my problem:
I’m working on a Tesla c2050 in double precision, and watching the result with cuda profile I don’t understand how cuda determined the number of register per threads.
For exemple here is kernel (it is a little long sorry):
__global__ void
FindIntersection (int modelId,
TModel *models,
int nRay,
int *panels,
double * thit,
double *icrapPtr,
double *tPtr,
double *nv)
{
if (blockIdx.x >= nRay) return;
if (threadIdx.x >= MAX_THREAD_PER_BLOCK_2) return;
__shared__ int s_found;
__shared__ double s_tmin[MAX_THREAD_PER_BLOCK_2];
__shared__ int s_ipanel[MAX_THREAD_PER_BLOCK_2];
s_found = 0;
s_tmin[threadIdx.x] = MY_INFINITY;
s_ipanel[threadIdx.x] = -1;
__syncthreads();
int i, delta;
TModel *model = &models[modelId];
double th_thit;
delta = (model->nCube / blockDim.x) + 1;
for (i = threadIdx.x * delta ; (i < threadIdx.x * delta + delta) && (i < model->nCube) ; i++)
{
th_thit = thit[i * MAX_NB_RAY + blockIdx.x];
if (th_thit < s_tmin[threadIdx.x])
{
s_tmin[threadIdx.x] = th_thit;
s_ipanel[threadIdx.x] = panels[i * MAX_NB_RAY + blockIdx.x];
s_found = 1;
}
}
__syncthreads();
if (threadIdx.x == 0)
{
if (s_found)
{
th_thit = MY_INFINITY;
for (i=0 ; i < MAX_THREAD_PER_BLOCK_2 ; i++)
{
if (s_tmin[i] < th_thit)
{
th_thit = s_tmin[i];
delta = s_ipanel[i];
}
}
icrapPtr[blockIdx.x] = 1;
tPtr[blockIdx.x] = th_thit;
nv[3 * blockIdx.x] = model->triangles[delta * 12 + 7];
nv[3 * blockIdx.x + 1] = model->triangles[delta * 12 + 8];
nv[3 * blockIdx.x + 2] = model->triangles[delta * 12 + 9];
}
else
{
icrapPtr[blockIdx.x] = 0;
tPtr[blockIdx.x] = -1;
}
}
}
So from this code, how many register are used by each thread in your opinion ?
Thank you
PS : The correct answer according cuda profile is 24 … BUT WHY ???