The following CUDA kernel takes more than 2 seconds to execute (2260.611084 ms):
/*
 * Benchmark kernel: one thread per (master, slave) aircraft pair.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension selects the master
 * index, the y dimension selects the slave index. Only pairs with
 * master > slave do any work; every other thread returns immediately.
 *
 * Each working thread writes a single int to
 * deviceArrayC[master * numberAircraft + slave]. deviceArrayC is declared
 * elsewhere in the file.
 * NOTE(review): assumes deviceArrayC holds at least
 * numberAircraft * numberAircraft ints -- confirm against its declaration.
 *
 * Parameters:
 *   numberAircraft      number of aircraft; also the row stride of deviceArrayC.
 *   nbSampleByAircraft  not used by this body (the sample loop is hard-coded to 120).
 *   distRefMin          not used by this body.
 *   altitudeRefMin      not used by this body.
 */
__global__ void computeDetection(int numberAircraft, int nbSampleByAircraft, float distRefMin, int altitudeRefMin)
{
    int indiceMaster = blockDim.x * blockIdx.x + threadIdx.x;
    int indiceSlave = blockDim.y * blockIdx.y + threadIdx.y;

    // Only pairs with master > slave are processed.
    if (indiceMaster <= indiceSlave)
        return;

    // Valid master indices are 0 .. numberAircraft - 1. The comparison must be
    // '>=': with '>' a thread whose indiceMaster == numberAircraft would write
    // one row past an N x N layout. Since indiceSlave < indiceMaster, this
    // guard also bounds indiceSlave.
    if (indiceMaster >= numberAircraft)
        return;

    // Computed after the guards and in size_t so that
    // indiceMaster * numberAircraft cannot overflow a 32-bit int.
    size_t localIdx = (size_t)indiceMaster * numberAircraft + indiceSlave;

    deviceArrayC[localIdx] = -1;

    int tmp = 10000;
    for (int indSample = 0; indSample < 120; indSample++)
    {
        for (int indVolumeMaster = 0; indVolumeMaster < 364; indVolumeMaster++)
        {
            for (int indVolumeSlave = 0; indVolumeSlave < 364; indVolumeSlave++)
            {
                // tmp = deviceArrayC[localIdx] - indVolumeSlave;
                // Running minimum: each iteration reads the tmp produced by
                // the previous one. The result is 0 after the first
                // indVolumeSlave == 0 iteration and stays 0.
                tmp = (indVolumeSlave < tmp) ? indVolumeSlave : tmp;
            }
        }
    }

    deviceArrayC[localIdx] = tmp;
}
But if I replace "tmp = (indVolumeSlave < tmp) ? indVolumeSlave : tmp;" with "tmp = deviceArrayC[localIdx] - indVolumeSlave;" (that is, swap the commented-out line with the line that follows it), then the kernel executes in 0.052192 ms.
Can anybody explain to me why I see such a difference in performance?
Environment: Red Hat Linux 6.2, CUDA Toolkit 5.0, latest drivers downloaded for the GTX 690.
And here is my nvcc command line:
/usr/local/cuda-5.0/bin/nvcc -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 …
Thanks in advance.