Hello,
I am trying to use nlopt with CUDA kernels.
Since nlopt uses double-precision variables, I had to convert arguments back and forth between double and float when passing them between my earlier float-based CUDA code and the nlopt optimization.
I wanted to get rid of this performance hit, so I decided to change all relevant variables to doubles.
After doing that, the code compiled, but the kernel started returning 0 instead of the proper results.
Code for the kernel:
/**
 * Accumulates depos[k] * settings[beam[k]] into per-thread result slots keyed
 * by vox[k].
 *
 * Work split: the flat thread index `startindex` owns input elements
 * [startindex * *chunksize, min((startindex + 1) * *chunksize, *lastVoxel))
 * and the output slice [startindex * *voxPerChunk, (startindex + 1) * *voxPerChunk)
 * of processedList / processedValues, plus processedCount[startindex].
 * Threads therefore never touch each other's output, so no barrier is needed.
 *
 * processedList  in/out: voxel id stored in each slot; -1 is treated as "free".
 * processedCount in/out: incremented once for every slot newly claimed.
 * processedValues in/out: running sum for each slot.
 * vox, beam, depos        : per-element voxel id, index into settings, value.
 * settings                : weights, indexed by beam[k].
 * voxPerChunk, chunksize, lastVoxel : device pointers to scalar limits.
 *
 * NOTE(review): presumably the caller pre-fills processedList with -1 and
 * zeroes processedValues / processedCount before the launch -- confirm.
 * NOTE(review): the flat index hard-codes 1024 threads per block (kept from
 * the original); a launch with another block size leaves gaps or overlaps.
 *
 * If a thread meets more distinct voxel ids than it has slots (*voxPerChunk),
 * the surplus contribution is skipped instead of being added to an
 * unrelated or uninitialised slot.
 */
__global__ void mulKernel(int *processedList, int *processedCount, double *processedValues, const int *vox, const int *beam, const double *depos, const double *settings, const int *voxPerChunk, const int *chunksize, int *lastVoxel)
{
    // All index arithmetic is done in 64 bits: products such as
    // startindex * chunksize can exceed INT_MAX, and `long` is only 32 bits
    // wide with MSVC.
    const long long threadsPerBlock = 1024;
    const long long startindex = (long long)blockIdx.x * threadsPerBlock + threadIdx.x;

    const long long elemsPerThread = *chunksize;
    const long long slotsPerThread = *voxPerChunk;
    const long long slotBase = startindex * slotsPerThread;

    // Half-open input range of this thread, clamped to the last valid element.
    // When *lastVoxel lies before kFirst the range is empty and the loop is skipped.
    const long long kFirst = startindex * elemsPerThread;
    long long kUpperLimit = kFirst + elemsPerThread;
    if (kUpperLimit > *lastVoxel)
    {
        kUpperLimit = *lastVoxel;
    }

    for (long long k = kFirst; k < kUpperLimit; ++k)
    {
        const int voxel = vox[k];

        // Find the slot already holding this voxel, or claim the first free one.
        long long slot = -1;  // -1: no slot found
        for (long long j = 0; j < slotsPerThread; ++j)
        {
            const int stored = processedList[slotBase + j];
            if (stored == voxel)
            {
                slot = j;
                break;
            }
            if (stored == -1)
            {
                processedList[slotBase + j] = voxel;
                processedCount[startindex] += 1;
                slot = j;
                break;
            }
        }

        if (slot < 0)
        {
            // Every slot is taken by another voxel id: nowhere valid to accumulate.
            continue;
        }

        processedValues[slotBase + slot] += depos[k] * settings[beam[k]];
    }
}
the important bit is
processedValues[startindex* *voxPerChunk + voxelordernumber] += depos[k] * settings[beam[k]];
processedValues = 0
depos ≠ 0
settings = 1
The result of the operation, however, is 0.
I am using Visual Studio with Nsight, CUDA 10, and an NVIDIA GTX 1080.
Please Help :-)