Am I using too many registers?
Originally I copied the variable `mean`, produced by a previous kernel, from global memory back to the host, and then launched the second kernel, passing the value of `mean` to it directly as an argument.
Now I want to save time by not copying the variable from global memory to the host. So instead I pass only a pointer to its location in global memory to the second kernel and dereference it there: → float mean = *dmean;
But now the kernel produces incorrect results and finishes a lot faster than normal.
When I comment out one of the reduction statements in the kernel, it works again (apart from the commented-out fragment, of course).
So my guess is that there are not enough registers available. Is there a way to verify this, or does anyone have other suggestions about what the problem might be?
// Forward declaration: the inter-block barrier is defined later in this file.
__device__ void __syncblocks(volatile unsigned int *mutex);

// Computes several texture-style features over the Ng x Ng matrix P in one
// pass: for every row `bid` each thread handles one column `tid`, fills nine
// per-feature shared-memory arrays, and reduces each into a per-row slot of
// odata; afterwards block 0 reduces the per-row partials into final values.
//
// Launch requirements (inferred from the indexing below — confirm at call site):
//   - blockDim.x == Ng (tid indexes a full row of P and the shared arrays)
//   - dynamic shared memory: 9 * Ng * sizeof(float)
//   - odata provides at least 9 * Ng floats of scratch/output
//   - *mutex must be zero-initialized before the launch (used by __syncblocks)
//   - dmean points to the mean computed by a previous kernel; dereferencing it
//     here avoids a round trip through the host.
__global__
void f1_to_fx_gpu (float *P, int Ng, float *odata, float *dmean, volatile unsigned int *mutex)
{
    // Partition the dynamic shared memory into nine Ng-element arrays,
    // one per feature being accumulated.
    extern __shared__ float shmem[];
    float* sum1    = (float *)shmem;
    float* sum2    = (float *)&sum1[Ng];
    float* var     = (float *)&sum2[Ng];
    float* idm     = (float *)&var[Ng];
    float* prom    = (float *)&idm[Ng];
    float* shade   = (float *)&prom[Ng];
    float* maxim   = (float *)&shade[Ng];
    float* entropy = (float *)&maxim[Ng];
    float* corr    = (float *)&entropy[Ng];
    const int tid = threadIdx.x;
    // Single global-memory read per thread; the mean was produced by an
    // earlier kernel, so no host round trip is needed.
    float mean = *dmean;
    // Grid-stride over rows: each block processes rows bid, bid+gridDim.x, ...
    for (int bid = blockIdx.x; bid < Ng; bid += gridDim.x)
    {
        // Clear all nine accumulators for this row (shared memory is
        // uninitialized and reused across loop iterations).
        sum1[tid] = 0;
        sum2[tid] = 0;
        var[tid] = 0;
        idm[tid] = 0;
        prom[tid] = 0;
        shade[tid] = 0;
        maxim[tid] = 0;
        entropy[tid] = 0;
        corr[tid] = 0;
        float val = P[tid + Ng*bid];
        // NOTE(review): x is an int, so (bid-mean)+(tid-mean) is truncated
        // toward zero before the prominence/shade powers — confirm this
        // truncation is intended rather than a float x.
        int x = (bid-mean)+(tid-mean);
        if (val != 0){
            sum1[tid] = val * val;
            sum2[tid] = val * (tid-bid)*(tid-bid);
            var[tid] = (tid - mean) * (tid - mean) * val;
            idm[tid] = val / (1 + (tid - bid) * (tid - bid));
            prom[tid] = val*x*x*x*x;
            shade[tid] = val*x*x*x;
            maxim[tid] = val;
            // logf keeps the computation in single precision; the former
            // log(val) promoted to double (slow, and needless here).
            entropy[tid] = val * logf(val);
            corr[tid] = (tid - mean) * (bid - mean) * val;
        }
        // All accumulators must be written before any thread starts reducing.
        __syncthreads();
        // Reduce each feature array into its per-row slot odata[k*Ng + bid].
        // NOTE(review): assumes parallel_reduction ends with a __syncthreads()
        // so the next iteration's re-zeroing cannot race with it — confirm.
        parallel_reduction(sum1, odata, bid);
        parallel_reduction(sum2, odata+1*Ng, bid);
        parallel_reduction(var, odata+2*Ng, bid);
        parallel_reduction(idm, odata+3*Ng, bid);
        parallel_reduction(prom, odata+4*Ng, bid);
        parallel_reduction(shade, odata+5*Ng, bid);
        parallel_reduction_max(maxim, odata+6*Ng, bid);
        parallel_reduction(entropy, odata+7*Ng, bid);
        //parallel_reduction(corr, odata+8*Ng, bid);
        // (translated) very likely not enough registers available
    }
    // Grid-wide barrier: every block must finish its rows before block 0
    // performs the final reduction over the per-row partials.
#ifndef __DEVICE_EMULATION__
    __syncblocks(mutex);
#endif
    if (blockIdx.x == 0){
        // Collapse each feature's Ng per-row partials into odata[0..8].
        parallel_reduction(odata, odata, 0);
        parallel_reduction(odata+1*Ng, odata,1);
        parallel_reduction(odata+2*Ng, odata,2);
        parallel_reduction(odata+3*Ng, odata,3);
        parallel_reduction(odata+4*Ng, odata,4);
        parallel_reduction(odata+5*Ng, odata,5);
        parallel_reduction_max(odata+6*Ng, odata,6);
        parallel_reduction(odata+7*Ng, odata,7);
        parallel_reduction(odata+8*Ng, odata,8);
    }
}
// Software barrier across all blocks of the grid.
//
// Preconditions (confirm at call site):
//   - *mutex is zero-initialized before the kernel launch
//   - every block of the grid is resident on the device simultaneously,
//     otherwise the spin-waits below deadlock (blocks that never get
//     scheduled can never increment the counter).
// NOTE(review): this barrier is single-use per launch — if called twice, a
// fast block could re-increment the counter before a slow block has observed
// the reset, so it must not be reused without a generation/phase counter.
__device__ void __syncblocks(volatile unsigned int *mutex)
{
    __syncthreads();
    // Fence so that this block's prior global-memory writes (e.g. the per-row
    // partials in odata) are visible to other blocks once they pass the
    // barrier; without it the arrival signal can overtake the data.
    __threadfence();
    if (threadIdx.x == 0) // only thread 0 of each block joins the handshake
    {
        atomicAdd((unsigned int*)mutex,1); // signal this block's arrival
        if (blockIdx.x == 0) // block 0 acts as the master
        {
            while (*mutex < gridDim.x){ } // true once all blocks have arrived
            *mutex = 0; // release all blocks through the barrier
        }
        // keep sitting idle until the master resets the counter
        while (*mutex > 0) { }
    }
    __syncthreads();
}