Code work in Emulation mode but not with the Card...

Hi,

Length = 120,
iter = 8

abc[0][0] = 0;
for(int m = 1; m < iter; m++)
abc[0][m] = 0;
for(int k = 1; k <= Length; k++)
{
x[k] = 0;
for(m = 0; m < iter; m++)
{
abc[k][m] = (abc[k-1][m] + y[k-1][m])>( abc[k-1][m] + y[k-1][m]) ? (abc[k-1][m] + y[k-1][m]) : ( abc[k-1][m] + y[k-1][m]);
x[k] = x[k]> abc[k][m] ? x[k] : abc[k][m];
}
for(m = 0; m < iter; m++)
{
abc[k][m] = abc[k][m]-x[k];
}
}

Converted the above loop iterations into cuda…

#define MAX(X, Y) (X > Y)? X : Y

global
void kernel_alp(float* alp,float* total,float* gg,int codeLength)
{

float x1,x2;
float temp[8],temp1[8];
int bx = blockIdx.x;
int tidInit = blockIdx.x*blockDim.x+threadIdx.x;

if(bx == 0){
	alp[tidInit] = (float) -INFINITY;
	alp[0] = 0;
	__syncthreads();
}
else{
	int tidZero = ((bx-1)*blockDim.x*2)+(to[threadIdx.x][0]*2);
	int tidOne = ((bx-1)*blockDim.x*2)+(to[threadIdx.x][1]*2+1);

	if(threadIdx.x<8)
	{
		temp[threadIdx.x]=alp[(bx-1)*blockDim.x+to[threadIdx.x][0]];
		temp1[threadIdx.x]=alp[(bx-1)*blockDim.x+to[threadIdx.x][1]];
	}
	x1 = temp[threadIdx.x]+gg[tidZero];
	x2 = temp1[threadIdx.x]+gg[tidOne];
	alp[tidInit] = MAX(x1,x2);
	if(threadIdx.x == 0)
	{
		total[bx] = (float) -INFINITY;
	}
	total[bx] = MAX(total[bx],alp[tidInit]);
	__syncthreads();
}
total[bx] = MAX(total[bx],alp[tidInit]);
alp[tidInit] =alp[tidInit]-total[bx];	
__syncthreads();

}
BlkLength = 120;
BLOCK = 8;
kernel_alp<<< (BlkLength), BLOCK>>>(alp,total,gg,BlkLength-1);

This code works fine in Emulation mode…
But doesnot work with the NVIDIA Card …Please check the problem…

Thanks…!

You have syncthreads in if block, it will not work if if in not always work same way for all threads.