Hi,
Length = 120,
iter = 8
abc[0][0] = 0;
for(int m = 1; m < iter; m++)
abc[0][m] = 0;
for(int k = 1; k <= Length; k++)
{
x[k] = 0;
for(m = 0; m < iter; m++)
{
abc[k][m] = (abc[k-1][m] + y[k-1][m])>( abc[k-1][m] + y[k-1][m]) ? (abc[k-1][m] + y[k-1][m]) : ( abc[k-1][m] + y[k-1][m]);
x[k] = x[k]> abc[k][m] ? x[k] : abc[k][m];
}
for(m = 0; m < iter; m++)
{
abc[k][m] = abc[k][m]-x[k];
}
}
Converted the above loop iterations into cuda…
#define MAX(X, Y) (X > Y)? X : Y
global
void kernel_alp(float* alp,float* total,float* gg,int codeLength)
{
float x1,x2;
float temp[8],temp1[8];
int bx = blockIdx.x;
int tidInit = blockIdx.x*blockDim.x+threadIdx.x;
if(bx == 0){
alp[tidInit] = (float) -INFINITY;
alp[0] = 0;
__syncthreads();
}
else{
int tidZero = ((bx-1)*blockDim.x*2)+(to[threadIdx.x][0]*2);
int tidOne = ((bx-1)*blockDim.x*2)+(to[threadIdx.x][1]*2+1);
if(threadIdx.x<8)
{
temp[threadIdx.x]=alp[(bx-1)*blockDim.x+to[threadIdx.x][0]];
temp1[threadIdx.x]=alp[(bx-1)*blockDim.x+to[threadIdx.x][1]];
}
x1 = temp[threadIdx.x]+gg[tidZero];
x2 = temp1[threadIdx.x]+gg[tidOne];
alp[tidInit] = MAX(x1,x2);
if(threadIdx.x == 0)
{
total[bx] = (float) -INFINITY;
}
total[bx] = MAX(total[bx],alp[tidInit]);
__syncthreads();
}
total[bx] = MAX(total[bx],alp[tidInit]);
alp[tidInit] =alp[tidInit]-total[bx];
__syncthreads();
}
BlkLength = 120;
BLOCK = 8;
kernel_alp<<< (BlkLength), BLOCK>>>(alp,total,gg,BlkLength-1);
This code works fine in Emulation mode…
But doesnot work with the NVIDIA Card …Please check the problem…
Thanks…!