Hello Everyone,
I have written a kernel which reflects correct result to host with emulation mode, but when i execute on device it returns all zero bits.
Please suggest, what is going wrong with this case.
Here is my kernel
const int BLOCK_SIZE= 8;
global void CUDA_substitution_kernel(int *d_sub,int *xor1key,int s1[16],int s2[16],int s3[16],int s4[16],int s5[16],int s6[16],int s7[16],int s8[16])
{
int index,i,j,k,lb,ub;
index=blockIdx.x * blockDim.x + threadIdx.x;
//**/ printf(“index=%d\n”,index);
// shared int Stemp[8][6];
int Stemp[8][6];
lb=((4index)+1)-1;
ub=(4(index+1))-1;
k=0;
//**/ printf(“\nSubstituion Box Temporary:\n”);
// if(index==0)
for(i=0;i<8;i++)
for(j=0;j<6;j++)
{
Stemp[i][j]=xor1key[k++]; //putting 48 bit data into Stemp[8][6]
//**/ printf(“%d\t”,Stemp[i][j]);
}
int p,q,multi,l,Boxnum,cross,s;
int c,a[4],r; //a[4] to conatain final 4 bit from each SBox and appended to SubChoice[32]
q=0;
multi=1;
j=4;
c=3;
p=(Stemp[index][0]*2)+(Stemp[index][5]*1);
//**/ printf(“\nindex=%d p=%d\n”,index,p);
while(j>0) //target to get decimal of 4 bits among six
{
k=Stemp[index][j]; //accepts 4 bits one by one reversely
l=k*multi; //
q=q+l;
//**/ printf(“\nFor index=%d k=%d l=%d q=%d multi=%d j=%d\n”,index,k,l,q,multi,j);
multi=multi*2;
j–;
}
//**/ printf(“\nindex=%d q=%d\n”,index,q);
//we get final num in q
Boxnum=index+1;
switch(Boxnum)
{
case 1: cross=s1[p][q]; break;
case 2: cross=s2[p][q]; break;
case 3: cross=s3[p][q]; break;
case 4: cross=s4[p][q]; break;
case 5: cross=s5[p][q]; break;
case 6: cross=s6[p][q]; break;
case 7: cross=s7[p][q]; break;
case 8: cross=s8[p][q]; break;
}
//**/ printf(“\n index=%d and cross=%d \n”,index,cross);
while(cross>0)
{
r=cross%2;
a[c–]=r;
cross=cross/2;
}
__syncthreads();
while(c>=0)
{
a[c–]=0;
}
for(l=lb,s=0;l<=ub;l++)
{
d_sub[l]=a[s];
//**/ printf(“\n index= %d , d_sub[%d]=%d and s= %d \n”,index,l,d_sub[l],s);
s++;
}
__syncthreads();
/* printf(“\n\n\nFinal:d_sub\n”);
for(i=0;i<32;i++)
{
printf(“d_sub[%d]=%d\t”,i,d_sub[i]);
}
*/
return;
}
And this how i call my kernel from main
int xor1key[48]=
{ 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0 };
dim3 dimblock(BLOCK_SIZE);
dim3 dimgrid(8/BLOCK_SIZE);
cudaMallocHost((void **)&SubChoice,32 * sizeof(int));
memset(SubChoice,0,32 * sizeof(int));
int* d_sub;
cudaMalloc((void **)&d_sub,32 * sizeof(int));
cudaMemcpy(d_sub,SubChoice,32 * sizeof(int),cudaMemcpyHostToDevice);
printf("\ncuda work\n");
CUDA_substitution_kernel<<<dimgrid,dimblock>>>(d_sub,xor1key,s1,s2,s3,s4,s5,s6,s7,s8);
CUT_CHECK_ERROR("kernel error");
cudaMemcpy(SubChoice,d_sub,32 * sizeof(int),cudaMemcpyDeviceToHost);
printf("\ncuda done\n");
printf("\nSubstitution-choice-32-final\n");
for(i=0;i<32;i++)
{
printf("%d\t",SubChoice[i]);
}
printf("\n\n");
cudaFree(d_sub);
This is just the part of encryption algorithm, all the arrays from s1 to s8 are two dimensional arrayof 4*16.
with emulation code i get correct bits as
Substitution-choice-32-final
0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 0
but on device i get all zeros.
Thanks in advance.
Regards,
Deepti