Hello!
I have a huge problem using CUDA. I tried to parallelize the Finite Element method using CUDA in order to have real time funtionality, but unfortunately the kernel is not quick enough. All data that remains constant I already use texture, I dont know what else to do, to improve performance. Here is the kernel I am talking about:
[codebox]global void element_loop(int4 *Edof4, float *detj_d, float4 *FIX4, float *U, float *Fm, float lambda, float mu, int ndof, int nele, int nsteps, int t)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<nele)
{
int it, jt, s, g;
float J;
float Ue[3][4];
float FIX[4][3];
float FG[3][3];
float Ct[3][3];
float delta[3][3];
float Sm[3][3];
float S[6];
float BL0[6][12];
float BL[6][12];
float sum;
float Ft[24];
for (it=0;it<3;it++)
{
for (jt=0;jt<3;jt++)
{
delta[it][jt]=0;
Ct[it][jt]=0;
}
delta[it][it]=1;
}
///Fetching EDOF
int4 test1=tex1Dfetch(tex_edof,3*idx);
int4 test2=tex1Dfetch(tex_edof,3*idx+1);
int4 test3=tex1Dfetch(tex_edof,3*idx+2);
int Edof[12]={test1.x,test1.y, test1.z, test1.w,test2.x,test2.y, test2.z, test2.w,test3.x,test3.y, test3.z, test3.w};
//Fetching FIX
float4 FIX1=tex1Dfetch(tex_float4,3*idx);
float4 FIX2=tex1Dfetch(tex_float4,3*idx+1);
float4 FIX3=tex1Dfetch(tex_float4,3*idx+2);
FIX[0][0]=FIX1.x;
FIX[1][0]=FIX1.y;
FIX[2][0]=FIX1.z;
FIX[3][0]=FIX1.w;
FIX[0][1]=FIX2.x;
FIX[1][1]=FIX2.y;
FIX[2][1]=FIX2.z;
FIX[3][1]=FIX2.w;
FIX[0][2]=FIX3.x;
FIX[1][2]=FIX3.y;
FIX[2][2]=FIX3.z;
FIX[3][2]=FIX3.w;
//Fetching detj
float detj=tex1Dfetch(tex_detj,idx);
//Element deformation matrix Ue
s=0;
for (it=0;it<4;it++)
{
for (jt=0;jt<3;jt++)
{
Ue[jt][it]=U[ndof+(Edof[s])-1];
s++;
}
}
/////Computing deformation gradient
for (it=0;it<3;it++)
{
for (jt=0;jt<3;jt++)
{
FG[it][jt]=0;
for (s=0;s<4;s++)
{
FG[it][jt]=FG[it][jt]+Ue[it][s]*FIX[s][jt];
}
FG[it][jt]=FG[it][jt]+delta[it][jt];
}
}
//////////////Neo-Hookean material law
///////////Determinant of deformation gradient
J=FG[0][0]*FG[1][1]*FG[2][2]+
FG[1][0]*FG[2][1]*FG[0][2]+
FG[0][1]*FG[1][2]*FG[2][0]-
FG[2][0]*FG[1][1]*FG[0][2]-
FG[1][0]*FG[0][1]*FG[2][2]-
FG[0][0]*FG[2][1]*FG[1][2];
//Strain-displacement matrix
//Setting BL to zero again
for (it=0;it<6;it++)
{
for (jt=0;jt<12;jt++)
{
BL[it][jt]=0;
BL0[it][jt]=0;
}
}
//Compute BL0
jt=0;
for (it=0;it<4;it++)
{
BL0[0][jt]=FIX[it][0];
BL0[1][jt+1]=FIX[it][1];
BL0[2][jt+2]=FIX[it][2];
BL0[3][jt]=FIX[it][1];
BL0[3][jt+1]=FIX[it][0];
BL0[4][jt+1]=FIX[it][2];
BL0[4][jt+2]=FIX[it][1];
BL0[5][jt]=FIX[it][2];
BL0[5][jt+2]=FIX[it][0];
jt=jt+3;
}
for (g=0;g<4;g++)
{
for (it=0;it<6;it++)
{
for (jt=0;jt<3;jt++)
{
for (s=0;s<3;s++)
{
BL[it][g*3+jt]=BL[it][g*3+jt]+BL0[it][g*3+s]*FG[jt][s];
}
}
}
}//End strain-displacement matrix
/////////////////right Cauchy Green deformation tensor Ct=FG'*FG
for (it=0;it<3;it++)
{
for (jt=0;jt<3;jt++)
{
for (s=0;s<3;s++)
{
Ct[it][jt]=Ct[it][jt]+FG[s][it]*FG[s][jt];
}
}
}
//////////////Inverting Ct
float detCt, temp[3][3];
detCt= Ct[0][0]*Ct[1][1]*Ct[2][2]+
Ct[1][0]*Ct[2][1]*Ct[0][2]+
Ct[0][1]*Ct[1][2]*Ct[2][0]-
Ct[2][0]*Ct[1][1]*Ct[0][2]-
Ct[1][0]*Ct[0][1]*Ct[2][2]-
Ct[0][0]*Ct[2][1]*Ct[1][2];
temp[0][0]=Ct[1][1]*Ct[2][2]-Ct[2][1]*Ct[1][2];
temp[0][1]=Ct[0][2]*Ct[2][1]-Ct[2][2]*Ct[0][1];
temp[0][2]=Ct[0][1]*Ct[1][2]-Ct[1][1]*Ct[0][2];
temp[1][0]=Ct[1][2]*Ct[2][0]-Ct[2][2]*Ct[1][0];
temp[1][1]=Ct[0][0]*Ct[2][2]-Ct[2][0]*Ct[0][2];
temp[1][2]=Ct[0][2]*Ct[1][0]-Ct[1][2]*Ct[0][0];
temp[2][0]=Ct[1][0]*Ct[2][1]-Ct[2][0]*Ct[1][1];
temp[2][1]=Ct[0][1]*Ct[2][0]-Ct[2][1]*Ct[0][0];
temp[2][2]=Ct[0][0]*Ct[1][1]-Ct[1][0]*Ct[0][1];
for (it=0;it<3;it++)
{
for (jt=0;jt<3;jt++)
{
Ct[it][jt]=1/detCt*temp[it][jt];
}
}
for (it=0;it<3;it++)
{
for (jt=0;jt<3;jt++)
{
Sm[it][jt]=mu*(delta[it][jt]-Ct[it][jt])+lambda*J*(J-1)*Ct[it][jt];
}
}
for (it=0;it<3;it++)
{
S[it]=Sm[it][it];
}
S[3]=2*Sm[0][1];
S[4]=2*Sm[1][2];
S[5]=2*Sm[0][2];
////////////Force Vector per element
for (it=0;it<12;it++)
{
Fm[idx*24+it]=Edof[it];
Fm[idx*24+it+12]=0;
for (jt=0;jt<6;jt++)
{
Fm[idx*24+it+12]=Fm[idx*24+it+12]+detj/6*BL[jt][it]*S[jt];
}
}
/*printf("Kernel Fm\n");
for(it=0;it<12;it++)
printf("%f\n", Fm[idx*24+it+12]);
printf("\n");*/
}///end if
}
[/codebox]
exept for U and Fm, (U is input, Fm is output) which are residing in global memory, everything else is in the texture memory.
Any help is appreciated, thank you a lot in advance!
Greetings
Phil