Optimization

Hello!

I have a huge problem using CUDA. I tried to parallelize the Finite Element Method with CUDA in order to get real-time functionality, but unfortunately the kernel is not quick enough. All data that remains constant is already accessed through textures, and I don't know what else to do to improve performance. Here is the kernel I am talking about:

[codebox]__global__ void element_loop(int4 *Edof4, float *detj_d, float4 *FIX4, float *U, float *Fm, float lambda, float mu, int ndof, int nele, int nsteps, int t)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;

    if (idx<nele)
    {
        int it, jt, s, g;
        float J;
        float Ue[3][4];
        float FIX[4][3];
        float FG[3][3];
        float Ct[3][3];
        float delta[3][3];
        float Sm[3][3];
        float S[6];
        float BL0[6][12];
        float BL[6][12];
        float sum;
        float Ft[24];

        // Initialise identity (delta) and Ct
        for (it=0;it<3;it++)
        {
            for (jt=0;jt<3;jt++)
            {
                delta[it][jt]=0;
                Ct[it][jt]=0;
            }
            delta[it][it]=1;
        }

        // Fetching EDOF
        int4 test1=tex1Dfetch(tex_edof,3*idx);
        int4 test2=tex1Dfetch(tex_edof,3*idx+1);
        int4 test3=tex1Dfetch(tex_edof,3*idx+2);
        int Edof[12]={test1.x,test1.y, test1.z, test1.w,test2.x,test2.y, test2.z, test2.w,test3.x,test3.y, test3.z, test3.w};

        // Fetching FIX
        float4 FIX1=tex1Dfetch(tex_float4,3*idx);
        float4 FIX2=tex1Dfetch(tex_float4,3*idx+1);
        float4 FIX3=tex1Dfetch(tex_float4,3*idx+2);
        FIX[0][0]=FIX1.x;
        FIX[1][0]=FIX1.y;
        FIX[2][0]=FIX1.z;
        FIX[3][0]=FIX1.w;
        FIX[0][1]=FIX2.x;
        FIX[1][1]=FIX2.y;
        FIX[2][1]=FIX2.z;
        FIX[3][1]=FIX2.w;
        FIX[0][2]=FIX3.x;
        FIX[1][2]=FIX3.y;
        FIX[2][2]=FIX3.z;
        FIX[3][2]=FIX3.w;

        // Fetching detj
        float detj=tex1Dfetch(tex_detj,idx);

        // Element deformation matrix Ue
        s=0;
        for (it=0;it<4;it++)
        {
            for (jt=0;jt<3;jt++)
            {
                Ue[jt][it]=U[ndof+(Edof[s])-1];
                s++;
            }
        }

        // Computing deformation gradient
        for (it=0;it<3;it++)
        {
            for (jt=0;jt<3;jt++)
            {
                FG[it][jt]=0;
                for (s=0;s<4;s++)
                {
                    FG[it][jt]=FG[it][jt]+Ue[it][s]*FIX[s][jt];
                }
                FG[it][jt]=FG[it][jt]+delta[it][jt];
            }
        }

        // Neo-Hookean material law
        // Determinant of deformation gradient
        J=FG[0][0]*FG[1][1]*FG[2][2]+
          FG[1][0]*FG[2][1]*FG[0][2]+
          FG[0][1]*FG[1][2]*FG[2][0]-
          FG[2][0]*FG[1][1]*FG[0][2]-
          FG[1][0]*FG[0][1]*FG[2][2]-
          FG[0][0]*FG[2][1]*FG[1][2];

        // Strain-displacement matrix
        // Setting BL and BL0 to zero again
        for (it=0;it<6;it++)
        {
            for (jt=0;jt<12;jt++)
            {
                BL[it][jt]=0;
                BL0[it][jt]=0;
            }
        }

        // Compute BL0
        jt=0;
        for (it=0;it<4;it++)
        {
            BL0[0][jt]=FIX[it][0];
            BL0[1][jt+1]=FIX[it][1];
            BL0[2][jt+2]=FIX[it][2];
            BL0[3][jt]=FIX[it][1];
            BL0[3][jt+1]=FIX[it][0];
            BL0[4][jt+1]=FIX[it][2];
            BL0[4][jt+2]=FIX[it][1];
            BL0[5][jt]=FIX[it][2];
            BL0[5][jt+2]=FIX[it][0];
            jt=jt+3;
        }

        for (g=0;g<4;g++)
        {
            for (it=0;it<6;it++)
            {
                for (jt=0;jt<3;jt++)
                {
                    for (s=0;s<3;s++)
                    {
                        BL[it][g*3+jt]=BL[it][g*3+jt]+BL0[it][g*3+s]*FG[jt][s];
                    }
                }
            }
        } // End strain-displacement matrix

        // Right Cauchy-Green deformation tensor Ct = FG'*FG
        for (it=0;it<3;it++)
        {
            for (jt=0;jt<3;jt++)
            {
                for (s=0;s<3;s++)
                {
                    Ct[it][jt]=Ct[it][jt]+FG[s][it]*FG[s][jt];
                }
            }
        }

        // Inverting Ct
        float detCt, temp[3][3];
        detCt=Ct[0][0]*Ct[1][1]*Ct[2][2]+
              Ct[1][0]*Ct[2][1]*Ct[0][2]+
              Ct[0][1]*Ct[1][2]*Ct[2][0]-
              Ct[2][0]*Ct[1][1]*Ct[0][2]-
              Ct[1][0]*Ct[0][1]*Ct[2][2]-
              Ct[0][0]*Ct[2][1]*Ct[1][2];
        temp[0][0]=Ct[1][1]*Ct[2][2]-Ct[2][1]*Ct[1][2];
        temp[0][1]=Ct[0][2]*Ct[2][1]-Ct[2][2]*Ct[0][1];
        temp[0][2]=Ct[0][1]*Ct[1][2]-Ct[1][1]*Ct[0][2];
        temp[1][0]=Ct[1][2]*Ct[2][0]-Ct[2][2]*Ct[1][0];
        temp[1][1]=Ct[0][0]*Ct[2][2]-Ct[2][0]*Ct[0][2];
        temp[1][2]=Ct[0][2]*Ct[1][0]-Ct[1][2]*Ct[0][0];
        temp[2][0]=Ct[1][0]*Ct[2][1]-Ct[2][0]*Ct[1][1];
        temp[2][1]=Ct[0][1]*Ct[2][0]-Ct[2][1]*Ct[0][0];
        temp[2][2]=Ct[0][0]*Ct[1][1]-Ct[1][0]*Ct[0][1];

        for (it=0;it<3;it++)
        {
            for (jt=0;jt<3;jt++)
            {
                Ct[it][jt]=1/detCt*temp[it][jt];
            }
        }

        // Stress tensor Sm and its vector form S
        for (it=0;it<3;it++)
        {
            for (jt=0;jt<3;jt++)
            {
                Sm[it][jt]=mu*(delta[it][jt]-Ct[it][jt])+lambda*J*(J-1)*Ct[it][jt];
            }
        }
        for (it=0;it<3;it++)
        {
            S[it]=Sm[it][it];
        }
        S[3]=2*Sm[0][1];
        S[4]=2*Sm[1][2];
        S[5]=2*Sm[0][2];

        // Force vector per element
        for (it=0;it<12;it++)
        {
            Fm[idx*24+it]=Edof[it];
            Fm[idx*24+it+12]=0;
            for (jt=0;jt<6;jt++)
            {
                Fm[idx*24+it+12]=Fm[idx*24+it+12]+detj/6*BL[jt][it]*S[jt];
            }
        }

        /*printf("Kernel Fm\n");
        for(it=0;it<12;it++)
            printf("%f\n", Fm[idx*24+it+12]);
        printf("\n");*/
    } // end if
}

[/codebox]

Except for U and Fm (U is input, Fm is output), which reside in global memory, everything else is in texture memory.

Any help is appreciated, thank you a lot in advance!

Greetings

Phil

In these situations, the profiler and the occupancy calculator are your friends. Until you can get a quantitative handle on register usage, instruction throughput, and memory access patterns, knowing where to even start is difficult.
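For the register count in particular, nvcc itself will print per-kernel usage if you ask ptxas to be verbose; something like the line below (the file name and -arch value are just placeholders for whatever you actually build with):

[codebox]# Report registers, local, shared and constant memory use per kernel
nvcc -arch=sm_13 --ptxas-options=-v -c element_loop.cu[/codebox]

Feed those numbers into the occupancy calculator spreadsheet and you can see immediately whether registers or block size is the limiter.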

Having said that, some of those inner loops look like prime candidates for unrolling, either via compiler directives or by hand, as in the sketch below.
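As an illustration (same variable names as your kernel), the fixed-trip-count tensor loops can be tagged with #pragma unroll so the compiler flattens them completely:

[codebox]// Right Cauchy-Green tensor Ct = FG' * FG; Ct is assumed to have been
// zeroed earlier, as in the kernel. With the trip counts known at
// compile time the compiler can fully unroll all three loops.
#pragma unroll
for (it = 0; it < 3; it++)
{
    #pragma unroll
    for (jt = 0; jt < 3; jt++)
    {
        #pragma unroll
        for (s = 0; s < 3; s++)
        {
            Ct[it][jt] += FG[s][it] * FG[s][jt];
        }
    }
}[/codebox]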

I had a chance to look at this code further (FEM and FDM methods are what I mostly use CUDA for), and there are some serious memory access problems that will make this code dreadfully slow. Consider the loop where the element force contributions are written out into global memory:

////////////Force Vector per element
		for (it=0;it<12;it++) {
			Fm[idx*24+it]=Edof[it];
			Fm[idx*24+it+12]=0.f;
			for (jt=0;jt<6;jt++) {
				Fm[idx*24+it+12]=Fm[idx*24+it+12]+detj/6.f*BL[jt][it]*S[jt];
			}
		}

You need to write 12 summed force components out. To do that this code executes 84 writes and 72 reads, all uncoalesced. That isn’t a recipe for fast code.
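A sketch of one way to cut that down, using the same names as the kernel: keep the running sum in a register and touch Fm only once per output entry.

[codebox]// Force vector per element: accumulate into a register, then perform
// a single global store per entry instead of a read-modify-write chain.
for (it = 0; it < 12; it++)
{
    float f = 0.f;
    for (jt = 0; jt < 6; jt++)
    {
        f += detj / 6.f * BL[jt][it] * S[jt];
    }
    Fm[idx*24 + it]      = Edof[it];   // dof indices, as before
    Fm[idx*24 + it + 12] = f;          // summed force component
}[/codebox]

That alone removes the redundant global reads and most of the extra writes; the stores are still not coalesced (each thread owns a contiguous 24-entry slice of Fm), so staging the results in shared memory and writing them out in a coalesced pattern would be the next thing to try.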

The code where you read the displacements into memory has similar problems:

//Element deformation matrix Ue
		s=0;
		for (it=0;it<4;it++) {
			for (jt=0;jt<3;jt++) {
				Ue[jt][it]=U[ndof+(Edof[s])-1];
				s++;
			}
		}

That will be completely uncoalesced and very slow. You might want to consider binding U to a texture as well. For the purposes of a single kernel run it is constant.
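A minimal sketch of what that could look like; the texture reference name tex_U is made up here, and the host-side bind assumes d_U is the device pointer behind U with size_U bytes:

[codebox]// File scope: 1D texture reference over the displacement vector.
texture<float, 1, cudaReadModeElementType> tex_U;

// Host side, after U has been copied to the device and before launch:
//   cudaBindTexture(0, tex_U, d_U, size_U);

// Inside the kernel the gather becomes a texture fetch:
s = 0;
for (it = 0; it < 4; it++)
{
    for (jt = 0; jt < 3; jt++)
    {
        Ue[jt][it] = tex1Dfetch(tex_U, ndof + Edof[s] - 1);
        s++;
    }
}[/codebox]

The access pattern is still irregular, but the texture cache hides a good part of that, which is exactly why the rest of your constant data already goes through textures.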

You could also move the material constants and the number of degrees of freedom into constant memory; that usually provides an improvement. When I compiled this, it used 25 registers. If you can get that down a bit, there are significant occupancy improvements to be had. As I posted earlier, profile the kernel and see where it is performing poorly, and it should become pretty obvious what has to be done (or at least tried) to get a speed-up.
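For the constant-memory part, a minimal sketch, assuming lambda, mu and ndof do not change between kernel launches (the c_ prefixed names are made up for the example):

[codebox]// File scope: per-launch scalars in constant memory.
__constant__ float c_lambda;
__constant__ float c_mu;
__constant__ int   c_ndof;

// Host side, once before the launches:
//   cudaMemcpyToSymbol(c_lambda, &lambda, sizeof(float));
//   cudaMemcpyToSymbol(c_mu,     &mu,     sizeof(float));
//   cudaMemcpyToSymbol(c_ndof,   &ndof,   sizeof(int));

// The kernel then drops those three arguments and reads c_lambda,
// c_mu and c_ndof directly wherever lambda, mu and ndof were used.[/codebox]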