LAUNCH_OUT_OF_RESOURCES on TESLA D870, same algorithm on 9800 GX2 works ok..

Hi…

I’m developing a simulation of atom behavior in crystals (BCC and FCC lattices; Fe, Au, Cu, …).

I have 2 different CUDA devices:

  1. GeForce 9800 GX2 (Capability 1.1)

  2. TESLA D870 (Capability 1.0)

The problematic part of the device code, which generates the atom network:

/*
 * Generate the atom network.
 *
 * The outer loop runs sizeB times and advances a base position (curB) and a
 * base cell id (cellID_B); the inner loop runs sizeG times from that base,
 * advancing curG and cellID_G. Every inner step tests curG against all
 * cntTriangs entries of triangShared and, if every test returned 1, writes
 * one record to out[idxOut].
 *
 * idxOut is incremented on every inner step, whether or not a record was
 * written, so rejected positions leave their out[] slot untouched.
 *
 * NOTE(review): __syncthreads() is called inside both loops. It must be
 * reached by every thread of the block, so this assumes sizeB and sizeG are
 * the same for all threads in a block -- TODO confirm.
 */
for(int i = 0; i < sizeB; i++){

	/* Restart the inner walk from the current base position and base cell id. */
	curG.x = curB.x; curG.y = curB.y; curG.z = curB.z;

	cellID_G = cellID_B;

	for(int j = 0; j < sizeG; j++){

	  mask = maskBase;

	  /* If curG is behind a triangle (grain border), the inline device function "IsBehindTriang" returns 0, so cntConj will be 0 at the end. */

	  cntConj = 1;

	  for(uint it = 0; it < cntTriangs; it++){

		/* Plane of triangle `it`: normal components (a, b, c) and d. */
		a = triangShared[it].n.x; b = triangShared[it].n.y; c = triangShared[it].n.z; d = triangShared[it].d;

		/* Product of the per-triangle results: a single 0 clears cntConj for good. */
		cntConj *= IsBehindTriang(a, b, c, d, curG.x, curG.y, curG.z);

	  }

	  /* NOTE(review): no shared-memory write is visible between the barriers in this fragment; confirm whether they are required. */
	  __syncthreads();

	  /* Emit a record only for positions that passed every triangle test. */
	  if(cntConj == 1){

		out[idxOut].type = type;

		out[idxOut].tempr = tempr;

		out[idxOut].id = cellID_G;

		out[idxOut].grainID = grainID;

		out[idxOut].mask = mask;

		out[idxOut].pos.x = curG.x;

		out[idxOut].pos.y = curG.y;

		out[idxOut].pos.z = curG.z;

		/* Each mate entry is this cell's id plus the per-direction offset from mateID[]. */
		out[idxOut].mate[DIR_BCC_0] = cellID_G + mateID[DIR_BCC_0];

		out[idxOut].mate[DIR_BCC_1] = cellID_G + mateID[DIR_BCC_1];

		out[idxOut].mate[DIR_BCC_2] = cellID_G + mateID[DIR_BCC_2];

		out[idxOut].mate[DIR_BCC_3] = cellID_G + mateID[DIR_BCC_3];

		out[idxOut].mate[DIR_BCC_4] = cellID_G + mateID[DIR_BCC_4];

		out[idxOut].mate[DIR_BCC_5] = cellID_G + mateID[DIR_BCC_5];

		out[idxOut].mate[DIR_BCC_6] = cellID_G + mateID[DIR_BCC_6];

		out[idxOut].mate[DIR_BCC_7] = cellID_G + mateID[DIR_BCC_7];

	  }

	  __syncthreads();

	  /* Step to the next position, output slot and cell id of the inner walk. */
	  curG.x += gx; curG.y += gy; curG.z += gz;

	  idxOut++;

	  cellID_G++;

	}

	/* Step the base position and base cell id for the next outer iteration. */
	curB.x += bx; curB.y += by; curB.z += bz;

	cellID_B += stepB;

  }

When I run this code on the GeForce 9800 GX2, the atom network is generated correctly.

On the Tesla I receive CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES.

When I disable this part of the code, the Tesla works OK:

//out[idxOut].mate[DIR_BCC_0] = cellID_G + mateID[DIR_BCC_0];

		//out[idxOut].mate[DIR_BCC_1] = cellID_G + mateID[DIR_BCC_1];

		//out[idxOut].mate[DIR_BCC_2] = cellID_G + mateID[DIR_BCC_2];

		//out[idxOut].mate[DIR_BCC_3] = cellID_G + mateID[DIR_BCC_3];

		//out[idxOut].mate[DIR_BCC_4] = cellID_G + mateID[DIR_BCC_4];

		//out[idxOut].mate[DIR_BCC_5] = cellID_G + mateID[DIR_BCC_5];

		//out[idxOut].mate[DIR_BCC_6] = cellID_G + mateID[DIR_BCC_6];

		//out[idxOut].mate[DIR_BCC_7] = cellID_G + mateID[DIR_BCC_7];

Any ideas what causes this problem?

Used stuff:

#define Vec float3

long cntConj, cntConjMate[MAX_MATES];

Atom* out ...

/*
 * One output record of the atom-network generation loop (written through
 * out[idxOut]).
 *
 * NOTE(review): the fields mix 1-, 2-, 4-byte and `long` members, so the
 * layout contains padding and depends on the width of `long`; confirm that
 * host and device code agree on sizeof(Atom).
 */
struct Atom{

  /* Copied from the loop's `type` value. */
  char		type;

  /* Copied from the loop's `tempr` value (presumably a temperature -- TODO confirm). */
  ushort	  tempr;

  /* Cell id of this atom (cellID_G at the time the record is written). */
  int		 id;

  /* Id of the grain the atom belongs to. */
  uint		grainID;

  /* Bit mask, seeded from maskBase; the CELL_MASK_MATE_n_BIT_1 flags are OR-ed into it. */
  long		mask;

  /* Position of the atom (curG.x, curG.y, curG.z). */
  Vec		 pos;

  /* Per-direction mate cell ids: own cell id plus the mateID[] offset for that direction. */
  int		 mate[MAX_MATES];

};

My second problem is that I want to extend the inner loop so that it checks the current curB and all of its closest mates too:

/*
 * For every triangle in triangShared, multiply the result of IsBehindTriang
 * into cntConj (for the current position curG) and into one accumulator per
 * mate direction (for that mate's position). An accumulator keeps its value
 * only while every call returns 1; a single 0 clears it.
 *
 * Each direction must accumulate into its own slot: the DIR_BCC_7 test goes
 * into cntConjMate[DIR_BCC_7], which is read later when the mask is built.
 * (Previously it was multiplied into cntConjMate[DIR_BCC_6], leaving slot 7
 * untouched and folding two directions into slot 6.)
 */
for(uint it = 0; it < cntTriangs; it++){

		/* Plane of triangle `it`: normal components (a, b, c) and d. */
		a = triangShared[it].n.x; b = triangShared[it].n.y; c = triangShared[it].n.z; d = triangShared[it].d;

		cntConj                *= IsBehindTriang(a, b, c, d, curG.x, curG.y, curG.z);

		cntConjMate[DIR_BCC_0] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_0].x, matePos[DIR_BCC_0].y, matePos[DIR_BCC_0].z);

		cntConjMate[DIR_BCC_1] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_1].x, matePos[DIR_BCC_1].y, matePos[DIR_BCC_1].z);

		cntConjMate[DIR_BCC_2] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_2].x, matePos[DIR_BCC_2].y, matePos[DIR_BCC_2].z);

		cntConjMate[DIR_BCC_3] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_3].x, matePos[DIR_BCC_3].y, matePos[DIR_BCC_3].z);

		cntConjMate[DIR_BCC_4] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_4].x, matePos[DIR_BCC_4].y, matePos[DIR_BCC_4].z);

		cntConjMate[DIR_BCC_5] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_5].x, matePos[DIR_BCC_5].y, matePos[DIR_BCC_5].z);

		cntConjMate[DIR_BCC_6] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_6].x, matePos[DIR_BCC_6].y, matePos[DIR_BCC_6].z);

		/* Fixed: the target index now matches the direction being tested. */
		cntConjMate[DIR_BCC_7] *= IsBehindTriang(a, b, c, d, matePos[DIR_BCC_7].x, matePos[DIR_BCC_7].y, matePos[DIR_BCC_7].z);

	  }

	  __syncthreads();

	  mask |= (CELL_MASK_MATE_0_BIT_1 * cntConjMate[DIR_BCC_0]);

	  mask |= (CELL_MASK_MATE_1_BIT_1 * cntConjMate[DIR_BCC_1]);

	  mask |= (CELL_MASK_MATE_2_BIT_1 * cntConjMate[DIR_BCC_2]);

	  mask |= (CELL_MASK_MATE_3_BIT_1 * cntConjMate[DIR_BCC_3]);

	  mask |= (CELL_MASK_MATE_4_BIT_1 * cntConjMate[DIR_BCC_4]);

	  mask |= (CELL_MASK_MATE_5_BIT_1 * cntConjMate[DIR_BCC_5]);

	  mask |= (CELL_MASK_MATE_6_BIT_1 * cntConjMate[DIR_BCC_6]);

	  mask |= (CELL_MASK_MATE_7_BIT_1 * cntConjMate[DIR_BCC_7]);

	  if(cntConj == 1){

		   .........

		out[idxOut].mask = mask; /* This causes CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES on GX2 and Tesla too... */

		   .........

	  }

	  __syncthreads();

So after the inner for loop, each element of the cntConjMate array contains 0 or 1,

and I want to set the bit for that mate direction in the atom’s mask.

When this mask is written to global memory, CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is thrown.

I’m developing on Linux - Ubuntu 8.04 - 64bit , NVIDIA Driver Version: 180.06

Thanks for any help

Regards BasY

Hi!

I’ve gotten the Out of Resources error when I was running too many threads or using too many registers. You can check the number of registers your kernel uses by compiling with --ptxas-options=-v passed to nvcc. This shows how many registers and how much shared memory your kernel is using.

Thanks…
Very helpful info for me, and you are right… :thumbup:
When i make it, i receive:
ptxas info : Used 32 registers, 144+144 bytes smem, 12 bytes cmem[1]

Does it mean that each thread will use 32 registers?

The Tesla and the GX2 have the same number of registers per block: 8192
When I run this on the Tesla I can use 3x more memory than on the GX2.
This means that fewer partial databases migrate between host and device, but each partial database on the Tesla is bigger than on the GX2.
So there are more threads computing on the Tesla.

If the maximum number of registers per block is 8192 and I want to use the full 512 threads, does the kernel have to use at most 16 registers per thread (8192 / 512 = 16)?
Or, in my example, where the kernel uses 32 registers, can I run only 256 threads (8192 / 32 = 256), with more threads causing CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES…
Is that right?

As far as I know, yes. To be sure, you can plug the register count and shared-memory usage into the Occupancy Calculator to find the maximum number of threads you can use on the Tesla or the GX2.

OK, thanks…

So I can get the number of registers a kernel uses at compile time…
And use this value as a constant in the main code to determine the maximum number of threads…
( :thumbsdown: Forgetting to keep this in sync will get me into trouble :wacko: )

But how can I determine, at run time, the number of registers used by the currently loaded CUfunction?
Is it possible?

I’m using the driver API and I checked the docs twice but can’t find anything :wacko: (except Programming Guide chapter 5.2)

Thanks…

:blink: I haven’t tried this yet so I dunno. Hopefully a more experienced coder can answer this for us.

Dinh