Reduce the number of registers per thread in the kernel

In my kernel, named CalcSTLDistance_Kernel, 58 registers are used per thread, which gives an occupancy of 50%. According to the occupancy calculator, if the register count were reduced to 56 I could increase the occupancy by some amount. I learned that automatic variables in the code use registers, so I tried to minimize the local variables, but that did not change the register usage.

How can I reduce the register usage of the kernel in the code below?

/*
 * For the particle handled by this thread, searches the triangles registered
 * in the particle's cell and in the 26 surrounding cells and records the one
 * with the smallest CalcDistance() result.
 *
 * ComputeParticleNumber : number of particles to process; threads whose
 *                         target ID is at or beyond this value do nothing.
 *
 * Output: c_daSTLDistance[ID] receives the best distance found and
 * c_daSTLID[ID] the matching triangle index.  When nothing is found (or the
 * particle is skipped) the stored values are Magnitude = 1e8,
 * Direction = (0,0,0) and index -1.
 */
__global__ static void  CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
	const Integer ID = CudaGetTargetID();
	if(ID >= ComputeParticleNumber)
	{
		return ;
	}

	// Start from the "nothing found" result; it is what gets written back
	// unless a closer triangle replaces it below.
	CDistance Best;
	Best.Magnitude = 1e8;
	Best.Direction = make_Scalar3(0,0,0);
	Integer BestTriangle = -1;

	// Particles whose output ID is below -1 are not searched at all.
	if(c_daOutputParticleID[ID] >= -1)
	{
		Scalar3 ParticlePos = c_daParticlePosition[ID];

		Integer CellX, CellY, CellZ;
		const Integer HomeCell = GetCellID(&CONSTANT_BOUNDINGBOX,&ParticlePos,CellX, CellY, CellZ);
		if(HomeCell >= 0 && HomeCell < c_CellNum)
		{
			const Integer Reach = 1;	// one cell in every direction
			for(Integer dz = -Reach; dz <= Reach; ++dz)
			{
				for(Integer dy = -Reach; dy <= Reach; ++dy)
				{
					for(Integer dx = -Reach; dx <= Reach; ++dx)
					{
						const Integer Neighbour = GetCellID(&CONSTANT_BOUNDINGBOX,CellX + dx, CellY + dy, CellZ + dz);
						if(Neighbour >= 0 && Neighbour < c_CellNum)
						{
							const unsigned int Count = c_daCell[Neighbour].m_TriangleNum;
							for(unsigned int n = 0; n < Count; ++n)
							{
								const Integer Candidate = c_daCell[Neighbour].m_TriangleID[n];

								// Ignore invalid indices and the triangle that is
								// already the current best (no need to recompute it).
								if(Candidate < 0 || Candidate >= c_TriangleNum || Candidate == BestTriangle)
								{
									continue;
								}

								CDistance Trial;
								Trial.Magnitude = CalcDistance(&c_daTriangles[Candidate], &ParticlePos, &Trial.Direction);
								if(Trial.Magnitude < Best.Magnitude)
								{
									Best = Trial;
									BestTriangle = Candidate;
								}
							}
						}
					}
				}
			}
		}
	}

	c_daSTLDistance[ID] = Best;
	c_daSTLID[ID] = BestTriangle;
}

and

/*
 * Helper for CalcDistance: evaluates the vector between Position and the
 * segment (A,B) and keeps it in *Best when it is the first candidate or is
 * shorter than the candidate stored so far.
 */
static __inline__ __device__ __host__ void CalcDistance_KeepNearerEdge(const Scalar3* const A, const Scalar3* const B, const Scalar3* const Position, Scalar3* const Best, Scalar* const BestMagnitude, bool* const HasBest)
{
	Scalar3 Candidate = CalcDistanceVectorLine(A, B, Position);
	const Scalar CandidateMagnitude = Magnitude(&Candidate);
	if(!*HasBest || CandidateMagnitude < *BestMagnitude)
	{
		*Best = Candidate;
		*BestMagnitude = CandidateMagnitude;
		*HasBest = true;
	}
}

/*
 * Distance between Position and the triangle *pTriangle.
 *
 * pTriangle      : triangle to test (Vertex[0..2] and Normal are read).
 * Position       : query point.
 * DistanceVector : optional output; when non-null it receives the vector
 *                  whose magnitude is returned.
 * Returns the magnitude of that vector.
 *
 * The point is first projected onto the triangle's plane and expressed in
 * barycentric coordinates (u along Vertex[2]-Vertex[0], v along
 * Vertex[1]-Vertex[0]).  If the projection lies inside the triangle the
 * plane-normal vector is the answer.  Otherwise every edge whose half-plane
 * test fails is evaluated and the shortest result is kept.  Testing only one
 * edge is not enough: in the wedge beyond a vertex two tests fail at once,
 * and for an obtuse corner the nearest point may lie on either adjacent edge.
 *
 * A zero barycentric denominator (collinear or coincident vertices) would
 * otherwise divide by zero; in that case all three edges are evaluated.
 */
static __inline__ __device__ __host__ Scalar CalcDistance(const CTriangle* const pTriangle, const Scalar3* const Position,Scalar3* const DistanceVector)
{
	Scalar3 dv = CalcNormalDistance(pTriangle,Position);
	// Foot of the perpendicular from Position onto the triangle's plane.
	Scalar3 P = make_Scalar3(Position->x - dv.x, Position->y - dv.y, Position->z - dv.z);
	// Compute vectors
	Scalar3 V0 = pTriangle->Vertex[2] - pTriangle->Vertex[0];
	Scalar3 V1 = pTriangle->Vertex[1] - pTriangle->Vertex[0];
	Scalar3 V2 = P - pTriangle->Vertex[0];
	// Compute dot products
	Scalar dot00 = DotProduct(&V0, &V0);
	Scalar dot01 = DotProduct(&V0, &V1);
	Scalar dot02 = DotProduct(&V0, &V2);
	Scalar dot11 = DotProduct(&V1, &V1);
	Scalar dot12 = DotProduct(&V1, &V2);

	const Scalar Denom = dot00 * dot11 - dot01 * dot01;

	// Default to testing every edge; this is what a degenerate triangle needs.
	bool TestEdge01 = true;
	bool TestEdge02 = true;
	bool TestEdge12 = true;
	if(Denom != 0)
	{
		// compute barycentric coordinates
		const Scalar invDenom = 1 / Denom;
		const Scalar u = (dot11 * dot02 - dot01 * dot12 ) * invDenom;
		const Scalar v = (dot00 * dot12 - dot01 * dot02 ) * invDenom;

		// Written as negated "inside" tests so that a NaN coordinate selects
		// the edge instead of silently passing as "inside".
		TestEdge01 = !(u >= 0);
		TestEdge02 = !(v >= 0);
		TestEdge12 = !(u + v <= 1);
	}

	Scalar Result = 0;
	bool HasEdge = false;
	if(TestEdge01)
	{
		CalcDistance_KeepNearerEdge(&pTriangle->Vertex[0], &pTriangle->Vertex[1], Position, &dv, &Result, &HasEdge);
	}
	if(TestEdge02)
	{
		CalcDistance_KeepNearerEdge(&pTriangle->Vertex[0], &pTriangle->Vertex[2], Position, &dv, &Result, &HasEdge);
	}
	if(TestEdge12)
	{
		CalcDistance_KeepNearerEdge(&pTriangle->Vertex[1], &pTriangle->Vertex[2], Position, &dv, &Result, &HasEdge);
	}
	if(!HasEdge)
	{
		// Projection is inside the triangle: the plane-normal vector stands.
		Result = Magnitude(&dv);
	}

	if(DistanceVector)
	{
		DistanceVector->x = dv.x;
		DistanceVector->y = dv.y;
		DistanceVector->z = dv.z;
	}
	return Result;
}
/*
 * Component of (Position - triangle centroid) along the triangle's Normal,
 * i.e. the vector from the triangle's plane to Position measured along the
 * normal.
 *
 * pTriangle : triangle whose Normal and Vertex[0..2] are read.
 * Position  : query point.
 * Returns (0,0,0) when Normal has zero length.
 *
 * The projection is computed as ((N.P - N.C) / (N.N)) * N.  This is the same
 * quantity as dividing twice by sqrt(N.N), but it needs no square root and
 * only one division; results can differ from the sqrt-based form in the last
 * bits.  All constants are expressed in Scalar so that no arithmetic is
 * promoted to double when Scalar is a single-precision type.
 */
static __inline__ __device__ __host__ Scalar3 CalcNormalDistance(const CTriangle* const pTriangle, const Scalar3* const Position)
{
	const Scalar3 Normal = pTriangle->Normal;

	Scalar3 DistanceVector = make_Scalar3(0,0,0) ;

	// Squared length of the normal; zero exactly when its length is zero.
	const Scalar NormalSq = Normal.x * Normal.x + Normal.y * Normal.y + Normal.z * Normal.z;
	if(NormalSq == 0)
	{
		return DistanceVector;
	}

	// Centroid of the triangle, used as the reference point on its plane.
	const Scalar Three = static_cast<Scalar>(3);
	Scalar3 Center;
	Center.x = (pTriangle->Vertex[0].x + pTriangle->Vertex[1].x + pTriangle->Vertex[2].x) / Three;
	Center.y = (pTriangle->Vertex[0].y + pTriangle->Vertex[1].y + pTriangle->Vertex[2].y) / Three;
	Center.z = (pTriangle->Vertex[0].z + pTriangle->Vertex[1].z + pTriangle->Vertex[2].z) / Three;

	const Scalar lNumerator = Normal.x * Center.x + Normal.y * Center.y + Normal.z * Center.z;
	const Scalar rNumerator = Normal.x * Position->x + Normal.y * Position->y + Normal.z * Position->z;

	// One shared scale factor instead of repeating the division per component.
	const Scalar Scale = (rNumerator - lNumerator) / NormalSq;

	DistanceVector.x = Scale * Normal.x;
	DistanceVector.y = Scale * Normal.y;
	DistanceVector.z = Scale * Normal.z;

	return DistanceVector;
}
/*
 * Vector from the nearest point of the segment (L0,L1) to Position.
 *
 * L0, L1   : end points of the segment.
 * Position : query point.
 * Returns Position minus the nearest point on the segment.
 *
 * u = SegmentRatio(L0, L1, Position) is used as the parameter along the
 * segment with u = 0 at L0 and u = 1 at L1 (that is how the interior case
 * builds the foot point L0 + u * (L1 - L0)).  Consequently:
 *   - u > 1         : the nearest point is L1,
 *   - 0 <= u <= 1   : the nearest point is the foot point,
 *   - u < 0 or NaN  : the nearest point is L0.  A NaN ratio is presumably the
 *                     zero-length-segment case (L0 == L1); confirm against
 *                     SegmentRatio.
 * Every branch returns the same orientation (segment -> Position) so the
 * result is continuous across u = 0 and u = 1, and no path returns an
 * uninitialised vector.
 */
static __inline__ __device__ __host__ Scalar3 CalcDistanceVectorLine(const Scalar3* const L0,const Scalar3* const L1, const Scalar3* const Position)
{
	const Scalar u = SegmentRatio(L0, L1, Position);

	if(u > 1)
	{
		// Beyond the L1 end: clamp to L1.
		return *Position - *L1;
	}
	if(u >= 0)
	{
		// Perpendicular foot lies on the segment.
		Scalar3 Along = *L1 - *L0;
		Scalar3 Foot;
		Foot.x = L0->x + u * Along.x;
		Foot.y = L0->y + u * Along.y;
		Foot.z = L0->z + u * Along.z;
		return *Position - Foot;
	}
	// Before the L0 end (or u is NaN): clamp to L0.
	return *Position - *L0;
}

/*
 * Flattens the cell indices (i, j, k) into a single cell ID using the grid
 * size stored in BoundingBox: k varies fastest, then j (stride
 * m_GridSize.z), then i (stride m_GridSize.y * m_GridSize.z).
 * No range checking is done here; callers validate the returned ID.
 */
static __inline__ __device__ __host__ Integer GetCellID(const CGridBox* const BoundingBox, Integer i, Integer j, Integer k) 
{
	return (BoundingBox->m_GridSize.y * BoundingBox->m_GridSize.z) * i
		+ BoundingBox->m_GridSize.z * j
		+ k;
}

There are several strategies:

a) -maxrregcount for entire .cu files
b) __launch_bounds__ for individual kernels
c) move some local variables to shared memory
d) the “volatile trick” (not sure if it is still applicable to the LLVM-based compiler used for Compute 2.0 and later)
e) distributing the algorithm state (local variables) across several threads and using warp shuffles for inter-thread communication where necessary. Feasibility depends heavily on the algorithm, but for us this worked great on some hash functions (e.g. the SIMD hash or the Groestl hash function).

I am using Visual Studio to compile and run the CUDA code. Is it possible to use -maxrregcount or __launch_bounds__ within the code itself?