Use of shared memory increasing register count?


I have a problem when I tried to move from the use of local arrays to shared memory arrays for the purpose of collaboration between threads, and also to decrease register pressure: the nvidia compiler actually reports increased register count.

In the original version I have two arrays mapping[8] and localQuad[18], and with blockDim = 256, and 2 threads working on the same array, I now have shared float mapping[8128]; shared float localQuad[18128];

These arrays are addressed statically in the first version, so they are not spilled to global memory. Moving to shared arrays and introducing the required indexing changes, my register count went up from 39 to 46.

The new code is as follows (arrays not declared are in constant memory). The old one is analogous to this one, except for the two arrays are not shared, and indexed differently.

I have no idea why the register count went up, I tried hardcoding the inline functions thinking what if calling them requires moving input data to local memory, but this had no effect either.

Could someone point me into the right direction here?

__device__ inline float force(float x, float y) {

    return sin(M_PI*x)*sin(M_PI*y);


__device__ inline void jacInvT(float x, float y, float *map, float *result) {

	float det = (map[2*128]+map[6*128]*y)*(map[4*128+1]+map[6*128+1]*x)-(map[4*128]+map[6*128]*x)*(map[2*128+1]+map[6*128+1]*y);

    result[0] = 1.0f/det * (map[4*128+1]+map[6*128+1]*x);

    result[1] = -1.0f/det * (map[2*128+1]+map[6*128+1]*y);

    result[2] = -1.0f/det * (map[4*128]+map[6*128]*x);

    result[3] = 1.0f/det * (map[2*128]+map[6*128]*y);


__global__ void fem_kernel(int* squareNodeList, int* nodePtrs, float* nodes, float *value, float *load, int *loadIndex, int Ne, int quad, int pointsPerElement, int degree) {

	int tid = blockIdx.x*blockDim.x + threadIdx.x;

	int i = tid/2;

	int localIndex = threadIdx.x/2;

	int first = tid%2;

	if (i<Ne) {


// Main loop for each element


		__shared__ float mapping[8*128];

		__shared__ float localQuad[18*128];

		float g[9];

		float jacInvTranspose[4];

		int ind1, ind2;

		float I;

		for (int j=0; j<4; j++) {

			mapping[2*j*128+2*localIndex+first] = d_Minv[j*4]*nodes[2*(squareNodeList[i])+first] + d_Minv[j*4+1]*nodes[2*(squareNodeList[degree*Ne+i])+first] + d_Minv[j*4+2]*nodes[2*(squareNodeList[(pointsPerElement-(degree+1))*Ne+i])+first]+d_Minv[j*4+3]*nodes[2*(squareNodeList[(pointsPerElement-1)*Ne+i])+first];


		for (int j = 0 ; j<9; j++) {

			localQuad[2*j*128+2*localIndex+first] = mapping[2*localIndex+first] + mapping[2*128+2*localIndex+first]*d_quadPoints[j*2] + mapping[4*128+2*localIndex+first]*d_quadPoints[2*j+1] + mapping[6*128+2*localIndex+first]*d_quadPoints[j*2]*d_quadPoints[j*2+1];


			//evaluate kappa at quadrature points and multiply by quadrature weights

		for (int j = 0; j<quad; j++) {

			g[j] = kappa(localQuad[2*j*128+2*localIndex],localQuad[2*j*128+2*localIndex+1/*18*localIndex+2*j+1*/])*d_quadWeights[j];


		for (int j = first; j<pointsPerElement; j+=2) {

			ind1 = nodePtrs[squareNodeList[j*Ne+i]];

			if (ind1>=0) {

				for (int k = j; k<pointsPerElement; k++) {

					ind2 = nodePtrs[squareNodeList[k*Ne+i]];

					if (ind2>=0) {

						I = 0;

						for (int m = 0; m<quad; m++) {

							jacInvT(localQuad[2*m*128+2*localIndex],localQuad[2*m*128+2*localIndex+1], &mapping[2*localIndex],jacInvTranspose);

							I += g[m]*((jacInvTranspose[0]*d_Vx[m*pointsPerElement+j]+jacInvTranspose[1]*d_Vy[m*pointsPerElement+j]) * (jacInvTranspose[0]*d_Vx[m*pointsPerElement+k]+jacInvTranspose[1]*d_Vy[m*pointsPerElement+k]) +

								(jacInvTranspose[2]*d_Vx[m*pointsPerElement+j]+jacInvTranspose[3]*d_Vy[m*pointsPerElement+j]) * (jacInvTranspose[2]*d_Vx[m*pointsPerElement+k]+jacInvTranspose[3]*d_Vy[m*pointsPerElement+k]));


						value[j*pointsPerElement*Ne + k*Ne +i] = I;

						if (ind1 != ind2) {

							value[k*pointsPerElement*Ne + j*Ne +i] = I;




				I = 0;

				for (int k = 0; k<quad; k++) {

					I += d_V[k*pointsPerElement+j]*force(localQuad[2*k*128+2*localIndex],localQuad[2*k*128+2*localIndex+1])*d_quadWeights[k];


				load[j*Ne + i] = I;

				loadIndex[j*Ne + i] = ind1;





I assume you are compiling for compute capability 2.x? 2.x is a true load-store architecture even with regard to shared memory, so shared memory cannot directly serve as instruction operands and has to be moved through registers instead.

Use launch_bounds() (see appendix B.17 of the Programming Guide) to limit register use. If [font=“Courier New”]quad[/font] is a constant, unrolling the inner loops (by preceeding the with [font=“Courier New”]#pragma unroll[/font]) might help saving a register or two. You will probably have to combine unrolling with use of launch_bounds(), or the compiler will optimize through pipelining and use even more registers.

You can also write [font=“Courier New”]threadIdx.x[/font] instead of [font=“Courier New”]2*localIndex+first[/font] in a few places.