# Register count increases when a loop is unrolled

Hi.

I have the following procedure, which is called many times in a loop:

``````
// block1 and block2 point into shared memory

/*
 * Sum of squared differences (squared L2 distance) between two blocks of
 * bytes.
 *
 * block1, block2 : first byte of each block
 * blocksize      : number of rows, and number of bytes compared per row
 * stride1        : offset between consecutive rows of block1
 * stride2        : offset between consecutive rows of block2
 *
 * Returns the sum of (block1 - block2)^2 over every compared byte.
 */
__device__ int L2D_4x4(unsigned char *block1, unsigned char *block2 ,int blocksize, int stride1, int stride2)
{
    int sum = 0;

    // Offset just past the last row of block1; loop-invariant, so computed once.
    const int end1 = stride1 * blocksize;

    for (int off1 = 0, off2 = 0; off1 < end1; off1 += stride1, off2 += stride2)
    {
        const unsigned char *row1 = &block1[off1];
        const unsigned char *row2 = &block2[off2];

        // A distinct column index: the row offset into block2 must not be shadowed.
        for (int col = 0; col < blocksize; col++)
        {
            // Both operands promote to int, so the difference lies in
            // [-255, 255] and is exact in the 24-bit multiply.
            const int diff = row1[col] - row2[col];

            // Plain accumulation; the previous "sum += sum + ..." doubled the
            // running total on every step.
            sum += __mul24(diff, diff);
        }
    }

    return sum;
}
``````

When this procedure is compiled, the cubin file reports 22 registers.

If I unroll the loop by hand (in order to save time), like so:

``````
/*
 * Hand-unrolled sum of squared differences over a 4x4 block of bytes.
 *
 * block1, block2 : first byte of each block
 * blocksize      : not used; the 4x4 extent is written out explicitly
 * stride1        : offset between consecutive rows of block1
 * stride2        : offset between consecutive rows of block2
 *
 * Returns the sum of (block1 - block2)^2 over the 16 compared bytes.
 */
__device__ int L2D_4x4(unsigned char *block1, unsigned char *block2 ,int blocksize, int stride1, int stride2)
{
    // Start of each of the four rows in both blocks.
    unsigned char *a0 = block1;
    unsigned char *a1 = block1 + stride1;
    unsigned char *a2 = block1 + 2*stride1;
    unsigned char *a3 = block1 + 3*stride1;

    unsigned char *b0 = block2;
    unsigned char *b1 = block2 + stride2;
    unsigned char *b2 = block2 + 2*stride2;
    unsigned char *b3 = block2 + 3*stride2;

    int acc = 0;
    int d;   // current byte difference (operands promote to int)

    // row 0
    d = a0[0] - b0[0];  acc += __mul24(d, d);
    d = a0[1] - b0[1];  acc += __mul24(d, d);
    d = a0[2] - b0[2];  acc += __mul24(d, d);
    d = a0[3] - b0[3];  acc += __mul24(d, d);

    // row 1
    d = a1[0] - b1[0];  acc += __mul24(d, d);
    d = a1[1] - b1[1];  acc += __mul24(d, d);
    d = a1[2] - b1[2];  acc += __mul24(d, d);
    d = a1[3] - b1[3];  acc += __mul24(d, d);

    // row 2
    d = a2[0] - b2[0];  acc += __mul24(d, d);
    d = a2[1] - b2[1];  acc += __mul24(d, d);
    d = a2[2] - b2[2];  acc += __mul24(d, d);
    d = a2[3] - b2[3];  acc += __mul24(d, d);

    // row 3
    d = a3[0] - b3[0];  acc += __mul24(d, d);
    d = a3[1] - b3[1];  acc += __mul24(d, d);
    d = a3[2] - b3[2];  acc += __mul24(d, d);
    d = a3[3] - b3[3];  acc += __mul24(d, d);

    return acc;
}
``````

Then the number of registers doubles (!). How can I keep the register count the same in both cases?

P.S. The number of registers also increases when I unroll other loops.

I use CUDA 0.9. I did not see this problem in the previous version.

Many thanks.