Ways of manual loop unrolling as a workaround to avoid unnecessary register spills

Hello,

Recently I’ve hit a problem where compiler performs dull register spills to the local memory. The solution I found is to manually unroll loops. I remember someone long time ago mentioned that there is macros based loop unrolling technique which can simplify the task, but I couldn’t find it.

So, how do you unroll loops ?

Here is the kernel (compile with --ptx) causing problems:

#define INSERT_SORTED(j) \

  if (minval[j] > t) \

  {\

    float tt = minval[j];\

    float nn = minind[j];\

    minval[j] = t;\

    minind[j] = n;\

    t = tt;\

    n = nn;\

 }

#define MERGE_SORTED(i) \

  {\

      sd[threadIdx.x] = minval[i];\

      \

      if (k >= 32)\

        __syncthreads();\

        \

      float t = sd[threadIdx.x + k];\

      \

      sd[threadIdx.x] = __int_as_float(minind[i]);\

      \

      if (k >= 32)\

        __syncthreads();\

        \

      int n = __float_as_int(sd[threadIdx.x + k]);\

      \

      if ((threadIdx.x & (k*2-1)) == 0)\

      {\

        INSERT_SORTED(0);\

        INSERT_SORTED(1);\

        INSERT_SORTED(2);\

        INSERT_SORTED(3);\

        INSERT_SORTED(4);\

        INSERT_SORTED(5);\

      }\

  }

__constant__ struct

{

  int numvals;

  float*  in;

  float2* out;

} c;

__global__ void test_kernel()

{

  const int BLOCK_DIM_X = 64;

__shared__ float sd[BLOCK_DIM_X];

int stride = gridDim.x * blockDim.x;

float  minval[6];

  int    minind[6];

for (int i = 0; i < 6; ++i)

    minval[i] = 3.402823466e+38F;

for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < c.numvals; i += stride)

  {

    float t = c.in[i];

    int   n = i;

for (int j = 0; j < 6; ++j)

      INSERT_SORTED(j);

  }

for (int k = 1; k != BLOCK_DIM_X; k *= 2)

  {

#if 1

    //BUG!! - this code is the same as below, but this version makes minval and minind arrays get spilled to the local mem!!!

    for (int i = 0; i < 6; ++i)

      MERGE_SORTED(i);

#else

    MERGE_SORTED(0);

    MERGE_SORTED(1);

    MERGE_SORTED(2);

    MERGE_SORTED(3);

    MERGE_SORTED(4);

    MERGE_SORTED(5);

#endif

  }

if (threadIdx.x == 0 )

  {

    for (int i = 0; i < 6; ++i)

    {

      c.out[blockIdx.x] = make_float2(__int_as_float(minind[i]), minval[i]);

    }

  }

}

One can specifically request that the compiler unroll a particular loop by using a pragma (see section E.2 of the CUDA C Programming Guide):

for (int k = 1; k != BLOCK_DIM_X; k *= 2) {

#pragma unroll 6

        for (int i = 0; i < 6; ++i) {

            MERGE_SORTED(i);

        }

    }

Thanks for the tip - works like a charm!