Hello,
Recently I’ve hit a problem where compiler performs dull register spills to the local memory. The solution I found is to manually unroll loops. I remember someone long time ago mentioned that there is macros based loop unrolling technique which can simplify the task, but I couldn’t find it.
So, how do you unroll loops ?
Here is the kernel (compile with --ptx) causing problems:
#define INSERT_SORTED(j) \
if (minval[j] > t) \
{\
float tt = minval[j];\
float nn = minind[j];\
minval[j] = t;\
minind[j] = n;\
t = tt;\
n = nn;\
}
#define MERGE_SORTED(i) \
{\
sd[threadIdx.x] = minval[i];\
\
if (k >= 32)\
__syncthreads();\
\
float t = sd[threadIdx.x + k];\
\
sd[threadIdx.x] = __int_as_float(minind[i]);\
\
if (k >= 32)\
__syncthreads();\
\
int n = __float_as_int(sd[threadIdx.x + k]);\
\
if ((threadIdx.x & (k*2-1)) == 0)\
{\
INSERT_SORTED(0);\
INSERT_SORTED(1);\
INSERT_SORTED(2);\
INSERT_SORTED(3);\
INSERT_SORTED(4);\
INSERT_SORTED(5);\
}\
}
__constant__ struct
{
int numvals;
float* in;
float2* out;
} c;
__global__ void test_kernel()
{
const int BLOCK_DIM_X = 64;
__shared__ float sd[BLOCK_DIM_X];
int stride = gridDim.x * blockDim.x;
float minval[6];
int minind[6];
for (int i = 0; i < 6; ++i)
minval[i] = 3.402823466e+38F;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < c.numvals; i += stride)
{
float t = c.in[i];
int n = i;
for (int j = 0; j < 6; ++j)
INSERT_SORTED(j);
}
for (int k = 1; k != BLOCK_DIM_X; k *= 2)
{
#if 1
//BUG!! - this code is the same as below, but this version makes minval and minind arrays get spilled to the local mem!!!
for (int i = 0; i < 6; ++i)
MERGE_SORTED(i);
#else
MERGE_SORTED(0);
MERGE_SORTED(1);
MERGE_SORTED(2);
MERGE_SORTED(3);
MERGE_SORTED(4);
MERGE_SORTED(5);
#endif
}
if (threadIdx.x == 0 )
{
for (int i = 0; i < 6; ++i)
{
c.out[blockIdx.x] = make_float2(__int_as_float(minind[i]), minval[i]);
}
}
}