I have a CUDA problem that appears to be a compiler bug. It’s a fair amount of work to isolate the problem further, so I thought that I’d describe it and see if this is brand new or old hat.
I have a kernel (trilinearRunBy4) that attempts to do trilinear interpolation on an RGB pixel expressed as 3 bytes. The inner step is easy enough, and converts the RGB pixel into a CMYK value with a float4 representation, using two 2D texture lookups and interpolating the result (I wish that we had 3D lookups).
I fetch 4 bytes at a time from each of 3 separations and store 4 bytes at a time to 4 separations. This is MUCH better than a single byte at a time because I get to coalesce the global memory accesses. But it also leads to the problem.
The bug I’ve noted is that if I OMIT the noinline from the definition of oneSample that the compiler gets confused about register usage and I get the wrong values in the output. By including noinline I get the right results.
Further, if I omit the noinline, but add some code that does some useless floating point calculations on the results of oneSample, then I get the right results. I also get the right results in emulation.
So, the question is, should I go to the work of coming up with a simpler failing case?
For the record, I’m using CUDA 1.1 on Windows XP, with an EVGA 8800 GTX board, without overclocking.
// note: the following definition of FourBytes generates better code
// than using uchar4 (which would be more natural)
typedef unsigned char Byte;
typedef unsigned int FourBytes;
// use Intel byte order for FB and MK4
#define FB(XXX,NNN) ((XXX >> (NNN*8)) & 255)
#define MK4(X0,X1,X2,X3) (X0 + (X1 << 8) + (X2 << 16) + (X3 << 24))
__device__ __constant__ struct ConstStruct constData;
__device__ __constant__ float cvtR[256];
__device__ __constant__ float cvtG[256];
__device__ __constant__ float cvtB[256];
texture<float4, 2, cudaReadModeElementType> cornersTexture;
__device__ static Byte Clampus(float f) {
return (__saturatef(f)*255)+0.5f;
}
__device__ __noinline__ float4
oneSample(Byte r, Byte g, Byte b) {
float rF = cvtR[r];
float gF = cvtG[g];
float bF = cvtB[b];
Byte rI = rF;
float rFrac = rF - rI;
gF = gF + 0.5f;
bF = bF + 0.5f;
int off = __mul24(rI, NumBinsY);
float4 v0 = tex2D(cornersTexture, bF, gF+off);
float4 v1 = tex2D(cornersTexture, bF, gF+(off+NumBinsY));
float4 ret = make_float4(
v0.x + rFrac * (v1.x-v0.x),
v0.y + rFrac * (v1.y-v0.y),
v0.z + rFrac * (v1.z-v0.z),
v0.w + rFrac * (v1.w-v0.w));
return ret;
}
__global__ static void
trilinearRunBy4 () {
int tx = threadIdx.x + __mul24(blockIdx.x, blockDim.x);
int ty = threadIdx.y + __mul24(blockIdx.y, blockDim.y);
if (tx < constData.width/4 && ty < constData.height) {
int off = tx + ty * (constData.pitch / 4);
FourBytes r = ((FourBytes *) constData.rP)[off];
FourBytes g = ((FourBytes *) constData.gP)[off];
FourBytes b = ((FourBytes *) constData.bP)[off];
float4 v1 = oneSample(FB(r,0), FB(g,0), FB(b,0));
float4 v2 = oneSample(FB(r,1), FB(g,1), FB(b,1));
float4 v3 = oneSample(FB(r,2), FB(g,2), FB(b,2));
float4 v4 = oneSample(FB(r,3), FB(g,3), FB(b,3));
FourBytes c = MK4(Clampus(v1.x), Clampus(v2.x), Clampus(v3.x), Clampus(v4.x));
FourBytes m = MK4(Clampus(v1.y), Clampus(v2.y), Clampus(v3.y), Clampus(v4.y));
FourBytes y = MK4(Clampus(v1.z), Clampus(v2.z), Clampus(v3.z), Clampus(v4.z));
FourBytes k = MK4(Clampus(v1.w), Clampus(v2.w), Clampus(v3.w), Clampus(v4.w));
((FourBytes *) constData.cP)[off] = c;
((FourBytes *) constData.mP)[off] = m;
((FourBytes *) constData.yP)[off] = y;
((FourBytes *) constData.kP)[off] = k;
}
}