Possible nvcc register usage bug

I have a CUDA problem that appears to be a compiler bug. It’s a fair amount of work to isolate the problem further, so I thought that I’d describe it and see if this is brand new or old hat.

I have a kernel (trilinearRunBy4) that attempts to do trilinear interpolation on an RGB pixel expressed as 3 bytes. The inner step is easy enough, and converts the RGB pixel into a CMYK value with a float4 representation, using two 2D texture lookups and interpolating the result (I wish that we had 3D lookups).

I fetch 4 bytes at a time from each of 3 separations and store 4 bytes at a time to 4 separations. This is MUCH better than a single byte at a time because I get to coalesce the global memory accesses. But it also leads to the problem.

The bug I’ve noted is that if I OMIT the noinline from the definition of oneSample that the compiler gets confused about register usage and I get the wrong values in the output. By including noinline I get the right results.

Further, if I omit the noinline, but add some code that does some useless floating point calculations on the results of oneSample, then I get the right results. I also get the right results in emulation.

So, the question is, should I go to the work of coming up with a simpler failing case?

For the record, I’m using CUDA 1.1 on Windows XP, with an EVGA 8800 GTX board, without overclocking.

// note: the following definition of FourBytes generates better code

// than using uchar4 (which would be more natural)

typedef unsigned char Byte;

typedef unsigned int FourBytes;

// use Intel byte order for FB and MK4

#define FB(XXX,NNN) ((XXX >> (NNN*8)) & 255)

#define MK4(X0,X1,X2,X3) (X0 + (X1 << 8) + (X2 << 16) + (X3 << 24))

__device__ __constant__ struct ConstStruct constData;

__device__ __constant__ float cvtR[256];

__device__ __constant__ float cvtG[256];

__device__ __constant__ float cvtB[256];

texture<float4, 2, cudaReadModeElementType> cornersTexture;

__device__ static Byte Clampus(float f) {

	return (__saturatef(f)*255)+0.5f;

}

__device__ __noinline__ float4

oneSample(Byte r, Byte g, Byte b) {

	float rF = cvtR[r];

	float gF = cvtG[g];

	float bF = cvtB[b];

	Byte rI = rF;

	float rFrac = rF - rI;

	gF = gF + 0.5f;

	bF = bF + 0.5f;

	int off = __mul24(rI, NumBinsY);

	float4 v0 = tex2D(cornersTexture, bF, gF+off);

	float4 v1 = tex2D(cornersTexture, bF, gF+(off+NumBinsY));

	float4 ret = make_float4(

  v0.x + rFrac * (v1.x-v0.x),

  v0.y + rFrac * (v1.y-v0.y),

  v0.z + rFrac * (v1.z-v0.z),

  v0.w + rFrac * (v1.w-v0.w));

	return ret;

}

__global__ static void

trilinearRunBy4 () {

	int tx = threadIdx.x + __mul24(blockIdx.x, blockDim.x);

	int ty = threadIdx.y + __mul24(blockIdx.y, blockDim.y);

	if (tx < constData.width/4 && ty < constData.height) {

  int off = tx + ty * (constData.pitch / 4);

  FourBytes r = ((FourBytes *) constData.rP)[off];

  FourBytes g = ((FourBytes *) constData.gP)[off];

  FourBytes b = ((FourBytes *) constData.bP)[off];

  float4 v1 = oneSample(FB(r,0), FB(g,0), FB(b,0));

  float4 v2 = oneSample(FB(r,1), FB(g,1), FB(b,1));

  float4 v3 = oneSample(FB(r,2), FB(g,2), FB(b,2));

  float4 v4 = oneSample(FB(r,3), FB(g,3), FB(b,3));

  FourBytes c = MK4(Clampus(v1.x), Clampus(v2.x), Clampus(v3.x), Clampus(v4.x));

  FourBytes m = MK4(Clampus(v1.y), Clampus(v2.y), Clampus(v3.y), Clampus(v4.y));

  FourBytes y = MK4(Clampus(v1.z), Clampus(v2.z), Clampus(v3.z), Clampus(v4.z));

  FourBytes k = MK4(Clampus(v1.w), Clampus(v2.w), Clampus(v3.w), Clampus(v4.w));

  ((FourBytes *) constData.cP)[off] = c; 

  ((FourBytes *) constData.mP)[off] = m; 

  ((FourBytes *) constData.yP)[off] = y; 

  ((FourBytes *) constData.kP)[off] = k;

	}

}