compiler bug? bit shift

Isn’t this a compiler bug in CUDA 2.0 and the CUDA 2.1 beta?

The source code (.cu) is:

[codebox] unsigned int val = g_idata[tid];

sdata[ tid * 4 + 0] = tex1Dfetch( tex, (val >> 24) & 0xff );

sdata[ tid * 4 + 1] = tex1Dfetch( tex, (val >> 16) & 0xff );

sdata[ tid * 4 + 2] = tex1Dfetch( tex, (val >> 8) & 0xff );

sdata[ tid * 4 + 3] = tex1Dfetch( tex, (val ) & 0xff );

[/codebox]

The compilation result (.ptx) is:

[codebox] ld.global.u32 %r4, [%r3+0];

shr.u32 	%r5, %r4, 24;		// (val >> 24) & 0xff

mov.s32 	%r6, 0;

mov.s32 	%r7, 0;

mov.s32 	%r8, 0;

tex.1d.v4.u32.s32 {%r9,%r10,%r11,%r12},[tex,{%r5,%r6,%r7,%r8}];

.loc	14	73	0

mov.s32 	%r13, %r9;

mul.wide.u16 	%r14, %rh1, 8;

mov.u32 	%r15, __cuda_sdata12;

add.u32 	%r16, %r14, %r15;

st.shared.u16 	[%r16+0], %r13;

shl.b32 	%r17, %r4, 8;		//

shr.s32 	%r18, %r17, 24;		// (val >> 16) & 0xff ??

mov.s32 	%r19, 0;

mov.s32 	%r20, 0;

mov.s32 	%r21, 0;

tex.1d.v4.u32.s32 {%r22,%r23,%r24,%r25},[tex,{%r18,%r19,%r20,%r21}];

.loc	14	74	0

mov.s32 	%r26, %r22;

st.shared.u16 	[%r16+2], %r26;

shl.b32 	%r27, %r4, 16;		//

shr.s32 	%r28, %r27, 24;		// (val >>  8) & 0xff ??

mov.s32 	%r29, 0;

mov.s32 	%r30, 0;

mov.s32 	%r31, 0;

tex.1d.v4.u32.s32 {%r32,%r33,%r34,%r35},[tex,{%r28,%r29,%r30,%r31}];

.loc	14	75	0

mov.s32 	%r36, %r32;

st.shared.u16 	[%r16+4], %r36;

and.b32 	%r37, %r4, 255;		// (val      ) & 0xff

mov.s32 	%r38, 0;

mov.s32 	%r39, 0;

mov.s32 	%r40, 0;

tex.1d.v4.u32.s32 {%r41,%r42,%r43,%r44},[tex,{%r37,%r38,%r39,%r40}];

.loc	14	76	0

mov.s32 	%r45, %r41;

st.shared.u16 	[%r16+6], %r45;

[/codebox]

‘shr.s32’ must be ‘shr.u32’.

I couldn’t find the prototype for tex1Dfetch(), but could its 2nd argument be a signed int by chance? The compiler might be saving you from some type casting operations.

It looks like it’s being “clever” and turning a right-shift followed by bitwise-and into a left-shift followed by a right-shift, but using the wrong kind of right-shift.
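
To make that concrete, here is a small host-side illustration (the value 0x12A45678 is just an arbitrary example): whenever the byte being extracted has its top bit set, the arithmetic right shift sign-extends and the texture index comes out negative.

[codebox]/* Host-side sketch with an arbitrary example value, showing why shr.s32
 * is wrong whenever the extracted byte has its high bit set. */
#include <stdio.h>

int main(void)
{
    unsigned int val = 0x12A45678u;

    /* What the source asks for: logical shift, then mask -> 0xA4 (164). */
    unsigned int correct = (val >> 16) & 0xff;

    /* What the emitted PTX computes: shl.b32 then shr.s32, i.e. an
     * ARITHMETIC right shift of a now-negative signed value
     * (implementation-defined in C, but sign-extending in practice). */
    int ptx_like = ((int)(val << 8)) >> 24;

    printf("correct index:  %u\n", correct);   /* 164 */
    printf("ptx-like index: %d\n", ptx_like);  /* -92, an invalid texture index */
    return 0;
}
[/codebox]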

Clearly a compiler bug. There are many of these involving integer ops, which are at once trickier and less well tested than floating-point ones.

The 2nd argument to tex1Dfetch has int type (it’s defined in “texture_fetch_functions.h”).

To be clear, is this actually producing incorrect results?

In future, please post complete code for reproducing the problem.

It shouldn’t matter that the argument is a signed int type. The expression in parentheses has to be evaluated first, using the unsigned type of ‘val’, before any conversion to int is done.

But in fact, the PTX would be incorrect even if ‘val’ were a signed type, because in the optimization the compiler is attempting, the right-shift always has to be logical.
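
A minimal illustration of that point, using an arbitrary value: even with a sign-extending shift on a signed ‘val’, the mask strips the extra bits afterwards, so the index always lands in [0, 255] (and even if an implementation shifted signed values logically, the mask would give the same answer).

[codebox]int sval = (int)0xF2A45678u;      /* negative when viewed as signed */
int idx  = (sval >> 16) & 0xff;   /* sign-extending shift, then mask: still 0xA4 */
/* idx is 164, in range, regardless of the signedness of the source value */
[/codebox]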

This is the critical part of the PTX:

shl.b32 	%r17, %r4, 8;		//

shr.s32 	%r18, %r17, 24;	// (val >> 16) & 0xff ??

The right-shift-followed-by-and is converted into a left-shift followed by a right-shift. Maybe it saves a few bytes of code? But the right-shift MUST be logical no matter what (i.e., shr.u32).
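
For comparison, the rewrite the compiler appears to be attempting is valid as long as both shifts stay unsigned; a quick sketch with an arbitrary 32-bit value:

[codebox]unsigned int val = 0x12A45678u;        /* arbitrary example value */
unsigned int a = (val >> 16) & 0xff;   /* as written in the source: 0xA4 */
unsigned int b = (val << 8) >> 24;     /* shl then LOGICAL shr: also 0xA4 */
/* a == b for every 32-bit val; substituting an arithmetic shr.s32 breaks this */
[/codebox]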

I have made sample code that reproduces the problem:

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

texture<int, 1, cudaReadModeElementType> tex;

__global__ void testKernel( unsigned char* g_src, int* g_dst)

{

__shared__ int sdata[ 256 ];

const unsigned int tid = threadIdx.x;

unsigned int* g_idata = (unsigned int*)g_src;

unsigned int val = g_idata[ tid];

sdata[ tid * 4 + 3] = tex1Dfetch( tex, (val >> 24) & 0xff );

sdata[ tid * 4 + 2] = tex1Dfetch( tex, (val >> 16) & 0xff );

sdata[ tid * 4 + 1] = tex1Dfetch( tex, (val >> 8) & 0xff );

sdata[ tid * 4 + 0] = tex1Dfetch( tex, (val ) & 0xff );

__syncthreads();

for (unsigned int i = tid; i < 256; i += blockDim.x) {

g_dst[ i] = sdata[ i];

}

}

__host__ void computeGold( int* reference, unsigned char* idata, int *tbl )

{

for (int i = 0; i < 256; ++i) {

reference[i] = tbl[ idata[i] ];

}

}

void runTest( int argc, char** argv);

int main( int argc, char** argv)

{

runTest( argc, argv);

CUT_EXIT(argc, argv);

}

void runTest( int argc, char** argv)

{

CUT_DEVICE_INIT(argc, argv);

unsigned char* h_idata = (unsigned char*) malloc( 256 );

for( unsigned int i = 0; i < 256; ++i) {

    h_idata[i] = i;

}

int datasize = sizeof(int) * 256;

int *h_tbl = (int*) malloc( datasize );

for (int i = 0;  i < 256;  ++i) {

    h_tbl[i] = i * i;

}

unsigned char* d_idata;

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, 256));

CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, 256, cudaMemcpyHostToDevice) );

int *d_tbl;

CUDA_SAFE_CALL( cudaMalloc( (void**)&d_tbl, datasize ) );

CUDA_SAFE_CALL( cudaMemcpy( d_tbl, h_tbl, datasize, cudaMemcpyHostToDevice ) );

int* d_odata;

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, datasize));

size_t offset = 0;

CUDA_SAFE_CALL( cudaBindTexture( &offset, tex, d_tbl, datasize ));

testKernel<<< 1, 256 / 4 >>>( d_idata, d_odata);

CUDA_SAFE_CALL( cudaUnbindTexture( tex ));

CUT_CHECK_ERROR("Kernel execution failed");

int* h_odata = (int*) malloc( datasize );

CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, datasize, cudaMemcpyDeviceToHost) );

int* reference = (int*)malloc( datasize );

computeGold( reference, h_idata, h_tbl );

CUTBoolean res = cutComparei( reference, h_odata, 256 );

printf( "Test %s\n", (1 == res) ? "PASSED" : "FAILED");

free( h_idata);

free( h_odata);

free( reference);

free( h_tbl);

CUDA_SAFE_CALL(cudaFree(d_idata));

CUDA_SAFE_CALL(cudaFree(d_odata));

CUDA_SAFE_CALL(cudaFree(d_tbl));

}

[/codebox]

This code reports “PASSED” in emulation mode, but “FAILED” on the GPU.

Thanks for posting the code. I’ve filed a bug and will let you know the outcome.

I’d encourage you to sign up as a registered developer, then you can file bugs yourself and get a quicker response.

Thank you for the advice, Simon and everyone.