compiler bug? bit shift

Isn’t this a compiler bug in CUDA 2.0 and the CUDA 2.1 beta?

The source code (.cu) is:

[codebox] unsigned int val = g_idata[tid];

sdata[ tid * 4 + 0] = tex1Dfetch( tex, (val >> 24) & 0xff );

sdata[ tid * 4 + 1] = tex1Dfetch( tex, (val >> 16) & 0xff );

sdata[ tid * 4 + 2] = tex1Dfetch( tex, (val >> 8) & 0xff );

sdata[ tid * 4 + 3] = tex1Dfetch( tex, (val ) & 0xff );

[/codebox]

The compilation result (.ptx) is:

[codebox] ld.global.u32 %r4, [%r3+0];

shr.u32 	%r5, %r4, 24;		// (val >> 24) & 0xff

mov.s32 	%r6, 0;

mov.s32 	%r7, 0;

mov.s32 	%r8, 0;

tex.1d.v4.u32.s32 {%r9,%r10,%r11,%r12},[tex,{%r5,%r6,%r7,%r8}];

.loc	14	73	0

mov.s32 	%r13, %r9;

mul.wide.u16 	%r14, %rh1, 8;

mov.u32 	%r15, __cuda_sdata12;

add.u32 	%r16, %r14, %r15;

st.shared.u16 	[%r16+0], %r13;

shl.b32 	%r17, %r4, 8;		//

shr.s32 	%r18, %r17, 24;		// (val >> 16) & 0xff ??

mov.s32 	%r19, 0;

mov.s32 	%r20, 0;

mov.s32 	%r21, 0;

tex.1d.v4.u32.s32 {%r22,%r23,%r24,%r25},[tex,{%r18,%r19,%r20,%r21}];

.loc	14	74	0

mov.s32 	%r26, %r22;

st.shared.u16 	[%r16+2], %r26;

shl.b32 	%r27, %r4, 16;		//

shr.s32 	%r28, %r27, 24;		// (val >>  8) & 0xff ??

mov.s32 	%r29, 0;

mov.s32 	%r30, 0;

mov.s32 	%r31, 0;

tex.1d.v4.u32.s32 {%r32,%r33,%r34,%r35},[tex,{%r28,%r29,%r30,%r31}];

.loc	14	75	0

mov.s32 	%r36, %r32;

st.shared.u16 	[%r16+4], %r36;

and.b32 	%r37, %r4, 255;		// (val      ) & 0xff

mov.s32 	%r38, 0;

mov.s32 	%r39, 0;

mov.s32 	%r40, 0;

tex.1d.v4.u32.s32 {%r41,%r42,%r43,%r44},[tex,{%r37,%r38,%r39,%r40}];

.loc	14	76	0

mov.s32 	%r45, %r41;

st.shared.u16 	[%r16+6], %r45;

[/codebox]

‘shr.s32’ must be ‘shr.u32’.

I couldn’t find the prototype for tex1Dfetch(), but could its 2nd argument be a signed int by chance? The compiler might be saving you from some type casting operations.

It looks like it’s being “clever” and turning a right-shift followed by bitwise-and into a left-shift followed by a right-shift, but using the wrong kind of right-shift.
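
To make that concrete, here is a small host-side illustration (the value 0x12A45678 is just an arbitrary example): whenever the byte being extracted has its top bit set, the arithmetic right shift sign-extends and the texture index comes out negative.

[codebox]/* Host-side sketch with an arbitrary example value, showing why shr.s32
 * is wrong whenever the extracted byte has its high bit set. */
#include <stdio.h>

int main(void)
{
    unsigned int val = 0x12A45678u;

    /* What the source asks for: logical shift, then mask -> 0xA4 (164). */
    unsigned int correct = (val >> 16) & 0xff;

    /* What the emitted PTX computes: shl.b32 then shr.s32, i.e. an
     * ARITHMETIC right shift of a now-negative signed value
     * (implementation-defined in C, but sign-extending in practice). */
    int ptx_like = ((int)(val << 8)) >> 24;

    printf("correct index:  %u\n", correct);   /* 164 */
    printf("ptx-like index: %d\n", ptx_like);  /* -92, an invalid texture index */
    return 0;
}
[/codebox]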

Clearly a compiler bug. There are many of these involving integer ops, which are at once trickier and less well tested than floating-point ones.

The 2nd argument to tex1Dfetch has int type (it’s defined in “texture_fetch_functions.h”).

To be clear, is this actually producing incorrect results?

In future, please post complete code for reproducing the problem.

It shouldn’t matter that the argument is a signed int type. The expression in parentheses has to be evaluated first, using the unsigned type of ‘val’, before any conversion to int is done.

But in fact, the PTX would be incorrect even if ‘val’ were a signed type, because in the optimization the compiler is attempting, the right-shift always has to be logical.
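
A minimal illustration of that point, using an arbitrary value: even with a sign-extending shift on a signed ‘val’, the mask strips the extra bits afterwards, so the index always lands in [0, 255] (and even if an implementation shifted signed values logically, the mask would give the same answer).

[codebox]int sval = (int)0xF2A45678u;      /* negative when viewed as signed */
int idx  = (sval >> 16) & 0xff;   /* sign-extending shift, then mask: still 0xA4 */
/* idx is 164, in range, regardless of the signedness of the source value */
[/codebox]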

This is the critical part of the PTX:

shl.b32 	%r17, %r4, 8;		//

shr.s32 	%r18, %r17, 24;	// (val >> 16) & 0xff ??

The right-shift-followed-by-and is converted into a left-shift followed by a right-shift. Maybe it saves a few bytes of code? But the right-shift MUST be logical no matter what (i.e., shr.u32).
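
For comparison, the rewrite the compiler appears to be attempting is valid as long as both shifts stay unsigned; a quick sketch with an arbitrary 32-bit value:

[codebox]unsigned int val = 0x12A45678u;        /* arbitrary example value */
unsigned int a = (val >> 16) & 0xff;   /* as written in the source: 0xA4 */
unsigned int b = (val << 8) >> 24;     /* shl then LOGICAL shr: also 0xA4 */
/* a == b for every 32-bit val; substituting an arithmetic shr.s32 breaks this */
[/codebox]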

I have made sample code that reproduces the problem:

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

texture<int, 1, cudaReadModeElementType> tex;

__global__ void testKernel( unsigned char* g_src, int* g_dst)

{

__shared__ int sdata[ 256 ];

const unsigned int tid = threadIdx.x;

unsigned int* g_idata = (unsigned int*)g_src;

unsigned int val = g_idata[ tid];

sdata[ tid * 4 + 3] = tex1Dfetch( tex, (val >> 24) & 0xff );

sdata[ tid * 4 + 2] = tex1Dfetch( tex, (val >> 16) & 0xff );

sdata[ tid * 4 + 1] = tex1Dfetch( tex, (val >> 8) & 0xff );

sdata[ tid * 4 + 0] = tex1Dfetch( tex, (val ) & 0xff );

__syncthreads();

for (unsigned int i = tid; i < 256; i += blockDim.x) {

g_dst[ i] = sdata[ i];

}

}

__host__ void computeGold( int* reference, unsigned char* idata, int *tbl )

{

for (int i = 0; i < 256; ++i) {

reference[i] = tbl[ idata[i] ];

}

}

void runTest( int argc, char** argv);

int main( int argc, char** argv)

{

runTest( argc, argv);

CUT_EXIT(argc, argv);

}

void runTest( int argc, char** argv)

{

CUT_DEVICE_INIT(argc, argv);

unsigned char* h_idata = (unsigned char*) malloc( 256 );

for( unsigned int i = 0; i < 256; ++i) {

    h_idata[i] = i;

}

int datasize = sizeof(int) * 256;

int *h_tbl = (int*) malloc( datasize );

for (int i = 0;  i < 256;  ++i) {

    h_tbl[i] = i * i;

}

unsigned char* d_idata;

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, 256));

CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, 256, cudaMemcpyHostToDevice) );

int *d_tbl;

CUDA_SAFE_CALL( cudaMalloc( (void**)&d_tbl, datasize ) );

CUDA_SAFE_CALL( cudaMemcpy( d_tbl, h_tbl, datasize, cudaMemcpyHostToDevice ) );

int* d_odata;

CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, datasize));

size_t offset = 0;

CUDA_SAFE_CALL( cudaBindTexture( &offset, tex, d_tbl, datasize ));

testKernel<<< 1, 256 / 4 >>>( d_idata, d_odata);

CUDA_SAFE_CALL( cudaUnbindTexture( tex ));

CUT_CHECK_ERROR("Kernel execution failed");

int* h_odata = (int*) malloc( datasize );

CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, datasize, cudaMemcpyDeviceToHost) );

int* reference = (int*)malloc( datasize );

computeGold( reference, h_idata, h_tbl );

CUTBoolean res = cutComparei( reference, h_odata, 256 );

printf( "Test %s\n", (1 == res) ? "PASSED" : "FAILED");

free( h_idata);

free( h_odata);

free( reference);

free( h_tbl);

CUDA_SAFE_CALL(cudaFree(d_idata));

CUDA_SAFE_CALL(cudaFree(d_odata));

CUDA_SAFE_CALL(cudaFree(d_tbl));

}

[/codebox]

This code reports “PASSED” in emulation mode, but “FAILED” on the GPU.

Thanks for posting the code. I’ve filed a bug and will let you know the outcome.

I’d encourage you to sign up as a registered developer, then you can file bugs yourself and get a quicker response.

Thank you for the advice, Simon and everyone.