Bug in nvcc, incorrect sign-extend for textures Incorrect sign-extend when the results of one textur

The bug appears to occur when the results of a tex1Dfetch is used as the direct index for another tex1Dfetch. Example kernel code:

typedef unsigned char Byte;

typedef unsigned short Half;

typedef unsigned int Word;

texture<Half, 1, cudaReadModeElementType> trcTexture;

texture<Byte, 1, cudaReadModeElementType> sourceTexture;

__global__ void

checkBuggy(Word *dst) {

	const int gx = threadIdx.x + blockDim.x*blockIdx.x;

	Word b = tex1Dfetch(sourceTexture, gx);

	// Bug: b is sign-extended, despite unsigned declaration

	// Hint, look for "cvt.s8.s32" in the ptx file.

	Word in = tex1Dfetch(trcTexture, b);

	dst[gx] = in;

}

The bug can be easily observed in the ptx file where a “cvt.s8.s32” instruction is used when a “cvt.u8.u32” should be used.

Adding intervening operations (such as masking or addition) on the variable b after the first tex1Dfetch and before the second tex1Dfetch causes the problem to go away (no sign extension). However, casting is ineffective, and adding a constant 0 does not make the problem go away.

I’m using a recently fetched CUDA 2.1 on a Vista 64 system running Visual Studio 2005, compiling for capability 1.1.

This bug occurs more frequently than I originally suspected. One can make it occur even with one texture in a very simple kernel program. For example:

texture<float, 1, cudaReadModeElementType> testTexture;

__global__ static void

testSignBug(float *dst) {

	unsigned int tx = threadIdx.x;

	unsigned char x = tx;

	float v = tex1Dfetch(testTexture, x);

	dst[tx] = v;

}

In the above program, x gets sign-extended when used to index testTexture. The bug can be easily observed in the ptx file where a “cvt.s8.s32” instruction is used.

This bug is making me a little paranoid about using textures. I now have two programs where it has shown up.

I’m using a recently fetched CUDA 2.1 on a Vista 64 system running Visual Studio 2005, compiling for capability 1.1.

Any news from nVidia that this bug has been noted?

Russ, if you’ve not already done so, you might want to become a registered developer and file an official bug report.

The bug is fixed in CUDA 2.2:

.version 1.4

		.target sm_10, map_f64_to_f32

		// compiled with /usr/local/cuda-linux64-rel-nightly-2.2.1636/open64/lib//be

		// nvopencc built on 2009-03-05

		.reg .u32 %ra<17>;

		.reg .u64 %rda<17>;

		.reg .f32 %fa<17>;

		.reg .f64 %fda<17>;

		.reg .u32 %rv<5>;

		.reg .u64 %rdv<5>;

		.reg .f32 %fv<5>;

		.reg .f64 %fdv<5>;

		//-----------------------------------------------------------

		// Compiling /tmp/tmpxft_00000a0e_00000000-7_bug_ptx.cpp3.i (/tmp/ccBI#.0CZnUP)

		//-----------------------------------------------------------

		//-----------------------------------------------------------

		// Options:

		//-----------------------------------------------------------

		//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64

		//  -O3 (Optimization level)

		//  -g0 (Debug level)

		//  -m2 (Report advisories)

		//-----------------------------------------------------------

		.file   1	   "/tmp/tmpxft_00000a0e_00000000-6_bug_ptx.cudafe2.gpu"

		.file   2	   "/usr/lib/gcc/x86_64-redhat-linux/3.4.6/include/stddef.h"

		.file   3	   "/usr/local/cuda/bin/../include/crt/device_runtime.h"

		.file   4	   "/usr/local/cuda/bin/../include/host_defines.h"

		.file   5	   "/usr/local/cuda/bin/../include/builtin_types.h"

		.file   6	   "/usr/local/cuda/bin/../include/device_types.h"

		.file   7	   "/usr/local/cuda/bin/../include/driver_types.h"

		.file   8	   "/usr/local/cuda/bin/../include/texture_types.h"

		.file   9	   "/usr/local/cuda/bin/../include/vector_types.h"

		.file   10	  "/usr/local/cuda/bin/../include/device_launch_parameters.h"

		.file   11	  "/usr/local/cuda/bin/../include/crt/storage_class.h"

		.file   12	  "/usr/include/bits/types.h"

		.file   13	  "/usr/include/time.h"

		.file   14	  "bug_ptx.cu"

		.file   15	  "/usr/local/cuda/bin/../include/common_functions.h"

		.file   16	  "/usr/local/cuda/bin/../include/crt/func_macro.h"

		.file   17	  "/usr/local/cuda/bin/../include/math_functions.h"

		.file   18	  "/usr/local/cuda/bin/../include/device_functions.h"

		.file   19	  "/usr/local/cuda/bin/../include/math_constants.h"

		.file   20	  "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"

		.file   21	  "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"

		.file   22	  "/usr/local/cuda/bin/../include/sm_13_double_functions.h"

		.file   23	  "/usr/local/cuda/bin/../include/common_types.h"

		.file   24	  "/usr/local/cuda/bin/../include/texture_fetch_functions.h"

		.file   25	  "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"

		.tex .u64 trcTexture;

		.tex .u64 sourceTexture;

		.entry _Z10checkBuggyPj (

				.param .u64 __cudaparm__Z10checkBuggyPj_dst)

		{

		.reg .u16 %rh<4>;

		.reg .u32 %r<24>;

		.reg .u64 %rd<6>;

		.loc	14	  9	   0

$LBB1__Z10checkBuggyPj:

		cvt.u32.u16	 %r1, %tid.x;			// 

		mov.u16		 %rh1, %ctaid.x;		 // 

		mov.u16		 %rh2, %ntid.x;		  // 

		mul.wide.u16	%r2, %rh1, %rh2;		// 

		add.u32		 %r3, %r1, %r2;		  // 

		mov.s32		 %r4, %r3;			   // 

		mov.s32		 %r5, 0;				 // 

		mov.s32		 %r6, 0;				 // 

		mov.s32		 %r7, 0;				 // 

		tex.1d.v4.u32.s32 {%r8,%r9,%r10,%r11},[sourceTexture,{%r4,%r5,%r6,%r7}];

		.loc	14	  12	  0

		mov.s32		 %r12, %r8;			  // 

		cvt.u8.u32	  %r13, %r12;			 // 

		mov.s32		 %r14, 0;				// 

		mov.s32		 %r15, 0;				// 

		mov.s32		 %r16, 0;				// 

		tex.1d.v4.u32.s32 {%r17,%r18,%r19,%r20},[trcTexture,{%r13,%r14,%r15,%r16}];

		.loc	14	  17	  0

		mov.s32		 %r21, %r17;			 // 

		.loc	14	  18	  0

		cvt.u16.u32	 %r22, %r21;			 // 

		ld.param.u64	%rd1, [__cudaparm__Z10checkBuggyPj_dst];		// id:26 __cudaparm__Z10checkBuggyPj_dst+0x0

		cvt.u64.s32	 %rd2, %r3;			  // 

		mul.lo.u64	  %rd3, %rd2, 4;		  // 

		add.u64		 %rd4, %rd1, %rd3;	   // 

		st.global.u32   [%rd4+0], %r22; // id:27

		.loc	14	  19	  0

		exit;						   // 

$LDWend__Z10checkBuggyPj:

		} // _Z10checkBuggyPj
.version 1.4

		.target sm_10, map_f64_to_f32

		// compiled with /usr/local/cuda-linux64-rel-nightly-2.2.1636/open64/lib//be

		// nvopencc built on 2009-03-05

		.reg .u32 %ra<17>;

		.reg .u64 %rda<17>;

		.reg .f32 %fa<17>;

		.reg .f64 %fda<17>;

		.reg .u32 %rv<5>;

		.reg .u64 %rdv<5>;

		.reg .f32 %fv<5>;

		.reg .f64 %fdv<5>;

		//-----------------------------------------------------------

		// Compiling /tmp/tmpxft_00000a2f_00000000-7_bug_ptx2.cpp3.i (/tmp/ccBI#.QIglVC)

		//-----------------------------------------------------------

		//-----------------------------------------------------------

		// Options:

		//-----------------------------------------------------------

		//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64

		//  -O3 (Optimization level)

		//  -g0 (Debug level)

		//  -m2 (Report advisories)

		//-----------------------------------------------------------

		.file   1	   "/tmp/tmpxft_00000a2f_00000000-6_bug_ptx2.cudafe2.gpu"

		.file   2	   "/usr/lib/gcc/x86_64-redhat-linux/3.4.6/include/stddef.h"

		.file   3	   "/usr/local/cuda/bin/../include/crt/device_runtime.h"

		.file   4	   "/usr/local/cuda/bin/../include/host_defines.h"

		.file   5	   "/usr/local/cuda/bin/../include/builtin_types.h"

		.file   6	   "/usr/local/cuda/bin/../include/device_types.h"

		.file   7	   "/usr/local/cuda/bin/../include/driver_types.h"

		.file   8	   "/usr/local/cuda/bin/../include/texture_types.h"

		.file   9	   "/usr/local/cuda/bin/../include/vector_types.h"

		.file   10	  "/usr/local/cuda/bin/../include/device_launch_parameters.h"

		.file   11	  "/usr/local/cuda/bin/../include/crt/storage_class.h"

		.file   12	  "/usr/include/bits/types.h"

		.file   13	  "/usr/include/time.h"

		.file   14	  "bug_ptx2.cu"

		.file   15	  "/usr/local/cuda/bin/../include/common_functions.h"

		.file   16	  "/usr/local/cuda/bin/../include/crt/func_macro.h"

		.file   17	  "/usr/local/cuda/bin/../include/math_functions.h"

		.file   18	  "/usr/local/cuda/bin/../include/device_functions.h"

		.file   19	  "/usr/local/cuda/bin/../include/math_constants.h"

		.file   20	  "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"

		.file   21	  "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"

		.file   22	  "/usr/local/cuda/bin/../include/sm_13_double_functions.h"

		.file   23	  "/usr/local/cuda/bin/../include/common_types.h"

		.file   24	  "/usr/local/cuda/bin/../include/texture_fetch_functions.h"

		.file   25	  "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"

		.tex .u64 testTexture;

		.entry _Z11testSignBugPf (

				.param .u64 __cudaparm__Z11testSignBugPf_dst)

		{

		.reg .u32 %r<7>;

		.reg .u64 %rd<6>;

		.reg .f32 %f<7>;

		.loc	14	  3	   0

$LBB1__Z11testSignBugPf:

		cvt.u32.u16	 %r1, %tid.x;			// 

		cvt.u8.u32	  %r2, %r1;			   // 

		mov.s32		 %r3, 0;				 // 

		mov.s32		 %r4, 0;				 // 

		mov.s32		 %r5, 0;				 // 

		tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[testTexture,{%r2,%r3,%r4,%r5}];

		.loc	14	  6	   0

		mov.f32		 %f5, %f1;			   // 

		.loc	14	  7	   0

		ld.param.u64	%rd1, [__cudaparm__Z11testSignBugPf_dst];	   // id:14 __cudaparm__Z11testSignBugPf_dst+0x0

		cvt.u64.u32	 %rd2, %r1;			  // 

		mul.lo.u64	  %rd3, %rd2, 4;		  // 

		add.u64		 %rd4, %rd1, %rd3;	   // 

		st.global.f32   [%rd4+0], %f5;  // id:15

		.loc	14	  8	   0

		exit;						   // 

$LDWend__Z11testSignBugPf:

		} // _Z11testSignBugPf