Hello,
I get correct results in DEBUG build, but wrong results in optimized RELEASE build.
Below you can find the source and both PTX files of a minimized demo kernel that shows the effect (of course my real kernel is much bigger).
The release build will show the wrong output 10h.
The debug build will show the correct output 00h.
It seems that the following code is ignorred in the release build, which seems to cause the error:
byte = byte & 0xFF;
It seems “cvt.u32.u8” is missing in the release ptx in front of the shift right (shr) command.
I’m use “GeForce GTX 470” and the latest “SDK 3.2”.
Is this a compiler bug or have I done something wrong?
//PTX release build (will show the wrong output 10h)
.entry _Z6KernelPh (
.param .u32 __cudaparm__Z6KernelPh_d_A)
{
.reg .u16 %rh<6>;
.reg .u32 %r<3>;
.loc 28 3 0
$LDWbegin__Z6KernelPh:
.loc 28 8 0
ld.param.u32 %r1, [__cudaparm__Z6KernelPh_d_A];
ld.global.u8 %rh1, [%r1+0];
add.u16 %rh2, %rh1, 128;
mov.u16 %rh3, %rh2;
shr.u16 %rh4, %rh3, 4;
st.global.u8 [%r1+0], %rh4;
.loc 28 9 0
exit;
$LDWend__Z6KernelPh:
} // _Z6KernelPh
//PTX debug build (will show the correct output 00h)
.entry _Z6KernelPh (
.param .u32 __cudaparm__Z6KernelPh_d_A)
{
.reg .u16 %rh<3>;
.reg .u32 %r<10>;
.loc 28 3 0
$LDWbegin__Z6KernelPh:
$LDWbeginblock_203_1:
.loc 28 6 0
ld.param.u32 %r1, [__cudaparm__Z6KernelPh_d_A];
ld.global.u8 %r2, [%r1+0];
add.u32 %r3, %r2, 128;
cvt.u8.u32 %r4, %r3;
cvt.u8.u32 %rh1, %r4;
.loc 28 7 0
cvt.u32.u8 %r5, %rh1;
cvt.u8.u32 %rh1, %r5;
.loc 28 8 0
cvt.u32.u8 %r6, %rh1;
shr.u32 %r7, %r6, 4;
ld.param.u32 %r8, [__cudaparm__Z6KernelPh_d_A];
st.global.u8 [%r8+0], %r7;
$LDWendblock_203_1:
.loc 28 9 0
exit;
$LDWend__Z6KernelPh:
} // _Z6KernelPh
//.cu src file
#include <stdio.h>
__global__ void Kernel(unsigned char* d_A)
{
unsigned char byte;
byte = d_A[0] + 0x80;
byte = byte & 0xFF;
d_A[0] = byte >> 4;
}
int main(int argc, char** argv)
{
unsigned char *h_A;
unsigned char *d_A;
int N = 1;
size_t size = N * sizeof(unsigned char);
h_A = (unsigned char*)malloc(size);
cudaMalloc((void**)&d_A, size);
h_A[0] = 0x80;
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
Kernel<<<1, 1>>>(d_A);
cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
printf("Result: %02Xh\n", h_A[0]);
cudaFree(d_A);
free(h_A);
getchar();
}