Hi,
I have run into a very strange problem that I have not been able to solve over the last three days. In my host code I allocate more than 1 GB of GPU memory. I pass this pointer to my __global__ function and write some values through it. The strange thing is that this kernel only works if I add a printf call to it. The printf doesn’t need to be executed - it just needs to be there. After the kernel finishes execution I check the memory. This is the code that does not work - after execution the global memory is unchanged when I verify it in the host code:
// Kernel variant WITHOUT a printf call.
// Writes one two-member record through p_pDest:
//   p_ui32Start : 32-bit value stored into the x member
//   p_pDest     : destination record; its y member receives the constant 0xA55AEF09
// No thread or block index is read, so every launched thread performs the same
// two stores to the same address.
// NOTE(review): the PTX quoted in this post for this kernel merges both stores into a
// single st.global.v2.u64 - presumably that needs p_pDest to be 16-byte aligned;
// confirm the alignment of the pointer passed from the host.
__global__ void _CudaCreateDag( uint32_t p_ui32Start, ulonglong2* p_pDest )
{
// x member <- start value (widened from 32 bits)
p_pDest->x = p_ui32Start;
// y member <- fixed marker constant
p_pDest->y = 0xA55AEF09;
}
On the other hand, this version works well - the value of p_ui32Start is ALWAYS 0, so the printf is NEVER executed:
// Kernel variant WITH a guarded printf call.
// Performs the same two stores through p_pDest as the variant without printf:
//   p_ui32Start : 32-bit value stored into the x member
//   p_pDest     : destination record; its y member receives the constant 0xA55AEF09
// Afterwards it calls printf with an empty format string, but only when
// p_ui32Start equals 0xFFFFFFFF.
// No thread or block index is read, so every launched thread does the same work.
// NOTE(review): the PTX quoted in this post for this kernel merges both stores into a
// single st.global.v2.u64 - presumably that needs p_pDest to be 16-byte aligned;
// confirm the alignment of the pointer passed from the host.
__global__ void _CudaCreateDag( uint32_t p_ui32Start, ulonglong2* p_pDest )
{
// x member <- start value (widened from 32 bits)
p_pDest->x = p_ui32Start;
// y member <- fixed marker constant
p_pDest->y = 0xA55AEF09;
// The stores above happen unconditionally; only the printf is guarded.
if( 0xFFFFFFFF == p_ui32Start )
{
// Empty format string: nothing would be printed even if this branch ran.
printf( "" );
}
}
If I check the memory in my host code, the values are as expected. Here is the PTX for both versions:
Non-working version - no printf call:
// .globl _Z14_CudaCreateDagjP10ulonglong2
// Compiler-generated PTX for the kernel variant without printf.
// The mangled name decodes to _CudaCreateDag(unsigned int, ulonglong2*).
//   param_0 : 32-bit start value
//   param_1 : 64-bit destination pointer
// The .loc directives tie the following instructions to a source file/line/column.
.visible .entry _Z14_CudaCreateDagjP10ulonglong2(
.param .u32 _Z14_CudaCreateDagjP10ulonglong2_param_0,
.param .u64 _Z14_CudaCreateDagjP10ulonglong2_param_1
)
{
// Five 64-bit registers %rd0..%rd4 are declared; %rd1..%rd4 are used below.
.reg .b64 %rd<5>;
// %rd1 = destination pointer (param_1)
ld.param.u64 %rd1, [_Z14_CudaCreateDagjP10ulonglong2_param_1];
.loc 1 3320 2
// %rd2 = 2774200073, i.e. 0xA55AEF09 - the value for the second vector element
mov.u64 %rd2, 2774200073;
.loc 1 3319 2
// %rd3 = start value (param_0), loaded as u32 into a 64-bit register
ld.param.u32 %rd3, [_Z14_CudaCreateDagjP10ulonglong2_param_0];
.loc 1 3320 2
// %rd4 = %rd1 converted from a generic address to a global-space address
cvta.to.global.u64 %rd4, %rd1;
// One two-element 64-bit vector store: first element = %rd3, second = %rd2.
// NOTE(review): presumably requires [%rd4] to be 16-byte aligned - confirm in the PTX ISA.
st.global.v2.u64 [%rd4], {%rd3, %rd2};
.loc 1 3370 2
ret;
}
Working version, with a printf call that is never executed:
// .globl _Z14_CudaCreateDagjP10ulonglong2
// Compiler-generated PTX for the kernel variant with the guarded printf.
// The mangled name decodes to _CudaCreateDag(unsigned int, ulonglong2*).
//   param_0 : 32-bit start value
//   param_1 : 64-bit destination pointer
// The instructions up to and including the vector store match the variant without
// printf; what follows is a compare, a branch and a vprintf call sequence.
.visible .entry _Z14_CudaCreateDagjP10ulonglong2(
.param .u32 _Z14_CudaCreateDagjP10ulonglong2_param_0,
.param .u64 _Z14_CudaCreateDagjP10ulonglong2_param_1
)
{
// Register declarations: predicate(s), 32-bit and 64-bit registers.
.reg .pred %p<2>;
.reg .b32 %r<2>;
.reg .b64 %rd<8>;
// %rd1 = destination pointer (param_1)
ld.param.u64 %rd1, [_Z14_CudaCreateDagjP10ulonglong2_param_1];
.loc 1 3320 2
// %rd2 = 2774200073, i.e. 0xA55AEF09 - the value for the second vector element
mov.u64 %rd2, 2774200073;
.loc 1 3319 2
// %rd3 = start value (param_0), loaded as u32 into a 64-bit register
ld.param.u32 %rd3, [_Z14_CudaCreateDagjP10ulonglong2_param_0];
.loc 1 3320 2
// %rd4 = %rd1 converted from a generic address to a global-space address
cvta.to.global.u64 %rd4, %rd1;
// One two-element 64-bit vector store: first element = %rd3, second = %rd2.
// It is issued before the branch, so it does not depend on the comparison.
// NOTE(review): presumably requires [%rd4] to be 16-byte aligned - confirm in the PTX ISA.
st.global.v2.u64 [%rd4], {%rd3, %rd2};
// %p1 = (%rd3 != 4294967295), i.e. start value != 0xFFFFFFFF
setp.ne.s64 %p1, %rd3, 4294967295;
.loc 1 3322 2
// Skip the vprintf call sequence when the start value is not 0xFFFFFFFF.
@%p1 bra BB1_2;
.loc 1 3324 3
// %rd5 = address of $str (presumably the "" format literal; its definition is not shown here)
mov.u64 %rd5, $str;
// %rd6 = %rd5 converted from a global-space address to a generic address
cvta.global.u64 %rd6, %rd5;
// %rd7 = 0, passed as the second vprintf argument
mov.u64 %rd7, 0;
.loc 1 3324 3
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
// param0 = format string address (%rd6)
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
// param1 = 0 (%rd7)
.param .b64 param1;
st.param.b64 [param1+0], %rd7;
// 32-bit slot that receives the return value of vprintf
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
// %r1 = vprintf return value; it is not read again in this listing
ld.param.b32 %r1, [retval0+0];
//{
}// Callseq End 0
// Join point for the path that skipped the call.
BB1_2:
.loc 1 3370 2
ret;
}
I ran this test with CUDA 10.1 and 10.2, on a 1070 and a 2060, driver version 441.66. The results are always the same.
Does anyone have an idea what I’m doing wrong here?
Thanks.