strange behavior writing to global memory

Hi,

I've run into a very strange problem that I haven't been able to solve for the last three days. In my host code I allocate more than 1 GB of GPU memory. I pass this pointer to my global function and write some values through it. The strange thing is that the kernel only works if I add a printf call to it. The printf doesn't need to be executed - it just needs to be there. After the kernel finishes execution I check the memory. This is the code that does not work - after execution the global memory is unchanged when I verify it in the host code:

__global__ void _CudaCreateDag( uint32_t p_ui32Start, ulonglong2* p_pDest )
{
	p_pDest->x = p_ui32Start;
	p_pDest->y = 0xA55AEF09;
}

On the other hand, this version works - the value of p_ui32Start is ALWAYS 0, so the printf is NEVER executed:

__global__ void _CudaCreateDag( uint32_t p_ui32Start, ulonglong2* p_pDest )
{
	p_pDest->x = p_ui32Start;
	p_pDest->y = 0xA55AEF09;

        if( 0xFFFFFFFF == p_ui32Start )
	{
		printf( "" );
	}
}

If I check the memory in my host code, the values are as expected. Here is the PTX from both versions.
Not working version - no printf call:

// .globl	_Z14_CudaCreateDagjP10ulonglong2
.visible .entry _Z14_CudaCreateDagjP10ulonglong2(
	.param .u32 _Z14_CudaCreateDagjP10ulonglong2_param_0,
	.param .u64 _Z14_CudaCreateDagjP10ulonglong2_param_1
)
{
	.reg .b64 	%rd<5>;

	ld.param.u64 	%rd1, [_Z14_CudaCreateDagjP10ulonglong2_param_1];
	.loc 1 3320 2
	mov.u64 	%rd2, 2774200073;
	.loc 1 3319 2
	ld.param.u32 	%rd3, [_Z14_CudaCreateDagjP10ulonglong2_param_0];
	.loc 1 3320 2
	cvta.to.global.u64 	%rd4, %rd1;
	st.global.v2.u64 	[%rd4], {%rd3, %rd2};
	.loc 1 3370 2
	ret;
}

Working version, with a printf call that is never executed:

// .globl	_Z14_CudaCreateDagjP10ulonglong2
.visible .entry _Z14_CudaCreateDagjP10ulonglong2(
	.param .u32 _Z14_CudaCreateDagjP10ulonglong2_param_0,
	.param .u64 _Z14_CudaCreateDagjP10ulonglong2_param_1
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<2>;
	.reg .b64 	%rd<8>;

	ld.param.u64 	%rd1, [_Z14_CudaCreateDagjP10ulonglong2_param_1];
	.loc 1 3320 2
	mov.u64 	%rd2, 2774200073;
	.loc 1 3319 2
	ld.param.u32 	%rd3, [_Z14_CudaCreateDagjP10ulonglong2_param_0];
	.loc 1 3320 2
	cvta.to.global.u64 	%rd4, %rd1;
	st.global.v2.u64 	[%rd4], {%rd3, %rd2};
	setp.ne.s64	%p1, %rd3, 4294967295;
	.loc 1 3322 2
	@%p1 bra 	BB1_2;

	.loc 1 3324 3
	mov.u64 	%rd5, $str;
	cvta.global.u64 	%rd6, %rd5;
	mov.u64 	%rd7, 0;
	.loc 1 3324 3
	// Callseq Start 0
	{
	.reg .b32 temp_param_reg;
	// <end>}
	.param .b64 param0;
	st.param.b64	[param0+0], %rd6;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd7;
	.param .b32 retval0;
	call.uni (retval0), 
	vprintf, 
	(
	param0, 
	param1
	);
	ld.param.b32	%r1, [retval0+0];
	
	//{
	}// Callseq End 0

BB1_2:
	.loc 1 3370 2
	ret;
}

I ran this test with CUDA 10.1 and 10.2, on a 1070 and a 2060, driver version 441.66. Always the same results.
Does anyone have an idea what I'm doing wrong here?

Thanks.

Use proper CUDA error checking and run your code with cuda-memcheck
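For reference, a checking pattern for the driver API might look roughly like the sketch below (the CU_CHECK macro name and the error handling are just an illustration, not your code):

#include <cuda.h>
#include <cstdio>
#include <cstdlib>

// Illustration only: check the result of every driver API call and
// print the error string if it failed.
#define CU_CHECK(call)                                                       \
	do {                                                                 \
		CUresult _err = (call);                                      \
		if( CUDA_SUCCESS != _err )                                   \
		{                                                            \
			const char* _msg = nullptr;                          \
			cuGetErrorString( _err, &_msg );                     \
			fprintf( stderr, "CUDA error %d (%s) at %s:%d\n",    \
				(int)_err, _msg ? _msg : "unknown",          \
				__FILE__, __LINE__ );                        \
			exit( EXIT_FAILURE );                                \
		}                                                            \
	} while( 0 )

Since cuLaunchKernel returns before the kernel has finished, checking the result of a cuCtxSynchronize() right after the launch is what actually catches errors that occur during kernel execution.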

I have error checking for each function call in my host code - all of them report success. I use JIT compilation - do I have any chance to use cuda-memcheck?

I suggest running with cuda-memcheck also.

You may get additional help if you provide a short but complete test case. There really isn’t anything wrong with either of the kernels you have shown, so the problem may lie somewhere else in your code.
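For example, something along these lines would already be enough (a sketch that uses the runtime API and a plain nvcc build instead of your JIT path, just to isolate the kernel itself; the kernel body is copied from your post):

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

__global__ void _CudaCreateDag( uint32_t p_ui32Start, ulonglong2* p_pDest )
{
	p_pDest->x = p_ui32Start;
	p_pDest->y = 0xA55AEF09;
}

int main()
{
	ulonglong2* l_pDevDest = nullptr;
	ulonglong2  l_HostDest = { 0, 0 };

	// Allocate a single ulonglong2 on the device and launch one thread.
	if( cudaSuccess != cudaMalloc( &l_pDevDest, sizeof( ulonglong2 ) ) ) return 1;
	_CudaCreateDag<<<1, 1>>>( 0u, l_pDevDest );

	// Synchronize so that kernel-execution errors are reported here.
	cudaError_t l_Err = cudaDeviceSynchronize();
	if( cudaSuccess != l_Err )
	{
		printf( "kernel failed: %s\n", cudaGetErrorString( l_Err ) );
		return 1;
	}

	// Copy the result back and print it.
	cudaMemcpy( &l_HostDest, l_pDevDest, sizeof( ulonglong2 ), cudaMemcpyDeviceToHost );
	printf( "x = %llu, y = 0x%llX\n",
		(unsigned long long)l_HostDest.x, (unsigned long long)l_HostDest.y );
	cudaFree( l_pDevDest );
	return 0;
}

If this standalone version prints the expected values, the kernel itself is fine and the problem is somewhere in the launch or verification code around it.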

Yes - I'm pretty sure it is on my side. I will do some more testing. Is there a way to use cuda-memcheck with JIT compiled kernels?

cuda-memcheck should work with JIT compiled kernels

Here is what I get from memcheck. Is there any option that I should/can try?

========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

So, I am now down to this small kernel, which does NOT work without the printf:

#include "stdafx.h"

__global__ void _CudaFindSolution( uint64_t* p_pResultDetails, const uint64_t p_ui64Target, uint64_t p_ui64StartNonce )
{
}

__global__ void _CudaCreateDag( uint32_t p_ui32Start, ulonglong2* p_pDest )
{
	p_pDest->x = p_ui32Start;
	p_pDest->y = 0xA55AEF09;
	assert( p_pDest->x == p_ui32Start );
	assert( p_pDest->y == 0xA55AEF09 );

	if( 0xFFFFFFFF == p_ui32Start )
	{
		printf( "" );
	}
	return;
}

which translates to this PTX:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-27506705
// Cuda compilation tools, release 10.2, V10.2.89
// Based on LLVM 3.4svn
//

.version 6.5
.target sm_61
.address_size 64

	// .globl	_Z17_CudaFindSolutionPyyy
.extern .func  (.param .b32 func_retval0) vprintf
(
	.param .b64 vprintf_param_0,
	.param .b64 vprintf_param_1
)
;
.global .align 1 .b8 $str3[1];

.visible .entry _Z17_CudaFindSolutionPyyy(
	.param .u64 _Z17_CudaFindSolutionPyyy_param_0,
	.param .u64 _Z17_CudaFindSolutionPyyy_param_1,
	.param .u64 _Z17_CudaFindSolutionPyyy_param_2
)
{

	.loc 1 50 1
	ret;
}

	// .globl	_Z14_CudaCreateDagjP10ulonglong2
.visible .entry _Z14_CudaCreateDagjP10ulonglong2(
	.param .u32 _Z14_CudaCreateDagjP10ulonglong2_param_0,
	.param .u64 _Z14_CudaCreateDagjP10ulonglong2_param_1
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<2>;
	.reg .b64 	%rd<8>;

	ld.param.u64 	%rd1, [_Z14_CudaCreateDagjP10ulonglong2_param_1];
	.loc 1 56 2
	mov.u64 	%rd2, 2774200073;
	.loc 1 55 2
	ld.param.u32 	%rd3, [_Z14_CudaCreateDagjP10ulonglong2_param_0];
	.loc 1 56 2
	cvta.to.global.u64 	%rd4, %rd1;
	st.global.v2.u64 	[%rd4], {%rd3, %rd2};
	setp.ne.s64	%p1, %rd3, 4294967295;
	.loc 1 60 2
	@%p1 bra 	BB1_2;

	.loc 1 62 3
	mov.u64 	%rd5, $str3;
	cvta.global.u64 	%rd6, %rd5;
	mov.u64 	%rd7, 0;
	.loc 1 62 3
	// Callseq Start 0
	{
	.reg .b32 temp_param_reg;
	// <end>}
	.param .b64 param0;
	st.param.b64	[param0+0], %rd6;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd7;
	.param .b32 retval0;
	call.uni (retval0), 
	vprintf, 
	(
	param0, 
	param1
	);
	ld.param.b32	%r1, [retval0+0];
	
	//{
	}// Callseq End 0

BB1_2:
	.loc 1 64 2
	ret;
}

The kernel above works perfectly. If I remove the printf, then I get this PTX:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-27506705
// Cuda compilation tools, release 10.2, V10.2.89
// Based on LLVM 3.4svn
//

.version 6.5
.target sm_61
.address_size 64

	// .globl	_Z17_CudaFindSolutionPyyy

.visible .entry _Z17_CudaFindSolutionPyyy(
	.param .u64 _Z17_CudaFindSolutionPyyy_param_0,
	.param .u64 _Z17_CudaFindSolutionPyyy_param_1,
	.param .u64 _Z17_CudaFindSolutionPyyy_param_2
)
{

	.loc 1 50 1
	ret;
}

	// .globl	_Z14_CudaCreateDagjP10ulonglong2
.visible .entry _Z14_CudaCreateDagjP10ulonglong2(
	.param .u32 _Z14_CudaCreateDagjP10ulonglong2_param_0,
	.param .u64 _Z14_CudaCreateDagjP10ulonglong2_param_1
)
{
	.reg .b64 	%rd<5>;

	ld.param.u64 	%rd1, [_Z14_CudaCreateDagjP10ulonglong2_param_1];
	.loc 1 56 2
	mov.u64 	%rd2, 2774200073;
	.loc 1 55 2
	ld.param.u32 	%rd3, [_Z14_CudaCreateDagjP10ulonglong2_param_0];
	.loc 1 56 2
	cvta.to.global.u64 	%rd4, %rd1;
	st.global.v2.u64 	[%rd4], {%rd3, %rd2};
	.loc 1 64 2
	ret;
}

But this version does not work, and I don't see a single error in any CUDA function call.

If I run the app WITHOUT the printf under the CUDA debugger (in VS2019), I get the expected results!

This is the code I use to check the contents of the global memory:

uint8_t* l_pTmp = (uint8_t*)malloc( l_DagSize.ui32Bytes );               // host buffer sized for the whole DAG
CUresult l_cuKernelError = cuMemcpyDtoH( l_pTmp, m_CudaMemoryDag, 256 ); // copy back the first 256 bytes
_ASSERT( CUDA_SUCCESS == l_cuKernelError );
free( l_pTmp );                                                          // release the host buffer
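A version of that check which synchronizes on the context first and actually looks at the copied values might be roughly this (a sketch only: m_CudaMemoryDag and the _ASSERT style are taken from the snippet above, the rest is assumed):

CUresult l_cuErr = cuCtxSynchronize();                 // wait for the kernel; surfaces execution errors
_ASSERT( CUDA_SUCCESS == l_cuErr );

uint8_t* l_pTmp = (uint8_t*)malloc( 256 );             // only the first 256 bytes are copied back anyway
l_cuErr = cuMemcpyDtoH( l_pTmp, m_CudaMemoryDag, 256 );
_ASSERT( CUDA_SUCCESS == l_cuErr );

const ulonglong2* l_pVec = (const ulonglong2*)l_pTmp;  // reinterpret the first 16 bytes
printf( "x = %llu, y = 0x%llX\n",
	(unsigned long long)l_pVec->x, (unsigned long long)l_pVec->y );
free( l_pTmp );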