PTX code invalid? (CUDA_ERROR_INVALID_PTX)

Hello,

I am currently trying to get cuRAND to run with NVRTC. I built the program and I am getting the following PTX code. Can anybody tell me why it is invalid?

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//

// Module-level directives emitted by the compiler.
// PTX ISA version this module is written against.
.version 4.3
// Target architecture the code was generated for.
.target sm_20
// Addresses in this module are 64 bits wide.
.address_size 64

	// .globl	rngSetupStates
// External device function taking four 64-bit parameters and returning nothing.
// This is a declaration only: the module contains no body for this symbol, so
// it has to be supplied by another module at link time.
// NOTE(review): loading this PTX on its own, without linking in a definition,
// is the likely cause of CUDA_ERROR_INVALID_PTX -- confirm against the loader.
.extern .func _Z11curand_inityyyP17curandStateXORWOW
(
	.param .b64 _Z11curand_inityyyP17curandStateXORWOW_param_0,
	.param .b64 _Z11curand_inityyyP17curandStateXORWOW_param_1,
	.param .b64 _Z11curand_inityyyP17curandStateXORWOW_param_2,
	.param .b64 _Z11curand_inityyyP17curandStateXORWOW_param_3
)
;

// Kernel entry point. Computes a per-thread element address and a per-block
// value from the special registers, then calls the external function
// _Z11curand_inityyyP17curandStateXORWOW with four 64-bit arguments.
//   param_0: 64-bit base address of the per-thread element array
//   param_1: 32-bit integer mixed into the first call argument
.visible .entry rngSetupStates(
	.param .u64 rngSetupStates_param_0,
	.param .u32 rngSetupStates_param_1
)
{
	// Virtual register files: eight 32-bit and seven 64-bit registers.
	.reg .b32 	%r<8>;
	.reg .b64 	%rd<7>;


	// Fetch the two kernel arguments.
	ld.param.u64 	%rd1, [rngSetupStates_param_0];
	ld.param.u32 	%r1, [rngSetupStates_param_1];
	// Thread index within the block, block index, and block size (x only).
	mov.u32 	%r2, %tid.x;
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r4, %ntid.x;
	// %r5 = %ntid.x * %ctaid.x + %tid.x  (low 32 bits): flat thread index.
	mad.lo.s32 	%r5, %r4, %r3, %r2;
	// Number of blocks in the grid (x only).
	mov.u32 	%r6, %nctaid.x;
	// %r7 = %nctaid.x * param_1 + %ctaid.x  (low 32 bits): first call argument.
	mad.lo.s32 	%r7, %r6, %r1, %r3;
	// Zero-extend the 32-bit values to the 64-bit width the callee expects.
	cvt.u64.u32	%rd2, %r7;
	cvt.u64.u32	%rd3, %r2;
	// Byte offset of this thread's element: signed flat index * 48 bytes,
	// computed as a 64-bit product.
	mul.wide.s32 	%rd4, %r5, 48;
	// Address of this thread's element = base + byte offset.
	add.s64 	%rd5, %rd1, %rd4;
	// Constant 0 passed as the third call argument.
	mov.u64 	%rd6, 0;
	// Callseq Start 0
	{
	// Declared by the compiler; not referenced anywhere in this call sequence.
	.reg .b32 temp_param_reg;
	// <end>}
	// Stage the four outgoing arguments in call-local .param space.
	.param .b64 param0;
	st.param.b64	[param0+0], %rd2;
	.param .b64 param1;
	st.param.b64	[param1+0], %rd3;
	.param .b64 param2;
	st.param.b64	[param2+0], %rd6;
	.param .b64 param3;
	st.param.b64	[param3+0], %rd5;
	// Uniform call with no return value: the function name is followed by a
	// comma and the parenthesised argument list.
	call.uni 
	_Z11curand_inityyyP17curandStateXORWOW, 
	(
	param0, 
	param1, 
	param2, 
	param3
	);
	
	//{
	}// Callseq End 0
	ret;
}

Comma at the end of line 58?

I’m curious, what was the source code you used to create this PTX?

// Hand-written per-thread generator state record. The kernel below indexes an
// array of these; the generated PTX uses a 48-byte element stride for it.
// NOTE(review): presumably meant to mirror cuRAND's own curandStateXORWOW
// layout -- confirm against curand_kernel.h for the toolkit in use.
// Field order and types determine the layout and must not be rearranged.
struct curandStateXORWOW {
    unsigned int d;
    unsigned int v[5];
    int boxmuller_flag;
    int boxmuller_flag_double;
    float boxmuller_extra;
    double boxmuller_extra_double;
};

// Alias used by the curand_init declaration.
typedef curandStateXORWOW curandStateXORWOW_t;

// Alias used by the rngSetupStates kernel signature.
typedef curandStateXORWOW curandState;

// Single-word state record. It is not referenced by the kernel or by the
// curand_init declaration in this listing.
struct curandStateTest {
    unsigned int v;
};

// Conventional _t alias for the record above.
typedef curandStateTest curandStateTest_t;

// Forward declaration of the device-side state initialiser called by the
// rngSetupStates kernel.
//   seed, subsequence, offset: 64-bit unsigned values forwarded by the caller
//   state: the state record to operate on
// NOTE(review): only a declaration is given here and no definition is visible
// in this listing. The generated PTX accordingly emits this symbol as
// `.extern .func`, so a definition must be provided elsewhere (for example by
// including the library header that defines it instead of re-declaring it) --
// confirm.
__device__ void curand_init(
    unsigned long long seed,
    unsigned long long subsequence,
    unsigned long long offset,
    curandStateXORWOW_t *state);

// Sets up one generator state per thread by calling curand_init.
//
// Launch layout: only the x components of threadIdx/blockIdx/blockDim/gridDim
// are used. There is no bounds check, so rngState must hold at least
// gridDim.x * blockDim.x elements.
//
//   rngState:  array of state records, one slot per launched thread
//   device_id: integer mixed into the seed so that blocks launched with
//              different ids receive different seeds
extern "C" __global__ void rngSetupStates(
    curandState *rngState,
    int device_id)
{
    // Flat index of this thread across the grid; selects its state slot.
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    curandState *const myState = rngState + slot;

    // Every thread of a block passes the same seed (block index offset by
    // gridDim.x * device_id); the thread's index within the block is passed
    // as the sequence number, and the offset is zero.
    curand_init(gridDim.x * device_id + blockIdx.x, threadIdx.x, 0, myState);
}