How to invoke ptx kernel from host code

Hi everyone,

I am wondering whether there is a way to call a PTX kernel from host code. I am trying to use cuModuleLoad and am getting an error; my code is attached below. The error is “CUDA error is: initialization error Name CUDA_ERROR_NOT_INITIALIZED”. Is loading a fatbin or cubin more reliable than loading PTX?

/* vecadd.ptx */
// Entry kernel main_21_gpu(param_0 : u32, param_1 : u64).
//
// What the code below does, per thread:
//   base = 0
//   do {
//       i = %tid.x + base
//       if (i < param_0)  store (double)i at byte address param_1 + i*8
//       base += 128
//   } while (base < param_0)
//
// Only %tid.x takes part in the index computation; %ctaid.* and %ntid.* are
// read and spilled to local memory but never used afterwards, so every CTA
// of a multi-CTA launch would write the same elements.
// NOTE(review): the fixed stride of 128 matches ".maxntid 128, 1, 1" below,
// so the kernel presumably expects a launch of one CTA with 128 threads in x
// -- confirm against the code generator.
.version 4.2
.target sm_35, debug
.address_size 64
	// .globl	main_21_gpu
.visible .entry main_21_gpu(
	.param .u32 main_21_gpu_param_0,
	.param .u64 main_21_gpu_param_1
)
// Upper bound on the CTA shape: at most 128 x 1 x 1 threads per CTA.
.maxntid 128, 1, 1
{
	// 64-byte, 8-byte-aligned per-thread buffer in the .local state space.
	.local .align 8 .b8 	__local_depot0[64];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<5>;
	.reg .s32 	%r<22>;
	.reg .f64 	%fd<2>;
	.reg .s64 	%rd<6>;
	.loc 1 21 1
func_begin0:
	.loc	1 21 0
	.loc 1 21 1
	// %SP = generic address of the local buffer.
	mov.u64 	%rd5, __local_depot0;
	cvta.local.u64 	%SP, %rd5;
	// %r5 = param_0 (loop/element bound), %rd1 = param_1 (destination base address).
	ld.param.u32 	%r5, [main_21_gpu_param_0];
	ld.param.u64 	%rd1, [main_21_gpu_param_1];
tmp0:
	// Spill the destination pointer to local offset 56; it is reloaded in BB0_8.
	st.u64 	[%SP+56], %rd1;
	// Read thread index, CTA index and CTA size for all three dimensions.
	mov.u32 	%r6, %tid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %ntid.x;
	mov.u32 	%r9, %tid.y;
	mov.u32 	%r10, %ctaid.y;
	mov.u32 	%r11, %ntid.y;
	mov.u32 	%r12, %tid.z;
	mov.u32 	%r13, %ctaid.z;
	mov.u32 	%r14, %ntid.z;
	// Local layout written here: tid.{x,y,z} at 40/44/48, ctaid.{x,y,z} at
	// 24/28/32, ntid.{x,y,z} at 8/12/16, and a zero at offset 0. None of these
	// slots is read back in this kernel.
	st.volatile.u32 	[%SP+40], %r6;
	st.volatile.u32 	[%SP+44], %r9;
	st.volatile.u32 	[%SP+48], %r12;
	st.volatile.u32 	[%SP+24], %r7;
	st.volatile.u32 	[%SP+28], %r10;
	st.volatile.u32 	[%SP+32], %r13;
	st.volatile.u32 	[%SP+8], %r8;
	st.volatile.u32 	[%SP+12], %r11;
	st.volatile.u32 	[%SP+16], %r14;
	mov.u32 	%r15, 0;
	st.volatile.u32 	[%SP+0], %r15;
	mov.u32 	%r16, 0;
func_exec_begin0:
	.loc	1 25 1
tmp1:
	// %r21 = base offset of the current loop iteration, starting at 0.
	mov.b32 	%r21, %r16;
BB0_6:
	// %r3 = element index handled by this thread in this iteration.
	mov.u32 	%r17, %tid.x;
	add.s32 	%r3, %r17, %r21;
tmp2:
	// Skip the store when the index is out of range (%r3 >= param_0).
	setp.ge.s32	%p1, %r3, %r5;
	selp.u32	%r18, 1, 0, %p1;
	setp.ne.s32	%p2, %r18, 0;
	@%p2 bra 	BB0_13;
	bra.uni 	BB0_8;
BB0_8:
	.loc	1 26 1
	// Store (double)%r3 at param_1 + %r3 * 8.
	cvt.rn.f64.s32	%fd1, %r3;
	ld.u64 	%rd2, [%SP+56];
	// The byte offset is computed in 32 bits and then sign-extended to 64 bits.
	mul.lo.s32 	%r19, %r3, 8;
	cvt.s64.s32	%rd3, %r19;
	add.s64 	%rd4, %rd2, %rd3;
	st.f64 	[%rd4], %fd1;
BB0_13:
	.loc	1 30 1
	// Advance the base offset by 128 and loop while it is still below param_0.
	add.s32 	%r21, %r21, 128;
	setp.lt.s32	%p3, %r21, %r5;
	selp.u32	%r20, 1, 0, %p3;
	setp.ne.s32	%p4, %r20, 0;
	@%p4 bra 	BB0_6;
	bra.uni 	BB0_14;
BB0_14:
	ret;
tmp3:
func_end0:
}
/* Host Code*/ 
void gpu_ol_main_0_2693505384_unpacked(double *c, int n) throw(){	
	CUmodule module;
	CUfunction function;
	CUresult err;

        err = cuModuleLoad(&module, "vecadd.ptx");
	check(err);
	err = cuModuleGetFunction(&function, module, "main_21_gpu") ;	
	check(err);
        void *arr[] = { (void *)&n, (void *)&c };
        err = cuLaunchKernel(function, 1, 1, 1, 	512, 1, 1, 0,0,&args[0], 0);
	check(err).
}

The vectorAddDrv sample code demonstrates how to load a compiled ptx kernel from host code.

You may want to study it.

For example, when you use the driver API, you must do a cuInit first before doing any other driver API calls.

Hi txbob, sorry, I asked my question before checking the examples. I have solved my main issue: I can now invoke the kernel successfully. However, I am now getting an error when using streams with cuLaunchKernel. The error is “kernel launch returns: 400 invalid resource handle”.

I fixed my host code like this:

void gpu_ol_main_0_2693505384_unpacked(double * a, double * b, double * c, int n)
{	
	CUresult error;
	error = cuInit(0);
	printf("cuinit returns: %d\n",int(error));
	
	CUdevice cuDevice0;
	error = cuDeviceGet(&cuDevice0, 0);
	printf("cuDeviceGet returns: %d\n",int(error));
	
	CUcontext cuContext0;
	error = cuCtxCreate(&cuContext0, 0, cuDevice0);
	printf("cuCtxCreate returns: %d\n",int(error));
	
	CUmodule cuModule0;
	error = cuModuleLoad(&cuModule0, ptxfile);
	printf("cuModuleLoad returns: %d\n",int(error));

	CUfunction kernel0;
	error = cuModuleGetFunction( &kernel0, cuModule0, function);
	printf("getting the function handle returns: %d\n",int(error));
	
	void *args[] = { &n , &a, &b, &c };

	CUstream str = (CUstream)nanos_get_kernel_execution_stream();
	error = cuLaunchKernel(kernel0, 512, 1, 1, 128, 1, 1, 0, str, &args[0], NULL);
	
 	const char *errStr;   
	cuGetErrorString(error, &errStr);
	printf("kernel launch returns: %d\t%s\n",int(error), errStr );
}

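Maybe your nanos_get_kernel_execution_stream() is not really returning a valid stream handle. Note that a stream handle is only valid in the context in which it was created, so a stream created under a different context than the one current at launch time will also be reported as an invalid resource handle.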

It’s difficult to say. I don’t generally respond to questions asking for debugging assistance unless a complete, self-contained example is given.