Hi everyone,
I am wondering whether there is a way to call a PTX kernel directly. I'm trying to use cuModuleLoad and am getting an error. My code is attached below. The error is “CUDA error is: initialization error Name CUDA_ERROR_NOT_INITIALIZED”. Is loading a fatbin or cubin more reliable than loading PTX?
/* vecadd.ptx */
// PTX module containing a single entry point, main_21_gpu.
// The kernel writes (double)i to element i of the f64 array addressed by
// param_1, for every index i it visits that is below param_0.
// Built as a debug target (see `.target ... debug`), so values are spilled
// to a per-thread local depot instead of being kept only in registers.
// NOTE(review): the `.loc 1 ...` directives refer to file index 1, but no
// `.file` directive is visible in this listing - confirm the full file has one.
.version 4.2
.target sm_35, debug
.address_size 64
// .globl main_21_gpu
// Entry point.
//   main_21_gpu_param_0 (.u32): element count, used as the upper bound
//                               in both bounds checks below.
//   main_21_gpu_param_1 (.u64): base address that the f64 stores go through.
.visible .entry main_21_gpu(
.param .u32 main_21_gpu_param_0,
.param .u64 main_21_gpu_param_1
)
// Declares at most 128 x 1 x 1 threads per CTA; the loop below also advances
// by 128 per iteration, so the two values go together.
.maxntid 128, 1, 1
{
// 64-byte per-thread local depot. Offsets used below:
//   +0      constant 0
//   +8..16  %ntid.x/y/z
//   +24..32 %ctaid.x/y/z
//   +40..48 %tid.x/y/z
//   +56     copy of param_1 (base address)
.local .align 8 .b8 __local_depot0[64];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .s32 %r<22>;
.reg .f64 %fd<2>;
.reg .s64 %rd<6>;
.loc 1 21 1
func_begin0:
.loc 1 21 0
.loc 1 21 1
// %SP = generic address of the local depot.
mov.u64 %rd5, __local_depot0;
cvta.local.u64 %SP, %rd5;
// %r5 = element count, %rd1 = base address.
ld.param.u32 %r5, [main_21_gpu_param_0];
ld.param.u64 %rd1, [main_21_gpu_param_1];
tmp0:
// Spill the base address; it is reloaded from [%SP+56] before each store.
st.u64 [%SP+56], %rd1;
// Capture thread, block and block-size indices in all three dimensions.
// Apart from being stored to the depot below, these copies are not used;
// the loop re-reads %tid.x itself.
mov.u32 %r6, %tid.x;
mov.u32 %r7, %ctaid.x;
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %tid.y;
mov.u32 %r10, %ctaid.y;
mov.u32 %r11, %ntid.y;
mov.u32 %r12, %tid.z;
mov.u32 %r13, %ctaid.z;
mov.u32 %r14, %ntid.z;
// Volatile stores of the captured indices into the depot
// (presumably so a debugger can inspect them - debug target).
st.volatile.u32 [%SP+40], %r6;
st.volatile.u32 [%SP+44], %r9;
st.volatile.u32 [%SP+48], %r12;
st.volatile.u32 [%SP+24], %r7;
st.volatile.u32 [%SP+28], %r10;
st.volatile.u32 [%SP+32], %r13;
st.volatile.u32 [%SP+8], %r8;
st.volatile.u32 [%SP+12], %r11;
st.volatile.u32 [%SP+16], %r14;
mov.u32 %r15, 0;
st.volatile.u32 [%SP+0], %r15;
// %r16 = 0: initial value of the loop offset.
mov.u32 %r16, 0;
func_exec_begin0:
.loc 1 25 1
tmp1:
// %r21 = loop offset; starts at 0 and grows by 128 per iteration.
mov.b32 %r21, %r16;
// Loop head: %r3 = %tid.x + offset is the element index for this pass.
// NOTE(review): %ctaid.x does not take part in the index, so every CTA in a
// multi-block launch would visit the same indices.
BB0_6:
mov.u32 %r17, %tid.x;
add.s32 %r3, %r17, %r21;
tmp2:
// Skip the store when index >= count (signed compare).
setp.ge.s32 %p1, %r3, %r5;
selp.u32 %r18, 1, 0, %p1;
setp.ne.s32 %p2, %r18, 0;
@%p2 bra BB0_13;
bra.uni BB0_8;
// Store block: write (double)index to base + index * 8.
BB0_8:
.loc 1 26 1
cvt.rn.f64.s32 %fd1, %r3;
ld.u64 %rd2, [%SP+56];
// The byte offset is computed in 32 bits before being sign-extended.
// NOTE(review): index * 8 wraps once index reaches 2^28 - confirm counts
// stay below that.
mul.lo.s32 %r19, %r3, 8;
cvt.s64.s32 %rd3, %r19;
add.s64 %rd4, %rd2, %rd3;
st.f64 [%rd4], %fd1;
// Loop tail: advance the offset by 128 and repeat while offset < count.
BB0_13:
.loc 1 30 1
add.s32 %r21, %r21, 128;
setp.lt.s32 %p3, %r21, %r5;
selp.u32 %r20, 1, 0, %p3;
setp.ne.s32 %p4, %r20, 0;
@%p4 bra BB0_6;
bra.uni BB0_14;
// Loop exit.
BB0_14:
ret;
tmp3:
func_end0:
}
/* Host Code*/
void gpu_ol_main_0_2693505384_unpacked(double *c, int n) throw(){
CUmodule module;
CUfunction function;
CUresult err;
err = cuModuleLoad(&module, "vecadd.ptx");
check(err);
err = cuModuleGetFunction(&function, module, "main_21_gpu") ;
check(err);
void *arr[] = { (void *)&n, (void *)&c };
err = cuLaunchKernel(function, 1, 1, 1, 512, 1, 1, 0,0,&args[0], 0);
check(err).
}