Some additional information:
- I have two graphics cards; one drives my displays and the other is used solely for CUDA work (I'm able to debug simple programs on it).
- I'm using CUDA 5.0 on Ubuntu 12.04.
- My kernel uses some inline functions.
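For context, the kernel has roughly the following shape. All names, parameters, and the build line below are simplified stand-ins for my real (anonymized) code, not the actual source:

// Simplified stand-in for the real kernel; names and parameters are made up.
// Built with device debug info, roughly: nvcc -g -G mySource.cu (exact flags assumed).
#include <stdint.h>

namespace namesp {

// stand-in for one of the inline device functions the kernel calls
__forceinline__ __device__ float scaleAndShift(float value, float scale)
{
    return value * scale + 1.0f;
}

template <typename T>
__global__ void myKernel(T *objects, uint32_t *out, const uint32_t count, const float scale)
{
    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < count)
        out[idx] = static_cast<uint32_t>(scaleAndShift(static_cast<float>(idx), scale));
    // the real kernel also reads from 'objects'; omitted here for brevity
}

} // namespace namesp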
When I try to step through a kernel, I get the following output from cuda-gdb:
first step:
namesp::__wrapper__device_stub_myKernel<namesp::someclass> (__cuda_0=@0x7fffa0fbb018,
__cuda_1=@0x7fffa0fbb010, __cuda_2=@0x7fffa0fbb008, __cuda_3=@0x7fffa0fbb004, __cuda_4=@0x7fffa0fbb000,
__cuda_5=@0x7fffa0fbaffc, __cuda_6=@0x7fffa0fbb040, __cuda_7=@0x7fffa0fbb048, __cuda_8=@0x7fffa0fbb050,
__cuda_9=@0x7fffa0fbaff8, __cuda_10=@0x7fffa0fbb058, __cuda_11=@0x7fffa0fbaff4, __cuda_12=@0x7fffa0fbaff0,
__cuda_13=@0x7fffa0fbafec, __cuda_14=@0x7fffa0fbafe8, __cuda_15=@0x7fffa0fbb060, __cuda_16=@0x7fffa0fbb068)
at mySource.cudafe1.stub.c:662
662 template<> __specialization_static void __wrapper__device_stub_myKernel< ::namesp::someclass>( _ZN10namesp12someclassE *&__cuda_0, _ZN10namesp14someclassMSE *&__cuda_1,uint32_t *&__cuda_2,const uint32_t &__cuda_3,const uint32_t &__cuda_4,const uint32_t &__cuda_5,const uint32_t &__cuda_6,unsigned *&__cuda_7,const int &__cuda_8,const float &__cuda_9,const int &__cuda_10,const float &__cuda_11,const float &__cuda_12,const float &__cuda_13,const float &__cuda_14,float *&__cuda_15,const uint32_t &__cuda_16){__device_stub__ZN10namesp19myKernelINS_12someclassEEEvPT_PNS_1someclassMSEPjjjjjS6_ififfffPfj( __cuda_0,__cuda_1,__cuda_2,__cuda_3,__cuda_4,__cuda_5,__cuda_6,__cuda_7,__cuda_8,__cuda_9,__cuda_10,__cuda_11,__cuda_12,__cuda_13,__cuda_14,__cuda_15,__cuda_16);}}
second step:
__device_stub__ZN10namesp19myKernelINS_12someclassEEEvPT_PNS_14someclassMSEPjjjjjS6_ififfffPfj (
__par0=0x5008c0000, __par1=0x5007f0000, __par2=0x5006c0000, __par3=3, __par4=10, __par5=512, __par6=7680,
__par7=0x5006c3000, __par8=73, __par9=0.300000012, __par10=1392, __par11=1319.91235, __par12=1323.49597,
__par13=714.734375, __par14=487.081604, __par15=0x501720000, __par16=128) at mySource.cudafe1.stub.c:660
660 static void __device_stub__ZN10namesp19myKernelINS_12someclassEEEvPT_PNS_14someclassMSEPjjjjjS6_ififfffPfj( _ZN10namesp12someclassE *__par0, _ZN10namesp14someclassMSE *__par1, uint32_t *__par2, const uint32_t __par3, const uint32_t __par4, const uint32_t __par5, const uint32_t __par6, unsigned *__par7, const int __par8, const float __par9, const int __par10, const float __par11, const float __par12, const float __par13, const float __par14, float *__par15, const uint32_t __par16){__cudaSetupArgSimple(__par0, 0UL);__cudaSetupArgSimple(__par1, 8UL);__cudaSetupArgSimple(__par2, 16UL);__cudaSetupArgSimple(__par3, 24UL);__cudaSetupArgSimple(__par4, 28UL);__cudaSetupArgSimple(__par5, 32UL);__cudaSetupArgSimple(__par6, 36UL);__cudaSetupArgSimple(__par7, 40UL);__cudaSetupArgSimple(__par8, 48UL);__cudaSetupArgSimple(__par9, 52UL);__cudaSetupArgSimple(__par10, 56UL);__cudaSetupArgSimple(__par11, 60UL);__cudaSetupArgSimple(__par12, 64UL);__cudaSetupArgSimple(__par13, 68UL);__cudaSetupArgSimple(__par14, 72UL);__cudaSetupArgSimple(__par15, 80UL);__cudaSetupArgSimple(__par16, 88UL);__cudaLaunch(((char *)((void ( *)( _ZN10namesp12someclassE *, _ZN10namesp14someclassMSE *, uint32_t *, const uint32_t, const uint32_t, const uint32_t, const uint32_t, unsigned *, const int, const float, const int, const float, const float, const float, const float, float *, const uint32_t))namesp::myKernel<namesp::someclass> )));}namespace namesp{
When I step again, I'm back in my host code. During these steps, the "cuda kernel" command in cuda-gdb reports: "Focus not set on any active CUDA kernel."
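For completeness, the session looks roughly like this (the file name and line number are placeholders for the actual launch site in my host code):

(cuda-gdb) break mySource.cu:500
(cuda-gdb) run
(cuda-gdb) step
(cuda-gdb) step
(cuda-gdb) step
(cuda-gdb) cuda kernel
Focus not set on any active CUDA kernel.

The first two steps produce the stub frames shown above; the third step lands back in host code.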
As you can see, I'm using C++ templates, but I have already tried the same code without templates and debugging didn't work either.
I also tried reducing the block size, but I still could not debug the kernel.
Any suggestions?