Debugging CUDA kernels: printing and post-mortem analysis after ULF. How to extract data from failing kernels?

I’ve got two questions concerning the debugging of CUDA kernels:

[list=1]

Is there a convenient way of “printing” from a running CUDA kernel? If so, I’d like to know where to get it and how to use it. (A real debugger would be even better.)

Also, I’d like to know whether there is a way of extracting data from a kernel run after it has finished with an ULF (unspecified launch failure). After an ULF, cudaMemcpy() appears not to copy any data from device to host.

Some more details:

I’m trying to port the raytracing application from the SPLASH-2 benchmark suite. Most of the work should be done. Simple tracing (list) works; hierarchical tracing should work too, but produces ULFs. (Emulation mode works and valgrind reports 0 errors. Ok, with a trick: without the trick, many complaints about conditional jumps or moves based on uninitialized data appear. The trick is using “a=f(); a=NULL; if(a != NULL)” instead of “a=f(); if(a != NULL)”. I don’t know how in the latter case “a” can be uninitialized: f() always returns a value!)

In order to investigate more closely, I developed a quick-and-dirty implementation of what I’m imagining for printing in CUDA kernels. I’m posting the code below to give a better impression of what I’m looking for. At the moment, it more or less works for one host thread starting one global function on one block of CUDA threads.

Most unfortunately, I don’t appear to be able to extract printed content after an ULF. During a kernel run, printed data is logged to a piece of global device memory. After the run finishes normally, the data is fetched with cudaMemcpy and printed by the host. If the run finishes with an ULF, cudaMemcpy appears not to copy any data from device to host. (If you look closely at the code: “Wait” is displayed in this case, not “Init” or other data.)

[codebox]

#ifdef DEBUG_CKS

#define CUDA_PRINT_BS 100000

typedef struct {

char b[CUDA_PRINT_BS]; /* cuda print buffer */

int p; /* cuda print position in buffer */

} cuda_print_t;

cuda_print_t cph;

cuda_print_t *cpd;

__device__ cuda_print_t *cp;

void cuda_print_finalize() {

cudaError e;

e = cudaGetLastError();

if(cudaSuccess != e) {

	fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",

	        __FILE__, __LINE__, cudaGetErrorString(e));

// exit(1);

}                                                         	

cudaMemcpy(&cph, cpd, sizeof(cuda_print_t), cudaMemcpyDeviceToHost);

e = cudaGetLastError();

if(cudaSuccess != e) {

	fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",

	        __FILE__, __LINE__, cudaGetErrorString(e));

// exit(1);

}                                                         	

printf("Rest of cuda print buffer at exit:\n");

fwrite(cph.b, cph.p, sizeof(char), stdout);

printf("\n");

cudaFree(cpd);

}

void cuda_print_init() {

cudaError e;

int i;

for (i = 0; i < CUDA_PRINT_BS; ++i) {

	cph.b[i] = '\0';

}

strcpy(cph.b, "Init");

cph.p = 4;

// cph.p = 0;

cudaMalloc((void **)&cpd, sizeof(cuda_print_t));

e = cudaGetLastError();

if(cudaSuccess != e) {

	fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",

	        __FILE__, __LINE__, cudaGetErrorString(e));

	exit(1);

}                                                         	

cudaMemcpy(cpd, &cph, sizeof(cuda_print_t), cudaMemcpyHostToDevice);

e = cudaGetLastError();

if(cudaSuccess != e) {

	fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",

	        __FILE__, __LINE__, cudaGetErrorString(e));

	exit(1);

}                                                         	

strcpy(cph.b, "Wait");

atexit(cuda_print_finalize);

}

#define CP_INIT() cuda_print_init();

__device__ void cuda_print_init_block(cuda_print_t *p) {

if((0 == threadIdx.x) && (0 == threadIdx.y) && (0 == threadIdx.z)) {

	cp = p;

	cp->p = 0;

}

__syncthreads(); /* sync all threads of block */

}

#define CP_INIT_BLOCK() ((char *)cuda_print_device_p)[0] = 'A'; cuda_print_init_block(cuda_print_device_p);

#define CP_PAR_FORM , cuda_print_t *cuda_print_device_p

#define CP_PAR_CALL , cpd

__device__ void cuda_print_char(char c) {

int p;

p = atomicAdd(&cp->p, 1);

if((0 <= p) && (p < CUDA_PRINT_BS)) cp->b[p] = c;

}

#define CPC(C) cuda_print_char(C);

__device__ void cuda_print_txt(char *t) {

while('\0' != *t) {

	CPC(*t)

	++t;

}

}

#define CPT(T) cuda_print_txt(T);

__device__ void cuda_print_int(CUDA_INT i) {

CUDA_INT j, k;

if(i < 0) {

	CPC('-')

	i = -i;

}

if(i == 0) {

	CPC('0')

	return;

}

k = 1;

for(j = i; j > 9; j /= 10) {

	k *= 10;

}

while(k > 0) {

	j = i / k;

	CPC('0' + j)

	i -= j * k;

	k /= 10;

}

}

#define CPI(I) cuda_print_int(I);

#define DDIGITS 6

__device__ void cuda_print_real(CUDA_REAL r) {

CUDA_INT i, o;

CUDA_INT l;

CUDA_REAL eps = exp10((CUDA_REAL) -DDIGITS);

if(r < 0) {

	r = -r;

	CPC('-')

}

if(r == 0.0) {

	CPC('0')

	return;

}

l = floor(log10(r));

r = r * exp10((CUDA_REAL) -l) + 0.5 * eps;

o = floor(r);

CPC('0' + o)

r = r - o;

if(r >= eps) CPC('.')

for(i = 0; (i < DDIGITS) && (r >= eps); ++i) {

	r *= 10.0; eps *= 10.0;

	o = floor(r);

	CPC('0' + o)

	r = r - o;

}

if(((CUDA_INT)l) != 0) {

	CPC('E')

	CPI(l)

}

}

#define CPR(R) cuda_print_real(R);

__device__ void cuda_print_pointer(void *p) {

int i;

char *hd = "0123456789ABCDEF";

CPT("0x")

for(i = 7; i >= 0; --i) {

	CPC(hd[(((char *)p - (char *)0)>>(4 * i)) & 15])

}

}

#define CPP(P) cuda_print_pointer(P);

void cuda_print_print() {

cudaError e;

int p;

cudaThreadSynchronize();

e = cudaGetLastError();

if(cudaSuccess != e) {

	fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",

	        __FILE__, __LINE__, cudaGetErrorString(e));

// exit(1);

}                                                         	

cudaMemcpy(&cph, cpd, sizeof(cuda_print_t), cudaMemcpyDeviceToHost);

e = cudaGetLastError();

if(cudaSuccess != e) {

	fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",

	        __FILE__, __LINE__, cudaGetErrorString(e));

// exit(1);

}                                                         	

p = cph.p;

if((p < 0) || (p >= CUDA_PRINT_BS)) p = CUDA_PRINT_BS;

fwrite(cph.b, p, sizeof(char), stdout); /* use the clamped position, not cph.p */

printf("\n");

}

#define CP_PRINT() cuda_print_print();

#else

#define CP_INIT() ;

#define CP_INIT_BLOCK() ;

#define CP_PAR_FORM

#define CP_PAR_CALL

#define CPC(C) ;

#define CPT(T) ;

#define CPI(I) ;

#define CPR(R) ;

#define CPP(P) ;

#define CP_PRINT() ;

#endif

[/codebox]


To use it, DEBUG_CKS first has to be #define-d. Then the code displayed can be #include-d. After that, the host thread has to call CP_INIT(), the parameter lists of the CUDA call and of the called global function have to be patched with “…>>>(… CP_PAR_CALL)” and “global …(… CP_PAR_FORM)” respectively, and the global function must call CP_INIT_BLOCK(). Then, CPC(‘a’) within the kernel writes the character ‘a’ into a buffer. (CPT, CPI, CPR and CPP do likewise for texts, ints, reals and pointers, where CUDA_REAL and CUDA_INT have to be defined.) After the kernel has finished, CP_PRINT() transfers and displays the buffer contents on the host.
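To make the wiring concrete, here is a minimal usage sketch. The kernel name, launch configuration, and the header name "cuda_print.h" are invented for this example; the macros are the ones defined in the codebox above.

```cuda
#define DEBUG_CKS
#define CUDA_INT int      /* integer type used by CPI */
#define CUDA_REAL float   /* floating-point type used by CPR */
#include "cuda_print.h"   /* hypothetical header holding the code above */

__global__ void demo_kernel(int n CP_PAR_FORM) {
    CP_INIT_BLOCK()                 /* thread 0 of the block resets the buffer position */
    CPT("thread ")
    CPI((CUDA_INT) threadIdx.x)
    CPT(" sees n = ")
    CPI((CUDA_INT) n)
    CPC('\n')
}

int main(void) {
    CP_INIT();                           /* allocates and initializes the device buffer */
    demo_kernel<<<1, 4>>>(42 CP_PAR_CALL);
    CP_PRINT();                          /* synchronizes, copies the buffer back, prints it */
    return 0;
}
```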

The wish list is long:

    cleanup (no need to discuss, this code is more or less only a proof of concept)

    more convenient printing functions

      like printf: probably difficult without varargs …

      like cout: difficult without reliably working overloading of operator<<

      more datatypes / formats

    support multiple host threads: maybe add the printing buffer to the cuda context?

    support multiple blocks of cuda threads: one separate buffer per block?

    separate output per cuda thread: using separate buffer partition per thread?

    output while kernel still running:

      difficult from another host thread, since there appears to be no way of looking inside a context that is still owned by another thread

      from the same host thread it should be possible, since kernels are launched asynchronously: copy to host, print the output collected so far starting at the position where the last printing ended, and store the last position printed for the next call

    better integration:

      no explicit modification of the kernel call: <<<…>>>(… CP_PAR_CALL);

      no explicit modification of global function:

        global void …(… CP_PAR_FORM) {

        CP_INIT_BLOCK()

      no explicit host initialization: CP_INIT()

      switch on or off by simple means (e.g.: nvcc -dbg_print instead of #define DEBUG_CKS)

It would be great if all this already existed since I don’t really like wasting that much time (re-?)developing debugging facilities instead of advancing the actual application.

One exists: http://www.nvidia.com/object/cuda_get.html . Currently it is available for 32-bit Linux only.

Once a CUDA kernel triggers the ULF error, it sets a global flag in that context. Any further kernel launches, memcpys or other CUDA calls all bail out before running.

Do you get a ULF on every kernel launch? That is usually an indication of out-of-bounds memory accesses (which you have already checked for). Accidentally passing a host pointer to the device will also cause this problem, and you won’t notice it in emulation mode (which I assume you have already checked).
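A minimal sketch of that host-pointer mistake (struct and kernel names invented for illustration): a host pointer hidden inside a struct that is copied to the device looks fine in emulation mode, where host and device share one address space, but dereferencing it on real hardware would produce an ULF.

```cuda
typedef struct { float *data; } wrapper_t;

__global__ void read_kernel(wrapper_t *w, float *out) {
    *out = w->data[0];   /* ULF on hardware: w->data is a host pointer */
}

int main(void) {
    float host_buf[4] = {1.f, 2.f, 3.f, 4.f};
    wrapper_t h, *d;
    h.data = host_buf;   /* BUG: should point to cudaMalloc'd device memory */
    cudaMalloc((void **)&d, sizeof(wrapper_t));
    cudaMemcpy(d, &h, sizeof(wrapper_t), cudaMemcpyHostToDevice);
    /* A launch like read_kernel<<<1, 1>>>(d, ...) would fail with an
       unspecified launch failure on the device, yet work in emulation. */
    return 0;
}
```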

Or is the ULF randomly occurring only once in every thousand or million kernel launches? If that is the case, you may find that your kernel works on GTX 285/295 and Tesla C1060, but not older hardware: http://forums.nvidia.com/index.php?showtopic=87803

Dear MisterAnderson42,

Thank you very much for replying so quickly and helpfully to my questions!

My ULF problem is resolved: Indeed, it was a host pointer well hidden within a larger device data structure. Thank you for stressing this possibility! (And yes, the ULF occurred reliably on each kernel run.)

I’m happy to learn that there exists a debugger for 32-bit Linux binaries, although I’m working on a 64-bit Linux system (AMD Opteron). Maybe I can cross-compile to 32 bits, or we may set up a 32-bit Linux as another boot option.

Concerning the ULF flag: is there a way to clear this flag so that the context becomes usable again? This would surely be quite helpful for post-mortem analysis.

Not that I’m aware of. I wish there were, because then my code that has trouble with this bug could easily redo the kernel call and then continue on as if nothing had happened.

Maybe the driver API offers a workaround for the intermittent bug you are experiencing: if, after an ULF, it really is only the current context that cannot be used anymore, you might be able to discard the failed context (in order to free resources) and replace it with a freshly created, functional one?
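A hedged sketch of that idea using the driver API. Whether tearing down the poisoned context actually recovers the device is an open question, not something verified here; what is certain is that all allocations and loaded modules in the old context are lost and must be redone.

```cuda
#include <cuda.h>  /* driver API */

/* Sketch only: discard a context that hit a ULF and create a fresh one
   on the same device. Caller must re-create all device state afterwards. */
int recreate_context(CUcontext *ctx, CUdevice dev) {
    if (*ctx != NULL)
        cuCtxDestroy(*ctx);                       /* drop the failed context */
    return cuCtxCreate(ctx, 0, dev) == CUDA_SUCCESS;  /* new, clean context */
}
```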

64-bit Linux debugger is part of 2.2.

Maybe that would work. I’ve never used the Driver API so I wouldn’t know.

This is good news :) Honestly, I have yet to even try out the debugger simply because all my boxes are x86_64 (and have been for 5+ years) and I’m too lazy to take the time to download a 32-bit CentOS just to try it out.

With the number of times you’ve been mentioning “this is in 2.2” lately, that must mean we are approaching a release date, right?

The sound of loud rejoicing is echoing through the forums! :thumbup:

It’s not too far away.

With the risk of getting off-topic, any news on --multicore compilation?

I wonder how long tmurray can mention future CUDA features before his bosses cut off his Internet connection (or his fingers)… :)

:-) Your boss must be very cruel, I guess…

Anyway, I got a serious question on the debugger stuff:

Does this debugger debug real hardware, or just the device emulation thing?

If it is real hardware, I would be surprised. So, can I watch all the multiprocessors and see what code is running on them?

You can try the 32-bit version of cuda-gdb, which debugs the actual code running on the GPU; and yes, you can watch all stream processors and pause any of them, just like with normal gdb.

I’ve been expecting to see a 64-bit cuda-gdb for quite a while; in addition, will it be possible to debug PTX directly in the next version?