I’ve got two questions concerning the debugging of cuda kernels:
[list=1]
[*] Does there exist a convenient way for “printing” from a running cuda kernel? If so, I’d like to know where to get it and how to use it. (A real debugger would be even better.)
[*] Also, I’d like to know if there was a way of extracting data from a kernel run after it finished with an ULF (unspecified launch failure). After an ULF cudaMemcpy() appears to not copy any data from device to host.
Some more details:
I’m trying to port the raytracing application from the splash2 benchmark suit. Most of the work should be done. Simple tracing (list) works, hierarchical tracing should work, too, but produces ULFs. (Emulation mode works and valgrind registers 0 errors. (Ok, with a trick: Without the trick, many complaints about conditional jumps or moves based on uninitialized data appear. The trick is using “a=f(); a=NULL;if(a != NULL)” instead of “a=f(); if(a != NULL)”. I don’t know how in the latter case “a” can be uninitialized: f() always returns with a value!).)
In order to investigate more closely, I developed some quick and dirty implementation of what I’m imagining for printing in cuda kernels. I’m posting the code below in order to give a better impression of what I’m looking for. At the moment, it kind of works for one host thread starting one global function on one block of cuda threads.
Most unfortunately, I don’t appear to be able to extract printed content after an ULF (unspecified launch failure). During a kernel run, printed data is logged to a piece of global device memory. After the run finishes normally, the data is fetched with cudaMemcpy and printed by the host. If the run finishes with an ULF, cudaMemcpy appears to not copy any data from device to host. (If you look closer at the code: “Wait” is displayed in this case, not “Init” or other data.)
[codebox]
#ifdef DEBUG_CKS
#define CUDA_PRINT_BS 100000
typedef struct {
char b[CUDA_PRINT_BS]; /* cuda print buffer */
int p; /* cuda print position in buffer */
} cuda_print_t;
cuda_print_t cph;
cuda_print_t *cpd;
device cuda_print_t *cp;
void cuda_print_finalize() {
cudaError e;
e = cudaGetLastError();
if(cudaSuccess != e) {
fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",
__FILE__, __LINE__, cudaGetErrorString(e));
// exit(1);
}
cudaMemcpy(&cph, cpd, sizeof(cuda_print_t), cudaMemcpyDeviceToHost);
e = cudaGetLastError();
if(cudaSuccess != e) {
fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",
__FILE__, __LINE__, cudaGetErrorString(e));
// exit(1);
}
printf("Rest of cuda print buffer at exit:\n");
fwrite(cph.b, cph.p, sizeof(char), stdout);
printf("\n");
cudaFree(cpd);
}
void cuda_print_init() {
cudaError e;
int i;
for (i = 0; i < CUDA_PRINT_BS; ++i) {
cph.b[i] = '\0';
}
strcpy(cph.b, “Init”);
cph.p = 4;
// cph.p = 0;
cudaMalloc((void **)&cpd, sizeof(cuda_print_t));
e = cudaGetLastError();
if(cudaSuccess != e) {
fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",
__FILE__, __LINE__, cudaGetErrorString(e));
exit(1);
}
cudaMemcpy(cpd, &cph, sizeof(cuda_print_t), cudaMemcpyHostToDevice);
e = cudaGetLastError();
if(cudaSuccess != e) {
fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",
__FILE__, __LINE__, cudaGetErrorString(e));
exit(1);
}
strcpy(cph.b, “Wait”);
atexit(cuda_print_finalize);
}
#define CP_INIT() cuda_print_init();
device void cuda_print_init_block(cuda_print_t *p) {
if((0 == threadIdx.x) && (0 == threadIdx.y) && (0 == threadIdx.z)) {
cp = p;
cp->p = 0;
}
__syncthreads(); /* sync all threads of block */
}
#define CP_INIT_BLOCK() ((char *)cuda_print_device_p)[0] = ‘A’; cuda_print_init_block(cuda_print_device_p);
#define CP_PAR_FORM , cuda_print_t *cuda_print_device_p
#define CP_PAR_CALL , cpd
device void cuda_print_char(CHAR c) {
int p;
p = atomicAdd(&cp->p, 1);
if((0 <= p) && (p < CUDA_PRINT_BS)) cp->b[p] = c;
}
#define CPC© cuda_print_char©;
device void cuda_print_txt(CHAR *t) {
while('\0' != *t) {
CPC(*t)
++t;
}
}
#define CPT(T) cuda_print_txt(T);
device void cuda_print_int(CUDA_INT i) {
CUDA_INT j, k;
if(i < 0) {
CPC('-')
i = -i;
}
if(i == 0) {
CPC('0')
return;
}
k = 1;
for(j = i; j > 9; j /= 10) {
k *= 10;
}
while(k > 0) {
j = i / k;
CPC('0' + j)
i -= j * k;
k /= 10;
}
}
#define CPI(I) cuda_print_int(I);
#define DDIGITS 6
device void cuda_print_real(CUDA_REAL r) {
CUDA_INT i, o;
CUDA_INT l;
CUDA_REAL eps = exp10((CUDA_REAL) -DDIGITS);
if(r < 0) {
r = -r;
CPC('-')
}
if(r == 0.0) {
CPC('0')
return;
}
l = floor(log10®);
r = r * exp10((CUDA_REAL) -l) + 0.5 * eps;
o = floor®;
CPC('0' + o)
r = r - o;
if(r >= eps) CPC('.')
for(i = 0; (i < DDIGITS) && (r >= eps); ++i) {
r *= 10.0; eps *= 10.0;
o = floor®;
CPC('0' + o)
r = r - o;
}
if(((CUDA_INT)l) != 0) {
CPC('E')
CPI(l)
}
}
#define CPR® cuda_print_real®;
device void cuda_print_pointer(void *p) {
int i;
char *hd = "0123456789ABCDEF";
CPT("0x")
for(i = 7; i >= 0; --i) {
CPC(hd[(((char *)p - (char *)0)>>(4 * i)) & 15])
}
}
#define CPP(P) cuda_print_pointer(P);
void cuda_print_print() {
cudaError e;
int p;
cudaThreadSynchronize();
e = cudaGetLastError();
if(cudaSuccess != e) {
fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",
__FILE__, __LINE__, cudaGetErrorString(e));
// exit(1);
}
cudaMemcpy(&cph, cpd, sizeof(cuda_print_t), cudaMemcpyDeviceToHost);
e = cudaGetLastError();
if(cudaSuccess != e) {
fprintf(stderr, "Cuda error in file \"%s\" in line %i:\n%s\n",
__FILE__, __LINE__, cudaGetErrorString(e));
// exit(1);
}
p = cph.p;
if((p < 0) || (p >= CUDA_PRINT_BS)) p = CUDA_PRINT_BS;
fwrite(cph.b, cph.p, sizeof(char), stdout);
printf("\n");
}
#define CP_PRINT() cuda_print_print();
#define CP_INIT() ;
#define CP_INIT_BLOCK() ;
#define CP_PAR_FORM
#define CP_PAR_CALL
#define CPC© ;
#define CPT(T) ;
#define CPI(I) ;
#define CPR® ;
#define CPP(P) ;
#define CP_PRINT() ;
[/codebox]
(© and ® actually mean ( C ) and ( r ) respectively. I don’t know how to have “codebox” leave them intact.)
To use it, first DEBUG_CKS hast to be #define-d. Then, the code displayed can be #include-d. After that, the host thread has to call CP_INIT(), parameter lists of cuda call and called global function have to be patched with “…>>>(… CP_PAR_CALL)” and “global …(… CP_PAR_FORM)”, respectively and the global function must call CP_INIT_BLOCK(). Then, CPC(‘a’) within the kernel wrote the character ‘a’ into a buffer. (CPT, CPI, CPR and CPP did likewise for texts, ints, reals and pointers, where CUDA_REAL and CUDA_INT had to be defined.) After the kernel has finished, CP_PRINT() transfers and displays the buffer contents on the host.
The wish list is long:
[*] cleanup (no need to discuss, this cose is more or less only a proof of concept)
[*] more convenient printing functions
[*] like printf: probably difficult without varargs …
[*] like cout: difficult without guaranteedly working overloading of operator<<
[*] more datatypes / formats
[*] support multiple host threads: maybe add the printing buffer to the cuda context?
[*] support multiple blocks of cuda threads: one separate buffer per block?
[*] separate output per cuda thread: using separate buffer partition per thread?
[*] output while kernel still running:
[*] difficult from other thread since there appears to be no way of looking inside a context, that is still owned by another thread
[*] from same host thread should be possible since kernels are launched asynchronously: copy to host, print already collected output starting at position where last printing ended, store last position printed for next call
[*] better integration:
[*] no explicit modification of the kernel call: <<<…>>>(… CP_PAR_CALL);
[*] no explicit modification of global function:
[*] global void …(… CP_PAR_FORM) {
[*] CP_INIT_BLOCK()
[*] no explicit host initialization: CP_INIT()
[*] switch on or off by simple means (e.g.: nvcc -dbg_print instead of #define DEBUG_CKS)
It would be great if all this already existed since I don’t really like wasting that much time (re-?)developing debugging facilities instead of advancing the actual application.