In non-EMU mode, we can not call any standard library functions from device or global functions, which is especially inconvenient when we are in debugging. Recently, nVidia has released a function named ‘cuPrintf()’ to registered developers which can overcome this problem. I also want to use this function, but I’m not a registered developer so I can not get it. However, emulating printf() in kernels is not quite difficult on current CUDA platform, so I decide to implement it by myself.
Now it has been done. The following codes realized a device_printf function which make it possible to output debug message from kernels. The implemetation is primary but really works. With similar methods, we are also possible to implement functions such as ‘device_fopen()’, '‘device_sendmsg()’, and so on, things may become interesting.
Notes:
- For simplicity, in my current implementation device_printf only supports one or two arguments, and only single ‘%d’, ‘%u’, ‘%c’, ‘%f’, ‘%e’ fmt specifiers are supported, and among them ‘%f’ is just implemented as an alias for ‘%e’.
Seveal valid examples:
[font=“Courier New”] device_printf(“1+1=%d\n”, 2);
device_printf(“A+B=%c\n”, ‘C’);
device_printf(“1/2=%f\n”, 1/2.0);
device_printf(“unsigned(-1)=%u\n”, -1);
device_printf(“OK, test passed.\n”); [/font]
-
device_printf_init() should be called from host, not from device;
-
The default buffer flush interval time is 200ms, which is defined as PRINTF_FLUSH_INTERVAL. You can change it if you feel 200ms is not suitable.
-
To ensure that all print buffers have been flushed, a getchar() or _sleep(1000) is required to be called before host program exiting.
-
The default print buffer size is 4096 bytes, which is defined as PRINTF_BUF_SIZE. This value also can be changed but must be kept to be mutiple power of 2.
-
If you print too fast (>4096 bytes/200ms for default parameters), the print buffer will be overflow, the data in overloped ranges will be lost but the function still works.
Code:
/*
* device_printf.h - my own implemetation of 'cuprintf', enable limited printf() from device
* v0.11 by cuda2010 @ csdn, Feb 21, 2010
*/
#ifndef DEVICE_PRINFT_H
#define DEVICE_PRINTF_H
#include "cutil_inline.h"
#include "windows.h"
#pragma comment(lib, "winmm.lib")
using namespace std;
#define PRINTF_BUF_SIZE 4096
#define PRINTF_FLUSH_INTERVAL 200
struct BUFFER {
char data[PRINTF_BUF_SIZE];
unsigned int ptr;
} *host_buffer;
__device__ struct BUFFER *device_buffer;
static void CALLBACK device_printf_flush(UINT uid, UINT umsg, DWORD dwusr, DWORD dw1, DWORD dw2) {
static int old_ptr;
int ptr=(host_buffer->ptr)%PRINTF_BUF_SIZE;
if(old_ptr<=ptr) {
for(int i=old_ptr; i<ptr; i++) {
printf("%c", host_buffer->data[i]);
}
} else {
for(int i=old_ptr; i<PRINTF_BUF_SIZE; i++) printf("%c", host_buffer->data[i]);
for(int i=0; i<ptr; i++) printf("%c", host_buffer->data[i]);
}
old_ptr=ptr;
}
__global__ void set_device_buffer(void *ptr) {
device_buffer=(struct BUFFER*)ptr;
}
void device_printf_init() {
static int inited=0;
void *ptr;
if(inited) return;
timeSetEvent(PRINTF_FLUSH_INTERVAL, 0, device_printf_flush, 0, (UINT)TIME_PERIODIC);
cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
cutilSafeCall(cudaHostAlloc((void**)&host_buffer, sizeof(struct BUFFER), cudaHostAllocMapped));
memset(host_buffer, 0, sizeof(struct BUFFER));
cutilSafeCall(cudaHostGetDevicePointer((void**)&ptr, host_buffer, 0));
set_device_buffer<<<1,1>>>(ptr);
cudaThreadSynchronize();
inited=1;
}
__device__ int sprintf_int(char *s, int v) {
int len=0;
if(v==0) {
s[0]='0';
return (len=1);
}
if(v<0) {
s[len++]='-';
v=-v;
}
unsigned int base;
for(base=1000000000; base>0; base/=10) if((unsigned int)v>=base) break;
while(base>0) {
s[len++]='0'+(v/base)%10;
base/=10;
}
return len;
}
__device__ int sprintf_uint(char *s, unsigned int v) {
int len=0;
if(v==0) {
s[0]='0';
return (len=1);
}
unsigned int base;
for(base=1000000000; base>0; base/=10) if(v>=base) break;
while(base>0) {
s[len++]='0'+(v/base)%10;
base/=10;
}
return len;
}
__device__ int sprintf_float(char *s, double v) {
int len=0;
if(isinf(v)) {
s[len++]='I';
s[len++]='N';
s[len++]='F';
return len;
}
if(isnan(v)) {
s[len++]='N';
s[len++]='A';
s[len++]='N';
return len;
}
if(v<0) {
s[len++]='-';
v=-v;
}
double e=log10(v);
int a, b;
if(isinf(e)) e=0;
b=(int)e;
if(b<0) b--;
a=(int)(v/pow(10.0,b)*1000000+0.5);
s[len++]='0'+(a/1000000)%10;
s[len++]='.';
s[len++]='0'+(a/100000)%10;
s[len++]='0'+(a/10000)%10;
s[len++]='0'+(a/1000)%10;
s[len++]='0'+(a/100)%10;
s[len++]='0'+(a/10)%10;
s[len++]='0'+(a)%10;
s[len++]='E';
s[len++]=(b<0) ? '-' : '+';
b=abs(b);
s[len++]='0'+(b/100)%10;
s[len++]='0'+(b/10)%10;
s[len++]='0'+(b)%10;
return len;
}
__device__ int sprintf_str(char *s, char *s2, int maxlen) {
int i;
for(i=0; s2[i] && i<maxlen; i++) s[i]=s2[i];
return i;
}
__device__ int do_device_printf(char *s, int len) {
unsigned int from=atomicAdd(&(device_buffer->ptr), len);
for(int i=0; i<len; i++) device_buffer->data[(from+i)&(PRINTF_BUF_SIZE-1)]=s[i];
return len;
}
template <class TYPE> __device__ int device_printf(TYPE fmt) {
char *s=(char*)fmt;
int len;
for(len=0; s[len] && len<255; len++);
do_device_printf(s, len);
return len;
}
template <class TYPE> __device__ int device_printf(char *fmt, TYPE a) {
char buf[300];
int i, j;
/* in current version only '%d', '%u', '%c', '%f', '%e' specifiers are supported,
and '%f' is just implemented as an alias for '%e' */
for(i=j=0; fmt[i]; i++) {
if(j>=255) break;
if(fmt[i]!='%') {
buf[j++]=fmt[i];
continue;
}
if(fmt[i+1]=='d') {
j+=sprintf_int(buf+j, a);
} else if(fmt[i+1]=='u') {
j+=sprintf_uint(buf+j, a);
} else if(fmt[i+1]=='c') {
buf[j++]=a;
} else if(fmt[i+1]=='f' || fmt[i+1]=='e') {
j+=sprintf_float(buf+j, (double)a);
}
i+=2;
j+=sprintf_str(buf+j, fmt+i, 255-j);
break;
}
if(j>255) j=255;
buf[j]=0;
return do_device_printf(buf, j);
}
#endif //end of DEVICE_PRINFT_H
// main.c
__global__ void cuda1() {
int inx=blockIdx.x*blockDim.x+threadIdx.x;
float sum=0;
device_printf("Thread %d is started\n", inx);
if(inx==0) device_printf("1+1=%d\n", 2);
if(inx==1) device_printf("A+B=%c\n", 'C');
if(inx==2) device_printf("1.0/2.0=%e\n", 1.0f/2.0f);
if(inx==3) device_printf("(UINT)(-1)=%u\n", -1);
for(int i=1; i<20000000; i++) {
if(i%5000000==0) device_printf("i=%d\n", i);
}
device_printf("Thread %d is finished\n", inx);
}
int main() {
device_printf_init();
cuda1<<<2,2>>>();
cudaThreadSynchronize();
/* getchar() or _sleep(1000) is required before main program exits */
getchar();
}
The code has been tested under CUDA 2.3 with VS2008.
Output:
[font=“Courier New”]Thread 0 is started
Thread 2 is started
Thread 1 is started
Thread 3 is started
1+1=2
A+B=C
1.0/2.0=0.500000E+000
(UINT)(-1)=4294967295
i=5000000
i=5000000
i=5000000
i=5000000
i=10000000
i=10000000
i=10000000
i=10000000
i=15000000
i=15000000
i=15000000
i=15000000
Thread 0 is finished
Thread 1 is finished
Thread 2 is finished
Thread 3 is finished[/font]