a simple logging mechanism for CUDA device code

I have been trying to debug a piece of CUDA code, and found it extremely hard. Sometimes, the bug won’t reproduce in -deviceemu, nor -G. And sometimes, when it does come up in -deviceemu, I am not able to set a breakpoint in the right function, because it has been inlined. So I composed this simple logging “library”. I would like to share it with you guys, hope it would be helpful for someone.

/***********************************************************

 *  DLog.h

 *  Simple logging mechanism for CUDA device code

 *

 *  Usage: 

 *	   1. include it in your .cu file

 *	   2. call DLog_Init(size) to reserve size bytes as output buffer 

 *		   on device

 *	   3. call DLog<<data in your CUDA device code

 *	   4. call DLog_Dump() after kernel invocation to dump the output

 *

 *  This implementation is very simple, and inefficient. It supports only 

 *	 char, int and const char *.

 *

 *  Author: Chuntao HONG (chuntao.hong@gmail.com)

 *  Last modified: Oct. 13th, 2009

************************************************************/

#ifndef DLOG_H

#define DLOG_H

#include <cuda.h>

#include <iostream>

using namespace std;

const int MIN_ALIGN=4;

// align n to b bytes

template<class T>

__device__ T Align(T n, uint32_t b){

		return ((uint32_t)n&(b-1))==NULL ? n : n+b-((uint32_t)n&(b-1));

}

template<class T>

__device__ T MinAlign(T n){

		return ((uint32_t)n&(MIN_ALIGN-1))==NULL ? n : n+MIN_ALIGN-((uint32_t)n&(MIN_ALIGN-1));

}

// set some empty bytes, so that the logging data structures won't be polluted by out-of-bound writes

const int SAFTY_BYTES=1024*1024;

// data types supported, now only three

enum DataType{

		CHAR,

		INT,

		STRING

};

struct LogPack{

		int size;

		DataType type;

		union{

				char c;

				int i;

				char str[];

		}data;

};

__device__ int requiredLogSpace(const char*);

__device__ int requiredLogSpace(const int);

__device__ int requiredLogSpace(const unsigned int);

__device__ int requiredLogSpace(const char);

__device__ void assignLog(LogPack *, const char c);

__device__ void assignLog(LogPack *, const char * c);

__device__ void assignLog(LogPack *, const int c);

__device__ void assignLog(LogPack *, const unsigned int c);

struct MyLog{

public:

		char empty_bytes[SAFTY_BYTES];

		char * buf;

		unsigned int curPos;

		char empty_bytes2[SAFTY_BYTES];

public:

		template <class T>

		__device__ MyLog & operator<<(const T & t){

				int packSize=MinAlign(requiredLogSpace(t));

				int start=atomicAdd(&curPos, packSize);

				LogPack * pack=(LogPack *)(buf+start);

				assignLog(pack,t);

				pack->size=packSize;

				return *this;

		}

};

__device__ MyLog DLog;

__device__ int requiredLogSpace(const int i){

		return sizeof(i)+sizeof(DataType)+sizeof(int);

}

__device__ int requiredLogSpace(const char * str){

		int len=0;

		while(*str++){

				len++;

		}

		return len+1+sizeof(DataType)+sizeof(int);

}

__device__ static int requiredLogSpace(const unsigned int i){

		return sizeof(i)+sizeof(DataType)+sizeof(int);

}

__device__ static int requiredLogSpace(const char c){

		return sizeof(c)+sizeof(DataType)+sizeof(int);

}

__device__ void assignLog(LogPack * pack, const char c){

		pack->data.c=c;

		pack->type=CHAR;

}

__device__ void assignLog(LogPack * pack, const int i){

		pack->data.i=i;

		pack->type=INT;

}

__device__ void assignLog(LogPack * pack, const unsigned int i){

		pack->data.i=i;

		pack->type=INT;

}

__device__ void assignLog(LogPack * pack, const char * str){

		int i=0;

		while(*str){

				pack->data.str[i++]=*str;

				str++;

		}

		pack->type=STRING;

}

__host__ void DLog_Init(unsigned int size){

		MyLog localLog;

		localLog.curPos=SAFTY_BYTES;

		cudaMalloc((void**)&localLog.buf, size+SAFTY_BYTES);

		cudaMemset(localLog.buf, 0, size);

		cudaMemcpyToSymbol(DLog, &localLog, sizeof(MyLog), 0, cudaMemcpyHostToDevice);

}

__host__ void DLog_Dump(){

		MyLog localLog;

		cudaMemcpyFromSymbol(&localLog, DLog, sizeof(MyLog), 0, cudaMemcpyDeviceToHost);

		char * h_buf=new char[localLog.curPos+1];

		cudaMemcpy(h_buf, localLog.buf, localLog.curPos, cudaMemcpyDeviceToHost);

		cudaFree(localLog.buf);

		char * ptr=h_buf+SAFTY_BYTES;

		while(ptr<h_buf+localLog.curPos){

				LogPack * pack=(LogPack *)ptr;

				if(pack->type==CHAR){

						cout<<pack->data.c<<endl;

				}

				else if(pack->type==INT){

						cout<<"0x"<<hex<<pack->data.i<<endl;

				}

				else{

						cout<<pack->data.str<<endl;

				}

				ptr+=pack->size;

		}

		delete[] h_buf;

}

#endif
// sample code for the usage of DLog

#include <cuda.h>

#include <iostream>

using namespace std;

#include "DLog.h"

const int numThreads=2;

const int numBlocks=2;

__global__ void foo(){

		DLog<<threadIdx.x;

}

int main(){

		DLog_Init(1024*1024);

		foo<<<numBlocks, numThreads>>>();

		DLog_Dump();

		return 0;

}