I’m also fairly new to this, but what I’m currently doing is keeping the kernels separate from the rest of my application, compiling them to PTX code, and using the CUDA Driver API to access them, i.e., something like:
[codebox] struct stat file_info;
// Look up the PTX file's size so the whole file can be read in one call.
// NOTE(review): the results of stat(), open() and read() are not checked in
// this snippet, so a missing file or a short read goes unnoticed.
stat (name.c_str (), &file_info);
// One extra byte for the terminating NUL written below.
char * file_buffer = new char[file_info.st_size+1];
int fd = open (name.c_str (), O_RDONLY);
read (fd, file_buffer, file_info.st_size);
close (fd);
// The PTX image is handed to the driver as text, so NUL-terminate it.
file_buffer[file_info.st_size] = 0;
// Single JIT option: limit the registers a thread may use.
CUjit_option options[1] = { CU_JIT_MAX_REGISTERS };
// JIT option values are passed by value, cast to void *; passing the address
// of the variable would make the driver read the pointer as the limit.
// NOTE(review): assumes max_registers is an integer declared elsewhere - confirm.
void * optionValues[1] = { (void *) (size_t) max_registers };
// NOTE(review): the CUresult returned here is ignored, and file_buffer is not
// released within this snippet - confirm both are handled by surrounding code.
cuModuleLoadDataEx (&cuModule, file_buffer, 1, options, optionValues);
[/codebox]
where “name” is the PTX file I’m loading.
Here is the Makefile I am using with my little test project - maybe it will answer some questions:
[codebox]CC= /Developer/usr/bin/llvm-gcc-4.2
CPP= /Developer/usr/bin/llvm-cpp-4.2
# C++ compiler driver; the dQ rule uses it to compile and link the host program.
CXX= /Developer/usr/bin/llvm-g++-4.2
# CUDA compiler driver; the *.ptx rules use it to translate .cu kernels.
NVCC= /usr/local/cuda/bin/nvcc
# Header search paths for the host build. The OpenGL framework flag rides along
# here because INCLUDES is passed on dQ's combined compile-and-link command.
INCLUDES= -I. -I/usr/local/include -I/usr/local/cuda/include -framework OpenGL
# --ptx makes nvcc stop at PTX output; --compiler-options forwards the flag
# that follows it to the host compiler.
NVCCFLAGS= -m32 --ptx --compiler-options -fno-strict-aliasing
# Extra nvcc architecture flags, empty by default. Swap in the commented-out
# line to target sm_13.
QUADRO=
#QUADRO= -arch sm_13
# Host compiler flags: 32-bit build, optimized, strict aliasing disabled.
CXXFLAGS= -m32 -fno-strict-aliasing -O3
CFLAGS= -m32 -fno-strict-aliasing -O3
# Library search paths for the host link step.
LDFLAGS= -L/usr/local/lib -L/usr/local/cuda/lib -fPIC
# RTLIBS is what the dQ link line uses; LIBS is not referenced by any rule
# visible in this Makefile.
LIBS= -lcuda
RTLIBS= -lcudart -lcuda
#LIBS= -lcufft -lcublas
# Default goal: the host executable plus both PTX kernel files.
# Declared phony so a stray file named "all" cannot mask it.
.PHONY: all
all: dQ dQ.ptx CASS.ptx
# Remove everything this Makefile builds: the dQ executable, the PTX files and
# the cubin files written by the info target.
# $(RM) is "rm -f", so the recipe succeeds even when there is nothing to delete.
.PHONY: clean
clean:
	$(RM) dQ *.cubin *.ptx
# Host executable: compile and link both .cpp sources in a single step.
# The headers are prerequisites only, so editing them triggers a rebuild;
# $(filter %.cpp,$^) keeps them off the compiler command line.
dQ: dQ.cpp CUDA_Q.cpp CUDA_Q.h CUDA_Q_private.h
	$(CXX) -o $@ $(filter %.cpp,$^) $(INCLUDES) $(CXXFLAGS) $(LDFLAGS) $(RTLIBS)
# PTX kernels: each listed .ptx is generated from the .cu file of the same name.
# A static pattern rule is used so a missing source fails loudly instead of
# falling through to another rule.
dQ.ptx CASS.ptx: %.ptx: %.cu
	$(NVCC) -o $@ $< $(QUADRO) $(NVCCFLAGS)
# Run ptxas verbosely (-v) over both PTX files, writing dQ.cubin and CASS.cubin
# as a by-product. CASS is additionally limited to 16 registers per thread.
# Phony because no file named "info" is ever produced.
.PHONY: info
info: dQ.ptx CASS.ptx
	ptxas -o dQ.cubin dQ.ptx -v -mem
	ptxas -o CASS.cubin CASS.ptx -v -mem --maxrregcount=16
[/codebox]