Hello,
I’m trying to write some PTX code by hand, compile it to .cubin, and have the driver api execute it directly, and was running into some problems getting off the ground. The reason I’m trying write it this way instead of using the inline PTX is because I think I can do significantly better than the nvcc compiler in terms of allocating registers for what I’m trying to do.
Anyway, I’ve written an extremely simple PTX program, and I can’t figure out what’s wrong. I tried looking at the examples, and I’m not having any luck. My program works as follows: The main program (written in c) is supposed to create an array with one value, set it to “777”, setup my cuda device/context/kernel stuff, and copy that single-element array to the device. From here, my kernel is launched (the cubin file). My kernel is comically simple. It’s one thread that writes “2222” to the single-element array and exits. Once control is returned to the host, it copies the data back, prints the result (which is some random number and not “777” or “2222”), and finally it de-allocates the device memory.
I check for errors after every cuXXX call, and oddly enough none occur except when I free up the device memory, where I get an error code 700 which is “CUDA_ERROR_LAUNCH_FAILED”. If anyone can comment on what’s causing that I’d appreciate it.
I’m using version 4.0 of the nvcc compiler, and I have the latest drivers installed.
Attached is code for my main function, the ptx code, and my makefile. The executable (simple_test) and the cubin file (kernel.cubin) are copied one directory level above, where I try running the code.
I would greatly appreciate it if anyone can explain what I’m doing wrong.
Thanks.
kernel.ptx:
.version 2.3
.target sm_20
.address_size 32
.entry kernel( .param .u32 A)
{
.reg .u32 %a;
ld.param.u32 %a, [A];
st.global.u32 [%a], 2222;
exit;
}
main.c:
#include <stdio.h>
#include <cuda.h>
int main(int argv, char* argc[])
{
unsigned int h_a[1] = {777};
//using driver api
//setup something to put error codes in
CUresult error;
//initialize
error = cuInit(0);
printf("cuinit returns: %d\n",int(error));
//get device 0, put the handle in cuDevice0
CUdevice cuDevice0;
error = cuDeviceGet(&cuDevice0, 0);
printf("cuDeviceGet returns: %d\n",int(error));
//create a context to run on device 0
CUcontext cuContext0;
error = cuCtxCreate(&cuContext0, 0, cuDevice0);
printf("cuCtxCreate returns: %d\n",int(error));
//create module from cubin file
CUmodule cuModule0;
error = cuModuleLoad(&cuModule0, "kernel.cubin");
printf("cuModuleLoad returns: %d\n",int(error));
//device memory pointers
CUdeviceptr d_a;
//allocate device memory
unsigned int allocsize = 1*sizeof(unsigned int);
error = cuMemAlloc(&d_a, allocsize);
printf("allocating d_a returns: %d\n",int(error));
//copy host arrays to the device
error = cuMemcpyHtoD(d_a, h_a, allocsize);
printf("copy data for a returns: %d\n",int(error));
//get function handle
CUfunction kernel0;
error = cuModuleGetFunction(&kernel0, cuModule0, "kernel");
printf("getting the function handle returns: %d\n",int(error));
//launch kernel
void *args[] = { &d_a};
error = cuLaunchKernel(kernel0, 1, 1, 1, 1, 1, 1, 0, NULL, args, NULL);
printf("kernel launch returns: %d\n",int(error));
error = cuMemcpyDtoH(h_a, d_a, allocsize);
printf("copy from device to host returns: %d\n",int(error));
//free up device memory
error = cuMemFree(d_a);
printf("freeing up d_a returns: %d\n",int(error));
//print out the result
printf("h_a[1] = %u\n",h_a[1]);
return 0;
}
Makefile:
NVCC_FLAGS = --ptxas-options -v
FLAGS =
all: simple_test
simple_test: main.o kernel.cubin
gcc $(FLAGS) -L /usr/local/cuda/lib64 -lcuda -lcudart -o simple_test main.o
cp simple_test ../.
main.o: main.cu
nvcc $(NVCC_FLAGS) -c -gencode arch=compute_20,code=sm_20 -o main.o main.cu
kernel.cubin:
nvcc $(NVCC_FLAGS) -cubin -gencode arch=compute_20,code=sm_20 -o kernel.cubin kernel.ptx
cp kernel.cubin ../.
clean:
-rm *.o
-rm simple_test
-rm ../simple_test
-rm kernel.cubin
-rm ../kernel.cubin
.PHONY: all clean