Hello,
I’ve been working with the Driver API and wanted to use some Thrust functions to save some time. However, I’ve been running into a few problems. For some background, I want to have a C++ file as my program’s entry point, and have it call a wrapper-like function that first does some setup with Thrust and then calls a kernel.
The problem is that the Thrust code needs to be compiled by nvcc, so I’ve been putting my wrapper function in my .cu file(s). However, Thrust code executes on the host rather than the device, so I can’t invoke the wrapper function the way I would a normal kernel (via cuLaunchKernel). With the Runtime API this isn’t a problem: I can just declare the host-side function as extern "C" and call it from my C++ file. With the Driver API, though, I can never link to anything in any of the .cu files (as an aside, is this because they’re generating a PTX file?).
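For context, this is roughly the pattern that works for me with the Runtime API (the names here are just placeholders, and the single thrust::sort stands in for my real setup code):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>

// placeholder kernel: doubles every element
__global__ void scaleKernel(float *data, unsigned int n){
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

// host-side wrapper: Thrust setup first, then a normal <<< >>> launch
extern "C" void thrustWrapper(float *hostData, unsigned int n){
    thrust::device_vector<float> devData(hostData, hostData + n); // copy to device
    thrust::sort(devData.begin(), devData.end());                 // the "setup" step
    float *raw = thrust::raw_pointer_cast(&devData[0]);
    scaleKernel<<<(n + 255) / 256, 256>>>(raw, n);
    thrust::copy(devData.begin(), devData.end(), hostData);       // copy back to host
}

The .cpp just declares extern "C" void thrustWrapper(float *, unsigned int); and calls it, nvcc compiles the .cu into an object, and everything links fine. It’s this same arrangement that I can’t get to link once the project is set up for the Driver API.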
I wrote a small example to illustrate the crux of what I want to do; it doesn’t involve Thrust, but it demonstrates my linking problem.
This is in my .cu file (named thekernel.cu):
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>          // Driver API (CUfunction, cuLaunchKernel, etc.)
#include <builtin_types.h>
#include <cutil_inline.h>
__global__ void cudaKernel(float *img, unsigned const int width, unsigned const int height){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width)
        return;
    if (y >= height)
        return;
    if (img[y * width + x] == 0)
        img[y * width + x] = 1;
    else
        img[y * width + x] = 0;
}
void initCUDA(CUfunction *theKernel){
    CUcontext context;
    CUdevice cuDevice = 0;
    CUmodule cuMod;
    CUresult result = cuInit(0);
    cuDeviceGet(&cuDevice, 0);
    cuCtxCreate(&context, 0, cuDevice);
    cuModuleLoad(&cuMod, "./thekernel.ptx");
    cuModuleGetFunction(theKernel, cuMod, "cudaKernel");
}
int iDivUp(int a, int b){
    return (a % b != 0) ? (a / b + 1) : (a / b);
}
extern "C" void cudaKernelWrapper(float *img, unsigned const int height, unsigned const int width){
CUfunction theKernel;
initCUDA(&theKernel);
dim3 threads(16,16,1);
dim3 blocks(iDivUp(width, threads.x), iDivUp(height, threads.y), 1);
float* devData;
cuMemAlloc((CUdeviceptr *) &devData, height*width*sizeof(float));
cuMemcpyHtoD((CUdeviceptr)devData, img, height*width*sizeof(float));
void *kernelArgs[] = {&devData, (void *)&width, (void *)&height};
cuLaunchKernel(theKernel, blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, 0, 0, kernelArgs, NULL);
cuCtxSynchronize();
cuMemcpyDtoH(img, (CUdeviceptr)devData, height*width*sizeof(float));
cuMemFree((CUdeviceptr)devData);
}
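In case it’s relevant: as far as I can tell, my custom build step for thekernel.cu only runs nvcc to produce the PTX that initCUDA loads (something along the lines of nvcc -ptx thekernel.cu -o thekernel.ptx; I’m going from memory on the exact flags). I don’t know whether the host code in thekernel.cu, i.e. cudaKernelWrapper, ever gets compiled into an object file that the linker actually sees, hence my question about the PTX above.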
Then in my C++ file:
#include <stdio.h>
#include <stdlib.h>
// OpenCV includes
#include <cv.h>
#include <highgui.h>
extern "C" void cudaKernelWrapper(float *, unsigned const int, unsigned const int);
int main(){
    unsigned const int height = 256;
    unsigned const int width = 512;
    float *fakeImg = (float *)malloc(height * width * sizeof(float));
    cv::Mat outImg;
    outImg = cv::Mat(height, width, CV_32FC1);
    for (unsigned int i = 0; i < height; i++){
        for (unsigned int j = 0; j < width - height; j++){
            fakeImg[i * width + j] = 1;
        }
        for (unsigned int k = height; k < width; k++){
            fakeImg[i * width + k] = 0;
        }
    }
    outImg.data = (unsigned char *)fakeImg;
    cv::imshow("PreImage", outImg);
    cv::waitKey(10);
    cudaKernelWrapper(fakeImg, height, width);
    outImg.data = (unsigned char *)fakeImg;
    cv::imshow("PostImage", outImg);
    cv::waitKey(0);
    free(fakeImg);
    return 0;
}
This always yields an “error LNK2019: unresolved external symbol _cudaKernelWrapper referenced in function _main” linker error.
If anyone has any insights, they’d be much appreciated.
Thanks,
-Al
EDIT:
Not sure if it matters, but I’m using CUDA 4.0 and a GTX 580 (compute capability 2.0).