I’m new to CUDA and as a learing exercise, I’m attempting to incorporate the Image Denoising sample into a DMO to be used with DShow. In my DMO (.cpp), I make this call:
{
init();
displayFunc( pbSource, pbTarget,
dwWidth, dwHeight,
dwWidthOut, dwHeightOut );
close();
}
where init() is in imageDenoiser.cu:
extern “C”
void init()
{
CUT_DEVICE_INIT();
}
and close() is in imageDenoiser.cu:
extern “C”
void close()
{
CUT_EXIT(0, NULL);
}
and displayFunc() is in imageDenoiser.cu:
extern “C”
void displayFunc( unsigned char *h_DataSrc, unsigned char *h_DataDst,
int imgWidthSrc, int imgHeightSrc,
int imgWidthDst, int imgHeightDst )
{
TColor *d_dst = NULL;
unsigned char *d_DataSrc, *d_DataDst;
double timerValue;
unsigned int hTimer;
// Input data size (Source)
int IMG_WIDTH_SRC = imgWidthSrc;
int IMG_HEIGHT_SRC = imgHeightSrc;
// Output data size (Source)
int IMG_WIDTH_DST = imgWidthDst;
int IMG_HEIGHT_DST = imgHeightDst;
int DATA_N = IMG_WIDTH_SRC * IMG_HEIGHT_SRC;
int DATA_SIZE_SRC = DATA_N * sizeof(unsigned char) * 4;
int DATA_SIZE_DST = IMG_WIDTH_DST * IMG_HEIGHT_DST * sizeof(unsigned char) * 4;
CUT_SAFE_CALL( cutCreateTimer(&hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer) );
{
CUDA_SAFE_CALL( cudaMalloc((void **)&d_DataSrc, DATA_SIZE_SRC ) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_DataDst, DATA_SIZE_DST ) );
CUDA_SAFE_CALL( cudaMemcpy(d_DataSrc, h_DataSrc, DATA_SIZE_SRC, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpy(d_DataDst, h_DataDst, DATA_SIZE_DST, cudaMemcpyHostToDevice) );
}
{
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer) );
KNNdiag(/*d_dst,*/ imgWidthSrc, imgHeightSrc, 1.0f / (knnNoise * knnNoise), lerpC, (unsigned int *)d_DataSrc, (unsigned int *)d_DataDst);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_SAFE_CALL(cutStopTimer(hTimer));
timerValue = cutGetTimerValue(hTimer);
}
CUDA_SAFE_CALL( cudaMemcpy(h_DataDst, d_DataDst, DATA_SIZE_DST, cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(h_DataSrc, d_DataSrc, DATA_SIZE_SRC, cudaMemcpyDeviceToHost) );
CUT_SAFE_CALL(cutDeleteTimer(hTimer));
CUDA_SAFE_CALL( cudaFree(d_DataDst) );
CUDA_SAFE_CALL( cudaFree(d_DataSrc) );
free(h_DataDst);
free(h_DataSrc);
}
and KNNdiag() is in imageDenoiser_knn_kernel.cu:
void KNNdiag(
// TColor *dst,
int imageW,
int imageH,
float Noise,
float lerpC,
unsigned int *d_DataSrc,
unsigned int *d_DataDst)
{
dim3 dimBlock(8, 8, 1);
dim3 dimGridDst(iDivUp(imageW, dimBlock.x), iDivUp(imageH, dimBlock.y), 1);
KNN_kernel<<<dimGridDst, dimBlock>>>(/*d_dst,*/ imageW, imageH, Noise, lerpC, (uchar4*)d_DataSrc, (uchar4*)d_DataDst);
cudaError_t err = cudaGetLastError();
const char* zsErr = cudaGetErrorString( err );
return;
}
and KNN_kernel<<<>>>() is in imageDenoiser_knn_kernel.cu:
global void KNN_kernel(
int imageW,
int imageH,
float Noise,
float lerpC,
uchar4 *d_DataSrc,
uchar4 *d_DataDst)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
//Add half of a texel to always address exact texel centers
const float x = (float)ix + 0.5f;
const float y = (float)iy + 0.5f;
}
this method is cut short as I’m just trying to get a clean run…
My problem is this:
When I make this call KNN_kernel<<<dimGridDst, dimBlock>>>(/d_dst,/ imageW, imageH, Noise, lerpC, (uchar4*)d_DataSrc, (uchar4*)d_DataDst);
I get a cudaError_t that say’s “invalid device function” and First-chance exception at 0x7c812a5b in graphedt.exe: Microsoft C++ exception: cudaError at memory location 0x030df894… in the output window.
If I use CUT_ERROR_CHECK instead of
cudaError_t err = cudaGetLastError();
const char* zsErr = cudaGetErrorString( err );
the process fails with no output but same first chance exception.
The behavior seems to be the same regardless of if I’m in emulation mode or not.
I’m running this on XPsp2/Visual Studio 2005sp1/Gforce 8800.
Any help will be greatly appreciate!
Thanks,
Mike