"invalid device function" when testing CUDA in a DMO

I’m new to CUDA and, as a learning exercise, I’m attempting to incorporate the Image Denoising sample into a DMO to be used with DShow. In my DMO (.cpp), I make this call:
{
init();

	displayFunc(  pbSource, pbTarget, 
			dwWidth, dwHeight,
			dwWidthOut, dwHeightOut );
	
	close();
}

where init() is in imageDenoiser.cu:
extern "C"
void init()
{
CUT_DEVICE_INIT();
}

and close() is in imageDenoiser.cu:
extern "C"
void close()
{
CUT_EXIT(0, NULL);
}

and displayFunc() is in imageDenoiser.cu:

extern "C"
void displayFunc( unsigned char *h_DataSrc, unsigned char *h_DataDst,
int imgWidthSrc, int imgHeightSrc,
int imgWidthDst, int imgHeightDst )
{
TColor *d_dst = NULL;

unsigned char *d_DataSrc, *d_DataDst;
double timerValue;
unsigned int hTimer;

// Input data size (Source)
int	IMG_WIDTH_SRC = imgWidthSrc;
int IMG_HEIGHT_SRC = imgHeightSrc;

// Output data size (Source)
int	IMG_WIDTH_DST = imgWidthDst;
int IMG_HEIGHT_DST = imgHeightDst;

int DATA_N = IMG_WIDTH_SRC * IMG_HEIGHT_SRC;

int	DATA_SIZE_SRC = DATA_N * sizeof(unsigned char) * 4;
int	DATA_SIZE_DST = IMG_WIDTH_DST * IMG_HEIGHT_DST * sizeof(unsigned char) * 4;

CUT_SAFE_CALL( cutCreateTimer(&hTimer) );
CUT_SAFE_CALL( cutStartTimer(hTimer)   );
{
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_DataSrc,   DATA_SIZE_SRC ) );
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_DataDst,   DATA_SIZE_DST ) );

	CUDA_SAFE_CALL( cudaMemcpy(d_DataSrc, h_DataSrc, DATA_SIZE_SRC, cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(d_DataDst, h_DataDst, DATA_SIZE_DST, cudaMemcpyHostToDevice) );
}

{

	CUDA_SAFE_CALL( cudaThreadSynchronize() );
	CUT_SAFE_CALL( cutResetTimer(hTimer) );
	CUT_SAFE_CALL( cutStartTimer(hTimer) );
    KNNdiag(/*d_dst,*/ imgWidthSrc, imgHeightSrc, 1.0f / (knnNoise * knnNoise), lerpC, (unsigned int *)d_DataSrc, (unsigned int *)d_DataDst);

	CUDA_SAFE_CALL( cudaThreadSynchronize() );
	CUT_SAFE_CALL(cutStopTimer(hTimer));
	timerValue = cutGetTimerValue(hTimer);

}

CUDA_SAFE_CALL( cudaMemcpy(h_DataDst, d_DataDst, DATA_SIZE_DST, cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy(h_DataSrc, d_DataSrc, DATA_SIZE_SRC, cudaMemcpyDeviceToHost) );


CUT_SAFE_CALL(cutDeleteTimer(hTimer));
CUDA_SAFE_CALL( cudaFree(d_DataDst) );
CUDA_SAFE_CALL( cudaFree(d_DataSrc) );
free(h_DataDst);
free(h_DataSrc);

}

and KNNdiag() is in imageDenoiser_knn_kernel.cu:

void KNNdiag(
// TColor *dst,
int imageW,
int imageH,
float Noise,
float lerpC,
unsigned int *d_DataSrc,
unsigned int *d_DataDst)
{
dim3 dimBlock(8, 8, 1);
dim3 dimGridDst(iDivUp(imageW, dimBlock.x), iDivUp(imageH, dimBlock.y), 1);

KNN_kernel<<<dimGridDst, dimBlock>>>(/*d_dst,*/ imageW, imageH, Noise, lerpC, (uchar4*)d_DataSrc, (uchar4*)d_DataDst);
cudaError_t err = cudaGetLastError();
const char* zsErr = cudaGetErrorString( err );
return;

}

and KNN_kernel<<<>>>() is in imageDenoiser_knn_kernel.cu:
__global__ void KNN_kernel(
int imageW,
int imageH,
float Noise,
float lerpC,
uchar4 *d_DataSrc,
uchar4 *d_DataDst)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
//Add half of a texel to always address exact texel centers
const float x = (float)ix + 0.5f;
const float y = (float)iy + 0.5f;

}

The kernel is cut short, as I’m just trying to get a clean run…

My problem is this:

When I make this call KNN_kernel<<<dimGridDst, dimBlock>>>(/*d_dst,*/ imageW, imageH, Noise, lerpC, (uchar4*)d_DataSrc, (uchar4*)d_DataDst);

I get a cudaError_t that says “invalid device function”, and a “First-chance exception at 0x7c812a5b in graphedt.exe: Microsoft C++ exception: cudaError at memory location 0x030df894…” in the output window.

If I use CUT_ERROR_CHECK instead of
cudaError_t err = cudaGetLastError();
const char* zsErr = cudaGetErrorString( err );
the process fails with no output, but with the same first-chance exception.
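For reference, the pattern I’m using around the launch looks roughly like this (a minimal sketch on my side; the printf reporting is mine, not the SDK macros):

KNN_kernel<<<dimGridDst, dimBlock>>>(imageW, imageH, Noise, lerpC,
                                     (uchar4*)d_DataSrc, (uchar4*)d_DataDst);

// Launch/configuration errors (e.g. "invalid device function") show up here...
cudaError_t launchErr = cudaGetLastError();
if (launchErr != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(launchErr));

// ...while errors raised during kernel execution only surface after a sync.
cudaError_t syncErr = cudaThreadSynchronize();
if (syncErr != cudaSuccess)
    printf("kernel execution failed: %s\n", cudaGetErrorString(syncErr));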

The behavior seems to be the same regardless of whether I’m in emulation mode or not.

I’m running this on XP SP2 / Visual Studio 2005 SP1 / GeForce 8800.

Any help will be greatly appreciated!

Thanks,

Mike

Have you tried running in the debug device emulation mode? I assume lerpC and knnNoise are global variables, because I do not see them defined prior to calling KNNdiag().

Thanks for the response. I’ve tried this in emulation mode and get the same error. The vars are global.

I’ve modified the code somewhat so that KNNdiag now looks like this:

void KNNdiag(
TColor *d_dst,
int imageW,
int imageH,
float Noise,
float lerpC,
unsigned int *d_DataSrc,
unsigned int *d_DataDst)
{
cudaError_t err;

dim3 dimBlock(8, 8, 1);
dim3 dimGridDst(iDivUp(imageW, dimBlock.x), iDivUp(imageH, dimBlock.y), 1);

uchar4tex = cudaCreateChannelDesc<uchar4>();

err = cudaMallocArray( &a_Src, &uchar4tex, imageW, imageH );

size_t x = sizeof( uchar4 );

err = cudaMemcpyToArray( a_Src, 0, 0, ( void* )d_DataSrc, sizeof( uchar4 ) * imageW * imageH, cudaMemcpyDeviceToDevice);

// Bind the array to the texture
err = cudaGetTextureReference( &texImageRef, "texImage" );
err = cudaBindTextureToArray( texImageRef, a_Src, &uchar4tex );

KNN_kernel<<<dimGridDst, dimBlock>>>( a_Src, imageW, imageH, Noise, lerpC, (uchar4*)d_DataDst);

err = cudaGetLastError();
const char* szErr = cudaGetErrorString( err );

return;
}

and moved the declarations of

texture<uchar4, 2> texImage;
textureReference* texImageRef;
cudaChannelFormatDesc uchar4tex;
cudaArray *a_Src;

from imageDenoising.cu to imageDenoising_knn_kernel.cu.

Now, I’m getting an error when I try to get the texture reference:

err = cudaGetTextureReference( &texImageRef, "texImage" );

This returns a cudaErrorInvalidTexture…
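One thing I may try next (just a sketch on my side, assuming texImage is declared at file scope in the same compilation unit as KNNdiag) is skipping the name lookup and binding through the typed overload instead:

texture<uchar4, 2> texImage;                       // file-scope texture reference

cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaArray *a_Src = NULL;
cudaMallocArray(&a_Src, &desc, imageW, imageH);
cudaMemcpyToArray(a_Src, 0, 0, d_DataSrc,
                  imageW * imageH * sizeof(uchar4), cudaMemcpyDeviceToDevice);

// Typed overload of the bind; no cudaGetTextureReference / string lookup needed.
cudaBindTextureToArray(texImage, a_Src, desc);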

Is it possible that the context is not initialized correctly? I can run samples on the same machine that work, but they all have a main within a .cu file and the process starts from there. In my case, the CUDA initialization code is done within a DLL and graphedt.exe is the process hosting it. I wouldn’t think that would matter. I’m going to take my code into a simpler test where I have a main inside a .cu to see if that makes a difference.
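In case the DLL path is the issue, a cutil-free initialization I could call from the DMO instead of CUT_DEVICE_INIT() might look like this (my own sketch; initCudaDevice is a hypothetical name, not part of the sample):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical replacement for CUT_DEVICE_INIT() inside the DMO/DLL:
// just verify a device exists and select device 0; the context is
// created lazily on the first runtime API call after this.
extern "C" bool initCudaDevice()
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        printf("No CUDA-capable device found.\n");
        return false;
    }
    if (cudaSetDevice(0) != cudaSuccess) {
        printf("cudaSetDevice(0) failed.\n");
        return false;
    }
    return true;
}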

Thanks,

Mike

Textures cannot be shared between two different .cu files compiled in separate compilation units. You are probably binding the texture in one compilation unit and then trying to use it in another: hence the invalid texture error.

The only way to fix this is to #include all the .cu files together into one file which is the only one you actually compile. Yes, it is an ugly kludge, but it is the only way.
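Roughly, the layout looks like this (a sketch using the file names from your posts):

// imageDenoiser.cu -- the only .cu file handed to nvcc
#include <cutil.h>
#include "imageDenoiser_knn_kernel.cu"   // texImage, KNN_kernel and KNNdiag end up
                                         // in this same compilation unit

extern "C" void init()  { CUT_DEVICE_INIT(); }
extern "C" void close() { CUT_EXIT(0, NULL); }
// displayFunc() also lives here, so the code that binds the texture and the
// kernel that reads it are compiled together.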

I have two .cu files, imageDenoiser.cu and imageDenoiser_knn_kernel.cu. The _kernel file is #included in the first .cu, and only imageDenoiser.cu is compiled. I ran a test with the original imageDenoiser sample that I downloaded from the NVIDIA site and included my two files in it. The main function in imageDenoiser.cu, which I have removed from my DShow filter because it is a dll, looks like this:

int main(int argc, char **argv){

#ifndef __DEVICE_EMULATION__
if( CUTFalse == isInteropSupported()) {
    return 1;
}
#endif // __DEVICE_EMULATION__

g_Kernel = atoi( argv[ 1 ] );

CUT_DEVICE_INIT();

printf("Allocating host and CUDA memory and loading image file...\n");

// const char *image_path = cutFindFilePath("G:\avi\files\raw_bmp\test_uncmp000000.bmp", argv[0]);
    LoadBMPFile(&h_Src, &imageW, &imageH, "G:\\avi\\objecttracker\\raw_bmp\\test_uncmp000000.bmp"/*image_path*/);

printf("Data init done.\n");

displayFunc(  h_Src, NULL,
		imageW, imageH,
		imageH, imageH );

CUT_EXIT(argc, argv);
}

This is basically a cut-down version of what comes with the sample, and LoadBMPFile is the same function as in the sample. The displayFunc is as listed above, but I changed the input from unsigned char* to uchar4*. Otherwise, all looks to be the same. I have the same compiler options for the .cu in my filter as the sample has. Is there anything in the nvidia compiler that is specific to targeting an exe vs. a dll?

Thanks,

Mike

Hmm, so you already are including the files together. I’ve only seen the invalid texture reference error for 2 reasons: 1) binding the texture in another compilation unit and 2) compiling with the sm_11 option and trying to run on sm_10 hardware.
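A quick way to rule out case 2 (just a sketch, not part of the SDK) is to print the compute capability the runtime reports and compare it with the sm_xx value you pass to nvcc:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int dev = 0;
    cudaDeviceProp prop;
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&prop, dev);
    // "invalid device function" usually means the cubin was built for a
    // higher sm_xx than the value printed here.
    printf("Device %d: %s, compute capability %d.%d\n",
           dev, prop.name, prop.major, prop.minor);
    return 0;
}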

I don’t know anything about windows dlls and CUDA, sorry. On linux, nothing special needs to be done to use CUDA in a shared library.

This turned out to be a problem with the C runtime not being initialized properly. I now have this working in emulation mode, but I won’t be able to check until Monday whether it works on the actual device…

Thanks for the help.

Mike

Hi mbx,

Could you post what your solution actually was? Thx!

raphael

MisterAnderson42 – I know this is an old post, but I just ran across this problem and your solution corrected my issue (invalid sm_xx option). Just wanted to say thanks!!