cudaErrorInvalidDeviceFunction: simple program throwing a cudaErrorInvalidDeviceFunction error

Hi,

I just started getting my hands dirty with CUDA programming. I am using a GTX 285 on Windows Server 2003, with VS2005 as the IDE.

I am trying to execute the code below on the device, and it throws cudaErrorInvalidDeviceFunction in both debug and release mode; in emulation mode, however, it works fine. I am not sure where to start troubleshooting, so any pointers would be helpful :)

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>

// NOTE(review): EMU_RELEASE is not referenced anywhere in the visible code;
// presumably a leftover build-mode switch -- confirm before removing.
#define EMU_RELEASE 0

// Kernel prototype. The __global__ qualifier is required for any function
// launched with the <<<...>>> syntax; a bare `global` is not valid CUDA.
__global__ void incrementArrayOnDevice(float *a, int N);

// Inspects a CUDA status code and exits the program when it reports an error.
void checkError(cudaError_t mem_error);

/*
 * Host driver: fills a small host array with zeros, copies it to the device,
 * launches incrementArrayOnDevice over it, copies the result back and prints
 * the values before and after.
 *
 * Every CUDA runtime call is passed through checkError(), which is expected
 * to terminate the program on failure.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main(int argc, char** argv)
{
    const int N = 10;
    const int BLOCK_SIZE = 4;
    const size_t bytes = sizeof(float) * N;
    float *a_h = NULL, *b_h = NULL;
    float *a_d = NULL;
    int count = 0;
    int nBlocks = 0;
    cudaError_t mem_error;

    (void)argc;
    (void)argv;

    // allocate memory on the host; bail out before touching a NULL buffer
    a_h = (float *)malloc(bytes);
    b_h = (float *)malloc(bytes);
    if (a_h == NULL || b_h == NULL)
    {
        fprintf(stderr, "\nHost memory allocation failed.\n");
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    for (count = 0; count < N; count++)
    {
        a_h[count] = 0;
        b_h[count] = 0;
    }

    for (count = 0; count < N; count++)
    {
        printf(" %lf", b_h[count]);
    }

    // allocate memory on the device
    mem_error = cudaMalloc((void **) &a_d, bytes);
    checkError(mem_error);

    // copy values of the array to device for calculations
    mem_error = cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice);
    checkError(mem_error);

    // ceil(N / BLOCK_SIZE): enough blocks to give every element a thread
    nBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    incrementArrayOnDevice<<<nBlocks, BLOCK_SIZE>>>(a_d, N);
    // a launch returns no status itself; launch errors are read back here
    mem_error = cudaGetLastError();
    checkError(mem_error);

    // blocking copy: waits for the kernel to finish before reading a_d
    mem_error = cudaMemcpy(b_h, a_d, bytes, cudaMemcpyDeviceToHost);
    checkError(mem_error);
    printf("\n After copy...");
    for (count = 0; count < N; count++)
    {
        printf(" %lf", b_h[count]);
    }

    // release device and host buffers (previously leaked)
    mem_error = cudaFree(a_d);
    checkError(mem_error);
    free(a_h);
    free(b_h);

    system("pause");
    return 0;
}

/*
 * Kernel: adds 1 to each of the first N elements of a.
 *
 * Each thread handles the single element whose index equals its flat global
 * thread index (blockIdx.x * blockDim.x + threadIdx.x); threads whose index
 * is >= N do nothing, so the launch may contain more threads than elements.
 *
 * a - array of at least N floats; it is dereferenced on the device, so it
 *     must be a device-accessible pointer
 * N - number of elements to increment
 */
__global__ void incrementArrayOnDevice(float *a, int N)
{
    int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadID < N)
    {
        a[threadID] += 1.0f;
        // printf("\n%f", a[threadID]);
    }
}

/*
 * Reports the outcome of a CUDA runtime call.
 *
 * mem_error - status code returned by a CUDA runtime function (or by
 *             cudaGetLastError() after a kernel launch)
 *
 * On cudaSuccess prints "No errors." and returns. On any other code prints
 * the numeric code together with the runtime's own description
 * (cudaGetErrorString), waits for a key press and terminates the process
 * with a non-zero exit status.
 *
 * Using cudaGetErrorString instead of a hand-written list means no error
 * code can fall through unreported, and the function does not depend on
 * which cudaError* enumerators a given toolkit version happens to define.
 */
void checkError(cudaError_t mem_error)
{
    if (mem_error == cudaSuccess)
    {
        printf("No errors.");
        return;
    }

    fprintf(stderr, "\nCUDA error %d: %s\n",
            (int)mem_error, cudaGetErrorString(mem_error));
    system("pause");
    // a failure must not report success to the calling shell
    exit(EXIT_FAILURE);
}

Cheers

Sorry, the problem was not with the code but with the nvcc command-line arguments. All I had to do was use the correct project-creation wizard (I had installed two CUDA project-creation templates; I guess one comes with the CUDA SDK, and the other, from the forum, is not very reliable) and set the appropriate lib path for the project, as described in the CUDA docs.