cudaLaunch fails with cudaErrorInvalidDeviceFunction when the kernel is given by name

I’m trying to use cudaLaunch to be able to specify which kernel I’d like to execute on a particular piece of data, and every time I call cudaLaunch, I get a cudaErrorInvalidDeviceFunction error. Any help anyone can provide would be much appreciated.

System/CUDA Info

32-bit machine

CentOS release 5.2


Cuda 2.2

GeForce 8800 GTX


All the files are in the same directory and compile without errors. The code should generate the numbers 0 to 255, print them, pass them to the kernel to be scaled by 3, and then print the output values. The main function is shown first:

#include <iostream>

#include <stdlib.h>

#include <string>

#include "dat_gpu.h"

#define N_ELEM 256

int main(int argc, char *argv[])


	  int inBuff[N_ELEM];

	int outBuff[N_ELEM];

	int i = 0;

	int dataSize = N_ELEM * sizeof(int);

	memset(inBuff, 0, dataSize);

	memset(outBuff, 0, dataSize);

	printf("Input Vector: ");

	for (i = 0; i < N_ELEM; i++) {

		inBuff[i] = i;

		printf("%d ",inBuff[i]);



	dat_gpu("scale_vect_op", inBuff, outBuff);

	printf("Output Vector: ");

	for (i = 0; i < N_ELEM; i++) {

		printf("%d ", outBuff[i]);



return 0;


Not shown: dat_gpu.h, which only declares the dat_gpu function. The file that defines dat_gpu and the kernel follows:

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#include ""

/**
 * Runs the kernel named by `func` over a 256-element int vector.
 *
 * Copies `inData` to the device, launches the kernel with one block of 256
 * threads and 256 * sizeof(int) bytes of dynamic shared memory, then copies
 * the result into `outData`. The launch goes through the execution-control
 * calls cudaConfigureCall / cudaSetupArgument / cudaLaunch.
 *
 * The argument stack is laid out for a kernel with the signature
 * (int *in, int *out, int scale); `scale` is fixed at 3.
 *
 * @param func     name of the __global__ function to launch
 * @param inData   host buffer holding 256 ints to read
 * @param outData  host buffer receiving 256 ints
 * @return 0 on success, 1 if an argument is NULL or any CUDA call fails
 *         (the CUDA error string is written to stderr)
 */
extern "C" int dat_gpu(const char * func, int * inData, int * outData) {
   const int n_elem = 256;                      // element count; the caller's buffers must match
   const int data_size = n_elem * sizeof(int);
   const int scale = 3;                         // third kernel argument
   int * gpu_in_data = NULL;
   int * gpu_out_data = NULL;
   size_t offset = 0;                           // running byte offset into the kernel argument stack
   cudaError_t err;

   if (func == NULL || inData == NULL || outData == NULL) {
      fprintf(stderr, "ERROR: dat_gpu called with a NULL argument\n");
      return 1;
   }

   // Device selection stays best-effort, as in the original: the result is
   // not treated as fatal. NOTE(review): presumably this can fail on a
   // repeated call once a context exists -- confirm for the target toolkit.
   cudaSetDevice( cutGetMaxGflopsDeviceId() );
   cudaGetLastError();                          // clear whatever the call above may have left behind

   // From here on, each step runs only if every earlier step succeeded.
   err = cudaMalloc((void**)&gpu_in_data, data_size);
   if (err == cudaSuccess) err = cudaMalloc((void**)&gpu_out_data, data_size);
   if (err == cudaSuccess) err = cudaMemset(gpu_out_data, 0, data_size);
   // The input buffer needs no memset: the copy below overwrites all of it.
   if (err == cudaSuccess) err = cudaMemcpy(gpu_in_data, inData, data_size, cudaMemcpyHostToDevice);

   if (err == cudaSuccess) {
      dim3 grid(1, 1, 1);
      dim3 block(n_elem, 1, 1);
      // Third argument: bytes of dynamic shared memory per block.
      err = cudaConfigureCall(grid, block, data_size);
   }

   // Push the three kernel arguments in declaration order. Two pointers
   // followed by an int keep every offset naturally aligned.
   if (err == cudaSuccess) {
      err = cudaSetupArgument(&gpu_in_data, sizeof(gpu_in_data), offset);
      offset += sizeof(gpu_in_data);
   }
   if (err == cudaSuccess) {
      err = cudaSetupArgument(&gpu_out_data, sizeof(gpu_out_data), offset);
      offset += sizeof(gpu_out_data);
   }
   if (err == cudaSuccess) {
      err = cudaSetupArgument(&scale, sizeof(scale), offset);
      offset += sizeof(scale);
   }

   if (err == cudaSuccess) err = cudaLaunch(func);

   // The blocking copy waits for the kernel and reports a failed execution.
   if (err == cudaSuccess) err = cudaMemcpy(outData, gpu_out_data, data_size, cudaMemcpyDeviceToHost);

   if (err != cudaSuccess) {
      fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(err));
   }

   // Release the device buffers on every path; a NULL pointer is passed when
   // the matching allocation never happened.
   cudaFree(gpu_in_data);
   cudaFree(gpu_out_data);

   return (err == cudaSuccess) ? 0 : 1;
}


/**
 * Multiplies every input element by `scale`, one element per thread.
 *
 * Launch layout: one 1-D block. Thread threadIdx.x handles element
 * threadIdx.x; blockIdx is not consulted, so extra blocks would only repeat
 * the same elements.
 * Dynamic shared memory: blockDim.x * sizeof(int) bytes.
 * Precondition: there is no bounds check, so both arrays must hold at least
 * blockDim.x ints.
 *
 * NOTE(review): the original declaration also carried `static`. A brace-less
 * extern "C" declaration may not name a storage class, and internal linkage
 * presumably hides the kernel from a by-name launch -- confirm against the
 * toolkit in use.
 */
extern "C" __global__ void scale_vect_op( int * inValues, int * outValues, int scale ) {
   extern __shared__ int shared[];

   const unsigned int tid = threadIdx.x;

   // Each thread reads back only the slot it wrote, so no __syncthreads() is needed.
   shared[tid] = inValues[tid];
   outValues[tid] = shared[tid] * scale;
}



Thanks for any help!

UPDATE: Passing the function pointer to cudaLaunch works; passing the function name as a string does not. According to the reference manual, both should work.