Launching Kernels with Runtime Calls Instead of <<< >>> Notation?


I’m having some issues launching kernels without using the NVCC-specific <<< >>> chevron notation. Here’s a simple example:

#include <stdio.h>

#include <cuda_runtime.h>


/**
 * Copies `input` into `output` element-wise, one element per thread.
 *
 * Each thread handles the element at its flat 1D global index
 * (blockDim.x * blockIdx.x + threadIdx.x). Threads whose index is at or
 * beyond `size` do nothing, so the launch may contain more threads than
 * there are elements.
 *
 * @param output destination array (dereferenced on the device); must hold
 *               at least `size` ints
 * @param input  source array (dereferenced on the device); must hold at
 *               least `size` ints
 * @param size   number of elements to copy
 */
__global__ void testKernel(int* output, int* input, int size)
{
	int i = blockDim.x * blockIdx.x + threadIdx.x;

	// Guard the grid tail: without this check any thread with i >= size
	// would read and write past the end of both buffers.
	if (i < size)
		output[i] = input[i];
}


// Host driver for the example: fills a 256-int device buffer with the value
// 10, zeroes a second device buffer, launches testKernel to copy the first
// into the second, reports any CUDA error, and prints three sample elements
// of the result.
//
// NOTE(review): this excerpt shows no braces around the function body, and
// both the <<< >>> launch and the cudaConfigureCall/cudaSetupArgument
// sequence appear unconditionally. The surrounding text refers to an
// NVCC_LAUNCH option and to a cudaLaunch() call, neither of which is visible
// here -- presumably an #ifdef/#else/#endif and the cudaLaunch line were lost
// from the listing. Confirm against the original source before building.
int main(int argc, char** argv)




	// Element count and the matching byte size used for every allocation/copy.
	int N = 256;

	size_t size = N * sizeof(int);


	// Host-side staging buffers (released with delete[] at the end).
	int* h_input = new int[N];

	int* h_output = new int[N];


	// Device-side buffers, filled in by cudaMalloc below.
	int* d_input;

	int* d_output;


	// NOTE(review): the return codes of these runtime calls are not checked.
	cudaMalloc((void**)&d_input, size);

	cudaMalloc((void**)&d_output, size);


	// Every input element is 10, so a successful copy prints 10s and a
	// failed launch leaves the zeroes written by cudaMemset.
	for (int i = 0; i < N; i++)		h_input[i] = 10;

	cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

	cudaMemset(d_output, 0, size);


	// Launch shape: a single block of 256 threads, i.e. one thread per element
	// for N = 256.
	int blocks = 1;

	int threads = 256;


	dim3 gridDim(blocks, 1, 1);

	dim3 blockDim(threads, 1, 1);



	// Launch variant 1: NVCC chevron syntax.
	testKernel<<<gridDim, blockDim>>>(d_output, d_input, N);


	// Launch variant 2: explicit runtime calls. Configure the grid/block
	// dimensions (third argument is 0), then push the kernel arguments.
	cudaConfigureCall(gridDim, blockDim, 0);


	// Running byte offset of the next argument.
	size_t offset = 0;


	// Arguments are pushed in the kernel's parameter order
	// (output, input, size); the offset advances by the size of each pointer.
	cudaSetupArgument(d_output, offset);

	offset += sizeof(d_output);


	cudaSetupArgument(d_input, offset);

	offset += sizeof(d_input);


	cudaSetupArgument(N, offset);




	// NOTE(review): no cudaLaunch(...) call is visible after the argument
	// setup in this excerpt -- see the note at the top of the function.


	// Fetch the most recent runtime error, if any.
	cudaError_t err = cudaGetLastError();


	// The error is only reported; execution continues regardless.
	if(err != cudaSuccess)	    printf("ERROR: %s\n", cudaGetErrorString(err));


	// Bring the result back to the host for inspection.
	cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);


	// Sample the first, a middle, and the last element.
	printf("output[0] = %d\n", h_output[0]);

	printf("output[127] = %d\n", h_output[127]);

	printf("output[255] = %d\n", h_output[255]);




	// NOTE(review): d_input and d_output are not released with cudaFree in
	// the visible code; only the host buffers are freed.
	delete[] h_input;

	delete[] h_output;


	return 0;


I compile this example like this (x86_64 linux):

/apps/Linux64/cuda/cuda-3.2/bin/nvcc -m64 -arch sm_21 -o main

This works perfectly with the NVCC_LAUNCH option enabled:

output[0] = 10

output[127] = 10

output[255] = 10

But when I disable it, I get an “invalid device function” error:

ERROR: invalid device function

output[0] = 0

output[127] = 0

output[255] = 0

Can anyone spot any errors in this example or point to why this might not be working?

I’m using a Fermi GPU (tried both Tesla C2050 and Quadro 4000) with CUDA 3.2 and sm_21.



You probably need to use the C++ mangled name of the compiled kernel in the argument supplied to cudaLaunch() (or declare the kernel as extern “C” if you are not using any C++ features). Certainly that is how it works in the driver API.

Spot on, thanks. Name mangling seems to have tripped me up quite a few times recently!