Launching Kernels with Runtime Calls Instead of <<< >>> Notation?

Hi,

I’m having some issues launching kernels without using the NVCC-specific <<< >>> chevron notation. Here’s a simple example:

#include <stdio.h>

#include <cuda_runtime.h>

#define NVCC_LAUNCH

// Element-wise copy: each thread copies input[i] to output[i] for its own
// global index i.
//
// Launch layout: 1D grid of 1D blocks; i is derived from blockIdx.x,
// blockDim.x and threadIdx.x only. No shared memory is used.
//
//   output - destination array, at least `size` ints
//   input  - source array, at least `size` ints
//   size   - number of elements to copy; threads with i >= size do nothing
__global__ void testKernel(int* output, int* input, int size)
{
	int i = blockDim.x * blockIdx.x + threadIdx.x;

	// Guard the tail of the grid: without this check a launch with more
	// threads than elements would read and write past the end of the arrays.
	if (i < size)
		output[i] = input[i];
}

// Test driver: fills a host array with 10s, copies it to the device, runs
// testKernel to copy it into a zeroed output buffer, then copies the result
// back and prints three sample elements.
//
// The kernel is launched either with the <<< >>> syntax (when NVCC_LAUNCH is
// defined) or with the cudaConfigureCall / cudaSetupArgument / cudaLaunch
// runtime calls (when it is not).
//
// argc and argv are not used. Always returns 0; the only error reported is
// the one read by cudaGetLastError() after the launch, which is printed.
int main(int argc, char** argv)

{

	cudaSetDevice(0);

	// Element count and the matching buffer size in bytes.

	int N = 256;

	size_t size = N * sizeof(int);

	// Host-side buffers for the input data and the copied-back result.

	int* h_input = new int[N];

	int* h_output = new int[N];

	// Device-side counterparts, allocated just below.

	int* d_input;

	int* d_output;

	// NOTE: the return codes of the allocation and copy calls are not checked.

	cudaMalloc((void**)&d_input, size);

	cudaMalloc((void**)&d_output, size);

	// Input is all 10s; output starts zeroed so an un-run kernel shows up as 0s.

	for (int i = 0; i < N; i++)		h_input[i] = 10;

	cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

	cudaMemset(d_output, 0, size);

	// Launch configuration: one block of 256 threads, i.e. one thread per element.

	int blocks = 1;

	int threads = 256;

	// These locals share their names with the device built-ins gridDim/blockDim.

	dim3 gridDim(blocks, 1, 1);

	dim3 blockDim(threads, 1, 1);

	// Two alternative launch paths, selected at compile time by NVCC_LAUNCH.

#ifdef NVCC_LAUNCH

	testKernel<<<gridDim, blockDim>>>(d_output, d_input, N);

#else

	cudaConfigureCall(gridDim, blockDim, 0);

	// Arguments are pushed one at a time; offset is the running byte position.

	size_t offset = 0;

	// First argument: output pointer at offset 0.

	cudaSetupArgument(d_output, offset);

	offset += sizeof(d_output);

	// Second argument: input pointer, placed right after the first pointer.

	cudaSetupArgument(d_input, offset);

	offset += sizeof(d_input);

	// Third argument: element count, placed after both pointers.

	cudaSetupArgument(N, offset);

	// NOTE(review): kernel is named by a plain string here; testKernel is not extern "C", so its compiled symbol is presumably C++-mangled - confirm.

	cudaLaunch("testKernel");

#endif

	// Wait for the device to finish before reading the error state.

	cudaThreadSynchronize();

	// Read the last error recorded by the runtime.

	cudaError_t err = cudaGetLastError();

	// An error is only printed; execution continues either way.

	if(err != cudaSuccess)	    printf("ERROR: %s\n", cudaGetErrorString(err));

	// Copy the result back to the host.

	cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);

	// Print the first, a middle and the last element; each is 10 if the copy ran.

	printf("output[0] = %d\n", h_output[0]);

	printf("output[127] = %d\n", h_output[127]);

	printf("output[255] = %d\n", h_output[255]);

	// Release device buffers.

	cudaFree(d_input);

	cudaFree(d_output);

		// Release host buffers.

	delete[] h_input;

	delete[] h_output;

	

	return 0;

}

I compile this example like this (x86_64 linux):

/apps/Linux64/cuda/cuda-3.2/bin/nvcc -m64 -arch sm_21 -o main main.cu

Which works perfectly with the NVCC_LAUNCH option enabled:

output[0] = 10

output[127] = 10

output[255] = 10

but when I disable this, I get an “invalid device function” error:

ERROR: invalid device function

output[0] = 0

output[127] = 0

output[255] = 0

Can anyone spot any errors in this example or point to why this might not be working?

I’m using a Fermi GPU (tried both Tesla C2050 and Quadro 4000) with CUDA 3.2 and sm_21.

Thanks,

Dan

You probably need to use the C++ mangled name of the compiled kernel in the argument supplied to cudaLaunch() (or declare the kernel as extern “C” if you are not using any C++ features). Certainly that is how it works in the driver API.

Spot on, thanks. Name mangling seems to have tripped me up quite a few times recently!