cudaLaunchKernel failed to launch kernel


I am trying to launch kernel function using the runtime API. For some reason, I am not able the directly call cudaLaunchKernel. Instead, I have call a function that calls cudaLaunchKernel inside it. Here is an example:


void hello()
printf(“hello from kernel. \n”);

template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
cudaError_t res;
dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
if (res != CUDA_SUCCESS)
char msg[256];
printf (“error during kernel launch \n”);
return -1;
return 0;

int main(void)
float *hx, *dx;
hx = (float*)malloc(32 * sizeof(float));
cudaMalloc(&dx, 32 * sizeof(float));
unsigned int threads = 32;
unsigned int blocks = 1;
///////////////// option 1: directly call runtime api: cudaLaunchKernel /////////////////
//cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
///////// option 2: call a function which further calls cudaLaunchKernel /////////
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
launchKernel (hello , grid3d , block3d);
cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
return 0;

Option 1, which directly calls the cudaLaunchKernel works. However, option 2, which indirectly invokes the cudaLaunchKernel, does not work. Using option 2, no message was printed from the device, and the return value is not equal to CUDA_SUCCESS.

I was wondering if anyone has any insights into this problem.

Thank you in advance for your help and time.

The issue was resolved by Robert’s solution.