CUDA driver call returns error 1 (cudaErrorMissingConfiguration)

Hello,

I’m trying to load CUDA driver API functions at runtime with dlsym, and I have encountered a strange error. I have this code, which runs smoothly on my system (compiled with nvcc, etc.):

#include <cuda.h>

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Print a diagnostic on stderr and terminate with a failure status when a
 * driver API call did not succeed. call_name is the API function name shown
 * in the message. Nothing is released explicitly on this path: the process
 * exits immediately.
 */
static void check_cu(CUresult status, const char *call_name)
{
  if (status != CUDA_SUCCESS) {
    fprintf(stderr, "Error! %s returned: %d\n", call_name, (int)status);
    exit(EXIT_FAILURE);
  }
}

/*
 * Loads the PTX module named by argv[1] on the CUDA device whose ordinal is
 * given in argv[2], launches its "testing" kernel on a single unsigned int
 * (initially 7) and prints the value copied back from the device.
 *
 * Returns EXIT_SUCCESS when every step succeeds. Exits with EXIT_FAILURE on
 * bad command-line arguments or when any driver API call reports an error.
 */
int main(int argc, char *argv[])
{
  if (argc < 3) {
    fprintf(stderr, "Usage: ./test.cu <ptx_file> <cuda_device>\n");
    return EXIT_FAILURE;
  }

  /* Parse the device ordinal strictly: atoi() would silently turn a
   * non-numeric argument into device 0. */
  char *end = NULL;
  errno = 0;
  long ordinal = strtol(argv[2], &end, 10);
  if (end == argv[2] || *end != '\0' || errno == ERANGE ||
      ordinal < 0 || ordinal > INT_MAX) {
    fprintf(stderr, "Error! invalid cuda device: %s\n", argv[2]);
    return EXIT_FAILURE;
  }

  /* Host-side value the kernel operates on, and its size in bytes. */
  unsigned int h_var = 7;
  const size_t size = sizeof h_var;

  /* Initialize driver API */
  check_cu(cuInit(0), "cuInit");

  /* Get a handle to the requested device */
  CUdevice cu_device;
  check_cu(cuDeviceGet(&cu_device, (int)ordinal), "cuDeviceGet");

  /* Create context to run on device */
  CUcontext cu_context;
  check_cu(cuCtxCreate(&cu_context, 0, cu_device), "cuCtxCreate");

  /* Load ptx code */
  CUmodule cu_module;
  check_cu(cuModuleLoad(&cu_module, argv[1]), "cuModuleLoad");

  /* Get kernel function */
  CUfunction func;
  check_cu(cuModuleGetFunction(&func, cu_module, "testing"),
           "cuModuleGetFunction");

  /* Allocate device memory */
  CUdeviceptr var;
  check_cu(cuMemAlloc(&var, size), "cuMemAlloc");

  /* Copy variable from host to device */
  check_cu(cuMemcpyHtoD(var, &h_var, size), "cuMemcpyHtoD");

  /* Launch kernel: one block of one thread, no shared memory, default
   * stream. The only kernel parameter is the device pointer. */
  void *args[] = { &var };
  check_cu(cuLaunchKernel(func, 1, 1, 1, 1, 1, 1, 0, NULL, args, NULL),
           "cuLaunchKernel");

  /* Get result back to host */
  check_cu(cuMemcpyDtoH(&h_var, var, size), "cuMemcpyDtoH");

  /* Free device memory */
  check_cu(cuMemFree(var), "cuMemFree");

  /* Destroy context */
  check_cu(cuCtxDestroy(cu_context), "cuCtxDestroy");

  /* Print result; h_var is unsigned, so use %u */
  printf("var: %u\n", h_var);

  return EXIT_SUCCESS;
}

together with this PTX code, which I wrote myself (it’s a simple addition, just to test that it works):

.version 1.4
.target sm_10, map_f64_to_f32

// testing: adds 3, in place, to the 32-bit unsigned integer that mynum
// points to. mynum is a 64-bit global-memory address.
.entry testing (
  .param .u64 mynum)
{
  .reg .u64 %r;   // address of the value
  .reg .u32 %i;   // the value itself

  ld.param.u64 %r,[mynum];

  // Use 32-bit accesses: the host allocates sizeof(unsigned int) = 4 bytes,
  // so a u64 load/store would touch 4 bytes past the end of that buffer.
  ld.global.u32 %i,[%r];
  add.u32 %i,%i,3;
  st.global.u32 [%r+0],%i;

  exit;
}

So far so good. Then I took the code and loaded every function with dlsym, like this:

/* Lauch kernel */

  void *args[] = {&var};

  int (*_cuLaunchKernel)( void *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, void *, void **, void ** );

  *(void **)(&_cuLaunchKernel) = dlsym(dlhandle, "cuLaunchKernel");

  (*_cuLaunchKernel)(cu_func,1 , 1, 1, 1, 1, 1, 0, NULL, args, NULL);

/* Get result to host  */

  int (*_cuMemcpyDtoH)( void *, void *, size_t );

  *(void **)(&_cuMemcpyDtoH) = dlsym(dlhandle, "cuMemcpyHtoD");

  error = (*_cuMemcpyDtoH)(&h_var,var,size);

(The rest of the code follows the same logic.) All of my functions return 0, which means everything went OK, except the last one, _cuMemcpyDtoH, which returns error 1 (= cudaErrorMissingConfiguration). Can someone explain what this error means and why it occurs? Is there a way to solve it? And why does it show up when I load the functions at runtime?

Thanks.

My system:

nvcc release 4.1

GPU : GTX 480

NVRM version: NVIDIA UNIX x86_64 Kernel Module 285.05.32

GCC version: gcc version 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4)

Solved:

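I had a typo here:
*(void **)(&_cuMemcpyDtoH) = dlsym(dlhandle, "cuMemcpyHtoD");

The correct line is:
*(void **)(&_cuMemcpyDtoH) = dlsym(dlhandle, "cuMemcpyDtoH");

With the wrong symbol name, the pointer called _cuMemcpyDtoH actually invoked cuMemcpyHtoD with the destination and source arguments swapped, which is why only that call failed.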