Runtime API to Driver API: translation problems?

Hello!

I need to use some CUDA code that was developed in C using the CUDA runtime API.

My own code is in C++ and my environment is Eclipse CDT.
So I built a cubin with nvcc, I use cuModuleLoad to load the CUDA functions in my code, and I converted the code from the runtime API to the driver API.
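
For reference, the loading part boils down to something like this (just a sketch; myKernel is a placeholder, the real function names are in the code further down):

	// compiled once beforehand with: nvcc -cubin func.cu -o func.cubin
	CUdevice dev;
	CUcontext ctx;
	CUmodule mod;
	CUfunction fn;
	CU_SAFE_CALL(cuInit(0));
	CU_SAFE_CALL(cuDeviceGet(&dev, 0));              // first CUDA device
	CU_SAFE_CALL(cuCtxCreate(&ctx, 0, dev));         // create and push a context
	CU_SAFE_CALL(cuModuleLoad(&mod, "func.cubin"));  // load the cubin built by nvcc
	CU_SAFE_CALL(cuModuleGetFunction(&fn, mod, "myKernel"));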

The first version of the code didn't use textures and there was no problem.

But now that the code uses textures, I don't get the same results between the C code using the runtime API and the C++ code using the driver API!
Worse, if I run my C++ code several times, the results can differ (but shouldn't). >.<

Has anybody experienced this kind of problem?

If it helps, I can provide the two functions I'm talking about.

Thanks for your help ! :)

I haven't had such problems, but when using textures with the driver API make sure to:

  1. Initialize: cuModuleGetTexRef(), cuTexRefSetFormat(), cuTexRefSetFlags()

  2. cuParamSetTexRef() before every grid launch.

If you're doing so and you still have problems, please post the code here.
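
Roughly, the setup I'm describing looks like this (just a sketch; texA, the module, the array and the kernel handle stand for whatever you actually use):

	// 1. One-time texture reference setup (module and array already created)
	CUtexref texref;
	CU_SAFE_CALL(cuModuleGetTexRef(&texref, module, "texA"));
	CU_SAFE_CALL(cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1));
	CU_SAFE_CALL(cuTexRefSetFlags(texref, 0));
	CU_SAFE_CALL(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
	CU_SAFE_CALL(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP));
	CU_SAFE_CALL(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP));
	CU_SAFE_CALL(cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT));

	// 2. Before EVERY launch of the kernel that reads the texture
	CU_SAFE_CALL(cuParamSetTexRef(kernel, CU_PARAM_TR_DEFAULT, texref));
	CU_SAFE_CALL(cuLaunchGrid(kernel, grid_x, grid_y));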

I did.

I did but only for the function using the texture.

I tried to set it for all of them but it doesn't change anything.

Here are the two versions of the function:

	//////// My version in C++ + driver API

	I've got an object called CudaLoader (a singleton). I init my context in the ctor.

	

	//// initialization in ctor
 CUT_DEVICE_INIT_DRV(_cuDevice, sArgc, sArgv);

  

    	CUresult status = cuCtxCreate( &_cuContext, 0, _cuDevice);

    	int major, minor;

    	cuDeviceComputeCapability(&major, &minor, _cuDevice);

    	

    	status = cuModuleLoad(&_cuModule,"func.cubin");      

    	(...)

    	_cuComputeDistanceGlobal = new CUfunction;

    	status = cuModuleGetFunction(_cuComputeDistanceGlobal, _cuModule,

      	"cuComputeDistanceGlobal");    	

    	(...)

    	_cuComputeDistanceTexture = new CUfunction;

    	status = cuModuleGetFunction(_cuComputeDistanceTexture, _cuModule,

      	"cuComputeDistanceTexture");

    	(...)

    	_cuInsertionSort = new CUfunction;

    	status = cuModuleGetFunction(_cuInsertionSort, _cuModule,

      	"cuInsertionSort");

    	(...)

  

    	_cuParallelSqrt = new CUfunction;

    	status = cuModuleGetFunction(_cuParallelSqrt, _cuModule,

      	"cuParallelSqrt");

    
	////////////////////////////////////
 

  float* CudaLoader::implDriverFunc(float * ref_host, int ref_width,

    float * query_host, int query_width, int dim, int k,

    CUfunction *cuComputeDistanceGlobal,

    CUfunction *cuComputeDistanceTexture,

    CUfunction *cuInsertionSort, CUfunction *cuParallelSqrt)

  {

  	// Device array

  	CUdeviceptr query_dev;

  	CUdeviceptr ref_dev;

  	CUdeviceptr dist_dev;

  	CUarray ref_array;

 	// Variables

 	CUresult res;

 	size_t query_pitch;

  	size_t query_pitch_in_bytes;

  	size_t ref_pitch;

  	size_t ref_pitch_in_bytes;

  	size_t max_nb_query_traited;

  	size_t actual_nb_query_width;

  	unsigned int memory_total;

  	unsigned int memory_free;

  	unsigned int use_texture = (ref_width*sizeof(float)

    	<=MAX_TEXTURE_WIDTH_IN_BYTES && dim*sizeof(float)

    	<=MAX_TEXTURE_HEIGHT_IN_BYTES);

 	float *output_host = new float[query_width];

  	// Allocation CUDA memory

  	int elementSizeBytes = 4; // element size in bytes (sizeof(float)) --> 4 ??

  	cuMemGetInfo(&memory_free, &memory_total);

  	max_nb_query_traited = (size_t)(memory_free

    	* MAX_PART_OF_FREE_MEMORY_USED - sizeof(float) * ref_width

    	*dim ) / (sizeof(float) * (dim + ref_width) );

  	max_nb_query_traited = min((unsigned int)query_width,

    	(max_nb_query_traited / 16) * 16);

 	res = CU_SAFE_CALL(cuMemAllocPitch( &query_dev, &query_pitch_in_bytes, max_nb_query_traited*sizeof(float), (dim+ref_width), elementSizeBytes));

  	(...)

 	query_pitch = query_pitch_in_bytes/sizeof(float);

  	dist_dev = query_dev + dim * query_pitch;

  	

  	if (use_texture)

  	{

    // Allocation of texture memory for reference points

   CUDA_ARRAY_DESCRIPTOR desc;

    desc.Format = CU_AD_FORMAT_FLOAT;

    desc.Width = ref_width;

    desc.Height = dim;

    desc.NumChannels = 1; // must be the same as in cuTexRefSetFormat

    res = cuArrayCreate(&ref_array, &desc);

   (...)

   // cudaMemcpyToArray( ref_array, 0, 0, ref_host, ref_width*height*sof,cudaMemcpyHostToDevice );

    CUDA_MEMCPY2D memcpy2Dparams;

    memset(&memcpy2Dparams, 0, sizeof(memcpy2Dparams));

    memcpy2Dparams.srcMemoryType = CU_MEMORYTYPE_HOST;

    memcpy2Dparams.dstMemoryType = CU_MEMORYTYPE_ARRAY;

    memcpy2Dparams.srcHost = ref_host;

    memcpy2Dparams.srcPitch = ref_width*sizeof(float);

    memcpy2Dparams.dstArray = ref_array;

    memcpy2Dparams.WidthInBytes = memcpy2Dparams.srcPitch;

    memcpy2Dparams.Height = dim;

    AlertAndKillIfCudaError(cuMemcpy2D(&memcpy2Dparams));

   CU_SAFE_CALL(cuModuleGetTexRef(&_tex, _cuModule, "texA"));

    CU_SAFE_CALL(cuTexRefSetArray(_tex, ref_array, CU_TRSA_OVERRIDE_FORMAT));

    CU_SAFE_CALL(cuTexRefSetAddressMode(_tex, 0, CU_TR_ADDRESS_MODE_CLAMP));

    CU_SAFE_CALL(cuTexRefSetAddressMode(_tex, 1, CU_TR_ADDRESS_MODE_CLAMP));

    CU_SAFE_CALL(cuTexRefSetFilterMode(_tex, CU_TR_FILTER_MODE_POINT));

    CU_SAFE_CALL(cuTexRefSetFlags(_tex, 0));

    CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, 1));

 	}

  	else

  	{

    //res = cudaMallocPitch( (void **) &ref_dev, &ref_pitch_in_bytes,ref_width * size_of_float, height);

    res = CU_SAFE_CALL(cuMemAllocPitch( &ref_dev, &ref_pitch_in_bytes, ref_width*sizeof(float), dim, elementSizeBytes));

    (...)

    ref_pitch = ref_pitch_in_bytes/sizeof(float);

   //cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref_host, ref_width*size_of_float, ref_width*size_of_float, height,cudaMemcpyHostToDevice);  

    CUDA_MEMCPY2D memcpy2Dparams2;

    memset(&memcpy2Dparams2, 0, sizeof(memcpy2Dparams2));

    memcpy2Dparams2.srcMemoryType = CU_MEMORYTYPE_HOST;

    memcpy2Dparams2.dstMemoryType = CU_MEMORYTYPE_DEVICE;

    memcpy2Dparams2.srcHost = ref_host;

    memcpy2Dparams2.srcPitch = ref_width*sizeof(float);

    memcpy2Dparams2.dstDevice = ref_dev;

    memcpy2Dparams2.dstPitch = ref_pitch_in_bytes;

    memcpy2Dparams2.WidthInBytes = ref_width*sizeof(float);

    memcpy2Dparams2.Height = dim;

    res = CU_SAFE_CALL(cuMemcpy2D(&memcpy2Dparams2));

   (...)

  	}

  	////

 	// Split queries to fit on GPU memory

  	for (int i=0; i<query_width; i+=max_nb_query_traited)

  	{

   actual_nb_query_width = min((int)max_nb_query_traited,

      query_width-i);

   // Copy of part of query actually being treated

    //cudaMemcpy2D(query_dev, query_pitch_in_bytes, &query_host[i],query_width*size_of_float, actual_nb_query_width*size_of_float,height, cudaMemcpyHostToDevice);

   CUDA_MEMCPY2D memcpy2Dparams2;

    memset(&memcpy2Dparams2, 0, sizeof(memcpy2Dparams2));

    memcpy2Dparams2.srcMemoryType = CU_MEMORYTYPE_HOST;

    memcpy2Dparams2.dstMemoryType = CU_MEMORYTYPE_DEVICE;

    memcpy2Dparams2.srcHost = &query_host[i];

    memcpy2Dparams2.srcPitch = query_width*sizeof(float);

    memcpy2Dparams2.dstDevice = query_dev;

    memcpy2Dparams2.dstPitch = query_pitch_in_bytes;

    memcpy2Dparams2.WidthInBytes = actual_nb_query_width

      *sizeof(float);

    memcpy2Dparams2.Height = dim;

    res = CU_SAFE_CALL(cuMemcpy2D(&memcpy2Dparams2));

   (...)

    

    // GRIDS AND THREADS

    //dim3 g_16x16(actual_nb_query_width/16, ref_width/16, 1);

    unsigned int g_16x16_x = actual_nb_query_width/16;

    unsigned int g_16x16_y = ref_width/16;

   //dim3 t_16x16(16, 16, 1);

    unsigned int t_16x16_x = 16;

    unsigned int t_16x16_y = 16;

    unsigned int t_16x16_z = 1;

   if (actual_nb_query_width%16 != 0)

    	g_16x16_x += 1;

    if (ref_width %16 != 0)

    	g_16x16_y += 1;

   //dim3 g_256x1(actual_nb_query_width/256, 1, 1);

    unsigned int g_256x1_x = actual_nb_query_width/256;

    unsigned int g_256x1_y = 1;

   //dim3 t_256x1(256, 1, 1);

    unsigned int t_256x1_x = 256;

    unsigned int t_256x1_y = 1;

    unsigned int t_256x1_z = 1;

   if (actual_nb_query_width%256 != 0)

    	g_256x1_x += 1;

   // KERNEL 1 : Compute all the distances

   if (use_texture)

    {

    	

    	//cuComputeDistanceTexture<<<g_16x16,t_16x16>>>(ref_width, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);

    	int offset = 0;

    	AlertAndKillIfCudaError(cuFuncSetBlockShape(

      	*cuComputeDistanceTexture, t_16x16_x, t_16x16_y,

      	t_16x16_z));

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceTexture, offset, ref_width));

    	offset += sizeof(ref_width);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceTexture, offset, query_dev));

    	offset += sizeof(query_dev);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceTexture, offset,

      	actual_nb_query_width));

    	offset += sizeof(actual_nb_query_width);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceTexture, offset, query_pitch /* /sizeof(float)? */)); // not sure about this one

    	offset += sizeof(query_pitch);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceTexture, offset, dim));

    	offset += sizeof(dim);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceTexture, offset, dist_dev));

    	offset += sizeof(dist_dev);

   	AlertAndKillIfCudaError(cuParamSetSize(

      	*cuComputeDistanceTexture, offset));

   	CU_SAFE_CALL(cuParamSetTexRef(*cuComputeDistanceTexture, CU_PARAM_TR_DEFAULT, _tex));

   	CU_SAFE_CALL( cuCtxSynchronize() );

    	AlertAndKillIfCudaError(cuLaunchGrid(

      	*cuComputeDistanceTexture, g_16x16_x, g_16x16_y));

    	CU_SAFE_CALL( cuCtxSynchronize() );

    }

    else

    {

    	cout << "no texture" << endl;

    	//cuComputeDistanceGlobal<<<g_16x16,t_16x16>>>(ref_dev, ref_width, ref_pitch, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);

    	int offset = 0;

    	AlertAndKillIfCudaError(cuFuncSetBlockShape(

      	*cuComputeDistanceGlobal, t_16x16_x, t_16x16_y,

      	t_16x16_z));

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, ref_dev));

    	offset += sizeof(ref_dev);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, ref_width));

    	offset += sizeof(ref_width);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, ref_pitch));

    	offset += sizeof(ref_pitch);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, query_dev));

    	offset += sizeof(query_dev);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset,

      	actual_nb_query_width));

    	offset += sizeof(actual_nb_query_width);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, query_pitch));

    	offset += sizeof(query_pitch);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, dim));

    	offset += sizeof(dim);

    	AlertAndKillIfCudaError(cuParamSeti(

      	*cuComputeDistanceGlobal, offset, dist_dev));

    	offset += sizeof(dist_dev);

   	AlertAndKillIfCudaError(cuParamSetSize(

      	*cuComputeDistanceGlobal, offset));

    	AlertAndKillIfCudaError(cuLaunchGrid(

      	*cuComputeDistanceGlobal, g_16x16_x, g_16x16_y));

    }

   // KERNEL 2 : Sort each column

    //cuInsertionSort<<<g_256x1,t_256x1>>>(dist_dev, actual_nb_query_width, query_pitch, ref_width, k);

   int offset = 0;

    AlertAndKillIfCudaError(cuFuncSetBlockShape(*cuInsertionSort,

      t_256x1_x, t_256x1_y, t_256x1_z));

    AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,

      dist_dev));

    offset += sizeof(dist_dev);

    AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,

      actual_nb_query_width));

    offset += sizeof(actual_nb_query_width);

    AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,

      query_pitch/sizeof(float)));

    offset += sizeof(query_pitch);

    AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset,

      ref_width));

    offset += sizeof(ref_width);

    AlertAndKillIfCudaError(cuParamSeti(*cuInsertionSort, offset, k));

    offset += sizeof(k);

   AlertAndKillIfCudaError(cuParamSetSize(*cuInsertionSort, offset));

    AlertAndKillIfCudaError(cuLaunchGrid(*cuInsertionSort,

      g_256x1_x, g_256x1_y));

   // KERNEL 3 : Compute square root of k-th element

    //cuParallelSqrt<<<g_256x1,t_256x1>>>(dist_dev+(k-1)*query_pitch, query_width);

    offset = 0;

    AlertAndKillIfCudaError(cuFuncSetBlockShape(*cuParallelSqrt,

      t_256x1_x, t_256x1_y, t_256x1_z));

    AlertAndKillIfCudaError(cuParamSeti(*cuParallelSqrt, offset,

      dist_dev +(k-1)*query_pitch));

    offset += sizeof(dist_dev);

    AlertAndKillIfCudaError(cuParamSeti(*cuParallelSqrt, offset,

      query_width));

    offset += sizeof(query_width);

   AlertAndKillIfCudaError(cuParamSetSize(*cuParallelSqrt, offset));

    AlertAndKillIfCudaError(cuLaunchGrid(*cuParallelSqrt,

      g_256x1_x, g_256x1_y));

   // Memory copy of output from device to host

    //cudaMemcpy2D(&output_host[i], query_width*size_of_float, dist_dev+(k-1)*query_pitch, query_pitch_in_bytes, actual_nb_query_width*size_of_float, 1, cudaMemcpyDeviceToHost);

    CUDA_MEMCPY2D memcpy2Dparams3;

    memset(&memcpy2Dparams3, 0, sizeof(memcpy2Dparams3));

    memcpy2Dparams3.srcMemoryType = CU_MEMORYTYPE_DEVICE;

    memcpy2Dparams3.dstMemoryType = CU_MEMORYTYPE_HOST;

    memcpy2Dparams3.srcDevice = dist_dev+(k-1)*query_pitch;

    memcpy2Dparams3.srcPitch = query_pitch_in_bytes; //// problem here

    memcpy2Dparams3.dstHost = &output_host[i];

    memcpy2Dparams3.dstPitch = query_width*sizeof(float);

    memcpy2Dparams3.WidthInBytes = actual_nb_query_width

      *sizeof(float);

    memcpy2Dparams3.Height = 1;

    res = CU_SAFE_CALL(cuMemcpy2D(&memcpy2Dparams3));

   (...)

  	}

 	// FREE MEMORY

  	if (use_texture)

    cuArrayDestroy(ref_array);

  	else

    cuMemFree(ref_dev);

  	cuMemFree(query_dev);

 	////

  	return output_host;

  }

//////// version in C + Runtime API

void implRuntimeFunc(float* ref_host, int ref_width, float* query_host, int query_width, int height, int k, float* output_host){

   unsigned int size_of_float = sizeof(float);

   // Variables

    float        *query_dev;

    float        *ref_dev;

    float        *dist_dev;

    cudaArray    *ref_array;

    cudaError_t  result;

    size_t       query_pitch;

    size_t      query_pitch_in_bytes;

    size_t       ref_pitch;

    size_t       ref_pitch_in_bytes;

    size_t       max_nb_query_traited;

    size_t       actual_nb_query_width;

    unsigned int memory_total;

    unsigned int memory_free;

    unsigned int use_texture = (ref_width*size_of_float<=MAX_TEXTURE_WIDTH_IN_BYTES && height*size_of_float<=MAX_TEXTURE_HEIGHT_IN_BYTES);

   // CUDA Initialisation and get free memory (80% of free memory used)

    CUdevice  cuDevice=0;

    CUcontext cuContext;

    cuInit(0);

    cuCtxCreate(&cuContext, 0, cuDevice);

    cuMemGetInfo(&memory_free,&memory_total);

   // Determine maximum number of query that can be treated

    max_nb_query_traited = ( memory_free * MAX_PART_OF_FREE_MEMORY_USED - size_of_float * ref_width*height ) / ( size_of_float * (height + ref_width) );

    max_nb_query_traited = min( query_width, (max_nb_query_traited / 16) * 16 );

   // Allocation of global memory for query points and for distances

    result = cudaMallocPitch( (void **) &query_dev, &query_pitch_in_bytes, max_nb_query_traited * size_of_float, height + ref_width);

    (...)

    query_pitch = query_pitch_in_bytes/size_of_float;

    dist_dev    = query_dev + height * query_pitch;

   if (use_texture){

        // Allocation of texture memory for reference points

        cudaChannelFormatDesc channelDescA = cudaCreateChannelDesc<float>();

        result = cudaMallocArray( &ref_array, &channelDescA, ref_width, height );

       (...)

        cudaMemcpyToArray( ref_array, 0, 0, ref_host, ref_width * height * size_of_float, cudaMemcpyHostToDevice );

       // Set texture parameters and bind texture to array

        texA.addressMode[0] = cudaAddressModeClamp;

        texA.addressMode[1] = cudaAddressModeClamp;

        texA.filterMode     = cudaFilterModePoint;

        texA.normalized     = 0;

        cudaBindTextureToArray(texA, ref_array);

    }

    else

    {

        result = cudaMallocPitch( (void **) &ref_dev, &ref_pitch_in_bytes, ref_width * size_of_float, height);

       (...)

        ref_pitch = ref_pitch_in_bytes/size_of_float;

        cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref_host, ref_width*size_of_float,  ref_width*size_of_float, height, cudaMemcpyHostToDevice);

    }

   // Split queries to fit on GPU memory

    for (int i=0;i<query_width;i+=max_nb_query_traited){

       actual_nb_query_width = min( max_nb_query_traited, query_width-i );

       // Copy of part of query actually being treated

        cudaMemcpy2D(query_dev, query_pitch_in_bytes, &query_host[i], query_width*size_of_float, actual_nb_query_width*size_of_float, height, cudaMemcpyHostToDevice);

   // GRIDS AND THREADS

        dim3 g_16x16(actual_nb_query_width/16, ref_width/16, 1);

        dim3 t_16x16(16, 16, 1);

        if (actual_nb_query_width%16 != 0) g_16x16.x += 1;

        if (ref_width  %16 != 0) g_16x16.y += 1;

        //

        dim3 g_256x1(actual_nb_query_width/256, 1, 1);

        dim3 t_256x1(256, 1, 1);

        if (actual_nb_query_width%256 != 0) g_256x1.x += 1;

       // KERNEL 1 : Compute all the distances

        if (use_texture)

            cuComputeDistanceTexture<<<g_16x16,t_16x16>>>(ref_width, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);

        else

            cuComputeDistanceGlobal<<<g_16x16,t_16x16>>>(ref_dev, ref_width, ref_pitch, query_dev, actual_nb_query_width, query_pitch, height, dist_dev);

       // KERNEL 2 : Sort each column

        cuInsertionSort<<<g_256x1,t_256x1>>>(dist_dev, actual_nb_query_width, query_pitch, ref_width, k);

       // KERNEL 3 : Compute square root of k-th element

        cuParallelSqrt<<<g_256x1,t_256x1>>>(dist_dev+(k-1)*query_pitch, query_width);

       // Memory copy of output from device to host

        cudaMemcpy2D(&output_host[i], query_width*size_of_float, dist_dev+(k-1)*query_pitch, query_pitch_in_bytes, actual_nb_query_width*size_of_float, 1, cudaMemcpyDeviceToHost);

    }

   // FREE MEMORY

    if (use_texture)

        cudaFreeArray(ref_array);

    else

        cudaFree(ref_dev);

    cudaFree(query_dev);

}

I guess it must be an initialization problem, but I can't find why…

Thanks for your help :)

Hello again!

I resolved the initialization problem by rewriting my function from scratch.

But I still have different results between the runtime API code and the driver API one.
I don’t understand why…

Has no one had this kind of problem?

Thanks !

Don’t know what your bug is, but you can use C++ with the Runtime API just fine. (Especially if you only use it in host code. But it works in device code the majority of the time too.)

Thanks for your answer ;)

Well, the first version of my code didn't use CUDA.

I developed it in Eclipse CDT and compiled it with MinGW gcc. I also use Boost and some other libraries.

When I started to integrate the CUDA code into my program, dynamic loading (driver API) seemed to be the easiest way, and it worked fine for the first version (not using textures).

So I don’t know how to use this runtime code without using nvcc for compilation.

The SDK sample codes only show the runtime API with a .cu file compiled by nvcc, and the driver API with a .cpp file compiled by Visual C++.

Do you know if, in my case, it's really possible to use the runtime API, and whether there's a sample somewhere to help me?

Thanks !

You're right, to use the runtime API you need to compile everything with nvcc.

However, nvcc should be able to compile your whole program, as long as you don’t use gcc extensions with a windows version of nvcc. (Linux nvcc uses gcc on the back end, but Windows nvcc relies on vc++.)


Too bad for me, I'm actually using gcc extensions with a Windows version of nvcc ;(

I presume there's still no way to tell nvcc to use gcc on Windows (I tried with CUDA 1.0 a few months ago)?

Not officially, no. But I was re-reading the nvcc docs and it looks like there's a new option "-dryrun" that prints the commands that nvcc actually runs (pp. 17-19 of nvcc_2.0.pdf). ("nvcc" is not a real compiler, just a sort of script that calls other compilers.) You could do this on a Linux box with gcc and re-trace the steps on Windows.

Thanks again for your reply ;)

I tried to link a .lib into my code, but sooner or later I was blocked by the fact that my lib is compiled with cl and my code with gcc.

I’m gonna try to retrace the steps using dryrun or verbose options.

But I keep asking myself why my driver API code is wrong.

So I've put a simpler version of the code below; if anybody has time to read it and sees something strange, let me know ;)

The problem appears when ref = query (and query_width = ref_width).

float* func(float * ref, int ref_width, float * query,

				int query_width, int height, int k,

				CUfunction *cuComputeDistance, CUfunction *insertionSort,

				CUfunction *cuParallelSqrt)

		{

			// Device array

			CUdeviceptr query_dev;

			CUdeviceptr dist_dev;

			CUdeviceptr output_dev;

			CUarray ref_array;

			int sof = sizeof(float);

			float *output_host = new float[query_width];

			// Allocation CUDA memory

			int elementSizeBytes =4;

			size_t pitch;

			unsigned int memory_total;

			unsigned int memory_free;

			CU_SAFE_CALL( cuMemGetInfo(&memory_free,&memory_total));

			//cudaMallocPitch( (void **) &query_dev, &pitch,query_width*sof, (height+ref_width+1));

			CU_SAFE_CALL( cuMemAllocPitch(&query_dev, &pitch, query_width*sof, (height+ref_width+1), elementSizeBytes));

			dist_dev = query_dev + height * pitch/sof;

			output_dev = dist_dev + ref_width * pitch/sof;

			//cudaMemcpy2D(query_dev, pitch, query_host, query_width*sof,

			//query_width*sof, height, cudaMemcpyHostToDevice);

			CUDA_MEMCPY2D copyParam1;

			memset(&copyParam1, 0, sizeof(copyParam1));

			copyParam1.dstMemoryType = CU_MEMORYTYPE_DEVICE;

			copyParam1.dstDevice = query_dev;

			copyParam1.dstPitch = pitch;

			copyParam1.srcMemoryType = CU_MEMORYTYPE_HOST;

			copyParam1.srcHost = query;

			copyParam1.srcPitch = query_width*sof;

			copyParam1.WidthInBytes = copyParam1.srcPitch;

			copyParam1.Height = height;

			CU_SAFE_CALL(cuMemcpy2D(&copyParam1));

			//cudaChannelFormatDesc channelDescA = cudaCreateChannelDesc(32, 0, 0,

			//0, cudaChannelFormatKindFloat);

			CUDA_ARRAY_DESCRIPTOR desc;

			desc.Format = CU_AD_FORMAT_FLOAT;

			desc.NumChannels = 1;

			desc.Width = ref_width;

			desc.Height = height;

			CU_SAFE_CALL( cuArrayCreate( &ref_array, &desc ));

			// cudaMemcpyToArray( ref_array, 0, 0, ref_host, ref_width*height*sof,

			//cudaMemcpyHostToDevice );

			CUDA_MEMCPY2D copyParam;

			memset(&copyParam, 0, sizeof(copyParam));

			copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;

			copyParam.dstArray = ref_array;

			copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;

			copyParam.srcHost = ref;

			copyParam.srcPitch = ref_width * sizeof(float); // or ref_width * height * sizeof(float)?

			copyParam.WidthInBytes = copyParam.srcPitch;

			copyParam.Height = height; // or 1?

			CU_SAFE_CALL(cuMemcpy2D(&copyParam));

			/// texA.addressMode[0] = cudaAddressModeWrap;

			//   texA.addressMode[1] = cudaAddressModeWrap;

			//   texA.filterMode = cudaFilterModePoint;

			//   texA.normalized = false;

			// Bind the arrays to the textures

			//   cudaBindTextureToArray( texA, ref_array, channelDescA );

			CUtexref cu_texref;

			CU_SAFE_CALL(cuModuleGetTexRef(&cu_texref, _cuModule, "texA"));

			CU_SAFE_CALL(cuTexRefSetArray(cu_texref, ref_array, CU_TRSA_OVERRIDE_FORMAT));

			CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP));

			CU_SAFE_CALL(cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP));

			CU_SAFE_CALL(cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_POINT));

			CU_SAFE_CALL(cuTexRefSetFlags(cu_texref, 0 /*CU_TRSF_NORMALIZED_COORDINATES*/));

			CU_SAFE_CALL(cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, 1));

			int g_16x16[3] =

			{ query_width/BLOCK_DIM, ref_width/BLOCK_DIM, 1 };

			if (query_width%BLOCK_DIM !=0)

				g_16x16[0]+=1;

			if (ref_width%BLOCK_DIM !=0)

				g_16x16[1]+=1;

			int t_16x16[3] =

			{ BLOCK_DIM, BLOCK_DIM, 1 };

			int grid_256x1[3] =

			{ query_width/(BLOCK_DIM*BLOCK_DIM), 1, 1 };

			int t_256x1[3] =

			{ BLOCK_DIM*BLOCK_DIM, 1, 1 };

			if (query_width%(BLOCK_DIM*BLOCK_DIM) !=0)

				grid_256x1[0]+=1;

			//			cuComputeDistance_tex<<<grid_16x16,threads_16x16>>>(ref_width,

			//			query_dev, query_width, pitch/sof, height, dist_dev);

			CU_SAFE_CALL(cuFuncSetBlockShape( *_cuComputeDistance_tex, t_16x16[0], t_16x16[1], t_16x16[2] ));

			int offset = 0;

			CU_SAFE_CALL(cuParamSeti(*_cuComputeDistance_tex, offset, ref_width));

			offset += sizeof(ref_width);

			CU_SAFE_CALL(cuParamSeti(*_cuComputeDistance_tex, offset, query_dev));

			offset += sizeof(query_dev);

			CU_SAFE_CALL(cuParamSeti(*_cuComputeDistance_tex, offset, query_width));

			offset += sizeof(query_width);

			CU_SAFE_CALL(cuParamSeti(*_cuComputeDistance_tex, offset, pitch/sof));

			offset += sizeof(pitch/sof);

			CU_SAFE_CALL(cuParamSeti(*_cuComputeDistance_tex, offset, height));

			offset += sizeof(height);

			CU_SAFE_CALL(cuParamSeti(*_cuComputeDistance_tex, offset, dist_dev));

			offset += sizeof(dist_dev);

			CU_SAFE_CALL(cuParamSetSize(*_cuComputeDistance_tex, offset));

			CU_SAFE_CALL(cuParamSetTexRef(*_cuComputeDistance_tex, CU_PARAM_TR_DEFAULT, cu_texref));

			// warmup

			CU_SAFE_CALL(cuLaunchGrid( *_cuComputeDistance_tex, g_16x16[0], g_16x16[1] ));

			//			  cuInsertionSort<<<grid_256x1,threads_256x1>>>(dist_dev, query_width,

			//			pitch/sof, ref_width, k);

			CU_SAFE_CALL(cuFuncSetBlockShape( *_cuInsertionSort, t_256x1[0], t_256x1[1], t_256x1[2] ));

			offset = 0;

			CU_SAFE_CALL(cuParamSeti(*_cuInsertionSort, offset, dist_dev));

			offset += sizeof(dist_dev);

			CU_SAFE_CALL(cuParamSeti(*_cuInsertionSort, offset, query_width));

			offset += sizeof(query_width);

			CU_SAFE_CALL(cuParamSeti(*_cuInsertionSort, offset, pitch/sof));

			offset += sizeof(pitch/sof);

			CU_SAFE_CALL(cuParamSeti(*_cuInsertionSort, offset, ref_width));

			offset += sizeof(ref_width);

			CU_SAFE_CALL(cuParamSeti(*_cuInsertionSort, offset, k));

			offset += sizeof(k);

			CU_SAFE_CALL(cuParamSetSize(*_cuInsertionSort, offset));

			//					CU_SAFE_CALL(cuParamSetTexRef(*_cuComputeDistance_tex, CU_PARAM_TR_DEFAULT, cu_texref));					

			//			   cuParallelSqrt<<<grid_256x1,threads_256x1>>>(dist_dev, query_width,

			//			pitch/sof, k-1, output_dev);

			CU_SAFE_CALL(cuFuncSetBlockShape( *_cuParallelSqrt, t_256x1[0], t_256x1[1], t_256x1[2] ));

			offset = 0;

			CU_SAFE_CALL(cuParamSeti(*_cuParallelSqrt, offset, dist_dev));

			offset += sizeof(dist_dev);

			CU_SAFE_CALL(cuParamSeti(*_cuParallelSqrt, offset, query_width));

			offset += sizeof(query_width);

			CU_SAFE_CALL(cuParamSeti(*_cuParallelSqrt, offset, pitch/sof));

			offset += sizeof(pitch/sof);

			CU_SAFE_CALL(cuParamSeti(*_cuParallelSqrt, offset, k-1));

			offset += sizeof(k-1);

			CU_SAFE_CALL(cuParamSeti(*_cuParallelSqrt, offset, output_dev));

			offset += sizeof(output_dev);

			CU_SAFE_CALL(cuParamSetSize(*_cuParallelSqrt, offset));

			//					CU_SAFE_CALL(cuParamSetTexRef(*_cuComputeDistance_tex, CU_PARAM_TR_DEFAULT, cu_texref));					

			// warmup

			CU_SAFE_CALL(cuLaunchGrid( *_cuParallelSqrt, grid_256x1[0], grid_256x1[1] ));

			//			   cudaMemcpy2D(output_host, query_width*sof, output_dev, pitch,

			//			query_width*sof, 1, cudaMemcpyDeviceToHost);

			CUDA_MEMCPY2D copyParam2;

			memset(&copyParam2, 0, sizeof(copyParam2));

			copyParam2.dstMemoryType = CU_MEMORYTYPE_HOST;

			copyParam2.dstHost = output_host;

			copyParam2.dstPitch = query_width*sof;

			copyParam2.srcMemoryType = CU_MEMORYTYPE_DEVICE;

			copyParam2.srcDevice = output_dev;

			copyParam2.srcPitch = pitch;

			copyParam2.WidthInBytes= query_width*sof;

			copyParam2.Height = 1;

			CU_SAFE_CALL(cuMemcpy2D(&copyParam2));

			CU_SAFE_CALL(cuArrayDestroy(ref_array));

			cuMemFree(query_dev);

			return output_host;

		}

Just to end this thread: I tried the dry-run trick but quickly ran into an inextricable cudafe crash.
In the meantime, I found a workaround to link the runtime code into my project.

See my post.

I still don't know what's happening in my driver code, but now I can move on ;p