Cuda dll with pointer of function : trouble with division operator

Hi everyone,
I would like to create a dll with some functions in its.
In this dll a function (init) is able to allocate a table pointing on the previous functions.
The code using the dll will use the path to make the link.
I use VS2012(Express) and CUDA Toolkit 6.

Header.h

#include <cuda.h>
#include <cuda_runtime.h>
#define N 5

typedef double (*doublefunct)(double *,int,double*,int);
extern "C" _declspec(dllexport) void initdouble(doublefunct * h_f,doublefunct * d_f);

Source.cu

#include "Header.h"
__device__ double myformatfunc1(double * d_x,int dimx,double * d_Params,int dimParams)
{
	return d_x[0]/d_x[0];
}
__device__ double myformatfunc2(double * d_x,int dimx,double * d_Params,int dimParams)
{
	return d_x[0]+d_Params[1];
}

__device__ double myformatfunc3(double * d_x,int dimx,double * d_Params,int dimParams)
{
	return d_x[1]+d_Params[1];
}

__device__ doublefunct p_myformatfunc1 = myformatfunc1;
__device__ doublefunct p_myformatfunc2 = myformatfunc2;
__device__ doublefunct p_myformatfunc3 = myformatfunc3;
void initdouble(doublefunct * h_f,doublefunct * d_f)
{
  cudaMemcpyFromSymbol( &h_f[0], p_myformatfunc1, sizeof(doublefunct));
  cudaMemcpyFromSymbol( &h_f[1], p_myformatfunc1, sizeof(doublefunct));
  cudaMemcpyFromSymbol( &h_f[2], p_myformatfunc2, sizeof(doublefunct));
  cudaMemcpyFromSymbol( &h_f[3], p_myformatfunc3, sizeof(doublefunct));
  cudaMemcpyFromSymbol( &h_f[4], p_myformatfunc3, sizeof(doublefunct));

   cudaMemcpy(d_f,h_f,N*sizeof(doublefunct),cudaMemcpyHostToDevice);
}

main.cu

#include <iostream>
#include <cuda_runtime.h>
#include <windows.h>

typedef double (*myformatfptr)(double *,int,double*,int);
typedef void (*importmyformatFunction)(myformatfptr *, myformatfptr *);

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void kernelmyformat(myformatfptr * d_f,double * d_x,double * d_Params,double * d_res)
{
	for(int i =0;i<5;i++)
		d_res[i]=d_f[i](d_x,2,d_Params,6); 
}

int main()
{
	HINSTANCE hinstLib;
	hinstLib = LoadLibrary(TEXT("C:\Projects\VS2012\Dllfct\Debug\Dllfct.dll"));
	if (hinstLib == NULL) {
		printf("ERROR: unable to load DLL\n");
		return 1;
	}

	importmyformatFunction doubleinit;
	doubleinit = (importmyformatFunction) GetProcAddress(hinstLib, "initdouble");
	if (doubleinit == NULL) {
		printf("point : ERROR: unable to find DLL function\n");
		FreeLibrary(hinstLib);
		return 1;
	}
        myformatfptr
		*d_doublef=NULL,
		*h_doublef=NULL;

	double 
		*h_myformatres,
		*d_myformatres,
		*h_coords=NULL,
		*d_coords=NULL,
		*h_Params=NULL,
		*d_Params=NULL;

	h_myformatres=(double*)malloc(5*sizeof(double));
	if(h_myformatres==NULL) return 1;
	h_coords=(double*)malloc(2*sizeof(double));
	if(h_coords==NULL) return 1;
	h_Params=(double*)malloc(6*sizeof(double));
	if(h_Params==NULL) return 1;

	gpuErrchk(cudaMalloc((void**)&d_myformatres,5*sizeof(double)));
	gpuErrchk(cudaMalloc((void**)&d_coords,2*sizeof(double)));
	gpuErrchk(cudaMalloc((void**)&d_Params,6*sizeof(double)));

	h_coords[0]=4.0;
	h_coords[1]=4.0;

	h_Params[0]=60.0;
	h_Params[1]=4.0;
	h_Params[2]=4.0;
	h_Params[3]=1.0;
	h_Params[4]=1.0;
	h_Params[5]=0.0;

        gpuErrchk(cudaMemcpy(d_coords,h_coords,2*sizeof(double),cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_Params,h_Params,6*sizeof(double),cudaMemcpyHostToDevice));

	
	h_doublef=(myformatfptr *)malloc(5*sizeof(myformatfptr));
	gpuErrchk(cudaMalloc((void**)&d_doublef,5*sizeof(myformatfptr)));

	doubleinit(h_doublef,d_doublef);
	kernelmyformat<<<1,1>>>(d_doublef,d_coords,d_Params,d_myformatres);

	gpuErrchk(cudaMemcpy(h_myformatres,d_myformatres,5*sizeof(double),cudaMemcpyDeviceToHost));

	for(int i =0;i<5;i++)
		std::cout << h_myformatres[i] <<std::endl;
}

But actually the division doesn’t work :
“GPUassert: an illegal instruction was encountered …”

d_x[0]/2.0 doesn’t work neither.
When I changed the code for something like 4.0/4.0 it’s work properly.

1
1
8
8
8

Compilation parameters :
compute_30,sm_30
-rdc=true
-cudart shared

I don’t know how to fix it, I have to use some divisions inside my functions.

Thanks for reading.

if d_x[0] / d_x[0] does not work, and d_x[0] / constant (2.0) does not work either, but constant / constant works, I would seriously consider examining d_x[0] and its actual value

Perhaps first write a “normal”/ basic program, debug/ test that, and then do a minor revision that implements dll and all the extra whistles

Thank you for answering.

I tried it but when I return only d_x[0] the value is the good one, that means 4.0.

I’ll try to program a simple call of the division operator from a dll you are right.

Thanks.

Hi again,

This is i think the simpliest version I can obtain.
And I still have the same problem with the division operation.

Header.h

#include <cuda.h>
#include <cuda_runtime.h>

typedef double(*fct)(double);

extern "C" _declspec(dllexport) void init(fct *,fct *);

Source.cu

#include "Header.h"

__device__ double division(double x)
{
	return x/x;
}

__device__ fct p_division = division;

void init(fct * h_f,fct * d_f)
{
	 cudaMemcpyFromSymbol( &h_f[0], p_division, sizeof(fct));

	 cudaMemcpy(d_f,h_f,sizeof(fct),cudaMemcpyHostToDevice);
}

main.cu

#include "Header.h"
#include <iostream>
#include <windows.h>

/*type fct is inside the Header.h*/
typedef void (*importFunction)(fct *,fct *);

__global__ void kernel(fct* d_f,double * d_res)
{
	*d_res=d_f[0](3.0);
}

int main()
{
	/*  ---DLL IMPORT---    */
	importFunction init;
	HINSTANCE hinstLib;
	hinstLib = LoadLibrary(TEXT("C:\Projects\VS2012\divdll\Debug\divdll.dll"));
	if (hinstLib == NULL) {
		printf("ERROR: unable to load DLL\n");
		return 1;
	}
	init = (importFunction) GetProcAddress(hinstLib, "init");
	if (init == NULL) {
		printf("ERROR: unable to find DLL function\n");
		FreeLibrary(hinstLib);
		return 1;
	}
	fct *h_f=NULL,
		*d_f=NULL;

	/*  ---ALLOCATION---  */
	double * h_res=(double*)malloc(sizeof(double)),
		   * d_res;
	cudaMalloc((void **)&d_res,sizeof(double));
	h_f=(fct*)malloc(sizeof(fct));
	cudaMalloc((void**)&d_f,sizeof(fct));

	/*  ---FUNCTION CALL---  */
	init(h_f,d_f);
	kernel<<<1,1>>>(d_f,d_res);

	/*  ---COPY & PRINT RESULT---  */
	cudaMemcpy(h_res,d_res,sizeof(double),cudaMemcpyDeviceToHost);
	std::cout<< *h_res << std::endl;

	/*  ---DEALLOCATION---  */
	free(h_f);
	cudaFree(d_f);
	cudaFree(d_res);
	return 0;
}

When the operation inside the function division is x/x it return -6.27744e+066 which is strange.
And when I change for x+x its return is 6, the good result.
same for x*x its return is 9.

Thanks.

at the very start of your kernel (global void kernel(fct* d_f,double * d_res))

try this first:

double d1;

d1 = 4.5 / 3.5; // or whatever values

d_res[0] = 4.5;
d1 = d_res[0];
d1 = d1 / d1;

(use a break-point just after the last instruction, and mouse-over to get the value)

Ok, thanks.
This is weird, the break-point + mouse-over didn’t give me any value so I change the code like that first:

double d1;

	d1 = 4.5 / 3.5; // or whatever values

	d_res[0] = 4.5;
	d1 = d_res[0];
	d1 = d1 / d1;

	*d_res=d1;

The return is 1 which is normal so I tried this second :

double d1;

	d1 = 4.5 / 3.5; // or whatever values

	d_res[0] = 4.5;
	d1 = d_res[0];
	d1 = d1 / d1;

	*d_res=d1*d_f[0](3.0);

And guess what, the result is 1.

To be sure that the call of d_f[0] is done properly I change my function division with :

return x/x/2;

And the return is 0.5. So correct !

In a last step the shortest way I found for my kernel is this one :

double d1=1.0;
	d1=d1/d1;

	*d_res=d1*d_f[0](3.0);

without the line d1=d1/d1; the result is wrong.

So first thank you.
But I really don’t understand this situation.

Do not quote me on this, but:

doubles are 64 bit, or 8 bytes

threads normally take 2 steps to move anything over 32 bits - 4 bytes

When you use - double d - you “freeze” the value in a local variable first; without it, I guess it is possible that the return only pushes the last 4 bytes, instead of the entire 8 bytes