Hello,
I want to build a DLL that can be called from LabVIEW. The first code block is the C file that is compiled as a DLL (with VS 2008) and then called from LabVIEW; it does not do much except import the DLL built from the .cu file, which contains the CUDA code, and call a function in it.
The second code block is the .cu file (compiled with nvcc). First, the caller at the bottom is called from the C-file DLL; it in turn calls the function that contains a few CUDA functions and the kernel launch. Both files compile without an error message.
To test it, I filled one array with ones and then called the kernel, which should fill it with another value. Unfortunately, the result is still an array filled with ones, so the kernel call or the memcpy functions seem to have a problem that I am not able to find.
I hope someone can see my mistake and perhaps tell me whether my way of creating a DLL that can be executed from LabVIEW is too complicated, and which steps I could save.
I use Visual Studio 2008, CUDA 3.0 on a Win7 64bit system.
This is the .c file that will be compiled as a DLL and called via LabVIEW:
[codebox]#include <stdio.h>
#include <math.h>
#include <extcode.h>
#include <cuda_runtime_api.h>
//define block size should be at least # of MPUs (GTX285:30MPUs)
#define BLOCK_SIZE 64
// Entry point exported from this DLL for LabVIEW.
// The last parameter is a pointer to (at least) two int32_t values whose
// product is the element count; this matches the definition below.
__declspec(dllexport)
int32_t OxyDeoxyCu(float *RRef, float *GRef, float *BRef, float *R,
float *G, float *B, float *oxy, float *deoxy, float *pinv, int32_t *N);
// Imported from the CUDA DLL built from the .cu file.
// The last parameter is the element count passed BY VALUE: the .cu
// definition takes `int n`, and the call site below passes an int.
__declspec(dllimport)
void OxyDeoxy_caller(float*, float*, float*, float*,
float*, float*, float*, float*, float*, int);
// DLL entry point called from LabVIEW.
//
// Computes the element count as N[0]*N[1] and forwards all buffers, together
// with that count, to OxyDeoxy_caller in the CUDA DLL.
//
// Parameters:
//   RRef, GRef, BRef, R, G, B, oxy, deoxy, pinv - float buffers handed through
//       unchanged to OxyDeoxy_caller.
//   N - pointer to at least two int32_t values; their product is the number
//       of elements to process.
//
// Returns 0 on success (including an empty input, where nothing is done) and
// -1 when N is NULL, a dimension is negative, or the product does not fit
// into an int.
int32_t OxyDeoxyCu(float *RRef, float *GRef, float *BRef, float *R,
float *G, float *B, float *oxy, float *deoxy, float *pinv, int32_t *N)
{
    int n;

    // Reject a missing or nonsensical dimension array before dereferencing it.
    if (N == NULL)
        return -1;
    if (N[0] < 0 || N[1] < 0)
        return -1;

    // Nothing to process: succeed without touching the GPU.
    if (N[0] == 0 || N[1] == 0)
        return 0;

    // Guard the multiplication below against signed overflow.
    if (N[0] > 2147483647 / N[1])
        return -1;

    n = N[0] * N[1];

    //Call extern CUDA kernel caller
    OxyDeoxy_caller(RRef, GRef, BRef, R, G, B, oxy, deoxy, pinv, n);
    return 0;
}[/codebox]
This is the .cu file that will be compiled with nvcc:
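[codebox]//Threads per block used for the kernel launch. For full utilisation the number of blocks (not this value) should be at least the number of multiprocessors (GTX285: 30 MPUs).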
#define BLOCK_SIZE 64
// Test kernel: writes the constant 5 into oxy[0..n) and 44 into deoxy[0..n).
//
// Each thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop), so any 1D grid/block configuration
// with at least one block covers all n elements.
//
// Parameters:
//   oxy, deoxy - device buffers with room for at least n floats; written.
//   n          - number of elements to write.
//   RRef, GRef, BRef, R, G, B, pinv - currently unused by the kernel body.
//
// No shared memory is used. The qualifier must be spelled `__global__`;
// a bare `global` does not declare a kernel.
__global__ void OxyDeoxy_gpu(float *RRef,float *GRef,float *BRef,float *R,float *G,float *B,float *oxy,float *deoxy,float *pinv,int n)
{
    // Total number of threads in the grid = distance between the elements
    // handled by one thread.
    const int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    {
        // float literals avoid an int -> float conversion on every store
        oxy[i] = 5.0f;
        deoxy[i] = 44.0f;
    }
}
// Host-side driver for OxyDeoxy_gpu.
//
// Allocates the device buffers, launches the kernel over n elements, copies
// d_oxy / d_deoxy back into the host arrays oxy / deoxy and releases all
// device memory again.
//
// Parameters:
//   oxy, deoxy - host arrays with room for n floats; overwritten.
//   n          - number of elements; nothing is done when n <= 0.
//   RRef, GRef, BRef, R, G, B, pinv - host inputs; not uploaded yet (the
//       host-to-device copy is still commented out below).
//
// Error handling: the function returns void, so a CUDA failure cannot be
// reported to the caller. After the first failing call every later step is
// skipped, the device buffers are freed, and oxy keeps the sentinel value 1
// written below - which makes a failed GPU path visible in the output.
extern "C"
void KernelCaller(float *RRef, float *GRef, float *BRef, float *R,
float *G, float *B, float *oxy, float *deoxy, float *pinv, int n)
{
    // All device pointers start as null so the cleanup at the end can free
    // them unconditionally (cudaFree on a null pointer is a no-op).
    float *d_R = 0, *d_G = 0, *d_B = 0, *d_RRef = 0, *d_GRef = 0, *d_BRef = 0;
    float *d_oxy = 0, *d_deoxy = 0, *d_pinv = 0;
    // The eight per-element buffers, all of the same size.
    float **perElement[] = { &d_R, &d_G, &d_B, &d_RRef, &d_GRef, &d_BRef,
                             &d_oxy, &d_deoxy };
    const int perElementCount = (int)(sizeof(perElement) / sizeof(perElement[0]));
    // Grid x-dimension limit on compute capability 1.x/2.x devices such as
    // the GTX 285 named in this file; the kernel's grid-stride loop covers
    // the remaining elements when the grid is capped.
    const int maxGridX = 65535;
    cudaError_t status = cudaSuccess;
    size_t size;
    int i, blocks;
    dim3 grid, block;

    if (n <= 0 || oxy == 0 || deoxy == 0)
        return;

    // size_t arithmetic: n * sizeof(float) can exceed INT_MAX.
    size = (size_t)n * sizeof(float);

    block.x = BLOCK_SIZE;
    // Round up: plain n/BLOCK_SIZE yields 0 blocks for n < BLOCK_SIZE, which
    // is an invalid launch configuration (the kernel would never run).
    blocks = n / BLOCK_SIZE + ((n % BLOCK_SIZE) != 0 ? 1 : 0);
    grid.x = (blocks < maxGridX) ? blocks : maxGridX;

    // Test sentinel: if the device path fails, oxy stays filled with ones.
    for (i = 0; i < n; i++)
    {
        oxy[i] = 1.0f;
    }

    //Allocate memory on GPU
    for (i = 0; i < perElementCount && status == cudaSuccess; i++)
    {
        status = cudaMalloc((void**)perElement[i], size);
    }
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_pinv, 6 * sizeof(float));

    //Load data to device memory
    // (still disabled; note the byte count must be `size`, not `n`)
    //cudaMemcpy(d_R,R,size,cudaMemcpyHostToDevice);

    //Call CUDA kernel
    if (status == cudaSuccess)
    {
        OxyDeoxy_gpu<<<grid, block>>>(d_RRef, d_GRef, d_BRef, d_R, d_G, d_B,
                                      d_oxy, d_deoxy, d_pinv, n);
        // A launch does not return an error itself; fetch it explicitly.
        status = cudaGetLastError();
    }

    //Read data from device memory
    // The blocking cudaMemcpy also waits for the kernel and reports any
    // error raised while it was running.
    if (status == cudaSuccess)
        status = cudaMemcpy(oxy, d_oxy, size, cudaMemcpyDeviceToHost);
    if (status == cudaSuccess)
        status = cudaMemcpy(deoxy, d_deoxy, size, cudaMemcpyDeviceToHost);

    //Free Memory on GPU
    for (i = 0; i < perElementCount; i++)
    {
        cudaFree(*perElement[i]);
    }
    cudaFree(d_pinv);

    // status cannot be returned through the void signature.
    (void)status;
}
// Exported, C-linkage entry point of the CUDA DLL; called from the host DLL.
// Forwards every argument unchanged to KernelCaller.
// The linkage specifier must use plain ASCII double quotes: extern "C".
extern "C"
__declspec(dllexport)
void OxyDeoxy_caller(float *RRef, float *GRef, float *BRef, float *R,
float *G, float *B, float *oxy, float *deoxy, float *pinv, int n)
{
    KernelCaller(RRef, GRef, BRef, R, G, B, oxy, deoxy, pinv, n);
}
[/codebox]