Kernel and/or memcopy problem

Hello,

I want to build a dll that is getting called from LabVIEW the first code block is the c file that will be compiled as a dll (with VS 2008) and then called from LabVIEW it doesn’t make much but importing the dll (from th .cu file) with the CUDA stuff and call a function in it.

The second code block is the .cu file (compiled with nvcc) at first the caller on the bottom will be called from the c file dll, that will call the function that includes a few CUDA functions and the kernel caller. Both files are compiled without an error message.

To test it I filled one array with ones and then called the kernel where I filled it with another value. Unfortunately the result is still a array filled with ones. So the kernel vall or the memcopy functions seems to have a problem -that I am not able to find.

I hope someone can see my mistake and perhaps tell me if my way to create a from LabVIEW executeable dll is to hard and whose steps I could save.

I use Visual Studio 2008, CUDA 3.0 on a Win7 64bit system.

This is the .c file that will be compiled as dll and called via LabVIEW

[codebox]#include <stdio.h>

#include <math.h>

#include <extcode.h>

#include <cuda_runtime_api.h>

//define block size should be at least # of MPUs (GTX285:30MPUs)

#define BLOCK_SIZE 64

__declspec(dllexport)

int32_t OxyDeoxyCu(float *RRef, float *GRef, float *BRef, float *R,

float *G, float *B, float *oxy, float *deoxy, float *pinv, int n);

__declspec(dllimport)

void OxyDeoxy_caller(float*, float*, float*, float*,

float*, float*, float*, float*, float*, int32_t *);

int32_t OxyDeoxyCu(float *RRef, float *GRef, float *BRef, float *R,

float *G, float *B, float *oxy, float *deoxy, float *pinv, int32_t *N)

{

int n;

n=N[0]*N[1];

//Call extern CUDA kernel caller

OxyDeoxy_caller(RRef,GRef,BRef,R,G,B,oxy,deoxy,pinv,n);

return 0;

}[/codebox]

This is the .cu file that will be compiled with nvcc

[codebox]//define block size should be at least # of MPUs (GTX285:30MPUs)

#define BLOCK_SIZE 64

global void OxyDeoxy_gpu(float *RRef,float *GRef,float *BRef,float *R,float *G,float *B,float *oxy,float *deoxy,float *pinv,int n)

{

int i;

for(i=blockIdx.x*blockDim.x+threadIdx.x; i<n; i+=blockDim.x*gridDim.x)

{

oxy[i]=5;

deoxy[i]=44;

}

}

extern “C”

void KernelCaller(float *RRef, float *GRef, float *BRef, float *R,

float *G, float *B, float *oxy, float *deoxy, float *pinv, int n)

{

float *d_R,*d_G,*d_B,*d_RRef,*d_GRef,*d_BRef,*d_oxy,*d_deoxy,*d_pi

nv;

int size=n*sizeof(float);

int i;

dim3 grid, block;

block.x=BLOCK_SIZE;

grid.x=(n/BLOCK_SIZE);

//Allocate memory on GPU

cudaMalloc((void**)&d_R,size);

cudaMalloc((void**)&d_G,size);

cudaMalloc((void**)&d_B,size);

cudaMalloc((void**)&d_RRef,size);

cudaMalloc((void**)&d_GRef,size);

cudaMalloc((void**)&d_BRef,size);

cudaMalloc((void**)&d_oxy,size);

cudaMalloc((void**)&d_deoxy,size);

cudaMalloc((void**)&d_pinv,(6*(sizeof(float))));



//Load data to device memory

//cudaMemcpy(d_R,R,n,cudaMemcpyHostToDevice);

for(i=0;i<n;i++)

{

oxy[i]=1;

}

//Call CUDA kernel

OxyDeoxy_gpu<<<grid,block>>>(d_RRef,d_GRef,d_BRef,d_R,d_G,d_B,d_oxy,d_deoxy,d_pinv,n)

;

//Read data from device memory

cudaMemcpy(oxy,d_oxy,size,cudaMemcpyDeviceToHost);

cudaMemcpy(deoxy,d_deoxy,size,cudaMemcpyDeviceToHost);

//Free Memory on GPU

cudaFree(d_R);

cudaFree(d_G);

cudaFree(d_B);

cudaFree(d_RRef);

cudaFree(d_GRef);

cudaFree(d_BRef);

cudaFree(d_oxy);

cudaFree(d_deoxy);

cudaFree(d_pinv);

}

//Will be called from the cpp DLL

extern “C”

__declspec(dllexport)

void OxyDeoxy_caller(float *RRef, float *GRef, float *BRef, float *R,

float *G, float *B, float *oxy, float *deoxy, float *pinv, int n)

{

KernelCaller(RRef,GRef,BRef,R,G,B,oxy,deoxy,pinv,n);

}

[/codebox]