Hi, everyone.
I met a problem in my project. There are some large volume data that will be used as input for several GPU programs. Because the transferring data from host to device is time consuming. I am thinking whether we can load the data to GPU one time and remain them on the GPU and pass out the pointer which is the address of the data out. When we want to reuse those data, we can just pass this pointer to the new GPU program and find them on GPU. Is this reasonable?
I wrote a test program about this thought. Because I use matlab on mac laptop. So I wrote a mex function. The GPU program is just to double the density of each pixel of an image. I copied the address of input data and the result data to a long type variable on CPU and pass out to matlab. But sometimes matlab will quit unexpectedly, sometimes the mexPrintf will print out: "the address of d_img_float on GPU is 0x4836866000921821184
the address of d_outimg_float on GPU is 0x4872894797949173760"
which looks not right.
The test program is attached here. I am not good at using pointer in C. So maybe there are some silly errors in it.
#include <string.h>
#include <math.h>
#include "mex.h"
#include "cutil.h"
void gpuAllocMemory(float** ptr_dev, int mem_size)
{
cudaMalloc((void**)ptr_dev, mem_size);
}
void gpuTransferToDevice(float* ptr_dev, float* ptr_host,int mem_size)
{
cudaMemcpy(ptr_dev, ptr_host, mem_size,cudaMemcpyHostToDevice); // memory copy
}
void gpuTransferAddressToHost(long* ptr_host, long* ptr_dev,int mem_size)
{
cudaMemcpy(ptr_host, ptr_dev, mem_size,cudaMemcpyDeviceToHost); // memory copy
}
void gpuFreeMemory(float *ptr_dev)
{
cudaFree(ptr_dev);
}
void copyDoubleToFloat(float *f, double *d, int size)
{
int i;
for (i=0;i<size;i++) *(f+i)=(float) *(d+i);
}
__global__ void doubleX_kernel( float *in_image , int cols, int rows, float* out_image )
{
int x = blockIdx.x*blockDim.x+threadIdx.x;
int y = blockIdx.y*blockDim.y+threadIdx.y;
out_image[y*cols+x] = 2*in_image[y*cols+x];
}
void gpuDoubleX(float* d_img, int Cols, int Rows, float* d_outimg )
{
dim3 dimBlock(8, 8, 1);
dim3 dimGrid((Cols+ dimBlock.x -1) / dimBlock.x, (Rows+ dimBlock.y -1) / dimBlock.y, 1);
doubleX_kernel<<< dimGrid, dimBlock,0 >>>( d_img, Cols, Rows,d_outimg);
}
void mexFunction(int nlhs, mxArray *plhs[],int nrhs,const mxArray *prhs[])
{
double* h_img_double;
// double* d_outimg_double;
float* h_img_float;
float* d_img_float;
float* d_outimg_float;
mwSize *dimsizes; /* number of elements in each dimension*/
long* h_pointer;
int Cols, Rows;
h_img_double = mxGetPr(prhs[0]);
dimsizes=(mwSize *)mxGetDimensions(prhs[0]);
Cols = dimsizes[0];
Rows = dimsizes[1];
h_img_float = (float*) mxCalloc(Cols*Rows,sizeof(float));
copyDoubleToFloat(h_img_float,h_img_double,Cols*Rows);
gpuAllocMemory(&d_img_float, Cols*Rows*sizeof(float));
gpuAllocMemory(&d_outimg_float, Cols*Rows*sizeof(float));
gpuTransferToDevice(d_img_float, h_img_float, Cols*Rows*sizeof(float));
gpuDoubleX(d_img_float,Cols,Rows,d_outimg_float);
h_pointer=(long*) mxCalloc(2,sizeof(long));
plhs[0] = mxCreateDoubleMatrix(2, 1, mxREAL); //create an mxArray
h_pointer =(long*) mxGetPr(plhs[0]);
gpuTransferAddressToHost(h_pointer,(long*) &d_img_float[0], sizeof(long)); // copy the address of input data to h_pointer
gpuTransferAddressToHost(h_pointer+1,(long*) &d_outimg_float[0], sizeof(long)); // copy the address of output data to h_pointer+1
mexPrintf("the address of d_img_float on GPU is 0x%ld\n",*h_pointer);
mexPrintf("the address of d_outimg_float on GPU is 0x%ld\n",*(h_pointer+1));
mxFree(h_img_float);
gpuFreeMemory(d_img_float);
gpuFreeMemory(d_outimg_float);
mxFree(h_pointer);
}
Could you please help me to figure it out? Thank you so much.