Hi, everyone. I have met a problem when I was using cudaMemcpy3D. I write a test code here. The function stores a stack of Fourier images into a 3D texture and then reads them one by one and then outputs them. When I use less number and small size images the result is right (such as Fourier image size is 8080, image number is 400) but when I use larger size and larger number of images(such as size is 160160, number is 2000 ), Matlab crashes. I insert some mexPrintfs in it and find that crash always happens in cudaMemcpy3D line. It seems that CUDA_SAFE_CALL cannot catch the errors so I add my own error handling function cudasafe, but it does not work either. The code is just like this:
// includes, header files
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "mex.h"
#include "matrix.h"
#include <cutil.h>
#include <cuda.h>
texture<float2, 3, cudaReadModeElementType> texFims; // texture store Fourier particle images
/***************************************************************************
* ReadImKernel : read one layer in 3D texture
**************************************************************************/
__global__ void ReadImKernel(float2* odata,int width, int height, float layer)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < width && y < height) {
float2 c = tex3D(texFims, x, y, layer);
odata[y*width+x] = c;
}
}
void pack_c2c(float2 *output_float, float *input_re, float *input_im, int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_float[i].x = input_re[i];
output_float[i].y = input_im[i];
}
}
void unpack_c2c(float2 *input_float, float *output_re, float *output_im, int Ntot)
{
int i;
for (i = 0; i < Ntot; i++)
{
output_re[i] = input_float[i].x;
output_im[i] = input_float[i].y;
}
}
/********Error handling function*************************************/
void cudasafe( cudaError_t error, char* message)
{
if(error!=cudaSuccess) { fprintf(stderr,"ERROR: %s : %i\n",message,error); exit(-1); }
}
/*************************************************************************************
* Main program
*************************************************************************************/
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] )
{
float2 *input_single_im;
float2 *h_odata;
float *ar,*ai;
mexPrintf("debug point 1\n");
// Check inputs
if (nrhs !=4) mexErrMsgTxt("Must have nine input arguments: Fims_dm, n_i, ImageW, ImageH.");
if (nlhs !=1) mexErrMsgTxt("Must have one output argument.");
// get all the scalars
int n_i = int(mxGetScalar(prhs[1]));
int ImageW = int(mxGetScalar(prhs[2]));
int ImageH = int(mxGetScalar(prhs[3]));
/* Allocating working array on host for the first argument*/
input_single_im = (float2*) mxCalloc(ImageW*ImageH*n_i,sizeof(float2));
/* Pointer for the real part of the input */
ar = (float *) mxGetData(prhs[0]);
/* Pointer for the imagnary part of the input */
ai = (float *) mxGetImagData(prhs[0]);
/* pack them into CUDA data type */
pack_c2c(input_single_im, ar, ai, ImageW*ImageH*n_i);
mexPrintf("debug point 2\n");
//read the input to GPU
// Use 3D texture to store the Fourier images
const cudaExtent Size_stack_ims = make_cudaExtent(ImageW, ImageH, n_i); // size for particle images stack
// create 3D array
cudaArray* d_Fims;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float2>();
CUDA_SAFE_CALL( cudaMalloc3DArray(&d_Fims, &channelDesc, Size_stack_ims) );
mexPrintf("debug point 3\n");
// copy data to 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)input_single_im, Size_stack_ims.width*sizeof(float2), Size_stack_ims.width, Size_stack_ims.height);
copyParams.dstArray = d_Fims;
copyParams.extent = Size_stack_ims;
copyParams.kind = cudaMemcpyHostToDevice;
mexPrintf("debug point 4 \n");
// CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
cudasafe(cudaMemcpy3D(©Params),"cudaMemcpy3D"); // use my own error handling function
// set texture parameters
texFims.normalized = false; // access with unnormalized texture coordinates
texFims.filterMode = cudaFilterModePoint; // do not want interpolation
texFims.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates
texFims.addressMode[1] = cudaAddressModeClamp;
texFims.addressMode[2] = cudaAddressModeClamp;
// bind array to 3D texture to store the particle images
CUDA_SAFE_CALL(cudaBindTextureToArray(texFims, d_Fims, channelDesc));
// allocate the spaces on device to store one Fourier image
float2* d_im = NULL;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_im, (ImageW*ImageH*sizeof(float2))));
/***** Host memory allocated *****/
h_odata=(float2 *) mxCalloc(n_i*ImageW*ImageH,sizeof(float2)); // store the result
dim3 dimBlock(8, 8, 1);
dim3 dimGrid((ImageW+ dimBlock.x -1) / dimBlock.x, (ImageH+ dimBlock.y -1) / dimBlock.y, 1);
mexPrintf("begin loop\n");
// loop all the transforms
for (int i = 0; i < n_i; i++)
{
// read in 3D texture one layer to get the one image
ReadImKernel<<< dimGrid, dimBlock, 0>>>( d_im, ImageW, ImageH,i );
CUDA_SAFE_CALL( cudaThreadSynchronize());
// copy the result to host space
CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*ImageW*ImageH,(float2 *)d_im, ImageW*ImageH*sizeof(float2),cudaMemcpyDeviceToHost)); // memory copy
}
/* get a pointer to the output */
const mwSize dims[]={ImageH,ImageW,n_i};
plhs[0] = mxCreateNumericArray(3,dims,mxSINGLE_CLASS,mxCOMPLEX);
ar = (float *)mxGetPr(plhs[0]);
ai = (float *)mxGetPi(plhs[0]);
unpack_c2c(h_odata, ar, ai, ImageH * ImageW * n_i);
// clear memory
mxFree(h_odata);
mxFree(input_single_im);
CUDA_SAFE_CALL(cudaFree(d_im));
CUDA_SAFE_CALL(cudaFreeArray(d_Fims));
mexPrintf("finish\n");
return;
}
I know that 3D texture has a limit of 2048, but I only use the number of 2000 and as to float2 image type, the whole size of the image is 1601602000*8 bytes=391M bytes. I think it is not a very large size for GPU memory. Does anyone know what is the problem? And how to catch the error when doing the memory copy operation? Thank you in advance.