Error handling in CUDA when using cudaMemcpy3D

Hi, everyone. I have met a problem when I was using cudaMemcpy3D. I write a test code here. The function stores a stack of Fourier images into a 3D texture and then reads them one by one and then outputs them. When I use less number and small size images the result is right (such as Fourier image size is 8080, image number is 400) but when I use larger size and larger number of images(such as size is 160160, number is 2000 ), Matlab crashes. I insert some mexPrintfs in it and find that crash always happens in cudaMemcpy3D line. It seems that CUDA_SAFE_CALL cannot catch the errors so I add my own error handling function cudasafe, but it does not work either. The code is just like this:

// includes, header files

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

#include <cuda.h>

texture<float2, 3, cudaReadModeElementType> texFims;       // texture store Fourier particle images

/***************************************************************************

 * ReadImKernel : read one layer in 3D texture

 **************************************************************************/

__global__ void ReadImKernel(float2* odata,int width, int height, float layer)

{

    int x = blockIdx.x*blockDim.x + threadIdx.x;

    int y = blockIdx.y*blockDim.y + threadIdx.y;

if (x < width && y < height) {

      float2 c = tex3D(texFims, x, y, layer);

      odata[y*width+x] = c;

   }

}

void pack_c2c(float2  *output_float,  float *input_re, float *input_im, int Ntot)

{

    int i;

    for (i = 0; i < Ntot; i++) 

	     {

               output_float[i].x = input_re[i];

               output_float[i].y = input_im[i];

	     }

}

void unpack_c2c(float2  *input_float, float *output_re, float *output_im,  int Ntot)

{

    int i;

    for (i = 0; i < Ntot; i++) 

    {

               output_re[i] = input_float[i].x;

               output_im[i] = input_float[i].y;

    }

}

/********Error handling function*************************************/

void cudasafe( cudaError_t error, char* message)

{

   if(error!=cudaSuccess) { fprintf(stderr,"ERROR: %s : %i\n",message,error); exit(-1); }

}

/*************************************************************************************

 *  Main program

*************************************************************************************/

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{	

    float2 *input_single_im;

    float2 *h_odata;

    float *ar,*ai;    

    mexPrintf("debug point 1\n");

// Check inputs

	if (nrhs !=4) mexErrMsgTxt("Must have nine input arguments: Fims_dm,  n_i, ImageW, ImageH.");

	if (nlhs !=1) mexErrMsgTxt("Must have one output argument.");

    // get all the scalars

    int n_i    = int(mxGetScalar(prhs[1]));

    int ImageW = int(mxGetScalar(prhs[2]));

    int ImageH = int(mxGetScalar(prhs[3]));

/* Allocating working array on host for the first argument*/

    input_single_im  = (float2*) mxCalloc(ImageW*ImageH*n_i,sizeof(float2));

    /* Pointer for the real part of the input */

    ar = (float *) mxGetData(prhs[0]);

    /* Pointer for the imagnary part of the input */

    ai = (float *) mxGetImagData(prhs[0]); 

    /* pack them into CUDA data type */

    pack_c2c(input_single_im, ar, ai, ImageW*ImageH*n_i);

mexPrintf("debug point 2\n");

//read the input to GPU

    // Use 3D texture to store the Fourier images

    const cudaExtent Size_stack_ims = make_cudaExtent(ImageW, ImageH, n_i);   // size for particle images stack

    // create 3D array

    cudaArray* d_Fims;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float2>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_Fims, &channelDesc, Size_stack_ims) );

    mexPrintf("debug point 3\n");

    // copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)input_single_im, Size_stack_ims.width*sizeof(float2), Size_stack_ims.width, Size_stack_ims.height);

    copyParams.dstArray = d_Fims;

    copyParams.extent   = Size_stack_ims;

    copyParams.kind     = cudaMemcpyHostToDevice;

mexPrintf("debug point 4 \n");

   // CUDA_SAFE_CALL(cudaMemcpy3D(&copyParams));

    cudasafe(cudaMemcpy3D(&copyParams),"cudaMemcpy3D");   // use my own error handling function

// set texture parameters

    texFims.normalized = false;                      // access with unnormalized texture coordinates    

    texFims.filterMode = cudaFilterModePoint;        // do not want interpolation

    texFims.addressMode[0] = cudaAddressModeClamp;   // clamp texture coordinates

    texFims.addressMode[1] = cudaAddressModeClamp;

    texFims.addressMode[2] = cudaAddressModeClamp;

    // bind array to 3D texture to store the particle images

    CUDA_SAFE_CALL(cudaBindTextureToArray(texFims, d_Fims, channelDesc));  

    // allocate the spaces on device to store one Fourier image

    float2* d_im = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_im, (ImageW*ImageH*sizeof(float2))));     

/***** Host memory allocated *****/

    h_odata=(float2 *) mxCalloc(n_i*ImageW*ImageH,sizeof(float2)); // store the result 

dim3 dimBlock(8, 8, 1);

    dim3 dimGrid((ImageW+ dimBlock.x -1) / dimBlock.x, (ImageH+ dimBlock.y -1) / dimBlock.y, 1);    

mexPrintf("begin loop\n");

    // loop all the transforms

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get the one image        

        ReadImKernel<<< dimGrid, dimBlock, 0>>>( d_im, ImageW, ImageH,i );         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

         // copy the result to host space          

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*ImageW*ImageH,(float2 *)d_im, ImageW*ImageH*sizeof(float2),cudaMemcpyDeviceToHost));  // memory copy

    }

/* get a pointer to  the output  */

    const mwSize dims[]={ImageH,ImageW,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxSINGLE_CLASS,mxCOMPLEX);    

    ar = (float *)mxGetPr(plhs[0]); 

    ai = (float *)mxGetPi(plhs[0]);

    unpack_c2c(h_odata, ar, ai, ImageH * ImageW * n_i); 

    // clear memory

    mxFree(h_odata);

    mxFree(input_single_im);

    CUDA_SAFE_CALL(cudaFree(d_im));

    CUDA_SAFE_CALL(cudaFreeArray(d_Fims));

    mexPrintf("finish\n");

    return;

}

I know that 3D texture has a limit of 2048, but I only use the number of 2000 and as to float2 image type, the whole size of the image is 1601602000*8 bytes=391M bytes. I think it is not a very large size for GPU memory. Does anyone know what is the problem? And how to catch the error when doing the memory copy operation? Thank you in advance.

I’ve never used CUDA_SAFE_CALL in Matlab. If it’s not working there, how do you know it worked earlier? Maybe your cudaMalloc3DArray failed.

I use

void checkCudaError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err ) mexErrMsgTxt(msg);
}

which I use after each CUDA call. For example in your case, I would do

cudaMemcpy3D(&copyParams);
checkCudaError(“at cudaMemcpy3D”);

Thank you Dittoaway.

I used your function and indeed when the image number is 800, cudaMalloc3DArray failed.But I still feel very strange of the memory size of 3D Array on GPU. When the image size is 160, number is 800, it only need 160160800*8 bytes ( for fourier image, float2 needs 8 bytes). I donot think it is a very large size for GPU global memory.What do you think about the memory limit and how to find out the size of free blocks of memory on the GPU so we can avoid allocation errors beforehand? Thank you !

What card are you using?

Oh, you are online. Thank you .

My device is

“GeForce 9400M”

Ahhh. That’s a mobile chip, maybe without dedicated graphics memory. Did your system say how much memory is/could be allocated to graphics?

I am a beginner . So I know little about it. I post here what I got from devicequery:

Total amount of global memory: 265945088 bytes

Multiprocessors x Cores/MP = Cores: 2 (MP) x 8 (Cores/MP) = 16 (Cores)

Total amount of constant memory: 65536 bytes

Total amount of shared memory per block: 16384 bytes

Total number of registers available per block: 8192

Warp size: 32

Maximum number of threads per block: 512

Maximum sizes of each dimension of a block: 512 x 512 x 64

Maximum sizes of each dimension of a grid: 65535 x 65535 x 1

Maximum memory pitch: 2147483647 bytes

Texture alignment: 256 bytes

Clock rate: 1.10 GHz

Do you mean that my card is not an advanced one? Does 3D Array use global memory? The limit is quite large I think.

Your 3D array will be in global memory.
You are trying to allocate ~400 mB and have only ~265 MB available.

That chip is for notebooks and has modest CUDA abilities.

Thank you for your patient reply. But I still have some questions.

  1. Just now, I tried image number=400, it crashed. But in that case, the memory allocated only 1601604008=78M. It should not crash. Why the running is not stable. And several days ago,when I havenot added the error handling, I indeed ran it with image number=400, 600,800,1000,2000 which increased gradually. It all worked only crashed when I used 2000 at first time when I start Matlab. And 16016020008=390M > 265M. Why it works when I gradually increase the image number from a small one? And Why it now crashes with only 400? I am totally confused.

2, Can we get the free memory size before allocation? Is there any function?

3, You said you never use CUDA_SAFE_CALL in Matlab, do you mean it will work in other environment? In SDK code, they use cutilSafeCall. What is the difference and can it work in Matlab?

Sorry for giving you so much trouble. Thank you again.

Sorry, but I don’t have an answer for you.
I’m not familiar with your chip or the system it is in. Some mobile chips can grab variable amounts of the system memory for their use. It’s possible that is a partial explanation.
Also likely that there is a bug in your code ;(

When you say it crashes, are you getting cuda errors or is Matlab itself crashing? Is Matlab running out of memory?

Sorry for my incomplete information. I am running in CUDA 3.2, Mac OsX 10.6.5 and Matlab R2010b.

My test code for the former cuda function is

% Try large numbers of images .

n=160;  % image size

nim=400;  % Number of images

images = randn(n,n,nim);   % store images

% Compute FT of images

Fimgs=single(complex(zeros(n,n,nim)));

for i=1:nim

    Fimgs(:,:,i)=fftshift(fftn(fftshift(images(:,:,i))));

end;

figure(1)

SetComplex;

subplot(3,3,1);

imacx(Fimgs(:,:,1),.3);

subplot(3,3,4);

imacx(Fimgs(:,:,2),.3);

subplot(3,3,7);

imacx(Fimgs(:,:,3),.3);

disp(' ');

disp('using GPU');

tic

CC=testCUDAMemcpy3D(Fimgs,nim,n,n);

toc

subplot(3,3,3);

imacx(CC(:,:,1),.3);

subplot(3,3,6);

imacx(CC(:,:,2),.3);

subplot(3,3,9);

imacx(CC(:,:,2),.3);

drawnow;

and the running result is :

using GPU

debug point 1

debug point 2

??? Error using ==> testCUDAMemcpy3D

at cudaMalloc3DArray

Error in ==> testBigCudaMemcpy3D at 33

CC=testCUDAMemcpy3D(Fimgs,nim,n,n);

in which " at cudaMalloc3DArray" is printed from your error handling function checkCudaError. I called it after each CUDA call. So, could you help me to find the reason?

“So, could you help me to find the reason?”

Afraid not. You may just not have the graphics memory to do the job. The 256 MB of shared system memory is not used for just CUDA. It’s also used to drive the Matlab graphics. I also don’t know how your system assigns system memory for graphics. It’s possible (?) that Matlab itself is using so much memory that the full 256 MB is not given to the graphics sub-system. That’s for a Mac techie (not me) to know. Have you tried running devicequery in parallel with Matlab running your code to see if the full 256 MB of global memory is listed.

I tried running devicequery in parallel with Matlab and found that it still listed 256 MB of global memory . But maybe it can not list dynamic memory size.

You have already given me so much help. Many thanks to you!