cudaMalloc error in big loop

pixelhead · March 20, 2008, 1:08am

Hi -

I believe I am encountering an error on cudaMalloc that I’m having trouble tracking down the cause.

Here’s a quick description of the algorithm:

main() has a long loop, anywhere from 200-500 iterations
each iteration calls 4 GPU kernels

for each kernel call, device memory is allocated and freed
for each kernel, there is a host-device and then a device-host transfer

The problem:

around 300 iterations there is an “unknown error” returned by cudaMalloc causing “invalid device pointer” error on subsequent cudaMemcpy calls

Other symptoms:

the GPU crash occurs for a certain dataset size and above (28MBx7=~200MB); the kernel it crashes in allocates 7 arrays of 28MB.
the error returned is apparently not “cudaErrorMemoryAllocation” and is only caught with cudaGetLastError()
there seems to be enough free memory before the cudaMalloc call (via cuMemGetInfo())

System Info:

Hardware: nVidia GTX 8800

cuda version: 1.1

OS: Win XP

Questions:

hints what may be causing the crash?
instead of allocating device memory each iteration, it is possible to allocate the device memory once before the loop? since I have 4 separate kernel calls, are there any restrictions with this approach? (ie: are the device pointers still valid after each kernel exits).

   //-------------------------------------------------------------------

    // 0. Memory allocation

    //-------------------------------------------------------------------

    // device allocation

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_u1, nDataSizeBytes ) );

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_u2, nDataSizeBytes ) );

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_u3, nDataSizeBytes ) );

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_f1, nDataSizeBytes ) );

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_f2, nDataSizeBytes ) );

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_f3, nDataSizeBytes ) );

//    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_J, nDataSizeBytes ) );

   cudaError_t error;

   cuMemGetInfo( &nFree, &nTotal );

    printf("dlGPUEvaluate_f_MI (malloc7): nFree= %d nTotal=%d\n", nFree, nTotal );

    error = cudaMalloc( (void **)&d_J, nDataSizeBytes );

//    if( error == cudaErrorMemoryAllocation )

    if( cudaCheckErrors() )

    {

        printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMalloc 7: nDataSizeBytes= %d\n", nDataSizeBytes );

        cuMemGetInfo( &nFree, &nTotal );

        printf("dlGPUEvaluate_f_MI (malloc7): nFree= %d nTotal=%d\n", nFree, nTotal );

    }

   // setup interp grid params

    float pGridParams[6];

    pGridParams[ GRID_DX_INV ] = 1.f / dx;

    pGridParams[ GRID_DY_INV ] = 1.f / dy;

    pGridParams[ GRID_DZ_INV ] = 1.f / dz;

    pGridParams[ GRID_DX2_INV ] = 1.f / (2*dx);

    pGridParams[ GRID_DY2_INV ] = 1.f / (2*dy);

    pGridParams[ GRID_DZ2_INV ] = 1.f / (2*dz);

    int pDimParams[3];

    pDimParams[ X ] = nWidth;

    pDimParams[ Y ] = nHeight;

    pDimParams[ Z ] = nDepth;

   //-------------------------------------------------------------------

    // 1. Copy data from host to device

    //-------------------------------------------------------------------

    CUDA_SAFE_CALL( cudaMemcpy( d_u1, h_u1, nDataSizeBytes, cudaMemcpyHostToDevice ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_u2, h_u2, nDataSizeBytes, cudaMemcpyHostToDevice ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_u3, h_u3, nDataSizeBytes, cudaMemcpyHostToDevice ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_f1, h_f1, nDataSizeBytes, cudaMemcpyHostToDevice ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_f2, h_f2, nDataSizeBytes, cudaMemcpyHostToDevice ) );

    CUDA_SAFE_CALL( cudaMemcpy( d_f3, h_f3, nDataSizeBytes, cudaMemcpyHostToDevice ) );

   if( cudaCheckErrors() )

    {

        printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpy: host-to-device\n" );

    }

   CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_GridF, pGridParams, 6*sizeof( float ) ) );

    CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_DimF, pDimParams, 3*sizeof( int ) ) );

    CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_lambda, &lambda, sizeof( float ) ) );

    if( cudaCheckErrors() )

    {

        printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpyToSymbol\n" );

    }

   //-------------------------------------------------------------------

    // 2. Setup CUDA kernel params

    //-------------------------------------------------------------------

    dim3 blockGrid( iDivUp( nDepth, BLOCKWIDTH_MI), iDivUp( nHeight, (BLOCKHEIGHT-2*PAD) ) );

    dim3 threadBlock( PAD_ALIGNED+BLOCKWIDTH_MI+8, BLOCKHEIGHT );

   //-------------------------------------------------------------------

    // 3. Compute f

    //-------------------------------------------------------------------

    evaluate_f_MI<<<blockGrid, threadBlock >>>(

        d_u1, d_u2, d_u3,

        d_f1, d_f2, d_f3, d_J );

   //-------------------------------------------------------------------

    // 4. Copy results from device to host

    //-------------------------------------------------------------------

    CUDA_SAFE_CALL( cudaMemcpy( h_f1, d_f1, nDataSizeBytes, cudaMemcpyDeviceToHost) );

    CUDA_SAFE_CALL( cudaMemcpy( h_f2, d_f2, nDataSizeBytes, cudaMemcpyDeviceToHost) );

    CUDA_SAFE_CALL( cudaMemcpy( h_f3, d_f3, nDataSizeBytes, cudaMemcpyDeviceToHost) );

    CUDA_SAFE_CALL( cudaMemcpy( h_J,  d_J,  nDataSizeBytes, cudaMemcpyDeviceToHost) );

   if( cudaCheckErrors() )

    {

        printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpy: device-to-host\n" );

    }

   //-------------------------------------------------------------------

    // 5. evaluate SKL

    //-------------------------------------------------------------------

    // reuse previously allocated memory for SKL

    float *d_SKL = d_u1;

    // setup kernel params

    blockGrid = dim3( iDivUp( nDepth, BLOCKWIDTH_SKL), iDivUp( nHeight, BLOCKHEIGHT_SKL ) );

    threadBlock = dim3( BLOCKWIDTH_SKL, BLOCKHEIGHT_SKL );

    // copy constant from host to device

    float fSKL_default = (.001f-1.f) * logf( .001f );

    CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_SKL_default, &fSKL_default, sizeof( float ) ) );

    CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_DimF, pDimParams, 3*sizeof( int ) ) );

    // compute SKL per voxel

    evaluate_SKL<<< blockGrid, threadBlock >>>(

        d_J, d_SKL

        );

   // sum SKL

    dlCUDASumNPOT( fSKL, d_SKL, d_J, nDataSize );

   if( cudaCheckErrors() )

    {

        printf("Cuda Error in dlGPUEvaluate_f_MI()!: SKL\n" );

    }

   // clean up

    CUDA_SAFE_CALL( cudaFree(d_u1) );

    CUDA_SAFE_CALL( cudaFree(d_u2) );

    CUDA_SAFE_CALL( cudaFree(d_u3) );

    CUDA_SAFE_CALL( cudaFree(d_f1) );

    CUDA_SAFE_CALL( cudaFree(d_f2) );

    CUDA_SAFE_CALL( cudaFree(d_f3) );

    CUDA_SAFE_CALL( cudaFree(d_J) );

Output:

dlGPUEvaluate_f_MI (malloc7): nFree= 558338048 nTotal=804978688

Cuda error: unknown error 

Cuda Error in dlGPUEvaluate_f_MI()!: cudaMalloc 7: nDataSizeBytes= 28311552

dlGPUEvaluate_f_MI (malloc7): nFree= 530026496 nTotal=804978688

Cuda error: invalid device pointer 

Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpy: device-to-host

AndreiB · March 20, 2008, 4:58am

Yes. And IMO this is much better from performance point of view since memory allocation/dealocation may take considerable time (comparing with kernel running time itself).

DenisR · March 20, 2008, 7:25am

Also, I did not find any CudaFree calls in your code, so you were allocating memory until there was no memory left on your device, that is why the calls to CudaMalloc started to fail. But allocating once (the largest size you will need) is much much faster.

pixelhead · March 20, 2008, 4:06pm

Thanks for the info about allocating once. I do call free on the device memory, I just didn’t copy and paste it (I’ve edited the code above).

If there is a memory leak, shouldn’t cudaMalloc return “cudaErrorMemoryAllocation”? I just get “unknown error”.

Anyway, I’ll give the initing memory once a go and see it the problem disappears.

Thanks for the help.

pixelhead · March 20, 2008, 8:29pm

Changing to init once did the trick! Thanks all.

garciav · March 20, 2008, 8:43pm

Hi,
I have exactely the same problem. I do a lot of cudaMalloc() but I can’t do only one malloc as you.
Actually, the loop is not in my function but in my matlab program.
I can’t understand why an “unknown error” appears without any reason.
You have solved the problem by doing only one malloc but actually you have not exactly solved the problem. You have just modify your code to avoid this problem. Note that this is not reproach. I think that this is interesting to understand how CUDA works.
:)
Vince

DenisR · March 21, 2008, 6:51am

garciav,
I am also using CUDA from matlab, and I have the idea that mex files run in the same thread as matlab itself, so what you can probably do is make a persistent variable in your mex function, allocate the memory once and free it when the mex file is unloaded. I haven’t tried yet, because I did not need it yet, but I have been thinking of doing this in the near future.

If I have my machine back up & running I will try to make a small program to try this out.

garciav · March 21, 2008, 7:12am

Hum… Very interesting! However, pixelhead had the same problem. If he use matlab, it could be the answer to our problem. But if he’s not, the problem is elsewhere.

Pixelhead , could you tell us how do you use CUDA? (Matlab or C)

Maybe we could mention this problem to Mathworks.
I will continue my investigations until I perfectly understand why it doesn’t work.

External Image The Truth is Out There External Image

vnena · March 28, 2008, 2:43pm

Hey.

I’m doing something very similar and have noticed that in my case the problem occurs when using DX9 API on XPDM drivers (on 2003 or Vista) and not on the WDDM drivers (Vista).

Long story short is that on XPDM drivers allocating videomemory for textures with surface format of R32F (IEEE format 32-bit float format using 32 bits for the red channel) the memory available seems to be 1/4 of that in WDDM when using 32 bit float/a8r8g8b8 etc and 1/2 with others as shown below.

After playing with various SurfaceFormat settings I have found something weird:

//# of 256*256 surfaces allocated in specified format before out of video memory exception:

XPDM:
//4857 halfsingle
//1121 single/b8g8r8/A8R8G8B8 << DRIVER ISSUE. Single & other 32 bit formats seem to take twice the amount of memory than the should!
//624 vector4 << notice how 4x 32bit takes only half more than one 32 bit!
//1242 vector2/halfvector4/r16b16g16a16

WDDM:
//9639 halfsingle
//4891 single/color surface * 256 yres *256 xres * 32 / 8 / 1024 / 1024 = 1222 MB, kind of weird since I have only 1024 MB of video memory out of which Aero takes some.
//1204 vector4
//2409 vector2/halfvector4

I don’t know why the others are 2x with WDDM (I have 1 GB of video memory in 8800 GT) but notice how the Single (32 bit float)/Color (8,8,8,8) is approx four (4) times larger while the rest are only twice as large with WDDM.

My code is using XNA so it uses DirectX 9 on both XPDM and WDDM drivers. It’s just a bunch of “new RenderTarget2D” calls.

Hardware: ds2r,q6600,4gb,Palit brand 8800 gt/1GB (memory chips on both sides of card)

garciav · April 2, 2008, 10:15am

DenisR,

You wrote this :

Actually, in my case, CUDA is called inside two imbricated loops :

Use different value of the matrix
Split the matrix in smaller matrixes

So, I can’t allocate the memory once…

Any idea?

Thanks,

Vince

DenisR · April 2, 2008, 12:12pm

You can allocate the maximum memory you will need, and reuse that memory also for the smaller matrices (if I understand your problem correctly) I have not had time to test if it works yet unfortunately.

djoni · May 21, 2008, 10:34am

I have a similar issue with cudaMalloc returning the unknown error rather than the “cudaErrorMemoryAllocation” error. This happens at the very beginning of my C code. It’s not in a loop or anything. That’s the line in question:

CUDA_SAFE_CALL( cudaMalloc( (void **)&d_ImageGreyFull2, IMAGE_SIZE_MAX_1F) );

(I did check that the error returned by this cudaMalloc was not cudaErrorMemoryAllocation)

I don’t understand the problem as this worked fine until recently.
Pixelhead mention “initing”. What does that mean?

Thanks for your help.

djoni · May 21, 2008, 11:09am

I solved my problem. That came from an unrelated pointer problem in C CPU code happening just before.

Sorry about this then. Feel free to delete this post and my previous one then.

Topic		Replies	Views
using cudaMalloc and cudaFree within a loop unspecified launch failure! CUDA Programming and Performance	21	37702	April 23, 2009
CudaMalloc on Vista : strange behaviour Works on XP, Fails on Vista CUDA Programming and Performance	6	12258	July 1, 2009
Cudamalloc attempting to allocate more memory than it is supposed to CUDA Programming and Performance cuda	15	223	January 13, 2025
Pinned memory limit CUDA Programming and Performance	16	13549	May 1, 2016
How to cudaMalloc two-dimensional array ? CUDA Programming and Performance	46	66595	September 7, 2023
cudaFree is returning an unrecognised error code CUDA Programming and Performance	10	7947	March 13, 2009
cudaMalloc causes segmentation fault 2 Mo is far from my 1,2 Go card memory limit CUDA Programming and Performance	7	7472	June 28, 2011
Can we do malloc inside a __global__ function CUDA Programming and Performance	26	9696	February 21, 2010
cudaMalloc failure CUDA Programming and Performance	5	28478	June 22, 2008
Error handling in CUDA when using cudaMemcpy3D CUDA Programming and Performance	12	11923	December 22, 2010

cudaMalloc error in big loop

Related topics