Hi -
I believe I am encountering an error on cudaMalloc that I’m having trouble tracking down the cause.
Here’s a quick description of the algorithm:
-
main() has a long loop, anywhere from 200-500 iterations
-
each iteration calls 4 GPU kernels
-
for each kernel call, device memory is allocated and freed
-
for each kernel, there is a host-device and then a device-host transfer
The problem:
- around 300 iterations there is an “unknown error” returned by cudaMalloc causing “invalid device pointer” error on subsequent cudaMemcpy calls
Other symptoms:
-
the GPU crash occurs for a certain dataset size and above (28MBx7=~200MB); the kernel it crashes in allocates 7 arrays of 28MB.
-
the error returned is apparently not “cudaErrorMemoryAllocation” and is only caught with cudaGetLastError()
-
there seems to be enough free memory before the cudaMalloc call (via cuMemGetInfo())
System Info:
Hardware: nVidia GTX 8800
cuda version: 1.1
OS: Win XP
Questions:
-
hints what may be causing the crash?
-
instead of allocating device memory each iteration, it is possible to allocate the device memory once before the loop? since I have 4 separate kernel calls, are there any restrictions with this approach? (ie: are the device pointers still valid after each kernel exits).
//-------------------------------------------------------------------
// 0. Memory allocation
//-------------------------------------------------------------------
// device allocation
CUDA_SAFE_CALL( cudaMalloc( (void **)&d_u1, nDataSizeBytes ) );
CUDA_SAFE_CALL( cudaMalloc( (void **)&d_u2, nDataSizeBytes ) );
CUDA_SAFE_CALL( cudaMalloc( (void **)&d_u3, nDataSizeBytes ) );
CUDA_SAFE_CALL( cudaMalloc( (void **)&d_f1, nDataSizeBytes ) );
CUDA_SAFE_CALL( cudaMalloc( (void **)&d_f2, nDataSizeBytes ) );
CUDA_SAFE_CALL( cudaMalloc( (void **)&d_f3, nDataSizeBytes ) );
// CUDA_SAFE_CALL( cudaMalloc( (void **)&d_J, nDataSizeBytes ) );
cudaError_t error;
cuMemGetInfo( &nFree, &nTotal );
printf("dlGPUEvaluate_f_MI (malloc7): nFree= %d nTotal=%d\n", nFree, nTotal );
error = cudaMalloc( (void **)&d_J, nDataSizeBytes );
// if( error == cudaErrorMemoryAllocation )
if( cudaCheckErrors() )
{
printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMalloc 7: nDataSizeBytes= %d\n", nDataSizeBytes );
cuMemGetInfo( &nFree, &nTotal );
printf("dlGPUEvaluate_f_MI (malloc7): nFree= %d nTotal=%d\n", nFree, nTotal );
}
// setup interp grid params
float pGridParams[6];
pGridParams[ GRID_DX_INV ] = 1.f / dx;
pGridParams[ GRID_DY_INV ] = 1.f / dy;
pGridParams[ GRID_DZ_INV ] = 1.f / dz;
pGridParams[ GRID_DX2_INV ] = 1.f / (2*dx);
pGridParams[ GRID_DY2_INV ] = 1.f / (2*dy);
pGridParams[ GRID_DZ2_INV ] = 1.f / (2*dz);
int pDimParams[3];
pDimParams[ X ] = nWidth;
pDimParams[ Y ] = nHeight;
pDimParams[ Z ] = nDepth;
//-------------------------------------------------------------------
// 1. Copy data from host to device
//-------------------------------------------------------------------
CUDA_SAFE_CALL( cudaMemcpy( d_u1, h_u1, nDataSizeBytes, cudaMemcpyHostToDevice ) );
CUDA_SAFE_CALL( cudaMemcpy( d_u2, h_u2, nDataSizeBytes, cudaMemcpyHostToDevice ) );
CUDA_SAFE_CALL( cudaMemcpy( d_u3, h_u3, nDataSizeBytes, cudaMemcpyHostToDevice ) );
CUDA_SAFE_CALL( cudaMemcpy( d_f1, h_f1, nDataSizeBytes, cudaMemcpyHostToDevice ) );
CUDA_SAFE_CALL( cudaMemcpy( d_f2, h_f2, nDataSizeBytes, cudaMemcpyHostToDevice ) );
CUDA_SAFE_CALL( cudaMemcpy( d_f3, h_f3, nDataSizeBytes, cudaMemcpyHostToDevice ) );
if( cudaCheckErrors() )
{
printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpy: host-to-device\n" );
}
CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_GridF, pGridParams, 6*sizeof( float ) ) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_DimF, pDimParams, 3*sizeof( int ) ) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_lambda, &lambda, sizeof( float ) ) );
if( cudaCheckErrors() )
{
printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpyToSymbol\n" );
}
//-------------------------------------------------------------------
// 2. Setup CUDA kernel params
//-------------------------------------------------------------------
dim3 blockGrid( iDivUp( nDepth, BLOCKWIDTH_MI), iDivUp( nHeight, (BLOCKHEIGHT-2*PAD) ) );
dim3 threadBlock( PAD_ALIGNED+BLOCKWIDTH_MI+8, BLOCKHEIGHT );
//-------------------------------------------------------------------
// 3. Compute f
//-------------------------------------------------------------------
evaluate_f_MI<<<blockGrid, threadBlock >>>(
d_u1, d_u2, d_u3,
d_f1, d_f2, d_f3, d_J );
//-------------------------------------------------------------------
// 4. Copy results from device to host
//-------------------------------------------------------------------
CUDA_SAFE_CALL( cudaMemcpy( h_f1, d_f1, nDataSizeBytes, cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy( h_f2, d_f2, nDataSizeBytes, cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy( h_f3, d_f3, nDataSizeBytes, cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL( cudaMemcpy( h_J, d_J, nDataSizeBytes, cudaMemcpyDeviceToHost) );
if( cudaCheckErrors() )
{
printf("Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpy: device-to-host\n" );
}
//-------------------------------------------------------------------
// 5. evaluate SKL
//-------------------------------------------------------------------
// reuse previously allocated memory for SKL
float *d_SKL = d_u1;
// setup kernel params
blockGrid = dim3( iDivUp( nDepth, BLOCKWIDTH_SKL), iDivUp( nHeight, BLOCKHEIGHT_SKL ) );
threadBlock = dim3( BLOCKWIDTH_SKL, BLOCKHEIGHT_SKL );
// copy constant from host to device
float fSKL_default = (.001f-1.f) * logf( .001f );
CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_SKL_default, &fSKL_default, sizeof( float ) ) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol( d_DimF, pDimParams, 3*sizeof( int ) ) );
// compute SKL per voxel
evaluate_SKL<<< blockGrid, threadBlock >>>(
d_J, d_SKL
);
// sum SKL
dlCUDASumNPOT( fSKL, d_SKL, d_J, nDataSize );
if( cudaCheckErrors() )
{
printf("Cuda Error in dlGPUEvaluate_f_MI()!: SKL\n" );
}
// clean up
CUDA_SAFE_CALL( cudaFree(d_u1) );
CUDA_SAFE_CALL( cudaFree(d_u2) );
CUDA_SAFE_CALL( cudaFree(d_u3) );
CUDA_SAFE_CALL( cudaFree(d_f1) );
CUDA_SAFE_CALL( cudaFree(d_f2) );
CUDA_SAFE_CALL( cudaFree(d_f3) );
CUDA_SAFE_CALL( cudaFree(d_J) );
Output:
dlGPUEvaluate_f_MI (malloc7): nFree= 558338048 nTotal=804978688
Cuda error: unknown error
Cuda Error in dlGPUEvaluate_f_MI()!: cudaMalloc 7: nDataSizeBytes= 28311552
dlGPUEvaluate_f_MI (malloc7): nFree= 530026496 nTotal=804978688
Cuda error: invalid device pointer
Cuda Error in dlGPUEvaluate_f_MI()!: cudaMemcpy: device-to-host