Hello everyone,
I am relatively new to cuda, and actually facing a very strange problem:
I made a class function to process images which allocates quite a bit of memory
on the device. If I create the class object and test the function repeatedly,
the device wont provide any more memory:
Available: memory 2GB (GTX 750)
each image is a bit more of 10 MB big.
I run the test with 160 images (1,6 GB) => success.
if I call the function twice 80 images then 160 images, 80 Images succeed, 160 fails:
C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug>ieC_ds.exe
160 images…Available Memory: 1932804096 / 2147483648 processed within 4734 ms
C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug>ieC_ds.exe
80 images…Available Memory: 1916719104 / 2147483648 processed within 2375 ms
160 images…Available Memory: 1916719104 / 2147483648 Cannot allocate 3d Cube
Test aborted
Of course I carefully freed the device at the end of the member function,
the prints use cudaMemGetInfo(&free, &total) right before allocating the cube of memory.
Yet here’s what memcheck says:
C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug>cuda-memcheck --leak-check full ieC_ds.exe
========= CUDA-MEMCHECK
80 images…Available Memory: 1955438592 / 2147483648 processed within 2598 ms
160 images…Available Memory: 1955438592 / 2147483648 Cannot allocate 3d Cube
Test aborded
========= Program hit cudaErrorMemoryAllocation (error 2) due to “out of memory” on CUDA API call to cudaMalloc3D.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuProfilerStop + 0xa5379) [0xbecb9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\cudart32_65.dll (cudaMalloc3D + 0x11d) [0x167cd]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (iecEngine::QuotImg + 0x222) [0x2c072]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (main + 0x1a0) [0x7da0]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (__tmainCRTStartup + 0x199) [0x29af9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (mainCRTStartup + 0xd) [0x29c3d]
========= Host Frame:C:\Windows\syswow64\kernel32.dll (BaseThreadInitThunk + 0x12) [0x1338a]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x63) [0x39f72]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x36) [0x39f45]
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to “invalid device pointer” on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuProfilerStop + 0xa5379) [0xbecb9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\cudart32_65.dll (cudaFree + 0xe2) [0x15dc2]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (iecEngine::QuotImg + 0x24b) [0x2c09b]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (main + 0x1a0) [0x7da0]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (__tmainCRTStartup + 0x199) [0x29af9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (mainCRTStartup + 0xd) [0x29c3d]
========= Host Frame:C:\Windows\syswow64\kernel32.dll (BaseThreadInitThunk + 0x12) [0x1338a]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x63) [0x39f72]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x36) [0x39f45]
========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
========= ERROR SUMMARY: 2 errors
Now that the issue is clear, the code is inserted at the end.
You will notice, I disabled the kernel launch to minimize
error suspicion.
Thanks for any assistance.
for calling the function
#include "ieC.h"
#include <time.h>
#define numOfImages 160
#define step 80
#define startImage 1
int main()
{
iecEngine * engine = new iecEngine;
iecError err;
std::vector<std::string> refImages;
char fileName[100];
clock_t time;
for(int j = 1; j * step <= numOfImages; ++j)
{
for(int i = startImage; i < startImage + j * step; i++)
{
sprintf(fileName, "C:/testData/JB113_S113(3)_%04d.tif", i);
refImages.push_back(fileName);
}
printf("%d images...", j* step);
time = clock();
err = engine->QuotImg(&refImages, &refImages);
time = clock() - time;
if(err != iecError::success)
{
printf("Test aborded\n");
engine->~iecEngine();
return -1;
}
printf("processed within %d ms\n",
(int)(time * 1000) / CLOCKS_PER_SEC);
}
engine->~iecEngine();
return 0;
}
for the class implementation
#include <stdint.h>
#include <vector>
#include <string>
#include <time.h>
#include "tiffio.h"
#include "cuda_runtime.h"
#include "ieC_cuKernels.h"
#include "ieC.h"
cudaDeviceProp deviceProp;
// _________________________________________________________________________________________________
iecEngine::iecEngine()
{
// Start by looking for cuda capable devices
cudaError error;
int deviceCount;
error = cudaGetDeviceCount(&deviceCount);
if(error != cudaSuccess)
{
// Cuda returned an unexpected answer
}
else
{
// TODO: add a program behavior for choosing the
// right CUDA Device
if(deviceCount >= 1)
{
error = cudaGetDeviceProperties(&deviceProp, 0);
if(error != cudaSuccess)
{
}
cudaSetDevice(0);
cudaDeviceReset();
cudaDeviceSynchronize();
cudaThreadSynchronize();
}
}
}
// _________________________________________________________________________________________________
iecEngine::~iecEngine()
{
// Free all memory.
}
// _________________________________________________________________________________________________
iecError iecEngine::QuotImg(std::vector<std::string> * refImages,
std::vector<std::string> * sampleImages)
{
TIFFSetWarningHandler(NULL); // Disable TIFF Warnings
TIFF * file = TIFFOpen(refImages->begin()->c_str(), "r");// Open first image
if(file==0) // Verify opened
{
// TODO: Manage file error
printf("Cannot open first file\n");
return iecError::fail;
}
uint32_t imageWidth; // Image width var in pixel
uint32_t imageLength; // Image length var in pixel
uint32_t imageSize; // Image size var in pixel
TIFFGetField(file, TIFFTAG_IMAGEWIDTH, &imageWidth); // Load image width
TIFFGetField(file, TIFFTAG_IMAGELENGTH, &imageLength); // Load image length
imageSize = imageWidth * imageLength; // Calculate image size
TIFFClose(file); // close first file
cudaError cuErr; // create a cude err container
uint16_t * hostImagePtr; // Create ptr to loading image
cuErr = cudaHostAlloc(&hostImagePtr,
imageSize * sizeof(uint16_t),
cudaHostAllocPortable); // Alloc host RAM for image
if(cuErr != cudaError::cudaSuccess)
{
printf("Cannot allocate memory on Host\n");
return iecError::fail;
}
cudaStream_t cuStream;
cuErr = cudaStreamCreate(&cuStream); // Create a cuda stream
if(cuErr != cudaError::cudaSuccess)
{
printf("Cannot create Stream to GPU\n");
return iecError::fail;
}
size_t free, total;
cudaMemGetInfo(&free, &total);
printf("Available Memory: %u / %u\t", free, total);
cudaExtent cubeVolume = // Create extent to init a cube
make_cudaExtent(sizeof(uint16) * imageWidth,
imageLength, refImages->size());
cudaPitchedPtr devCubePtr; // Create a ptr for the cube.
cuErr = cudaMalloc3D(&devCubePtr, cubeVolume); // Alocate cube.
if(cuErr != cudaError::cudaSuccess)
{
printf("Cannot allocate 3d Cube\n");
cudaFree(hostImagePtr);
return iecError::fail;
}
// Proceed to load the refImages.
for(uint16_t i = 0; i < refImages->size(); i++) // For all images
{
TIFF * file = TIFFOpen(refImages->at(i).c_str(), "r"); // Open Image
if(file == 0) // Verify opened
{
printf("Cannot open %s\n", refImages->at(i).c_str());
cudaFree(devCubePtr.ptr);
cudaFreeHost(hostImagePtr);
cudaStreamDestroy(cuStream);
return iecError::fail;
}
// Verify Attributes
uint32_t imgW; // Image width in pixel var
uint32_t imgL; // Image length in pixel var
TIFFGetField(file, TIFFTAG_IMAGEWIDTH, &imgW); // Load image width
TIFFGetField(file, TIFFTAG_IMAGELENGTH, &imgL); // Load image length
if(imgW != imageWidth || imgL != imageLength) // Verify attributes
{
// TODO: handle images difference
// WARINING: need to free the memory before exiting the function
printf("%s has a different resolution %d vs %d and %d vs %d\n",
refImages->at(i).c_str(), imgW, imageWidth, imgL, imageLength);
cudaFree(devCubePtr.ptr);
cudaFreeHost(hostImagePtr);
cudaStreamDestroy(cuStream);
return iecError::fail;
}
// This statement is very important, it synchronize the stream
// from previous operations so the buffer does not get overwritten.
cuErr = cudaStreamSynchronize(cuStream);
if(cuErr != cudaError::cudaSuccess)
{
printf("Cannot sync stream\n");
cudaFree(devCubePtr.ptr);
cudaFreeHost(hostImagePtr);
cudaStreamDestroy(cuStream);
return iecError::fail;
}
uint16_t * destPtr;
for(uint32_t j = 0; j < imageLength; ++j)
{
// TODO: enhance image I/O...
TIFFReadScanline(file, hostImagePtr + j * imageWidth, j); // Load a row of pixels to cuRAM
destPtr = (uint16_t *)((uint8_t *)devCubePtr.ptr
+ j * devCubePtr.pitch
+ i * devCubePtr.pitch * devCubePtr.ysize);
cudaMemcpyAsync(destPtr, hostImagePtr + j * imageWidth,
imageWidth * sizeof(uint16_t),
cudaMemcpyHostToDevice, cuStream);
}
TIFFClose(file); // Close image file
}
dim3 threads(32, 16);
dim3 blocks(imageWidth / 32, imageLength / 16);
uint16_t slice = (refImages->size() & 1) ?
refImages->size() / 2 : (refImages->size() / 2) - 1;
/*
cuKernels::MedianQS << <blocks, threads, 0, cuStream >> >
(devCubePtr, refImages->size(),
slice,
(devCubePtr.pitch * devCubePtr.ysize) / sizeof(uint16_t));*/
uint16_t * destPtr;
destPtr = (uint16_t *)((uint8_t *)devCubePtr.ptr
+ slice * devCubePtr.pitch * devCubePtr.ysize);
cudaMemcpy2DAsync(hostImagePtr,
mageWidth * sizeof(uint16_t),
destPtr,
devCubePtr.pitch,
imageWidth * sizeof(uint16_t),
imageLength,
cudaMemcpyDeviceToHost, cuStream);
cudaStreamSynchronize(cuStream);
cudaStreamDestroy(cuStream);
TIFF * out =
TIFFOpen("C:\testData\output.tif", "w"); // Open Image
TIFFSetField(out, TIFFTAG_IMAGEWIDTH, imageWidth);
TIFFSetField(out, TIFFTAG_IMAGELENGTH, imageLength);
TIFFSetField(out, TIFFTAG_SAMPLESPERPIXEL, 1); // set number of channels per pixel
TIFFSetField(out, TIFFTAG_BITSPERSAMPLE, 16); // set the size of the channels
TIFFSetField(out, TIFFTAG_SUBFILETYPE, 0);
// Some other essential fields to set that you do not have to understand for now.
TIFFSetField(out, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);
TIFFSetField(out, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_MINISBLACK);
TIFFSetField(out, TIFFTAG_RESOLUTIONUNIT, RESUNIT_NONE);
for(uint32_t j = 0; j < imageLength; ++j)
{
TIFFWriteScanline(out, hostImagePtr + j * imageWidth, j);
}
TIFFClose(out);
cuErr = cudaFree(devCubePtr.ptr);
if(cuErr != cudaError::cudaSuccess)
{
printf("Cannot free cube\n");
cudaFree(devCubePtr.ptr);
cudaFreeHost(hostImagePtr);
cudaStreamDestroy(cuStream);
return iecError::fail;
}
cuErr = cudaFreeHost(hostImagePtr);
if(cuErr != cudaError::cudaSuccess)
{
printf("Cannot free host\n");
cudaFree(devCubePtr.ptr);
cudaFreeHost(hostImagePtr);
cudaStreamDestroy(cuStream);
return iecError::fail;
}
cudaDeviceSynchronize();
cudaThreadSynchronize();
return iecError::success;
}