Memory Fragmentation: 3d memory not completely freed

Hello everyone,

I am relatively new to cuda, and actually facing a very strange problem:
I made a class function to process images which allocates quite a bit of memory
on the device. If I create the class object and test the function repeatedly,
the device wont provide any more memory:
Available: memory 2GB (GTX 750)
each image is a bit more of 10 MB big.
I run the test with 160 images (1,6 GB) => success.
if I call the function twice 80 images then 160 images, 80 Images succeed, 160 fails:

C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug>ieC_ds.exe
160 images…Available Memory: 1932804096 / 2147483648 processed within 4734 ms

C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug>ieC_ds.exe
80 images…Available Memory: 1916719104 / 2147483648 processed within 2375 ms
160 images…Available Memory: 1916719104 / 2147483648 Cannot allocate 3d Cube
Test aborted

Of course I carefully freed the device at the end of the member function,
the prints use cudaMemGetInfo(&free, &total) right before allocating the cube of memory.

Yet here’s what memcheck says:

C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug>cuda-memcheck --leak-check full ieC_ds.exe
========= CUDA-MEMCHECK
80 images…Available Memory: 1955438592 / 2147483648 processed within 2598 ms
160 images…Available Memory: 1955438592 / 2147483648 Cannot allocate 3d Cube
Test aborded
========= Program hit cudaErrorMemoryAllocation (error 2) due to “out of memory” on CUDA API call to cudaMalloc3D.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuProfilerStop + 0xa5379) [0xbecb9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\cudart32_65.dll (cudaMalloc3D + 0x11d) [0x167cd]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (iecEngine::QuotImg + 0x222) [0x2c072]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (main + 0x1a0) [0x7da0]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (__tmainCRTStartup + 0x199) [0x29af9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (mainCRTStartup + 0xd) [0x29c3d]
========= Host Frame:C:\Windows\syswow64\kernel32.dll (BaseThreadInitThunk + 0x12) [0x1338a]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x63) [0x39f72]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x36) [0x39f45]

========= Program hit cudaErrorInvalidDevicePointer (error 17) due to “invalid device pointer” on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuProfilerStop + 0xa5379) [0xbecb9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\cudart32_65.dll (cudaFree + 0xe2) [0x15dc2]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (iecEngine::QuotImg + 0x24b) [0x2c09b]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (main + 0x1a0) [0x7da0]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (__tmainCRTStartup + 0x199) [0x29af9]
========= Host Frame:C:\Users\nb1020\Documents\Visual Studio 2013\Projects\ieC_CUDA_devspace\Debug\ieC_ds.exe (mainCRTStartup + 0xd) [0x29c3d]
========= Host Frame:C:\Windows\syswow64\kernel32.dll (BaseThreadInitThunk + 0x12) [0x1338a]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x63) [0x39f72]
========= Host Frame:C:\Windows\SysWOW64\ntdll.dll (RtlInitializeExceptionChain + 0x36) [0x39f45]

========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
========= ERROR SUMMARY: 2 errors

Now that the issue is clear, the code is inserted at the end.
You will notice, I disabled the kernel launch to minimize
error suspicion.

Thanks for any assistance.

for calling the function

#include "ieC.h"
#include <time.h>


#define numOfImages	160
#define step		80
#define startImage	1



int main()
{
	iecEngine * engine = new iecEngine;
	iecError err;
	std::vector<std::string> refImages;
	char fileName[100];
	clock_t time;
	for(int j = 1; j * step <= numOfImages; ++j)
	{

		for(int i = startImage; i < startImage + j * step; i++)
		{
			sprintf(fileName, "C:/testData/JB113_S113(3)_%04d.tif", i);
			refImages.push_back(fileName);
		}
		printf("%d  images...", j* step);
		time = clock();
		err = engine->QuotImg(&refImages, &refImages);
		time = clock() - time;
		if(err != iecError::success)
		{
			printf("Test aborded\n");
			engine->~iecEngine();
			return -1;
		}
		printf("processed within %d ms\n",
			(int)(time * 1000) / CLOCKS_PER_SEC);
	}
	engine->~iecEngine();
	return 0;
}

for the class implementation

#include <stdint.h>
#include <vector>
#include <string>
#include <time.h>
#include "tiffio.h"
#include "cuda_runtime.h"
#include "ieC_cuKernels.h"
#include "ieC.h"




cudaDeviceProp deviceProp;


// _________________________________________________________________________________________________

iecEngine::iecEngine()
{
	// Start by looking for cuda capable devices
	cudaError error;
	int deviceCount;
	error = cudaGetDeviceCount(&deviceCount);
	if(error != cudaSuccess)
	{
	// Cuda returned an unexpected answer
	}
	else
	{
		// TODO: add a program behavior for choosing the 
		// right CUDA Device
		if(deviceCount >= 1)
		{
		        error = cudaGetDeviceProperties(&deviceProp, 0);
			if(error != cudaSuccess)
			{
			}
			cudaSetDevice(0);
			cudaDeviceReset();
			cudaDeviceSynchronize();
			cudaThreadSynchronize();
		}
	}
}

// _________________________________________________________________________________________________

iecEngine::~iecEngine()
{
	// Free all memory.
}

// _________________________________________________________________________________________________

iecError iecEngine::QuotImg(std::vector<std::string> * refImages,
	std::vector<std::string> * sampleImages)
{
	TIFFSetWarningHandler(NULL);				// Disable TIFF Warnings
	TIFF * file = TIFFOpen(refImages->begin()->c_str(), "r");// Open first image
if(file==0)																								// Verify opened
	{
		// TODO: Manage file error
		printf("Cannot open first file\n");
		return iecError::fail;
	}
	uint32_t imageWidth;																						// Image width var in pixel
	uint32_t imageLength;																						// Image length var in pixel
	uint32_t imageSize;																							// Image size var in pixel
	TIFFGetField(file, TIFFTAG_IMAGEWIDTH, &imageWidth);						// Load image width
	TIFFGetField(file, TIFFTAG_IMAGELENGTH, &imageLength);					// Load image length
	imageSize = imageWidth * imageLength;														// Calculate image size
	TIFFClose(file);																								// close first file

	cudaError cuErr;																								// create a cude err container
	uint16_t * hostImagePtr;																				// Create ptr to loading image
	cuErr = cudaHostAlloc(&hostImagePtr,
												 imageSize * sizeof(uint16_t),
												 cudaHostAllocPortable);									// Alloc host RAM for image
	if(cuErr != cudaError::cudaSuccess)
	{
		printf("Cannot allocate memory on Host\n");
		return iecError::fail;
	}

	cudaStream_t cuStream;
	cuErr = cudaStreamCreate(&cuStream);														// Create a cuda stream
	if(cuErr != cudaError::cudaSuccess)
	{

		printf("Cannot create Stream to GPU\n");
		return iecError::fail;
	}
	size_t free, total;
	cudaMemGetInfo(&free, &total);
	printf("Available Memory: %u / %u\t", free, total);
	cudaExtent cubeVolume =																					// Create extent to init a cube
		make_cudaExtent(sizeof(uint16) * imageWidth,
		imageLength, refImages->size());
	cudaPitchedPtr devCubePtr;																			// Create a ptr for the cube.
	cuErr = cudaMalloc3D(&devCubePtr, cubeVolume);									// Alocate cube.
	if(cuErr != cudaError::cudaSuccess)
	{
		printf("Cannot allocate 3d Cube\n");
		cudaFree(hostImagePtr);
		return iecError::fail;
	}
	// Proceed to load the refImages.

	for(uint16_t i = 0; i < refImages->size(); i++)									// For all images
	{
		TIFF * file = TIFFOpen(refImages->at(i).c_str(), "r");				// Open Image
		if(file == 0)																									// Verify opened
		{
			printf("Cannot open %s\n", refImages->at(i).c_str());
			cudaFree(devCubePtr.ptr);
			cudaFreeHost(hostImagePtr);
			cudaStreamDestroy(cuStream);
			return iecError::fail;
		}
		// Verify Attributes
		uint32_t  imgW;																								// Image width in pixel var
		uint32_t  imgL;																								// Image length in pixel var
		TIFFGetField(file, TIFFTAG_IMAGEWIDTH, &imgW);								// Load image width
		TIFFGetField(file, TIFFTAG_IMAGELENGTH, &imgL);								// Load image length
		if(imgW != imageWidth || imgL != imageLength)									// Verify attributes
		{
			// TODO: handle images difference
			// WARINING: need to free the memory before exiting the function
			printf("%s has a different resolution %d vs %d and %d vs %d\n",
			      refImages->at(i).c_str(), imgW, imageWidth, imgL, imageLength);
			cudaFree(devCubePtr.ptr);
			cudaFreeHost(hostImagePtr);
			cudaStreamDestroy(cuStream);
			return iecError::fail;
		}
		// This statement is very important, it synchronize the stream
		// from previous operations so the buffer does not get overwritten.
		cuErr = cudaStreamSynchronize(cuStream);
		if(cuErr != cudaError::cudaSuccess)
		{
			printf("Cannot sync stream\n");
			cudaFree(devCubePtr.ptr);
			cudaFreeHost(hostImagePtr);
			cudaStreamDestroy(cuStream);
			return iecError::fail;
		}

		uint16_t * destPtr;
		for(uint32_t j = 0; j < imageLength; ++j)
		{
			// TODO: enhance image I/O...
			TIFFReadScanline(file, hostImagePtr + j * imageWidth, j);		// Load a row of pixels to cuRAM
			destPtr = (uint16_t *)((uint8_t *)devCubePtr.ptr
			      + j * devCubePtr.pitch
		              + i * devCubePtr.pitch * devCubePtr.ysize);
			cudaMemcpyAsync(destPtr, hostImagePtr + j * imageWidth,
			       imageWidth * sizeof(uint16_t),
			       cudaMemcpyHostToDevice, cuStream);
		}
		TIFFClose(file);																							// Close image file

	}

	dim3 threads(32, 16);
	dim3 blocks(imageWidth / 32, imageLength / 16);
	uint16_t slice = (refImages->size() & 1) ?
		refImages->size() / 2 : (refImages->size() / 2) - 1;
	/*
	cuKernels::MedianQS << <blocks, threads, 0, cuStream >> >
		(devCubePtr, refImages->size(),
		slice,
		(devCubePtr.pitch * devCubePtr.ysize) / sizeof(uint16_t));*/

	uint16_t * destPtr;
	destPtr = (uint16_t *)((uint8_t *)devCubePtr.ptr
	    + slice * devCubePtr.pitch * devCubePtr.ysize);
	cudaMemcpy2DAsync(hostImagePtr,
                mageWidth * sizeof(uint16_t),
	        destPtr,
		devCubePtr.pitch,
		imageWidth * sizeof(uint16_t),
		imageLength,
		cudaMemcpyDeviceToHost, cuStream);

	cudaStreamSynchronize(cuStream);
	cudaStreamDestroy(cuStream);
	TIFF * out =
		TIFFOpen("C:\testData\output.tif", "w");				// Open Image
	TIFFSetField(out, TIFFTAG_IMAGEWIDTH, imageWidth);
	TIFFSetField(out, TIFFTAG_IMAGELENGTH, imageLength);
	TIFFSetField(out, TIFFTAG_SAMPLESPERPIXEL, 1);   // set number of channels per pixel
	TIFFSetField(out, TIFFTAG_BITSPERSAMPLE, 16);    // set the size of the channels
	TIFFSetField(out, TIFFTAG_SUBFILETYPE, 0);
	//   Some other essential fields to set that you do not have to understand for now.
	TIFFSetField(out, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);
	TIFFSetField(out, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_MINISBLACK);
	TIFFSetField(out, TIFFTAG_RESOLUTIONUNIT, RESUNIT_NONE);
	for(uint32_t j = 0; j < imageLength; ++j)
	{
		TIFFWriteScanline(out, hostImagePtr + j * imageWidth, j);
	}
	TIFFClose(out);

	cuErr = cudaFree(devCubePtr.ptr);
	if(cuErr != cudaError::cudaSuccess)
	{
		printf("Cannot free cube\n");
		cudaFree(devCubePtr.ptr);
		cudaFreeHost(hostImagePtr);
		cudaStreamDestroy(cuStream);
		return iecError::fail;
	}
	cuErr = cudaFreeHost(hostImagePtr);
	if(cuErr != cudaError::cudaSuccess)
	{
		printf("Cannot free host\n");
		cudaFree(devCubePtr.ptr);
		cudaFreeHost(hostImagePtr);
		cudaStreamDestroy(cuStream);
		return iecError::fail;
	}
	cudaDeviceSynchronize();
	cudaThreadSynchronize();
	return iecError::success;
}

Hello,

the thread is cancelled =>

The error was lying within the caller function,
which did not reset the vector, causing it to multiply in value…

I’m sorry for the bother.
Thanks for viewing the topic

Nessim