Multithread CPUs with Parallel OpenCL Kernels reports CL_OUT_OF_RESOURCES

I am running 24 CPU threads to call the same OpenCL Kernel in parallel, which adds two arrays A and B each one with 12800 long ints which means 4 bytes ~ 51Kb each. So the total (if I am correct) kernel memory needs is about 51Kb X 3 = 153Kb for each kernel execution (array A, B, and C=result).

For 24 CPU Threads, the total video memory needs are approaching ~3.68MB which is very near to my Video Card installed memory 4MB (GeForce GTX 960m). Increasing the MAX_THREADS sometimes (not every time) I get CL_OUT_OF_RESOURCES (-5) clenqueuereadbuffer.

If my code is not buggy, the main question here is if there any way to prevent the CL_OUT_OF_RESOURCES by stalling kernel execution until some video card memory is freed.

Device Kernel

__kernel void vector_add(__global const long *A, __global const long *B, __global long *C) {
 
    // Get the index of the current element to be processed
    long i = get_global_id(0);
	
    C[i] = A[i] + B[i];
}

and the host program


#include <windows.h>
#include <tchar.h>
#include <strsafe.h>
#include <CL\cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
//#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#define MAX_THREADS 24
#define BUF_SIZE 255



DWORD WINAPI MyThreadFunction(LPVOID lpParam);
void ErrorHandler(LPCTSTR lpszFunction);
BOOL DisplayMessage(LPCTSTR text, DWORD Param1, DWORD Param2);

// Sample custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
typedef struct TData {
	int		nThreadNum;
	DWORD 	dwThreadID;
} _TDATA, *_pTDATA;

HANDLE  hThreadArray[MAX_THREADS];


#define MAX_SOURCE_SIZE (0x100000)
#define UseOpenCL 1

const long LIST_SIZE = 12800;

_pTDATA pDataArray[MAX_THREADS];

long *A = (long*)malloc(sizeof(long)*LIST_SIZE);
long *B = (long*)malloc(sizeof(long)*LIST_SIZE);
long *C = (long*)malloc(sizeof(long)*LIST_SIZE);
long *DestArrayA[MAX_THREADS];
long *DestArrayB[MAX_THREADS];
long *AA = (long*)malloc(MAX_THREADS * LIST_SIZE * sizeof(long));
long *BB = (long*)malloc(MAX_THREADS * LIST_SIZE * sizeof(long));

cl_mem KernelAreaBufferA[MAX_THREADS], KernelAreaBufferB[MAX_THREADS];  ///< Global Buffer 
cl_command_queue command_queue;
cl_queue_properties qprop[] = { CL_QUEUE_PROPERTIES,  CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, 0 };
cl_context context;
cl_device_id device_id = NULL;
cl_program program;
cl_kernel kernel;
cl_mem c_mem_obj[MAX_THREADS];

BYTE GetThreadIDNum(DWORD threadid) {

	for (int i = 0; i < MAX_THREADS; i++) {

		if (pDataArray[i]->dwThreadID == threadid)

			return pDataArray[i]->nThreadNum;

	}

	return 0;
}


void ShowResults() {

	// Display the result to the screen/*
	printf("\n--------------------------------------\n");
	for (int i = 0; i < LIST_SIZE; i++)
		printf("%lu + %lu = %lu\n", A[i], B[i], C[i]);

}

inline BOOL checkError(cl_int err, const char * name)
{
	BOOL success = true;
	if (err != CL_SUCCESS) {
		fprintf(stderr, "ERROR: %s ( %d )\n", name, err);
		success = false;
	}
	return success;
}

int InitializeOpenCL_Kernel() {

	printf("InitializeOpenCL_Kernel()");

	cl_int                  errNum; ///< Indicate error 


	for (int i = 0; i < LIST_SIZE; i++) {
		A[i] = i;
		B[i] = LIST_SIZE - i;
		//*(AA + ThreadNum * MAX_THREADS + i) = i;
		//*(BB + ThreadNum * MAX_THREADS + i) = LIST_SIZE - i;
	}

	if (UseOpenCL == 0)
		return 0;

	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;

	fp = fopen("vector_add_kernel.cl", "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.\n");
		exit(1);
	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);





	// Get platform and device information
	cl_platform_id platform_id = NULL;

	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	errNum = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); //Get information about the platform and the devices available on the computer (line 42)

	errNum = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); //Select devices to use in execution (line 43)

	size_t paramValueSize;         ///< Info Device
	errNum = clGetDeviceInfo(device_id, CL_DEVICE_NAME, 0, NULL, &paramValueSize);
	char* info = (char *)malloc(sizeof(char) * paramValueSize);  // String to display vendor name info
	errNum = clGetDeviceInfo(device_id, CL_DEVICE_NAME, paramValueSize, info, NULL);


	//printf("\n.:OpenCL Benchmark is started\n\n");
	printf("Using GPU device: %s\n", info);


	// Create an OpenCL context
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &errNum); //Create an OpenCL context (line 47)

	// Create a command queue
	//cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &errNum); //Create a command queue (line 50)		

	// Create a program from the kernel source
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &errNum); //Create program object (line 67)

	// Build the program
	errNum = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //Load the kernel source code (line 24-35) and compile it (line 71) (online exeuction) or load the precompiled binary OpenCL program (offline execution)
	checkError(errNum, "clEnqueueUnmapMemObject error");

	// Create the OpenCL kernel
	kernel = clCreateKernel(program, "vector_add", &errNum); //Create kernel object (line 74)
	

	return (int)errNum;

}

int CleanUpOpenCL_Kernel() {

	printf("CleanUpOpenCL_Kernel()");
	cl_int                  errNum; ///< Indicate error 
	errNum = clReleaseCommandQueue(command_queue);
	errNum = clReleaseKernel(kernel);
	errNum = clReleaseProgram(program);
	errNum = clReleaseContext(context);

	return errNum;
}



int OpenCL_Kernel(void) {
	double dResult = 0;
	
	DWORD ThreadID = GetCurrentThreadId();
	int ThreadNum = GetThreadIDNum(ThreadID);
	

	cl_int                  errNum; ///< Indicate error 


	// Create the two input vectors
	long i;
	
	
	clock_t lBefore = clock();



	if (UseOpenCL == 1) {


		command_queue= clCreateCommandQueueWithProperties(context, device_id, qprop, &errNum); //Create a command queue (line 50)


		//********* ARRAY A

		//cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum);
		//errNum = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(long), A, 0, NULL, NULL);
		KernelAreaBufferA[ThreadNum] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * LIST_SIZE, NULL, &errNum);
		checkError(errNum, "clCreateBuffer error");
		DestArrayA[ThreadNum] = (long*)clEnqueueMapBuffer(command_queue, KernelAreaBufferA[ThreadNum], CL_TRUE, CL_MAP_WRITE, 0, sizeof(long) * LIST_SIZE, 0, NULL, NULL, &errNum);
		checkError(errNum, "clEnqueueMapBuffer error");
		for (i = 0; i < LIST_SIZE; i++) {
			DestArrayA[ThreadNum][i] = A[i]; //*(AA + ThreadNum * MAX_THREADS + i);
		}
		errNum = clEnqueueUnmapMemObject(command_queue, KernelAreaBufferA[ThreadNum], DestArrayA[ThreadNum], 0, NULL, NULL);
		checkError(errNum, "clEnqueueUnmapMemObject error");
		//********* ARRAY B

		//cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum);
		//errNum = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(long), B, 0, NULL, NULL); //Transfer data (list A and B) to memory buffers on the device (line 61-64)
		KernelAreaBufferB[ThreadNum] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * LIST_SIZE, NULL, &errNum);
		checkError(errNum, "clCreateBuffer error");
		DestArrayB[ThreadNum] = (long*)clEnqueueMapBuffer(command_queue, KernelAreaBufferB[ThreadNum], CL_TRUE, CL_MAP_WRITE, 0, sizeof(long) * LIST_SIZE, 0, NULL, NULL, &errNum);
		checkError(errNum, "clEnqueueMapBuffer error");
		for (i = 0; i < LIST_SIZE; i++) {
			DestArrayB[ThreadNum][i] = B[i]; //*(BB + ThreadNum * MAX_THREADS + i);
		}
		errNum = clEnqueueUnmapMemObject(command_queue, KernelAreaBufferB[ThreadNum], DestArrayB[ThreadNum], 0, NULL, NULL);
		checkError(errNum, "clEnqueueUnmapMemObject error");

		//********* ARRAY C

		c_mem_obj[ThreadNum] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum); 	//Create memory buffer objects(line 53-58)
		checkError(errNum, "clCreateBuffer error");


		// Set the arguments of the kernel
		errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &KernelAreaBufferA[ThreadNum] /* &a_mem_obj */);//Set kernel arguments (line 77-79)
		checkError(errNum, "clSetKernelArg error");
		errNum = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &KernelAreaBufferB[ThreadNum] /* &b_mem_obj */);
		checkError(errNum, "clSetKernelArg error");
		errNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj[ThreadNum]);
		checkError(errNum, "clSetKernelArg error");
		// Execute the OpenCL kernel on the list
		size_t global_item_size = LIST_SIZE; // Process the entire lists
		size_t local_item_size = 64; // Divide work items into groups of 64
		
		DWORD ThreadID = GetCurrentThreadId();
		
		DisplayMessage(TEXT("clEnqueueNDRangeKernel initiated with Id=[%ld] & Num=[%ld]\n"), ThreadID, ThreadNum);
		errNum = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); //Execute the kernel (line 84)
		checkError(errNum, "clEnqueueNDRangeKernel error");
		//errNum = clEnqueueTask(command_queue, kernel, 0, NULL, NULL); //Execute the kernel (line 84)

	// Read the memory buffer C on the device to the local variable C

		errNum = clEnqueueReadBuffer(command_queue, c_mem_obj[ThreadNum], CL_TRUE, 0, LIST_SIZE * sizeof(long), C, 0, NULL, NULL); //Read memory objects.In this case we read the list C from the compute device(line 88 - 90)
		checkError(errNum, "clEnqueueReadBuffer error");
		dResult = (double)(clock() - lBefore) / CLOCKS_PER_SEC;


		// Clean up
		errNum = clFlush(command_queue);
		errNum = clFinish(command_queue);

		//errNum = clReleaseMemObject(a_mem_obj);
		errNum = clReleaseMemObject(KernelAreaBufferA[ThreadNum]);
		//errNum = clReleaseMemObject(b_mem_obj);
		errNum = clReleaseMemObject(KernelAreaBufferB[ThreadNum]);
		errNum = clReleaseMemObject(c_mem_obj[ThreadNum]);
		


	}
	else {

		//printf("\n.:CPU Benchmark is started\n\n");
		clock_t lBefore = clock();

		//for (i = 0; i < LIST_SIZE; i++) {

		//	C[i] = A[i] + B[i];
		//}

		dResult = (double)(clock() - lBefore) / CLOCKS_PER_SEC;
	}



	//printf("\n.:Benchmark ended. Total Time %12.3f sec\n\n", dResult);





	
	printf("Thread ID=%lu completed.\n", ThreadID);


	return 0;
}


//------------------------------------------
// A function to Display the message
// indicating in which tread we are
//------------------------------------------
BOOL DisplayMessage(LPCTSTR text, DWORD Param1, DWORD Param2)
{
	TCHAR msgBuf[BUF_SIZE];

	size_t cchStringSize;
	DWORD dwChars;
	HANDLE hStdout;
	// Make sure there is a console to receive output results. 

	hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
	if (hStdout == INVALID_HANDLE_VALUE)
		return TRUE;

	// Print message using thread-safe functions.
	StringCchPrintf(msgBuf, BUF_SIZE, text, Param1, Param2);
	StringCchLength(msgBuf, BUF_SIZE, &cchStringSize);
	WriteConsole(hStdout, msgBuf, cchStringSize, &dwChars, NULL);

	return FALSE;
}


void ErrorHandler(LPCTSTR lpszFunction)
{
	// Retrieve the system error message for the last-error code.

	LPVOID lpMsgBuf;
	LPVOID lpDisplayBuf;
	DWORD dw = GetLastError();

	FormatMessage(
		FORMAT_MESSAGE_ALLOCATE_BUFFER |
		FORMAT_MESSAGE_FROM_SYSTEM |
		FORMAT_MESSAGE_IGNORE_INSERTS,
		NULL,
		dw,
		MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
		(LPTSTR)&lpMsgBuf,
		0, NULL);

	// Display the error message.

	lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
		(lstrlen((LPCTSTR)lpMsgBuf) + lstrlen((LPCTSTR)lpszFunction) + 40) * sizeof(TCHAR));
	StringCchPrintf((LPTSTR)lpDisplayBuf,
		LocalSize(lpDisplayBuf) / sizeof(TCHAR),
		TEXT("%s failed with error %d: %s"),
		lpszFunction, dw, lpMsgBuf);
	MessageBox(NULL, (LPCTSTR)lpDisplayBuf, TEXT("Error"), MB_OK);

	// Free error-handling buffer allocations.

	LocalFree(lpMsgBuf);
	LocalFree(lpDisplayBuf);
}

int _tmain()
{


	DWORD threadID[MAX_THREADS], dwEvent;
	


	InitializeOpenCL_Kernel();

	// Create MAX_THREADS worker threads.

	for (int i = 0; i < MAX_THREADS; i++)
	{


		// Allocate memory for thread data.
		pDataArray[i] = (_pTDATA)HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(_TDATA));	

		if (pDataArray[i] == NULL)
		{
			// If the array allocation fails, the system is out of memory
			// so there is no point in trying to print an error message.
			// Just terminate execution.
			ExitProcess(2);
		}

		// Generate unique data for each thread to work with.

		pDataArray[i]->nThreadNum = i;


		// Create the thread to begin execution on its own.

		hThreadArray[i] = CreateThread(
			NULL,                   // default security attributes
			0,                      // use default stack size  
			MyThreadFunction,       // thread function name
			(LPVOID)pDataArray[i],			// argument to thread function 
			0,                      // use default creation flags 
			&threadID[i]);   // returns the thread identifier 

		pDataArray[i]->dwThreadID = threadID[i];
		printf("*Thread %d with Id=[%ld] created.\n", pDataArray[i]->nThreadNum, pDataArray[i]->dwThreadID);
		// Check the return value for success.
		// If CreateThread fails, terminate execution. 
		// This will automatically clean up threads and memory. 

		if (hThreadArray[i] == NULL)
		{
			ErrorHandler(TEXT("CreateThread"));
			ExitProcess(3);
		}
	} // End of main thread creation loop.

	// Wait until all threads have terminated.	
	dwEvent =  WaitForMultipleObjects(MAX_THREADS, hThreadArray, TRUE, INFINITE);

	// Close all thread handles and free memory allocations.
	for (int i = 0; i < MAX_THREADS; i++)
	{
		CloseHandle(hThreadArray[i]);
		printf("Handle %ld with  ID=%ld, just closed.\n", hThreadArray[i], threadID[i]);
		if (pDataArray[i] != NULL)
		{
			HeapFree(GetProcessHeap(), 0, pDataArray[i]);
			pDataArray[i] = NULL;    // Ensure address is not reused.
		}
	}

	//ShowResults();

	CleanUpOpenCL_Kernel();

	free(A);
	free(B);
	free(C);

	free(AA);
	free(BB);

	return 0;
}


DWORD WINAPI MyThreadFunction(LPVOID lpParam) {

	_pTDATA pDataArray;

	// Cast the parameter to the correct data type.
	// The pointer is known to be valid because 
	// it was checked for NULL before the thread was created.
	// make a copy of the parameter for convenience
	//pDataArray2 = *static_cast<_pTDATA*>(lpParam)
	
	pDataArray = (_pTDATA)lpParam;

	// Print the parameter values using thread-safe functions.	
	DWORD ThreadID = GetCurrentThreadId();
	DisplayMessage(TEXT("-Thread %ld with Id=[%ld] initiated\n"), pDataArray->nThreadNum, ThreadID);
	OpenCL_Kernel();

	return 0;
}