Multithread CPUs with Parallel OpenCL Kernels reports CL_OUT_OF_RESOURCES

g_papaioannou · June 8, 2021, 8:25am

I am running 24 CPU threads to call the same OpenCL Kernel in parallel, which adds two arrays A and B each one with 12800 long ints which means 4 bytes ~ 51Kb each. So the total (if I am correct) kernel memory needs is about 51Kb X 3 = 153Kb for each kernel execution (array A, B, and C=result).

For 24 CPU Threads, the total video memory needs are approaching ~3.68MB which is very near to my Video Card installed memory 4MB (GeForce GTX 960m). Increasing the MAX_THREADS sometimes (not every time) I get CL_OUT_OF_RESOURCES (-5) clenqueuereadbuffer.

If my code is not buggy, the main question here is if there any way to prevent the CL_OUT_OF_RESOURCES by stalling kernel execution until some video card memory is freed.

Device Kernel

__kernel void vector_add(__global const long *A, __global const long *B, __global long *C) {
 
    // Get the index of the current element to be processed
    long i = get_global_id(0);
	
    C[i] = A[i] + B[i];
}

and the host program


#include <windows.h>
#include <tchar.h>
#include <strsafe.h>
#include <CL\cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
//#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#define MAX_THREADS 24
#define BUF_SIZE 255



DWORD WINAPI MyThreadFunction(LPVOID lpParam);
void ErrorHandler(LPCTSTR lpszFunction);
BOOL DisplayMessage(LPCTSTR text, DWORD Param1, DWORD Param2);

// Sample custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
typedef struct TData {
	int		nThreadNum;
	DWORD 	dwThreadID;
} _TDATA, *_pTDATA;

HANDLE  hThreadArray[MAX_THREADS];


#define MAX_SOURCE_SIZE (0x100000)
#define UseOpenCL 1

const long LIST_SIZE = 12800;

_pTDATA pDataArray[MAX_THREADS];

long *A = (long*)malloc(sizeof(long)*LIST_SIZE);
long *B = (long*)malloc(sizeof(long)*LIST_SIZE);
long *C = (long*)malloc(sizeof(long)*LIST_SIZE);
long *DestArrayA[MAX_THREADS];
long *DestArrayB[MAX_THREADS];
long *AA = (long*)malloc(MAX_THREADS * LIST_SIZE * sizeof(long));
long *BB = (long*)malloc(MAX_THREADS * LIST_SIZE * sizeof(long));

cl_mem KernelAreaBufferA[MAX_THREADS], KernelAreaBufferB[MAX_THREADS];  ///< Global Buffer 
cl_command_queue command_queue;
cl_queue_properties qprop[] = { CL_QUEUE_PROPERTIES,  CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, 0 };
cl_context context;
cl_device_id device_id = NULL;
cl_program program;
cl_kernel kernel;
cl_mem c_mem_obj[MAX_THREADS];

BYTE GetThreadIDNum(DWORD threadid) {

	for (int i = 0; i < MAX_THREADS; i++) {

		if (pDataArray[i]->dwThreadID == threadid)

			return pDataArray[i]->nThreadNum;

	}

	return 0;
}


void ShowResults() {

	// Display the result to the screen/*
	printf("\n--------------------------------------\n");
	for (int i = 0; i < LIST_SIZE; i++)
		printf("%lu + %lu = %lu\n", A[i], B[i], C[i]);

}

inline BOOL checkError(cl_int err, const char * name)
{
	BOOL success = true;
	if (err != CL_SUCCESS) {
		fprintf(stderr, "ERROR: %s ( %d )\n", name, err);
		success = false;
	}
	return success;
}

int InitializeOpenCL_Kernel() {

	printf("InitializeOpenCL_Kernel()");

	cl_int                  errNum; ///< Indicate error 


	for (int i = 0; i < LIST_SIZE; i++) {
		A[i] = i;
		B[i] = LIST_SIZE - i;
		//*(AA + ThreadNum * MAX_THREADS + i) = i;
		//*(BB + ThreadNum * MAX_THREADS + i) = LIST_SIZE - i;
	}

	if (UseOpenCL == 0)
		return 0;

	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;

	fp = fopen("vector_add_kernel.cl", "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.\n");
		exit(1);
	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);





	// Get platform and device information
	cl_platform_id platform_id = NULL;

	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	errNum = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); //Get information about the platform and the devices available on the computer (line 42)

	errNum = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); //Select devices to use in execution (line 43)

	size_t paramValueSize;         ///< Info Device
	errNum = clGetDeviceInfo(device_id, CL_DEVICE_NAME, 0, NULL, &paramValueSize);
	char* info = (char *)malloc(sizeof(char) * paramValueSize);  // String to display vendor name info
	errNum = clGetDeviceInfo(device_id, CL_DEVICE_NAME, paramValueSize, info, NULL);


	//printf("\n.:OpenCL Benchmark is started\n\n");
	printf("Using GPU device: %s\n", info);


	// Create an OpenCL context
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &errNum); //Create an OpenCL context (line 47)

	// Create a command queue
	//cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &errNum); //Create a command queue (line 50)		

	// Create a program from the kernel source
	cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &errNum); //Create program object (line 67)

	// Build the program
	errNum = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //Load the kernel source code (line 24-35) and compile it (line 71) (online exeuction) or load the precompiled binary OpenCL program (offline execution)
	checkError(errNum, "clEnqueueUnmapMemObject error");

	// Create the OpenCL kernel
	kernel = clCreateKernel(program, "vector_add", &errNum); //Create kernel object (line 74)
	

	return (int)errNum;

}

int CleanUpOpenCL_Kernel() {

	printf("CleanUpOpenCL_Kernel()");
	cl_int                  errNum; ///< Indicate error 
	errNum = clReleaseCommandQueue(command_queue);
	errNum = clReleaseKernel(kernel);
	errNum = clReleaseProgram(program);
	errNum = clReleaseContext(context);

	return errNum;
}



int OpenCL_Kernel(void) {
	double dResult = 0;
	
	DWORD ThreadID = GetCurrentThreadId();
	int ThreadNum = GetThreadIDNum(ThreadID);
	

	cl_int                  errNum; ///< Indicate error 


	// Create the two input vectors
	long i;
	
	
	clock_t lBefore = clock();



	if (UseOpenCL == 1) {


		command_queue= clCreateCommandQueueWithProperties(context, device_id, qprop, &errNum); //Create a command queue (line 50)


		//********* ARRAY A

		//cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum);
		//errNum = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(long), A, 0, NULL, NULL);
		KernelAreaBufferA[ThreadNum] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * LIST_SIZE, NULL, &errNum);
		checkError(errNum, "clCreateBuffer error");
		DestArrayA[ThreadNum] = (long*)clEnqueueMapBuffer(command_queue, KernelAreaBufferA[ThreadNum], CL_TRUE, CL_MAP_WRITE, 0, sizeof(long) * LIST_SIZE, 0, NULL, NULL, &errNum);
		checkError(errNum, "clEnqueueMapBuffer error");
		for (i = 0; i < LIST_SIZE; i++) {
			DestArrayA[ThreadNum][i] = A[i]; //*(AA + ThreadNum * MAX_THREADS + i);
		}
		errNum = clEnqueueUnmapMemObject(command_queue, KernelAreaBufferA[ThreadNum], DestArrayA[ThreadNum], 0, NULL, NULL);
		checkError(errNum, "clEnqueueUnmapMemObject error");
		//********* ARRAY B

		//cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum);
		//errNum = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(long), B, 0, NULL, NULL); //Transfer data (list A and B) to memory buffers on the device (line 61-64)
		KernelAreaBufferB[ThreadNum] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * LIST_SIZE, NULL, &errNum);
		checkError(errNum, "clCreateBuffer error");
		DestArrayB[ThreadNum] = (long*)clEnqueueMapBuffer(command_queue, KernelAreaBufferB[ThreadNum], CL_TRUE, CL_MAP_WRITE, 0, sizeof(long) * LIST_SIZE, 0, NULL, NULL, &errNum);
		checkError(errNum, "clEnqueueMapBuffer error");
		for (i = 0; i < LIST_SIZE; i++) {
			DestArrayB[ThreadNum][i] = B[i]; //*(BB + ThreadNum * MAX_THREADS + i);
		}
		errNum = clEnqueueUnmapMemObject(command_queue, KernelAreaBufferB[ThreadNum], DestArrayB[ThreadNum], 0, NULL, NULL);
		checkError(errNum, "clEnqueueUnmapMemObject error");

		//********* ARRAY C

		c_mem_obj[ThreadNum] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum); 	//Create memory buffer objects(line 53-58)
		checkError(errNum, "clCreateBuffer error");


		// Set the arguments of the kernel
		errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &KernelAreaBufferA[ThreadNum] /* &a_mem_obj */);//Set kernel arguments (line 77-79)
		checkError(errNum, "clSetKernelArg error");
		errNum = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &KernelAreaBufferB[ThreadNum] /* &b_mem_obj */);
		checkError(errNum, "clSetKernelArg error");
		errNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj[ThreadNum]);
		checkError(errNum, "clSetKernelArg error");
		// Execute the OpenCL kernel on the list
		size_t global_item_size = LIST_SIZE; // Process the entire lists
		size_t local_item_size = 64; // Divide work items into groups of 64
		
		DWORD ThreadID = GetCurrentThreadId();
		
		DisplayMessage(TEXT("clEnqueueNDRangeKernel initiated with Id=[%ld] & Num=[%ld]\n"), ThreadID, ThreadNum);
		errNum = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); //Execute the kernel (line 84)
		checkError(errNum, "clEnqueueNDRangeKernel error");
		//errNum = clEnqueueTask(command_queue, kernel, 0, NULL, NULL); //Execute the kernel (line 84)

	// Read the memory buffer C on the device to the local variable C

		errNum = clEnqueueReadBuffer(command_queue, c_mem_obj[ThreadNum], CL_TRUE, 0, LIST_SIZE * sizeof(long), C, 0, NULL, NULL); //Read memory objects.In this case we read the list C from the compute device(line 88 - 90)
		checkError(errNum, "clEnqueueReadBuffer error");
		dResult = (double)(clock() - lBefore) / CLOCKS_PER_SEC;


		// Clean up
		errNum = clFlush(command_queue);
		errNum = clFinish(command_queue);

		//errNum = clReleaseMemObject(a_mem_obj);
		errNum = clReleaseMemObject(KernelAreaBufferA[ThreadNum]);
		//errNum = clReleaseMemObject(b_mem_obj);
		errNum = clReleaseMemObject(KernelAreaBufferB[ThreadNum]);
		errNum = clReleaseMemObject(c_mem_obj[ThreadNum]);
		


	}
	else {

		//printf("\n.:CPU Benchmark is started\n\n");
		clock_t lBefore = clock();

		//for (i = 0; i < LIST_SIZE; i++) {

		//	C[i] = A[i] + B[i];
		//}

		dResult = (double)(clock() - lBefore) / CLOCKS_PER_SEC;
	}



	//printf("\n.:Benchmark ended. Total Time %12.3f sec\n\n", dResult);





	
	printf("Thread ID=%lu completed.\n", ThreadID);


	return 0;
}


//------------------------------------------
// A function to Display the message
// indicating in which tread we are
//------------------------------------------
BOOL DisplayMessage(LPCTSTR text, DWORD Param1, DWORD Param2)
{
	TCHAR msgBuf[BUF_SIZE];

	size_t cchStringSize;
	DWORD dwChars;
	HANDLE hStdout;
	// Make sure there is a console to receive output results. 

	hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
	if (hStdout == INVALID_HANDLE_VALUE)
		return TRUE;

	// Print message using thread-safe functions.
	StringCchPrintf(msgBuf, BUF_SIZE, text, Param1, Param2);
	StringCchLength(msgBuf, BUF_SIZE, &cchStringSize);
	WriteConsole(hStdout, msgBuf, cchStringSize, &dwChars, NULL);

	return FALSE;
}


void ErrorHandler(LPCTSTR lpszFunction)
{
	// Retrieve the system error message for the last-error code.

	LPVOID lpMsgBuf;
	LPVOID lpDisplayBuf;
	DWORD dw = GetLastError();

	FormatMessage(
		FORMAT_MESSAGE_ALLOCATE_BUFFER |
		FORMAT_MESSAGE_FROM_SYSTEM |
		FORMAT_MESSAGE_IGNORE_INSERTS,
		NULL,
		dw,
		MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
		(LPTSTR)&lpMsgBuf,
		0, NULL);

	// Display the error message.

	lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
		(lstrlen((LPCTSTR)lpMsgBuf) + lstrlen((LPCTSTR)lpszFunction) + 40) * sizeof(TCHAR));
	StringCchPrintf((LPTSTR)lpDisplayBuf,
		LocalSize(lpDisplayBuf) / sizeof(TCHAR),
		TEXT("%s failed with error %d: %s"),
		lpszFunction, dw, lpMsgBuf);
	MessageBox(NULL, (LPCTSTR)lpDisplayBuf, TEXT("Error"), MB_OK);

	// Free error-handling buffer allocations.

	LocalFree(lpMsgBuf);
	LocalFree(lpDisplayBuf);
}

int _tmain()
{


	DWORD threadID[MAX_THREADS], dwEvent;
	


	InitializeOpenCL_Kernel();

	// Create MAX_THREADS worker threads.

	for (int i = 0; i < MAX_THREADS; i++)
	{


		// Allocate memory for thread data.
		pDataArray[i] = (_pTDATA)HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(_TDATA));	

		if (pDataArray[i] == NULL)
		{
			// If the array allocation fails, the system is out of memory
			// so there is no point in trying to print an error message.
			// Just terminate execution.
			ExitProcess(2);
		}

		// Generate unique data for each thread to work with.

		pDataArray[i]->nThreadNum = i;


		// Create the thread to begin execution on its own.

		hThreadArray[i] = CreateThread(
			NULL,                   // default security attributes
			0,                      // use default stack size  
			MyThreadFunction,       // thread function name
			(LPVOID)pDataArray[i],			// argument to thread function 
			0,                      // use default creation flags 
			&threadID[i]);   // returns the thread identifier 

		pDataArray[i]->dwThreadID = threadID[i];
		printf("*Thread %d with Id=[%ld] created.\n", pDataArray[i]->nThreadNum, pDataArray[i]->dwThreadID);
		// Check the return value for success.
		// If CreateThread fails, terminate execution. 
		// This will automatically clean up threads and memory. 

		if (hThreadArray[i] == NULL)
		{
			ErrorHandler(TEXT("CreateThread"));
			ExitProcess(3);
		}
	} // End of main thread creation loop.

	// Wait until all threads have terminated.	
	dwEvent =  WaitForMultipleObjects(MAX_THREADS, hThreadArray, TRUE, INFINITE);

	// Close all thread handles and free memory allocations.
	for (int i = 0; i < MAX_THREADS; i++)
	{
		CloseHandle(hThreadArray[i]);
		printf("Handle %ld with  ID=%ld, just closed.\n", hThreadArray[i], threadID[i]);
		if (pDataArray[i] != NULL)
		{
			HeapFree(GetProcessHeap(), 0, pDataArray[i]);
			pDataArray[i] = NULL;    // Ensure address is not reused.
		}
	}

	//ShowResults();

	CleanUpOpenCL_Kernel();

	free(A);
	free(B);
	free(C);

	free(AA);
	free(BB);

	return 0;
}


DWORD WINAPI MyThreadFunction(LPVOID lpParam) {

	_pTDATA pDataArray;

	// Cast the parameter to the correct data type.
	// The pointer is known to be valid because 
	// it was checked for NULL before the thread was created.
	// make a copy of the parameter for convenience
	//pDataArray2 = *static_cast<_pTDATA*>(lpParam)
	
	pDataArray = (_pTDATA)lpParam;

	// Print the parameter values using thread-safe functions.	
	DWORD ThreadID = GetCurrentThreadId();
	DisplayMessage(TEXT("-Thread %ld with Id=[%ld] initiated\n"), pDataArray->nThreadNum, ThreadID);
	OpenCL_Kernel();

	return 0;
}

Topic		Replies	Views
OpenCL in Fedora vs Windows Same code. Runs in Windows. Kernel error in Fedora CUDA Programming and Performance	7	2621	December 11, 2009
clEnqueueNDRangeKernel throws CL_OUT_OF_RESOURCES CUDA Setup and Installation	17	3359	August 3, 2016
A problem in reading opencl kernel results in a simple reduction implementation CUDA Programming and Performance	0	809	August 4, 2016
Program crash when copying data onto device CUDA Programming and Performance	2	4158	January 15, 2012
CL_INVALID_COMMAND_QUEUE error on clFinish command - a lot of operations in each kernel driver crash CUDA Programming and Performance	3	8407	September 1, 2011
Is task parallel programming (aka "concurrent kernels") in OpenCL supported? CUDA Programming and Performance	1	4287	August 31, 2011
clEnqueueNDRangeKernel() returns CL_OUT_OF_RESOURCES and I have no idea why CUDA Programming and Performance	1	7454	July 26, 2010
OpenCL CL_OUT_OF_RESOURCES on event.wait() CUDA Programming and Performance	63	25102	November 19, 2010
launch kernels in parallel? CUDA Programming and Performance	16	23992	July 29, 2010
Significant speedup of OpenCL vs CUDA CUDA Programming and Performance	23	7863	February 12, 2022

Multithread CPUs with Parallel OpenCL Kernels reports CL_OUT_OF_RESOURCES

Related topics