I am running 24 CPU threads to call the same OpenCL Kernel in parallel, which adds two arrays A and B each one with 12800 long ints which means 4 bytes ~ 51Kb each. So the total (if I am correct) kernel memory needs is about 51Kb X 3 = 153Kb for each kernel execution (array A, B, and C=result).
For 24 CPU Threads, the total video memory needs are approaching ~3.68MB which is very near to my Video Card installed memory 4MB (GeForce GTX 960m). Increasing the MAX_THREADS sometimes (not every time) I get CL_OUT_OF_RESOURCES (-5) clenqueuereadbuffer
.
If my code is not buggy, the main question here is if there any way to prevent the CL_OUT_OF_RESOURCES by stalling kernel execution until some video card memory is freed.
Device Kernel
__kernel void vector_add(__global const long *A, __global const long *B, __global long *C) {
// Get the index of the current element to be processed
long i = get_global_id(0);
C[i] = A[i] + B[i];
}
and the host program
#include <windows.h>
#include <tchar.h>
#include <strsafe.h>
#include <CL\cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
//#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#define MAX_THREADS 24
#define BUF_SIZE 255
DWORD WINAPI MyThreadFunction(LPVOID lpParam);
void ErrorHandler(LPCTSTR lpszFunction);
BOOL DisplayMessage(LPCTSTR text, DWORD Param1, DWORD Param2);
// Sample custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
typedef struct TData {
int nThreadNum;
DWORD dwThreadID;
} _TDATA, *_pTDATA;
HANDLE hThreadArray[MAX_THREADS];
#define MAX_SOURCE_SIZE (0x100000)
#define UseOpenCL 1
const long LIST_SIZE = 12800;
_pTDATA pDataArray[MAX_THREADS];
long *A = (long*)malloc(sizeof(long)*LIST_SIZE);
long *B = (long*)malloc(sizeof(long)*LIST_SIZE);
long *C = (long*)malloc(sizeof(long)*LIST_SIZE);
long *DestArrayA[MAX_THREADS];
long *DestArrayB[MAX_THREADS];
long *AA = (long*)malloc(MAX_THREADS * LIST_SIZE * sizeof(long));
long *BB = (long*)malloc(MAX_THREADS * LIST_SIZE * sizeof(long));
cl_mem KernelAreaBufferA[MAX_THREADS], KernelAreaBufferB[MAX_THREADS]; ///< Global Buffer
cl_command_queue command_queue;
cl_queue_properties qprop[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, 0 };
cl_context context;
cl_device_id device_id = NULL;
cl_program program;
cl_kernel kernel;
cl_mem c_mem_obj[MAX_THREADS];
BYTE GetThreadIDNum(DWORD threadid) {
for (int i = 0; i < MAX_THREADS; i++) {
if (pDataArray[i]->dwThreadID == threadid)
return pDataArray[i]->nThreadNum;
}
return 0;
}
void ShowResults() {
// Display the result to the screen/*
printf("\n--------------------------------------\n");
for (int i = 0; i < LIST_SIZE; i++)
printf("%lu + %lu = %lu\n", A[i], B[i], C[i]);
}
inline BOOL checkError(cl_int err, const char * name)
{
BOOL success = true;
if (err != CL_SUCCESS) {
fprintf(stderr, "ERROR: %s ( %d )\n", name, err);
success = false;
}
return success;
}
int InitializeOpenCL_Kernel() {
printf("InitializeOpenCL_Kernel()");
cl_int errNum; ///< Indicate error
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
//*(AA + ThreadNum * MAX_THREADS + i) = i;
//*(BB + ThreadNum * MAX_THREADS + i) = LIST_SIZE - i;
}
if (UseOpenCL == 0)
return 0;
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
errNum = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); //Get information about the platform and the devices available on the computer (line 42)
errNum = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); //Select devices to use in execution (line 43)
size_t paramValueSize; ///< Info Device
errNum = clGetDeviceInfo(device_id, CL_DEVICE_NAME, 0, NULL, ¶mValueSize);
char* info = (char *)malloc(sizeof(char) * paramValueSize); // String to display vendor name info
errNum = clGetDeviceInfo(device_id, CL_DEVICE_NAME, paramValueSize, info, NULL);
//printf("\n.:OpenCL Benchmark is started\n\n");
printf("Using GPU device: %s\n", info);
// Create an OpenCL context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &errNum); //Create an OpenCL context (line 47)
// Create a command queue
//cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &errNum); //Create a command queue (line 50)
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &errNum); //Create program object (line 67)
// Build the program
errNum = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //Load the kernel source code (line 24-35) and compile it (line 71) (online exeuction) or load the precompiled binary OpenCL program (offline execution)
checkError(errNum, "clEnqueueUnmapMemObject error");
// Create the OpenCL kernel
kernel = clCreateKernel(program, "vector_add", &errNum); //Create kernel object (line 74)
return (int)errNum;
}
int CleanUpOpenCL_Kernel() {
printf("CleanUpOpenCL_Kernel()");
cl_int errNum; ///< Indicate error
errNum = clReleaseCommandQueue(command_queue);
errNum = clReleaseKernel(kernel);
errNum = clReleaseProgram(program);
errNum = clReleaseContext(context);
return errNum;
}
int OpenCL_Kernel(void) {
double dResult = 0;
DWORD ThreadID = GetCurrentThreadId();
int ThreadNum = GetThreadIDNum(ThreadID);
cl_int errNum; ///< Indicate error
// Create the two input vectors
long i;
clock_t lBefore = clock();
if (UseOpenCL == 1) {
command_queue= clCreateCommandQueueWithProperties(context, device_id, qprop, &errNum); //Create a command queue (line 50)
//********* ARRAY A
//cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum);
//errNum = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(long), A, 0, NULL, NULL);
KernelAreaBufferA[ThreadNum] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * LIST_SIZE, NULL, &errNum);
checkError(errNum, "clCreateBuffer error");
DestArrayA[ThreadNum] = (long*)clEnqueueMapBuffer(command_queue, KernelAreaBufferA[ThreadNum], CL_TRUE, CL_MAP_WRITE, 0, sizeof(long) * LIST_SIZE, 0, NULL, NULL, &errNum);
checkError(errNum, "clEnqueueMapBuffer error");
for (i = 0; i < LIST_SIZE; i++) {
DestArrayA[ThreadNum][i] = A[i]; //*(AA + ThreadNum * MAX_THREADS + i);
}
errNum = clEnqueueUnmapMemObject(command_queue, KernelAreaBufferA[ThreadNum], DestArrayA[ThreadNum], 0, NULL, NULL);
checkError(errNum, "clEnqueueUnmapMemObject error");
//********* ARRAY B
//cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum);
//errNum = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(long), B, 0, NULL, NULL); //Transfer data (list A and B) to memory buffers on the device (line 61-64)
KernelAreaBufferB[ThreadNum] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * LIST_SIZE, NULL, &errNum);
checkError(errNum, "clCreateBuffer error");
DestArrayB[ThreadNum] = (long*)clEnqueueMapBuffer(command_queue, KernelAreaBufferB[ThreadNum], CL_TRUE, CL_MAP_WRITE, 0, sizeof(long) * LIST_SIZE, 0, NULL, NULL, &errNum);
checkError(errNum, "clEnqueueMapBuffer error");
for (i = 0; i < LIST_SIZE; i++) {
DestArrayB[ThreadNum][i] = B[i]; //*(BB + ThreadNum * MAX_THREADS + i);
}
errNum = clEnqueueUnmapMemObject(command_queue, KernelAreaBufferB[ThreadNum], DestArrayB[ThreadNum], 0, NULL, NULL);
checkError(errNum, "clEnqueueUnmapMemObject error");
//********* ARRAY C
c_mem_obj[ThreadNum] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(long), NULL, &errNum); //Create memory buffer objects(line 53-58)
checkError(errNum, "clCreateBuffer error");
// Set the arguments of the kernel
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &KernelAreaBufferA[ThreadNum] /* &a_mem_obj */);//Set kernel arguments (line 77-79)
checkError(errNum, "clSetKernelArg error");
errNum = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &KernelAreaBufferB[ThreadNum] /* &b_mem_obj */);
checkError(errNum, "clSetKernelArg error");
errNum = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj[ThreadNum]);
checkError(errNum, "clSetKernelArg error");
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
DWORD ThreadID = GetCurrentThreadId();
DisplayMessage(TEXT("clEnqueueNDRangeKernel initiated with Id=[%ld] & Num=[%ld]\n"), ThreadID, ThreadNum);
errNum = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); //Execute the kernel (line 84)
checkError(errNum, "clEnqueueNDRangeKernel error");
//errNum = clEnqueueTask(command_queue, kernel, 0, NULL, NULL); //Execute the kernel (line 84)
// Read the memory buffer C on the device to the local variable C
errNum = clEnqueueReadBuffer(command_queue, c_mem_obj[ThreadNum], CL_TRUE, 0, LIST_SIZE * sizeof(long), C, 0, NULL, NULL); //Read memory objects.In this case we read the list C from the compute device(line 88 - 90)
checkError(errNum, "clEnqueueReadBuffer error");
dResult = (double)(clock() - lBefore) / CLOCKS_PER_SEC;
// Clean up
errNum = clFlush(command_queue);
errNum = clFinish(command_queue);
//errNum = clReleaseMemObject(a_mem_obj);
errNum = clReleaseMemObject(KernelAreaBufferA[ThreadNum]);
//errNum = clReleaseMemObject(b_mem_obj);
errNum = clReleaseMemObject(KernelAreaBufferB[ThreadNum]);
errNum = clReleaseMemObject(c_mem_obj[ThreadNum]);
}
else {
//printf("\n.:CPU Benchmark is started\n\n");
clock_t lBefore = clock();
//for (i = 0; i < LIST_SIZE; i++) {
// C[i] = A[i] + B[i];
//}
dResult = (double)(clock() - lBefore) / CLOCKS_PER_SEC;
}
//printf("\n.:Benchmark ended. Total Time %12.3f sec\n\n", dResult);
printf("Thread ID=%lu completed.\n", ThreadID);
return 0;
}
//------------------------------------------
// A function to Display the message
// indicating in which tread we are
//------------------------------------------
BOOL DisplayMessage(LPCTSTR text, DWORD Param1, DWORD Param2)
{
TCHAR msgBuf[BUF_SIZE];
size_t cchStringSize;
DWORD dwChars;
HANDLE hStdout;
// Make sure there is a console to receive output results.
hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
if (hStdout == INVALID_HANDLE_VALUE)
return TRUE;
// Print message using thread-safe functions.
StringCchPrintf(msgBuf, BUF_SIZE, text, Param1, Param2);
StringCchLength(msgBuf, BUF_SIZE, &cchStringSize);
WriteConsole(hStdout, msgBuf, cchStringSize, &dwChars, NULL);
return FALSE;
}
void ErrorHandler(LPCTSTR lpszFunction)
{
// Retrieve the system error message for the last-error code.
LPVOID lpMsgBuf;
LPVOID lpDisplayBuf;
DWORD dw = GetLastError();
FormatMessage(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL,
dw,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
(LPTSTR)&lpMsgBuf,
0, NULL);
// Display the error message.
lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
(lstrlen((LPCTSTR)lpMsgBuf) + lstrlen((LPCTSTR)lpszFunction) + 40) * sizeof(TCHAR));
StringCchPrintf((LPTSTR)lpDisplayBuf,
LocalSize(lpDisplayBuf) / sizeof(TCHAR),
TEXT("%s failed with error %d: %s"),
lpszFunction, dw, lpMsgBuf);
MessageBox(NULL, (LPCTSTR)lpDisplayBuf, TEXT("Error"), MB_OK);
// Free error-handling buffer allocations.
LocalFree(lpMsgBuf);
LocalFree(lpDisplayBuf);
}
int _tmain()
{
DWORD threadID[MAX_THREADS], dwEvent;
InitializeOpenCL_Kernel();
// Create MAX_THREADS worker threads.
for (int i = 0; i < MAX_THREADS; i++)
{
// Allocate memory for thread data.
pDataArray[i] = (_pTDATA)HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(_TDATA));
if (pDataArray[i] == NULL)
{
// If the array allocation fails, the system is out of memory
// so there is no point in trying to print an error message.
// Just terminate execution.
ExitProcess(2);
}
// Generate unique data for each thread to work with.
pDataArray[i]->nThreadNum = i;
// Create the thread to begin execution on its own.
hThreadArray[i] = CreateThread(
NULL, // default security attributes
0, // use default stack size
MyThreadFunction, // thread function name
(LPVOID)pDataArray[i], // argument to thread function
0, // use default creation flags
&threadID[i]); // returns the thread identifier
pDataArray[i]->dwThreadID = threadID[i];
printf("*Thread %d with Id=[%ld] created.\n", pDataArray[i]->nThreadNum, pDataArray[i]->dwThreadID);
// Check the return value for success.
// If CreateThread fails, terminate execution.
// This will automatically clean up threads and memory.
if (hThreadArray[i] == NULL)
{
ErrorHandler(TEXT("CreateThread"));
ExitProcess(3);
}
} // End of main thread creation loop.
// Wait until all threads have terminated.
dwEvent = WaitForMultipleObjects(MAX_THREADS, hThreadArray, TRUE, INFINITE);
// Close all thread handles and free memory allocations.
for (int i = 0; i < MAX_THREADS; i++)
{
CloseHandle(hThreadArray[i]);
printf("Handle %ld with ID=%ld, just closed.\n", hThreadArray[i], threadID[i]);
if (pDataArray[i] != NULL)
{
HeapFree(GetProcessHeap(), 0, pDataArray[i]);
pDataArray[i] = NULL; // Ensure address is not reused.
}
}
//ShowResults();
CleanUpOpenCL_Kernel();
free(A);
free(B);
free(C);
free(AA);
free(BB);
return 0;
}
DWORD WINAPI MyThreadFunction(LPVOID lpParam) {
_pTDATA pDataArray;
// Cast the parameter to the correct data type.
// The pointer is known to be valid because
// it was checked for NULL before the thread was created.
// make a copy of the parameter for convenience
//pDataArray2 = *static_cast<_pTDATA*>(lpParam)
pDataArray = (_pTDATA)lpParam;
// Print the parameter values using thread-safe functions.
DWORD ThreadID = GetCurrentThreadId();
DisplayMessage(TEXT("-Thread %ld with Id=[%ld] initiated\n"), pDataArray->nThreadNum, ThreadID);
OpenCL_Kernel();
return 0;
}