If you wish to provide a complete code along with compile instructions, I’ll be happy to try it.
Here’s my version of what you posted, it seems to work correctly for me:
t3.cl:
// OpenCL Kernel Function for 2D kernel test
__kernel void test2Dim(
__global uint* field)
{
size_t id = get_global_id(1)*get_global_size(0) + get_global_id(0);
field[id] = id;
}
t3.cpp:
// *********************************************************************
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <CL/opencl.h>
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typicall
a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* sz
FinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
size_t szPreambleLength = strlen(cPreamble);
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
memcpy(cSourceString, cPreamble, szPreambleLength);
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream
) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength + szPreambleLength;
}
cSourceString[szSourceLength + szPreambleLength] = '\0';
return cSourceString;
}
// Name of the file with the source code for the computation kernel
// *********************************************************************
const char* cSourceFile = "t3.cl";
// Host buffers for demo
// *********************************************************************
void *srcA; // Host buffers for OpenCL test
// OpenCL Vars
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue;// OpenCL command que
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id cdDevice; // OpenCL device
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrcA; // OpenCL device source buffer A
size_t szGlobalWorkSize[2]; // 1D var for Total # of work items
size_t szLocalWorkSize[2]; // 1D var for # of work items in the work group
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
cl_int ciErr1, ciErr2; // Error code var
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
// demo config vars
int iNumElements = 4; // Length of float arrays to process
// Forward Declarations
// *********************************************************************
void Cleanup (int iExitCode);
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
printf("%s Starting...\n\n# of elements \t= %i\n", argv[0], iNumElements);
// Allocate and initialize host arrays
printf( "Allocate and Init Host Mem...\n");
srcA = (void *)malloc(sizeof(cl_uint) * iNumElements);
//Get an OpenCL platform
ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);
printf("clGetPlatformID...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
//Get the devices
ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
printf("clGetDeviceIDs...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
//Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
printf("clCreateContext...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
printf("clCreateCommandQueue...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * iNumElements, NULL, &ciErr1);
printf("clCreateBuffer...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Read the OpenCL kernel in from source file
printf("oclLoadProgSource (%s)...\n", cSourceFile);
cPathAndName = "./t3.cl";
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
// Create the program
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
printf("clCreateProgramWithSource...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Build the program with 'mad' Optimization option
char* flags = "-cl-fast-relaxed-math";
ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
printf("clBuildProgram...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Create the kernel
ckKernel = clCreateKernel(cpProgram, "test2Dim", &ciErr1);
printf("clCreateKernel (test2Dim)...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Set the Argument values
ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
printf("clSetKernelArg 0 ...\n\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// --------------------------------------------------------
// Start Core sequence... compute, copy results back
szGlobalWorkSize[0] = 2;
szGlobalWorkSize[1] = 2;
szLocalWorkSize[0] = 2;
szLocalWorkSize[1] = 2;
// Launch kernel
ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
printf("clEnqueueNDRangeKernel (test2Dim)...\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Synchronous/blocking read of results, and check accumulated errors
ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmDevSrcA, CL_TRUE, 0, sizeof(cl_uint) * iNumElements, srcA, 0, NULL, NULL);
printf("clEnqueueReadBuffer (srcA)...\n\n");
if (ciErr1 != CL_SUCCESS)
{
printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
//--------------------------------------------------------
// print results
for (int i = 0; i < iNumElements; i++)
printf("srcA[%d] = %d\n",i, *((uint *)srcA+i));
printf("Success!\n");
// Cleanup and leave
Cleanup (EXIT_SUCCESS);
}
void Cleanup (int iExitCode)
{
// Cleanup allocated objects
printf("Starting Cleanup...\n\n");
if(cSourceCL)free(cSourceCL);
if(ckKernel)clReleaseKernel(ckKernel);
if(cpProgram)clReleaseProgram(cpProgram);
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
// Free host memory
free(srcA);
exit (iExitCode);
}
compile:
g++ -L/usr/lib64 -lOpenCL -I/usr/local/cuda/include -o t3 t3.cpp
(using CUDA 6 standard linux install)
results:
$ ./t3
./t3 Starting...
# of elements = 4
Allocate and Init Host Mem...
clGetPlatformID...
clGetDeviceIDs...
clCreateContext...
clCreateCommandQueue...
clCreateBuffer...
oclLoadProgSource (t3.cl)...
clCreateProgramWithSource...
clBuildProgram...
clCreateKernel (test2Dim)...
clSetKernelArg 0 ...
clEnqueueNDRangeKernel (test2Dim)...
clEnqueueReadBuffer (srcA)...
srcA[0] = 0
srcA[1] = 1
srcA[2] = 2
srcA[3] = 3
Success!
Starting Cleanup...