Hi,
I have written a code snippet that simply allocates an Image Object, launches a kernel to save global thread ID values into this Image object. clEnqueueReadImage gives me all values zeros.
If I use a buffer object instead, I get right results.
Here is the code snippet
// utilities, system and OpenCL includes
#include <oclUtils.h>
//#define USE_BUFFER
// OpenCL Stuff
cl_int ciErrNum; // Error code var
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id* cdDevices = NULL; // device list
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue; // OpenCL command queue
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
cl_program cpProgram; // OpenCL program
char* cPathAndName = NULL; // var for full paths to data, src, etc.
const char* clSourcefile = "kernels.cl"; // OpenCL kernel source file
size_t szKernelLength; // Byte size of kernel code
char* cSourceCL = NULL; // Buffer to hold source for compilation
void initData(unsigned char *in, int rows, int cols)
{
int size = rows * cols;
for ( int i = 0; i < size; i++)
{
in[i] = (unsigned char) (rand() * 10.0f / (float)RAND_MAX);
}
}
void initDataInt(unsigned int *in, int rows, int cols)
{
int size = rows * cols;
for ( int i = 0; i < size; i++)
{
in[i] = (unsigned char) (rand() * 10.0f / (float)RAND_MAX);
}
}
void displayImage(unsigned char *in, int rows, int cols)
{
for ( int i = 0; i < rows; i++)
{
for ( int j = 0; j < cols; j++)
{
printf("%d ", in[ i * cols + j ]);
}
printf("\n");
}
printf("\n");
}
void displayImageInt(unsigned int *in, int rows, int cols)
{
for ( int i = 0; i < rows; i++)
{
for ( int j = 0; j < cols; j++)
{
printf("%d ", in[ i * cols + j ]);
}
printf("\n");
}
printf("\n");
}
size_t DivUp(size_t dividend, size_t divisor)
{
return (dividend % divisor == 0) ? (dividend / divisor) : (dividend / divisor + 1);
}
void initOpenCL()
{
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clGetPlatformID...\n");
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
printf("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Set target device and Query number of compute units on uiTargetDevice
printf("# of Devices Available = %u\n", uiNumDevices);
printf("Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("\n# of Compute Units = %u\n", uiNumComputeUnits);
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clCreateContext created successfully...\n");
// Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clCreateCommandQueue...\n");
cPathAndName = shrFindFilePath(clSourcefile, NULL);
oclCheckErrorEX(cPathAndName != NULL, shrTRUE, NULL);
cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength);
oclCheckErrorEX(cSourceCL != NULL, shrTRUE, NULL);
printf("oclLoadProgSource...\n");
// Create the program
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clCreateProgramWithSource...\n");
// Setup build options string
//--------------------------------
std::string sBuildOpts = " -cl-fast-relaxed-math";
// sBuildOpts += " -D USE_BUFFER";
// Build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, sBuildOpts.c_str(), NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, cdDevices[uiTargetDevice]);
oclLogPtx(cpProgram, cdDevices[uiTargetDevice], "oclBoxFilter.ptx");
printf("FAILURE...\n");
exit(-1);
}
printf("clBuildProgram...\n");
}
int main(int argc, char ** argv)
{
int width = 10;
int height = 10;
int sizeInBytes = width * height * sizeof(unsigned int);
initOpenCL();
// OpenCL Kernel
cl_kernel oppKernel;
oppKernel = clCreateKernel( cpProgram, "filterTex", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clCreateKernel (oppKernel)...\n");
//-----------------------------
#ifdef USE_BUFFER
// Result in Buffer Object
cl_mem cmDevBufOut = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeInBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clCreateBuffer ( Output buffers, device GMEM)...\n");
#else
// Result in Image Object
cl_mem imageOut;
cl_image_format InputFormat; // OpenCL format descriptor for 2D image useage
InputFormat.image_channel_order = CL_RGBA;
InputFormat.image_channel_data_type = CL_UNSIGNED_INT8;
// clCreateImage2D gives INVALID_IMAGE_SIZE if host_ptr is NULL
unsigned char *temp = (unsigned char * )malloc ( sizeInBytes );
imageOut = clCreateImage2D(cxGPUContext, CL_MEM_READ_WRITE, &InputFormat,
width, height,
width * sizeof(unsigned int), temp, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("Device Image allocation SUCCESS to Hold Result...\n");
#endif
// Setting up Kernel Args
#ifdef USE_BUFFER
ciErrNum |= clSetKernelArg(oppKernel, 0, sizeof(cl_mem), (void*)&cmDevBufOut);
#else
ciErrNum |= clSetKernelArg(oppKernel, 0, sizeof(cl_mem), (void*)&imageOut);
#endif
ciErrNum |= clSetKernelArg(oppKernel, 1, sizeof(unsigned int), (void*)&width);
ciErrNum |= clSetKernelArg(oppKernel, 2, sizeof(unsigned int), (void*)&height);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clSetKernelArg (0-2) oppKernel...\n");
// Set global and local work sizes for row kernel
size_t szLocalWorkSize[2];
size_t szGlobalWorkSize[2];
szLocalWorkSize[0] = 32;
szLocalWorkSize[1] = 1;
szGlobalWorkSize[0]= szLocalWorkSize[0] * DivUp((size_t)width * height, szLocalWorkSize[0]);
szGlobalWorkSize[1] = 1;
// Sync host and start computation timer
clFinish(cqCommandQueue);
// 2D Image (Texture)
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, oppKernel, 2, NULL,
szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("KERNEL LAUNCH SUCCESS\n");
// Sync host and start computation timer
clFinish(cqCommandQueue);
// Copy Result to Host
unsigned int * res = (unsigned int *)malloc(sizeInBytes);
#ifdef USE_BUFFER
clEnqueueReadBuffer(cqCommandQueue, cmDevBufOut, CL_TRUE, 0, sizeInBytes, res, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("clEnqueueReadBuffer SUCCESS\n");
#else
// Read Image Object to host memory
const size_t szTexOrigin[3] = {0, 0, 0}; // Offset of input texture origin relative to host image
const size_t szTexRegion[3] = {width, height, 1}; // Size of texture region to operate on
ciErrNum = clEnqueueReadImage(cqCommandQueue, imageOut, CL_TRUE,
szTexOrigin, szTexRegion, 0, 0, res, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("Device Image Copied to Host SUCEESS\n");
#endif
displayImageInt(res, height, width);
free(res);
}
The kernel code follows :
__kernel void filterTex(
#ifdef USE_BUFFER
__global unsigned int *uiDest,
#else
__write_only image2d_t iDest,
#endif
unsigned int uiWidth, unsigned int uiHeight)
{
size_t globalPosY = get_global_id(0);
int x = globalPosY % uiWidth;
int y = globalPosY / uiWidth;
int2 pos = {x, y};
#ifdef USE_BUFFER
// Write out to GMEM
uiDest[ globalPosY ] = globalPosY;
#else
// Write to Image Object
int4 val = {globalPosY, 0, 0, 0};
write_imagei(iDest, pos, val);
#endif
}
Output using Image object
clGetPlatformID...
Get the Device info and select Device...
# of Devices Available = 1
Using Device 0: GeForce 8800 GTX
# of Compute Units = 16
clCreateContext created successfully...
clCreateCommandQueue...
oclLoadProgSource...
clCreateProgramWithSource...
clBuildProgram...
clCreateKernel (oppKernel)...
Device Image allocation SUCCESS to Hold Result...
clSetKernelArg (0-2) oppKernel...
KERNEL LAUNCH SUCCESS
Device Image Copied to Host SUCEESS
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
Output using Buffer Object ( uncomment #define USE_BUFFER and sBuildOpts += " -D USE_BUFFER"External Image
clGetPlatformID...
Get the Device info and select Device...
# of Devices Available = 1
Using Device 0: GeForce 8800 GTX
# of Compute Units = 16
clCreateContext created successfully...
clCreateCommandQueue...
oclLoadProgSource...
clCreateProgramWithSource...
clBuildProgram...
clCreateKernel (oppKernel)...
clCreateBuffer ( Output buffers, device GMEM)...
clSetKernelArg (0-2) oppKernel...
KERNEL LAUNCH SUCCESS
clEnqueueReadBuffer SUCCESS
0 1 2 3 4 5 6 7 8 9
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
30 31 32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47 48 49
50 51 52 53 54 55 56 57 58 59
60 61 62 63 64 65 66 67 68 69
70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89
90 91 92 93 94 95 96 97 98 99
Is there anyone else facing this issue. Is this a driver BUG?
Also, I’m forced to allocate a host_ptr while calling clCreateImage2D gives INVALID_IMAGE_SIZE if host_ptr is NULL
unsigned char *temp = (unsigned char * )malloc ( sizeInBytes );
imageOut = clCreateImage2D(cxGPUContext, CL_MEM_READ_WRITE, &InputFormat,
width, height,
width * sizeof(unsigned int), temp, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
printf("Device Image allocation SUCCESS to Hold Result...\n");
System Details:
oclDeviceQuery, Platform Name = NVIDIA CUDA, Platform Version = OpenCL 1.0 CUDA
3.2.1, SDK Revision = 7027912, NumDevs = 1, Device = GeForce 8800 GTX
System Info:
Local Time/Date = 16:59:50, 4/6/2011
CPU Arch: 0
CPU Level: 15
# of CPU processors: 2
Windows Build: 7600
Windows Ver: 6.1 (Windows Vista / Windows 7)
Thanks