OpenCL Driver BUG? clEnqueueReadImage fails to copy data from Image Object

Hi,

I have written a code snippet that simply allocates an Image Object and launches a kernel that stores each global thread ID into that Image Object. clEnqueueReadImage then returns all zeros.

If I use a buffer object instead, I get right results.

Here is the code snippet

// utilities, system and OpenCL includes

#include <oclUtils.h>

//#define USE_BUFFER

// OpenCL Stuff		

cl_int ciErrNum;			        // Error code var

cl_platform_id cpPlatform;          // OpenCL platform

cl_device_id* cdDevices = NULL;		// device list

cl_uint uiTargetDevice = 0;	        // Default Device to compute on

cl_context cxGPUContext;				// OpenCL context

cl_command_queue cqCommandQueue;		// OpenCL command queue

cl_uint uiNumDevsUsed = 1;			// Number of devices used in this sample 

cl_program cpProgram;						// OpenCL program

char* cPathAndName = NULL;			// var for full paths to data, src, etc.

const char* clSourcefile = "kernels.cl";  // OpenCL kernel source file

size_t szKernelLength;			    // Byte size of kernel code

char* cSourceCL = NULL;				// Buffer to hold source for compilation 

// Fill a rows*cols byte buffer with pseudo-random values in [0, 10].
void initData(unsigned char *in, int rows, int cols)
{
	const int count = rows * cols;

	for (int idx = 0; idx < count; idx++)
	{
		// Scale rand() from [0, RAND_MAX] down to a small byte value.
		in[idx] = (unsigned char) (rand() * 10.0f / (float)RAND_MAX);
	}
}

// Fill a rows*cols unsigned-int buffer with pseudo-random values in [0, 10].
// Fix: the cast to unsigned char was a copy-paste remnant from initData();
// the destination element type is unsigned int, so cast to that instead.
// (Values are always <= 10, so results are unchanged.)
void initDataInt(unsigned int *in, int rows, int cols)
{
	int size  = rows * cols;

	for ( int i = 0; i < size; i++)
	{
		in[i] = (unsigned int) (rand() * 10.0f / (float)RAND_MAX);
	}
}

// Print a rows x cols matrix of byte values to stdout, one row per line,
// followed by a trailing blank line.
void displayImage(unsigned char *in, int rows, int cols)
{
	for (int r = 0; r < rows; r++)
	{
		const unsigned char *row = in + (size_t)r * cols;

		for (int c = 0; c < cols; c++)
		{
			printf("%d ", row[c]);
		}
		printf("\n");
	}

	printf("\n");
}

// Print a rows x cols matrix of unsigned int values to stdout, one row per
// line, followed by a trailing blank line.
// Fix: use %u — passing an unsigned int where printf expects an int (%d) is
// a format-specifier mismatch; values above INT_MAX would print incorrectly.
void displayImageInt(unsigned int *in, int rows, int cols)
{
	for ( int i = 0; i < rows; i++)
	{
		for ( int j = 0; j < cols; j++)
		{
			printf("%u ", in[ i * cols + j ]);
		}
		printf("\n");
	}

	printf("\n");
}

// Integer division rounded up: the smallest number of divisor-sized groups
// needed to cover dividend. Written with the remainder test (rather than
// (dividend + divisor - 1) / divisor) so it cannot overflow.
size_t DivUp(size_t dividend, size_t divisor)
{
	size_t quotient = dividend / divisor;

	if (dividend % divisor != 0)
	{
		quotient++;
	}
	return quotient;
}

void initOpenCL()

{

// Get the NVIDIA platform

	ciErrNum = oclGetPlatformID(&cpPlatform);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clGetPlatformID...\n"); 

	//Get all the devices

	cl_uint uiNumDevices = 0;           // Number of devices available

	cl_uint uiTargetDevice = 0;	        // Default Device to compute on

	cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)

	printf("Get the Device info and select Device...\n");

	ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );

	ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	// Set target device and Query number of compute units on uiTargetDevice

	printf("# of Devices Available = %u\n", uiNumDevices); 

	printf("Using Device %u: ", uiTargetDevice); 

	oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);

	ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("\n# of Compute Units = %u\n", uiNumComputeUnits); 

	 cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clCreateContext created successfully...\n"); 

	// Create a command-queue 

	cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clCreateCommandQueue...\n");

	cPathAndName = shrFindFilePath(clSourcefile, NULL);

	oclCheckErrorEX(cPathAndName != NULL, shrTRUE, NULL);

	cSourceCL = oclLoadProgSource(cPathAndName, "// My comment\n", &szKernelLength);

	oclCheckErrorEX(cSourceCL != NULL, shrTRUE, NULL);

	printf("oclLoadProgSource...\n"); 

	// Create the program 

	cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	shrLog("clCreateProgramWithSource...\n"); 

	 // Setup build options string 

	//--------------------------------

	std::string sBuildOpts = " -cl-fast-relaxed-math"; 

	// sBuildOpts  += " -D USE_BUFFER";

	// Build the program

	ciErrNum = clBuildProgram(cpProgram, 0, NULL, sBuildOpts.c_str(), NULL, NULL);

	if (ciErrNum != CL_SUCCESS)

	{

		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);

		 oclLogBuildInfo(cpProgram, cdDevices[uiTargetDevice]);

		 oclLogPtx(cpProgram, cdDevices[uiTargetDevice], "oclBoxFilter.ptx");

		printf("FAILURE...\n"); 

		exit(-1);

	}

	printf("clBuildProgram...\n");

}

int main(int argc, char ** argv)

{

	int width = 10;

	int height = 10;

	int sizeInBytes = width * height * sizeof(unsigned int);

		

	initOpenCL();

	

	// OpenCL Kernel

	cl_kernel oppKernel;

	oppKernel = clCreateKernel( cpProgram, "filterTex", &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clCreateKernel (oppKernel)...\n"); 

	//-----------------------------

#ifdef USE_BUFFER

	

	// Result in Buffer Object

	cl_mem cmDevBufOut = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeInBytes, NULL, &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clCreateBuffer ( Output buffers, device GMEM)...\n"); 

#else

	// Result in Image Object

	cl_mem imageOut;

	cl_image_format InputFormat;        // OpenCL format descriptor for 2D image useage

	InputFormat.image_channel_order = CL_RGBA;

	InputFormat.image_channel_data_type = CL_UNSIGNED_INT8;

	

	// clCreateImage2D gives INVALID_IMAGE_SIZE if host_ptr is NULL

	unsigned char *temp = (unsigned char * )malloc ( sizeInBytes );

	

	imageOut = clCreateImage2D(cxGPUContext, CL_MEM_READ_WRITE, &InputFormat, 

								 width, height, 

								  width * sizeof(unsigned int), temp, &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("Device Image allocation SUCCESS to Hold Result...\n");

#endif

	

	// Setting up Kernel Args

#ifdef USE_BUFFER

	ciErrNum |= clSetKernelArg(oppKernel, 0, sizeof(cl_mem), (void*)&cmDevBufOut);

#else

	ciErrNum |= clSetKernelArg(oppKernel, 0, sizeof(cl_mem), (void*)&imageOut);

#endif

	ciErrNum |= clSetKernelArg(oppKernel, 1, sizeof(unsigned int), (void*)&width);

	ciErrNum |= clSetKernelArg(oppKernel, 2, sizeof(unsigned int), (void*)&height);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clSetKernelArg (0-2) oppKernel...\n"); 

	// Set global and local work sizes for row kernel

	size_t szLocalWorkSize[2];

	size_t szGlobalWorkSize[2];

	szLocalWorkSize[0] = 32;

	szLocalWorkSize[1] = 1;

	szGlobalWorkSize[0]= szLocalWorkSize[0] * DivUp((size_t)width * height, szLocalWorkSize[0]);

	szGlobalWorkSize[1] = 1;

	// Sync host and start computation timer

	clFinish(cqCommandQueue);

	 // 2D Image (Texture)

	 ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, oppKernel, 2, NULL, 

										  szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("KERNEL LAUNCH SUCCESS\n");

	// Sync host and start computation timer

	clFinish(cqCommandQueue);

	// Copy Result to Host

	unsigned int * res = (unsigned int *)malloc(sizeInBytes);

#ifdef USE_BUFFER

	clEnqueueReadBuffer(cqCommandQueue, cmDevBufOut, CL_TRUE, 0, sizeInBytes, res, 0, NULL, NULL);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("clEnqueueReadBuffer SUCCESS\n");

#else

	// Read Image Object to host memory

	const size_t szTexOrigin[3] = {0, 0, 0};                // Offset of input texture origin relative to host image

	const size_t szTexRegion[3] = {width, height, 1};   // Size of texture region to operate on

	ciErrNum = clEnqueueReadImage(cqCommandQueue, imageOut, CL_TRUE, 

								   szTexOrigin, szTexRegion, 0, 0, res, 0, NULL, NULL);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("Device Image Copied to Host SUCEESS\n");

#endif

	displayImageInt(res, height, width);

	free(res);

}

The kernel code follows :

// Writes each work-item's flat global ID into the destination
// (a global-memory buffer with -D USE_BUFFER, otherwise a 2D image).
__kernel void filterTex(
#ifdef USE_BUFFER
						__global unsigned int *uiDest,
#else
                        __write_only image2d_t iDest,
#endif
							unsigned int uiWidth, unsigned int uiHeight)
{
	// Flat 1D launch: recover 2D pixel coordinates from the linear ID.
	size_t globalPosY = get_global_id(0);
	int x = globalPosY % uiWidth;
	int y = globalPosY / uiWidth;
	int2 pos = {x, y};

	// Fix: the global range is padded up to a multiple of the work-group
	// size, so trailing work-items must not write out of bounds.
	if (globalPosY >= (size_t)uiWidth * uiHeight)
		return;

#ifdef USE_BUFFER
	// Write out to GMEM
	uiDest[ globalPosY ] = globalPosY;
#else
	// Fix: the image channel data type is CL_UNSIGNED_INT8 (an unsigned
	// integer format), so it must be written with write_imageui and a uint4.
	// Using write_imagei on an unsigned-int image is undefined and was the
	// reason clEnqueueReadImage returned all zeros.
	uint4 val = {globalPosY, 0, 0, 0};
	write_imageui(iDest, pos, val);
#endif
}

Output using Image object

clGetPlatformID...

Get the Device info and select Device...

# of Devices Available = 1

Using Device 0: GeForce 8800 GTX

# of Compute Units = 16

clCreateContext created successfully...

clCreateCommandQueue...

oclLoadProgSource...

clCreateProgramWithSource...

clBuildProgram...

clCreateKernel (oppKernel)...

Device Image allocation SUCCESS to Hold Result...

clSetKernelArg (0-2) oppKernel...

KERNEL LAUNCH SUCCESS

Device Image Copied to Host SUCEESS

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0

Output using Buffer Object ( uncomment #define USE_BUFFER and sBuildOpts += " -D USE_BUFFER";)

clGetPlatformID...

Get the Device info and select Device...

# of Devices Available = 1

Using Device 0: GeForce 8800 GTX

# of Compute Units = 16

clCreateContext created successfully...

clCreateCommandQueue...

oclLoadProgSource...

clCreateProgramWithSource...

clBuildProgram...

clCreateKernel (oppKernel)...

clCreateBuffer ( Output buffers, device GMEM)...

clSetKernelArg (0-2) oppKernel...

KERNEL LAUNCH SUCCESS

clEnqueueReadBuffer SUCCESS

0 1 2 3 4 5 6 7 8 9

10 11 12 13 14 15 16 17 18 19

20 21 22 23 24 25 26 27 28 29

30 31 32 33 34 35 36 37 38 39

40 41 42 43 44 45 46 47 48 49

50 51 52 53 54 55 56 57 58 59

60 61 62 63 64 65 66 67 68 69

70 71 72 73 74 75 76 77 78 79

80 81 82 83 84 85 86 87 88 89

90 91 92 93 94 95 96 97 98 99

Is anyone else facing this issue? Is this a driver bug?

Also, I’m forced to allocate a host_ptr, because calling clCreateImage2D with a NULL host_ptr returns CL_INVALID_IMAGE_SIZE:

unsigned char *temp = (unsigned char * )malloc ( sizeInBytes );

	

	imageOut = clCreateImage2D(cxGPUContext, CL_MEM_READ_WRITE, &InputFormat, 

								 width, height, 

								  width * sizeof(unsigned int), temp, &ciErrNum);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

	printf("Device Image allocation SUCCESS to Hold Result...\n");

System Details:

oclDeviceQuery, Platform Name = NVIDIA CUDA, Platform Version = OpenCL 1.0 CUDA

3.2.1, SDK Revision = 7027912, NumDevs = 1, Device = GeForce 8800 GTX

System Info:

Local Time/Date = 16:59:50, 4/6/2011

 CPU Arch: 0

 CPU Level: 15

 # of CPU processors: 2

 Windows Build: 7600

 Windows Ver: 6.1 (Windows Vista / Windows 7)

Thanks

Fixed by doing the following:

  1. NULL can be passed as host_ptr to clCreateImage2D, but the row pitch passed must then be zero. From the spec:

image_row_pitch is the scan-line pitch in bytes. This must be 0 if host_ptr is NULL and can be
either 0 or >= image_width * size of element in bytes if host_ptr is not NULL. If host_ptr is not
NULL and image_row_pitch = 0, image_row_pitch is calculated as image_width * size of
element in bytes. If image_row_pitch is not 0, it must be a multiple of the image element size in
bytes.

  2. Writing to the Image Object should use write_imageui instead of write_imagei, since the channel data type used is CL_UNSIGNED_INT8.