Intel OpenCL sample: astonishing GodRays implementation

Hi guys, I wonder whether it is possible to convert the Intel OpenCL sample called GodRays so that the code is executed and computed by an Nvidia GPU?

I need your help because I want to change the original code in as few steps as possible.
The main idea is that the GPU code should stay as close to the original as possible; performance is not important.

I assume all the changes need to be made in the godray.cpp file.
In this section? Or am I hellishly wrong?

// Release every global OpenCL object referenced here: both buffers, the
// kernel, the program, the command queue and the context.
// Each handle is reset to NULL right after it is released, so calling this
// function again (or after another routine already released a buffer and
// cleared its global) never releases the same object twice.
void Cleanup_OpenCL()
{
	//release g_kernel, g_program, and memory objects
	if( g_inputBuffer )
	{
		clReleaseMemObject( g_inputBuffer );
		g_inputBuffer = NULL;
	}
	if( g_outputBuffer )
	{
		clReleaseMemObject( g_outputBuffer );
		g_outputBuffer = NULL;
	}
	if( g_kernel )
	{
		clReleaseKernel( g_kernel );
		g_kernel = NULL;
	}
	if( g_program )
	{
		clReleaseProgram( g_program );
		g_program = NULL;
	}
	if( g_cmd_queue )
	{
		clReleaseCommandQueue( g_cmd_queue );
		g_cmd_queue = NULL;
	}
	// the context goes last, after everything created from it
	if( g_context )
	{
		clReleaseContext( g_context );
		g_context = NULL;
	}
}

int Setup_OpenCL( const char *program_source )
{
cl_device_id devices[16];
size_t cb;
cl_uint size_ret = 0;
cl_int err;

cl_platform_id intel_platform_id = GetIntelOCLPlatform();
if( intel_platform_id == NULL )
{
	printf("ERROR: Failed to find Intel OpenCL platform.\n");
	return -1;
}

cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)intel_platform_id, NULL };

// create the OpenCL context on a CPU device
g_context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, NULL, NULL, NULL);
if (g_context == (cl_context)0)
	return -1;

// get the list of CPU devices associated with context
err = clGetContextInfo(g_context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
clGetContextInfo(g_context, CL_CONTEXT_DEVICES, cb, devices, NULL);

g_cmd_queue = clCreateCommandQueue(g_context, devices[0], 0, NULL);
if (g_cmd_queue == (cl_command_queue)0)
{
	Cleanup_OpenCL();
	return -1;
}

char *sources = ReadSources("GodRays.cl");	//read program .cl source file
g_program = clCreateProgramWithSource(g_context, 1, (const char**)&sources, NULL, NULL);
if (g_program == (cl_program)0)
{
	printf("ERROR: Failed to create Program with source...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

err = clBuildProgram(g_program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to build program...\n");
	BuildFailLog(g_program, devices[0]);
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

g_kernel = clCreateKernel(g_program, "GodRays", NULL);
if (g_kernel == (cl_kernel)0)
{
	printf("ERROR: Failed to create kernel...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}
free(sources);

return 0; // success...

Could somebody rewrite this sample so that it runs on the GPU?
Any help would be nice :)

P.S. I’m looking for a tutorial on how to port OpenCL code from CPU to GPU.
My English is bad, I know, but I am just starting to learn it.

Hi guys, I wonder whether it is possible to convert the Intel OpenCL sample called GodRays so that the code is executed and computed by an Nvidia GPU?

I need your help because I want to change the original code in as few steps as possible.
The main idea is that the GPU code should stay as close to the original as possible; performance is not important.

I assume all the changes need to be made in the godray.cpp file.
In this section? Or am I hellishly wrong?

// Release every global OpenCL object referenced here: both buffers, the
// kernel, the program, the command queue and the context.
// Each handle is reset to NULL right after it is released, so calling this
// function again (or after another routine already released a buffer and
// cleared its global) never releases the same object twice.
void Cleanup_OpenCL()
{
	//release g_kernel, g_program, and memory objects
	if( g_inputBuffer )
	{
		clReleaseMemObject( g_inputBuffer );
		g_inputBuffer = NULL;
	}
	if( g_outputBuffer )
	{
		clReleaseMemObject( g_outputBuffer );
		g_outputBuffer = NULL;
	}
	if( g_kernel )
	{
		clReleaseKernel( g_kernel );
		g_kernel = NULL;
	}
	if( g_program )
	{
		clReleaseProgram( g_program );
		g_program = NULL;
	}
	if( g_cmd_queue )
	{
		clReleaseCommandQueue( g_cmd_queue );
		g_cmd_queue = NULL;
	}
	// the context goes last, after everything created from it
	if( g_context )
	{
		clReleaseContext( g_context );
		g_context = NULL;
	}
}

int Setup_OpenCL( const char *program_source )
{
cl_device_id devices[16];
size_t cb;
cl_uint size_ret = 0;
cl_int err;

cl_platform_id intel_platform_id = GetIntelOCLPlatform();
if( intel_platform_id == NULL )
{
	printf("ERROR: Failed to find Intel OpenCL platform.\n");
	return -1;
}

cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)intel_platform_id, NULL };

// create the OpenCL context on a CPU device
g_context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, NULL, NULL, NULL);
if (g_context == (cl_context)0)
	return -1;

// get the list of CPU devices associated with context
err = clGetContextInfo(g_context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
clGetContextInfo(g_context, CL_CONTEXT_DEVICES, cb, devices, NULL);

g_cmd_queue = clCreateCommandQueue(g_context, devices[0], 0, NULL);
if (g_cmd_queue == (cl_command_queue)0)
{
	Cleanup_OpenCL();
	return -1;
}

char *sources = ReadSources("GodRays.cl");	//read program .cl source file
g_program = clCreateProgramWithSource(g_context, 1, (const char**)&sources, NULL, NULL);
if (g_program == (cl_program)0)
{
	printf("ERROR: Failed to create Program with source...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

err = clBuildProgram(g_program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to build program...\n");
	BuildFailLog(g_program, devices[0]);
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

g_kernel = clCreateKernel(g_program, "GodRays", NULL);
if (g_kernel == (cl_kernel)0)
{
	printf("ERROR: Failed to create kernel...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}
free(sources);

return 0; // success...

Could somebody rewrite this sample so that it runs on the GPU?
Any help would be nice :)

P.S. I’m looking for a tutorial on how to port OpenCL code from CPU to GPU.
My English is bad, I know, but I am just starting to learn it.

I say this with some obvious naivety, but isn’t the point of OpenCL that it should
just run on either CPU or GPU devices without any real modifications (ignoring the
desire for performance tuning)?

The line you probably need to look at is

g_context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, NULL, NULL, NULL);

which should probably be

g_context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

for the GPU device . . .?


jason

I say this with some obvious naivety, but isn’t the point of OpenCL that it should
just run on either CPU or GPU devices without any real modifications (ignoring the
desire for performance tuning)?

The line you probably need to look at is

g_context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_CPU, NULL, NULL, NULL);

which should probably be

g_context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

for the GPU device . . .?


jason

OK, but that is not as easy as it looks. I tried this at the beginning, but I got errors like these:

The thread ‘Win32 Thread’ (0x1f78) has exited with code -1 (0xffffffff).

The thread ‘Win32 Thread’ (0x1f48) has exited with code -1 (0xffffffff).

The thread ‘Win32 Thread’ (0x197c) has exited with code -1 (0xffffffff).

The thread ‘Win32 Thread’ (0x1974) has exited with code -1 (0xffffffff).

The program ‘[7920] GodRays.exe: Native’ has exited with code -1 (0xffffffff)

I think the main problem is in these lines:

in utils.cpp

// Look up the Intel OpenCL platform.
// Returns the first platform whose CL_PLATFORM_VENDOR string is exactly
// "Intel Corporation", or NULL when no such platform exists or an OpenCL
// query fails.
cl_platform_id GetIntelOCLPlatform()
{
	cl_platform_id pPlatforms[10] = { 0 };
	char pVendorName[128] = { 0 };
	cl_uint uiPlatformsCount = 0;

	cl_int err = clGetPlatformIDs(10, pPlatforms, &uiPlatformsCount);
	if ( err != CL_SUCCESS )
	{
		printf("ERROR: Failed to retrieve OpenCL platform IDs.\n");
		return NULL;
	}

	// the reported count is the number of platforms available, which may be
	// larger than the 10 entries actually written to pPlatforms
	if ( uiPlatformsCount > 10 )
		uiPlatformsCount = 10;

	for (cl_uint ui = 0; ui < uiPlatformsCount; ++ui)
	{
		err = clGetPlatformInfo(pPlatforms[ui], CL_PLATFORM_VENDOR, sizeof(pVendorName), pVendorName, NULL);
		if ( err != CL_SUCCESS )
		{
			printf("ERROR: Failed to retreive platform vendor name.\n");
			return NULL;
		}

		if (!strcmp(pVendorName, "Intel Corporation"))
			return pPlatforms[ui];
	}

	return NULL;
}

in main function

in godrays.cpp

// Look up the Intel platform and give up when it is not installed.
cl_platform_id intel_platform_id = GetIntelOCLPlatform();
if( intel_platform_id == NULL )
{
	printf("ERROR: Failed to find Intel OpenCL platform.\n");
	return -1;
}

// Property list that pins the context to the Intel platform found above.
cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)intel_platform_id, NULL };

I want to make this sample independent of the Intel OpenCL implementation, and I think I ought to change cl_context_properties and cl_platform_id

from Intel to Nvidia somehow, or am I wrong? Also, CL_PLATFORM_VENDOR makes me wonder, because "vendor" reminds me of the distinction between different models of graphics cards.

Thanks for your response.

OK, but that is not as easy as it looks. I tried this at the beginning, but I got errors like these:

The thread ‘Win32 Thread’ (0x1f78) has exited with code -1 (0xffffffff).

The thread ‘Win32 Thread’ (0x1f48) has exited with code -1 (0xffffffff).

The thread ‘Win32 Thread’ (0x197c) has exited with code -1 (0xffffffff).

The thread ‘Win32 Thread’ (0x1974) has exited with code -1 (0xffffffff).

The program ‘[7920] GodRays.exe: Native’ has exited with code -1 (0xffffffff)

I think the main problem is in these lines:

in utils.cpp

// Look up the Intel OpenCL platform.
// Returns the first platform whose CL_PLATFORM_VENDOR string is exactly
// "Intel Corporation", or NULL when no such platform exists or an OpenCL
// query fails.
cl_platform_id GetIntelOCLPlatform()
{
	cl_platform_id pPlatforms[10] = { 0 };
	char pVendorName[128] = { 0 };
	cl_uint uiPlatformsCount = 0;

	cl_int err = clGetPlatformIDs(10, pPlatforms, &uiPlatformsCount);
	if ( err != CL_SUCCESS )
	{
		printf("ERROR: Failed to retrieve OpenCL platform IDs.\n");
		return NULL;
	}

	// the reported count is the number of platforms available, which may be
	// larger than the 10 entries actually written to pPlatforms
	if ( uiPlatformsCount > 10 )
		uiPlatformsCount = 10;

	for (cl_uint ui = 0; ui < uiPlatformsCount; ++ui)
	{
		err = clGetPlatformInfo(pPlatforms[ui], CL_PLATFORM_VENDOR, sizeof(pVendorName), pVendorName, NULL);
		if ( err != CL_SUCCESS )
		{
			printf("ERROR: Failed to retreive platform vendor name.\n");
			return NULL;
		}

		if (!strcmp(pVendorName, "Intel Corporation"))
			return pPlatforms[ui];
	}

	return NULL;
}

in main function

in godrays.cpp

// Look up the Intel platform and give up when it is not installed.
cl_platform_id intel_platform_id = GetIntelOCLPlatform();
if( intel_platform_id == NULL )
{
	printf("ERROR: Failed to find Intel OpenCL platform.\n");
	return -1;
}

// Property list that pins the context to the Intel platform found above.
cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)intel_platform_id, NULL };

I want to make this sample independent of the Intel OpenCL implementation, and I think I ought to change cl_context_properties and cl_platform_id

from Intel to Nvidia somehow, or am I wrong? Also, CL_PLATFORM_VENDOR makes me wonder, because "vendor" reminds me of the distinction between different models of graphics cards.

Thanks for your response.

Now I got 1>------ Build started: Project: GodRays, Configuration: Debug Win32 ------
1> GodRays.cpp
1>c:\users\public\documents\intel\opencl sdk\samples\godrays\godrays.cpp(82): error C2109: subscript requires array or pointer type
1>c:\users\public\documents\intel\opencl sdk\samples\godrays\godrays.cpp(103): error C2109: subscript requires array or pointer type
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

code
// Copyright (c) 2009-2010 Intel Corporation
// All rights reserved.
//
// WARRANTY DISCLAIMER
//
// THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
// MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of the Materials, and requests that all
// problem reports or change requests be submitted to it directly

#include “stdafx.h”
#include “CL\cl.h”
#include “utils.h”

//we want to use POSIX functions
// Save the current warning state, then silence MSVC warning 4996 for the
// code that follows.
#pragma warning( push )
#pragma warning( disable : 4996 )

// Local work-group size handed to clEnqueueNDRangeKernel in ExecuteGodRaysKernel.
#define BLOCK_DIM_X 1
// Not referenced by the code visible in this file section.
#define BLOCK_DIM_Y 1
// Passed as the last argument of EvaluateRay by ExecuteGodRaysReference.
#define GOD_RAYS_BUNCH_SIZE 15 //16
// File opened (binary mode) by readInput.
#define INPUT_IMAGE “My5.rgb”
// NOTE(review): this object-like macro rewrites every later `devices` token
// to 0, so an expression such as devices[0] becomes 0[0] and fails with
// C2109 "subscript requires array or pointer type" (the two errors in the
// build log). It looks unintended; consider removing it and using a
// cl_device_id array in Setup_OpenCL instead.
#define devices 0

// Global OpenCL state shared by the functions below; all handles start out
// NULL and are released by Cleanup_OpenCL.
cl_mem g_inputBuffer = NULL;
cl_mem g_outputBuffer = NULL;
cl_context g_context = NULL;
cl_command_queue g_cmd_queue = NULL;
cl_program g_program = NULL;
cl_kernel g_kernel = NULL;
// Number of work items / ray bunches; computed in readInput from the image
// size and adjusted in ExecuteGodRaysKernel.
cl_uint g_globalWorkSize = 0;

// Release every global OpenCL object referenced here: both buffers, the
// kernel, the program, the command queue and the context.
// Each handle is reset to NULL right after it is released, so calling this
// function again (or after another routine already released a buffer and
// cleared its global) never releases the same object twice.
void Cleanup_OpenCL()
{
	//release g_kernel, g_program, and memory objects
	if( g_inputBuffer )
	{
		clReleaseMemObject( g_inputBuffer );
		g_inputBuffer = NULL;
	}
	if( g_outputBuffer )
	{
		clReleaseMemObject( g_outputBuffer );
		g_outputBuffer = NULL;
	}
	if( g_kernel )
	{
		clReleaseKernel( g_kernel );
		g_kernel = NULL;
	}
	if( g_program )
	{
		clReleaseProgram( g_program );
		g_program = NULL;
	}
	if( g_cmd_queue )
	{
		clReleaseCommandQueue( g_cmd_queue );
		g_cmd_queue = NULL;
	}
	// the context goes last, after everything created from it
	if( g_context )
	{
		clReleaseContext( g_context );
		g_context = NULL;
	}
}

int Setup_OpenCL( const char *program_source )
{
cl_device_id devices_id;
size_t cb;
cl_uint size_ret = 0;
cl_int err;

cl_platform_id intel_platform_id = GetIntelOCLPlatform();
if( intel_platform_id == NULL )
{
	printf("ERROR: Failed to find Intel OpenCL platform.\n");
	return -1;
}

cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)intel_platform_id, NULL };

// create the OpenCL context on a CPU device
{ cl_context hContext;

hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
0, 0, 0);
if (g_context == (cl_context)0)
return -3; }

// get the list of CPU devices associated with context
err = clGetContextInfo(g_context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
clGetContextInfo(g_context, CL_CONTEXT_DEVICES, cb, devices, NULL);

g_cmd_queue = clCreateCommandQueue(g_context, devices[0], 0, NULL);
if (g_cmd_queue == (cl_command_queue)0)
{
	Cleanup_OpenCL();
	return -1;
}

char *sources = ReadSources("GodRays.cl");	//read program .cl source file
g_program = clCreateProgramWithSource(g_context, 1, (const char**)&sources, NULL, NULL);
if (g_program == (cl_program)0)
{
	printf("ERROR: Failed to create Program with source...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

err = clBuildProgram(g_program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to build program...\n");
	BuildFailLog(g_program, devices[0]);
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

g_kernel = clCreateKernel(g_program, "GodRays", NULL);
if (g_kernel == (cl_kernel)0)
{
	printf("ERROR: Failed to create kernel...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}
free(sources);

return 0; // success...

}

cl_float* readInput(cl_uint* arrayWidth, cl_uint* arrayHeight)
{

//Load from HDR-image

//!Variables 
int x = 0;
int y = 0;
int iMemSize = 0;
int iResultMemSize = 0;
float fTmpVal = 0.0f;
int iWidth = 0;
int iHeight = 0;
cl_float* inputArray = 0;

FILE* pRGBAFile = fopen(INPUT_IMAGE,"rb");
if(!pRGBAFile)
{
    printf("HOST: Failed to open the HDR image file!\n");
    return 0;
}

fread((void*)&iWidth, sizeof(int), 1, pRGBAFile);
fread((void*)&iHeight, sizeof(int), 1, pRGBAFile);
printf("width = %d\n", iWidth);
printf("height = %d\n", iHeight);

if(iWidth<=0 || iHeight<=0 || iWidth > 1000000 || iHeight > 1000000)
{
    printf("HOST: width or height values are invalid!\n");
    return 0;
}

//! The image size in memory (bytes).
iMemSize = iWidth*iHeight*4*sizeof(cl_float); 

//! Allocate memory.
inputArray = (cl_float*)_aligned_malloc(iMemSize, 16);
if(!inputArray)
{
	printf("Failed to allocate memory for input HDR image!\n");
	return 0;
}

//! Calculate global work size 
g_globalWorkSize = 2*(iWidth + iHeight-2)/15+1;
///g_globalWorkSize = 2*(iWidth + iHeight)/16;


//! Read data from the input file to memory. 
fread((void*)inputArray, 1, iMemSize, pRGBAFile);

//HDR-image hight & weight
*arrayWidth = iWidth;
*arrayHeight = iHeight;

fclose(pRGBAFile);

//Save input image in bitmap file
float fTmpFVal = 0.0f;
cl_uint* outUIntBuf=0;
outUIntBuf = (cl_uint*)malloc(iWidth*iHeight*sizeof(cl_uint));
if(!outUIntBuf)
{
	printf("Failed to allocate memory for output image!\n");
	return 0;
}
for(int y = 0; y < iHeight; y++)
{
    for(int x = 0; x < iWidth; x++)
    {
        // Ensure that no value is greater than 255.0
        cl_uint uiTmp[4];
        fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+0]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[0] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+1]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;       
		uiTmp[1] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+2]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
		uiTmp[2] = (cl_uint)(fTmpFVal);
        
        inputArray[(y*iWidth+x)*4+3] = 0.0f;
		fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+3]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[3] = (cl_uint)(fTmpFVal);	//Alfa
        
        outUIntBuf[(iHeight-1-y)*iWidth+x] = 0x000000FF & uiTmp[2];
        outUIntBuf[(iHeight-1-y)*iWidth+x] |= 0x0000FF00 & ((uiTmp[1]) << 8);
        outUIntBuf[(iHeight-1-y)*iWidth+x] |= 0x00FF0000 & ((uiTmp[0]) << 16);
        outUIntBuf[(iHeight-1-y)*iWidth+x] |= 0xFF000000 & ((uiTmp[3]) << 24);
    }
}	
//----
SaveImageAsBMP( outUIntBuf, iWidth, iHeight, "GodRaysInput.bmp");
free(outUIntBuf);

return inputArray;

}

//declaration of native function
// Host-side ray evaluation used by ExecuteGodRaysReference; only the
// prototype is visible here, the definition lives elsewhere.
// Parameter notes below describe what the single visible call site passes.
void EvaluateRay(
float* inBuf,			// input image (inputArray at the call site)
int iw,					// image width (arrayWidth)
int ih,					// image height (arrayHeight)
int blend,				// blend flag forwarded unchanged from the caller
float* outBuf,			// output image (outputArray)
int in_RayNum,			// loop index, 0 .. g_globalWorkSize-1
int god_rays_b_size		// GOD_RAYS_BUNCH_SIZE at the call site
);

// Host reference path: calls EvaluateRay once for every index from 0 up to
// (but not including) g_globalWorkSize, always with GOD_RAYS_BUNCH_SIZE as
// the bunch size, reading inputArray and writing outputArray.
void ExecuteGodRaysReference(cl_float* inputArray, cl_float* outputArray, cl_uint arrayWidth, cl_uint arrayHeight, cl_uint blend)
{
	cl_uint bunchIndex = 0;

	//rays bunch loop
	while (bunchIndex < g_globalWorkSize)
	{
		EvaluateRay(inputArray, arrayWidth, arrayHeight, blend, outputArray, bunchIndex, GOD_RAYS_BUNCH_SIZE);
		++bunchIndex;
	}
}

bool ExecuteGodRaysKernel(cl_float* inputArray, cl_float* outputArray, cl_uint arrayWidth, cl_uint arrayHeight, cl_uint blend)
{
cl_int err = CL_SUCCESS;

// allocate the buffer
g_inputBuffer = clCreateBuffer(g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * 4 * arrayWidth * arrayHeight, inputArray, NULL);

if (g_inputBuffer == (cl_mem)0)
{
	printf("ERROR: Failed to create Input Buffer...\n");
	return false;
}

g_outputBuffer = clCreateBuffer(g_context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * 4 * arrayWidth * arrayHeight, NULL, NULL);

if (g_outputBuffer == (cl_mem)0)
{
	printf("ERROR: Failed to create Output Buffer...\n");
	return false;
}


err  = clSetKernelArg(g_kernel, 0, sizeof(cl_mem), (void *) &g_inputBuffer);
err  |= clSetKernelArg(g_kernel, 1, sizeof(cl_mem), (void *) &g_outputBuffer);
err  |= clSetKernelArg(g_kernel, 2, sizeof(cl_int), (void *) &arrayWidth);
err  |= clSetKernelArg(g_kernel, 3, sizeof(cl_int), (void *) &arrayHeight);
err  |= clSetKernelArg(g_kernel, 4, sizeof(cl_int), (void *) &blend);

if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to set input g_kernel arguments...\n");
	return false;
}


size_t globalWorkSize[1];
size_t localWorkSize[1]= { BLOCK_DIM_X };
globalWorkSize[0] = g_globalWorkSize;
printf("Original global work size %d\n", globalWorkSize[0]);
printf("Original local work size %d\n", localWorkSize[0]);
globalWorkSize[0] = (globalWorkSize[0] + (localWorkSize[0]-1)) & ~(localWorkSize[0]-1);
printf("Corrected global work size %d\n", globalWorkSize[0]);
g_globalWorkSize = globalWorkSize[0]; 

// execute kernel
if (CL_SUCCESS != clEnqueueNDRangeKernel(g_cmd_queue, g_kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))
{
    printf("ERROR: Failed to set input kernel arguments...\n");
    return false;
}

err = clEnqueueReadBuffer(g_cmd_queue, g_outputBuffer, CL_TRUE, 0, sizeof(cl_float) * 4 * arrayWidth * arrayHeight, outputArray, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to read buffer...\n");
	return false;
}

err = clFinish(g_cmd_queue);

clReleaseMemObject(g_inputBuffer);
clReleaseMemObject(g_outputBuffer);

return err == CL_SUCCESS;

}

// main execution routine - perform God Rays post-processing on float4 vectors
int _tmain(int argc, _TCHAR* argv)
{
cl_uint arrayWidth;
cl_uint arrayHeight;
cl_float* inputArray = 0;
cl_uint blend = 1;

//read input image
inputArray = readInput(&arrayWidth, &arrayHeight);

//initialize Open CL objects (context, queue, etc.)
if( 0 != Setup_OpenCL("MedianFilter.cl") )
	return -1;

printf("Input size is %d X %d\n", arrayWidth, arrayHeight);
cl_float* outputArray = (cl_float*)_aligned_malloc(sizeof(cl_float) * 4 * arrayWidth * arrayHeight, 16);
cl_float* refArray = (cl_float*)_aligned_malloc(sizeof(cl_float) * 4 * arrayWidth * arrayHeight, 16);


//do god rays
printf("Executing OpenCL kernel...\n");
ExecuteGodRaysKernel(inputArray, outputArray, arrayWidth, arrayHeight, blend);

printf("Executing reference...\n");
ExecuteGodRaysReference(inputArray, refArray, arrayWidth, arrayHeight, blend);

//save results in bitmap files

float fTmpFVal = 0.0f;
cl_uint* outUIntBuf=0;
outUIntBuf = (cl_uint*)malloc(arrayWidth*arrayHeight*sizeof(cl_uint));
if(!outUIntBuf)
{
	printf("Failed to allocate memory for output BMP image!\n");
	return -1;
}
for(cl_uint y = 0; y < arrayHeight; y++)
{
    for(cl_uint x = 0; x < arrayWidth; x++)
    {
        // Ensure that no value is greater than 255.0
        cl_uint uiTmp[4];
        fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+0]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[0] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+1]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;       
		uiTmp[1] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+2]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
		uiTmp[2] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+3]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[3] = 1;	//Alfa
        
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] = 0x000000FF & uiTmp[2];
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x0000FF00 & ((uiTmp[1]) << 8);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x00FF0000 & ((uiTmp[0]) << 16);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0xFF000000 & ((uiTmp[3]) << 24);
    }
}	
//----
SaveImageAsBMP( outUIntBuf, arrayWidth, arrayHeight, "GodRaysOutput.bmp");

for(cl_uint y = 0; y < arrayHeight; y++)
{
    for(cl_uint x = 0; x < arrayWidth; x++)
    {
        // Ensure that no value is greater than 255.0
        cl_uint uiTmp[4];
        fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+0]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[0] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+1]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;       
		uiTmp[1] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+2]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
		uiTmp[2] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+3]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[3] = 1;	//Alfa
        
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] = 0x000000FF & uiTmp[2];
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x0000FF00 & ((uiTmp[1]) << 8);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x00FF0000 & ((uiTmp[0]) << 16);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0xFF000000 & ((uiTmp[3]) << 24);
    }
}	
//----
SaveImageAsBMP( outUIntBuf, arrayWidth, arrayHeight, "GodRaysOutputRefernce.bmp");
free(outUIntBuf);

//Do verification
printf("Performing verification...\n");
bool result = true;
for(cl_uint i = 0; i < arrayWidth*arrayHeight*4; i++)	
{
	//Compare the data
	if( fabsf(outputArray[i] - refArray[i]) > 0.01 )	
    {
		printf("Error at location %d,  outputArray = %f, refArray = %f \n", i, outputArray[i], refArray[i]);
        result = false;
	}
}
if(!result)
{
    printf("ERROR: Verification failed.\n");
}

_aligned_free( refArray );
_aligned_free( inputArray );
_aligned_free( outputArray );

Cleanup_OpenCL();

return 0;

}

#pragma warning( pop )

Now I got 1>------ Build started: Project: GodRays, Configuration: Debug Win32 ------
1> GodRays.cpp
1>c:\users\public\documents\intel\opencl sdk\samples\godrays\godrays.cpp(82): error C2109: subscript requires array or pointer type
1>c:\users\public\documents\intel\opencl sdk\samples\godrays\godrays.cpp(103): error C2109: subscript requires array or pointer type
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

code
// Copyright (c) 2009-2010 Intel Corporation
// All rights reserved.
//
// WARRANTY DISCLAIMER
//
// THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
// MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of the Materials, and requests that all
// problem reports or change requests be submitted to it directly

#include “stdafx.h”
#include “CL\cl.h”
#include “utils.h”

//we want to use POSIX functions
// Save the current warning state, then silence MSVC warning 4996 for the
// code that follows.
#pragma warning( push )
#pragma warning( disable : 4996 )

// Local work-group size handed to clEnqueueNDRangeKernel in ExecuteGodRaysKernel.
#define BLOCK_DIM_X 1
// Not referenced by the code visible in this file section.
#define BLOCK_DIM_Y 1
// Passed as the last argument of EvaluateRay by ExecuteGodRaysReference.
#define GOD_RAYS_BUNCH_SIZE 15 //16
// File opened (binary mode) by readInput.
#define INPUT_IMAGE “My5.rgb”
// NOTE(review): this object-like macro rewrites every later `devices` token
// to 0, so an expression such as devices[0] becomes 0[0] and fails with
// C2109 "subscript requires array or pointer type" (the two errors in the
// build log). It looks unintended; consider removing it and using a
// cl_device_id array in Setup_OpenCL instead.
#define devices 0

// Global OpenCL state shared by the functions below; all handles start out
// NULL and are released by Cleanup_OpenCL.
cl_mem g_inputBuffer = NULL;
cl_mem g_outputBuffer = NULL;
cl_context g_context = NULL;
cl_command_queue g_cmd_queue = NULL;
cl_program g_program = NULL;
cl_kernel g_kernel = NULL;
// Number of work items / ray bunches; computed in readInput from the image
// size and adjusted in ExecuteGodRaysKernel.
cl_uint g_globalWorkSize = 0;

// Release every global OpenCL object referenced here: both buffers, the
// kernel, the program, the command queue and the context.
// Each handle is reset to NULL right after it is released, so calling this
// function again (or after another routine already released a buffer and
// cleared its global) never releases the same object twice.
void Cleanup_OpenCL()
{
	//release g_kernel, g_program, and memory objects
	if( g_inputBuffer )
	{
		clReleaseMemObject( g_inputBuffer );
		g_inputBuffer = NULL;
	}
	if( g_outputBuffer )
	{
		clReleaseMemObject( g_outputBuffer );
		g_outputBuffer = NULL;
	}
	if( g_kernel )
	{
		clReleaseKernel( g_kernel );
		g_kernel = NULL;
	}
	if( g_program )
	{
		clReleaseProgram( g_program );
		g_program = NULL;
	}
	if( g_cmd_queue )
	{
		clReleaseCommandQueue( g_cmd_queue );
		g_cmd_queue = NULL;
	}
	// the context goes last, after everything created from it
	if( g_context )
	{
		clReleaseContext( g_context );
		g_context = NULL;
	}
}

int Setup_OpenCL( const char *program_source )
{
cl_device_id devices_id;
size_t cb;
cl_uint size_ret = 0;
cl_int err;

cl_platform_id intel_platform_id = GetIntelOCLPlatform();
if( intel_platform_id == NULL )
{
	printf("ERROR: Failed to find Intel OpenCL platform.\n");
	return -1;
}

cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)intel_platform_id, NULL };

// create the OpenCL context on a CPU device
{ cl_context hContext;

hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
0, 0, 0);
if (g_context == (cl_context)0)
return -3; }

// get the list of CPU devices associated with context
err = clGetContextInfo(g_context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
clGetContextInfo(g_context, CL_CONTEXT_DEVICES, cb, devices, NULL);

g_cmd_queue = clCreateCommandQueue(g_context, devices[0], 0, NULL);
if (g_cmd_queue == (cl_command_queue)0)
{
	Cleanup_OpenCL();
	return -1;
}

char *sources = ReadSources("GodRays.cl");	//read program .cl source file
g_program = clCreateProgramWithSource(g_context, 1, (const char**)&sources, NULL, NULL);
if (g_program == (cl_program)0)
{
	printf("ERROR: Failed to create Program with source...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

err = clBuildProgram(g_program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to build program...\n");
	BuildFailLog(g_program, devices[0]);
	Cleanup_OpenCL();
	free(sources);
	return -1;
}

g_kernel = clCreateKernel(g_program, "GodRays", NULL);
if (g_kernel == (cl_kernel)0)
{
	printf("ERROR: Failed to create kernel...\n");
	Cleanup_OpenCL();
	free(sources);
	return -1;
}
free(sources);

return 0; // success...

}

cl_float* readInput(cl_uint* arrayWidth, cl_uint* arrayHeight)
{

//Load from HDR-image

//!Variables 
int x = 0;
int y = 0;
int iMemSize = 0;
int iResultMemSize = 0;
float fTmpVal = 0.0f;
int iWidth = 0;
int iHeight = 0;
cl_float* inputArray = 0;

FILE* pRGBAFile = fopen(INPUT_IMAGE,"rb");
if(!pRGBAFile)
{
    printf("HOST: Failed to open the HDR image file!\n");
    return 0;
}

fread((void*)&iWidth, sizeof(int), 1, pRGBAFile);
fread((void*)&iHeight, sizeof(int), 1, pRGBAFile);
printf("width = %d\n", iWidth);
printf("height = %d\n", iHeight);

if(iWidth<=0 || iHeight<=0 || iWidth > 1000000 || iHeight > 1000000)
{
    printf("HOST: width or height values are invalid!\n");
    return 0;
}

//! The image size in memory (bytes).
iMemSize = iWidth*iHeight*4*sizeof(cl_float); 

//! Allocate memory.
inputArray = (cl_float*)_aligned_malloc(iMemSize, 16);
if(!inputArray)
{
	printf("Failed to allocate memory for input HDR image!\n");
	return 0;
}

//! Calculate global work size 
g_globalWorkSize = 2*(iWidth + iHeight-2)/15+1;
///g_globalWorkSize = 2*(iWidth + iHeight)/16;


//! Read data from the input file to memory. 
fread((void*)inputArray, 1, iMemSize, pRGBAFile);

//HDR-image hight & weight
*arrayWidth = iWidth;
*arrayHeight = iHeight;

fclose(pRGBAFile);

//Save input image in bitmap file
float fTmpFVal = 0.0f;
cl_uint* outUIntBuf=0;
outUIntBuf = (cl_uint*)malloc(iWidth*iHeight*sizeof(cl_uint));
if(!outUIntBuf)
{
	printf("Failed to allocate memory for output image!\n");
	return 0;
}
for(int y = 0; y < iHeight; y++)
{
    for(int x = 0; x < iWidth; x++)
    {
        // Ensure that no value is greater than 255.0
        cl_uint uiTmp[4];
        fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+0]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[0] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+1]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;       
		uiTmp[1] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+2]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
		uiTmp[2] = (cl_uint)(fTmpFVal);
        
        inputArray[(y*iWidth+x)*4+3] = 0.0f;
		fTmpFVal = (255.0f*inputArray[(y*iWidth+x)*4+3]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[3] = (cl_uint)(fTmpFVal);	//Alfa
        
        outUIntBuf[(iHeight-1-y)*iWidth+x] = 0x000000FF & uiTmp[2];
        outUIntBuf[(iHeight-1-y)*iWidth+x] |= 0x0000FF00 & ((uiTmp[1]) << 8);
        outUIntBuf[(iHeight-1-y)*iWidth+x] |= 0x00FF0000 & ((uiTmp[0]) << 16);
        outUIntBuf[(iHeight-1-y)*iWidth+x] |= 0xFF000000 & ((uiTmp[3]) << 24);
    }
}	
//----
SaveImageAsBMP( outUIntBuf, iWidth, iHeight, "GodRaysInput.bmp");
free(outUIntBuf);

return inputArray;

}

//declaration of native function
// Host-side ray evaluation used by ExecuteGodRaysReference; only the
// prototype is visible here, the definition lives elsewhere.
// Parameter notes below describe what the single visible call site passes.
void EvaluateRay(
float* inBuf,			// input image (inputArray at the call site)
int iw,					// image width (arrayWidth)
int ih,					// image height (arrayHeight)
int blend,				// blend flag forwarded unchanged from the caller
float* outBuf,			// output image (outputArray)
int in_RayNum,			// loop index, 0 .. g_globalWorkSize-1
int god_rays_b_size		// GOD_RAYS_BUNCH_SIZE at the call site
);

// Host reference path: calls EvaluateRay once for every index from 0 up to
// (but not including) g_globalWorkSize, always with GOD_RAYS_BUNCH_SIZE as
// the bunch size, reading inputArray and writing outputArray.
void ExecuteGodRaysReference(cl_float* inputArray, cl_float* outputArray, cl_uint arrayWidth, cl_uint arrayHeight, cl_uint blend)
{
	cl_uint bunchIndex = 0;

	//rays bunch loop
	while (bunchIndex < g_globalWorkSize)
	{
		EvaluateRay(inputArray, arrayWidth, arrayHeight, blend, outputArray, bunchIndex, GOD_RAYS_BUNCH_SIZE);
		++bunchIndex;
	}
}

bool ExecuteGodRaysKernel(cl_float* inputArray, cl_float* outputArray, cl_uint arrayWidth, cl_uint arrayHeight, cl_uint blend)
{
cl_int err = CL_SUCCESS;

// allocate the buffer
g_inputBuffer = clCreateBuffer(g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * 4 * arrayWidth * arrayHeight, inputArray, NULL);

if (g_inputBuffer == (cl_mem)0)
{
	printf("ERROR: Failed to create Input Buffer...\n");
	return false;
}

g_outputBuffer = clCreateBuffer(g_context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * 4 * arrayWidth * arrayHeight, NULL, NULL);

if (g_outputBuffer == (cl_mem)0)
{
	printf("ERROR: Failed to create Output Buffer...\n");
	return false;
}


err  = clSetKernelArg(g_kernel, 0, sizeof(cl_mem), (void *) &g_inputBuffer);
err  |= clSetKernelArg(g_kernel, 1, sizeof(cl_mem), (void *) &g_outputBuffer);
err  |= clSetKernelArg(g_kernel, 2, sizeof(cl_int), (void *) &arrayWidth);
err  |= clSetKernelArg(g_kernel, 3, sizeof(cl_int), (void *) &arrayHeight);
err  |= clSetKernelArg(g_kernel, 4, sizeof(cl_int), (void *) &blend);

if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to set input g_kernel arguments...\n");
	return false;
}


size_t globalWorkSize[1];
size_t localWorkSize[1]= { BLOCK_DIM_X };
globalWorkSize[0] = g_globalWorkSize;
printf("Original global work size %d\n", globalWorkSize[0]);
printf("Original local work size %d\n", localWorkSize[0]);
globalWorkSize[0] = (globalWorkSize[0] + (localWorkSize[0]-1)) & ~(localWorkSize[0]-1);
printf("Corrected global work size %d\n", globalWorkSize[0]);
g_globalWorkSize = globalWorkSize[0]; 

// execute kernel
if (CL_SUCCESS != clEnqueueNDRangeKernel(g_cmd_queue, g_kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))
{
    printf("ERROR: Failed to set input kernel arguments...\n");
    return false;
}

err = clEnqueueReadBuffer(g_cmd_queue, g_outputBuffer, CL_TRUE, 0, sizeof(cl_float) * 4 * arrayWidth * arrayHeight, outputArray, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
	printf("ERROR: Failed to read buffer...\n");
	return false;
}

err = clFinish(g_cmd_queue);

clReleaseMemObject(g_inputBuffer);
clReleaseMemObject(g_outputBuffer);

return err == CL_SUCCESS;

}

// main execution routine - perform God Rays post-processing on float4 vectors
int _tmain(int argc, _TCHAR* argv)
{
cl_uint arrayWidth;
cl_uint arrayHeight;
cl_float* inputArray = 0;
cl_uint blend = 1;

//read input image
inputArray = readInput(&arrayWidth, &arrayHeight);

//initialize Open CL objects (context, queue, etc.)
if( 0 != Setup_OpenCL("MedianFilter.cl") )
	return -1;

printf("Input size is %d X %d\n", arrayWidth, arrayHeight);
cl_float* outputArray = (cl_float*)_aligned_malloc(sizeof(cl_float) * 4 * arrayWidth * arrayHeight, 16);
cl_float* refArray = (cl_float*)_aligned_malloc(sizeof(cl_float) * 4 * arrayWidth * arrayHeight, 16);


//do god rays
printf("Executing OpenCL kernel...\n");
ExecuteGodRaysKernel(inputArray, outputArray, arrayWidth, arrayHeight, blend);

printf("Executing reference...\n");
ExecuteGodRaysReference(inputArray, refArray, arrayWidth, arrayHeight, blend);

//save results in bitmap files

float fTmpFVal = 0.0f;
cl_uint* outUIntBuf=0;
outUIntBuf = (cl_uint*)malloc(arrayWidth*arrayHeight*sizeof(cl_uint));
if(!outUIntBuf)
{
	printf("Failed to allocate memory for output BMP image!\n");
	return -1;
}
for(cl_uint y = 0; y < arrayHeight; y++)
{
    for(cl_uint x = 0; x < arrayWidth; x++)
    {
        // Ensure that no value is greater than 255.0
        cl_uint uiTmp[4];
        fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+0]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[0] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+1]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;       
		uiTmp[1] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+2]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
		uiTmp[2] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*outputArray[(y*arrayWidth+x)*4+3]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[3] = 1;	//Alfa
        
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] = 0x000000FF & uiTmp[2];
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x0000FF00 & ((uiTmp[1]) << 8);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x00FF0000 & ((uiTmp[0]) << 16);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0xFF000000 & ((uiTmp[3]) << 24);
    }
}	
//----
SaveImageAsBMP( outUIntBuf, arrayWidth, arrayHeight, "GodRaysOutput.bmp");

for(cl_uint y = 0; y < arrayHeight; y++)
{
    for(cl_uint x = 0; x < arrayWidth; x++)
    {
        // Ensure that no value is greater than 255.0
        cl_uint uiTmp[4];
        fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+0]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[0] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+1]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;       
		uiTmp[1] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+2]);
		if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
		uiTmp[2] = (cl_uint)(fTmpFVal);
        
		fTmpFVal = (255.0f*refArray[(y*arrayWidth+x)*4+3]);
        if(fTmpFVal>255.0f) 
			fTmpFVal=255.0f;
        uiTmp[3] = 1;	//Alfa
        
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] = 0x000000FF & uiTmp[2];
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x0000FF00 & ((uiTmp[1]) << 8);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0x00FF0000 & ((uiTmp[0]) << 16);
        outUIntBuf[(arrayHeight-1-y)*arrayWidth+x] |= 0xFF000000 & ((uiTmp[3]) << 24);
    }
}	
//----
SaveImageAsBMP( outUIntBuf, arrayWidth, arrayHeight, "GodRaysOutputRefernce.bmp");
free(outUIntBuf);

//Do verification
printf("Performing verification...\n");
bool result = true;
for(cl_uint i = 0; i < arrayWidth*arrayHeight*4; i++)	
{
	//Compare the data
	if( fabsf(outputArray[i] - refArray[i]) > 0.01 )	
    {
		printf("Error at location %d,  outputArray = %f, refArray = %f \n", i, outputArray[i], refArray[i]);
        result = false;
	}
}
if(!result)
{
    printf("ERROR: Verification failed.\n");
}

_aligned_free( refArray );
_aligned_free( inputArray );
_aligned_free( outputArray );

Cleanup_OpenCL();

return 0;

}

#pragma warning( pop )

Well, as you can see, the main problem is the comparison against the vendor string:

if (!strcmp(pVendorName, "Intel Corporation"))

Look at the NVIDIA SDK sample to see what it compares against (strcmp(pbuf, "NVIDIA Corporation")), or simply take the first platform that suits your needs, without comparing against a specific string. You can query whether the platform supports textures, how large the device memory is, and so on. Read the OpenCL specification for more details.