Double & OpenCL

Letinono · March 25, 2010, 9:19am

Hello,

I try to transform float to double in my simple program that only makes a product of one array and a value.

It works fine with float but when I pass it to double, results are wrong.

This is my kernel:

[codebox]#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void VectorMul(__global const double* a, double b, __global double* c, int iNumElements)

{

// get index into global data array

int iGID = get_global_id(0);

// bound check (equivalent to the limit on a ‘for’ loop for standard/serial C code

if (iGID >= iNumElements)

{   

    return; 

}

// Mul the vector elements

c[iGID] = a[iGID] * b;

}

[/codebox]

all datas are set to double or cl_double.

My computer spec:

Linux CentOS 5.1 64bit
devdriver_3.0_linux_64_195.36.15
cudatoolkit_3.0_linux_64_rhel5.3
gpucomputingsdk_3.0_linux
GeForce GTS 250

Here is my .cpp file, I try to add it in attachment but it doesn"t work.

[codebox]//************************************************************

//

// Multiply a vector by a float data

//

//************************************************************

// common SDK header for standard utilities and system libs

#include <oclUtils.h>

// Name of the file with the source code for the computation kernel

// ************************************************************

const char* cSourceFile = “VectorMul.cl”;

// Host buffers for demo

// ************************************************************

void *src, *dst, *temp; // Host buffers for OpenCL test

// OpenCL Vars

cl_platform_id cpPlatform;

cl_uint uiNumDevices;

cl_context cxGPUContext; // OpenCL context

cl_command_queue cqCommandQue; // OpenCL command que

cl_device_id* cdDevices; // OpenCL device list

cl_program cpProgram; // OpenCL program

cl_kernel ckKernel; // OpenCL kernel

cl_mem cmDevSrc; // OpenCL device source buffer

cl_mem cmDevDst; // OpenCL device destination buffer

size_t szGlobalWorkSize; // 1D var for Total # of work items

size_t szLocalWorkSize; // 1D var for # of work items in the work group

size_t szParmDataBytes; // Byte size of context information

size_t szKernelLength; // Byte size of kernel code

cl_int ciErr1, ciErr2; // Error code var

char* cPathAndName = NULL; // var for full paths to data, src, etc.

char* cSourceCL = NULL; // Buffer to hold source for compilation

// typedef

typedef cl_double cl_type;

typedef double local_type;

// demo config vars

int iNumElements = 8388608;//16777216;//11444777; // Length of float arrays to process (odd # for illustration)

local_type value = 10.0;

shrBOOL bQuickTest = shrFALSE;

// Forward Declarations

// ************************************************************

void Cleanup (int iExitCode);

#pragma OPENCL EXTENSION cl_khr_fp64: enable

// Main function

// ************************************************************

int main(int argc, char **argv)

{

// get command line arg for quick test, if provided

bQuickTest = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

// start logs

shrSetLogFileName ("oclVectorMul.txt");

// shrLog(“%s Starting…\n\n# of float elements per Array \t= %u\n”, argv[0], iNumElements);

// set and log Global and Local work size dimensions

szLocalWorkSize = 256;

szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize

// shrLog(“Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n”,

// szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

// Allocate and initialize host arrays

src = (void *)malloc(sizeof(cl_type) * szGlobalWorkSize);

dst = (void *)malloc(sizeof(cl_type) * szGlobalWorkSize);

temp = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);

shrFillArray((float *)temp, iNumElements);

local_type *src2;

src2 = (local_type *)src;

float *temp2;

temp2 = (float *)temp;

for (int i=0; i<iNumElements; i++)

{

src2[i] = (local_type)temp2[i];

}

// shrLog( “Allocate and Init Host Mem…\n”);

//Get the NVIDIA platform

ciErr1 = oclGetPlatformID(&cpPlatform);

oclCheckError(ciErr1, CL_SUCCESS);

//Get the devices

ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);

oclCheckError(ciErr1, CL_SUCCESS);

cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );

ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);

oclCheckError(ciErr1, CL_SUCCESS);

//Create the context

cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErr1);

oclCheckError(ciErr1, CL_SUCCESS);

// Create a command-queue

cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);

// shrLog(“clCreateCommandQueue…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateCommandQueue, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Allocate the OpenCL source and result buffer memory objects on the device GMEM, and copy the data to the device

shrDeltaT(0);

cmDevSrc = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_type) * szGlobalWorkSize, src, &ciErr1);

double memCpyHDTime = shrDeltaT(0);

cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_type) * szGlobalWorkSize, NULL, &ciErr2);

ciErr1 |= ciErr2;

// shrLog(“clCreateBuffer…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateBuffer, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Read the OpenCL kernel in from source file

cPathAndName = shrFindFilePath(cSourceFile, argv[0]);

cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

// shrLog(“oclLoadProgSource (%s)…\n”, cSourceFile);

// Create the program

cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);

// shrLog(“clCreateProgramWithSource…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateProgramWithSource, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

ciErr1 = clBuildProgram(cpProgram, 0, NULL, “-cl-fast-relaxed-math”, NULL, NULL);

if (ciErr1 != CL_SUCCESS)

{

    // write out standard error, Build Log and PTX, then cleanup and exit

    shrLogEx(LOGBOTH | ERRORMSG, ciErr1, STDERROR);

    oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));

    oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclCorrel.ptx");

    oclCheckError(ciErr1, CL_SUCCESS); 

}

// Create the kernel

ckKernel = clCreateKernel(cpProgram, "VectorMul", &ciErr1);

// shrLog(“clCreateKernel…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateKernel, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Set the Argument values

ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc);

ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_type), (void*)&value);

ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);

ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);

// shrLog(“clSetKernelArg…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clSetKernelArg, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Launch kernel

shrDeltaT(0);

ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);

double computeTime = shrDeltaT(0);

// shrLog(“clEnqueueNDRangeKernel…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clEnqueueNDRangeKernel, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Synchronous/blocking read of results, and check accumulated errors

shrDeltaT(0);

ciErr1 = clEnqueueReadBuffer(cqCommandQue, cmDevDst, CL_TRUE, 0, sizeof(cl_type) * szGlobalWorkSize, dst, 0, NULL, NULL);

double memCpyDHTime = shrDeltaT(0);

// shrLog(“clEnqueueReadBuffer…\n\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clEnqueueReadBuffer, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

printf("\nMultiplication\n\n\t*%d datas\n\t*%d Global Work Size\n\t*%d Local Work Size\n", iNumElements, (int)szGlobalWorkSize, (int)szLocalWorkSize);

printf("\n=============================================\n");

printf("Time to copy datas HOST -> DEVICE : %.3f ms\n", (memCpyHDTime*1000));

printf("Time to compute                   : %.3f ms\n", (computeTime*1000));

printf("Time to copy datas DEVICE -> HOST : %.3f ms\n", (memCpyDHTime*1000));

printf("--------------------------------------------\n");

printf("Total time                        : %.3f ms\n", ((memCpyHDTime+computeTime+memCpyDHTime)*1000));

printf("=============================================\n\n");

local_type *srcf, *dstf;

srcf = (local_type*)src;

dstf = (local_type*)dst;

for (int i=0; i<iNumElements; i++)

{

	if (srcf[i]*10 != dstf[i])

		printf("Error at indice %d : %.3f != %.3f\n", i, srcf[i]*10, dstf[i]);

}

// Cleanup and leave

Cleanup (EXIT_SUCCESS);

}

void Cleanup (int iExitCode)

{

// Cleanup allocated objects

shrLog("\nStarting Cleanup...\n\n");

if(cdDevices)free(cdDevices);

if(cPathAndName)free(cPathAndName);

if(cSourceCL)free(cSourceCL);

if(ckKernel)clReleaseKernel(ckKernel);  

if(cpProgram)clReleaseProgram(cpProgram);

if(cqCommandQue)clReleaseCommandQueue(cqCommandQue);

if(cxGPUContext)clReleaseContext(cxGPUContext);

if(cmDevSrc)clReleaseMemObject(cmDevSrc);

if(cmDevDst)clReleaseMemObject(cmDevDst);

// Free host memory

free(src); 

free (dst);

// finalize logs and leave

if (bQuickTest)

{

    shrLog("oclVectorMul Ending...\n");

}

else 

{

    shrLog("oclVectorMul Ending...\nPress Enter to Exit\n");

    getchar();

}

exit (iExitCode);

}

[/codebox]

Thanks.

Letinono · March 25, 2010, 10:02am

It seems my geForce GTS 250 cannot compute datas in double precision.

madsen · March 25, 2010, 10:09am

The oclDeviceQuery program will show if you card support this extension. My 285 does, but my 8600 GT does not. Doubles work fine with the 285, but are slooooow. I have an include file which defines a type scalar of either type float or double. I can then quickly change from float to double. I include this on both the CPU (after including cl.h) and on the OpenCL codes.

#ifndef USE_DOUBLE

define USE_DOUBLE 0

endif

#if USE_DOUBLE

ifdef __OPENCL_CL_H

typedef cl_double cl_scalar;

else

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

endif

typedef double scalar;

else

ifdef __OPENCL_CL_H

typedef cl_float cl_scalar;

endif

typedef float scalar;

endif

Letinono · March 25, 2010, 10:25am

I was so convinced she managed the double I have not thought about watching the deviceQuery.

Thanks a lot for this code.

Letinono · March 25, 2010, 10:28am

Can you give me the loss engendered by the double on GTX285? Regardless of the kernel code.

Thanks.

Kerilk · March 25, 2010, 2:12pm

I’ve had problem with similar code :

[codebox]

if (iGID >= iNumElements)

{

return;

}

[/codebox]

if return was not aligned with half warps.

I would advise against it and just let the threads do nothing until your algorithm finishes.

I have similar algorithms working flawlessly on a GTX 260.