Double & OpenCL

Hello,

I try to transform float to double in my simple program that only makes a product of one array and a value.

It works fine with float but when I pass it to double, results are wrong.

This is my kernel:

[codebox]#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void VectorMul(__global const double* a, double b, __global double* c, int iNumElements)

{

// get index into global data array

int iGID = get_global_id(0);

// bound check (equivalent to the limit on a ‘for’ loop for standard/serial C code

if (iGID >= iNumElements)

{   

    return; 

}

// Mul the vector elements

c[iGID] = a[iGID] * b;

}

[/codebox]

all datas are set to double or cl_double.

My computer spec:

  • Linux CentOS 5.1 64bit

  • devdriver_3.0_linux_64_195.36.15

  • cudatoolkit_3.0_linux_64_rhel5.3

  • gpucomputingsdk_3.0_linux

  • GeForce GTS 250

Here is my .cpp file, I try to add it in attachment but it doesn"t work.

[codebox]//************************************************************


//

// Multiply a vector by a float data

//

//************************************************************


// common SDK header for standard utilities and system libs

#include <oclUtils.h>

// Name of the file with the source code for the computation kernel

// ************************************************************


const char* cSourceFile = “VectorMul.cl”;

// Host buffers for demo

// ************************************************************


void *src, *dst, *temp; // Host buffers for OpenCL test

// OpenCL Vars

cl_platform_id cpPlatform;

cl_uint uiNumDevices;

cl_context cxGPUContext; // OpenCL context

cl_command_queue cqCommandQue; // OpenCL command que

cl_device_id* cdDevices; // OpenCL device list

cl_program cpProgram; // OpenCL program

cl_kernel ckKernel; // OpenCL kernel

cl_mem cmDevSrc; // OpenCL device source buffer

cl_mem cmDevDst; // OpenCL device destination buffer

size_t szGlobalWorkSize; // 1D var for Total # of work items

size_t szLocalWorkSize; // 1D var for # of work items in the work group

size_t szParmDataBytes; // Byte size of context information

size_t szKernelLength; // Byte size of kernel code

cl_int ciErr1, ciErr2; // Error code var

char* cPathAndName = NULL; // var for full paths to data, src, etc.

char* cSourceCL = NULL; // Buffer to hold source for compilation

// typedef

typedef cl_double cl_type;

typedef double local_type;

// demo config vars

int iNumElements = 8388608;//16777216;//11444777; // Length of float arrays to process (odd # for illustration)

local_type value = 10.0;

shrBOOL bQuickTest = shrFALSE;

// Forward Declarations

// ************************************************************


void Cleanup (int iExitCode);

#pragma OPENCL EXTENSION cl_khr_fp64: enable

// Main function

// ************************************************************


int main(int argc, char **argv)

{

// get command line arg for quick test, if provided

bQuickTest = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

// start logs

shrSetLogFileName ("oclVectorMul.txt");

// shrLog(“%s Starting…\n\n# of float elements per Array \t= %u\n”, argv[0], iNumElements);

// set and log Global and Local work size dimensions

szLocalWorkSize = 256;

szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize

// shrLog(“Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n”,

// szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

// Allocate and initialize host arrays

src = (void *)malloc(sizeof(cl_type) * szGlobalWorkSize);

dst = (void *)malloc(sizeof(cl_type) * szGlobalWorkSize);

temp = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);

shrFillArray((float *)temp, iNumElements);

local_type *src2;

src2 = (local_type *)src;

float *temp2;

temp2 = (float *)temp;

for (int i=0; i<iNumElements; i++)

{

src2[i] = (local_type)temp2[i];

}

// shrLog( “Allocate and Init Host Mem…\n”);

//Get the NVIDIA platform

ciErr1 = oclGetPlatformID(&cpPlatform);

oclCheckError(ciErr1, CL_SUCCESS);

//Get the devices

ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);

oclCheckError(ciErr1, CL_SUCCESS);

cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );

ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);

oclCheckError(ciErr1, CL_SUCCESS);

//Create the context

cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErr1);

oclCheckError(ciErr1, CL_SUCCESS);

// Create a command-queue

cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);

// shrLog(“clCreateCommandQueue…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateCommandQueue, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Allocate the OpenCL source and result buffer memory objects on the device GMEM, and copy the data to the device

shrDeltaT(0);

cmDevSrc = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_type) * szGlobalWorkSize, src, &ciErr1);

double memCpyHDTime = shrDeltaT(0);

cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_type) * szGlobalWorkSize, NULL, &ciErr2);

ciErr1 |= ciErr2;

// shrLog(“clCreateBuffer…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateBuffer, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Read the OpenCL kernel in from source file

cPathAndName = shrFindFilePath(cSourceFile, argv[0]);

cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);

// shrLog(“oclLoadProgSource (%s)…\n”, cSourceFile);

// Create the program

cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);

// shrLog(“clCreateProgramWithSource…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateProgramWithSource, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

ciErr1 = clBuildProgram(cpProgram, 0, NULL, “-cl-fast-relaxed-math”, NULL, NULL);

if (ciErr1 != CL_SUCCESS)

{

    // write out standard error, Build Log and PTX, then cleanup and exit

    shrLogEx(LOGBOTH | ERRORMSG, ciErr1, STDERROR);

    oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));

    oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclCorrel.ptx");

    oclCheckError(ciErr1, CL_SUCCESS); 

}

// Create the kernel

ckKernel = clCreateKernel(cpProgram, "VectorMul", &ciErr1);

// shrLog(“clCreateKernel…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clCreateKernel, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Set the Argument values

ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc);

ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_type), (void*)&value);

ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);

ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);

// shrLog(“clSetKernelArg…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clSetKernelArg, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Launch kernel

shrDeltaT(0);

ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);

double computeTime = shrDeltaT(0);

// shrLog(“clEnqueueNDRangeKernel…\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clEnqueueNDRangeKernel, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

// Synchronous/blocking read of results, and check accumulated errors

shrDeltaT(0);

ciErr1 = clEnqueueReadBuffer(cqCommandQue, cmDevDst, CL_TRUE, 0, sizeof(cl_type) * szGlobalWorkSize, dst, 0, NULL, NULL);

double memCpyDHTime = shrDeltaT(0);

// shrLog(“clEnqueueReadBuffer…\n\n”);

if (ciErr1 != CL_SUCCESS)

{

    shrLog("Error in clEnqueueReadBuffer, near Line %u in file %u", __LINE__, __FILE__);

    Cleanup(EXIT_FAILURE);

}

printf("\nMultiplication\n\n\t*%d datas\n\t*%d Global Work Size\n\t*%d Local Work Size\n", iNumElements, (int)szGlobalWorkSize, (int)szLocalWorkSize);

printf("\n=============================================\n");

printf("Time to copy datas HOST -> DEVICE : %.3f ms\n", (memCpyHDTime*1000));

printf("Time to compute                   : %.3f ms\n", (computeTime*1000));

printf("Time to copy datas DEVICE -> HOST : %.3f ms\n", (memCpyDHTime*1000));

printf("--------------------------------------------\n");

printf("Total time                        : %.3f ms\n", ((memCpyHDTime+computeTime+memCpyDHTime)*1000));

printf("=============================================\n\n");

local_type *srcf, *dstf;

srcf = (local_type*)src;

dstf = (local_type*)dst;

for (int i=0; i<iNumElements; i++)

{

	if (srcf[i]*10 != dstf[i])

		printf("Error at indice %d : %.3f != %.3f\n", i, srcf[i]*10, dstf[i]);

}

// Cleanup and leave

Cleanup (EXIT_SUCCESS);

}

void Cleanup (int iExitCode)

{

// Cleanup allocated objects

shrLog("\nStarting Cleanup...\n\n");

if(cdDevices)free(cdDevices);

if(cPathAndName)free(cPathAndName);

if(cSourceCL)free(cSourceCL);

if(ckKernel)clReleaseKernel(ckKernel);  

if(cpProgram)clReleaseProgram(cpProgram);

if(cqCommandQue)clReleaseCommandQueue(cqCommandQue);

if(cxGPUContext)clReleaseContext(cxGPUContext);

if(cmDevSrc)clReleaseMemObject(cmDevSrc);

if(cmDevDst)clReleaseMemObject(cmDevDst);

// Free host memory

free(src); 

free (dst);

// finalize logs and leave

if (bQuickTest)

{

    shrLog("oclVectorMul Ending...\n");

}

else 

{

    shrLog("oclVectorMul Ending...\nPress Enter to Exit\n");

    getchar();

}

exit (iExitCode);

}

[/codebox]

Thanks.

It seems my geForce GTS 250 cannot compute datas in double precision.

The oclDeviceQuery program will show if you card support this extension. My 285 does, but my 8600 GT does not. Doubles work fine with the 285, but are slooooow. I have an include file which defines a type scalar of either type float or double. I can then quickly change from float to double. I include this on both the CPU (after including cl.h) and on the OpenCL codes.

#ifndef USE_DOUBLE

define USE_DOUBLE 0

endif

#if USE_DOUBLE

ifdef __OPENCL_CL_H

typedef cl_double cl_scalar;

else

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

endif

typedef double scalar;

else

ifdef __OPENCL_CL_H

typedef cl_float cl_scalar;

endif

typedef float scalar;

endif

I was so convinced she managed the double I have not thought about watching the deviceQuery.

Thanks a lot for this code.

Can you give me the loss engendered by the double on GTX285? Regardless of the kernel code.

Thanks.

I’ve had problem with similar code :

[codebox]

if (iGID >= iNumElements)

{

return;

}

[/codebox]

if return was not aligned with half warps.

I would advise against it and just let the threads do nothing until your algorithm finishes.

I have similar algorithms working flawlessly on a GTX 260.