Hello,
I try to transform float to double in my simple program that only makes a product of one array and a value.
It works fine with float but when I pass it to double, results are wrong.
This is my kernel:
[codebox]#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void VectorMul(__global const double* a, double b, __global double* c, int iNumElements)
{
// get index into global data array
int iGID = get_global_id(0);
// bound check (equivalent to the limit on a ‘for’ loop for standard/serial C code
if (iGID >= iNumElements)
{
return;
}
// Mul the vector elements
c[iGID] = a[iGID] * b;
}
[/codebox]
all datas are set to double or cl_double.
My computer spec:
-
Linux CentOS 5.1 64bit
-
devdriver_3.0_linux_64_195.36.15
-
cudatoolkit_3.0_linux_64_rhel5.3
-
gpucomputingsdk_3.0_linux
-
GeForce GTS 250
Here is my .cpp file, I try to add it in attachment but it doesn"t work.
[codebox]//************************************************************
//
// Multiply a vector by a float data
//
//************************************************************
// common SDK header for standard utilities and system libs
#include <oclUtils.h>
// Name of the file with the source code for the computation kernel
// ************************************************************
const char* cSourceFile = “VectorMul.cl”;
// Host buffers for demo
// ************************************************************
void *src, *dst, *temp; // Host buffers for OpenCL test
// OpenCL Vars
cl_platform_id cpPlatform;
cl_uint uiNumDevices;
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQue; // OpenCL command que
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrc; // OpenCL device source buffer
cl_mem cmDevDst; // OpenCL device destination buffer
size_t szGlobalWorkSize; // 1D var for Total # of work items
size_t szLocalWorkSize; // 1D var for # of work items in the work group
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
cl_int ciErr1, ciErr2; // Error code var
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
// typedef
typedef cl_double cl_type;
typedef double local_type;
// demo config vars
int iNumElements = 8388608;//16777216;//11444777; // Length of float arrays to process (odd # for illustration)
local_type value = 10.0;
shrBOOL bQuickTest = shrFALSE;
// Forward Declarations
// ************************************************************
void Cleanup (int iExitCode);
#pragma OPENCL EXTENSION cl_khr_fp64: enable
// Main function
// ************************************************************
int main(int argc, char **argv)
{
// get command line arg for quick test, if provided
bQuickTest = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
// start logs
shrSetLogFileName ("oclVectorMul.txt");
// shrLog(“%s Starting…\n\n# of float elements per Array \t= %u\n”, argv[0], iNumElements);
// set and log Global and Local work size dimensions
szLocalWorkSize = 256;
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
// shrLog(“Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n”,
// szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
// Allocate and initialize host arrays
src = (void *)malloc(sizeof(cl_type) * szGlobalWorkSize);
dst = (void *)malloc(sizeof(cl_type) * szGlobalWorkSize);
temp = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
shrFillArray((float *)temp, iNumElements);
local_type *src2;
src2 = (local_type *)src;
float *temp2;
temp2 = (float *)temp;
for (int i=0; i<iNumElements; i++)
{
src2[i] = (local_type)temp2[i];
}
// shrLog( “Allocate and Init Host Mem…\n”);
//Get the NVIDIA platform
ciErr1 = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErr1, CL_SUCCESS);
//Get the devices
ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckError(ciErr1, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckError(ciErr1, CL_SUCCESS);
//Create the context
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErr1);
oclCheckError(ciErr1, CL_SUCCESS);
// Create a command-queue
cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);
// shrLog(“clCreateCommandQueue…\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clCreateCommandQueue, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Allocate the OpenCL source and result buffer memory objects on the device GMEM, and copy the data to the device
shrDeltaT(0);
cmDevSrc = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_type) * szGlobalWorkSize, src, &ciErr1);
double memCpyHDTime = shrDeltaT(0);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_type) * szGlobalWorkSize, NULL, &ciErr2);
ciErr1 |= ciErr2;
// shrLog(“clCreateBuffer…\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clCreateBuffer, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Read the OpenCL kernel in from source file
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
// shrLog(“oclLoadProgSource (%s)…\n”, cSourceFile);
// Create the program
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);
// shrLog(“clCreateProgramWithSource…\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clCreateProgramWithSource, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
ciErr1 = clBuildProgram(cpProgram, 0, NULL, “-cl-fast-relaxed-math”, NULL, NULL);
if (ciErr1 != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, ciErr1, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclCorrel.ptx");
oclCheckError(ciErr1, CL_SUCCESS);
}
// Create the kernel
ckKernel = clCreateKernel(cpProgram, "VectorMul", &ciErr1);
// shrLog(“clCreateKernel…\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clCreateKernel, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Set the Argument values
ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrc);
ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_type), (void*)&value);
ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
ciErr1 |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
// shrLog(“clSetKernelArg…\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clSetKernelArg, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Launch kernel
shrDeltaT(0);
ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
double computeTime = shrDeltaT(0);
// shrLog(“clEnqueueNDRangeKernel…\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clEnqueueNDRangeKernel, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
// Synchronous/blocking read of results, and check accumulated errors
shrDeltaT(0);
ciErr1 = clEnqueueReadBuffer(cqCommandQue, cmDevDst, CL_TRUE, 0, sizeof(cl_type) * szGlobalWorkSize, dst, 0, NULL, NULL);
double memCpyDHTime = shrDeltaT(0);
// shrLog(“clEnqueueReadBuffer…\n\n”);
if (ciErr1 != CL_SUCCESS)
{
shrLog("Error in clEnqueueReadBuffer, near Line %u in file %u", __LINE__, __FILE__);
Cleanup(EXIT_FAILURE);
}
printf("\nMultiplication\n\n\t*%d datas\n\t*%d Global Work Size\n\t*%d Local Work Size\n", iNumElements, (int)szGlobalWorkSize, (int)szLocalWorkSize);
printf("\n=============================================\n");
printf("Time to copy datas HOST -> DEVICE : %.3f ms\n", (memCpyHDTime*1000));
printf("Time to compute : %.3f ms\n", (computeTime*1000));
printf("Time to copy datas DEVICE -> HOST : %.3f ms\n", (memCpyDHTime*1000));
printf("--------------------------------------------\n");
printf("Total time : %.3f ms\n", ((memCpyHDTime+computeTime+memCpyDHTime)*1000));
printf("=============================================\n\n");
local_type *srcf, *dstf;
srcf = (local_type*)src;
dstf = (local_type*)dst;
for (int i=0; i<iNumElements; i++)
{
if (srcf[i]*10 != dstf[i])
printf("Error at indice %d : %.3f != %.3f\n", i, srcf[i]*10, dstf[i]);
}
// Cleanup and leave
Cleanup (EXIT_SUCCESS);
}
void Cleanup (int iExitCode)
{
// Cleanup allocated objects
shrLog("\nStarting Cleanup...\n\n");
if(cdDevices)free(cdDevices);
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(ckKernel)clReleaseKernel(ckKernel);
if(cpProgram)clReleaseProgram(cpProgram);
if(cqCommandQue)clReleaseCommandQueue(cqCommandQue);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if(cmDevSrc)clReleaseMemObject(cmDevSrc);
if(cmDevDst)clReleaseMemObject(cmDevDst);
// Free host memory
free(src);
free (dst);
// finalize logs and leave
if (bQuickTest)
{
shrLog("oclVectorMul Ending...\n");
}
else
{
shrLog("oclVectorMul Ending...\nPress Enter to Exit\n");
getchar();
}
exit (iExitCode);
}
[/codebox]
Thanks.