clBuildProgram returns CL_INVALID_BINARY for double data types on a GTX 480

Today I tried to compile my program on a CUDA machine:

  • platform: NVIDIA CUDA [0]
  • version: OpenCL 1.1 CUDA 4.2.1
  • device: GeForce GTX 480 [0]
  • compute units: 15
  • global memory: 1535MB
  • max. buffer size: 383MB
  • max. work group size: 1024
  • floating point precision: double

The OpenCL program won’t build with this kernel:

#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void test(__global double *a, __global double *b)
{
  int i = get_global_id(0);
  a[i] = 99.9;
  b[i] = a[i] + 77.7;
}

But it runs fine with this one:

// #pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void test(__global float *a, __global float *b)
{
  int i = get_global_id(0);
  a[i] = 99.9f;
  b[i] = a[i] + 77.7f;
}

Both kernels build and execute correctly on a second machine with an HD7970. It gets even stranger: the following kernel will build and execute on the GTX 480 machine:

#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void test(__global double *a, __global double *b)
{
  int i = get_global_id(0);
  a[i] = 99.9;
  b[i] = 77.7;
}

The only difference from the first kernel is that I’ve removed a[i] from the last line. It turns out that the program won’t build if I use the double data type and try to read from either array a or b, but there is no problem with writing to both arrays! Can anyone think of a reason? The driver is 304.54 and the CUDA version is 5.0.35. I also tried the same code on another machine with a GTX 480, but with driver version 319.37 and CUDA 5.5.22, and got exactly the same behavior.

I’ve just attached a zip file with code that reproduces the error.
cl_example.zip (2.62 KB)

An update to the newest driver version didn’t help. I experience the same problem on another machine with a GTX 285. Most likely it’s a bug in NVIDIA’s OpenCL implementation, because the error code -42 (CL_INVALID_BINARY) should only be returned for programs created with clCreateProgramWithBinary, which I’m not using at all.
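To rule out a missing extension rather than a compiler bug, this is a minimal sketch of the check I run before building (supportsFp64 is just my own helper name; it assumes a valid cl_device_id and omits error handling):

#include <string.h>
#include <stdlib.h>
#include <CL/opencl.h>

// Sketch: returns true if the device advertises cl_khr_fp64.
// Assumes `device` is a valid cl_device_id; error checking omitted.
bool supportsFp64(cl_device_id device) {
  size_t size = 0;
  // First call queries the required buffer size for the extension string
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &size);
  char *extensions = (char*)malloc(size);
  // Second call fills the null-terminated, space-separated extension list
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, extensions, NULL);
  bool found = strstr(extensions, "cl_khr_fp64") != NULL;
  free(extensions);
  return found;
}

On my machines the extension is reported (see the device info above), so the build failure can’t be explained by missing fp64 support.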

I’m wondering why I can’t find anything about this rather generic problem on the internet.

The code to reproduce the error:

cl_example.cpp

#include <complex>
#include <iostream>
#include <fstream>
#include <time.h>
#include <CL/opencl.h>
#include "precision.h"

using namespace std;

int main(void) {
  cl_platform_id *platforms = NULL;          // IDs of OpenCL platforms  
  cl_device_id *devices = NULL;              // Device IDs
  cl_uint platformCount = 0;                 // Platform count
  cl_uint deviceCount = 0;                   // Device count          
  cl_context context;                        // Context
  cl_program program;                        // Compute program
  cl_kernel kernel;                          // Compute kernel
  cl_command_queue queue;                    // Queue
  int platformNr = 0;                        // Chosen platform
  int deviceNr = 0;                          // Chosen device
  
  cl_int err;
  size_t Nx = 2;
  size_t Ny = 2;
  size_t Nz = 2;
  size_t dataLength = Nx * Ny * Nz;
  size_t globalWorkGroupSize[3] = {Nx, Ny, Nz};
  size_t localWorkGroupSize[3] = {1, 1, 1};
  cl_mem dataDevice;                                 // Device-side buffer
  
  rfloat *dataHost = new rfloat[dataLength];         // Host-side input buffer
  
  ifstream sourceFile;
  char *sourceFromFile;
  int sourceLength;
  
  // Read CL source from file
  sourceFile.open("source.cl");
  sourceFile.seekg(0, ios::end);
  sourceLength = (int)sourceFile.tellg();
  sourceFile.seekg(0, ios::beg);
  sourceFromFile = (char*)malloc((sourceLength+1)*sizeof(char));  // +1 for the terminating null written by get()
  sourceFile.get(sourceFromFile, sourceLength+1, 0);
  
  // Get available platforms  
  clGetPlatformIDs(0, NULL, &platformCount);  
  platforms = (cl_platform_id*)malloc(platformCount*sizeof(cl_platform_id));  
  clGetPlatformIDs(platformCount, platforms, NULL);
  // Get number of available devices for this platform  
  clGetDeviceIDs(platforms[platformNr], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
  // Get available device IDs for this platform 
  devices = (cl_device_id*) malloc(deviceCount * sizeof(cl_device_id));
  clGetDeviceIDs(platforms[platformNr], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
  // Print platform name  
  char platform_name[1024];
  clGetPlatformInfo(platforms[platformNr], CL_PLATFORM_NAME, 1024, platform_name, NULL);
  cout << "OpenCl platform " << platformNr << " [" << platform_name << "]" << endl;
  // Print device name and type  
  cl_device_type device_type;
  char device_name[1024];
  clGetDeviceInfo(devices[deviceNr], CL_DEVICE_NAME, 1024, device_name, NULL);
  clGetDeviceInfo(devices[deviceNr], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
  cout << "OpenCl device " << deviceNr << " [" << device_name << "]" << endl;
  cout << "using " << ((PRECISION == SINGLE) ? "single" : "double") << " precision" << endl;
  
  // Create OpenCL context  
  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platformNr], 0};
  context = clCreateContext(cps, deviceCount, devices, NULL, NULL, NULL);
  // Create command queue  
  queue = clCreateCommandQueue(context, devices[deviceNr], CL_QUEUE_PROFILING_ENABLE, NULL);
  // Create device buffer  
  dataDevice = clCreateBuffer(context, CL_MEM_READ_WRITE, dataLength * sizeof(rfloat), NULL, NULL);
 
  // Create the compute program from the source buffer
  program = clCreateProgramWithSource(context, 1, (const char **) &sourceFromFile, NULL, &err);
  // Build the program executable
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    size_t len;
    char buffer[2048];
    cerr << "Error: Failed to build program executable! Error code " << err << endl;
    clGetProgramBuildInfo(program, devices[deviceNr], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    cerr << buffer << endl;
    exit(1);
  }
  // Create the compute kernel in the program
  kernel = clCreateKernel(program, "foo", &err);

  // Generate data
  for(size_t i = 0; i < dataLength; i++) {
    dataHost[i] = i * 111.1;
  }
  cout << "input:" << endl;
  for(size_t i = 0; i < dataLength; i++) {
    cout << dataHost[i] << " ";
  }
  cout << endl;
  
  // Transfer data into device memory
  clEnqueueWriteBuffer(queue, dataDevice, CL_TRUE, 0, dataLength * sizeof(rfloat), dataHost, 0, NULL, NULL);
  
  // Set the arguments to the compute kernel
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &dataDevice);
  
  // Execute the kernel
  clEnqueueNDRangeKernel(queue, kernel, 3, NULL, globalWorkGroupSize, localWorkGroupSize, 0, NULL, NULL);
  
  // Wait for all commands to complete
  clFinish(queue);
  
  // Read back the results from the device
  clEnqueueReadBuffer(queue, dataDevice, CL_TRUE, 0, dataLength * sizeof(rfloat), dataHost, 0, NULL, NULL );
  
  // Print results
  cout << "output:" << endl;
  for(size_t i = 0; i < dataLength; i++) {
    cout << dataHost[i] << " ";
  }
  cout << endl;
  
  // Free device memory  
  clReleaseMemObject(dataDevice);  
  // Release OpenCL context and queue  
  clReleaseCommandQueue(queue);  
  clReleaseContext(context);  
  // Free objects
  free(platforms); 
  free(devices);   
  delete[] dataHost;  // allocated with new[], so delete[] rather than free()
}

source.cl

#include "precision.h"

#define nx              (signed)get_global_id(0)
#define ny              (signed)get_global_id(1)
#define nz              (signed)get_global_id(2)
#define Nx              (signed)get_global_size(0)
#define Ny              (signed)get_global_size(1)
#define Nz              (signed)get_global_size(2)

__kernel void foo(__global rfloat *data)
{
  int pos = nx + Nx * ny + Nx * Ny * nz;
  data[pos] = (rfloat)pos;
}
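(With Nx = Ny = Nz = 2, each work-item simply writes its own linear index pos = nx + Nx*ny + Nx*Ny*nz, so the expected output is 0 1 2 3 4 5 6 7.)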

precision.h

#ifndef PRECISION_H
#define PRECISION_H

#define SINGLE 1
#define DOUBLE 2
// #define PRECISION SINGLE
#define PRECISION DOUBLE

#if PRECISION == DOUBLE
#define rfloat double
#pragma OPENCL EXTENSION cl_khr_fp64: enable

#elif PRECISION == SINGLE
#define rfloat float
#pragma OPENCL EXTENSION cl_khr_fp64: disable

#endif

#endif // PRECISION_H
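For anyone who wants to try it: the example should build on Linux with something like

g++ cl_example.cpp -o cl_example -lOpenCL

(assuming the OpenCL headers and an ICD are installed), and it has to be run from the directory containing source.cl and precision.h, since the kernel source is read at run time.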

Are you using Windows or Linux? For Windows, I know that there is a much more recent driver version, 331.65. I’ve used double on NVIDIA GPUs in OpenCL recently and haven’t had any problems with the recent drivers.

Hello chippies,

I’m using Linux, so 331.20 is the newest version available. Which GPUs were you working with?