clBuildProgram returns CL_INVALID_BINARY for double data types on a GTX 480

Today I tried to compile my program on a CUDA machine:

  • platform: NVIDIA CUDA [0]
  • version: OpenCL 1.1 CUDA 4.2.1
  • device: GeForce GTX 480 [0]
  • compute units: 15
  • global memory: 1535MB
  • max. buffer size: 383MB
  • max. work group size: 1024
  • floating point precision: double

The OpenCL program won’t build with this kernel:

#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void test(__global double *a, __global double *b)
{
  int i = get_global_id(0);
  a[i] = 99.9;
  b[i] = a[i] + 77.7;
}

But it runs fine with this one:

// #pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void test(__global float *a, __global float *b)
{
  int i = get_global_id(0);
  a[i] = 99.9f;
  b[i] = a[i] + 77.7f;
}

Both kernels build and execute correctly on a second machine with an HD7970. It gets even stranger: the following kernel will build and execute on the GTX 480 machine:

#pragma OPENCL EXTENSION cl_khr_fp64: enable

__kernel void test(__global double *a, __global double *b)
{
  int i = get_global_id(0);
  a[i] = 99.9;
  b[i] = 77.7;
}

The only difference from the first kernel is that I’ve removed a[i] from the last line. It turns out that the program won’t build if I use the double data type and try to read from either array a or b, but there is no problem with writing to both arrays! Can anyone think of a reason? The driver is 304.54 and the CUDA version is 5.0.35. I also tried the same code on another machine with a GTX 480, but with driver version 319.37 and CUDA 5.5.22, and got exactly the same behavior.

I’ve just attached a zip file with code that reproduces the error.
cl_example.zip (2.62 KB)

An update to the newest driver version didn’t help. I experience the same problem on another machine with a GTX 285. Most likely it’s a bug in NVIDIA’s OpenCL implementation, because the error code -42 (CL_INVALID_BINARY) should only be returned for programs created with clCreateProgramWithBinary, which I’m not using at all.
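To rule out a missing extension rather than a compiler bug, this is a minimal sketch of the check I run before building (supportsFp64 is just my own helper name; it assumes a valid cl_device_id and omits error handling):

#include <string.h>
#include <stdlib.h>
#include <CL/opencl.h>

// Sketch: returns true if the device advertises cl_khr_fp64.
// Assumes `device` is a valid cl_device_id; error checking omitted.
bool supportsFp64(cl_device_id device) {
  size_t size = 0;
  // First call queries the required buffer size for the extension string
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &size);
  char *extensions = (char*)malloc(size);
  // Second call fills the null-terminated, space-separated extension list
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, extensions, NULL);
  bool found = strstr(extensions, "cl_khr_fp64") != NULL;
  free(extensions);
  return found;
}

On my machines the extension is reported (see the device info above), so the build failure can’t be explained by missing fp64 support.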

I’m wondering why I can’t find anything about this rather generic problem on the internet.

The code to reproduce the error:

cl_example.cpp

#include <complex>
#include <iostream>
#include <fstream>
#include <time.h>
#include <CL/opencl.h>
#include "precision.h"

using namespace std;

int main(void) {
  cl_platform_id *platforms = NULL;          // IDs of OpenCL platforms  
  cl_device_id *devices = NULL;              // Device IDs
  cl_uint platformCount = 0;                 // Platform count
  cl_uint deviceCount = 0;                   // Device count          
  cl_context context;                        // Context
  cl_program program;                        // Compute program
  cl_kernel kernel;                          // Compute kernel
  cl_command_queue queue;                    // Queue
  int platformNr = 0;                        // Chosen platform
  int deviceNr = 0;                          // Chosen device
  
  cl_int err;
  size_t Nx = 2;
  size_t Ny = 2;
  size_t Nz = 2;
  size_t dataLength = Nx * Ny * Nz;
  size_t globalWorkGroupSize[3] = {Nx, Ny, Nz};
  size_t localWorkGroupSize[3] = {1, 1, 1};
  cl_mem dataDevice;                                 // Device-side buffer
  
  rfloat *dataHost = new rfloat[dataLength];         // Host-side input buffer
  
  ifstream sourceFile;
  char *sourceFromFile;
  int sourceLength;
  
  // Read CL source from file
  sourceFile.open("source.cl");
  sourceFile.seekg(0, ios::end);
  sourceLength = (int)sourceFile.tellg();
  sourceFile.seekg(0, ios::beg);
  sourceFromFile = (char*)malloc((sourceLength+1)*sizeof(char));  // +1 for the terminating null written by get()
  sourceFile.get(sourceFromFile, sourceLength+1, 0);
  
  // Get available platforms  
  clGetPlatformIDs(0, NULL, &platformCount);  
  platforms = (cl_platform_id*)malloc(platformCount*sizeof(cl_platform_id));  
  clGetPlatformIDs(platformCount, platforms, NULL);
  // Get number of available devices for this platform  
  clGetDeviceIDs(platforms[platformNr], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
  // Get available device IDs for this platform 
  devices = (cl_device_id*) malloc(deviceCount * sizeof(cl_device_id));
  clGetDeviceIDs(platforms[platformNr], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
  // Print platform name  
  char platform_name[1024];
  clGetPlatformInfo(platforms[platformNr], CL_PLATFORM_NAME, 1024, platform_name, NULL);
  cout << "OpenCl platform " << platformNr << " [" << platform_name << "]" << endl;
  // Print device name and type  
  cl_device_type device_type;
  char device_name[1024];
  clGetDeviceInfo(devices[deviceNr], CL_DEVICE_NAME, 1024, device_name, NULL);
  clGetDeviceInfo(devices[deviceNr], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
  cout << "OpenCl device " << deviceNr << " [" << device_name << "]" << endl;
  cout << "using " << ((PRECISION == SINGLE) ? "single" : "double") << " precision" << endl;
  
  // Create OpenCL context  
  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platformNr], 0};
  context = clCreateContext(cps, deviceCount, devices, NULL, NULL, NULL);
  // Create command queue  
  queue = clCreateCommandQueue(context, devices[deviceNr], CL_QUEUE_PROFILING_ENABLE, NULL);
  // Create device buffer  
  dataDevice = clCreateBuffer(context, CL_MEM_READ_WRITE, dataLength * sizeof(rfloat), NULL, NULL);
 
  // Create the compute program from the source buffer
  program = clCreateProgramWithSource(context, 1, (const char **) &sourceFromFile, NULL, &err);
  // Build the program executable
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    size_t len;
    char buffer[2048];
    cerr << "Error: Failed to build program executable! Error code " << err << endl;
    clGetProgramBuildInfo(program, devices[deviceNr], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    cerr << buffer << endl;
    exit(1);
  }
  // Create the compute kernel in the program
  kernel = clCreateKernel(program, "foo", &err);

  // Generate data
  for(size_t i = 0; i < dataLength; i++) {
    dataHost[i] = i * 111.1;
  }
  cout << "input:" << endl;
  for(size_t i = 0; i < dataLength; i++) {
    cout << dataHost[i] << " ";
  }
  cout << endl;
  
  // Transfer data into device memory
  clEnqueueWriteBuffer(queue, dataDevice, CL_TRUE, 0, dataLength * sizeof(rfloat), dataHost, 0, NULL, NULL);
  
  // Set the arguments to the compute kernel
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &dataDevice);
  
  // Execute the kernel
  clEnqueueNDRangeKernel(queue, kernel, 3, NULL, globalWorkGroupSize, localWorkGroupSize, 0, NULL, NULL);
  
  // Wait for all commands to complete
  clFinish(queue);
  
  // Read back the results from the device
  clEnqueueReadBuffer(queue, dataDevice, CL_TRUE, 0, dataLength * sizeof(rfloat), dataHost, 0, NULL, NULL );
  
  // Print results
  cout << "output:" << endl;
  for(size_t i = 0; i < dataLength; i++) {
    cout << dataHost[i] << " ";
  }
  cout << endl;
  
  // Free device memory  
  clReleaseMemObject(dataDevice);  
  // Release OpenCL context and queue  
  clReleaseCommandQueue(queue);  
  clReleaseContext(context);  
  // Free objects
  free(platforms); 
  free(devices);   
  delete[] dataHost;  // allocated with new[], so delete[] rather than free()
}

source.cl

#include "precision.h"

#define nx              (signed)get_global_id(0)
#define ny              (signed)get_global_id(1)
#define nz              (signed)get_global_id(2)
#define Nx              (signed)get_global_size(0)
#define Ny              (signed)get_global_size(1)
#define Nz              (signed)get_global_size(2)

__kernel void foo(__global rfloat *data)
{
  int pos = nx + Nx * ny + Nx * Ny * nz;
  data[pos] = (rfloat)pos;
}
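(With Nx = Ny = Nz = 2, each work-item simply writes its own linear index pos = nx + Nx*ny + Nx*Ny*nz, so the expected output is 0 1 2 3 4 5 6 7.)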

precision.h

#ifndef PRECISION_H
#define PRECISION_H

#define SINGLE 1
#define DOUBLE 2
// #define PRECISION SINGLE
#define PRECISION DOUBLE

#if PRECISION == DOUBLE
#define rfloat double
#pragma OPENCL EXTENSION cl_khr_fp64: enable

#elif PRECISION == SINGLE
#define rfloat float
#pragma OPENCL EXTENSION cl_khr_fp64: disable

#endif

#endif // PRECISION_H
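For anyone who wants to try it: the example should build on Linux with something like

g++ cl_example.cpp -o cl_example -lOpenCL

(assuming the OpenCL headers and an ICD are installed), and it has to be run from the directory containing source.cl and precision.h, since the kernel source is read at run time.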

Are you using Windows or Linux? For Windows, I know that there is a much more recent driver version, 331.65. I’ve used double on NVIDIA GPUs in OpenCL recently and haven’t had any problems with the recent drivers.

Hello chippies,

I’m using Linux, so 331.20 is the newest version available. Which GPUs were you working with?