Updating to the newest driver version didn’t help. I experience the same problem on another machine with a GTX 285. Most likely it’s a bug in the OpenCL implementation, because according to the specification the error code -42 (CL_INVALID_BINARY) must not be returned by clBuildProgram when the program was created from source; it can only be returned by clCreateProgramWithBinary, which I’m not using at all.
I’m wondering why I can’t find anything about this rather generic problem on the internet.
The code to reproduce the error:
cl_example.cpp
#include <time.h>
#include <complex>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include <CL/opencl.h>
#include "precision.h"
using namespace std;
int main(void) {
cl_platform_id *platforms = NULL; // IDs of OpenCL platforms
cl_device_id *devices = NULL; // Device IDs
cl_uint platformCount = 0; // Platform count
cl_uint deviceCount = 0; // Device count
cl_context context; // Context
cl_program program; // Compute program
cl_kernel kernel; // Compute kernel
cl_command_queue queue; // Queue
int platformNr = 0; // Chosen platform
int deviceNr = 0; // Chosen device
cl_int err;
size_t Nx = 2;
size_t Ny = 2;
size_t Nz = 2;
size_t dataLength = Nx * Ny * Nz;
size_t globalWorkGroupSize[3] = {Nx, Ny, Nz};
size_t localWorkGroupSize[3] = {1, 1, 1};
cl_mem dataDevice; // Device-side buffer
rfloat *dataHost = new rfloat[dataLength]; // Host-side input buffer
ifstream sourceFile;
char *sourceFromFile;
int sourceLength;
// Read CL source from file
sourceFile.open("source.cl");
sourceFile.seekg(0, ios::end);
sourceLength = (int)sourceFile.tellg();
sourceFile.seekg(0, ios::beg);
sourceFromFile = (char*)malloc(sourceLength*sizeof(char));
sourceFile.get(sourceFromFile, sourceLength+1, 0);
// Get available platforms
clGetPlatformIDs(0, NULL, &platformCount);
platforms = (cl_platform_id*)malloc(platformCount*sizeof(cl_platform_id));
clGetPlatformIDs(platformCount, platforms, NULL);
// Get number of available devices for this platform
clGetDeviceIDs(platforms[platformNr], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
// Get available device IDs for this platform
devices = (cl_device_id*) malloc(deviceCount * sizeof(cl_device_id));
clGetDeviceIDs(platforms[platformNr], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
// Print platform name
char platform_name[1024];
clGetPlatformInfo(platforms[platformNr], CL_PLATFORM_NAME, 1024, &platform_name, NULL);
cout << "OpenCl platform " << platformNr << " [" << platform_name << "]" << endl;
// Print device name and type
cl_device_type device_type;
char device_name[1024];
clGetDeviceInfo(devices[deviceNr], CL_DEVICE_NAME, 1024, &device_name, NULL);
clGetDeviceInfo( devices[deviceNr],CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
cout << "OpenCl device " << deviceNr << " [" << device_name << "]" << endl;
cout << "using " << ((PRECISION == SINGLE) ? "single" : "double") << " precision" << endl;
// Create OpenCL context
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platformNr], 0};
context = clCreateContext(cps, deviceCount, devices, NULL, NULL, NULL);
// Create command queue
queue = clCreateCommandQueue(context, devices[deviceNr], CL_QUEUE_PROFILING_ENABLE, NULL);
// Create device buffer
dataDevice = clCreateBuffer(context, CL_MEM_READ_WRITE, dataLength * sizeof(rfloat), NULL, NULL);
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1, (const char **) &sourceFromFile, NULL, &err);
// Build the program executable
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
size_t len;
char buffer[2048];
cerr << "Error: Failed to build program executable! Error code " << err << endl;
clGetProgramBuildInfo(program, devices[deviceNr], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
cerr << buffer << endl;
exit(1);
}
// Create the compute kernel in the program
kernel = clCreateKernel(program, "foo", &err);
// Generate data
for(int i = 0; i < dataLength; i++) {
dataHost[i] = i * 111.1;
}
cout << "input:" << endl;
for(int i = 0; i < dataLength; i++) {
cout << dataHost[i] << " ";
}
cout << endl;
// Transfer data into device memory
clEnqueueWriteBuffer(queue, dataDevice, CL_TRUE, 0, dataLength * sizeof(rfloat), dataHost, 0, NULL, NULL);
// Set the arguments to the compute kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), &dataDevice);
// Execute the kernel
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, globalWorkGroupSize, localWorkGroupSize, 0, NULL, NULL);
// Wait for all commands to complete
clFinish(queue);
// Read back the results from the device
clEnqueueReadBuffer(queue, dataDevice, CL_TRUE, 0, dataLength * sizeof(rfloat), dataHost, 0, NULL, NULL );
// Print results
cout << "output:" << endl;
for(int i = 0; i < dataLength; i++) {
cout << dataHost[i] << " ";
}
cout << endl;
// Free device memory
clReleaseMemObject(dataDevice);
// Release OpenCL context and queue
clReleaseCommandQueue(queue);
clReleaseContext(context);
// Free objects
free(platforms);
free(devices);
free(dataHost);
}
source.cl
#include "precision.h"
// Signed views of this work-item's global coordinates (nx, ny, nz) and of the
// global NDRange extents (Nx, Ny, Nz). Each expansion is fully parenthesized
// so it can be used safely inside larger expressions.
#define nx ((int)get_global_id(0))
#define ny ((int)get_global_id(1))
#define nz ((int)get_global_id(2))
#define Nx ((int)get_global_size(0))
#define Ny ((int)get_global_size(1))
#define Nz ((int)get_global_size(2))

// Stores each work-item's linearized global index (x varies fastest, then y,
// then z) into the matching element of data. The largest index written is
// Nx*Ny*Nz - 1, so data needs at least Nx*Ny*Nz elements.
__kernel void foo(__global rfloat *data)
{
    const int rowOffset = Nx * ny;
    const int sliceOffset = Nx * Ny * nz;
    const int pos = nx + rowOffset + sliceOffset;

    data[pos] = (rfloat)pos;
}
precision.h
// Shared between the C++ host code and the OpenCL kernel source: selects the
// floating-point type `rfloat` used on both sides.
#ifndef PRECISION_H
#define PRECISION_H

#define SINGLE 1
#define DOUBLE 2

// Default precision. Define PRECISION (as SINGLE or DOUBLE, i.e. 1 or 2)
// before including this header, or on the compiler command line, to override.
#ifndef PRECISION
// #define PRECISION SINGLE
#define PRECISION DOUBLE
#endif

#if PRECISION == DOUBLE
#define rfloat double
// The extension pragma is only meaningful to an OpenCL C compiler, which
// defines __OPENCL_VERSION__; keep it away from the host C++ compiler.
#ifdef __OPENCL_VERSION__
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#elif PRECISION == SINGLE
#define rfloat float
#ifdef __OPENCL_VERSION__
#pragma OPENCL EXTENSION cl_khr_fp64: disable
#endif
#else
// Fail loudly instead of leaving rfloat undefined.
#error "PRECISION must be SINGLE or DOUBLE"
#endif

#endif // PRECISION_H