Update: This works flawlessly when using single precision.
I have a kernel, which basically amounts to rearranging an array of size n.
In order to allow user-specified work-group sizes, I increase the global work size
to n + pad_length.
However, for some values of pad_length this sometimes fail.
Below I have included the most basic example that reproduce this behavior.
Host code(my code starts on line 103):
#include <stdio.h>
#include <sys/stat.h>
#include "CL/cl.h"
#define alloca __builtin_alloca
void checkErr(cl_int err, const char *name)
{
if (err != CL_SUCCESS) {
fprintf(stderr, "ERROR: %s (%d )\n", name, err);
exit(EXIT_FAILURE);
}
}
int main(int argc, char** argv)
{
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id *platformIDs;
cl_device_id *deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_program program;
cl_kernel rearrange_kernel;
cl_mem dest_device;
cl_mem src_device;
cl_mem map_device;
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
platformIDs = (cl_platform_id *) alloca(sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
deviceIDs = NULL;
for(cl_uint i = 0; i < numPlatforms; ++i) {
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU,
0, NULL, &numDevices);
if(errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) {
checkErr(errNum, "clGetDeviceIDs");
} else if(numDevices > 0) {
deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
if(deviceIDs == NULL) {
printf("No GPU device was found.\n");
exit(-1);
}
cl_context_properties contextProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[0], 0};
context = clCreateContext(contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum);
checkErr(errNum, "clCreateContext");
struct stat st;
const char* filename = "kernel.cl";
FILE *f = fopen(filename, "r");
checkErr(f ? CL_SUCCESS : -1, "Reading kernels.cl");
// kernel source code length
stat(filename, &st);
size_t src_sz = st.st_size;
char *src_str;
src_str = (char *) malloc(src_sz);
src_sz = fread(src_str, 1, src_sz, f);
fclose(f);
program = clCreateProgramWithSource(context, 1, (const char **) &src_str, &src_sz, &errNum);
checkErr(errNum, "clCreateProgramWithSource");
errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
checkErr(errNum, "clBuildProgram");
cl_build_status build_status;
errNum = clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &build_status, NULL);
checkErr(errNum, "clGetProgramBuildInfo");
size_t ret_val_size;
errNum = clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
checkErr(errNum, "clGetProgramBuildInfo");
rearrange_kernel = clCreateKernel(program, "rearrange", &errNum);
checkErr(errNum, "clCreateKernel, rearrange");
queue = clCreateCommandQueue(context, deviceIDs[0], CL_QUEUE_PROFILING_ENABLE, &errNum);
checkErr(errNum, "clCreateCommandQueue");
// this is the original array size
const cl_uint n = 446880;
// and this the padded one
const cl_uint padded_n = 447488;
int *mapping = malloc(n * sizeof(int));
double *dest = malloc(n * sizeof(double));
double *source = malloc(n * sizeof(double));
double *output = malloc(n * sizeof(double));
// define a mapping that reverses order
// init. the source vector to 0,1,2,...,n-1
// define impossible default values for output
for(cl_uint i = 0, j = n-1; i < n; ++i, --j) {
mapping[i] = j;
source[i] = i;
output[i] = -100;
}
// allocate device memory
dest_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, &errNum);
checkErr(errNum, "clCreateBuffer(dest)");
src_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, &errNum);
checkErr(errNum, "clCreateBuffer(source)");
map_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(int), NULL, &errNum);
checkErr(errNum, "clCreateBuffer(mapping)");
errNum = clEnqueueWriteBuffer(queue, src_device, CL_TRUE, 0, n*sizeof(double), source, 0, NULL, NULL);
checkErr(errNum, "clEnqueueWriteBuffer(source)");
errNum = clEnqueueWriteBuffer(queue, map_device, CL_TRUE, 0, n*sizeof(int), mapping, 0, NULL, NULL);
checkErr(errNum, "clEnqueueWriteBuffer(mapping)");
// set the parameters for the rearrange operator
errNum = clSetKernelArg(rearrange_kernel, 0, sizeof(cl_mem), &dest_device);
errNum |= clSetKernelArg(rearrange_kernel, 1, sizeof(cl_mem), &src_device);
errNum |= clSetKernelArg(rearrange_kernel, 2, sizeof(cl_mem), &map_device);
errNum |= clSetKernelArg(rearrange_kernel, 3, sizeof(cl_uint), (void *) &n);
checkErr(errNum, "clSetKernelArg(rearrange)");
// specify the work group size/dim
const size_t work_dim = 3;
const size_t global_work_size_rearrange[] = {padded_n, 1, 1};
// perform the reordering on the host
for(int i = 0; i < padded_n; ++i) {
if(i < n) {
if(mapping[i] > n) {
fprintf(stderr, "something is quite wrong.");
exit(EXIT_FAILURE);
}
dest[i] = source[mapping[i]];
}
}
// just to be safe <img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/blarg.gif' class='bbc_emoticon' alt=':/' />
clEnqueueBarrier(queue);
errNum = clEnqueueNDRangeKernel(queue, rearrange_kernel, work_dim, NULL, global_work_size_rearrange, NULL, 0, NULL, NULL);
checkErr(errNum, "rearrange");
clEnqueueBarrier(queue);
clEnqueueReadBuffer(queue, dest_device, CL_TRUE, 0, n*sizeof(double), output, 0, NULL, NULL);
clEnqueueBarrier(queue);
int errors = 0;
for(int i = 0; i < n; ++i) {
if( dest[i] != output[i] ) {
++errors;
//fprintf(stderr, "%6g vs %8g [idx=%6d] \n", dest[i], output[i], i);
}
}
printf("Error on %d/%d elements.\n", errors, n);
// Finalization
clFlush(queue);
clFinish(queue);
clReleaseKernel(rearrange_kernel);
clReleaseProgram(program);
clReleaseMemObject(dest_device);
clReleaseMemObject(map_device);
clReleaseMemObject(src_device);
free(dest);
free(source);
free(mapping);
}
Kernel code:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void rearrange(__global double *dst, __global double *src, __global int *map, const uint size)
{
size_t l = get_global_id(0);
if( l < size )
dst[l] = src[map[l]];
}
output:
h0h0 ~/LBM/3d $ ./lbm
Error on 292/446880 elements.
h0h0 ~/LBM/3d $ ./lbm
Error on 366/446880 elements.
h0h0 ~/LBM/3d $ ./lbm
Error on 0/446880 elements.
h0h0 ~/LBM/3d $ ./lbm
Error on 0/446880 elements.
h0h0 ~/LBM/3d $ ./lbm
Error on 420/446880 elements.
Help?