Reordering a vector reordering a padded vector fails sporadically

Update: This works flawlessly when using single precision.

I have a kernel, which basically amounts to rearranging an array of size n.

In order to allow user-specified work-group sizes, I increase the global work size

to n + pad_length.

However, for some values of pad_length this sometimes fail.

Below I have included the most basic example that reproduce this behavior.

Host code(my code starts on line 103):

#include <stdio.h>

#include <sys/stat.h>

#include "CL/cl.h"

#define alloca __builtin_alloca

void checkErr(cl_int err, const char *name) 

{

    if (err != CL_SUCCESS) {

        fprintf(stderr, "ERROR: %s (%d )\n", name, err);

        exit(EXIT_FAILURE);

    }

}

int main(int argc, char** argv)

{

    cl_int errNum;

    cl_uint numPlatforms;

    cl_uint numDevices;

    cl_platform_id *platformIDs;

    cl_device_id *deviceIDs;

    cl_context context = NULL;

    cl_command_queue queue;

    cl_program program; 

cl_kernel rearrange_kernel;

cl_mem dest_device;

    cl_mem src_device;

    cl_mem map_device;

errNum = clGetPlatformIDs(0, NULL, &numPlatforms);

    checkErr((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

platformIDs = (cl_platform_id *) alloca(sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);

    checkErr((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

deviceIDs = NULL;

for(cl_uint i = 0; i < numPlatforms; ++i)  {

        errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 

                                0, NULL, &numDevices);

if(errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) {

            checkErr(errNum, "clGetDeviceIDs");

        } else if(numDevices > 0) {

            deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);

            errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);

            checkErr(errNum, "clGetDeviceIDs");

            break;

        }

    }

if(deviceIDs == NULL) {

        printf("No GPU device was found.\n");

        exit(-1);

    }

cl_context_properties contextProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[0], 0};

context = clCreateContext(contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum);

    checkErr(errNum, "clCreateContext");

struct stat st;

    const char* filename = "kernel.cl";

    FILE  *f = fopen(filename, "r");

checkErr(f ? CL_SUCCESS : -1, "Reading kernels.cl");

// kernel source code length

    stat(filename, &st);            

    size_t src_sz = st.st_size; 

    char *src_str;

src_str = (char *) malloc(src_sz);

    src_sz = fread(src_str, 1, src_sz, f); 

fclose(f);

program = clCreateProgramWithSource(context, 1, (const char **) &src_str, &src_sz, &errNum);

    checkErr(errNum, "clCreateProgramWithSource");

errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);

    checkErr(errNum, "clBuildProgram"); 

cl_build_status build_status;

    errNum = clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_STATUS,

                                   sizeof(cl_build_status), &build_status, NULL);

    checkErr(errNum, "clGetProgramBuildInfo");

size_t ret_val_size;

    errNum = clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);

    checkErr(errNum, "clGetProgramBuildInfo");

rearrange_kernel = clCreateKernel(program, "rearrange", &errNum);

    checkErr(errNum, "clCreateKernel, rearrange");

queue = clCreateCommandQueue(context, deviceIDs[0], CL_QUEUE_PROFILING_ENABLE, &errNum);

    checkErr(errNum, "clCreateCommandQueue");

// this is the original array size  

    const cl_uint n = 446880; 

// and this the padded one

    const cl_uint padded_n = 447488; 

int *mapping = malloc(n * sizeof(int)); 

double *dest = malloc(n * sizeof(double));

    double *source = malloc(n * sizeof(double));

    double *output = malloc(n * sizeof(double));

// define a mapping that reverses order

    // init. the source vector to 0,1,2,...,n-1

    // define impossible default values for output

    for(cl_uint i = 0, j = n-1; i < n; ++i, --j) {

        mapping[i] = j;

        source[i] = i;

        output[i] = -100;

    }

// allocate device memory

    dest_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(dest)");

src_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(source)");

map_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(int), NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(mapping)");

errNum =  clEnqueueWriteBuffer(queue, src_device, CL_TRUE, 0, n*sizeof(double), source, 0, NULL, NULL);

    checkErr(errNum, "clEnqueueWriteBuffer(source)"); 

errNum =  clEnqueueWriteBuffer(queue, map_device, CL_TRUE, 0, n*sizeof(int), mapping, 0, NULL, NULL);

    checkErr(errNum, "clEnqueueWriteBuffer(mapping)"); 

// set the parameters for the rearrange operator

    errNum = clSetKernelArg(rearrange_kernel, 0, sizeof(cl_mem), &dest_device);

    errNum |= clSetKernelArg(rearrange_kernel, 1, sizeof(cl_mem), &src_device);

    errNum |= clSetKernelArg(rearrange_kernel, 2, sizeof(cl_mem), &map_device);

    errNum |= clSetKernelArg(rearrange_kernel, 3, sizeof(cl_uint), (void *) &n);

    checkErr(errNum, "clSetKernelArg(rearrange)");

// specify the work group size/dim

    const size_t work_dim = 3;

    const size_t global_work_size_rearrange[] = {padded_n, 1, 1};

// perform the reordering on the host

    for(int i = 0; i < padded_n; ++i) {

        if(i < n) {

            if(mapping[i] > n) {

                fprintf(stderr, "something is quite wrong.");

                exit(EXIT_FAILURE); 

            }

dest[i] = source[mapping[i]];

        }

    }

// just to be safe <img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/blarg.gif' class='bbc_emoticon' alt=':/' />

    clEnqueueBarrier(queue);

	errNum = clEnqueueNDRangeKernel(queue, rearrange_kernel, work_dim, NULL, global_work_size_rearrange, NULL, 0, NULL, NULL);

    checkErr(errNum, "rearrange");

clEnqueueBarrier(queue);

clEnqueueReadBuffer(queue, dest_device, CL_TRUE, 0, n*sizeof(double), output, 0, NULL, NULL);

clEnqueueBarrier(queue);

int errors = 0;

    for(int i = 0; i < n; ++i) {

        if( dest[i] != output[i] ) {

            ++errors;

            //fprintf(stderr, "%6g vs %8g [idx=%6d] \n", dest[i], output[i], i);

        }

    }

printf("Error on %d/%d elements.\n", errors, n);

	

    // Finalization

    clFlush(queue);

    clFinish(queue);

clReleaseKernel(rearrange_kernel);

clReleaseProgram(program);

clReleaseMemObject(dest_device);

    clReleaseMemObject(map_device);

    clReleaseMemObject(src_device);

free(dest);

    free(source);

	free(mapping);

}

Kernel code:

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void rearrange(__global double *dst, __global double *src, __global int *map, const uint size)

{

    size_t l = get_global_id(0); 

if( l < size )

	    dst[l] = src[map[l]];

}

output:

h0h0 ~/LBM/3d $ ./lbm

Error on 292/446880 elements.

h0h0 ~/LBM/3d $ ./lbm

Error on 366/446880 elements.

h0h0 ~/LBM/3d $ ./lbm

Error on 0/446880 elements.

h0h0 ~/LBM/3d $ ./lbm

Error on 0/446880 elements.

h0h0 ~/LBM/3d $ ./lbm

Error on 420/446880 elements.

Help?

You misunderstand what clEnqueueBarrier method does.

Put the code which compares dest and output AFTER clFinish.

Thanks, that was bad mistake.

This did not solve the problem.

I’m using a Tesla M1060 card with CUDA 4.0.

That’s strange. Try sleeping for, for example, 1 second after clFinish and before the comparison. Let’s split the problem in half.

And attach the modified code, please.

Still no luck.

#include <stdio.h>

#include <sys/stat.h>

#include <unistd.h>

#include "CL/cl.h"

#define alloca __builtin_alloca

void checkErr(cl_int err, const char *name) 

{

    if (err != CL_SUCCESS) {

        fprintf(stderr, "ERROR: %s (%d )\n", name, err);

        exit(EXIT_FAILURE);

    }

}

int main(int argc, char** argv)

{

    cl_int errNum;

    cl_uint numPlatforms;

    cl_uint numDevices;

    cl_platform_id *platformIDs;

    cl_device_id *deviceIDs;

    cl_context context = NULL;

    cl_command_queue queue;

    cl_program program; 

cl_kernel rearrange_kernel;

cl_mem dest_device;

    cl_mem src_device;

    cl_mem map_device;

errNum = clGetPlatformIDs(0, NULL, &numPlatforms);

    checkErr((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

platformIDs = (cl_platform_id *) alloca(sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);

    checkErr((errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

deviceIDs = NULL;

for(cl_uint i = 0; i < numPlatforms; ++i)  {

        errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 

                                0, NULL, &numDevices);

if(errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) {

            checkErr(errNum, "clGetDeviceIDs");

        } else if(numDevices > 0) {

            deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);

            errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);

            checkErr(errNum, "clGetDeviceIDs");

            break;

        }

    }

if(deviceIDs == NULL) {

        printf("No GPU device was found.\n");

        exit(-1);

    }

cl_context_properties contextProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[0], 0};

context = clCreateContext(contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum);

    checkErr(errNum, "clCreateContext");

struct stat st;

    const char* filename = "kernel.cl";

    FILE  *f = fopen(filename, "r");

checkErr(f ? CL_SUCCESS : -1, "Reading kernels.cl");

// kernel source code length

    stat(filename, &st);            

    size_t src_sz = st.st_size; 

    char *src_str;

src_str = (char *) malloc(src_sz);

    src_sz = fread(src_str, 1, src_sz, f); 

fclose(f);

program = clCreateProgramWithSource(context, 1, (const char **) &src_str, &src_sz, &errNum);

    checkErr(errNum, "clCreateProgramWithSource");

errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);

    checkErr(errNum, "clBuildProgram"); 

cl_build_status build_status;

    errNum = clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_STATUS,

                                   sizeof(cl_build_status), &build_status, NULL);

    checkErr(errNum, "clGetProgramBuildInfo");

size_t ret_val_size;

    errNum = clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);

    checkErr(errNum, "clGetProgramBuildInfo");

rearrange_kernel = clCreateKernel(program, "rearrange", &errNum);

    checkErr(errNum, "clCreateKernel, rearrange");

queue = clCreateCommandQueue(context, deviceIDs[0], CL_QUEUE_PROFILING_ENABLE, &errNum);

    checkErr(errNum, "clCreateCommandQueue");

// this is the original array size  

    const cl_uint n = 446880; 

// and this the padded one

    const cl_uint padded_n = 447488; 

int *mapping = malloc(n * sizeof(int)); 

double *dest = malloc(n * sizeof(double));

    double *source = malloc(n * sizeof(double));

    double *output = malloc(n * sizeof(double));

// define a mapping that reverses order

    // init. the source vector to 0,1,2,...,n-1

    // define impossible default values for output

    for(cl_uint i = 0, j = n-1; i < n; ++i, --j) {

        mapping[i] = j;

        source[i] = i;

        output[i] = -100;

    }

// allocate device memory

    dest_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(dest)");

src_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(source)");

map_device = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(int), NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(mapping)");

errNum =  clEnqueueWriteBuffer(queue, src_device, CL_TRUE, 0, n*sizeof(double), source, 0, NULL, NULL);

    checkErr(errNum, "clEnqueueWriteBuffer(source)"); 

errNum =  clEnqueueWriteBuffer(queue, map_device, CL_TRUE, 0, n*sizeof(int), mapping, 0, NULL, NULL);

    checkErr(errNum, "clEnqueueWriteBuffer(mapping)"); 

// set the parameters for the rearrange operator

    errNum = clSetKernelArg(rearrange_kernel, 0, sizeof(cl_mem), &dest_device);

    errNum |= clSetKernelArg(rearrange_kernel, 1, sizeof(cl_mem), &src_device);

    errNum |= clSetKernelArg(rearrange_kernel, 2, sizeof(cl_mem), &map_device);

    errNum |= clSetKernelArg(rearrange_kernel, 3, sizeof(cl_uint), (void *) &n);

    checkErr(errNum, "clSetKernelArg(rearrange)");

// specify the work group size/dim

    const size_t work_dim = 3;

    const size_t global_work_size_rearrange[] = {padded_n, 1, 1};

// perform the reordering on the host

    for(int i = 0; i < padded_n; ++i) {

        if(i < n) {

            if(mapping[i] > n) {

                fprintf(stderr, "something is quite wrong.");

                exit(EXIT_FAILURE); 

            }

dest[i] = source[mapping[i]];

        }

    }

errNum = clEnqueueNDRangeKernel(queue, rearrange_kernel, work_dim, NULL, global_work_size_rearrange, NULL, 0, NULL, NULL);

    checkErr(errNum, "rearrange");

errNum = clEnqueueReadBuffer(queue, dest_device, CL_TRUE, 0, n*sizeof(double), output, 0, NULL, NULL);

    checkErr(errNum, "result");

clFlush(queue);

    clFinish(queue);

sleep(1);

int errors = 0;

    for(int i = 0; i < n; ++i) {

        if( dest[i] != output[i] ) {

            ++errors;

            //fprintf(stderr, "%6g vs %8g [idx=%6d] \n", dest[i], output[i], i);

        }

    }

printf("Error on %d/%d elements.\n", errors, n);

clReleaseKernel(rearrange_kernel);

    clReleaseProgram(program);

clReleaseMemObject(dest_device);

    clReleaseMemObject(map_device);

    clReleaseMemObject(src_device);

free(dest);

    free(source);

    free(mapping);

}

output:

$ gcc -lOpenCL test.c -std=c99 -o t

$ ./t

Error on 0/446880 elements.

$ ./t

Error on 528/446880 elements.

Did you look into what are differences? Positions and values? What are these wrong values in “output”? Maybe it makes sense to write output buffer to the device with those -100, so that we at least can see whether the values were overwritten by the kernel.

I now copied the output array to dest_source (this was indeed the original purpose of initializing them with the value -100).

Part of the output:

138144 vs   127152 [idx=308735] 

135073 vs   129969 [idx=311806] 

135072 vs   129968 [idx=311807] 

134577 vs   138145 [idx=312302] 

134576 vs   138144 [idx=312303] 

133809 vs   135073 [idx=313070] 

133808 vs   135072 [idx=313071] 

130977 vs   134577 [idx=315902] 

130976 vs   134576 [idx=315903] 

126369 vs   115361 [idx=320510] 

126368 vs   115360 [idx=320511] 

126129 vs   125873 [idx=320750] 

126128 vs   125872 [idx=320751] 

124849 vs   128433 [idx=322030] 

124848 vs   128432 [idx=322031] 

124065 vs   121009 [idx=322814] 

124064 vs   121008 [idx=322815] 

121249 vs   110257 [idx=325630] 

121248 vs   110256 [idx=325631] 

116913 vs   113057 [idx=329966] 

116912 vs   113056 [idx=329967] 

115361 vs   116913 [idx=331518] 

115360 vs   116912 [idx=331519] 

113057 vs   110513 [idx=333822] 

113056 vs   110512 [idx=333823] 

112817 vs   102817 [idx=334062] 

112816 vs   102816 [idx=334063] 

105889 vs    94881 [idx=340990] 

105888 vs    94880 [idx=340991] 

105377 vs   111009 [idx=341502] 

105376 vs   111008 [idx=341503] 

104609 vs   102577 [idx=342270] 

104608 vs   102576 [idx=342271] 

 95409 vs    95137 [idx=351470] 

 95408 vs    95136 [idx=351471] 

 95137 vs    93873 [idx=351742] 

 95136 vs    93872 [idx=351743] 

 91809 vs    85153 [idx=355070] 

 91808 vs    85152 [idx=355071] 

 87201 vs    81569 [idx=359678] 

 87200 vs    81568 [idx=359679] 

 85665 vs    83377 [idx=361214] 

 85664 vs    83376 [idx=361215] 

 83377 vs    87201 [idx=363502] 

 83376 vs    87200 [idx=363503] 

 83121 vs    85665 [idx=363758] 

 83120 vs    85664 [idx=363759] 

 79521 vs    71089 [idx=367358] 

 79520 vs    71088 [idx=367359] 

 78001 vs    80801 [idx=368878] 

 78000 vs    80800 [idx=368879] 

 71601 vs    60337 [idx=375278] 

 71600 vs    60336 [idx=375279] 

 71329 vs    62641 [idx=375550] 

 71328 vs    62640 [idx=375551] 

 71089 vs    69537 [idx=375790] 

 71088 vs    69536 [idx=375791] 

 70321 vs    64417 [idx=376558] 

 70320 vs    64416 [idx=376559] 

 69537 vs    69041 [idx=377342] 

 69536 vs    69040 [idx=377343] 

 69041 vs    71329 [idx=377838] 

 69040 vs    71328 [idx=377839] 

 64417 vs    63649 [idx=382462] 

 64416 vs    63648 [idx=382463] 

 62641 vs    71601 [idx=384238] 

 62640 vs    71600 [idx=384239] 

 60337 vs    70321 [idx=386542] 

 60336 vs    70320 [idx=386543] 

 55969 vs    54689 [idx=390910] 

 55968 vs    54688 [idx=390911] 

 54689 vs    50337 [idx=392190] 

 54688 vs    50336 [idx=392191] 

 50337 vs    48545 [idx=396542] 

 50336 vs    48544 [idx=396543] 

 48545 vs    49825 [idx=398334] 

 48544 vs    49824 [idx=398335] 

 48033 vs    48289 [idx=398846] 

 48032 vs    48288 [idx=398847] 

 44705 vs    30385 [idx=402174] 

 44704 vs    30384 [idx=402175] 

 44465 vs    42913 [idx=402414] 

 44464 vs    42912 [idx=402415] 

 42913 vs    28321 [idx=403966] 

 42912 vs    28320 [idx=403967] 

 41137 vs    44705 [idx=405742] 

 41136 vs    44704 [idx=405743] 

 40113 vs    41137 [idx=406766] 

 40112 vs    41136 [idx=406767] 

 33201 vs    23473 [idx=413678] 

 33200 vs    23472 [idx=413679] 

 30385 vs    44465 [idx=416494] 

 30384 vs    44464 [idx=416495] 

 30113 vs    17585 [idx=416766] 

 30112 vs    17584 [idx=416767] 

 28321 vs    23729 [idx=418558] 

 28320 vs    23728 [idx=418559] 

 24241 vs    19873 [idx=422638] 

 24240 vs    19872 [idx=422639] 

 23729 vs    21425 [idx=423150] 

 23728 vs    21424 [idx=423151] 

 23473 vs    23457 [idx=423406] 

 23472 vs    23456 [idx=423407] 

 21425 vs    24241 [idx=425454] 

 21424 vs    24240 [idx=425455] 

 19873 vs    33201 [idx=427006] 

 19872 vs    33200 [idx=427007] 

 17313 vs    14257 [idx=429566] 

 17312 vs    14256 [idx=429567] 

 14769 vs    17313 [idx=432110] 

 14768 vs    17312 [idx=432111] 

 14257 vs    10673 [idx=432622] 

 14256 vs    10672 [idx=432623] 

 10673 vs     5809 [idx=436206] 

 10672 vs     5808 [idx=436207] 

  9889 vs     7073 [idx=436990] 

  9888 vs     7072 [idx=436991] 

  7073 vs     5537 [idx=439806] 

  7072 vs     5536 [idx=439807] 

  5809 vs     9889 [idx=441070] 

  5808 vs     9888 [idx=441071]

I should note that I tried the program multiple times on another identical GPU and there it seemed to work - too early to blame the hardware?

It looks weird indeed. What version of driver do you use?

Have you tried writing intermediate results to the output buffer? Write the global id of each thread to dst[l] to ensure that the correct number of threads a running. Writing map[i] to output[i] and compare with the expected values.