OpenCL in Fedora vs Windows Same code. Runs in Windows. Kernel error in Fedora

Hello, all!

I have created a code that runs fine in Windows; however, the same code running in the same machine and video card errors out when run in Fedora 11! How can this be possible? The command “clEnqueueNDRangeKernel” doesn’t return CL_SUCCESS in Fedora and the same code works in Windows! Is this a bug in the Linux implementation of OpenCL? This is a huge bummer!

Thanks.

Maybe it’s bug in the driver. Maybe it is in your code. Later is much more probable, but you could hardly hope for any kind of meaningful answer here without actually providing a minimal (this is very important!) version of your code showing the problem.

Hi, cgorac!

Yeah, I was going to upload relevant parts of the code, but since my problem is that the SAME code runs in one OS and doesn’t in the other, I thought it would be unnecessary. Everything before the command to enqueue the kernel works, but the result of that command isn’t CL_SUCESS when running in Fedora. Here are parts of the code that I hope make some sense.

Observation: the key variables are:

n := number of rows of matrix Ab

Ab := matrix (in vectorized form)

The C functions:

[codebox]

/* … */

cl_context context;

cl_program program;

cl_command_queue queue;

cl_kernel kernel;

/* … */

int initAcc( bool printDevStats, cl_command_queue* queue, cl_kernel* kernel )

{

// Kernel file name

const char* kernelFile = "./src/kernelName.cl";

// OpenCL error return values

cl_int err;

// Connect to a compute device

cl_device_id devices;

int gpu = 1;

err = clGetDeviceIDs( NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &devices, NULL );

if ( err != CL_SUCCESS )

{

    printf( "Error: Failed to create a device group!\n" );

    return EXIT_FAILURE;

}

// Print device stats

if ( printDevStats )

{

    printDeviceStats( devices );

}

// Create a compute context

context = clCreateContext( 0, 1, &devices, NULL, NULL, &err );

if ( !context )

{

    printf( "Error: Failed to create a compute context!\n" );

    return EXIT_FAILURE;

}

// Create a command queue

*queue = clCreateCommandQueue( context, devices, 0, &err );

if ( !queue )

{

    printf( "Error: Failed to create a command queue!\n" );

    return EXIT_FAILURE;

}

// Load the compute program from disk into a cstring buffer

char* source = loadProgramSource( kernelFile );

if( source == NULL )

{

    printf( "Error: Failed to load compute program from file!\n" );

    return EXIT_FAILURE;

}

// Create the compute program from the source buffer

program = clCreateProgramWithSource( context, 1, ( const char ** ) &source, NULL, &err );

if ( !program || err != CL_SUCCESS )

{

    printf( "Error: Failed to create compute program!\n" );

    return EXIT_FAILURE;

}

// Build the program executable

err = clBuildProgram( program, 0, NULL, NULL, NULL, NULL );

if ( err != CL_SUCCESS )

{

    size_t len;

    char buffer[ 2048 ];

printf( “Error: Failed to build program executable!\n” );

    clGetProgramBuildInfo( program, devices, CL_PROGRAM_BUILD_LOG, sizeof( buffer ), buffer, &len );

    printf( "%s\n", buffer );

    return EXIT_FAILURE;

}

// Create the compute kernel from within the program

*kernel = clCreateKernel( program, "myKernel", &err );

if ( !kernel || err != CL_SUCCESS )

{

    printf( "Error: Failed to create compute kernel!\n" );

    return EXIT_FAILURE;

}

return CL_SUCCESS;

}

int execAcc( size_t n, float* Ab, size_t subIndex, float pivot )

{

// OpenCL error return values

cl_int err;

// Timing variables

struct timeval beg;

    struct timeval end;

// Allocate memory and queue it to be written to the device

size_t bufferSizeAb = ( n * ( n + 1 ) ) * sizeof( float );

gettimeofday( &beg, NULL );

cl_mem Ab_mem = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSizeAb, NULL, NULL );

err = clEnqueueWriteBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, ( void* ) Ab, 0, NULL, NULL );

// Push the data out to device

clFinish( queue );

gettimeofday( &end, NULL );

// Set kernel arguments

err  = clSetKernelArg( kernel, 0, sizeof( size_t ), &n );

err |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), &Ab_mem );

err |= clSetKernelArg( kernel, 2, sizeof( size_t ), &subIndex );

err |= clSetKernelArg( kernel, 3, sizeof( float ), &pivot );

// Determine the global and local dimensions for the execution

    size_t global_work_size[] = { n + 1, n }, local_work_size = NULL;

// Queue up the kernels

err = CL_SUCCESS;

gettimeofday( &beg, NULL );

err |= clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL );

// Finish the calculation

clFinish( queue );

gettimeofday( &end, NULL );

if ( err != CL_SUCCESS )

    {

         printf( "Error: Failed to execute kernel!\n" );

         return EXIT_FAILURE;

    }

cl_enqueue = elapsedTime( beg, end );

// Read back the results that were computed on the device

gettimeofday( &beg, NULL );

err = clEnqueueReadBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, Ab, 0, NULL, NULL);

clFinish( queue );

gettimeofday( &end, NULL );

cl_read = elapsedTime( beg, end );

// Release memory objects

clReleaseMemObject( Ab_mem );

return CL_SUCCESS;

}

[/codebox]

The kernel:

[codebox]

__kernel void myKernel( size_t n, __global float* Ab, size_t subIndex, float pivot )

{

size_t i = get_global_id( 1 );

size_t j = get_global_id( 0 );

if ( ( i >= subIndex && i < n ) && ( j >= subIndex && j < n + 1 ) )

{

    Ab[ i * ( n + 1 ) + j ] -= Ab[ i * ( n + 1 ) + subIndex - 1 ] / pivot * Ab[ ( subIndex - 1 ) * ( n + 1 ) + j ];

}

}

[/codebox]

This is not complete program, thus it is not possible to run it; as mentioned above, your chances that someone take a look into this are much bigger if you provide complete and minimal example of the program that would make the problem to appear when program run… Still, following line from your code:

size_t global_work_size[] = { n + 1, n }, local_work_size = NULL;

seems rather suspicious to me. I think you’d want:

size_t global_work_size[] = { n + 1, n };

size_t* local_work_size = NULL;

here instead. So I’d try to changing this first; if that doesn’t help, I’d proceed with printing the error value eventually returned from clEnqueueNDRangeKernel() immediately after this call, and starting further examination out of this error value.

Hi, cgorac.

I made the modification you suggested and I still get an error in the same “clEnqueueNDRangeKernel” function. The numeric value is -30. Does it mean anything to you?

OK, you asked for a short example and you got it!

[codebox]

#include <stdio.h>

#include <stdlib.h>

#include <stdbool.h>

#include <sys/time.h>

#include “CL/cl.h”

int initAcc( bool printDevStats );

int execAcc( size_t n, float* Ab, size_t subIndex, float pivot );

void relAcc();

cl_context context;

cl_program program;

cl_command_queue queue;

cl_kernel kernel;

int main( void )

{

size_t n = 1000;

float* Ab = ( float * ) malloc( n * ( n + 1 ) * sizeof( float ) );

for ( size_t i = 0; i < n; ++i )

{

    for ( size_t j = 0; j < n + 1; ++j )

    {

        Ab[ i * ( n + 1 ) + j ] = ( ( float ) rand() / ( float ) RAND_MAX );

    }

}

if ( initAcc( true ) != CL_SUCCESS )

{

	return EXIT_FAILURE;

}

if ( execAcc( n, Ab, 1, Ab[ 0 ] ) != CL_SUCCESS )

{

    return EXIT_FAILURE;

}

relAcc();

return 0;

}

int initAcc( bool printDevStats )

{

// Kernel file name

 const char* kernelFile = "./src/kernelName.cl";

// OpenCL error return values

cl_int err;

// Connect to a compute device

cl_device_id devices;

int gpu = 1;

err = clGetDeviceIDs( NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &devices, NULL );

if ( err != CL_SUCCESS )

{

    printf( "Error: Failed to create a device group!\n" );

    return EXIT_FAILURE;

}

// Print device stats

if ( printDevStats )

{

    printDeviceStats( devices );

}

// Create a compute context

context = clCreateContext( 0, 1, &devices, NULL, NULL, &err );

if ( !context )

{

    printf( "Error: Failed to create a compute context!\n" );

    return EXIT_FAILURE;

}

// Create a command queue

queue = clCreateCommandQueue( context, devices, 0, &err );

if ( !queue )

{

    printf( "Error: Failed to create a command queue!\n" );

    return EXIT_FAILURE;

}

// Load the compute program from disk into a cstring buffer

char* source = loadProgramSource( kernelFile );

if( source == NULL )

{

    printf( "Error: Failed to load compute program from file!\n" );

    return EXIT_FAILURE;

}

// Create the compute program from the source buffer

program = clCreateProgramWithSource( context, 1, ( const char ** ) &source, NULL, &err );

if ( !program || err != CL_SUCCESS )

{

    printf( "Error: Failed to create compute program!\n" );

    return EXIT_FAILURE;

}

// Build the program executable

err = clBuildProgram( program, 0, NULL, NULL, NULL, NULL );

if ( err != CL_SUCCESS )

{

    size_t len;

    char buffer[ 2048 ];

printf( “Error: Failed to build program executable!\n” );

    clGetProgramBuildInfo( program, devices, CL_PROGRAM_BUILD_LOG, sizeof( buffer ), buffer, &len );

    printf( "%s\n", buffer );

    return EXIT_FAILURE;

}

// Create the compute kernel from within the program

kernel = clCreateKernel( program, "myKernel", &err );

if ( !kernel || err != CL_SUCCESS )

{

    printf( "Error: Failed to create compute kernel!\n" );

    return EXIT_FAILURE;

}

return CL_SUCCESS;

}

int fwdElimAccExec( size_t n, float* Ab, size_t subIndex, float pivot )

{

// OpenCL error return values

cl_int err;

// Timing variables

struct timeval beg;

struct timeval end;

double cl_alloc, cl_enqueue, cl_read;

// Allocate memory and queue it to be written to the device

size_t bufferSizeAb = ( n * ( n + 1 ) ) * sizeof( float );

gettimeofday( &beg, NULL );

cl_mem Ab_mem = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSizeAb, NULL, NULL );

err = clEnqueueWriteBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, ( void* ) Ab, 0, NULL, NULL );

// Push the data out to device

clFinish( queue );

gettimeofday( &end, NULL );

cl_alloc = elapsedTime( beg, end );

// Set kernel arguments

err  = clSetKernelArg( kernel, 0, sizeof( size_t ), &n );

err |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), &Ab_mem );

err |= clSetKernelArg( kernel, 2, sizeof( size_t ), &subIndex );

err |= clSetKernelArg( kernel, 3, sizeof( float ), &pivot );

// Determine the global and local dimensions for the execution

size_t global_work_size[] = { n + 1, n }, *local_work_size = NULL;

// Queue up the kernels

err = CL_SUCCESS;

gettimeofday( &beg, NULL );

err |= clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL );

printf( "err = %d\n", err );

// Finish the calculation

clFinish( queue );

gettimeofday( &end, NULL );

if ( err != CL_SUCCESS )

{

    printf( "Error: Failed to execute kernel!\n" );

    return EXIT_FAILURE;

}

cl_enqueue = elapsedTime( beg, end );

// Read back the results that were computed on the device

gettimeofday( &beg, NULL );

err = clEnqueueReadBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, Ab, 0, NULL, NULL);

clFinish( queue );

gettimeofday( &end, NULL );

cl_read = elapsedTime( beg, end );

// Release memory objects

clReleaseMemObject( Ab_mem );

return CL_SUCCESS;

}

void relAcc()

{

// Release OpenCL variables

clReleaseKernel( kernel );

clReleaseProgram( program );

clReleaseCommandQueue( queue );

clReleaseContext( context );

}

[/codebox]

The kernel was posted previously.

The output I get from that code is:

[i]Vendor: NVIDIA Corporation

Device Name: GeForce GTS 250

Profile: FULL_PROFILE

Supported Extensions: cl_khr_byte_addressable_store cl_nv_compiler_options cl_nv_device_attribute_query cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics

Local Mem Type (Local=1, Global=2): 1

Global Mem Size (MB): 511

Global Mem Cache Size (Bytes): 0

Max Mem Alloc Size (MB): 128

Clock Frequency (MHz): 1836

Vector type width for: char = 1

Vector type width for: short = 1

Vector type width for: int = 1

Vector type width for: long = 1

Vector type width for: float = 1

Vector type width for: double = 0

Max Work Group Size: 512

Max Work Item Dims: 140226387247107

Max Compute Units: 16

err = -30

Error: Failed to execute kernel![/i]

Thanks again!

The code you posted does not compile again:

foo.cpp: In function ‘int initAcc(bool)’:

foo.cpp:67: error: ‘printDeviceStats’ was not declared in this scope

foo.cpp:87: error: ‘loadProgramSource’ was not declared in this scope

foo.cpp: In function ‘int fwdElimAccExec(size_t, float*, size_t, float)’:

foo.cpp:147: error: ‘elapsedTime’ was not declared in this scope

So - still generally speaking only: you could always lookup error values in your CL/cl.h file. The value -30 means CL_INVALID_VALUE.

Sorry for that (I forgot to add auxiliary functions)! The code below should work:

[codebox]

#include <stdio.h>

#include <stdlib.h>

#include <stdbool.h>

#include “CL/cl.h”

int initAcc( bool printDevStats );

int execAcc( size_t n, float* Ab, size_t subIndex, float pivot );

void relAcc();

char* loadProgramSource( const char* filePath );

void printDeviceStats( cl_device_id device_id );

cl_context context;

cl_program program;

cl_command_queue queue;

cl_kernel kernel;

int main( void )

{

size_t n = 1000;

float* Ab = ( float * ) malloc( n * ( n + 1 ) * sizeof( float ) );

for ( size_t i = 0; i < n; ++i )

{

	for ( size_t j = 0; j < n + 1; ++j )

	{

		Ab[ i * ( n + 1 ) + j ] = ( ( float ) rand() / ( float ) RAND_MAX );

	}

}

if ( initAcc( true ) != CL_SUCCESS )

{

	return EXIT_FAILURE;

}

if ( execAcc( n, Ab, 1, Ab[ 0 ] ) != CL_SUCCESS )

{

	return EXIT_FAILURE;

}

relAcc();

return 0;

}

int initAcc( bool printDevStats )

{

// Kernel file name

const char* kernelFile = "./src/myKernel.cl";

// OpenCL error return values

cl_int err;

// Connect to a compute device

cl_device_id devices;

int gpu = 1;

err = clGetDeviceIDs( NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &devices, NULL );

if ( err != CL_SUCCESS )

{

	printf( "Error: Failed to create a device group!\n" );

	return EXIT_FAILURE;

}

// Print device stats

if ( printDevStats )

{

	printDeviceStats( devices );

}

// Create a compute context

context = clCreateContext( 0, 1, &devices, NULL, NULL, &err );

if ( !context )

{

	printf( "Error: Failed to create a compute context!\n" );

	return EXIT_FAILURE;

}

// Create a command queue

queue = clCreateCommandQueue( context, devices, 0, &err );

if ( !queue )

{

	printf( "Error: Failed to create a command queue!\n" );

	return EXIT_FAILURE;

}

// Load the compute program from disk into a cstring buffer

char* source = loadProgramSource( kernelFile );

if( source == NULL )

{

	printf( "Error: Failed to load compute program from file!\n" );

	return EXIT_FAILURE;

}

// Create the compute program from the source buffer

program = clCreateProgramWithSource( context, 1, ( const char ** ) &source, NULL, &err );

if ( !program || err != CL_SUCCESS )

{

	printf( "Error: Failed to create compute program!\n" );

	return EXIT_FAILURE;

}

// Build the program executable

err = clBuildProgram( program, 0, NULL, NULL, NULL, NULL );

if ( err != CL_SUCCESS )

{

	size_t len;

	char buffer[ 2048 ];

	printf( "Error: Failed to build program executable!\n" );

	clGetProgramBuildInfo( program, devices, CL_PROGRAM_BUILD_LOG, sizeof( buffer ), buffer, &len );

	printf( "%s\n", buffer );

	return EXIT_FAILURE;

}

// Create the compute kernel from within the program

kernel = clCreateKernel( program, "myKernel", &err );

if ( !kernel || err != CL_SUCCESS )

{

	printf( "Error: Failed to create compute kernel!\n" );

	return EXIT_FAILURE;

}

return CL_SUCCESS;

}

int execAcc( size_t n, float* Ab, size_t subIndex, float pivot )

{

// OpenCL error return values

cl_int err;

// Allocate memory and queue it to be written to the device

size_t bufferSizeAb = ( n * ( n + 1 ) ) * sizeof( float );

cl_mem Ab_mem = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSizeAb, NULL, NULL );

err = clEnqueueWriteBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, ( void* ) Ab, 0, NULL, NULL );

// Push the data out to device

clFinish( queue );

// Set kernel arguments

err = clSetKernelArg( kernel, 0, sizeof( size_t ), &n );

err |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), &Ab_mem );

err |= clSetKernelArg( kernel, 2, sizeof( size_t ), &subIndex );

err |= clSetKernelArg( kernel, 3, sizeof( float ), &pivot );

// Determine the global and local dimensions for the execution

size_t global_work_size[] = { n + 1, n }, *local_work_size = NULL;

// Queue up the kernels

err = CL_SUCCESS;

err |= clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL );

printf( "err = %d\n", err );

// Finish the calculation

clFinish( queue );

if ( err != CL_SUCCESS )

{

	printf( "Error: Failed to execute kernel!\n" );

	return EXIT_FAILURE;

}

// Read back the results that were computed on the device

err = clEnqueueReadBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, Ab, 0, NULL, NULL);

clFinish( queue );

// Release memory objects

clReleaseMemObject( Ab_mem );

return CL_SUCCESS;

}

void relAcc()

{

// Release OpenCL variables

clReleaseKernel( kernel );

clReleaseProgram( program );

clReleaseCommandQueue( queue );

clReleaseContext( context );

}

char* loadProgramSource( const char* filePath )

{

FILE* inFile = fopen( filePath, "rt" );

if ( inFile == NULL )

{

	return NULL;

}

else

{

	long fileSize;

	fseek( inFile, 0, SEEK_END );

	fileSize = ftell( inFile );

	fseek( inFile, 0, SEEK_SET );

	char* source = ( char* ) malloc( fileSize + 1 );

	fread( source, sizeof( char ), fileSize, inFile );

	source[ fileSize ] = '\0';

	fclose( inFile );

	return source;

}

}

void printDeviceStats( cl_device_id device_id )

{

int err;

size_t returned_size;

// Report the device vendor and device name

cl_char vendor_name[ 1024 ]			= { 0 };

cl_char device_name[ 1024 ]			= { 0 };

cl_char device_profile[ 1024 ]		= { 0 };

cl_char device_extensions[ 1024 ]	= { 0 };

cl_device_local_mem_type local_mem_type;

cl_ulong global_mem_size;

cl_ulong global_mem_cache_size;

cl_ulong max_mem_alloc_size;

cl_uint clock_frequency;

cl_uint vector_width;

cl_uint max_compute_units;

size_t max_work_item_dims;

size_t max_work_group_size;

size_t max_work_item_sizes[ 3 ];

cl_uint vector_types[] = { CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,

						   CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,

						   CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE };

char* vector_type_names[] = { "char", "short", "int", "long", "float", "double" };

err = clGetDeviceInfo( device_id, CL_DEVICE_VENDOR, sizeof( vendor_name ), vendor_name, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_NAME, sizeof( device_name ), device_name, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_PROFILE, sizeof( device_profile ), device_profile, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_EXTENSIONS, sizeof( device_extensions ), device_extensions, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_LOCAL_MEM_TYPE, sizeof( local_mem_type ), &local_mem_type, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( global_mem_size ), &global_mem_size, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof( global_mem_cache_size ), &global_mem_cache_size, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( max_mem_alloc_size ), &max_mem_alloc_size, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof( clock_frequency ), &clock_frequency, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( max_work_group_size ), &max_work_group_size, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( max_work_item_dims ), &max_work_item_dims, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( max_work_item_sizes ), max_work_item_sizes, &returned_size );

err|= clGetDeviceInfo( device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof( max_compute_units ), &max_compute_units, &returned_size );

printf( "Vendor: %s\n", vendor_name );

printf( "Device Name: %s\n", device_name );

printf( "Profile: %s\n", device_profile );

printf( "Supported Extensions: %s\n\n", device_extensions );

printf( "Local Mem Type (Local=1, Global=2): %i\n", ( int ) local_mem_type );

printf( "Global Mem Size (MB): %d\n", ( int ) global_mem_size / ( 1024 * 1024 ) );

printf( "Global Mem Cache Size (Bytes): %d\n",( int ) global_mem_cache_size );

printf( "Max Mem Alloc Size (MB): %ld\n", ( long int ) max_mem_alloc_size / ( 1024 * 1024 ) );

printf( "Clock Frequency (MHz): %d\n\n", clock_frequency );

for( size_t i = 0; i < 6; i++ )

{

	err|= clGetDeviceInfo( device_id, vector_types[ i ], sizeof( clock_frequency ), &vector_width, &returned_size );

	printf( "Vector type width for: %s = %i\n", vector_type_names[ i ], vector_width );

}

printf( "\nMax Work Group Size: %lu\n" ,max_work_group_size );

printf( "Max Work Item Dims: %lu\n", max_work_item_dims );

// for( size_t i = 0; i < max_work_item_dims; i++ )

// printf( “Max Work Items in Dim %lu: %lu\n”, ( long unsigned ) ( i + 1 ), ( long unsigned ) max_work_item_sizes[ i ] );

printf( "Max Compute Units: %i\n", max_compute_units );

printf( "\n" );

}

[/codebox]

Honestly, I have no idea why I get err = -30 in Fedora and err = 0 in Windows. I just ran the code above in both OSes and that’s what I got! Why this discrepancy? I can’t think of any “invalid value” in my code and the fact that it doesn’t run in Fedora puzzles me even more!

Thanks!

When I run your code above, I got -52 on my machine, which is CL_INVALID_KERNEL_ARGS. Then I just removed some un-needed cruft from your code, ending up with kernel (saved say in foo.cl):

__kernel void

foo(int n, __global float *Ab, int index)

{

	int			 i = get_global_id(0);

	int			 j = get_global_id(1);

	if (i == index || i > n || j == index || j > n + 1)

	return;

	float		   mul =

	Ab[i * (n + 1) + index] / Ab[index * (n + 1) + index];

	Ab[i * (n + 1) + j] -= mul * Ab[index * (n + 1) + j];

}

and code (saved say in foo.c):

#include <assert.h>

#include <math.h>

#include <stdio.h>

#include <stdlib.h>

#include <CL/cl.h>

#define KERNEL_FILE "foo.cl"

#define KERNEL_NAME "foo"

#define USE_GPU 1

#define N 64

#define INDEX 42

static void	 initAcc(void);

static void	 execAcc(int n, float *Ab, int index);

static void	 relAcc(void);

static char	*loadProgramSource(const char *filePath);

static cl_context context;

static cl_program program;

static cl_command_queue queue;

static cl_kernel kernel;

int

main(void)

{

	float		  *Ab = (float *) malloc(N * (N + 1) * sizeof(float));

	assert(Ab != NULL);

	for (int i = 0; i < N; ++i)

	for (int j = 0; j < N + 1; ++j)

		Ab[i * (N + 1) + j] = (i + 1) * (j + 1);

	

	initAcc();

	execAcc(N, Ab, INDEX);

	relAcc();

	for (int i = 0; i < N; ++i) {

		if (i == INDEX)

			continue;

	for (int j = 0; j < N + 1; ++j) {

			if (j == INDEX)

				continue;

			assert(fabs(Ab[i * (N + 1) + j]) < 1e-3);

		}

	}

	return 0;

}

void

initAcc(void)

{

	cl_int		  err;

	cl_device_id	devices;

#ifdef USE_GPU

	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);

#else

	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &devices, NULL);

#endif

	assert(err == CL_SUCCESS);

	context = clCreateContext(NULL, 1, &devices, NULL, NULL, &err);

	assert(err == CL_SUCCESS);

	queue = clCreateCommandQueue(context, devices, 0, &err);

	assert(err == CL_SUCCESS);

	char		   *source = loadProgramSource(KERNEL_FILE);

	assert(source != NULL);

	program =

	clCreateProgramWithSource(context, 1, (const char **) &source,

				  NULL, &err);

	assert(err == CL_SUCCESS);

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	assert(err == CL_SUCCESS);

	kernel = clCreateKernel(program, KERNEL_NAME, &err);

	assert(err == CL_SUCCESS);

}

void

execAcc(int n, float *Ab, int index)

{

	cl_int		  err;

	int			 size = (n * (n + 1)) * sizeof(float);

	cl_mem		  Ab_d =

	clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);

	assert(err == CL_SUCCESS);

	err =

	clEnqueueWriteBuffer(queue, Ab_d, CL_TRUE, 0, size, (void *) Ab, 0,

				 NULL, NULL);

	assert(err == CL_SUCCESS);

	err = clSetKernelArg(kernel, 0, sizeof(int), &n);

	assert(err == CL_SUCCESS);

	err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &Ab_d);

	assert(err == CL_SUCCESS);

	err = clSetKernelArg(kernel, 2, sizeof(int), &index);

	assert(err == CL_SUCCESS);

	size_t		  global_work_size[] = {

	n, n + 1

	};

	err =

	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size,

				   NULL, 0, NULL, NULL);

	assert(err == CL_SUCCESS);

	err =

	clEnqueueReadBuffer(queue, Ab_d, CL_TRUE, 0, size, Ab, 0, NULL,

				NULL);

	assert(err == CL_SUCCESS);

	err = clReleaseMemObject(Ab_d);

	assert(err == CL_SUCCESS);

}

void

relAcc()

{

	cl_int		  err;

	err = clReleaseKernel(kernel);

	assert(err == CL_SUCCESS);

	err = clReleaseProgram(program);

	assert(err == CL_SUCCESS);

	err = clReleaseCommandQueue(queue);

	assert(err == CL_SUCCESS);

	err = clReleaseContext(context);

	assert(err == CL_SUCCESS);

}

char		   *

loadProgramSource(const char *path)

{

	FILE		   *file = fopen(path, "r");

	assert(file != NULL);

	fseek(file, 0, SEEK_END);

	long			size = ftell(file);

	fseek(file, 0, SEEK_SET);

	char		   *source = (char *) malloc(size + 1);

	assert(source != NULL);

	assert(fread(source, sizeof(char), size, file) == size);

	source = 0;

	fclose(file);

	return source;

}

I compile with simple:

gcc -o foo -std=c99 foo.c -lOpenCL

and then it executed on my machine (64-bit Linux, SDK 3.0-beta), without any errors reported.

I hope I got your intention right - that what you want to do is to have given matrix transformed using given element as pivoting element, except that you actually don’t want values in the column of pivoting element set to 0, but kept unchanged instead. Note that above code is result of two very quick and dirty passes over your code, so probably there are some bugs, and also doing low-level linear algebra stuff this way is of course all wrong. But - at least it should build, and pass simple test in the main() function, so I hope it could be usable for you to start with. As for issues with the code you posted initially - I’m sorry that I haven’t pointed them specifically, as I just changed it in big chunks, but you may wish to compare the version above with your initial version, and eventually spot some of these issues by yourself.