Hi, cgorac.
I made the modification you suggested and I still get an error in the same “clEnqueueNDRangeKernel” function. The numeric value is -30. Does it mean anything to you?
OK, you asked for a short example and you got it!
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/time.h>
#include “CL/cl.h”
int initAcc( bool printDevStats );
int execAcc( size_t n, float* Ab, size_t subIndex, float pivot );
void relAcc();
cl_context context;
cl_program program;
cl_command_queue queue;
cl_kernel kernel;
/*
 * Fills an n x (n + 1) matrix with random values in [0, 1], initializes
 * OpenCL, runs the kernel once and releases everything again.
 *
 * Returns 0 on success and EXIT_FAILURE if the allocation, the OpenCL
 * initialization or the kernel execution fails.
 */
int main( void )
{
    size_t n = 1000;
    float* Ab = malloc( n * ( n + 1 ) * sizeof *Ab );
    if ( Ab == NULL )
    {
        printf( "Error: Failed to allocate host matrix!\n" );
        return EXIT_FAILURE;
    }

    for ( size_t i = 0; i < n; ++i )
    {
        for ( size_t j = 0; j < n + 1; ++j )
        {
            Ab[ i * ( n + 1 ) + j ] = ( ( float ) rand() / ( float ) RAND_MAX );
        }
    }

    if ( initAcc( true ) != CL_SUCCESS )
    {
        // initAcc may have failed part-way through, so the OpenCL globals
        // are not known to be valid here; only the host matrix is released.
        free( Ab );
        return EXIT_FAILURE;
    }

    int status = execAcc( n, Ab, 1, Ab[ 0 ] );

    // Release the OpenCL objects and the host matrix on both the success
    // and the failure path.
    relAcc();
    free( Ab );

    return ( status == CL_SUCCESS ) ? 0 : EXIT_FAILURE;
}
/*
 * Sets up the file-scope OpenCL state: picks one device, creates the context
 * and command queue, loads and builds the program from kernelFile and creates
 * the kernel named "myKernel".
 *
 * printDevStats: when true, printDeviceStats() is called on the chosen device.
 *
 * Returns CL_SUCCESS on success and EXIT_FAILURE after printing a message on
 * any failure. Objects created before the failing step are not released here.
 */
int initAcc( bool printDevStats )
{
    // Kernel file name
    const char* kernelFile = "./src/kernelName.cl";
    // OpenCL error return values
    cl_int err;

    // Connect to a compute device
    cl_device_id devices;
    int gpu = 1;
    // NOTE(review): passing a NULL platform is implementation-defined in the
    // OpenCL specification; consider querying clGetPlatformIDs first.
    err = clGetDeviceIDs( NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &devices, NULL );
    if ( err != CL_SUCCESS )
    {
        printf( "Error: Failed to create a device group!\n" );
        return EXIT_FAILURE;
    }

    // Print device stats
    if ( printDevStats )
    {
        printDeviceStats( devices );
    }

    // Create a compute context
    context = clCreateContext( 0, 1, &devices, NULL, NULL, &err );
    if ( !context || err != CL_SUCCESS )
    {
        printf( "Error: Failed to create a compute context!\n" );
        return EXIT_FAILURE;
    }

    // Create a command queue
    queue = clCreateCommandQueue( context, devices, 0, &err );
    if ( !queue || err != CL_SUCCESS )
    {
        printf( "Error: Failed to create a command queue!\n" );
        return EXIT_FAILURE;
    }

    // Load the compute program from disk into a cstring buffer
    // NOTE(review): loadProgramSource is not visible here; if it returns
    // heap memory, the buffer should be freed once the program is created.
    char* source = loadProgramSource( kernelFile );
    if ( source == NULL )
    {
        printf( "Error: Failed to load compute program from file!\n" );
        return EXIT_FAILURE;
    }

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource( context, 1, ( const char ** ) &source, NULL, &err );
    if ( !program || err != CL_SUCCESS )
    {
        printf( "Error: Failed to create compute program!\n" );
        return EXIT_FAILURE;
    }

    // Build the program executable
    err = clBuildProgram( program, 0, NULL, NULL, NULL, NULL );
    if ( err != CL_SUCCESS )
    {
        size_t len = 0;
        printf( "Error: Failed to build program executable!\n" );

        // Ask for the log size first so the whole log fits and nothing
        // uninitialized is printed if the query itself fails.
        if ( clGetProgramBuildInfo( program, devices, CL_PROGRAM_BUILD_LOG, 0, NULL, &len ) == CL_SUCCESS
             && len > 0 )
        {
            char* buildLog = malloc( len + 1 );
            if ( buildLog != NULL )
            {
                if ( clGetProgramBuildInfo( program, devices, CL_PROGRAM_BUILD_LOG, len, buildLog, NULL ) == CL_SUCCESS )
                {
                    buildLog[ len ] = '\0';
                    printf( "%s\n", buildLog );
                }
                free( buildLog );
            }
        }
        return EXIT_FAILURE;
    }

    // Create the compute kernel from within the program
    kernel = clCreateKernel( program, "myKernel", &err );
    if ( !kernel || err != CL_SUCCESS )
    {
        printf( "Error: Failed to create compute kernel!\n" );
        return EXIT_FAILURE;
    }

    return CL_SUCCESS;
}
/*
 * Copies the n x (n + 1) matrix Ab to the device, runs the global kernel over
 * an (n + 1) x n index space, and reads the matrix back into Ab.
 *
 * n:        matrix dimension; Ab holds n * (n + 1) floats.
 * Ab:       host matrix, overwritten with the device result on success.
 * subIndex: passed to the kernel as argument 2.
 * pivot:    passed to the kernel as argument 3.
 *
 * Returns CL_SUCCESS on success. On any OpenCL failure a message with the
 * failing status is printed, the device buffer is released and EXIT_FAILURE
 * is returned.
 */
int fwdElimAccExec( size_t n, float* Ab, size_t subIndex, float pivot )
{
    // OpenCL error return values
    cl_int err;
    int status = EXIT_FAILURE;
    // Timing variables
    struct timeval beg;
    struct timeval end;
    double cl_alloc, cl_enqueue, cl_read;
    // Size of the device buffer and the execution dimensions
    size_t bufferSizeAb = ( n * ( n + 1 ) ) * sizeof( float );
    size_t global_work_size[] = { n + 1, n }, *local_work_size = NULL;

    // Allocate memory and queue it to be written to the device
    gettimeofday( &beg, NULL );
    cl_mem Ab_mem = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSizeAb, NULL, &err );
    if ( !Ab_mem || err != CL_SUCCESS )
    {
        printf( "Error: Failed to allocate device buffer! (err = %d)\n", err );
        return EXIT_FAILURE;
    }
    err = clEnqueueWriteBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, ( void* ) Ab, 0, NULL, NULL );
    if ( err != CL_SUCCESS )
    {
        printf( "Error: Failed to write to device buffer! (err = %d)\n", err );
        goto cleanup;
    }
    // Push the data out to device
    clFinish( queue );
    gettimeofday( &end, NULL );
    cl_alloc = elapsedTime( beg, end );

    // Set kernel arguments. Each status is checked on its own: OR-ing
    // negative error codes together produces a meaningless value, and the
    // combined status used to be discarded before the kernel was enqueued.
    // NOTE(review): the kernel source is not visible here. OpenCL C does not
    // allow size_t kernel parameters, so confirm that arguments 0 and 2 have
    // the same byte size as the kernel's declared parameter types.
    err = clSetKernelArg( kernel, 0, sizeof( size_t ), &n );
    if ( err == CL_SUCCESS )
    {
        err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), &Ab_mem );
    }
    if ( err == CL_SUCCESS )
    {
        err = clSetKernelArg( kernel, 2, sizeof( size_t ), &subIndex );
    }
    if ( err == CL_SUCCESS )
    {
        err = clSetKernelArg( kernel, 3, sizeof( float ), &pivot );
    }
    if ( err != CL_SUCCESS )
    {
        printf( "Error: Failed to set kernel arguments! (err = %d)\n", err );
        goto cleanup;
    }

    // Queue up the kernels
    gettimeofday( &beg, NULL );
    err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL );
    printf( "err = %d\n", err );
    // Finish the calculation
    clFinish( queue );
    gettimeofday( &end, NULL );
    if ( err != CL_SUCCESS )
    {
        printf( "Error: Failed to execute kernel!\n" );
        goto cleanup;
    }
    cl_enqueue = elapsedTime( beg, end );

    // Read back the results that were computed on the device
    gettimeofday( &beg, NULL );
    err = clEnqueueReadBuffer( queue, Ab_mem, CL_TRUE, 0, bufferSizeAb, Ab, 0, NULL, NULL );
    if ( err != CL_SUCCESS )
    {
        printf( "Error: Failed to read back device buffer! (err = %d)\n", err );
        goto cleanup;
    }
    clFinish( queue );
    gettimeofday( &end, NULL );
    cl_read = elapsedTime( beg, end );

    // The timings are measured but not reported yet.
    ( void ) cl_alloc;
    ( void ) cl_enqueue;
    ( void ) cl_read;

    status = CL_SUCCESS;

cleanup:
    // Release memory objects on every path so the device buffer never leaks
    clReleaseMemObject( Ab_mem );
    return status;
}

/*
 * Entry point under the name that the file's prototype and main() use;
 * forwards to fwdElimAccExec so the program links.
 */
int execAcc( size_t n, float* Ab, size_t subIndex, float pivot )
{
    return fwdElimAccExec( n, Ab, subIndex, pivot );
}
/*
 * Tears down the file-scope OpenCL state by releasing, in this order, the
 * kernel, the program, the command queue and the context.
 */
void relAcc()
{
    clReleaseKernel( kernel );        // kernel object
    clReleaseProgram( program );      // program object
    clReleaseCommandQueue( queue );   // command queue
    clReleaseContext( context );      // context
}
[/codebox]
The kernel was posted previously.
The output I get from that code is:
[i]Vendor: NVIDIA Corporation
Device Name: GeForce GTS 250
Profile: FULL_PROFILE
Supported Extensions: cl_khr_byte_addressable_store cl_nv_compiler_options cl_nv_device_attribute_query cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics
Local Mem Type (Local=1, Global=2): 1
Global Mem Size (MB): 511
Global Mem Cache Size (Bytes): 0
Max Mem Alloc Size (MB): 128
Clock Frequency (MHz): 1836
Vector type width for: char = 1
Vector type width for: short = 1
Vector type width for: int = 1
Vector type width for: long = 1
Vector type width for: float = 1
Vector type width for: double = 0
Max Work Group Size: 512
Max Work Item Dims: 140226387247107
Max Compute Units: 16
err = -30
Error: Failed to execute kernel![/i]
Thanks again!