Counting algorithm ?

Hi,

I have the following situations :

a) an array of ‘int’ where each int contains a ‘flag’ (a value defined by a bit)

b) the set is sorted by ‘flag’

I use the following kernel to count the number of ‘type of flag’. To do this I simply detect where the flag differ and put this value back to a structure.

Sometimes it works, sometimes not !

typedef struct

{

   uint lastId;

   uint tasksCount_CreateCameraRays;

   uint tasksCount_Trace;

   uint tasksCount_Shade;

   uint tasksCount_Shade_AO;

   uint tasksCount_ImageReconstruction;

} clTasksTypes;

// ALSO DEFINED IN 'Types.cl'

#define TASK_HASH_CREATE_CAMERA_RAY      ((uint)1 << 24)

#define TASK_HASH_TRACE               ((uint)2 << 24)

#define TASK_HASH_SHADE             ((uint)4 << 24)

#define TASK_HASH_SHADE_AO            ((uint)8 << 24)

#define TASK_HASH_RECONSTRUCT_IMAGE      ((uint)16 << 24)

//#define IS_FLAG_SET(VAL,FLAG) (VAL&FLAG)

//#define IS_FLAG_SET(VAL,FLAG) ((VAL&FLAG)==FLAG)

#define IS_FLAG_SET(VAL,FLAG) ((VAL&FLAG)!=0)

__kernel

void kernel__countTasksTypes(

   __global uint* indices,

   __global clTasksTypes* tasksTypes,

   uint tasksCount

)

{

   size_t gid = get_global_id(0);

if (gid < 1)

   {

      tasksTypes->tasksCount_CreateCameraRays = 0;

      tasksTypes->tasksCount_Trace = 0;

      tasksTypes->tasksCount_Shade = 0;

      tasksTypes->tasksCount_Shade_AO = 0;

      tasksTypes->tasksCount_ImageReconstruction = 0;

      barrier(CLK_GLOBAL_MEM_FENCE);

      return;

   }

barrier(CLK_GLOBAL_MEM_FENCE);

if (gid >= tasksCount)

      return;

// Save the last hash/key value

   if (gid == tasksCount-1)

      tasksTypes->lastId = indices[(tasksCount-1) * 2];

const uint hashKey1 = indices[(gid * 2) - 2];

   const uint hashKey2 = indices[gid * 2];

   if (hashKey1 != hashKey2)

   {

      if (IS_FLAG_SET(hashKey1, TASK_HASH_CREATE_CAMERA_RAY))

         tasksTypes->tasksCount_CreateCameraRays = gid;

      else if (IS_FLAG_SET(hashKey1, TASK_HASH_TRACE))

         tasksTypes->tasksCount_Trace = gid;

      else if (IS_FLAG_SET(hashKey1, TASK_HASH_SHADE))

         tasksTypes->tasksCount_Shade = gid;

      else if (IS_FLAG_SET(hashKey1, TASK_HASH_SHADE_AO))

         tasksTypes->tasksCount_Shade_AO = gid;

      else if (IS_FLAG_SET(hashKey1, TASK_HASH_RECONSTRUCT_IMAGE))

         tasksTypes->tasksCount_ImageReconstruction = gid;

   }

}

There is only one special case, handled on the host side, it is for the last type of flag.

const unsigned int tasksCount = _tasksBatchSize;

   _kernel__CountTasksTypes->setArg(0, *_clBuffer_TasksIndices);

   _kernel__CountTasksTypes->setArg(1, *_clBuffer_CountTasksTypes);

   _kernel__CountTasksTypes->setArg(2, tasksCount);

_queue->enqueueNDRangeKernel(*_kernel__CountTasksTypes, cl::NullRange, cl::NDRange(globalWork), cl::NDRange(_workGroupSize_kernel__CountTasksTypes), 0, 0);

_queue->enqueueReadBuffer(*_clBuffer_CountTasksTypes, CL_TRUE, 0, sizeof(clTasksTypes), &_cpuBuffer_TasksTypes);

// Compute the last count

   unsigned int totalCount = _cpuBuffer_TasksTypes.tasksCount_CreateCameraRays+_cpuBuffer_TasksTypes.tasksCount_Trace+_cpuBuffer_TasksTypes.tasksCount_Shade+_cpuBuffer_TasksTypes.tasksCount_Shade_AO+_cpuBuffer_TasksTypes.tasksCount_ImageReconstruction;   

   if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_CREATE_CAMERA_RAY)

      _cpuBuffer_TasksTypes.tasksCount_CreateCameraRays = _tasksBatchSize - totalCount;

   else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_TRACE)

      _cpuBuffer_TasksTypes.tasksCount_Trace = _tasksBatchSize - totalCount;

   else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_SHADE)

      _cpuBuffer_TasksTypes.tasksCount_Shade = _tasksBatchSize - totalCount;

   else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_SHADE_AO)

      _cpuBuffer_TasksTypes.tasksCount_Shade_AO = _tasksBatchSize - totalCount;

   else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_RECONSTRUCT_IMAGE)

      _cpuBuffer_TasksTypes.tasksCount_ImageReconstruction = _tasksBatchSize - totalCount;

I can’t find why it does not work, it should be simple no ?

Any suggestion is welcomed, thanks

BTW, yes it is OpenCL code but the algorithm is the same as in CUDA, it is just a detail ! I post here because what is important is the “ALGORITHM”, not the code !

Thanks

It looks to me like you have every thread writing to tasksTypes in parallel. Thus only one thread will actually write their data and all other updates will likely be lost. The behavior of this program is undefined. Also I’m not seeing where you are counting anything. It appears that you are just assigning gid to tasktypes randomly if the flag is set. (The random part is due to the race condition).