Hi,
I have the following situation:
a) an array of ‘int’ where each int contains a ‘flag’ (a value defined by a bit)
b) the set is sorted by ‘flag’
I use the following kernel to count how many elements there are of each 'type of flag'. To do this I simply detect where the flag differs between consecutive elements and write that index back to a structure.
Sometimes it works, sometimes it doesn't!
// Output structure filled by kernel__countTasksTypes and read back on the
// host. The layout MUST match the host-side struct byte for byte (same field
// order, same 32-bit uint fields) or the enqueueReadBuffer copy will be
// misinterpreted.
typedef struct
{
// Hash/flag value of the LAST element of the sorted set; lets the host
// attribute the trailing run (which has no "next" boundary) to a type.
uint lastId;
// Each field receives the index of the first element AFTER the run of its
// task type, i.e. a prefix boundary, not necessarily the run length alone.
uint tasksCount_CreateCameraRays;
uint tasksCount_Trace;
uint tasksCount_Shade;
uint tasksCount_Shade_AO;
uint tasksCount_ImageReconstruction;
} clTasksTypes;
// ALSO DEFINED IN 'Types.cl'
// Task-type flags occupy the top byte (bits 24..28) of the hash key, one
// distinct bit per type, so a key carries at most one of these flags.
#define TASK_HASH_CREATE_CAMERA_RAY ((uint)1 << 24)
#define TASK_HASH_TRACE ((uint)2 << 24)
#define TASK_HASH_SHADE ((uint)4 << 24)
#define TASK_HASH_SHADE_AO ((uint)8 << 24)
#define TASK_HASH_RECONSTRUCT_IMAGE ((uint)16 << 24)
// Earlier experiments with the flag test (kept for reference):
//#define IS_FLAG_SET(VAL,FLAG) (VAL&FLAG)
//#define IS_FLAG_SET(VAL,FLAG) ((VAL&FLAG)==FLAG)
// Current form: true when ANY bit of FLAG is set in VAL. Since each flag is a
// single bit, the three variants are equivalent here.
#define IS_FLAG_SET(VAL,FLAG) ((VAL&FLAG)!=0)
// Detects the boundaries between runs of identical task-type flags in a
// sorted index array and records, for each task type, the index of the first
// element after its run (an exclusive prefix boundary) in `tasksTypes`.
//
// indices    : packed as pairs; the hash/flag value of element i is read
//              from indices[i * 2] (the odd slots are not read here).
// tasksTypes : output structure. It MUST be zero-filled by the HOST (e.g.
//              clEnqueueFillBuffer or an enqueueWriteBuffer of zeros) BEFORE
//              this kernel is enqueued -- see the bug note below.
// tasksCount : number of elements in the sorted set.
//
// BUG FIX: the original version zeroed `tasksTypes` from work-item 0 and
// used barrier(CLK_GLOBAL_MEM_FENCE) to "wait" for that write. barrier()
// only synchronizes work-items of the SAME work-group; OpenCL provides no
// way to synchronize different work-groups within one NDRange. Work-groups
// scheduled before work-group 0 could have their boundary writes wiped out
// by the zeroing -- hence "sometimes it works, sometimes not". In addition,
// work-item 0 returned through a different barrier() call than its group
// mates, which is undefined behavior (all work-items of a group must reach
// the SAME barrier). The initialization therefore has to happen on the host
// (or in a separate kernel launch) before this kernel runs, and both
// barriers are removed.
__kernel
void kernel__countTasksTypes(
	__global uint* indices,
	__global clTasksTypes* tasksTypes,
	uint tasksCount
	)
{
	const size_t gid = get_global_id(0);

	// gid 0 has no predecessor to compare with (the old code read
	// indices[-2] territory only because it returned early here too);
	// out-of-range work-items (padding to the work-group size) do nothing.
	if (gid == 0 || gid >= tasksCount)
		return;

	// Save the hash/key of the very last element so the host can attribute
	// the trailing run to the right type.
	if (gid == tasksCount - 1)
		tasksTypes->lastId = indices[(tasksCount - 1) * 2];

	const uint prevKey = indices[(gid - 1) * 2];
	const uint currKey = indices[gid * 2];

	// A boundary: element gid-1 closes the run of prevKey's type. Because the
	// set is sorted by flag, each boundary is written by exactly one
	// work-item, so no atomics are needed. The stored value is the prefix
	// boundary (number of ALL elements before gid), matching the original
	// semantics.
	if (prevKey != currKey)
	{
		if (IS_FLAG_SET(prevKey, TASK_HASH_CREATE_CAMERA_RAY))
			tasksTypes->tasksCount_CreateCameraRays = gid;
		else if (IS_FLAG_SET(prevKey, TASK_HASH_TRACE))
			tasksTypes->tasksCount_Trace = gid;
		else if (IS_FLAG_SET(prevKey, TASK_HASH_SHADE))
			tasksTypes->tasksCount_Shade = gid;
		else if (IS_FLAG_SET(prevKey, TASK_HASH_SHADE_AO))
			tasksTypes->tasksCount_Shade_AO = gid;
		else if (IS_FLAG_SET(prevKey, TASK_HASH_RECONSTRUCT_IMAGE))
			tasksTypes->tasksCount_ImageReconstruction = gid;
	}
}
There is only one special case, handled on the host side: the last type of flag (whose run has no closing boundary inside the array).
// Host side: launch the boundary-detection kernel, then read the result back.
// NOTE(review): nothing here clears _clBuffer_CountTasksTypes before the
// launch; the kernel relies on work-item 0 + barrier() for that, but
// barrier() does not synchronize across work-groups -- clear the buffer here
// (e.g. enqueueFillBuffer / enqueueWriteBuffer of zeros) before enqueueing.
const unsigned int tasksCount = _tasksBatchSize;
_kernel__CountTasksTypes->setArg(0, *_clBuffer_TasksIndices);
_kernel__CountTasksTypes->setArg(1, *_clBuffer_CountTasksTypes);
_kernel__CountTasksTypes->setArg(2, tasksCount);
_queue->enqueueNDRangeKernel(*_kernel__CountTasksTypes, cl::NullRange, cl::NDRange(globalWork), cl::NDRange(_workGroupSize_kernel__CountTasksTypes), 0, 0);
// Blocking read (CL_TRUE): guarantees the kernel has finished and the
// struct is in host memory before the code below uses it.
_queue->enqueueReadBuffer(*_clBuffer_CountTasksTypes, CL_TRUE, 0, sizeof(clTasksTypes), &_cpuBuffer_TasksTypes);
// Compute the last count
// NOTE(review): the kernel stores the boundary index `gid` into each field,
// i.e. a CUMULATIVE prefix (elements of all earlier types included), not the
// size of that type's run alone. Summing the fields as independent counts,
// as done here, looks wrong as soon as three or more types are present --
// verify whether the fields are meant to be converted to per-type counts by
// taking differences of consecutive boundaries first.
unsigned int totalCount = _cpuBuffer_TasksTypes.tasksCount_CreateCameraRays+_cpuBuffer_TasksTypes.tasksCount_Trace+_cpuBuffer_TasksTypes.tasksCount_Shade+_cpuBuffer_TasksTypes.tasksCount_Shade_AO+_cpuBuffer_TasksTypes.tasksCount_ImageReconstruction;
// The last run has no closing boundary in the array, so its count is derived
// from the total size; lastId tells us which type that trailing run is.
if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_CREATE_CAMERA_RAY)
_cpuBuffer_TasksTypes.tasksCount_CreateCameraRays = _tasksBatchSize - totalCount;
else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_TRACE)
_cpuBuffer_TasksTypes.tasksCount_Trace = _tasksBatchSize - totalCount;
else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_SHADE)
_cpuBuffer_TasksTypes.tasksCount_Shade = _tasksBatchSize - totalCount;
else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_SHADE_AO)
_cpuBuffer_TasksTypes.tasksCount_Shade_AO = _tasksBatchSize - totalCount;
else if (_cpuBuffer_TasksTypes.lastId & TASK_HASH_RECONSTRUCT_IMAGE)
_cpuBuffer_TasksTypes.tasksCount_ImageReconstruction = _tasksBatchSize - totalCount;
I can't work out why it does not work — it should be simple, no?
Any suggestion is welcomed, thanks
BTW, yes it is OpenCL code but the algorithm is the same as in CUDA, it is just a detail ! I post here because what is important is the “ALGORITHM”, not the code !
Thanks