I was trying to wirte a Parallel Reduction for a 2D-Matrix based on cuda reduction sdk sample (Description can be found at http://www.nvidia.com/object/cuda_sample_data-parallel.html).
Parallel Reduction on a 1D Array works as expected, but 2D works only partially.
Here is the kernel:
#define A_loc(x,y) A_loc[y * block_size + x]
__kernel void
ParallelReduction2D(__global float * res, //result
__global float * A, //input Matrix
__local float * A_loc) //shared memory - for input array
{
int group_id_x = get_group_id(0); //group id - x - COL
int group_id_y = get_group_id(1); //group id - y - ROW
int tx = get_local_id(0); //local id - x
int ty = get_local_id(1); //local id - y
int global_id_x = get_global_id(0); //global id - x
int global_id_y = get_global_id(1); //global id - y
int global_size_x = get_global_size(0); //global size - x
int block_size = get_local_size(0); //work group size
int num_groups_x = get_num_groups(0); //nr of work groups - x
//int block_size = get_local_size(0); //work group size
//copy from global to shared memory
A_loc(tx,ty) = A[global_id_y * global_size_x + global_id_x];
//sync
barrier(CLK_LOCAL_MEM_FENCE);
//parallel reduction -> sum up values
if (block_size >= 512) { if(tx < 256){ A_loc(tx,ty) += A_loc(tx + 256,ty); barrier(CLK_LOCAL_MEM_FENCE);} }
if (block_size >= 256) { if(tx < 128){ A_loc(tx,ty) += A_loc(tx + 128,ty); barrier(CLK_LOCAL_MEM_FENCE);} }
if (block_size >= 128) { if(tx < 64){ A_loc(tx,ty) += A_loc(tx + 64,ty); barrier(CLK_LOCAL_MEM_FENCE);} }
if (tx <= 32)
{
if (block_size >= 64) { A_loc(tx,ty) += A_loc(tx + 32,ty); barrier(CLK_LOCAL_MEM_FENCE); }
if (block_size >= 32) { A_loc(tx,ty) += A_loc(tx + 16,ty); barrier(CLK_LOCAL_MEM_FENCE); }
if (block_size >= 16) { A_loc(tx,ty) += A_loc(tx + 8,ty); barrier(CLK_LOCAL_MEM_FENCE); }
if (block_size >= 8) { A_loc(tx,ty) += A_loc(tx + 4,ty); barrier(CLK_LOCAL_MEM_FENCE); }
if (block_size >= 4) { A_loc(tx,ty) += A_loc(tx + 2,ty); barrier(CLK_LOCAL_MEM_FENCE); }
if (block_size >= 2) { A_loc(tx,ty) += A_loc(tx + 1,ty); barrier(CLK_LOCAL_MEM_FENCE); }
}
//writing result to global memory in order to do final sum up
if(tx == 0)
res[global_id_y * global_size_x + global_id_x] = A_loc(tx,ty);
In order to keep it simple for now I use the same size for res and A.
So if block_size is 16 and A is filled with 1, I’d expect some result matrix like:
16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
…
but what I get instead is:
16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
…
16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
So the first row of each block is calculated correctly, but the rest is wrong.
What am i doing wrong in this case?