I modified Avi’s code to choose the largest element of an array:
// Use the first warp of the block to reduce the per-thread partial maxima
// stored in shared_buff down to a single block-wide maximum, which ends up
// in shared_buff[0].
//
// NOTE(review): assumes a __syncthreads() was executed before this fragment so
// that every thread's write to shared_buff is visible here — confirm in the
// surrounding kernel.
// NOTE(review): assumes a 1D block whose blockDim.x is a multiple of 32 (and
// at least 32); otherwise the reads below touch slots no thread populated and
// the full-mask __syncwarp() would name lanes that do not exist — confirm.
// __syncwarp() requires CUDA 9 or newer.
if (threadIdx.x < 32) {
    // Step 1: each lane folds the slots of all the other warps that map onto
    // its own lane index (stride 32) into its own slot.
#pragma unroll
    for (int i = 32; i < blockDim.x; i += 32) {
        if (shared_buff[threadIdx.x] < shared_buff[threadIdx.x + i]) {
            shared_buff[threadIdx.x] = shared_buff[threadIdx.x + i];
        }
    }
    // The tree step below reads slots written by other lanes in the loop
    // above; warps are not guaranteed to run in lockstep, so an explicit
    // warp barrier (which also orders the shared-memory accesses) is needed.
    __syncwarp();

    // Step 2: tree reduction over the remaining 32 slots (16, 8, 4, 2, 1).
    // Within one step the lanes that write (lane < offset) and the slots
    // they read (lane + offset) never overlap, so one barrier per step is
    // sufficient. The barrier sits outside the lane test so that all 32
    // lanes of the warp reach it.
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        if ((threadIdx.x < offset) &&
            (shared_buff[threadIdx.x] < shared_buff[threadIdx.x + offset])) {
            shared_buff[threadIdx.x] = shared_buff[threadIdx.x + offset];
        }
        __syncwarp();
    }
}
Given all the comparisons and if conditions, is this code efficient for CUDA?