How can I obtain the maximum value of a sequence using a parallel reduction algorithm?

// Computes, for each thread block, the maximum absolute value of the slice of
// `src` that the block covers, and writes it to dst[blockIdx.x].
//
// Parameters:
//   dst  - output array with one float per block (gridDim.x entries are written).
//   src  - input array of `size` floats.
//   size - number of valid elements in `src`.
//
// Launch requirements:
//   - 1D grid and 1D blocks; thread i of the grid handles src[i].
//   - blockDim.x must not exceed blocksizeMax, because the shared cache holds
//     blocksizeMax floats and is indexed by threadIdx.x.
//   - Any blockDim.x is accepted (it does not have to be a power of two).
//
// The kernel only produces per-block partial results; the caller still has to
// reduce the gridDim.x values in `dst` to obtain the overall maximum.
__global__ void absMaxval(float *dst, float *src, int size)
{
    __shared__ float cache[blocksizeMax];

    int g_id = (blockDim.x * blockIdx.x) + threadIdx.x;
    unsigned int l_id = threadIdx.x;

    // Threads past the end of the input must not read src. They contribute
    // 0.0f, which is the identity for a maximum over absolute values.
    cache[l_id] = (g_id < size) ? fabsf(src[g_id]) : 0.0f;
    __syncthreads();

    // Tree reduction in shared memory. `active` is the number of live entries
    // at the front of the cache; each step folds the upper part onto the lower
    // part. Rounding the half up keeps the middle element alive when `active`
    // is odd, so no value is dropped for non-power-of-two block sizes.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        unsigned int half = (active + 1) / 2;

        // Only active / 2 threads have a partner at l_id + half. Reads come
        // from indices >= half and writes go to indices < half, so there is
        // no race within a step.
        if (l_id < active / 2)
            cache[l_id] = fmaxf(cache[l_id], cache[l_id + half]);

        // One barrier per step, reached by every thread of the block, so the
        // next step sees all the writes from this one.
        __syncthreads();

        active = half;
    }

    // Thread 0 publishes the block's result.
    if (l_id == 0)
        dst[blockIdx.x] = cache[0];
}

Above is my code. When the sequence is not short, it does not seem to be any faster than copying the sequence from the GPU to the CPU and finding the maximum on the CPU. I do not know why; please help.

Search for Thrust's `thrust::max_element`, which is declared in the `thrust/extrema.h` header.
It is ready to use.