global void absMaxval(float *dst, float *src, int size)
{
shared float cache[blocksizeMax];
int g_id = (blockDim.x * blockIdx.x) + threadIdx.x;
int l_id = threadIdx.x;
cache[l_id] = fabs(src[g_id]);
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
__syncthreads();
if (l_id < s && l_id < size)
cache[l_id] = max(cache[l_id], cache[l_id + s]);
__syncthreads();
}
if (l_id == 0)
dst[blockIdx.x] = cache[0];
}
Above is my code. I feel it is not faster than that copying the sequence from GPU to CPU and obtain the maximum on CPU when the length of sequence is not small. But i do not know why ,please help