i am trying to find the maximum value in an array and have the following two versions:

```
//int blocksize = 16; //multiple of 32
//int nblocks = ((npix*npix)+blocksize-1)/blocksize; //round to max npix = 7
// printf("nblocks = %d\n", nblocks);
//dim3 dimBlock(blocksize,blocksize);
//dim3 dimGrid(nblocks,nblocks);
// find max value from the array
// find_corrmax <<< nblocks,blocksize >>> (max_val, x_shift, y_shift, xcout, npix, npix);
__global__ void find_corrmax(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)
{
int k = blockIdx.x * blockDim.x + threadIdx.x;
if(k < 1)
{
for(int j = 0; j < Ny; ++j)
{
for(int i = 0; i < Nx; ++i)
{
if(in1[(k*Nx*Ny)+i*Nx+j] > max_val[k])
{
max_val[k] = in1[(k*Nx*Ny)+i*Nx+j];
x_pos[k] = j;
y_pos[k] = i;
}
}
}
}
}
// find_corrmax1 <<< dimGrid,dimBlock >>> (pmax1, x_shift1, y_shift1, out1_d, pix1, pix2, fftindx_d, fftindy_d);
__global__ void find_corrmax1(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
for(k = 0; k < 16; ++k)//BLOCK_SIZE=16
{
if(in1[k*Nx+i] > max_val[0])
{
max_val[0] = in1[k*Nx+i];
x_pos[0] = j;
y_pos[0] = i;
}
}
}
```

the first kernel find_corrmax takes 139 usec and the second kernel find_corrmax1 takes 59 usec. However, the second kernel calculates the maximum value correctly but gives the wrong indices.

any suggestions on how to improve this kernel??

I have looked at Thrust and CUDPP, but would like to implement it this way as I have a time constraint to meet.

Thanks in advance !!!