find maximum value in an array along with index

I am trying to find the maximum value in an array and have the following two versions:

//int blocksize = 16; //multiple of 32

//int nblocks = ((npix*npix)+blocksize-1)/blocksize; //round to max npix = 7

// printf("nblocks  = %d\n", nblocks);

//dim3 dimBlock(blocksize,blocksize);

//dim3 dimGrid(nblocks,nblocks);

// find max value from the array

// find_corrmax <<< nblocks,blocksize >>> (max_val, x_shift, y_shift, xcout, npix, npix);

__global__ void find_corrmax(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)

{

	int k = blockIdx.x * blockDim.x + threadIdx.x;

	if(k < 1)

	{

		for(int j = 0; j < Ny; ++j)

		{

			for(int i = 0; i < Nx; ++i)

			{

				if(in1[(k*Nx*Ny)+i*Nx+j] > max_val[k])

				{

					max_val[k] = in1[(k*Nx*Ny)+i*Nx+j];

					x_pos[k] = j; 

					y_pos[k] = i; 

				}

			}

		}

	}

}

// find_corrmax1 <<< dimGrid,dimBlock >>> (pmax1, x_shift1, y_shift1, out1_d, pix1, pix2, fftindx_d, fftindy_d);

__global__ void find_corrmax1(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)

{

	int i = blockIdx.x * blockDim.x + threadIdx.x;

	int j = blockIdx.y * blockDim.y + threadIdx.y;

	int k;

	for(k = 0; k < 16; ++k)//BLOCK_SIZE=16

	{

		if(in1[k*Nx+i] > max_val[0])

		{

			max_val[0] = in1[k*Nx+i];

			x_pos[0] = j;

			y_pos[0] = i;

		}

	}

}

The first kernel, find_corrmax, takes 139 usec and the second, find_corrmax1, takes 59 usec. However, the second kernel calculates the maximum value correctly but gives the wrong indices.

Any suggestions on how to improve this kernel?

I have looked at Thrust and CUDPP, but would like to implement it this way as I have a time constraint to meet.

Thanks in advance!

The second example has a memory race - every thread will be trying to write into max_val[0], x_pos[0], and y_pos[0]. Use a parallel reduction for this instead.

Parallel reduction works only for powers of 2.

No it doesn’t. You can use a parallel reduction for any size array you want. Something like this (for the L infinity norm of a vector):

template <typename ValueType>

__global__ void

linfty_norm(unsigned int num_rows, const ValueType * x0, ValueType * norms)

{

	extern __shared__ ValueType buff[];

	volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;

	volatile int stride = gridDim.x * blockDim.x;

	// Loop through vector to find local maximum

	ValueType lumax = 0;

	for( unsigned int i = tidx; i < num_rows; i+=stride ) {

		lumax = fmax( lumax, fabs(x0[i]) );

	}

	// Load into shared memory

	buff[threadIdx.x] = lumax; __syncthreads();

	// First warp performs shared memory reduction

	if (threadIdx.x < warpSize) {

		#pragma unroll 

		for(int i=warpSize; i<blockDim.x; i+=warpSize) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+i]);

		}

		if (threadIdx.x < 16) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+16]);

		}

		if (threadIdx.x < 8) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+8]);

		}

		if (threadIdx.x < 4) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+4]);

		}

		if (threadIdx.x < 2) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+2]);

		}

		// Finalise and write out the results to global memory

		if (threadIdx.x == 0)  { 

			norms[blockIdx.x] = fmax(buff[0], buff[1]);

		}

	}

}

Launch it once to reduce any size array down to an array with the same number of elements as blocks in the grid. Launch it again with one block to get the final value, or copy the partial result back to the host and finish the reduction in host memory. I find zero copy memory works well for the second step if you choose to do it on the host.
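For concreteness, here is a minimal sketch of that two-pass launch (the sizes and variable names are my assumptions, not code from the post above). Note the dynamic shared memory size passed as the third launch parameter, one element per thread:

// hypothetical driver: d_x holds n doubles already on the device
const int threads = 128;
const int blocks  = 64;
const size_t shmem = threads * sizeof(double); // one element per thread

double *d_partial, *d_result;
cudaMalloc((void**)&d_partial, blocks * sizeof(double));
cudaMalloc((void**)&d_result, sizeof(double));

// pass 1: each block writes its partial maximum to d_partial[blockIdx.x]
linfty_norm<double><<<blocks, threads, shmem>>>(n, d_x, d_partial);

// pass 2: a single block reduces the block partials to d_result[0]
linfty_norm<double><<<1, threads, shmem>>>(blocks, d_partial, d_result);

double h_norm;
cudaMemcpy(&h_norm, d_result, sizeof(double), cudaMemcpyDeviceToHost);

(One caveat: an extern __shared__ array of a template-dependent type only links cleanly for a single instantiation; if you instantiate the kernel for several types, the usual workaround is a SharedMemory<T> wrapper struct.)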

I ported some fast reduction code that I posted before so that it can be used to find the maximum and minimum instead. The following code achieved 83% of peak bandwidth on my card (reportedly above 90% on other cards) and finds both the max and min of a linear array. You just need to add some code to record the index; see the sketch below.

my_max_min.cu (8.63 KB)
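The attached file isn't reproduced above, but the idea of "adding some code to record the index" can be sketched as follows (a hypothetical kernel, not the attachment): carry a (value, index) pair through the reduction instead of just the value.

#include <cfloat> // for DBL_MAX

__global__ void max_with_index(int n, const double *in, double *out_val, int *out_idx)
{
	// dynamic shared memory: blockDim.x doubles followed by blockDim.x ints
	extern __shared__ char smem[];
	double *sval = (double *)smem;
	int    *sidx = (int *)(sval + blockDim.x);

	int tid    = threadIdx.x + blockIdx.x * blockDim.x;
	int stride = gridDim.x * blockDim.x;

	double v = -DBL_MAX; // running maximum for this thread
	int    p = 0;        // index of that maximum
	for (int i = tid; i < n; i += stride) {
		if (in[i] > v) { v = in[i]; p = i; }
	}
	sval[threadIdx.x] = v;
	sidx[threadIdx.x] = p;
	__syncthreads();

	// simple tree reduction; assumes blockDim.x is a power of two
	for (int s = blockDim.x / 2; s > 0; s >>= 1) {
		if (threadIdx.x < s && sval[threadIdx.x + s] > sval[threadIdx.x]) {
			sval[threadIdx.x] = sval[threadIdx.x + s];
			sidx[threadIdx.x] = sidx[threadIdx.x + s];
		}
		__syncthreads();
	}

	if (threadIdx.x == 0) {
		out_val[blockIdx.x] = sval[0];
		out_idx[blockIdx.x] = sidx[0];
	}
}

Launch it with blockDim.x * (sizeof(double) + sizeof(int)) bytes of dynamic shared memory. A second pass over the per-block values then picks the winning block; the original index is that block's entry in out_idx.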

@avidday, here’s how I used your kernel:

int blocksize = 16; //multiple of 32

int nblocks = ((pix3)+blocksize-1)/blocksize; //round to max

// printf("\nnblocks = %d\n", nblocks);

dim3 dimBlock(blocksize,blocksize);

dim3 dimGrid(nblocks,nblocks);

cufftReal *norms1, *norms2;

cudaMalloc((void**) &norms1, sizeof(cufftReal) * blocksize);

cudaMalloc((void**) &norms2, sizeof(cufftReal) * blocksize);

	

find_max_reduced <<< dimGrid, dimBlock >>> (pix1, out1_d, norms1); 

find_max_reduced <<< 1,blocksize >>> (pix1, norms1, norms2);

__global__ void find_max_reduced(int Nx, cufftReal *in1, cufftReal *norms)

{

	extern __shared__ cufftReal buff[];

	volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;

	volatile int stride = gridDim.x * blockDim.x;

	// Loop through vector to find local maximum

	cufftReal lumax = 0; 

	for( unsigned int i = tidx; i < Nx; i+=stride ) 

	{

		lumax = fmax( lumax, fabs(in1[i]) );

	}

	// Load into shared memory

	buff[threadIdx.x] = lumax; __syncthreads();

	// First warp performs shared memory reduction

	if (threadIdx.x < warpSize) {

		#pragma unroll 

		for(int i=warpSize; i<blockDim.x; i+=warpSize) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+i]);

		}

		if (threadIdx.x < 16) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+16]);

		}

		if (threadIdx.x < 8) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+8]);

		}

		if (threadIdx.x < 4) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+4]);

		}

		if (threadIdx.x < 2) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+2]);

		}

		// Finalise and write out the results to global memory

		if (threadIdx.x == 0)  { 

			norms[blockIdx.x] = fmax(buff[0], buff[1]);

		}

	}

}

I get a segmentation fault error. If I understand your kernel correctly, the maximum value is always in norms[0]. How can I find the index?

Wouldn’t using Thrust be the fastest way of doing this from a development perspective? This is very easy to do with Thrust: thrust::max_element returns an iterator to the max/min element, so if you need the index it is easy to determine, since the iterators are effectively pointers.
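For example, a minimal self-contained version (names hypothetical) might look like this; subtracting begin() from the returned iterator gives the linear index directly:

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <cstdio>

int main()
{
	// toy data; in this thread it would be the correlation surface on the GPU
	double h[6] = {0.1, 3.5, 2.2, 7.9, 1.0, 4.4};
	thrust::device_vector<double> d_vec(h, h + 6);

	thrust::device_vector<double>::iterator it =
		thrust::max_element(d_vec.begin(), d_vec.end());

	int position = it - d_vec.begin(); // linear index of the maximum
	double value = *it;                // dereferencing copies the element to the host

	printf("max = %f at index %d\n", value, position);
	return 0;
}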

@eelsen: I did use Thrust, but thrust::max_element gave me an error. I did read that writing custom kernels can be faster. Eventually, I will compare the execution time of both approaches and include the faster one in the implementation.

//find max value and its index

int position = thrust::max_element(vec_sumh.begin(), vec_sumh.end()) - vec_sumh.begin(); 

double value = vec_sumh[position];

int shifted_y = position/7;

int shifted_x = position - (shifted_y*7);

The kernel requires runtime-declared shared memory, which you have not allocated in the launch. It should be 1 word per thread. Also, use many threads per block (for example 128), and launch something like 4 blocks per MP of your GPU. If you need the position of the maximum, then expand the code to save the position whenever a new maximum value is found. Note also that the code I posted finds the absolute maximum, not the maximum. I posted it as an example of how a reduction could be done for an arbitrary array, not as a solution to your problem, with the expectation that you would modify it for your needs. If you can’t do that, then take eelsen’s very sound advice and stick with Thrust.
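Concretely, a corrected launch for the code above might look like this (a sketch; the thread and block counts are just the examples suggested here, and the device query is one way to get the multiprocessor count). The kernel only uses threadIdx.x, so the launch should be 1D:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

int threads  = 128;                           // many threads per block
int blocks   = 4 * prop.multiProcessorCount;  // ~4 blocks per MP
size_t shmem = threads * sizeof(cufftReal);   // one word per thread

cufftReal *norms1, *norms2;
cudaMalloc((void**)&norms1, blocks * sizeof(cufftReal)); // one partial per block
cudaMalloc((void**)&norms2, sizeof(cufftReal));

// third launch parameter = dynamic shared memory size in bytes
find_max_reduced<<<blocks, threads, shmem>>>(pix1, out1_d, norms1);
find_max_reduced<<<1, threads, shmem>>>(blocks, norms1, norms2);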

max_element returns an iterator (which behaves like a pointer).

if vec_sumh is on the host you could simply do:

double value = *(thrust::max_element(vec_sumh.begin(), vec_sumh.end()));

and if you want the integral offset into the array, the iterator difference is already the element index:

size_t offset = thrust::max_element(vec_sumh.begin(), vec_sumh.end()) - vec_sumh.begin();

According to this post by Nathan Bell, vivekv80’s code snippet should be working fine.