find maximum value in an array along with index

I am trying to find the maximum value in an array and have the following two versions:

//int blocksize = 16; //multiple of 32

//int nblocks = ((npix*npix)+blocksize-1)/blocksize; //round to max npix = 7

// printf("nblocks  = %d\n", nblocks);

//dim3 dimBlock(blocksize,blocksize);

//dim3 dimGrid(nblocks,nblocks);

// find max value from the array

// find_corrmax <<< nblocks,blocksize >>> (max_val, x_shift, y_shift, xcout, npix, npix);

__global__ void find_corrmax(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)

{

	int k = blockIdx.x * blockDim.x + threadIdx.x;

	if(k < 1)

	{

		for(int j = 0; j < Ny; ++j)

		{

			for(int i = 0; i < Nx; ++i)

			{

				if(in1[(k*Nx*Ny)+i*Nx+j] > max_val[k])

				{

					max_val[k] = in1[(k*Nx*Ny)+i*Nx+j];

					x_pos[k] = j; 

					y_pos[k] = i; 

				}

			}

		}

	}

}

// find_corrmax1 <<< dimGrid,dimBlock >>> (pmax1, x_shift1, y_shift1, out1_d, pix1, pix2, fftindx_d, fftindy_d);

__global__ void find_corrmax1(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)

{

	int i = blockIdx.x * blockDim.x + threadIdx.x;

	int j = blockIdx.y * blockDim.y + threadIdx.y;

	int k;

	for(k = 0; k < 16; ++k)//BLOCK_SIZE=16

	{

		if(in1[k*Nx+i] > max_val[0])

		{

			max_val[0] = in1[k*Nx+i];

			x_pos[0] = j;

			y_pos[0] = i;

		}

	}

}

The first kernel, find_corrmax, takes 139 usec and the second, find_corrmax1, takes 59 usec. However, the second kernel calculates the maximum value correctly but gives the wrong indices.

Any suggestions on how to improve this kernel?

I have looked at Thrust and CUDPP, but would like to implement it this way as I have a time constraint to meet.

Thanks in advance!

The second example has a memory race - every thread will be trying to write into max_val[0], x_pos[0], and y_pos[0]. Use a parallel reduction for this instead.

Parallel reduction works only for powers of 2.

No it doesn’t. You can use a parallel reduction for any size array you want. Something like this (for the L infinity norm of a vector):

template <typename ValueType>

__global__ void

linfty_norm(unsigned int num_rows, const ValueType * x0, ValueType * norms)

{

	extern __shared__ ValueType buff[];

	volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;

	volatile int stride = gridDim.x * blockDim.x;

	// Loop through vector to find local maximum

	ValueType lumax = 0;

	for( unsigned int i = tidx; i < num_rows; i+=stride ) {

		lumax = fmax( lumax, fabs(x0[i]) );

	}

	// Load into shared memory

	buff[threadIdx.x] = lumax; __syncthreads();

	// First warp performs shared memory reduction

	if (threadIdx.x < warpSize) {

		#pragma unroll 

		for(int i=warpSize; i<blockDim.x; i+=warpSize) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+i]);

		}

		if (threadIdx.x < 16) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+16]);

		}

		if (threadIdx.x < 8) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+8]);

		}

		if (threadIdx.x < 4) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+4]);

		}

		if (threadIdx.x < 2) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+2]);

		}

		// Finalise and write out the results to global memory

		if (threadIdx.x == 0)  { 

			norms[blockIdx.x] = fmax(buff[0], buff[1]);

		}

	}

}

Launch it once to reduce any size array down to an array with the same number of elements as blocks in the grid. Launch it again with one block to get the final value, or copy the partial result back to the host and finish the reduction in host memory. I find zero copy memory works well for the second step if you choose to do it on the host.
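For concreteness, here is a minimal sketch of that two-pass launch (the sizes and variable names are my assumptions, not code from the post above). Note the dynamic shared memory size passed as the third launch parameter, one element per thread:

// hypothetical driver: d_x holds n doubles already on the device
const int threads = 128;
const int blocks  = 64;
const size_t shmem = threads * sizeof(double); // one element per thread

double *d_partial, *d_result;
cudaMalloc((void**)&d_partial, blocks * sizeof(double));
cudaMalloc((void**)&d_result, sizeof(double));

// pass 1: each block writes its partial maximum to d_partial[blockIdx.x]
linfty_norm<double><<<blocks, threads, shmem>>>(n, d_x, d_partial);

// pass 2: a single block reduces the block partials to d_result[0]
linfty_norm<double><<<1, threads, shmem>>>(blocks, d_partial, d_result);

double h_norm;
cudaMemcpy(&h_norm, d_result, sizeof(double), cudaMemcpyDeviceToHost);

(One caveat: an extern __shared__ array of a template-dependent type only links cleanly for a single instantiation; if you instantiate the kernel for several types, the usual workaround is a SharedMemory<T> wrapper struct.)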

I ported some fast reduction code that I posted before so that it can be used to find the maximum and minimum instead. The following code achieved 83% of peak bandwidth on my card (reportedly above 90% on other cards) and finds both the max and min of a linear array. You just need to add some code to record the index; see the sketch below.

my_max_min.cu (8.63 KB)
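The attached file isn't reproduced above, but the idea of "adding some code to record the index" can be sketched as follows (a hypothetical kernel, not the attachment): carry a (value, index) pair through the reduction instead of just the value.

#include <cfloat> // for DBL_MAX

__global__ void max_with_index(int n, const double *in, double *out_val, int *out_idx)
{
	// dynamic shared memory: blockDim.x doubles followed by blockDim.x ints
	extern __shared__ char smem[];
	double *sval = (double *)smem;
	int    *sidx = (int *)(sval + blockDim.x);

	int tid    = threadIdx.x + blockIdx.x * blockDim.x;
	int stride = gridDim.x * blockDim.x;

	double v = -DBL_MAX; // running maximum for this thread
	int    p = 0;        // index of that maximum
	for (int i = tid; i < n; i += stride) {
		if (in[i] > v) { v = in[i]; p = i; }
	}
	sval[threadIdx.x] = v;
	sidx[threadIdx.x] = p;
	__syncthreads();

	// simple tree reduction; assumes blockDim.x is a power of two
	for (int s = blockDim.x / 2; s > 0; s >>= 1) {
		if (threadIdx.x < s && sval[threadIdx.x + s] > sval[threadIdx.x]) {
			sval[threadIdx.x] = sval[threadIdx.x + s];
			sidx[threadIdx.x] = sidx[threadIdx.x + s];
		}
		__syncthreads();
	}

	if (threadIdx.x == 0) {
		out_val[blockIdx.x] = sval[0];
		out_idx[blockIdx.x] = sidx[0];
	}
}

Launch it with blockDim.x * (sizeof(double) + sizeof(int)) bytes of dynamic shared memory. A second pass over the per-block values then picks the winning block; the original index is that block's entry in out_idx.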

@avidday, here’s how I used your kernel:

int blocksize = 16; //multiple of 32

int nblocks = ((pix3)+blocksize-1)/blocksize; //round to max

// printf("\nnblocks = %d\n", nblocks);

dim3 dimBlock(blocksize,blocksize);

dim3 dimGrid(nblocks,nblocks);

cufftReal *norms1, *norms2;

cudaMalloc((void**) &norms1, sizeof(cufftReal) * blocksize);

cudaMalloc((void**) &norms2, sizeof(cufftReal) * blocksize);

	

find_max_reduced <<< dimGrid, dimBlock >>> (pix1, out1_d, norms1); 

find_max_reduced <<< 1,blocksize >>> (pix1, norms1, norms2);

__global__ void find_max_reduced(int Nx, cufftReal *in1, cufftReal *norms)

{

	extern __shared__ cufftReal buff[];

	volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;

	volatile int stride = gridDim.x * blockDim.x;

	// Loop through vector to find local maximum

	cufftReal lumax = 0; 

	for( unsigned int i = tidx; i < Nx; i+=stride ) 

	{

		lumax = fmax( lumax, fabs(in1[i]) );

	}

	// Load into shared memory

	buff[threadIdx.x] = lumax; __syncthreads();

	// First warp performs shared memory reduction

	if (threadIdx.x < warpSize) {

		#pragma unroll 

		for(int i=warpSize; i<blockDim.x; i+=warpSize) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+i]);

		}

		if (threadIdx.x < 16) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+16]);

		}

		if (threadIdx.x < 8) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+8]);

		}

		if (threadIdx.x < 4) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+4]);

		}

		if (threadIdx.x < 2) {

			buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+2]);

		}

		// Finalise and write out the results to global memory

		if (threadIdx.x == 0)  { 

			norms[blockIdx.x] = fmax(buff[0], buff[1]);

		}

	}

}

I get a segmentation fault error. If I understand your kernel correctly, the maximum value is always in norms[0]. How can I find the index?

Wouldn’t using Thrust be the fastest way of doing this from a development perspective? This is very easy to do with Thrust: thrust::max_element returns an iterator to the max/min element, so if you need the index it is easy to determine, since the iterators are effectively pointers.
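For example, a minimal self-contained version (names hypothetical) might look like this; subtracting begin() from the returned iterator gives the linear index directly:

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <cstdio>

int main()
{
	// toy data; in this thread it would be the correlation surface on the GPU
	double h[6] = {0.1, 3.5, 2.2, 7.9, 1.0, 4.4};
	thrust::device_vector<double> d_vec(h, h + 6);

	thrust::device_vector<double>::iterator it =
		thrust::max_element(d_vec.begin(), d_vec.end());

	int position = it - d_vec.begin(); // linear index of the maximum
	double value = *it;                // dereferencing copies the element to the host

	printf("max = %f at index %d\n", value, position);
	return 0;
}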

@eelsen: I did use Thrust, but thrust::max_element gave me an error. I did read that writing custom kernels can be faster. Eventually, I will compare the execution time of both approaches and include the faster one in the implementation.

//find max value and its index

int position = thrust::max_element(vec_sumh.begin(), vec_sumh.end()) - vec_sumh.begin(); 

double value = vec_sumh[position];

int shifted_y = position/7;

int shifted_x = position - (shifted_y*7);

The kernel requires runtime-declared shared memory, which you have not allocated in the launch. It should be 1 word per thread. Also, use many threads per block (for example 128), and launch something like 4 blocks per MP of your GPU. If you need the position of the maximum, then expand the code to save the position whenever a new maximum value is found. Note also that the code I posted finds the absolute maximum, not the maximum. I posted it as an example of how a reduction could be done for an arbitrary array, not as a solution to your problem, with the expectation that you would modify it for your needs. If you can’t do that, then take eelsen’s very sound advice and stick with Thrust.
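Concretely, a corrected launch for the code above might look like this (a sketch; the thread and block counts are just the examples suggested here, and the device query is one way to get the multiprocessor count). The kernel only uses threadIdx.x, so the launch should be 1D:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

int threads  = 128;                           // many threads per block
int blocks   = 4 * prop.multiProcessorCount;  // ~4 blocks per MP
size_t shmem = threads * sizeof(cufftReal);   // one word per thread

cufftReal *norms1, *norms2;
cudaMalloc((void**)&norms1, blocks * sizeof(cufftReal)); // one partial per block
cudaMalloc((void**)&norms2, sizeof(cufftReal));

// third launch parameter = dynamic shared memory size in bytes
find_max_reduced<<<blocks, threads, shmem>>>(pix1, out1_d, norms1);
find_max_reduced<<<1, threads, shmem>>>(blocks, norms1, norms2);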

max_element returns an iterator (which behaves like a pointer).

if vec_sumh is on the host you could simply do:

double value = *(thrust::max_element(vec_sumh.begin(), vec_sumh.end()));

and if you want the integral offset into the array, the iterator difference is already the element index:

size_t offset = thrust::max_element(vec_sumh.begin(), vec_sumh.end()) - vec_sumh.begin();

According to this post by Nathan Bell, vivekv80’s code snippet should be working fine.