Where is the bottleneck in my parallel max reduction code?

gurifisu · September 5, 2011, 7:20am

I need to do parallel max index reduction (find the largest element in an array and return its index), I’m using a float2 vector type to store everything. The performance is only marginally better for very large arrays (like > 2 million elements), and worse for small number of arrays. Here’s the relevant sections of the code:

// INPUT: float array, array_in, contains only the values of the array

// OUTPUT: float2 array, array_out.x are the values, array_out.y are the indices

// EXTRA: converts float array to float2

__global__ void parallel_maxIndexPrep(float const * const array_in, float2 *array_out, int const N) {

	

	extern __shared__ float2 sdata[];

	int tpb = blockDim.x;

	int bid = blockIdx.x;

	int tid = threadIdx.x;

	int i = bid*tpb + tid;

	if( i < N ) {

		sdata[tid].x=array_in[i];

		sdata[tid].y=i;

		array_out[i]=sdata[tid];

	}

}

__global__ void parallel_maxIndex(float2 const * const array_in, float2 * const array_out, int const N) {

	extern __shared__ float2 sdata[];

	int tpb = blockDim.x;

	int bid = blockIdx.x;

	int tid = threadIdx.x;

	int i = bid*tpb*2 + tid;

	//load into shared memory

	//if we are in the last block

	if(gridDim.x - 1 == bid) {

		//remainder left:

		int R = N - (2*(gridDim.x-1))*tpb;

		

		//if two blocks left

		if ( R > tpb ) {

			if( tid < (R-tpb) ) {

				sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];

			}

			else {

				sdata[tid] = array_in[i]; 

			}

		} 

		//if one block left

		else {

			if(tid < R) {

				sdata[tid]=array_in[i];

			}

			else {

				sdata[tid].x=0;

				sdata[tid].y=0;

			}

		}

	}

	else {

		//OK!

		sdata[tid]= (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];

	}

	__syncthreads();

	//this way we have coalesced reads that avoid bank-conflicts?

	for(unsigned int s=tpb/2;s>0;s>>=1) {

		

		if(tid < s) {

			if(sdata[tid+s].x > sdata[tid].x) {

				sdata[tid] = sdata[tid+s];

			}

		}

		__syncthreads();

	}

	if(tid==0) {

		array_out[bid] = sdata[0];

	}

}

//the function that launches the kernels

float2 *recursive_parallel_maxIndex(float2 * d_array_in, int const N) {

	int nblocks = ceil((float) N / (float) (THREADSPERBLOCK*2));

	float2 *d_array_out;

	cudaMalloc((void **) &d_array_out, sizeof(*d_array_out)*nblocks);

	int smem = sizeof(float2) * THREADSPERBLOCK;

	//each threadperblock will need to do additional work!

	int nblocksLaunch = min(65536, nblocks);

	

	assert(nblocksLaunch < MAXBLOCKSPERGRID);

//	printf("Launching parallel_maxIndex() with N=%d points....\n", N);

//	printf("num points: %d\nnum blocks launched: %d\nthreads per block: %d\n", N, nblocks, THREADSPERBLOCK);

	parallel_maxIndex<<< nblocksLaunch, THREADSPERBLOCK, smem>>>(d_array_in, d_array_out, N);

	// *****DEBUG******************************************

	printf("______________________________________________________\n");

	float2 *h_array_input = (float2 *) malloc(sizeof(float2)*N);

	cudaMemcpy(h_array_input,d_array_in,sizeof(float2)*N,cudaMemcpyDeviceToHost);

	print_array(h_array_input, N);

	printf("||||||||||||||||||||||||||||||||||||||||||||||||||||||\n");

	float2 *h_array_debug = (float2 *) malloc(sizeof(float2)*nblocks);

	cudaMemcpy(h_array_debug,d_array_out,sizeof(float2)*nblocks,cudaMemcpyDeviceToHost);

	print_array(h_array_debug, nblocks);

	printf("Size of nblocks %d\n", nblocks);

	

	// *****DEBUG******************************************

	cudaFree(d_array_in);

	//if we have more points than threads 

	if(N > THREADSPERBLOCK) {

		return recursive_parallel_maxIndex(d_array_out, nblocks); 

	} 

	//after we have reached termination	

	else {

		//printf("Termination reached!\n");

		//this last pointer should never be de-allocated by this function!

		//it should be passed onto the very first function that calls recursive_..()

		printf("Target memory address: %p\n", d_array_out);

		return d_array_out;

	}

}

Jimmy_Pettersson · September 5, 2011, 2:01pm

Another thread on speedy min/max reductions:

Most of these codes are based on the reduction sum codes written by Mark Harris. So his reduction sum code AND documentation found in the SDK is a good starting point for doing this type of operations efficiently on the GPU.

djmj1000 · September 5, 2011, 9:27pm

Here are Mark Harris slides, they are very good to follow and to implement your own reduction algorithm and how to speed it up!

http://www.gpgpu.org/static/sc2007/SC07_CUDA_5_Optimization_Harris.pdf

Topic		Replies	Views
Min Max problem in parallel CUDA Programming and Performance	2	1600	September 25, 2008
CUDA reduction CUDA Programming and Performance	10	51485	June 7, 2009
find maximum value in an array along with index CUDA Programming and Performance	40	18803	October 11, 2010
How to realize that CUDA Programming and Performance	6	1896	February 16, 2010
Reduction in kernel function Get max value in vector CUDA Programming and Performance	8	13795	May 5, 2011
Find the Maximum value among 16 threads CUDA Programming and Performance	2	3685	June 12, 2008
parallel maximum detection bad performance CUDA Programming and Performance	11	3572	June 5, 2008
Cumpute Max of Vector or Matrix CUDA Programming and Performance	7	3796	June 6, 2011
about finding a max number from a big array CUDA Programming and Performance	10	4740	January 7, 2020
parallel reduction scope difference of two arrays CUDA Programming and Performance	2	2221	June 28, 2009

Where is the bottleneck in my parallel max reduction code?

Related topics