Where is the bottleneck in my parallel max reduction code?

I need to do parallel max index reduction (find the largest element in an array and return its index), I’m using a float2 vector type to store everything. The performance is only marginally better for very large arrays (like > 2 million elements), and worse for small number of arrays. Here’s the relevant sections of the code:

// INPUT: float array, array_in, contains only the values of the array

// OUTPUT: float2 array, array_out.x are the values, array_out.y are the indices

// EXTRA: converts float array to float2

__global__ void parallel_maxIndexPrep(float const * const array_in, float2 *array_out, int const N) {

	

	extern __shared__ float2 sdata[];

	int tpb = blockDim.x;

	int bid = blockIdx.x;

	int tid = threadIdx.x;

	int i = bid*tpb + tid;

	if( i < N ) {

		sdata[tid].x=array_in[i];

		sdata[tid].y=i;

		array_out[i]=sdata[tid];

	}

}

__global__ void parallel_maxIndex(float2 const * const array_in, float2 * const array_out, int const N) {

	extern __shared__ float2 sdata[];

	int tpb = blockDim.x;

	int bid = blockIdx.x;

	int tid = threadIdx.x;

	int i = bid*tpb*2 + tid;

	//load into shared memory

	//if we are in the last block

	if(gridDim.x - 1 == bid) {

		//remainder left:

		int R = N - (2*(gridDim.x-1))*tpb;

		

		//if two blocks left

		if ( R > tpb ) {

			if( tid < (R-tpb) ) {

				sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];

			}

			else {

				sdata[tid] = array_in[i]; 

			}

		} 

		//if one block left

		else {

			if(tid < R) {

				sdata[tid]=array_in[i];

			}

			else {

				sdata[tid].x=0;

				sdata[tid].y=0;

			}

		}

	}

	else {

		//OK!

		sdata[tid]= (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];

	}

	__syncthreads();

	//this way we have coalesced reads that avoid bank-conflicts?

	for(unsigned int s=tpb/2;s>0;s>>=1) {

		

		if(tid < s) {

			if(sdata[tid+s].x > sdata[tid].x) {

				sdata[tid] = sdata[tid+s];

			}

		}

		__syncthreads();

	}

	if(tid==0) {

		array_out[bid] = sdata[0];

	}

}

//the function that launches the kernels

float2 *recursive_parallel_maxIndex(float2 * d_array_in, int const N) {

	int nblocks = ceil((float) N / (float) (THREADSPERBLOCK*2));

	float2 *d_array_out;

	cudaMalloc((void **) &d_array_out, sizeof(*d_array_out)*nblocks);

	int smem = sizeof(float2) * THREADSPERBLOCK;

	//each threadperblock will need to do additional work!

	int nblocksLaunch = min(65536, nblocks);

	

	assert(nblocksLaunch < MAXBLOCKSPERGRID);

//	printf("Launching parallel_maxIndex() with N=%d points....\n", N);

//	printf("num points: %d\nnum blocks launched: %d\nthreads per block: %d\n", N, nblocks, THREADSPERBLOCK);

	parallel_maxIndex<<< nblocksLaunch, THREADSPERBLOCK, smem>>>(d_array_in, d_array_out, N);

	// *****DEBUG******************************************

	printf("______________________________________________________\n");

	float2 *h_array_input = (float2 *) malloc(sizeof(float2)*N);

	cudaMemcpy(h_array_input,d_array_in,sizeof(float2)*N,cudaMemcpyDeviceToHost);

	print_array(h_array_input, N);

	printf("||||||||||||||||||||||||||||||||||||||||||||||||||||||\n");

	float2 *h_array_debug = (float2 *) malloc(sizeof(float2)*nblocks);

	cudaMemcpy(h_array_debug,d_array_out,sizeof(float2)*nblocks,cudaMemcpyDeviceToHost);

	print_array(h_array_debug, nblocks);

	printf("Size of nblocks %d\n", nblocks);

	

	// *****DEBUG******************************************

	cudaFree(d_array_in);

	//if we have more points than threads 

	if(N > THREADSPERBLOCK) {

		return recursive_parallel_maxIndex(d_array_out, nblocks); 

	} 

	//after we have reached termination	

	else {

		//printf("Termination reached!\n");

		//this last pointer should never be de-allocated by this function!

		//it should be passed onto the very first function that calls recursive_..()

		printf("Target memory address: %p\n", d_array_out);

		return d_array_out;

	}

}

Another thread on speedy min/max reductions:

http://forums.nvidia.com/index.php?showtopic=177498

Most of these codes are based on the reduction sum codes written by Mark Harris. So his reduction sum code AND documentation found in the SDK is a good starting point for doing this type of operations efficiently on the GPU.

Here are Mark Harris slides, they are very good to follow and to implement your own reduction algorithm and how to speed it up!

http://www.gpgpu.org/static/sc2007/SC07_CUDA_5_Optimization_Harris.pdf