I need to do a parallel max-index reduction (find the largest element in an array and return its index); I'm using a float2 vector type to store everything. The performance is only marginally better for very large arrays (> 2 million elements), and worse for small arrays. Here are the relevant sections of the code:
// INPUT: float array, array_in, contains only the values of the array
// OUTPUT: float2 array, array_out.x are the values, array_out.y are the indices
// EXTRA: converts float array to float2
__global__ void parallel_maxIndexPrep(float const * const array_in, float2 *array_out, int const N) {
// Pack each input value together with its own index into a float2 so the
// reduction kernel can carry the index alongside the value.
// NOTE(review): the index is stored in a float, so it is only exact for
// N <= 2^24 (float mantissa limit) -- confirm N stays below that.
int i = blockIdx.x*blockDim.x + threadIdx.x;
if( i < N ) {
// Write straight to global memory: the old round trip through shared
// memory bought nothing here (no data reuse, no cross-thread
// communication) and forced every launch to allocate dynamic shared
// memory it never needed.
array_out[i] = make_float2(array_in[i], (float) i);
}
}
// Block-level max+index reduction over float2 (value, index) pairs.
// Each block reduces up to 2*blockDim.x consecutive elements of array_in and
// writes the winning pair to array_out[blockIdx.x].
// Preconditions:
//   - launched with gridDim.x == ceil(N / (2*blockDim.x)) blocks
//     (the last-block remainder logic below depends on this)
//   - dynamic shared memory of blockDim.x * sizeof(float2)
//   - blockDim.x is a power of two (the halving loop relies on it)
__global__ void parallel_maxIndex(float2 const * const array_in, float2 * const array_out, int const N) {
extern __shared__ float2 sdata[];
int tpb = blockDim.x;
int bid = blockIdx.x;
int tid = threadIdx.x;
// Each thread owns two candidate elements: i and i+tpb.
int i = bid*tpb*2 + tid;
// Load into shared memory, pre-reducing each thread's two candidates.
// If we are in the last block, N may not fill the full 2*tpb window.
if(gridDim.x - 1 == bid) {
// Number of input elements remaining for this block:
int R = N - (2*(gridDim.x-1))*tpb;
// More than tpb elements left: threads below (R-tpb) have both
// candidates in range, the rest only have the first one.
if ( R > tpb ) {
if( tid < (R-tpb) ) {
sdata[tid] = (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
}
else {
sdata[tid] = array_in[i];
}
}
// At most tpb elements left: only the first R threads have data.
else {
if(tid < R) {
sdata[tid]=array_in[i];
}
else {
// BUG FIX: padding with (0,0) silently won the max whenever every
// real value was negative. Pad with a guaranteed-valid element
// instead -- the block's first input element always exists (R >= 1),
// and duplicating a real candidate cannot change the true maximum.
sdata[tid] = array_in[bid*tpb*2];
}
}
}
else {
// Interior block: both candidates are guaranteed in range.
sdata[tid]= (array_in[i+tpb].x > array_in[i].x) ? array_in[i+tpb] : array_in[i];
}
__syncthreads();
// Tree reduction in shared memory with sequential addressing: the active
// threads stay contiguous, giving conflict-free shared-memory accesses.
for(unsigned int s=tpb/2;s>0;s>>=1) {
if(tid < s) {
if(sdata[tid+s].x > sdata[tid].x) {
sdata[tid] = sdata[tid+s];
}
}
// Barrier outside the divergent branch: every thread must reach it.
__syncthreads();
}
// Thread 0 publishes this block's winning (value, index) pair.
if(tid==0) {
array_out[bid] = sdata[0];
}
}
//the function that launches the kernels
// Recursively reduces the device array d_array_in (N float2 (value, index)
// pairs) down to a single pair holding the global maximum and its index.
// OWNERSHIP: d_array_in is freed by this function on every level; the
// returned device pointer (a single float2) must be freed by the original
// caller -- never by this function.
float2 *recursive_parallel_maxIndex(float2 * d_array_in, int const N) {
// Each block consumes 2*THREADSPERBLOCK input elements.
int nblocks = ceil((float) N / (float) (THREADSPERBLOCK*2));
float2 *d_array_out;
cudaMalloc((void **) &d_array_out, sizeof(*d_array_out)*nblocks);
int smem = sizeof(float2) * THREADSPERBLOCK;
// BUG FIX: the kernel's last-block remainder logic assumes
// gridDim.x == nblocks, so the old min(65536, nblocks) clamp would have
// silently skipped data on very large inputs. Fail loudly instead.
assert(nblocks < MAXBLOCKSPERGRID);
// printf("Launching parallel_maxIndex() with N=%d points....\n", N);
// printf("num points: %d\nnum blocks launched: %d\nthreads per block: %d\n", N, nblocks, THREADSPERBLOCK);
parallel_maxIndex<<< nblocks, THREADSPERBLOCK, smem>>>(d_array_in, d_array_out, N);
// *****DEBUG******************************************
// (the blocking cudaMemcpy below also synchronizes with the kernel)
printf("______________________________________________________\n");
float2 *h_array_input = (float2 *) malloc(sizeof(float2)*N);
cudaMemcpy(h_array_input,d_array_in,sizeof(float2)*N,cudaMemcpyDeviceToHost);
print_array(h_array_input, N);
free(h_array_input);  // BUG FIX: was leaked on every recursion level
printf("||||||||||||||||||||||||||||||||||||||||||||||||||||||\n");
float2 *h_array_debug = (float2 *) malloc(sizeof(float2)*nblocks);
cudaMemcpy(h_array_debug,d_array_out,sizeof(float2)*nblocks,cudaMemcpyDeviceToHost);
print_array(h_array_debug, nblocks);
free(h_array_debug);  // BUG FIX: was leaked on every recursion level
printf("Size of nblocks %d\n", nblocks);
// *****DEBUG******************************************
// This level's input has been fully consumed.
cudaFree(d_array_in);
// Recurse until a single pair remains. Testing nblocks (the output size)
// instead of the old N > THREADSPERBLOCK avoids one extra, pointless
// kernel pass whenever THREADSPERBLOCK < N <= 2*THREADSPERBLOCK
// (nblocks is already 1 there); the returned contents are identical.
if(nblocks > 1) {
return recursive_parallel_maxIndex(d_array_out, nblocks);
}
//printf("Termination reached!\n");
//this last pointer should never be de-allocated by this function!
//it should be passed onto the very first function that calls recursive_..()
printf("Target memory address: %p\n", d_array_out);
return d_array_out;
}