Performance Degradation After Fully Unrolling the Reduction Process

Hi Experts,

After reading some material, including Mark Harris’s step-by-step reduction optimization (“Optimizing Parallel Reduction in CUDA”), I tried to optimize my top-K kernel in a similar way:

I fully unrolled each block’s local top-K reduction (templated block size plus a warp-synchronous final stage). Strangely, though, Nsight Compute shows a noticeable performance drop for the unrolled version.
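For reference, the pattern I tried to mirror is the classic unrolled warp stage from those slides, roughly the following (a sum-reduction sketch from memory; in my kernel the += becomes a top-K merge, and it relies on the old volatile warp-synchronous behaviour):

template <unsigned int blockSize>
__device__ void warpReduce(volatile float* sdata, unsigned int tid) {
    // Last strides of the tree reduction, unrolled and run without
    // __syncthreads() because only one warp is still active.
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >=  8) sdata[tid] += sdata[tid + 4];
    if (blockSize >=  4) sdata[tid] += sdata[tid + 2];
    if (blockSize >=  2) sdata[tid] += sdata[tid + 1];
}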

Below is the code before (top_k_gpu) and after (top_k_gpu_opt) the optimization. Please ignore any logical errors in the top_k_gpu_opt code for now.

Maybe I unrolled it incorrectly, but even if that’s the case, I wouldn’t expect the unrolled version to be slower than the loop-based one.

__device__ inline void replace_smaller(float* array, int* idx_buffer, int k, float data, int data_idx) {
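    // `array` holds the current top-k in descending order. If `data` beats the
    // smallest kept value, shift the smaller entries down and insert it (with its index).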
    if(data < array[k-1]) return;
        
    for(int j=k-2; j>=0; j--) {
        if(data > array[j]) {
            array[j+1] = array[j];
            idx_buffer[j+1] = idx_buffer[j];
        } else {
            array[j+1] = data;
            idx_buffer[j+1] = data_idx;
            return;
        }
    }

    array[0] = data;
    idx_buffer[0] = data_idx;
}

__device__ inline void mergeTwoK(float* left, int* left_idx, float* right, int* right_idx, int k) {
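    // Merge the `right` thread's k candidates into the `left` thread's sorted buffer.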
    int i;
    for(i=0; i<k; i++) {
        replace_smaller(left, left_idx, k, right[i], right_idx[i]);
    }
}

__device__ inline void replace_smaller_warp(volatile float* array, volatile int* idx_buffer, int k, float data, int data_idx) {
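    // Same insertion as replace_smaller, but through volatile pointers for the
    // warp-synchronous stage that runs without __syncthreads().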
    if(data < array[k-1]) return;
        
    for(int j=k-2; j>=0; j--) {
        if(data > array[j]) {
            array[j+1] = array[j];
            idx_buffer[j+1] = idx_buffer[j];
        } else {
            array[j+1] = data;
            idx_buffer[j+1] = data_idx;
            return;
        }
    }

    array[0] = data;
    idx_buffer[0] = data_idx;
}

__device__ inline void mergeTwoK_warp(volatile float* left, volatile int* left_idx, volatile float* right, volatile int* right_idx, int k) {
    int i;
    for(i=0; i<k; i++) {
        replace_smaller_warp(left, left_idx, k, right[i], right_idx[i]);
    }
}

template<unsigned int numThreads>
__global__ void top_k_gpu_opt(float* input, int length, int k, float* output, int* out_idx) {
    extern __shared__ float shared_buffer[];
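    // Shared memory layout: numThreads * k floats (values) followed by
    // numThreads * k ints (indices).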

    int gtid = threadIdx.x + blockIdx.x * numThreads;
    int tid = threadIdx.x;
    float *buffer = shared_buffer + tid * k;
    int *buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + tid * k;

    int threadNum = gridDim.x * numThreads;

    for (int i = 0; i < k; ++i) {
        buffer[i] = -FLT_MAX;
        buffer_idx[i] = -1;
    }

    for (int i = gtid; i < length; i += threadNum) {
        replace_smaller(buffer, buffer_idx, k, input[i], i);
    }
    __syncthreads();
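    // Unrolled tree reduction follows: strides 512..64 use __syncthreads(),
    // strides 32..1 run warp-synchronously through the volatile helpers.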

    if (numThreads >= 1024) {
        if (tid < 512) {
            float* next_buffer = shared_buffer + (tid + 512) * k;
            int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 512) * k;
            mergeTwoK(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        __syncthreads();
    }
    if (numThreads >= 512) {
        if (tid < 256) {
            float* next_buffer = shared_buffer + (tid + 256) * k;
            int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 256) * k;
            mergeTwoK(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        __syncthreads();
    }
    if (numThreads >= 256) {
        if (tid < 128) {
            float* next_buffer = shared_buffer + (tid + 128) * k;
            int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 128) * k;
            mergeTwoK(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        __syncthreads();
    }
    if (numThreads >= 128) {
        if (tid < 64) {
            float* next_buffer = shared_buffer + (tid + 64) * k;
            int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 64) * k;
            mergeTwoK(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        __syncthreads();
    }

    if (tid < 32) {
        if (numThreads >= 64) {
            volatile float* next_buffer = shared_buffer + (tid + 32) * k;
            volatile int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 32) * k;
            mergeTwoK_warp(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        if (numThreads >= 32) {
            volatile float* next_buffer = shared_buffer + (tid + 16) * k;
            volatile int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 16) * k;
            mergeTwoK_warp(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        if (numThreads >= 16) {
            volatile float* next_buffer = shared_buffer + (tid + 8) * k;
            volatile int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 8) * k;
            mergeTwoK_warp(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        if (numThreads >= 8) {
            volatile float* next_buffer = shared_buffer + (tid + 4) * k;
            volatile int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 4) * k;
            mergeTwoK_warp(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        if (numThreads >= 4) {
            volatile float* next_buffer = shared_buffer + (tid + 2) * k;
            volatile int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 2) * k;
            mergeTwoK_warp(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        if (numThreads >= 2) {
            volatile float* next_buffer = shared_buffer + (tid + 1) * k;
            volatile int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + numThreads * k) + (tid + 1) * k;
            mergeTwoK_warp(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }

        if (tid == 0) {
            for (int i = 0; i < k; ++i) {
                output[blockIdx.x * k + i] = buffer[i];
                out_idx[blockIdx.x * k + i] = buffer_idx[i];
            }
        }
    }
}

__global__ void top_k_gpu(float* input, int length, int k, float* output, int* out_idx) {
    extern __shared__ float shared_buffer[];

    int gtid = threadIdx.x + blockIdx.x * blockDim.x;
    int tid = threadIdx.x;
    float *buffer = shared_buffer + tid * k;
    int *buffer_idx = reinterpret_cast<int*>(shared_buffer + blockDim.x * k) + tid * k;

    int threadNum = gridDim.x * blockDim.x;

    for (int i = 0; i < k; ++i) {
        buffer[i] = -FLT_MAX;
        buffer_idx[i] = -1;
    }

    for (int i = gtid; i < length; i += threadNum) {
        replace_smaller(buffer, buffer_idx, k, input[i], i);
    }
    __syncthreads();
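    // Tree reduction: at each stride, the lower half of the active threads
    // merges the buffer of its partner `stride` slots away.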

    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            float* next_buffer = shared_buffer + (tid + stride) * k;
            int* next_buffer_idx = reinterpret_cast<int*>(shared_buffer + blockDim.x * k) + (tid + stride) * k;
            mergeTwoK(buffer, buffer_idx, next_buffer, next_buffer_idx, k);
        }
        __syncthreads();
    }

    if (tid == 0) {
        for (int i = 0; i < k; ++i) {
            output[blockIdx.x * k + i] = buffer[i];
            out_idx[blockIdx.x * k + i] = buffer_idx[i];
        }
    }
}

Host launch code (from main):

int k = 4;

const int threadsPerBlock = 128;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
int sharedMemSize = threadsPerBlock * k * sizeof(int) + threadsPerBlock * k * sizeof(float);

thrust::device_vector<float> d_res_first(k * blocksPerGrid);
thrust::device_vector<int> d_idx_first(k * blocksPerGrid);

top_k_gpu<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_vec.data().get(), n, k, d_res_first.data().get(), d_idx_first.data().get());
top_k_gpu_opt<128><<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_vec.data().get(), n, k, d_res_first.data().get(), d_idx_first.data().get());

Reply:

Since I’m not an expert in CUDA C (I do my best, but this is beyond my expertise), and this forum is for the NVHPC compilers as well as CUDA Fortran, C++ STDPAR, and directive-based programming (OpenACC, OpenMP), I’ll move your post over to the CUDA C forum, where you might get better help.

However, I believe current practice is to use warp shuffle operations for reductions, as outlined in this article. I’m not sure it maps directly onto your case, but it’s something to consider.
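For illustration, the warp-level building block looks roughly like this (a sketch of the k = 1 case, i.e. a plain max, using the current *_sync shuffle intrinsics; the helper name is just for illustration, and carrying k candidates per thread through the shuffles would take more work):

__inline__ __device__ float warpReduceMax(float val) {
    // Each step pulls a value from a lane `offset` higher and keeps the larger
    // one, so lane 0 ends up with the warp-wide maximum -- registers only,
    // no shared memory and no __syncthreads().
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val = fmaxf(val, __shfl_down_sync(0xffffffffu, val, offset));
    return val;
}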

-Mat