The attached is my reduction code. It’s quite simple and not fully optimized, but the measured performance is already fast and makes me quite surprised.
The obtained result on my machine (GTX 295) is:
[font=“Courier New”]Use device 0: “GeForce GTX 295”
Measured bandwidth=106.4GB/s
Reduce size=135782400, use time=5.105375s, GPU_result=611004689, CPU_result=611004689
Test PASSED[/font]
and we know that the theoretical peak of the GTX 295 is 999 MHz × 2 (DDR) × 448 bit / 8 = 111.9GB/s per GPU. So the absolute efficiency is about 95%. I don’t know what is the fastest reduction code in the world, but this score is clearly faster than the SDK reduction example, which reaches about 98–99GB/s measured on the same machine (only 88% of the theoretical peak).
I don’t know whether this code is as efficient on other GPUs as it is on the GTX 295; any comments are appreciated.
Note: the default parameters in the code need about 500 MB of global memory, so if you do not have enough memory you should reduce the value of the macro K. Also, the default parameters are tuned for the GTX 295; on other GPUs they might not be optimal and should be tuned manually for the best performance.
(a more efficient code is posted on #13 of this thread)
#include <stdio.h>
#include <stdlib.h>
#include "cutil_inline.h"
// Tuning parameters (chosen for a GTX 295; retune on other GPUs).
#define M (240*20)  // number of thread blocks launched by reduce1
#define N 64        // threads per block (also the shared-memory tile size)
#define K 442 // Please reduce this value if out of memory
#define SIZE (M*N*K)  // total element count; SIZE*sizeof(int) ~= 518 MiB with defaults
/**
 * Stage 1 of the sum reduction.  Launched as <<<M, N>>> (see do_reduction):
 * each of the M blocks writes one partial sum of `data` into res[blockIdx.x].
 *
 * Every thread accumulates elements data[gid], data[gid + M*N], ... in a
 * register (stride M*N == gridDim.x*blockDim.x for this launch, so loads are
 * coalesced across the warp), deposits its sum into shared memory, and after
 * the barrier thread 0 serially folds the block's N partial sums.
 */
__global__ void reduce1(int *data, int *res) {
  __shared__ int partial[N];
  const int tid = threadIdx.x;
  const int gid = blockIdx.x * blockDim.x + tid;

  int acc = 0;
  for (int idx = gid; idx < SIZE; idx += M * N)
    acc += data[idx];
  partial[tid] = acc;

  // All per-thread partials must be visible before thread 0 reads them.
  __syncthreads();

  if (tid == 0) {
    int total = 0;
    for (int k = 0; k < N; k++)
      total += partial[k];
    res[blockIdx.x] = total;
  }
}
/**
 * Stage 2 of the sum reduction.  Launched as <<<1, N>>> (see do_reduction):
 * a single block collapses the M per-block partial sums written by reduce1
 * into data[0].  Same pattern as reduce1: register accumulation, shared-
 * memory staging, barrier, then a serial fold by thread 0.
 */
__global__ void reduce2(int *data) {
  __shared__ int partial[N];
  const int tid = threadIdx.x;

  int acc = 0;
  for (int idx = tid; idx < M; idx += N)
    acc += data[idx];
  partial[tid] = acc;

  __syncthreads();  // publish partials before thread 0 reads them

  if (tid == 0) {
    int total = 0;
    for (int k = 0; k < N; k++)
      total += partial[k];
    data[0] = total;
  }
}
/**
 * Runs the full two-stage reduction: reduce1 leaves M partial sums in
 * res[0..M-1], reduce2 collapses them so the grand total ends up in res[0].
 *
 * @param data  device pointer to SIZE ints (input, unmodified)
 * @param res   device pointer with room for at least M ints (output scratch)
 *
 * Kernel launches return no status themselves, so launch-configuration
 * errors are surfaced here via cudaGetLastError(); execution errors still
 * appear at the caller's next synchronizing call.
 */
void do_reduction(int *data, int *res) {
  reduce1<<<M, N>>>(data, res);
  reduce2<<<1, N>>>(res);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "reduction kernel launch failed: %s\n",
            cudaGetErrorString(err));
}
// Host-side input buffer.  Allocated on the heap in main() because SIZE
// elements (~518 MiB with the default parameters) would overflow a static
// array on many toolchains and certainly the stack.
// BUG FIX: the original declared `int A;` (a scalar) yet indexed it as an
// array (`A[i] = ...`), which does not compile.
static int *A;

/**
 * Benchmark driver: fills SIZE random ints in [0,9], runs the GPU reduction
 * 1000 times (after one untimed warm-up launch), reports the sustained
 * read bandwidth, and verifies the GPU sum against a CPU reference.
 * Returns 0 on success, 1 on host allocation failure.
 */
int main() {
  int GPU_result, CPU_result = 0, dev = 0;
  int *data, *res;
  unsigned int t1;

  srand(2010);  // fixed seed for reproducible data and CPU/GPU comparison
  cudaSetDevice(dev);
  cudaDeviceProp deviceProp;
  cutilSafeCall(cudaGetDeviceProperties(&deviceProp, dev));
  printf("Use device %d: \"%s\"\n", dev, deviceProp.name);
  cutCreateTimer(&t1);

  A = (int*)malloc(SIZE * sizeof(int));
  if (A == NULL) {
    fprintf(stderr, "host malloc of %zu bytes failed\n",
            (size_t)SIZE * sizeof(int));
    return 1;
  }

  cutilSafeCall(cudaMalloc((void**)&data, SIZE * sizeof(int)));
  // reduce1 writes M partial sums; SIZE*sizeof(int)/K == M*N*sizeof(int)
  // bytes, which is more than enough room.
  cutilSafeCall(cudaMalloc((void**)&res, SIZE * sizeof(int) / K));

  for (int i = 0; i < SIZE; i++) A[i] = rand() % 10, CPU_result += A[i];
  cutilSafeCall(cudaMemcpy(data, A, SIZE * sizeof(int),
                           cudaMemcpyHostToDevice));

  do_reduction(data, res);   // warm-up launch, excluded from timing
  cudaThreadSynchronize();

  cutStartTimer(t1);
  for (int ii = 0; ii < 1000; ii++) do_reduction(data, res);
  cudaThreadSynchronize();
  cutStopTimer(t1);
  // cutil timers report milliseconds; dt is the total seconds for the
  // 1000 timed iterations.
  float dt = cutGetTimerValue(t1) / 1000.0f;

  cutilSafeCall(cudaMemcpy(&GPU_result, res, sizeof(int),
                           cudaMemcpyDeviceToHost));
  // Each of the 1000 iterations reads SIZE*sizeof(int) bytes from global
  // memory, so aggregate bandwidth = bytes * 1000 / total-seconds.
  printf("Measured bandwidth=%.1fGB/s\n", 1E-9 * SIZE * sizeof(int) * 1000 / dt);
  printf("Reduce size=%d, use time=%fs, GPU_result=%d, CPU_result=%d\n",
         SIZE, dt, GPU_result, CPU_result);
  printf("Test %s\n", (GPU_result == CPU_result) ? "PASSED" : "FAILED");

  cudaFree(res);
  cudaFree(data);
  free(A);
  return 0;
}