My reduction code is not as fast as I expected

Hello.

I wrote kernels to compute the mean of a set of 3D vectors.

At first I used a very simple reduction, but it was not as fast as I expected.

So I wrote another one, borrowed from the NVIDIA GPU Computing SDK reduction sample, which is reported to be much faster than a simple reduction.

However, the execution time of the new one is actually a bit slower than the first one on my machine…

Can anybody give me a comment on how to make the mean vector computation faster?

The number of vectors is fixed at 25,600 (160 x 160).

I’m using a GTX 580 on Windows 7 Enterprise.

My code is below:

Simple reduction kernels:

It consists of two kernels: one reduces the input to 100 partial sums (compute_cent_256),

and the other sums those 100 partial sums (compute_cent_100).

I use the double4 vector type so that the w component can carry the number of meaningful vectors,

which means points equal to (0,0,0) are not used in the mean vector computation.

I have two sets of points, and their mean vectors are computed simultaneously. (A sketch of how the two kernels are launched and how the mean is recovered from the double4 result follows the second kernel below.)

__global__ void compute_cent_256(

	float3 *ap_pts1, 

	float3 *ap_pts2,

	int a_nelem_row,

	double4 *ap_dst1, 

	double4 *ap_dst2) 

{

	

	int gid_x = blockIdx.x * blockDim.x + threadIdx.x ;

    int gid_y = blockIdx.y * blockDim.y + threadIdx.y ;

	int l_thread_id = threadIdx.x + threadIdx.y * blockDim.x ; 

	int l_gid = gid_x + gid_y * a_nelem_row ;

	__shared__ double4 lp_buffer1[256] ;

	__shared__ double4 lp_buffer2[256] ;

	

	// load one point from each set into shared memory as a double4 (w will become the valid-point count)
	lp_buffer1[l_thread_id] = make_double4(ap_pts1[l_gid].x, ap_pts1[l_gid].y, ap_pts1[l_gid].z, 0) ;

	lp_buffer2[l_thread_id] = make_double4(ap_pts2[l_gid].x, ap_pts2[l_gid].y, ap_pts2[l_gid].z, 0) ;

	__syncthreads() ;

	// a point that is (0,0,0) in either set is excluded from both sums
	if((lp_buffer1[l_thread_id].x == 0 && lp_buffer1[l_thread_id].y == 0 && lp_buffer1[l_thread_id].z == 0) || 

	   (lp_buffer2[l_thread_id].x == 0 && lp_buffer2[l_thread_id].y == 0 && lp_buffer2[l_thread_id].z == 0)

	  )

	{

		lp_buffer1[l_thread_id] = lp_buffer2[l_thread_id] = make_double4(0,0,0,0) ;

	}

	else 

	{

		lp_buffer1[l_thread_id].w = 1 ;

		lp_buffer2[l_thread_id].w = 1 ;

	}

	__syncthreads() ;

	// tree reduction in shared memory: each step folds the upper half onto the lower half
	int b = 256/2 ;

	while(b>0) {

		if(l_thread_id < b) { 

			lp_buffer1[l_thread_id].x += lp_buffer1[l_thread_id + b].x  ;

			lp_buffer1[l_thread_id].y += lp_buffer1[l_thread_id + b].y  ;

			lp_buffer1[l_thread_id].z += lp_buffer1[l_thread_id + b].z  ;

			lp_buffer1[l_thread_id].w += lp_buffer1[l_thread_id + b].w  ;

			lp_buffer2[l_thread_id].x += lp_buffer2[l_thread_id + b].x  ;

			lp_buffer2[l_thread_id].y += lp_buffer2[l_thread_id + b].y  ;

			lp_buffer2[l_thread_id].z += lp_buffer2[l_thread_id + b].z  ;

			lp_buffer2[l_thread_id].w += lp_buffer2[l_thread_id + b].w  ;

		}

		b /= 2 ;

		__syncthreads() ;

	}

	if(threadIdx.x == 0 && threadIdx.y == 0) { 

		ap_dst1[blockIdx.x] = lp_buffer1[0] ;

		ap_dst2[blockIdx.x] = lp_buffer2[0] ;

	}

	__syncthreads() ;

}

__global__ void compute_cent_100(

	double4 *ap_pts1, 

	double4 *ap_pts2,

	int a_nelem_row) 

{

	

	int gid_x = blockIdx.x * blockDim.x + threadIdx.x ;

    int gid_y = blockIdx.y * blockDim.y + threadIdx.y ;

	int l_thread_id = threadIdx.x + threadIdx.y * blockDim.x ; 

	int l_gid = gid_x + gid_y * a_nelem_row ;

	__shared__ double4 lp_buffer1[128] ;

	__shared__ double4 lp_buffer2[128] ;

	

	// only 100 partial sums exist; the remaining threads contribute zeros
	if(l_gid < 100) {

		lp_buffer1[l_thread_id] = ap_pts1[l_gid] ;

		lp_buffer2[l_thread_id] = ap_pts2[l_gid] ;

	}

	else 

	{	

		lp_buffer1[l_thread_id] = make_double4(0,0,0,0) ; 

		lp_buffer2[l_thread_id] = make_double4(0,0,0,0) ;  

	}

	__syncthreads() ;

	int b = 128/2 ;

	while(b>0) {

		if(l_thread_id < b) { 

			lp_buffer1[l_thread_id].x += lp_buffer1[l_thread_id + b].x  ;

			lp_buffer1[l_thread_id].y += lp_buffer1[l_thread_id + b].y  ;

			lp_buffer1[l_thread_id].z += lp_buffer1[l_thread_id + b].z  ;

			lp_buffer1[l_thread_id].w += lp_buffer1[l_thread_id + b].w  ;

			lp_buffer2[l_thread_id].x += lp_buffer2[l_thread_id + b].x  ;

			lp_buffer2[l_thread_id].y += lp_buffer2[l_thread_id + b].y  ;

			lp_buffer2[l_thread_id].z += lp_buffer2[l_thread_id + b].z  ;

			lp_buffer2[l_thread_id].w += lp_buffer2[l_thread_id + b].w  ;

		}

		b /= 2 ;

		__syncthreads() ;

	}

	if(threadIdx.x == 0 && threadIdx.y == 0) { 

		ap_pts1[0] = lp_buffer1[0] ;

		ap_pts2[0] = lp_buffer2[0] ;

	}

}
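For context, here is a rough sketch of how the two kernels could be launched and how the mean is recovered from the double4 result. This is illustrative only: the grid/block shapes, buffer names, and the a_nelem_row value are assumptions (my actual host code is not shown), chosen to be consistent with the index arithmetic in the kernels above.

// Illustrative host-side launch (assumed configuration, not my exact host code).
const int NUM_PTS    = 160 * 160;          // 25,600 vectors
const int NUM_BLOCKS = NUM_PTS / 256;      // 100 partial sums

float3  *d_pts1, *d_pts2;                  // device input sets, assumed already filled
double4 *d_part1, *d_part2;                // per-block partial sums
cudaMalloc(&d_part1, NUM_BLOCKS * sizeof(double4));
cudaMalloc(&d_part2, NUM_BLOCKS * sizeof(double4));

// Stage 1: 100 blocks x 256 threads; each block writes one partial sum per set.
// (With a 1-D block the a_nelem_row stride does not affect the indexing.)
compute_cent_256<<<NUM_BLOCKS, 256>>>(d_pts1, d_pts2, 160, d_part1, d_part2);

// Stage 2: a single 128-thread block folds the 100 partial sums into element 0.
compute_cent_100<<<1, 128>>>(d_part1, d_part2, 128);

// The sum of the valid points is in .x/.y/.z and their count is in .w,
// so the mean vector is the component-wise quotient.
double4 h_sum1;
cudaMemcpy(&h_sum1, d_part1, sizeof(double4), cudaMemcpyDeviceToHost);
double3 mean1 = make_double3(h_sum1.x / h_sum1.w,
                             h_sum1.y / h_sum1.w,
                             h_sum1.z / h_sum1.w);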

The new kernels are borrowed from the NVIDIA SDK reduction example.

In this version the kernel works on a single set of points only.

It first reduces the input to 50 partial sums and then computes the final sum from them.

I expected this to be faster than the old version, but it is actually slower. (An illustrative launch for these templated kernels follows the code below.)

template<int blockSize>

__global__ void compute_cent_256ex2(

	float3 *ap_pts1, 

	double4 *ap_dst1, 

	int n_elem) 

{

	__shared__ double4 lp_buffer1[blockSize] ;

	int l_thread_id = threadIdx.x ; 

    unsigned int l_gid = blockIdx.x*(blockSize*2) + threadIdx.x;

	double4 mySum = make_double4(0, 0, 0, 0) ;   // threads past n_elem and (0,0,0) points contribute zero

	if(l_gid < n_elem) 

	{	

		float3 p = ap_pts1[l_gid] ;              // read the point once from global memory

		if(p.x != 0 || p.y != 0 || p.z != 0)     // skip empty points; count valid ones in .w

			mySum = make_double4(p.x, p.y, p.z, 1) ;

	}

	if(l_gid + blockSize < n_elem) 

	{

		float3 q = ap_pts1[l_gid + blockSize] ;  // each thread also folds in a second point

		if(q.x != 0 || q.y != 0 || q.z != 0) 

		{ 

			mySum.x += q.x ;

			mySum.y += q.y ;

			mySum.z += q.z ;

			mySum.w += 1 ;

		}

	}

	lp_buffer1[l_thread_id] = mySum ;

	__syncthreads() ;

	 // do reduction in shared mem

        if (blockSize >= 256) 

	{ 

		if (l_thread_id < 128) 

		{ 

			//sdata[l_thread_id] = mySum = mySum + sdata[tid + 128]; 

			mySum.x += lp_buffer1[l_thread_id + 128].x ;

			mySum.y += lp_buffer1[l_thread_id + 128].y ;

			mySum.z += lp_buffer1[l_thread_id + 128].z ;

			mySum.w += lp_buffer1[l_thread_id + 128].w ;

			lp_buffer1[l_thread_id] = mySum ;

		} 

		__syncthreads(); 

	}

        if (blockSize >= 128) 

	{ 

		if (l_thread_id <  64) 

		{ 

			mySum.x += lp_buffer1[l_thread_id + 64].x ;

			mySum.y += lp_buffer1[l_thread_id + 64].y ;

			mySum.z += lp_buffer1[l_thread_id + 64].z ;

			mySum.w += lp_buffer1[l_thread_id + 64].w ;

			lp_buffer1[l_thread_id] = mySum ;

		} 

		__syncthreads(); 

	}

if (l_thread_id < 32)

    {

        // now that we are using warp-synchronous programming (below)

        // we need to declare our shared memory volatile so that the compiler

        // doesn't reorder stores to it and induce incorrect behavior.

        volatile double4 *smem = lp_buffer1;

        if (blockSize >=  64) {

			mySum.x += smem[l_thread_id + 32].x;

			mySum.y += smem[l_thread_id + 32].y;

			mySum.z += smem[l_thread_id + 32].z;

			mySum.w += smem[l_thread_id + 32].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  32) { 

			mySum.x += smem[l_thread_id + 16].x;

			mySum.y += smem[l_thread_id + 16].y;

			mySum.z += smem[l_thread_id + 16].z;

			mySum.w += smem[l_thread_id + 16].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  16) {

			mySum.x += smem[l_thread_id + 8].x;

			mySum.y += smem[l_thread_id + 8].y;

			mySum.z += smem[l_thread_id + 8].z;

			mySum.w += smem[l_thread_id + 8].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   8) { 

			mySum.x += smem[l_thread_id + 4].x;

			mySum.y += smem[l_thread_id + 4].y;

			mySum.z += smem[l_thread_id + 4].z;

			mySum.w += smem[l_thread_id + 4].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   4) { 

			mySum.x += smem[l_thread_id + 2].x;

			mySum.y += smem[l_thread_id + 2].y;

			mySum.z += smem[l_thread_id + 2].z;

			mySum.w += smem[l_thread_id + 2].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   2) { 			

			mySum.x += smem[l_thread_id + 1].x;

			mySum.y += smem[l_thread_id + 1].y;

			mySum.z += smem[l_thread_id + 1].z;

			mySum.w += smem[l_thread_id + 1].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

    }

// write result for this block to global mem 

    if (l_thread_id == 0)

		ap_dst1[blockIdx.x] = lp_buffer1[0] ;

}

template<int blockSize>

__global__ void compute_cent_50ex2(

	double4 *ap_pts1, 

	double4 *ap_dst1, 

	int n_elem) 

{

	__shared__ double4 lp_buffer1[blockSize] ;

	int l_thread_id = threadIdx.x ; 

    unsigned int l_gid = threadIdx.x;

	double4 mySum ; 

	if(l_gid < n_elem) 

	{	

		mySum = ap_pts1[l_gid] ;

	}

	else 

		mySum = make_double4(0,0,0,0) ;

	lp_buffer1[l_thread_id].x = mySum.x ;

	lp_buffer1[l_thread_id].y = mySum.y ;

	lp_buffer1[l_thread_id].z = mySum.z ;

	lp_buffer1[l_thread_id].w = mySum.w ;

	__syncthreads() ; 

if (l_thread_id < 32)

    {

        // now that we are using warp-synchronous programming (below)

        // we need to declare our shared memory volatile so that the compiler

        // doesn't reorder stores to it and induce incorrect behavior.

        volatile double4 *smem = lp_buffer1;

        if (blockSize >=  64) {

			mySum.x += smem[l_thread_id + 32].x;

			mySum.y += smem[l_thread_id + 32].y;

			mySum.z += smem[l_thread_id + 32].z;

			mySum.w += smem[l_thread_id + 32].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  32) { 

			mySum.x += smem[l_thread_id + 16].x;

			mySum.y += smem[l_thread_id + 16].y;

			mySum.z += smem[l_thread_id + 16].z;

			mySum.w += smem[l_thread_id + 16].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  16) {

			//smem[l_thread_id] = mySum = mySum + smem[l_thread_id +  8]; 

			mySum.x += smem[l_thread_id + 8].x;

			mySum.y += smem[l_thread_id + 8].y;

			mySum.z += smem[l_thread_id + 8].z;

			mySum.w += smem[l_thread_id + 8].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   8) {

			mySum.x += smem[l_thread_id + 4].x;

			mySum.y += smem[l_thread_id + 4].y;

			mySum.z += smem[l_thread_id + 4].z;

			mySum.w += smem[l_thread_id + 4].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   4) { 

			mySum.x += smem[l_thread_id + 2].x;

			mySum.y += smem[l_thread_id + 2].y;

			mySum.z += smem[l_thread_id + 2].z;

			mySum.w += smem[l_thread_id + 2].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   2) { 

			mySum.x += smem[l_thread_id + 1].x;

			mySum.y += smem[l_thread_id + 1].y;

			mySum.z += smem[l_thread_id + 1].z;

			mySum.w += smem[l_thread_id + 1].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

    }

// write result for this block to global mem 

    if (l_thread_id == 0)

		ap_dst1[blockIdx.x] = lp_buffer1[0] ;

}
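And for the templated version, here is an illustrative launch consistent with the "50 partial sums" description above (again an assumption, not my exact host code): with blockSize = 256 each block sums two elements per thread, so 25,600 points give 50 blocks, and a single 64-thread block (which only exercises the warp-level part of compute_cent_50ex2) folds those 50 partial sums into the final result.

// Illustrative launch of the SDK-style kernels (assumed configuration).
const int NUM_PTS     = 160 * 160;               // 25,600 vectors
const int NUM_BLOCKS2 = NUM_PTS / (256 * 2);     // 50 partial sums

float3  *d_pts1;                                 // device input, assumed already filled
double4 *d_part, *d_final;
cudaMalloc(&d_part,  NUM_BLOCKS2 * sizeof(double4));
cudaMalloc(&d_final, sizeof(double4));

// Stage 1: 50 blocks x 256 threads, each thread loads two points.
compute_cent_256ex2<256><<<NUM_BLOCKS2, 256>>>(d_pts1, d_part, NUM_PTS);

// Stage 2: one 64-thread block reduces the 50 partial sums.
compute_cent_50ex2<64><<<1, 64>>>(d_part, d_final, NUM_BLOCKS2);

// As before, the mean is the sum in .x/.y/.z divided by the count in .w.
double4 h_sum;
cudaMemcpy(&h_sum, d_final, sizeof(double4), cudaMemcpyDeviceToHost);
double3 mean = make_double3(h_sum.x / h_sum.w,
                            h_sum.y / h_sum.w,
                            h_sum.z / h_sum.w);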