My reduction code is not as fast as I expected

Hello.

I wrote kernels to compute the mean of a set of 3D vectors.

At first I used a very simple reduction, but it was not as fast as I expected.

So I wrote another one, borrowed from the NVIDIA GPU Computing SDK reduction sample, which is reported to be much faster than a simple reduction.

However, the execution time of the new one is actually a bit slower than the first one on my machine…

Can anybody give me a comment on how to make the mean vector computation faster?

The number of vectors is fixed at 25,600 (160 x 160).

I’m using a GTX 580 on Windows 7 Enterprise.

My code is below:

Simple reduction kernels:

It consists of two kernels: one reduces the input to 100 partial sums (compute_cent_256),

and the other sums those 100 partial sums (compute_cent_100).

I use the double4 vector type so that the w component can carry the number of meaningful vectors,

which means points equal to (0,0,0) are not used in the mean vector computation.

I have two sets of points, and their mean vectors are computed simultaneously. (A sketch of how the two kernels are launched and how the mean is recovered from the double4 result follows the second kernel below.)

__global__ void compute_cent_256(

	float3 *ap_pts1, 

	float3 *ap_pts2,

	int a_nelem_row,

	double4 *ap_dst1, 

	double4 *ap_dst2) 

{

	

	int gid_x = blockIdx.x * blockDim.x + threadIdx.x ;

    int gid_y = blockIdx.y * blockDim.y + threadIdx.y ;

	int l_thread_id = threadIdx.x + threadIdx.y * blockDim.x ; 

	int l_gid = gid_x + gid_y * a_nelem_row ;

	__shared__ double4 lp_buffer1[256] ;

	__shared__ double4 lp_buffer2[256] ;

	

	// load one point from each set into shared memory as a double4 (w will become the valid-point count)
	lp_buffer1[l_thread_id] = make_double4(ap_pts1[l_gid].x, ap_pts1[l_gid].y, ap_pts1[l_gid].z, 0) ;

	lp_buffer2[l_thread_id] = make_double4(ap_pts2[l_gid].x, ap_pts2[l_gid].y, ap_pts2[l_gid].z, 0) ;

	__syncthreads() ;

	// a point that is (0,0,0) in either set is excluded from both sums
	if((lp_buffer1[l_thread_id].x == 0 && lp_buffer1[l_thread_id].y == 0 && lp_buffer1[l_thread_id].z == 0) || 

	   (lp_buffer2[l_thread_id].x == 0 && lp_buffer2[l_thread_id].y == 0 && lp_buffer2[l_thread_id].z == 0)

	  )

	{

		lp_buffer1[l_thread_id] = lp_buffer2[l_thread_id] = make_double4(0,0,0,0) ;

	}

	else 

	{

		lp_buffer1[l_thread_id].w = 1 ;

		lp_buffer2[l_thread_id].w = 1 ;

	}

	__syncthreads() ;

	// tree reduction in shared memory: each step folds the upper half onto the lower half
	int b = 256/2 ;

	while(b>0) {

		if(l_thread_id < b) { 

			lp_buffer1[l_thread_id].x += lp_buffer1[l_thread_id + b].x  ;

			lp_buffer1[l_thread_id].y += lp_buffer1[l_thread_id + b].y  ;

			lp_buffer1[l_thread_id].z += lp_buffer1[l_thread_id + b].z  ;

			lp_buffer1[l_thread_id].w += lp_buffer1[l_thread_id + b].w  ;

			lp_buffer2[l_thread_id].x += lp_buffer2[l_thread_id + b].x  ;

			lp_buffer2[l_thread_id].y += lp_buffer2[l_thread_id + b].y  ;

			lp_buffer2[l_thread_id].z += lp_buffer2[l_thread_id + b].z  ;

			lp_buffer2[l_thread_id].w += lp_buffer2[l_thread_id + b].w  ;

		}

		b /= 2 ;

		__syncthreads() ;

	}

	if(threadIdx.x == 0 && threadIdx.y == 0) { 

		ap_dst1[blockIdx.x] = lp_buffer1[0] ;

		ap_dst2[blockIdx.x] = lp_buffer2[0] ;

	}

	__syncthreads() ;

}

__global__ void compute_cent_100(

	double4 *ap_pts1, 

	double4 *ap_pts2,

	int a_nelem_row) 

{

	

	int gid_x = blockIdx.x * blockDim.x + threadIdx.x ;

    int gid_y = blockIdx.y * blockDim.y + threadIdx.y ;

	int l_thread_id = threadIdx.x + threadIdx.y * blockDim.x ; 

	int l_gid = gid_x + gid_y * a_nelem_row ;

	__shared__ double4 lp_buffer1[128] ;

	__shared__ double4 lp_buffer2[128] ;

	

	// only 100 partial sums exist; the remaining threads contribute zeros
	if(l_gid < 100) {

		lp_buffer1[l_thread_id] = ap_pts1[l_gid] ;

		lp_buffer2[l_thread_id] = ap_pts2[l_gid] ;

	}

	else 

	{	

		lp_buffer1[l_thread_id] = make_double4(0,0,0,0) ; 

		lp_buffer2[l_thread_id] = make_double4(0,0,0,0) ;  

	}

	__syncthreads() ;

	int b = 128/2 ;

	while(b>0) {

		if(l_thread_id < b) { 

			lp_buffer1[l_thread_id].x += lp_buffer1[l_thread_id + b].x  ;

			lp_buffer1[l_thread_id].y += lp_buffer1[l_thread_id + b].y  ;

			lp_buffer1[l_thread_id].z += lp_buffer1[l_thread_id + b].z  ;

			lp_buffer1[l_thread_id].w += lp_buffer1[l_thread_id + b].w  ;

			lp_buffer2[l_thread_id].x += lp_buffer2[l_thread_id + b].x  ;

			lp_buffer2[l_thread_id].y += lp_buffer2[l_thread_id + b].y  ;

			lp_buffer2[l_thread_id].z += lp_buffer2[l_thread_id + b].z  ;

			lp_buffer2[l_thread_id].w += lp_buffer2[l_thread_id + b].w  ;

		}

		b /= 2 ;

		__syncthreads() ;

	}

	if(threadIdx.x == 0 && threadIdx.y == 0) { 

		ap_pts1[0] = lp_buffer1[0] ;

		ap_pts2[0] = lp_buffer2[0] ;

	}

}
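For context, here is a rough sketch of how the two kernels could be launched and how the mean is recovered from the double4 result. This is illustrative only: the grid/block shapes, buffer names, and the a_nelem_row value are assumptions (my actual host code is not shown), chosen to be consistent with the index arithmetic in the kernels above.

// Illustrative host-side launch (assumed configuration, not my exact host code).
const int NUM_PTS    = 160 * 160;          // 25,600 vectors
const int NUM_BLOCKS = NUM_PTS / 256;      // 100 partial sums

float3  *d_pts1, *d_pts2;                  // device input sets, assumed already filled
double4 *d_part1, *d_part2;                // per-block partial sums
cudaMalloc(&d_part1, NUM_BLOCKS * sizeof(double4));
cudaMalloc(&d_part2, NUM_BLOCKS * sizeof(double4));

// Stage 1: 100 blocks x 256 threads; each block writes one partial sum per set.
// (With a 1-D block the a_nelem_row stride does not affect the indexing.)
compute_cent_256<<<NUM_BLOCKS, 256>>>(d_pts1, d_pts2, 160, d_part1, d_part2);

// Stage 2: a single 128-thread block folds the 100 partial sums into element 0.
compute_cent_100<<<1, 128>>>(d_part1, d_part2, 128);

// The sum of the valid points is in .x/.y/.z and their count is in .w,
// so the mean vector is the component-wise quotient.
double4 h_sum1;
cudaMemcpy(&h_sum1, d_part1, sizeof(double4), cudaMemcpyDeviceToHost);
double3 mean1 = make_double3(h_sum1.x / h_sum1.w,
                             h_sum1.y / h_sum1.w,
                             h_sum1.z / h_sum1.w);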

The new kernels are borrowed from the NVIDIA SDK reduction example.

In this version the kernel works on a single set of points only.

It first reduces the input to 50 partial sums and then computes the final sum from them.

I expected this to be faster than the old version, but it is actually slower. (An illustrative launch for these templated kernels follows the code below.)

template<int blockSize>

__global__ void compute_cent_256ex2(

	float3 *ap_pts1, 

	double4 *ap_dst1, 

	int n_elem) 

{

	__shared__ double4 lp_buffer1[blockSize] ;

	int l_thread_id = threadIdx.x ; 

    unsigned int l_gid = blockIdx.x*(blockSize*2) + threadIdx.x;

	double4 mySum = make_double4(0, 0, 0, 0) ;   // threads past n_elem and (0,0,0) points contribute zero

	if(l_gid < n_elem) 

	{	

		float3 p = ap_pts1[l_gid] ;              // read the point once from global memory

		if(p.x != 0 || p.y != 0 || p.z != 0)     // skip empty points; count valid ones in .w

			mySum = make_double4(p.x, p.y, p.z, 1) ;

	}

	if(l_gid + blockSize < n_elem) 

	{

		float3 q = ap_pts1[l_gid + blockSize] ;  // each thread also folds in a second point

		if(q.x != 0 || q.y != 0 || q.z != 0) 

		{ 

			mySum.x += q.x ;

			mySum.y += q.y ;

			mySum.z += q.z ;

			mySum.w += 1 ;

		}

	}

	lp_buffer1[l_thread_id] = mySum ;

	__syncthreads() ;

	 // do reduction in shared mem

        if (blockSize >= 256) 

	{ 

		if (l_thread_id < 128) 

		{ 

			//sdata[l_thread_id] = mySum = mySum + sdata[tid + 128]; 

			mySum.x += lp_buffer1[l_thread_id + 128].x ;

			mySum.y += lp_buffer1[l_thread_id + 128].y ;

			mySum.z += lp_buffer1[l_thread_id + 128].z ;

			mySum.w += lp_buffer1[l_thread_id + 128].w ;

			lp_buffer1[l_thread_id] = mySum ;

		} 

		__syncthreads(); 

	}

        if (blockSize >= 128) 

	{ 

		if (l_thread_id <  64) 

		{ 

			mySum.x += lp_buffer1[l_thread_id + 64].x ;

			mySum.y += lp_buffer1[l_thread_id + 64].y ;

			mySum.z += lp_buffer1[l_thread_id + 64].z ;

			mySum.w += lp_buffer1[l_thread_id + 64].w ;

			lp_buffer1[l_thread_id] = mySum ;

		} 

		__syncthreads(); 

	}

if (l_thread_id < 32)

    {

        // now that we are using warp-synchronous programming (below)

        // we need to declare our shared memory volatile so that the compiler

        // doesn't reorder stores to it and induce incorrect behavior.

        volatile double4 *smem = lp_buffer1;

        if (blockSize >=  64) {

			mySum.x += smem[l_thread_id + 32].x;

			mySum.y += smem[l_thread_id + 32].y;

			mySum.z += smem[l_thread_id + 32].z;

			mySum.w += smem[l_thread_id + 32].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  32) { 

			mySum.x += smem[l_thread_id + 16].x;

			mySum.y += smem[l_thread_id + 16].y;

			mySum.z += smem[l_thread_id + 16].z;

			mySum.w += smem[l_thread_id + 16].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  16) {

			mySum.x += smem[l_thread_id + 8].x;

			mySum.y += smem[l_thread_id + 8].y;

			mySum.z += smem[l_thread_id + 8].z;

			mySum.w += smem[l_thread_id + 8].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   8) { 

			mySum.x += smem[l_thread_id + 4].x;

			mySum.y += smem[l_thread_id + 4].y;

			mySum.z += smem[l_thread_id + 4].z;

			mySum.w += smem[l_thread_id + 4].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   4) { 

			mySum.x += smem[l_thread_id + 2].x;

			mySum.y += smem[l_thread_id + 2].y;

			mySum.z += smem[l_thread_id + 2].z;

			mySum.w += smem[l_thread_id + 2].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   2) { 			

			mySum.x += smem[l_thread_id + 1].x;

			mySum.y += smem[l_thread_id + 1].y;

			mySum.z += smem[l_thread_id + 1].z;

			mySum.w += smem[l_thread_id + 1].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

    }

// write result for this block to global mem 

    if (l_thread_id == 0)

		ap_dst1[blockIdx.x] = lp_buffer1[0] ;

}

template<int blockSize>

__global__ void compute_cent_50ex2(

	double4 *ap_pts1, 

	double4 *ap_dst1, 

	int n_elem) 

{

	__shared__ double4 lp_buffer1[blockSize] ;

	int l_thread_id = threadIdx.x ; 

    unsigned int l_gid = threadIdx.x;

	double4 mySum ; 

	if(l_gid < n_elem) 

	{	

		mySum = ap_pts1[l_gid] ;

	}

	else 

		mySum = make_double4(0,0,0,0) ;

	lp_buffer1[l_thread_id].x = mySum.x ;

	lp_buffer1[l_thread_id].y = mySum.y ;

	lp_buffer1[l_thread_id].z = mySum.z ;

	lp_buffer1[l_thread_id].w = mySum.w ;

	__syncthreads() ; 

if (l_thread_id < 32)

    {

        // now that we are using warp-synchronous programming (below)

        // we need to declare our shared memory volatile so that the compiler

        // doesn't reorder stores to it and induce incorrect behavior.

        volatile double4 *smem = lp_buffer1;

        if (blockSize >=  64) {

			mySum.x += smem[l_thread_id + 32].x;

			mySum.y += smem[l_thread_id + 32].y;

			mySum.z += smem[l_thread_id + 32].z;

			mySum.w += smem[l_thread_id + 32].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  32) { 

			mySum.x += smem[l_thread_id + 16].x;

			mySum.y += smem[l_thread_id + 16].y;

			mySum.z += smem[l_thread_id + 16].z;

			mySum.w += smem[l_thread_id + 16].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=  16) {

			//smem[l_thread_id] = mySum = mySum + smem[l_thread_id +  8]; 

			mySum.x += smem[l_thread_id + 8].x;

			mySum.y += smem[l_thread_id + 8].y;

			mySum.z += smem[l_thread_id + 8].z;

			mySum.w += smem[l_thread_id + 8].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   8) {

			mySum.x += smem[l_thread_id + 4].x;

			mySum.y += smem[l_thread_id + 4].y;

			mySum.z += smem[l_thread_id + 4].z;

			mySum.w += smem[l_thread_id + 4].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   4) { 

			mySum.x += smem[l_thread_id + 2].x;

			mySum.y += smem[l_thread_id + 2].y;

			mySum.z += smem[l_thread_id + 2].z;

			mySum.w += smem[l_thread_id + 2].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

        if (blockSize >=   2) { 

			mySum.x += smem[l_thread_id + 1].x;

			mySum.y += smem[l_thread_id + 1].y;

			mySum.z += smem[l_thread_id + 1].z;

			mySum.w += smem[l_thread_id + 1].w ;

			

			smem[l_thread_id].x = mySum.x ; 

			smem[l_thread_id].y = mySum.y ; 

			smem[l_thread_id].z = mySum.z ; 

			smem[l_thread_id].w = mySum.w ; 

			__syncthreads(); 

		}

    }

// write result for this block to global mem 

    if (l_thread_id == 0)

		ap_dst1[blockIdx.x] = lp_buffer1[0] ;

}
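And for the templated version, here is an illustrative launch consistent with the "50 partial sums" description above (again an assumption, not my exact host code): with blockSize = 256 each block sums two elements per thread, so 25,600 points give 50 blocks, and a single 64-thread block (which only exercises the warp-level part of compute_cent_50ex2) folds those 50 partial sums into the final result.

// Illustrative launch of the SDK-style kernels (assumed configuration).
const int NUM_PTS     = 160 * 160;               // 25,600 vectors
const int NUM_BLOCKS2 = NUM_PTS / (256 * 2);     // 50 partial sums

float3  *d_pts1;                                 // device input, assumed already filled
double4 *d_part, *d_final;
cudaMalloc(&d_part,  NUM_BLOCKS2 * sizeof(double4));
cudaMalloc(&d_final, sizeof(double4));

// Stage 1: 50 blocks x 256 threads, each thread loads two points.
compute_cent_256ex2<256><<<NUM_BLOCKS2, 256>>>(d_pts1, d_part, NUM_PTS);

// Stage 2: one 64-thread block reduces the 50 partial sums.
compute_cent_50ex2<64><<<1, 64>>>(d_part, d_final, NUM_BLOCKS2);

// As before, the mean is the sum in .x/.y/.z divided by the count in .w.
double4 h_sum;
cudaMemcpy(&h_sum, d_final, sizeof(double4), cudaMemcpyDeviceToHost);
double3 mean = make_double3(h_sum.x / h_sum.w,
                            h_sum.y / h_sum.w,
                            h_sum.z / h_sum.w);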