Serial vs Parallel Performance

Hello all,
I created a cuda code for clustering, but somehow my CUDA code runs slower than the serial code.

How is that possible after using such massive parallelism using CUDA.

My serial code is:

#include<iostream>
#include<cstdlib>
#include<stdlib.h>
#include<cmath>
using namespace std;

struct point
{
	double x;
	double y;
};

void Distance(point *d_p,double *d_c,double *d_dist,int n,int k)
{
for(int i=0;i<n;i++)
	{
	double tempx,tempy;
	tempx=d_p[i].x;
	tempy=d_p[i].y;

	for(int j=0;j<k;j++)
		{
		double dx=tempx-d_c[2*j];
		double dy=tempy-d_c[(2*j)+1];

		d_dist[(i*k)+j]=sqrt((dx*dx)+(dy*dy));
		//dist[i][j]=(dx*dx)+(dy*dy);
		}
	}

}

void ClusterAllocation(double *d_dist,double *d_mini, int *d_cnum,int n,int k)
{
for(int i=0;i<n;i++)
	{
	double min=9999999;
	int num;
	for(int j=0;j<k;j++)
	{
		if(d_dist[(i*k)+j]<min)
		{
			min=d_dist[(i*k)+j];
			num=j+1;
		}
	}
	d_mini[i]=min;
	d_cnum[i]=num;
	}

}
void ClusterFormation(point *d_p,int *d_cnum, point *d_c1, point *d_c2, point *d_c3, int *dc_counts,int n,int k)
{
int num,i1,i2,i3;
	for(int i=0;i<n;i++)
	{
	num=d_cnum[i];
	switch(num)
	{
		case 1:
			i1=dc_counts[0];
			d_c1[i1]=d_p[i];
			dc_counts[0]++;
		break;

		case 2:
			i2=dc_counts[1];
			d_c2[i2]=d_p[i];
			dc_counts[1]++;
		break;

		case 3:
			i3=dc_counts[2];
			d_c3[i3]=d_p[i];
			dc_counts[2]++;
		break;
	}
	}	
}

void ClusterUpdate(point *d_c1,point *d_c2,point *d_c3,int *dc_counts,double *d_c,int n,int k)
{
for(int i=0;i<k;i++)
	{
	switch(i+1)
	{
	case 1:
	if(dc_counts[0]!=0)
	{
	double tempx=0,tempy=0;
	for(int j=0;j<dc_counts[0];j++)
		{
			point temp1=d_c1[j];
			tempx+=temp1.x;
			tempy+=temp1.y;
		}
	d_c[2*i]=tempx/dc_counts[0];
	d_c[(2*i)+1]=tempy/dc_counts[0];
	}
	break;

	case 2:
	if(dc_counts[1]!=0)
	{
	double tempx=0,tempy=0;
	for(int j=0;j<dc_counts[1];j++)
		{
			point temp1=d_c2[j];
			tempx+=temp1.x;
			tempy+=temp1.y;
		}
	d_c[2*i]=tempx/dc_counts[1];
	d_c[(2*i)+1]=tempy/dc_counts[1];
	}
	break;
	case 3:
	if(dc_counts[2]!=0)
	{
	double tempx=0,tempy=0;
	for(int j=0;j<dc_counts[2];j++)
		{
			point temp1=d_c3[j];
			tempx+=temp1.x;
			tempy+=temp1.y;
		}
	d_c[2*i]=tempx/dc_counts[2];
	d_c[(2*i)+1]=tempy/dc_counts[2];
	}
	break;	
	}
	}
}

bool MovementDetection(double *c_old, double *c,int k)
{
for(int i=0;i<k*2;i++)
{
double difference=c[i]-c_old[i];
if(abs(difference) >= 0.4)
	return true;			//Movement Detected
}
return false;				//No movement
}

int main()
{

	int n=10000;
	
	double *x,*y;
	point *p,*d_p;

	int k;
	cout<<"\nHow many clusters you want:";
	//cin>>k;
	k=3;
	cout<<k;

	double *c,*d_c,*c_old;
	double *dist,*d_dist;
	double *mini,*d_mini;
	int *cnum,*d_cnum;

	size_t size=sizeof(double)*n;
	size_t csize=sizeof(double)*k*2;
	size_t dsize=sizeof(double)*n*k;

	//Allocating memory to host variables
	p=(point*)malloc(2*size);		//Points
	x=(double*)malloc(size);		//x-coordinate
	y=(double*)malloc(size);		//y-coordinate
	c=(double*)malloc(csize);		//Cluster Heads
	c_old=(double*)malloc(csize);	//Old Cluster Heads
	dist=(double*)malloc(dsize);	//Distance array
	mini=(double*)malloc(size);		//Minimum distance array
	cnum=(int*)malloc(size);		//Cluster number array

	//Initializing x and y
	for(int i=0;i<n;i++)
		x[i]=y[i]=i+1;

	//Printing x and y
	cout<<"\n\nX:\t";
	for(int i=0;i<n;i++)
		cout<<"\t"<<x[i];

	cout<<"\n\nY:\t";
	for(int i=0;i<n;i++)
		cout<<"\t"<<y[i];

	//Initializing points
	for(int i=0;i<n;i++)
	{
		p[i].x=x[i];
		p[i].y=y[i];
	}

	//Printing points
	cout<<"\n\nPoint X:";
	for(int i=0;i<n;i++)
		cout<<"\t"<<p[i].x;

	cout<<"\n\nPoint Y:";
	for(int i=0;i<n;i++)
		cout<<"\t"<<p[i].y;

	//Initializing cluster heads
	for(int i=0;i<k*2;i++)
		c[i]=rand()%n;

	//Printing Cluster Heads
	cout<<"\n\nCluster Heads:\n";
	for(int i=0;i<k*2;i++)
	{
	cout<<"\t"<<c[i];
	if(i%2==1)
		cout<<endl;
	else
		cout<<"\t";
	}

int iter=0;
do
{
	//Function Call Distance(point *d_p,double *d_c,double *d_dist,int k,int n)
	Distance(p,c,dist,n,k);
	
	/*
	cout<<"\nModified Distance array:\n";
	int j=0;
		for(int i=0;i<n*k;i++)
		{
		cout<<dist[i]<<"\t\t";
		if(i%k==2)
		{
			cout<<"\t"<<j<<endl;
			j++;
		}
		}

	*/
	//Function Call ClusterAllocation(double *d_dist,int k,double *d_mini, int *d_cnum,int n,int k)
	ClusterAllocation(dist,mini,cnum,n,k);
	
	/*//Printing minimum distance of each point
	cout<<"\nMinimum Distances:";
	for(int i=0;i<n;i++)
		cout<<"\n"<<(float)mini[i]<<"\t"<<cnum[i];
	*/

	point *C1,*C2,*C3;
	C1=(point*)malloc(n*sizeof(point));
	C2=(point*)malloc(n*sizeof(point));
	C3=(point*)malloc(n*sizeof(point));
	int count[k];

	count[0]=count[1]=count[2]=0;

	//Function Call ClusterFormation(point *d_p,int *d_cnum, T *d_c1, T *d_c2, T *d_c3, int *dc_counts,int n,int k)
	ClusterFormation(p,cnum,C1,C2,C3,count,n,k);

	
	//Displaying Clusters
	cout<<"\n\nCluster 1:";
	for(int i=0;i<count[0];i++)
		cout<<"\n"<<C1[i].x<<"\t"<<C1[i].y;

	cout<<"\n\nCluster 2:";
	for(int i=0;i<count[1];i++)
		cout<<"\n"<<C2[i].x<<"\t"<<C2[i].y;

	cout<<"\n\nCluster 3:";
	for(int i=0;i<count[2];i++)
		cout<<"\n"<<C3[i].x<<"\t"<<C3[i].y;

	cout<<"\n\nCounts Of clusters:";
	for(int i=0;i<k;i++)
		cout<<"\t"<<count[i];

	int total=count[0]+count[1]+count[2];
	cout<<"\t\tTotal :"<<total;

	//Retaining old cluster heads
	for(int i=0;i<k*2;i++)
		c_old[i]=c[i];

	//Function Call ClusterUpdate(point *d_c1,point *d_c2,point *d_c3,int *dc_counts,double *d_c,int n,int k)
	ClusterUpdate(C1,C2,C3,count,c,n,k);

	//Printing Old Cluster Heads
	cout<<"\n\nOld Cluster Heads:\n";
	for(int i=0;i<k*2;i++)
	{
	cout<<"\t"<<c_old[i];
	if(i%2==1)
		cout<<endl;
	else
		cout<<"\t";
	}

	//Printing New Cluster Heads
		cout<<"\nNew Cluster Heads:\n";
		for(int i=0;i<k*2;i++)
		{
		cout<<"\t"<<c[i];
		if(i%2==1)
			cout<<endl;
		else
			cout<<"\t";
		}
		cout<<endl;
iter++;

}while(MovementDetection(c_old,c,k));	

	cout<<"\n\nNumber of iterations:"<<iter;
	cout<<"\nAlgorithm Stopped.";
	cout<<endl;
	return 0;

		return 0;
}

My CUDA code is:

/*
 This is a code for Parallel K-Means Algorithm on CUDA
 Assumption : The number of clusters are already known and cannot change at runtime.
 */

#include<iostream>
#include<cstdlib>
#include<stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
using namespace std;

__device__ int count1;		//Variable that would reside in CUDA global memory.
__device__ int count2;
__device__ int count3;


struct point
{
	double x;
	double y;
};

__global__ void Distance(point *d_p,double *d_c,double *d_dist,int k)
{
	int id=blockIdx.x * blockDim.x + threadIdx.x;
	int idy=threadIdx.y;

	double tempx,tempy;
	tempx=d_p[id].x;
	tempy=d_p[id].y;

	double dx=tempx-d_c[2*idy];
	double dy=tempy-d_c[(2*idy)+1];

	d_dist[(id*k)+idy]=sqrt((dx*dx)+(dy*dy));
}
__global__ void ClusterAllocation(double *d_dist,int k,double *d_mini, int *d_cnum)
{
	int id=blockIdx.x * blockDim.x + threadIdx.x;
	double min=9999999;
	int num;

	for(int j=0;j<k;j++)
	{
	if(d_dist[(id*k)+j]<min)
		{
		min=d_dist[(id*k)+j];
		num=j+1;
		}
	}
	d_mini[id]=min;
	d_cnum[id]=num;
}

template <typename T> __global__ void ClusterFormation(point *d_p,int *d_cnum, int k, T *d_c1, T *d_c2, T *d_c3, int *dc_counts)
{
int id=blockIdx.x * blockDim.x + threadIdx.x;

if(id==0)
{
	count1=count2=count3=0;
	dc_counts[0]=dc_counts[1]=dc_counts[2]=0;
}
int *c1_val=&count1, *c2_val=&count2, *c3_val=&count3;

__syncthreads();
switch(d_cnum[id])
{
case 1:
	int i1=atomicAdd(c1_val,1);
	d_c1[i1]=d_p[id];		
	break;

case 2:
	int i2=atomicAdd(c2_val,1);
	d_c2[i2]=d_p[id];
	break;

case 3:
	int i3=atomicAdd(c3_val,1);
	d_c3[i3]=d_p[id];
	break;
}
}

template <typename T> __global__ void ClusterUpdate(T *d_c1,T *d_c2,T *d_c3,int *dc_counts,double *d_c)
{
	int id=blockIdx.x * blockDim.x + threadIdx.x;
	switch(id+1)
	{
	case 1:
		if(dc_counts[id]!=0)
		{
		double tempx1=0,tempy1=0;
		for(int i1=0;i1<dc_counts[id];i1++)
		{
			point temp1=d_c1[i1];
			tempx1+=temp1.x;
			tempy1+=temp1.y;
		}
		d_c[2*id]=tempx1/dc_counts[id];
		d_c[(2*id)+1]=tempy1/dc_counts[id];
		}
		break;

	case 2:
		if(dc_counts[id]!=0)
		{
		double tempx2=0,tempy2=0;
		for(int i2=0;i2<dc_counts[id];i2++)
		{
			point temp2=d_c2[i2];
			tempx2+=temp2.x;
			tempy2+=temp2.y;
		}
		d_c[2*id]=tempx2/dc_counts[id];
		d_c[(2*id)+1]=tempy2/dc_counts[id];
		}
		break;

	case 3:
		if(dc_counts[id]!=0)
		{
		double tempx3=0,tempy3=0;
		for(int i3=0;i3<dc_counts[id];i3++)
		{
			point temp3=d_c3[i3];
			tempx3+=temp3.x;
			tempy3+=temp3.y;
		}
		d_c[2*id]=tempx3/dc_counts[id];
		d_c[(2*id)+1]=tempy3/dc_counts[id];
		}
		break;
	}
}

bool MovementDetection(double *c_old, double *c,int k)
{
for(int i=0;i<k*2;i++)
{
double difference=c[i]-c_old[i];
if(abs(difference) >= 0.4)
	return true;			//Movement Detected
}
return false;				//No movement
}
int main()
{

	int n=10000;
	int tnum=250;
	int option;

	double *x,*y;
	point *p,*d_p;

	int k;
	cout<<"\nHow many clusters you want:";
	//cin>>k;
	k=3;
	cout<<k;

	double *c,*d_c,*c_old;
	double *dist,*d_dist;
	double *mini,*d_mini;
	int *cnum,*d_cnum;

	size_t size=sizeof(double)*n;
	size_t csize=sizeof(double)*k*2;
	size_t dsize=sizeof(double)*n*k;

	//Allocating memory to host variables
	p=(point*)malloc(2*size);		//Points
	x=(double*)malloc(size);		//x-coordinate
	y=(double*)malloc(size);		//y-coordinate
	c=(double*)malloc(csize);		//Cluster Heads
	c_old=(double*)malloc(csize);	//Old Cluster Heads
	dist=(double*)malloc(dsize);	//Distance array
	mini=(double*)malloc(size);		//Minimum distance array
	cnum=(int*)malloc(size);		//Cluster number array

	//Allocating memory to device variables
	cudaMalloc(&d_p,2*size);		//Points
	cudaMalloc(&d_c,csize);			//Cluster Heads
	cudaMalloc(&d_dist,dsize);		//Distance array
	cudaMalloc(&d_mini,size);		//Minimum Distance array
	cudaMalloc(&d_cnum,size);		//Cluster number array


	//Initializing x and y
	for(int i=0;i<n;i++)
		x[i]=y[i]=i+1;

	//Printing x and y
	cout<<"\n\nX:\t";
	for(int i=0;i<n;i++)
		cout<<"\t"<<x[i];

	cout<<"\n\nY:\t";
	for(int i=0;i<n;i++)
		cout<<"\t"<<y[i];

	//Initializing points
	for(int i=0;i<n;i++)
	{
		p[i].x=x[i];
		p[i].y=y[i];
	}

	//Printing points
	cout<<"\n\nPoint X:";
	for(int i=0;i<n;i++)
		cout<<"\t"<<p[i].x;

	cout<<"\n\nPoint Y:";
	for(int i=0;i<n;i++)
		cout<<"\t"<<p[i].y;

	//Initializing Clusters
	thrust::host_vector<point> C1(n);
	thrust::host_vector<point> C2(n);
	thrust::host_vector<point> C3(n);

	thrust::device_vector<point> D1 = C1;
	thrust::device_vector<point> D2 = C2;
	thrust::device_vector<point> D3 = C3;

	point zero;
	zero.x=zero.y=0;

	//Allocating memory to device clusters
	point *d_c1=thrust::raw_pointer_cast(D1.data());
	point *d_c2=thrust::raw_pointer_cast(D2.data());
	point *d_c3=thrust::raw_pointer_cast(D3.data());

	//Initializing cluster heads
	for(int i=0;i<k*2;i++)
		c[i]=rand()%n;

	//Printing Cluster Heads
	cout<<"\n\nCluster Heads:\n";
	for(int i=0;i<k*2;i++)
	{
	cout<<"\t"<<c[i];
	if(i%2==1)
		cout<<endl;
	else
		cout<<"\t";
	}

	int total;
	int iter=0;

	do
	{
	//Cuda Memory transfer for first kernal
	cudaMemcpy(d_p,p,2*size,cudaMemcpyHostToDevice);
	cudaMemcpy(d_c,c,csize,cudaMemcpyHostToDevice);
	cudaMemcpy(d_dist,dist,dsize,cudaMemcpyHostToDevice);

	Distance<<< n/tnum , dim3(tnum,k,1) >>>(d_p,d_c,d_dist,k);		//<<<Number of blocks , Number of threads per block>>>

	//cudaMemcpy(p,d_p,size,cudaMemcpyDeviceToHost);
	cudaMemcpy(c,d_c,csize,cudaMemcpyDeviceToHost);
	cudaMemcpy(dist,d_dist,dsize,cudaMemcpyDeviceToHost);
	/*
	cout<<"\nModified Distance array:\n";
	int j=0;
		for(int i=0;i<n*k;i++)
		{
		cout<<dist[i]<<"\t\t";
		if(i%k==2)
		{
			cout<<"\t"<<j<<endl;
			j++;
		}
		}
	*/
	//cin>>option;

	//Cuda Memory transfer for second kernal
	cudaMemcpy(d_dist,dist,dsize,cudaMemcpyHostToDevice);
	cudaMemcpy(d_mini,mini,size,cudaMemcpyHostToDevice);
	cudaMemcpy(d_cnum,cnum,size,cudaMemcpyHostToDevice);

	ClusterAllocation<<<n/tnum,tnum>>>(d_dist,k,d_mini,d_cnum);

	cudaMemcpy(dist,d_dist,dsize,cudaMemcpyDeviceToHost);
	cudaMemcpy(mini,d_mini,size,cudaMemcpyDeviceToHost);
	cudaMemcpy(cnum,d_cnum,size,cudaMemcpyDeviceToHost);

	//Printing minimum distance of each point
	/*	
	cout<<"\nMinimum Distances:";
	for(int i=0;i<n;i++)
		cout<<"\n"<<(float)mini[i]<<"\t"<<cnum[i];
	*/
	//Printing cluster numbers
	/*
	cout<<"\nCluster Numbers:";
	for(int i=0;i<n;i++)
		cout<<"\t"<<cnum[i];
	*/

	//cin>>option;

	thrust::fill(D1.begin(), D1.end(), zero);
	thrust::fill(D2.begin(), D2.end(), zero);
	thrust::fill(D3.begin(), D3.end(), zero);

	//Initializing array of counts
	int *c_counts,*dc_counts;
	c_counts=(int *)malloc(k*sizeof(int));
	cudaMalloc(&dc_counts,k*sizeof(int));

	//Cuda Memory transfer for third kernal
	cudaMemcpy(d_cnum,cnum,size,cudaMemcpyHostToDevice);
	cudaMemcpy(dc_counts,c_counts,k*sizeof(int),cudaMemcpyHostToDevice);

	ClusterFormation<<<n/tnum , tnum>>>(d_p, d_cnum,k,d_c1, d_c2, d_c3,dc_counts);

	cudaMemcpy(cnum,d_cnum,size,cudaMemcpyDeviceToHost);
	cudaMemcpy(c_counts,dc_counts,k*sizeof(int),cudaMemcpyDeviceToHost);


	c_counts[0]=c_counts[1]=c_counts[2]=0;
	
	
	//Printing actual clusters
	cout<<"\n\nCluster 1:";
	for(int i=0;i<n;i++)
		{
		if(static_cast<point>(D1[i]).x != 0 && static_cast<point>(D1[i]).y != 0)
		{
			cout<<"\n"<<static_cast<point>(D1[i]).x<<"\t"<<static_cast<point>(D1[i]).y;
			c_counts[0]++;
		}
		}

	cout<<"\n\nCluster 2:";
	for(int i=0;i<n;i++)
		{
		if(static_cast<point>(D2[i]).x != 0 && static_cast<point>(D2[i]).y != 0)
			{
			cout<<"\n"<<static_cast<point>(D2[i]).x<<"\t"<<static_cast<point>(D2[i]).y;
			c_counts[1]++;
			}
		}

	cout<<"\n\nCluster 3:";
	for(int i=0;i<n;i++)
		{
		if(static_cast<point>(D3[i]).x != 0 && static_cast<point>(D3[i]).y != 0)
			{
			cout<<"\n"<<static_cast<point>(D3[i]).x<<"\t"<<static_cast<point>(D3[i]).y;
			c_counts[2]++;
			}
		}

	
	cout<<"\n\nCounts Of clusters:";
	for(int i=0;i<k;i++)
		cout<<"\t"<<c_counts[i];

	total=c_counts[0]+c_counts[1]+c_counts[2];
	cout<<"\t\tTotal :"<<total;

	//Retaining old cluster heads
	for(int i=0;i<k*2;i++)
		c_old[i]=c[i];

	//Cuda Memory transfer for fourth kernal
	cudaMemcpy(dc_counts,c_counts,k*sizeof(int),cudaMemcpyHostToDevice);
	cudaMemcpy(d_c,c,csize,cudaMemcpyHostToDevice);

	ClusterUpdate<<<k,1>>>(d_c1,d_c2,d_c3,dc_counts,d_c);

	cudaMemcpy(c_counts,dc_counts,k*sizeof(int),cudaMemcpyDeviceToHost);
	cudaMemcpy(c,d_c,csize,cudaMemcpyDeviceToHost);


	//Printing Old Cluster Heads
	cout<<"\n\nOld Cluster Heads:\n";
	for(int i=0;i<k*2;i++)
	{
	cout<<"\t"<<c_old[i];
	if(i%2==1)
		cout<<endl;
	else
		cout<<"\t";
	}

	//Printing New Cluster Heads
		cout<<"\nNew Cluster Heads:\n";
		for(int i=0;i<k*2;i++)
		{
		cout<<"\t"<<c[i];
		if(i%2==1)
			cout<<endl;
		else
			cout<<"\t";
		}
		cout<<endl;

		free(c_counts);
		cudaFree(dc_counts);

		iter++;


//	cin>>option;
//	}while(option==1);


}while(MovementDetection(c_old,c,k));

	cout<<"\n\nNumber of iterations:"<<iter;
	cout<<"\nAlgorithm Stopped.";

	free(x);
	free(y);
	free(p);
	cudaFree(d_p);
	free(c);
	cudaFree(d_c);
	free(dist);
	cudaFree(d_dist);
	free(mini);
	cudaFree(d_mini);

	return 0;
}

I am calculating the execution time using time ./a.out on ubuntu.
Please help.

That may not be a very good method. A system that I use regularly has 6-7 seconds of CUDA startup overhead.

What actual times do you measure in each case?

In case of my serial code, the time was:

real	0m3.346s
user	0m0.148s
sys	0m0.361s

In case of parallel code, the time was :

real	0m8.963s
user	0m5.741s
sys	0m1.712s

Is there any other way by which I can calculate the time just to be sure ?

Does the CUDA code work properly? I get warnings about uninitialized variables (see below). Are you using a release build to compile the CUDA code? The posted code seems to generate significant I/O spew, I would suggest timing without the I/O. I tried to run the program under control of nvprof (the CUDA profiler) but it’s been running for more than six minutes and it hasn’t terminated yet.

cluster.cu(309): warning: variable "option" was declared but never referenced

cluster.cu(137): warning: transfer of control bypasses initialization of:
            variable "i1"
(143): here
            variable "i2"
(153): here
          detected during instantiation of "void ClusterFormation(point *, int *, int, T *, T *, T *, int *) [with T=point]"
(645): here

cluster.cu(137): warning: transfer of control bypasses initialization of:
            variable "i1"
(143): here
            variable "i2"
(153): here

cluster.cu(309): warning: variable "option" 
was declared but never referenced

The warnings are because i used switch case block inside the kernal function.
And Yes, the program runs successfully in my case but takes more time than that of its serial counterpart.

Your logs above show a run time of 8 seconds for the GPU version, while I killed the executable (release build) after six minutes. That seemed to suggest to me that the uninitialized data may cause indeterminate results and thus run times.

I would suggest drilling down into the performance details of the GPU code with the CUDA profiler. What GPU are you using? The code seems to be using a bunch of double-precision computation, which would be slow on consumer cards: the double-precision throughput of your CPU may be higher than the double-precision throughput of your GPU.

Another factor would seem to be memory operations. I can’t tell exactly how large the working set of this application is, but at first glance it would appear to fit into the last-level cache of higher-end CPUs, meaning the code hits a sweet spot for the CPU. I would suggest applying appropriate “const” and “restrict” modifiers to all pointer arguments in the GPU code to enable all memory optimizations that are independent of access pattern.