Hello all,
I created a cuda code for clustering, but somehow my CUDA code runs slower than the serial code.
How is that possible after using such massive parallelism using CUDA.
My serial code is:
#include<iostream>
#include<cstdlib>
#include<stdlib.h>
#include<cmath>
using namespace std;
struct point
{
double x;
double y;
};
void Distance(point *d_p,double *d_c,double *d_dist,int n,int k)
{
for(int i=0;i<n;i++)
{
double tempx,tempy;
tempx=d_p[i].x;
tempy=d_p[i].y;
for(int j=0;j<k;j++)
{
double dx=tempx-d_c[2*j];
double dy=tempy-d_c[(2*j)+1];
d_dist[(i*k)+j]=sqrt((dx*dx)+(dy*dy));
//dist[i][j]=(dx*dx)+(dy*dy);
}
}
}
void ClusterAllocation(double *d_dist,double *d_mini, int *d_cnum,int n,int k)
{
for(int i=0;i<n;i++)
{
double min=9999999;
int num;
for(int j=0;j<k;j++)
{
if(d_dist[(i*k)+j]<min)
{
min=d_dist[(i*k)+j];
num=j+1;
}
}
d_mini[i]=min;
d_cnum[i]=num;
}
}
void ClusterFormation(point *d_p,int *d_cnum, point *d_c1, point *d_c2, point *d_c3, int *dc_counts,int n,int k)
{
int num,i1,i2,i3;
for(int i=0;i<n;i++)
{
num=d_cnum[i];
switch(num)
{
case 1:
i1=dc_counts[0];
d_c1[i1]=d_p[i];
dc_counts[0]++;
break;
case 2:
i2=dc_counts[1];
d_c2[i2]=d_p[i];
dc_counts[1]++;
break;
case 3:
i3=dc_counts[2];
d_c3[i3]=d_p[i];
dc_counts[2]++;
break;
}
}
}
void ClusterUpdate(point *d_c1,point *d_c2,point *d_c3,int *dc_counts,double *d_c,int n,int k)
{
for(int i=0;i<k;i++)
{
switch(i+1)
{
case 1:
if(dc_counts[0]!=0)
{
double tempx=0,tempy=0;
for(int j=0;j<dc_counts[0];j++)
{
point temp1=d_c1[j];
tempx+=temp1.x;
tempy+=temp1.y;
}
d_c[2*i]=tempx/dc_counts[0];
d_c[(2*i)+1]=tempy/dc_counts[0];
}
break;
case 2:
if(dc_counts[1]!=0)
{
double tempx=0,tempy=0;
for(int j=0;j<dc_counts[1];j++)
{
point temp1=d_c2[j];
tempx+=temp1.x;
tempy+=temp1.y;
}
d_c[2*i]=tempx/dc_counts[1];
d_c[(2*i)+1]=tempy/dc_counts[1];
}
break;
case 3:
if(dc_counts[2]!=0)
{
double tempx=0,tempy=0;
for(int j=0;j<dc_counts[2];j++)
{
point temp1=d_c3[j];
tempx+=temp1.x;
tempy+=temp1.y;
}
d_c[2*i]=tempx/dc_counts[2];
d_c[(2*i)+1]=tempy/dc_counts[2];
}
break;
}
}
}
bool MovementDetection(double *c_old, double *c,int k)
{
for(int i=0;i<k*2;i++)
{
double difference=c[i]-c_old[i];
if(abs(difference) >= 0.4)
return true; //Movement Detected
}
return false; //No movement
}
int main()
{
int n=10000;
double *x,*y;
point *p,*d_p;
int k;
cout<<"\nHow many clusters you want:";
//cin>>k;
k=3;
cout<<k;
double *c,*d_c,*c_old;
double *dist,*d_dist;
double *mini,*d_mini;
int *cnum,*d_cnum;
size_t size=sizeof(double)*n;
size_t csize=sizeof(double)*k*2;
size_t dsize=sizeof(double)*n*k;
//Allocating memory to host variables
p=(point*)malloc(2*size); //Points
x=(double*)malloc(size); //x-coordinate
y=(double*)malloc(size); //y-coordinate
c=(double*)malloc(csize); //Cluster Heads
c_old=(double*)malloc(csize); //Old Cluster Heads
dist=(double*)malloc(dsize); //Distance array
mini=(double*)malloc(size); //Minimum distance array
cnum=(int*)malloc(size); //Cluster number array
//Initializing x and y
for(int i=0;i<n;i++)
x[i]=y[i]=i+1;
//Printing x and y
cout<<"\n\nX:\t";
for(int i=0;i<n;i++)
cout<<"\t"<<x[i];
cout<<"\n\nY:\t";
for(int i=0;i<n;i++)
cout<<"\t"<<y[i];
//Initializing points
for(int i=0;i<n;i++)
{
p[i].x=x[i];
p[i].y=y[i];
}
//Printing points
cout<<"\n\nPoint X:";
for(int i=0;i<n;i++)
cout<<"\t"<<p[i].x;
cout<<"\n\nPoint Y:";
for(int i=0;i<n;i++)
cout<<"\t"<<p[i].y;
//Initializing cluster heads
for(int i=0;i<k*2;i++)
c[i]=rand()%n;
//Printing Cluster Heads
cout<<"\n\nCluster Heads:\n";
for(int i=0;i<k*2;i++)
{
cout<<"\t"<<c[i];
if(i%2==1)
cout<<endl;
else
cout<<"\t";
}
int iter=0;
do
{
//Function Call Distance(point *d_p,double *d_c,double *d_dist,int k,int n)
Distance(p,c,dist,n,k);
/*
cout<<"\nModified Distance array:\n";
int j=0;
for(int i=0;i<n*k;i++)
{
cout<<dist[i]<<"\t\t";
if(i%k==2)
{
cout<<"\t"<<j<<endl;
j++;
}
}
*/
//Function Call ClusterAllocation(double *d_dist,int k,double *d_mini, int *d_cnum,int n,int k)
ClusterAllocation(dist,mini,cnum,n,k);
/*//Printing minimum distance of each point
cout<<"\nMinimum Distances:";
for(int i=0;i<n;i++)
cout<<"\n"<<(float)mini[i]<<"\t"<<cnum[i];
*/
point *C1,*C2,*C3;
C1=(point*)malloc(n*sizeof(point));
C2=(point*)malloc(n*sizeof(point));
C3=(point*)malloc(n*sizeof(point));
int count[k];
count[0]=count[1]=count[2]=0;
//Function Call ClusterFormation(point *d_p,int *d_cnum, T *d_c1, T *d_c2, T *d_c3, int *dc_counts,int n,int k)
ClusterFormation(p,cnum,C1,C2,C3,count,n,k);
//Displaying Clusters
cout<<"\n\nCluster 1:";
for(int i=0;i<count[0];i++)
cout<<"\n"<<C1[i].x<<"\t"<<C1[i].y;
cout<<"\n\nCluster 2:";
for(int i=0;i<count[1];i++)
cout<<"\n"<<C2[i].x<<"\t"<<C2[i].y;
cout<<"\n\nCluster 3:";
for(int i=0;i<count[2];i++)
cout<<"\n"<<C3[i].x<<"\t"<<C3[i].y;
cout<<"\n\nCounts Of clusters:";
for(int i=0;i<k;i++)
cout<<"\t"<<count[i];
int total=count[0]+count[1]+count[2];
cout<<"\t\tTotal :"<<total;
//Retaining old cluster heads
for(int i=0;i<k*2;i++)
c_old[i]=c[i];
//Function Call ClusterUpdate(point *d_c1,point *d_c2,point *d_c3,int *dc_counts,double *d_c,int n,int k)
ClusterUpdate(C1,C2,C3,count,c,n,k);
//Printing Old Cluster Heads
cout<<"\n\nOld Cluster Heads:\n";
for(int i=0;i<k*2;i++)
{
cout<<"\t"<<c_old[i];
if(i%2==1)
cout<<endl;
else
cout<<"\t";
}
//Printing New Cluster Heads
cout<<"\nNew Cluster Heads:\n";
for(int i=0;i<k*2;i++)
{
cout<<"\t"<<c[i];
if(i%2==1)
cout<<endl;
else
cout<<"\t";
}
cout<<endl;
iter++;
}while(MovementDetection(c_old,c,k));
cout<<"\n\nNumber of iterations:"<<iter;
cout<<"\nAlgorithm Stopped.";
cout<<endl;
return 0;
return 0;
}
My CUDA code is:
/*
This is a code for Parallel K-Means Algorithm on CUDA
Assumption : The number of clusters are already known and cannot change at runtime.
*/
#include<iostream>
#include<cstdlib>
#include<stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
using namespace std;
__device__ int count1; //Variable that would reside in CUDA global memory.
__device__ int count2;
__device__ int count3;
struct point
{
double x;
double y;
};
__global__ void Distance(point *d_p,double *d_c,double *d_dist,int k)
{
int id=blockIdx.x * blockDim.x + threadIdx.x;
int idy=threadIdx.y;
double tempx,tempy;
tempx=d_p[id].x;
tempy=d_p[id].y;
double dx=tempx-d_c[2*idy];
double dy=tempy-d_c[(2*idy)+1];
d_dist[(id*k)+idy]=sqrt((dx*dx)+(dy*dy));
}
__global__ void ClusterAllocation(double *d_dist,int k,double *d_mini, int *d_cnum)
{
int id=blockIdx.x * blockDim.x + threadIdx.x;
double min=9999999;
int num;
for(int j=0;j<k;j++)
{
if(d_dist[(id*k)+j]<min)
{
min=d_dist[(id*k)+j];
num=j+1;
}
}
d_mini[id]=min;
d_cnum[id]=num;
}
template <typename T> __global__ void ClusterFormation(point *d_p,int *d_cnum, int k, T *d_c1, T *d_c2, T *d_c3, int *dc_counts)
{
int id=blockIdx.x * blockDim.x + threadIdx.x;
if(id==0)
{
count1=count2=count3=0;
dc_counts[0]=dc_counts[1]=dc_counts[2]=0;
}
int *c1_val=&count1, *c2_val=&count2, *c3_val=&count3;
__syncthreads();
switch(d_cnum[id])
{
case 1:
int i1=atomicAdd(c1_val,1);
d_c1[i1]=d_p[id];
break;
case 2:
int i2=atomicAdd(c2_val,1);
d_c2[i2]=d_p[id];
break;
case 3:
int i3=atomicAdd(c3_val,1);
d_c3[i3]=d_p[id];
break;
}
}
template <typename T> __global__ void ClusterUpdate(T *d_c1,T *d_c2,T *d_c3,int *dc_counts,double *d_c)
{
int id=blockIdx.x * blockDim.x + threadIdx.x;
switch(id+1)
{
case 1:
if(dc_counts[id]!=0)
{
double tempx1=0,tempy1=0;
for(int i1=0;i1<dc_counts[id];i1++)
{
point temp1=d_c1[i1];
tempx1+=temp1.x;
tempy1+=temp1.y;
}
d_c[2*id]=tempx1/dc_counts[id];
d_c[(2*id)+1]=tempy1/dc_counts[id];
}
break;
case 2:
if(dc_counts[id]!=0)
{
double tempx2=0,tempy2=0;
for(int i2=0;i2<dc_counts[id];i2++)
{
point temp2=d_c2[i2];
tempx2+=temp2.x;
tempy2+=temp2.y;
}
d_c[2*id]=tempx2/dc_counts[id];
d_c[(2*id)+1]=tempy2/dc_counts[id];
}
break;
case 3:
if(dc_counts[id]!=0)
{
double tempx3=0,tempy3=0;
for(int i3=0;i3<dc_counts[id];i3++)
{
point temp3=d_c3[i3];
tempx3+=temp3.x;
tempy3+=temp3.y;
}
d_c[2*id]=tempx3/dc_counts[id];
d_c[(2*id)+1]=tempy3/dc_counts[id];
}
break;
}
}
bool MovementDetection(double *c_old, double *c,int k)
{
for(int i=0;i<k*2;i++)
{
double difference=c[i]-c_old[i];
if(abs(difference) >= 0.4)
return true; //Movement Detected
}
return false; //No movement
}
int main()
{
int n=10000;
int tnum=250;
int option;
double *x,*y;
point *p,*d_p;
int k;
cout<<"\nHow many clusters you want:";
//cin>>k;
k=3;
cout<<k;
double *c,*d_c,*c_old;
double *dist,*d_dist;
double *mini,*d_mini;
int *cnum,*d_cnum;
size_t size=sizeof(double)*n;
size_t csize=sizeof(double)*k*2;
size_t dsize=sizeof(double)*n*k;
//Allocating memory to host variables
p=(point*)malloc(2*size); //Points
x=(double*)malloc(size); //x-coordinate
y=(double*)malloc(size); //y-coordinate
c=(double*)malloc(csize); //Cluster Heads
c_old=(double*)malloc(csize); //Old Cluster Heads
dist=(double*)malloc(dsize); //Distance array
mini=(double*)malloc(size); //Minimum distance array
cnum=(int*)malloc(size); //Cluster number array
//Allocating memory to device variables
cudaMalloc(&d_p,2*size); //Points
cudaMalloc(&d_c,csize); //Cluster Heads
cudaMalloc(&d_dist,dsize); //Distance array
cudaMalloc(&d_mini,size); //Minimum Distance array
cudaMalloc(&d_cnum,size); //Cluster number array
//Initializing x and y
for(int i=0;i<n;i++)
x[i]=y[i]=i+1;
//Printing x and y
cout<<"\n\nX:\t";
for(int i=0;i<n;i++)
cout<<"\t"<<x[i];
cout<<"\n\nY:\t";
for(int i=0;i<n;i++)
cout<<"\t"<<y[i];
//Initializing points
for(int i=0;i<n;i++)
{
p[i].x=x[i];
p[i].y=y[i];
}
//Printing points
cout<<"\n\nPoint X:";
for(int i=0;i<n;i++)
cout<<"\t"<<p[i].x;
cout<<"\n\nPoint Y:";
for(int i=0;i<n;i++)
cout<<"\t"<<p[i].y;
//Initializing Clusters
thrust::host_vector<point> C1(n);
thrust::host_vector<point> C2(n);
thrust::host_vector<point> C3(n);
thrust::device_vector<point> D1 = C1;
thrust::device_vector<point> D2 = C2;
thrust::device_vector<point> D3 = C3;
point zero;
zero.x=zero.y=0;
//Allocating memory to device clusters
point *d_c1=thrust::raw_pointer_cast(D1.data());
point *d_c2=thrust::raw_pointer_cast(D2.data());
point *d_c3=thrust::raw_pointer_cast(D3.data());
//Initializing cluster heads
for(int i=0;i<k*2;i++)
c[i]=rand()%n;
//Printing Cluster Heads
cout<<"\n\nCluster Heads:\n";
for(int i=0;i<k*2;i++)
{
cout<<"\t"<<c[i];
if(i%2==1)
cout<<endl;
else
cout<<"\t";
}
int total;
int iter=0;
do
{
//Cuda Memory transfer for first kernal
cudaMemcpy(d_p,p,2*size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,c,csize,cudaMemcpyHostToDevice);
cudaMemcpy(d_dist,dist,dsize,cudaMemcpyHostToDevice);
Distance<<< n/tnum , dim3(tnum,k,1) >>>(d_p,d_c,d_dist,k); //<<<Number of blocks , Number of threads per block>>>
//cudaMemcpy(p,d_p,size,cudaMemcpyDeviceToHost);
cudaMemcpy(c,d_c,csize,cudaMemcpyDeviceToHost);
cudaMemcpy(dist,d_dist,dsize,cudaMemcpyDeviceToHost);
/*
cout<<"\nModified Distance array:\n";
int j=0;
for(int i=0;i<n*k;i++)
{
cout<<dist[i]<<"\t\t";
if(i%k==2)
{
cout<<"\t"<<j<<endl;
j++;
}
}
*/
//cin>>option;
//Cuda Memory transfer for second kernal
cudaMemcpy(d_dist,dist,dsize,cudaMemcpyHostToDevice);
cudaMemcpy(d_mini,mini,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_cnum,cnum,size,cudaMemcpyHostToDevice);
ClusterAllocation<<<n/tnum,tnum>>>(d_dist,k,d_mini,d_cnum);
cudaMemcpy(dist,d_dist,dsize,cudaMemcpyDeviceToHost);
cudaMemcpy(mini,d_mini,size,cudaMemcpyDeviceToHost);
cudaMemcpy(cnum,d_cnum,size,cudaMemcpyDeviceToHost);
//Printing minimum distance of each point
/*
cout<<"\nMinimum Distances:";
for(int i=0;i<n;i++)
cout<<"\n"<<(float)mini[i]<<"\t"<<cnum[i];
*/
//Printing cluster numbers
/*
cout<<"\nCluster Numbers:";
for(int i=0;i<n;i++)
cout<<"\t"<<cnum[i];
*/
//cin>>option;
thrust::fill(D1.begin(), D1.end(), zero);
thrust::fill(D2.begin(), D2.end(), zero);
thrust::fill(D3.begin(), D3.end(), zero);
//Initializing array of counts
int *c_counts,*dc_counts;
c_counts=(int *)malloc(k*sizeof(int));
cudaMalloc(&dc_counts,k*sizeof(int));
//Cuda Memory transfer for third kernal
cudaMemcpy(d_cnum,cnum,size,cudaMemcpyHostToDevice);
cudaMemcpy(dc_counts,c_counts,k*sizeof(int),cudaMemcpyHostToDevice);
ClusterFormation<<<n/tnum , tnum>>>(d_p, d_cnum,k,d_c1, d_c2, d_c3,dc_counts);
cudaMemcpy(cnum,d_cnum,size,cudaMemcpyDeviceToHost);
cudaMemcpy(c_counts,dc_counts,k*sizeof(int),cudaMemcpyDeviceToHost);
c_counts[0]=c_counts[1]=c_counts[2]=0;
//Printing actual clusters
cout<<"\n\nCluster 1:";
for(int i=0;i<n;i++)
{
if(static_cast<point>(D1[i]).x != 0 && static_cast<point>(D1[i]).y != 0)
{
cout<<"\n"<<static_cast<point>(D1[i]).x<<"\t"<<static_cast<point>(D1[i]).y;
c_counts[0]++;
}
}
cout<<"\n\nCluster 2:";
for(int i=0;i<n;i++)
{
if(static_cast<point>(D2[i]).x != 0 && static_cast<point>(D2[i]).y != 0)
{
cout<<"\n"<<static_cast<point>(D2[i]).x<<"\t"<<static_cast<point>(D2[i]).y;
c_counts[1]++;
}
}
cout<<"\n\nCluster 3:";
for(int i=0;i<n;i++)
{
if(static_cast<point>(D3[i]).x != 0 && static_cast<point>(D3[i]).y != 0)
{
cout<<"\n"<<static_cast<point>(D3[i]).x<<"\t"<<static_cast<point>(D3[i]).y;
c_counts[2]++;
}
}
cout<<"\n\nCounts Of clusters:";
for(int i=0;i<k;i++)
cout<<"\t"<<c_counts[i];
total=c_counts[0]+c_counts[1]+c_counts[2];
cout<<"\t\tTotal :"<<total;
//Retaining old cluster heads
for(int i=0;i<k*2;i++)
c_old[i]=c[i];
//Cuda Memory transfer for fourth kernal
cudaMemcpy(dc_counts,c_counts,k*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_c,c,csize,cudaMemcpyHostToDevice);
ClusterUpdate<<<k,1>>>(d_c1,d_c2,d_c3,dc_counts,d_c);
cudaMemcpy(c_counts,dc_counts,k*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(c,d_c,csize,cudaMemcpyDeviceToHost);
//Printing Old Cluster Heads
cout<<"\n\nOld Cluster Heads:\n";
for(int i=0;i<k*2;i++)
{
cout<<"\t"<<c_old[i];
if(i%2==1)
cout<<endl;
else
cout<<"\t";
}
//Printing New Cluster Heads
cout<<"\nNew Cluster Heads:\n";
for(int i=0;i<k*2;i++)
{
cout<<"\t"<<c[i];
if(i%2==1)
cout<<endl;
else
cout<<"\t";
}
cout<<endl;
free(c_counts);
cudaFree(dc_counts);
iter++;
// cin>>option;
// }while(option==1);
}while(MovementDetection(c_old,c,k));
cout<<"\n\nNumber of iterations:"<<iter;
cout<<"\nAlgorithm Stopped.";
free(x);
free(y);
free(p);
cudaFree(d_p);
free(c);
cudaFree(d_c);
free(dist);
cudaFree(d_dist);
free(mini);
cudaFree(d_mini);
return 0;
}
I am calculating the execution time using time ./a.out on ubuntu.
Please help.