I’m a new hand in CUDA programming. i want to use GPU acceleration function to accelerate Dijkstra algorithm. But it gets slower after i use CUDA. I’m curious and want to know why.
this is my source code(no CUDA)
#include<iostream>
#include<sstream>
#include<string>
#include<fstream>
#include<ctime>
const int maprow=28;
const int mapcolumn=28;
const int points=maprow*mapcolumn;
int maxnumber=10000;
const int startpoint=1;
const int endpoint=783;
void readfile(int (*map)[mapcolumn])
{
std::ifstream read;
read.open("/home/wuhaoran/gridmap.txt");
for(int i=0;i<maprow;i++)
{
for(int j=0;j<mapcolumn;j++)
{
read>>map[i][j];
}
}
/*for(int i=0;i<maprow;i++)
{
for(int j=0;j<mapcolumn;j++)
{
std::cout<<map[i][j]<<" ";
}
std::cout<<std::endl;
}*/
}
void calCost(int (*map)[mapcolumn], int (*distance)[points])
{
for(int i=0;i<points;i++)
{
for(int j=0;j<points;j++)
{
if(j==i)
{ distance[i][j]=0;}
else if(j==i+mapcolumn||j==i-mapcolumn||j==i+1||j==i-1)
{
if(map[i/mapcolumn][i%mapcolumn]==1||map[j/mapcolumn][j%mapcolumn]==1)
distance[i][j]=maxnumber;
else
distance[i][j]=1;
}
else
{distance[i][j]=maxnumber;}
}
}
/*for(int i=0;i<points;i++)
{
for(int j=0;j<points;j++)
{
std::cout<<distance[i][j]<<" ";
}
std::cout<<std::endl;
}*/
}
int dijkstra(int startpoint, int endpoint, int (*distance)[points], int* prev)
{
for(int success=0;success<points;success++)
{
prev[success]=startpoint;
}
int dist[points]={}; //distance to start point
int minpoint=0; //min distance point to start point
int flag[points]={0}; //if point is scanned
for(int i=0;i<points;i++)
{
dist[i]=distance[startpoint][i];
}
for(int i=0;i<points;i++)
{
int min=100;
for(int j=0;j<points;j++)
{
if(flag[j]==0 && dist[j]<min && dist[j]!=0)
{
min=dist[j];
minpoint=j;
}
}
flag[minpoint]=1; //find min distance point
for(int m=0;m<points;m++)
{
int tmp=min+distance[m][minpoint];
if(flag[m]==0&&dist[m]>tmp)
{
dist[m]=tmp;
prev[m]=minpoint;
}
}
/*for(int n=0;n<points;n++)
{
std::cout<<"dist="<<dist[n]<<" ";
}
std::cout<<std::endl;*/
}
return(dist[endpoint]);
}
int main()
{
int prev[points]={}; //route, prev[i]=j means move from j to i
int map[maprow][mapcolumn]={};
int distance[points][points]={};
readfile(map);
calCost(map,distance);
int CostResult=0;
clock_t startTime,endTime;
startTime = clock();
CostResult=dijkstra(startpoint,endpoint,distance,prev);
endTime = clock();
std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
std::cout<<"Cost Result="<<CostResult<<std::endl;
int routecount=endpoint;
while(routecount!=startpoint)
{
std::cout<<"the route is come from "<<prev[routecount]<<" to "<<routecount<<std::endl;
routecount=prev[routecount];
}
}
This is my CUDA code:
#include<iostream>
#include<sstream>
#include<string>
#include<fstream>
#include<ctime>
const int maprow=28;
const int mapcolumn=28;
const int points=maprow*mapcolumn;
const int world=points*points;
const int maxnumber=10000;
const int startpoint=1;
const int endpoint=783;
void readfile(int *map)
{
std::ifstream read;
read.open("/home/wuhaoran/gridmap.txt");
for(int i=0;i<maprow;i++)
{
for(int j=0;j<mapcolumn;j++)
{
read>>map[i*maprow+j];
}
}
/*for(int i=0;i<maprow;i++)
{
for(int j=0;j<mapcolumn;j++)
{
std::cout<<map[i*maprow+j]<<" ";
}
}*/
}
__global__
void calDistance(int* dev_map, int* dev_distance)
{
int threadID= blockDim.x * blockIdx.x + threadIdx.x;
int pointA=threadID/points;
int pointB=threadID%points;//two points distance
if(pointA==pointB)
{dev_distance[threadID]=0;}
else if(pointB==pointA+mapcolumn||pointB==pointA-mapcolumn||pointB==pointA+1||pointB==pointA-1)
{
if(dev_map[pointB]==1||dev_map[pointA]==1)
dev_distance[threadID]=maxnumber;
else
dev_distance[threadID]=1;
}
else
{dev_distance[threadID]=maxnumber;}
}
__global__
void updateDist(int* dev_dist,int* dev_distance,int* dev_prev)
{
int threadID= blockDim.x * blockIdx.x + threadIdx.x;
for(int i=0;i<points;i++)
{
if(dev_dist[threadID%points]>(dev_dist[threadID/points]+dev_distance[threadID]))
{
dev_dist[threadID%points]=dev_dist[threadID/points]+dev_distance[threadID];
dev_prev[threadID%points]=threadID/points;
}
__syncthreads();
}
}
int main()
{
//clock_t startTime,endTime;
//startTime = clock();
int map[points]={0};
int distance[world]={0};
readfile(map);
//initialize the distance map
int* dev_distance, *dev_map;
cudaMalloc( (void**)&dev_distance, world*sizeof(int) );
cudaMalloc( (void**)&dev_map, points*sizeof(int) );
cudaMemcpy(dev_map,map,points*sizeof(int),cudaMemcpyHostToDevice );
calDistance<<<points,points>>>(dev_map, dev_distance);
cudaMemcpy(distance,dev_distance,world*sizeof(int),cudaMemcpyDeviceToHost);
/*int count1=0,count2=0,count3=0;
for(int i=world-points;i<world;i++)
{
std::cout<<"distance="<<distance[i]<<" ";
if(distance[i]==10000){count1++;}
if(distance[i]==1 && i==(world-2)){count2++;}
if(distance[i]==0 && i==(world-1)){count3++;}
}
if(count1==(points-2) && count2==1 &&count3==1)
{std::cout<<"Distance Success"<<std::endl;}
else
{std::cout<<"error:"<<count1<<" "<<count2<<" "<<count3<<" "<<std::endl;}*/
cudaFree(dev_map);
int prev[points]={};
for(int success=0;success<points;success++)
{
prev[success]=startpoint;
}
int dist[points]={}; //distance to start point
for(int i=0;i<points;i++)
{
dist[i]=distance[startpoint*points+i];
}
int*dev_dist, *dev_prev;
cudaMalloc( (void**)&dev_dist, points*sizeof(int) );
cudaMalloc( (void**)&dev_prev, points*sizeof(int) );
cudaMemcpy(dev_dist,dist,points*sizeof(int),cudaMemcpyHostToDevice );
cudaMemcpy(dev_prev,prev,points*sizeof(int),cudaMemcpyHostToDevice );
clock_t startTime,endTime;
startTime = clock();
updateDist<<<points,points>>>(dev_dist, dev_distance, dev_prev);
endTime = clock();
std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
cudaMemcpy(dist,dev_dist,points*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(prev,dev_prev,points*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(dev_prev);
cudaFree(dev_dist);
cudaFree(dev_distance);
int routecount=endpoint;
while(routecount!=startpoint)
{
std::cout<<"the route is come from "<<prev[routecount]<<" to "<<routecount<<std::endl;
routecount=prev[routecount];
}
//endTime = clock();
//std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
}
this is my gridmap.txt:
0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
The runtime of source code is about 0.01s, and the run time of CUDA code is about 0.1s.My computer has GTX960M, thirsting for your advice.