When I use CUDA, my program gets slower

I’m new to CUDA programming. I want to use GPU acceleration to speed up Dijkstra’s algorithm, but the program gets slower after I port it to CUDA. I’m curious and want to know why.

This is my source code (without CUDA):

#include<iostream>
#include<sstream>
#include<string>
#include<fstream>
#include<ctime>
const int  maprow=28;               // grid rows
const int mapcolumn=28;             // grid columns
const int points=maprow*mapcolumn;  // number of cells = graph vertices
int maxnumber=10000;                // "infinity" sentinel for non-adjacent / blocked edges
const int startpoint=1;             // source vertex (row-major cell index)
const int endpoint=783;             // target vertex (last cell of the 28x28 grid)

// Reads a maprow x mapcolumn grid of whitespace-separated 0/1 values from a
// fixed path into `map` (1 = obstacle, 0 = free, as consumed by calCost).
// BUG FIX: the original never checked that the file opened, so a missing file
// silently left `map` unread.
// NOTE(review): the path is hard-coded; consider passing it as a parameter.
void readfile(int (*map)[mapcolumn])
{
  std::ifstream read;
  read.open("/home/wuhaoran/gridmap.txt");
  if(!read.is_open())
  {
    std::cerr<<"readfile: cannot open /home/wuhaoran/gridmap.txt"<<std::endl;
    return; // caller zero-initializes map, so it stays all-free on failure
  }
  for(int i=0;i<maprow;i++)
  {
	for(int j=0;j<mapcolumn;j++)
	  {
		  read>>map[i][j]; // on malformed input the stream goes bad and stops updating
	  }
  }
}

// Builds the dense points x points edge-cost matrix from the obstacle grid:
// distance[i][j] = 0 on the diagonal, 1 between free 4-neighbours, and
// maxnumber ("infinity") otherwise or when either endpoint is an obstacle.
void calCost(int (*map)[mapcolumn], int (*distance)[points])
{
	for(int i=0;i<points;i++)
	 {
	   for(int j=0;j<points;j++)
		  {
			const bool vertical = (j==i+mapcolumn) || (j==i-mapcolumn);
			// BUG FIX: horizontal neighbours must be in the SAME row; the
			// original j==i+1 / j==i-1 test wrongly connected the last cell
			// of one row to the first cell of the next (wrap-around edge).
			const bool horizontal = (j==i+1 || j==i-1)
			                        && (j/mapcolumn==i/mapcolumn);
			if(j==i)
			{	distance[i][j]=0;}
			else if(vertical || horizontal)
			{
				// an edge touching an obstacle cell is impassable
				if(map[i/mapcolumn][i%mapcolumn]==1||map[j/mapcolumn][j%mapcolumn]==1)
					distance[i][j]=maxnumber;
				else
					distance[i][j]=1;
			}
			else
			{distance[i][j]=maxnumber;}
		  }
	 }
}
// Dijkstra over the dense cost matrix `distance` (points x points).
// Returns the cost from startpoint to endpoint and fills prev[] so that
// prev[i] is the predecessor of i on a shortest path from startpoint.
int dijkstra(int startpoint, int endpoint, int (*distance)[points], int* prev)
{
	int dist[points];    // best known distance from startpoint
	int flag[points]={0};// 1 once a vertex's distance is finalized
	for(int i=0;i<points;i++)
	{
		prev[i]=startpoint;
		dist[i]=distance[startpoint][i];
	}
	// BUG FIX: the source was never marked visited in the original, so it
	// could be re-selected; mark it finalized up front.
	flag[startpoint]=1;
	dist[startpoint]=0;
	for(int iter=0;iter<points-1;iter++)
	{
		// BUG FIX: the original used `int min=100`, which is below the
		// maxnumber (10000) edge cost; once every unvisited vertex was
		// "far", no vertex matched, and the STALE minpoint from the
		// previous round was flagged and relaxed again.
		int min=maxnumber+1;
		int minpoint=-1;
		for(int j=0;j<points;j++)
		{
			if(flag[j]==0 && dist[j]<min)
			{
				min=dist[j];
				minpoint=j;
			}
		}
		if(minpoint<0)
			break; // all remaining vertices are unreachable
		flag[minpoint]=1;
		for(int m=0;m<points;m++)
		{
			// relax edge minpoint -> m (cost matrix is symmetric here)
			int tmp=min+distance[m][minpoint];
			if(flag[m]==0 && dist[m]>tmp)
			{
				dist[m]=tmp;
				prev[m]=minpoint;
			}
		}
	}
	return(dist[endpoint]);
}
// Driver: load the grid, build the cost matrix, run Dijkstra, print the
// elapsed time, the cost, and the route walked back via prev[].
int main()
{
   // static: distance is points*points ints (~2.4 MB) -- too large to risk
   // on the stack; map and prev kept static for consistency.
   static int prev[points]={}; //route, prev[i]=j means move from j to i
   static int map[maprow][mapcolumn]={};
   static int distance[points][points]={};
   readfile(map);
   calCost(map,distance);
   clock_t startTime,endTime;
   startTime = clock();
   int CostResult=dijkstra(startpoint,endpoint,distance,prev);
   endTime = clock();
   std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
   std::cout<<"Cost Result="<<CostResult<<std::endl;
   // Walk predecessors back from endpoint; bounded so a malformed prev[]
   // chain (e.g. endpoint unreachable) cannot loop forever.
   int routecount=endpoint;
   for(int steps=0; routecount!=startpoint && steps<points; steps++)
   {
   std::cout<<"the route is come from "<<prev[routecount]<<" to "<<routecount<<std::endl;
   routecount=prev[routecount];
   }
}

This is my CUDA code:

#include<iostream>
#include<sstream>
#include<string>
#include<fstream>
#include<ctime>
const int maprow=28;                // grid rows
const int mapcolumn=28;             // grid columns
const int points=maprow*mapcolumn;  // number of cells = graph vertices
const int world=points*points;      // size of the dense cost matrix
const int maxnumber=10000;          // "infinity" sentinel for non-adjacent / blocked edges
const int startpoint=1;             // source vertex (row-major cell index)
const int endpoint=783;             // target vertex (last cell of the 28x28 grid)

// Reads a maprow x mapcolumn grid of whitespace-separated 0/1 values from a
// fixed path into the flat row-major array `map` (1 = obstacle, 0 = free).
// BUG FIX: the original indexed map[i*maprow+j]; the row stride of a
// row-major grid is the COLUMN count. It worked only because the grid is
// square (28x28). Also added a file-open check.
void readfile(int *map)
{
 std::ifstream read;
 read.open("/home/wuhaoran/gridmap.txt");
 if(!read.is_open())
 {
   std::cerr<<"readfile: cannot open /home/wuhaoran/gridmap.txt"<<std::endl;
   return; // caller zero-initializes map, so it stays all-free on failure
 }
  for(int i=0;i<maprow;i++)
  {
    for(int j=0;j<mapcolumn;j++)
    {
      read>>map[i*mapcolumn+j];
    }
  }
}

// One thread per (pointA, pointB) pair fills one entry of the dense cost
// matrix: 0 on the diagonal, 1 between free 4-neighbours, maxnumber
// otherwise. Expected launch: at least world = points*points threads total.
__global__
void calDistance(int* dev_map, int* dev_distance)
{
 int threadID= blockDim.x * blockIdx.x + threadIdx.x;
 if(threadID>=world) return; // guard the grid tail
 int pointA=threadID/points;
 int pointB=threadID%points;//two points distance
 bool vertical = (pointB==pointA+mapcolumn) || (pointB==pointA-mapcolumn);
 // BUG FIX: horizontal neighbours must be in the SAME row; the original
 // pointB==pointA+1 / -1 test wrongly connected the last cell of one row to
 // the first cell of the next (wrap-around edge).
 bool horizontal = (pointB==pointA+1 || pointB==pointA-1)
                   && (pointB/mapcolumn==pointA/mapcolumn);
   if(pointA==pointB)
    {dev_distance[threadID]=0;}
   else if(vertical||horizontal)
    {
	  // an edge touching an obstacle cell is impassable
	  if(dev_map[pointB]==1||dev_map[pointA]==1)
	    dev_distance[threadID]=maxnumber;
	  else
	    dev_distance[threadID]=1;
    }
   else
    {dev_distance[threadID]=maxnumber;}
}

// Parallel relaxation sweep: one thread per matrix entry, treating
// u = threadID/points as the source and v = threadID%points as the target of
// an edge, repeated `points` times (Bellman-Ford-style).
// NOTE(review): dev_dist[v] is read-modified-written by many threads with no
// atomics, so concurrent relaxations of the same v can lose updates; the
// result (dist and especially prev) is nondeterministic — verify against the
// CPU version.
// NOTE(review): __syncthreads() synchronizes only the threads WITHIN one
// block; this kernel is launched with `points` blocks, so there is no
// grid-wide barrier between iterations, and cross-block visibility of
// dev_dist updates between sweeps is not guaranteed.
// NOTE(review): the loop index i is unused — the loop exists only to repeat
// the sweep `points` times.
__global__
void updateDist(int* dev_dist,int* dev_distance,int* dev_prev)
{
  int threadID= blockDim.x * blockIdx.x + threadIdx.x;
  for(int i=0;i<points;i++)
    {
	if(dev_dist[threadID%points]>(dev_dist[threadID/points]+dev_distance[threadID]))
	{
		dev_dist[threadID%points]=dev_dist[threadID/points]+dev_distance[threadID];
		dev_prev[threadID%points]=threadID/points;
	}
        __syncthreads();
    }
}

// Driver: load the grid, build the cost matrix on the GPU, run the parallel
// relaxation kernel, time it, and print the route walked back via prev[].
int main()
{
   // static: world = points*points ints (~2.4 MB) is too large for the stack.
   static int map[points]={0};
   static int distance[world]={0};
   readfile(map);
   //initialize the distance map on the device
   int* dev_distance, *dev_map;
   cudaMalloc( (void**)&dev_distance, world*sizeof(int) );
   cudaMalloc( (void**)&dev_map, points*sizeof(int) );
   cudaMemcpy(dev_map,map,points*sizeof(int),cudaMemcpyHostToDevice );
   calDistance<<<points,points>>>(dev_map, dev_distance);
   // blocking copy: also synchronizes with the calDistance launch
   cudaMemcpy(distance,dev_distance,world*sizeof(int),cudaMemcpyDeviceToHost);
   cudaFree(dev_map);
   int prev[points]={};
   for(int success=0;success<points;success++)
   	 {
           prev[success]=startpoint;
   	 }
   int dist[points]={}; //distance to start point
   for(int i=0;i<points;i++)
   	 {
   	   dist[i]=distance[startpoint*points+i];
   	 }
   int*dev_dist, *dev_prev;
   cudaMalloc( (void**)&dev_dist, points*sizeof(int) );
   cudaMalloc( (void**)&dev_prev, points*sizeof(int) );
   cudaMemcpy(dev_dist,dist,points*sizeof(int),cudaMemcpyHostToDevice );
   cudaMemcpy(dev_prev,prev,points*sizeof(int),cudaMemcpyHostToDevice );
   clock_t startTime,endTime;
   startTime = clock();
   updateDist<<<points,points>>>(dev_dist, dev_distance, dev_prev);
   // BUG FIX: kernel launches are asynchronous -- without this sync the
   // original timed only the launch overhead, not the kernel execution.
   cudaDeviceSynchronize();
   endTime = clock();
   std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
   cudaMemcpy(dist,dev_dist,points*sizeof(int),cudaMemcpyDeviceToHost);
   cudaMemcpy(prev,dev_prev,points*sizeof(int),cudaMemcpyDeviceToHost);
   cudaFree(dev_prev);
   cudaFree(dev_dist);
   cudaFree(dev_distance);
   // Walk predecessors back from endpoint; bounded so a malformed prev[]
   // chain (possible -- the kernel's updates are racy) cannot loop forever.
   int routecount=endpoint;
   for(int steps=0; routecount!=startpoint && steps<points; steps++)
   {
     std::cout<<"the route is come from "<<prev[routecount]<<" to "<<routecount<<std::endl;
     routecount=prev[routecount];
   }
}

This is my gridmap.txt:

0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0

The run time of the plain C++ code is about 0.01 s, and the run time of the CUDA code is about 0.1 s. My computer has a GTX 960M. I am eager for your advice.

Can you format/indent your code with the “</>” button, so people have a chance to read the program?

Dear saulocpp,
Sorry, this is the first time I have used that feature. Can you see it now?

There are a few things I can point out without running your program, and then you investigate further:

  • You are launching a number of threads per block that is not a multiple of 32. In your case, it is 28 * 28. Search for “occupancy cuda” and you will get detailed information about it. But the objective is to keep as much of the device as possible.

  • Looking at line #59 of your second code block, all threads are performing a for() up to the number of points (in this case 28 * 28). This is inefficient to do on a GPU compared to a CPU which is clocked much higher (thus, performs this much faster). Of course there are problems that require that, for example, stencils, so we have to find optimal ways for the GPU to do it. Please search for “shared memory cuda” and read about the benefits of data reuse in a much faster portion (though smaller) of the card. The way it is done now requires constant data fetch from the “global” memory, which is slow compared to “shared” memory.

  • The other problem is that you are also measuring the time taken to copy data to/from device, and this affects your comparison. You have to either surround the kernel with the cudaEvent lines, or just use the profiler (nvprof) to get an accurate read of how long it is really taking to compute.

  • Last but not least, make sure you are compiling to a “release” version and not “debug” if using Nsight.

Dear saulocpp,
Really, thanks for your reply. For timing I’m using nvprof, not ctime — that is an unused part of my code — and I’m not using Nsight now. So I will try to solve the first and second problems in the coming days.

Just to fix my first observation, because it looks like I ate some words, what I meant was that what we want is to keep the device as busy as possible, as many of its computing units as possible. When you read more about occupancy, you will get the explanation to why it is this way.
Good luck.