When I use CUDA, my program gets slower

I’m new to CUDA programming. I want to use GPU acceleration to speed up Dijkstra’s algorithm, but the program gets slower after I port it to CUDA. I’m curious and want to know why.

This is my source code (without CUDA):

#include<iostream>
#include<sstream>
#include<string>
#include<fstream>
#include<ctime>
const int  maprow=28;               // grid rows
const int mapcolumn=28;             // grid columns
const int points=maprow*mapcolumn;  // number of cells = graph vertices
int maxnumber=10000;                // "infinity" sentinel for non-adjacent / blocked edges
const int startpoint=1;             // source vertex (row-major cell index)
const int endpoint=783;             // target vertex (last cell of the 28x28 grid)

// Reads a maprow x mapcolumn grid of whitespace-separated 0/1 values from a
// fixed path into `map` (1 = obstacle, 0 = free, as consumed by calCost).
// BUG FIX: the original never checked that the file opened, so a missing file
// silently left `map` unread.
// NOTE(review): the path is hard-coded; consider passing it as a parameter.
void readfile(int (*map)[mapcolumn])
{
  std::ifstream read;
  read.open("/home/wuhaoran/gridmap.txt");
  if(!read.is_open())
  {
    std::cerr<<"readfile: cannot open /home/wuhaoran/gridmap.txt"<<std::endl;
    return; // caller zero-initializes map, so it stays all-free on failure
  }
  for(int i=0;i<maprow;i++)
  {
	for(int j=0;j<mapcolumn;j++)
	  {
		  read>>map[i][j]; // on malformed input the stream goes bad and stops updating
	  }
  }
}

// Builds the dense points x points edge-cost matrix from the obstacle grid:
// distance[i][j] = 0 on the diagonal, 1 between free 4-neighbours, and
// maxnumber ("infinity") otherwise or when either endpoint is an obstacle.
void calCost(int (*map)[mapcolumn], int (*distance)[points])
{
	for(int i=0;i<points;i++)
	 {
	   for(int j=0;j<points;j++)
		  {
			const bool vertical = (j==i+mapcolumn) || (j==i-mapcolumn);
			// BUG FIX: horizontal neighbours must be in the SAME row; the
			// original j==i+1 / j==i-1 test wrongly connected the last cell
			// of one row to the first cell of the next (wrap-around edge).
			const bool horizontal = (j==i+1 || j==i-1)
			                        && (j/mapcolumn==i/mapcolumn);
			if(j==i)
			{	distance[i][j]=0;}
			else if(vertical || horizontal)
			{
				// an edge touching an obstacle cell is impassable
				if(map[i/mapcolumn][i%mapcolumn]==1||map[j/mapcolumn][j%mapcolumn]==1)
					distance[i][j]=maxnumber;
				else
					distance[i][j]=1;
			}
			else
			{distance[i][j]=maxnumber;}
		  }
	 }
}
// Dijkstra over the dense cost matrix `distance` (points x points).
// Returns the cost from startpoint to endpoint and fills prev[] so that
// prev[i] is the predecessor of i on a shortest path from startpoint.
int dijkstra(int startpoint, int endpoint, int (*distance)[points], int* prev)
{
	int dist[points];    // best known distance from startpoint
	int flag[points]={0};// 1 once a vertex's distance is finalized
	for(int i=0;i<points;i++)
	{
		prev[i]=startpoint;
		dist[i]=distance[startpoint][i];
	}
	// BUG FIX: the source was never marked visited in the original, so it
	// could be re-selected; mark it finalized up front.
	flag[startpoint]=1;
	dist[startpoint]=0;
	for(int iter=0;iter<points-1;iter++)
	{
		// BUG FIX: the original used `int min=100`, which is below the
		// maxnumber (10000) edge cost; once every unvisited vertex was
		// "far", no vertex matched, and the STALE minpoint from the
		// previous round was flagged and relaxed again.
		int min=maxnumber+1;
		int minpoint=-1;
		for(int j=0;j<points;j++)
		{
			if(flag[j]==0 && dist[j]<min)
			{
				min=dist[j];
				minpoint=j;
			}
		}
		if(minpoint<0)
			break; // all remaining vertices are unreachable
		flag[minpoint]=1;
		for(int m=0;m<points;m++)
		{
			// relax edge minpoint -> m (cost matrix is symmetric here)
			int tmp=min+distance[m][minpoint];
			if(flag[m]==0 && dist[m]>tmp)
			{
				dist[m]=tmp;
				prev[m]=minpoint;
			}
		}
	}
	return(dist[endpoint]);
}
// Driver: load the grid, build the cost matrix, run Dijkstra, print the
// elapsed time, the cost, and the route walked back via prev[].
int main()
{
   // static: distance is points*points ints (~2.4 MB) -- too large to risk
   // on the stack; map and prev kept static for consistency.
   static int prev[points]={}; //route, prev[i]=j means move from j to i
   static int map[maprow][mapcolumn]={};
   static int distance[points][points]={};
   readfile(map);
   calCost(map,distance);
   clock_t startTime,endTime;
   startTime = clock();
   int CostResult=dijkstra(startpoint,endpoint,distance,prev);
   endTime = clock();
   std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
   std::cout<<"Cost Result="<<CostResult<<std::endl;
   // Walk predecessors back from endpoint; bounded so a malformed prev[]
   // chain (e.g. endpoint unreachable) cannot loop forever.
   int routecount=endpoint;
   for(int steps=0; routecount!=startpoint && steps<points; steps++)
   {
   std::cout<<"the route is come from "<<prev[routecount]<<" to "<<routecount<<std::endl;
   routecount=prev[routecount];
   }
}

This is my CUDA code:

#include<iostream>
#include<sstream>
#include<string>
#include<fstream>
#include<ctime>
const int maprow=28;                // grid rows
const int mapcolumn=28;             // grid columns
const int points=maprow*mapcolumn;  // number of cells = graph vertices
const int world=points*points;      // size of the dense cost matrix
const int maxnumber=10000;          // "infinity" sentinel for non-adjacent / blocked edges
const int startpoint=1;             // source vertex (row-major cell index)
const int endpoint=783;             // target vertex (last cell of the 28x28 grid)

// Reads a maprow x mapcolumn grid of whitespace-separated 0/1 values from a
// fixed path into the flat row-major array `map` (1 = obstacle, 0 = free).
// BUG FIX: the original indexed map[i*maprow+j]; the row stride of a
// row-major grid is the COLUMN count. It worked only because the grid is
// square (28x28). Also added a file-open check.
void readfile(int *map)
{
 std::ifstream read;
 read.open("/home/wuhaoran/gridmap.txt");
 if(!read.is_open())
 {
   std::cerr<<"readfile: cannot open /home/wuhaoran/gridmap.txt"<<std::endl;
   return; // caller zero-initializes map, so it stays all-free on failure
 }
  for(int i=0;i<maprow;i++)
  {
    for(int j=0;j<mapcolumn;j++)
    {
      read>>map[i*mapcolumn+j];
    }
  }
}

// One thread per (pointA, pointB) pair fills one entry of the dense cost
// matrix: 0 on the diagonal, 1 between free 4-neighbours, maxnumber
// otherwise. Expected launch: at least world = points*points threads total.
__global__
void calDistance(int* dev_map, int* dev_distance)
{
 int threadID= blockDim.x * blockIdx.x + threadIdx.x;
 if(threadID>=world) return; // guard the grid tail
 int pointA=threadID/points;
 int pointB=threadID%points;//two points distance
 bool vertical = (pointB==pointA+mapcolumn) || (pointB==pointA-mapcolumn);
 // BUG FIX: horizontal neighbours must be in the SAME row; the original
 // pointB==pointA+1 / -1 test wrongly connected the last cell of one row to
 // the first cell of the next (wrap-around edge).
 bool horizontal = (pointB==pointA+1 || pointB==pointA-1)
                   && (pointB/mapcolumn==pointA/mapcolumn);
   if(pointA==pointB)
    {dev_distance[threadID]=0;}
   else if(vertical||horizontal)
    {
	  // an edge touching an obstacle cell is impassable
	  if(dev_map[pointB]==1||dev_map[pointA]==1)
	    dev_distance[threadID]=maxnumber;
	  else
	    dev_distance[threadID]=1;
    }
   else
    {dev_distance[threadID]=maxnumber;}
}

// Parallel relaxation sweep: one thread per matrix entry, treating
// u = threadID/points as the source and v = threadID%points as the target of
// an edge, repeated `points` times (Bellman-Ford-style).
// NOTE(review): dev_dist[v] is read-modified-written by many threads with no
// atomics, so concurrent relaxations of the same v can lose updates; the
// result (dist and especially prev) is nondeterministic — verify against the
// CPU version.
// NOTE(review): __syncthreads() synchronizes only the threads WITHIN one
// block; this kernel is launched with `points` blocks, so there is no
// grid-wide barrier between iterations, and cross-block visibility of
// dev_dist updates between sweeps is not guaranteed.
// NOTE(review): the loop index i is unused — the loop exists only to repeat
// the sweep `points` times.
__global__
void updateDist(int* dev_dist,int* dev_distance,int* dev_prev)
{
  int threadID= blockDim.x * blockIdx.x + threadIdx.x;
  for(int i=0;i<points;i++)
    {
	if(dev_dist[threadID%points]>(dev_dist[threadID/points]+dev_distance[threadID]))
	{
		dev_dist[threadID%points]=dev_dist[threadID/points]+dev_distance[threadID];
		dev_prev[threadID%points]=threadID/points;
	}
        __syncthreads();
    }
}

// Driver: load the grid, build the cost matrix on the GPU, run the parallel
// relaxation kernel, time it, and print the route walked back via prev[].
int main()
{
   // static: world = points*points ints (~2.4 MB) is too large for the stack.
   static int map[points]={0};
   static int distance[world]={0};
   readfile(map);
   //initialize the distance map on the device
   int* dev_distance, *dev_map;
   cudaMalloc( (void**)&dev_distance, world*sizeof(int) );
   cudaMalloc( (void**)&dev_map, points*sizeof(int) );
   cudaMemcpy(dev_map,map,points*sizeof(int),cudaMemcpyHostToDevice );
   calDistance<<<points,points>>>(dev_map, dev_distance);
   // blocking copy: also synchronizes with the calDistance launch
   cudaMemcpy(distance,dev_distance,world*sizeof(int),cudaMemcpyDeviceToHost);
   cudaFree(dev_map);
   int prev[points]={};
   for(int success=0;success<points;success++)
   	 {
           prev[success]=startpoint;
   	 }
   int dist[points]={}; //distance to start point
   for(int i=0;i<points;i++)
   	 {
   	   dist[i]=distance[startpoint*points+i];
   	 }
   int*dev_dist, *dev_prev;
   cudaMalloc( (void**)&dev_dist, points*sizeof(int) );
   cudaMalloc( (void**)&dev_prev, points*sizeof(int) );
   cudaMemcpy(dev_dist,dist,points*sizeof(int),cudaMemcpyHostToDevice );
   cudaMemcpy(dev_prev,prev,points*sizeof(int),cudaMemcpyHostToDevice );
   clock_t startTime,endTime;
   startTime = clock();
   updateDist<<<points,points>>>(dev_dist, dev_distance, dev_prev);
   // BUG FIX: kernel launches are asynchronous -- without this sync the
   // original timed only the launch overhead, not the kernel execution.
   cudaDeviceSynchronize();
   endTime = clock();
   std::cout<<"The run time is:"<<(double)(endTime-startTime)/CLOCKS_PER_SEC<<"s"<<std::endl;
   cudaMemcpy(dist,dev_dist,points*sizeof(int),cudaMemcpyDeviceToHost);
   cudaMemcpy(prev,dev_prev,points*sizeof(int),cudaMemcpyDeviceToHost);
   cudaFree(dev_prev);
   cudaFree(dev_dist);
   cudaFree(dev_distance);
   // Walk predecessors back from endpoint; bounded so a malformed prev[]
   // chain (possible -- the kernel's updates are racy) cannot loop forever.
   int routecount=endpoint;
   for(int steps=0; routecount!=startpoint && steps<points; steps++)
   {
     std::cout<<"the route is come from "<<prev[routecount]<<" to "<<routecount<<std::endl;
     routecount=prev[routecount];
   }
}

This is my gridmap.txt:

0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0
1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1
1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0

The run time of the plain C++ code is about 0.01 s, and the run time of the CUDA code is about 0.1 s. My computer has a GTX 960M. I am eager for your advice.

Can you format/indent your code with the “</>” button, so people have a chance to read the program?

Dear saulocpp,
Sorry, this is the first time I have used that feature. Can you see it now?

There are a few things I can point out without running your program, and then you investigate further:

  • You are launching a number of threads per block that is not a multiple of 32. In your case, it is 28 * 28. Search for “occupancy cuda” and you will get detailed information about it. But the objective is to keep as much of the device as possible.

  • Looking at line #59 of your second code block, all threads are performing a for() up to the number of points (in this case 28 * 28). This is inefficient to do on a GPU compared to a CPU which is clocked much higher (thus, performs this much faster). Of course there are problems that require that, for example, stencils, so we have to find optimal ways for the GPU to do it. Please search for “shared memory cuda” and read about the benefits of data reuse in a much faster portion (though smaller) of the card. The way it is done now requires constant data fetch from the “global” memory, which is slow compared to “shared” memory.

  • The other problem is that you are also measuring the time taken to copy data to/from device, and this affects your comparison. You have to either surround the kernel with the cudaEvent lines, or just use the profiler (nvprof) to get an accurate read of how long it is really taking to compute.

  • Last but not least, make sure you are compiling to a “release” version and not “debug” if using Nsight.

Dear saulocpp,
Really, thanks for your reply. For timing I’m using nvprof, not ctime — that is an unused part of my code — and I’m not using Nsight now. So I will try to solve the first and second problems in the coming days.

Just to fix my first observation, because it looks like I ate some words, what I meant was that what we want is to keep the device as busy as possible, as many of its computing units as possible. When you read more about occupancy, you will get the explanation to why it is this way.
Good luck.