TSP (Travelling Salesman Problem) on CUDA

ilbona82 · October 28, 2010, 3:35pm

Hello, does anyone have a program on the TSP written in CUDA to give me?

I’ve written it, but it is too slow compared to the CPU, and I was faster than the CPU.

This is my Code:

#include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#include <errno.h>

#define MAXN 12

#define FNAME "file.txt"

#define INF 1e20

int c1[MAXN];

int c2[MAXN];

int n;

__shared__ float minsum;

__shared__ int nc;

__shared__ int p[MAXN];

__shared__ float mat[MAXN][MAXN];

int *a_h, *b_h;	 // pointers to host memory

int *a_d, *b_d;	 // pointers to device memory

float *s_h;

float *s_d;

void getgeom(int z)

{	FILE *fp;

	fp = fopen(FNAME, "r");

	for (n = 0; n <= z; n++) {

	fscanf(fp, "%d %d", &c1[n], &c2[n]);

	}

}

__device__ float sqr(float x)

{  return x*x;

}

__device__ float distanza(int i, int j,int* a_d, int* b_d)

{

	return (sqrt(sqr(a_d[i]-a_d[j])+sqr(b_d[i]-b_d[j])));

}

__device__ void save(float sum, float* s_d)

{

	if (sum < minsum) {

		minsum = sum;

		for (int i = 0; i < nc; i++) {

			s_d[i+1] = p[i];

		}

				s_d[0]=minsum;

//__syncthreads();

	}

}

/* ------------------------------------------------------------*/

__device__ void swap(int i, int j)

{	int t = p[i];

	p[i] = p[j];

	p[j] = t;

//__syncthreads();

}

__device__ void check(float sum, float* s_d)

{	sum += mat[p[0]][p[nc-1]];

//__syncthreads();

	save(sum,s_d);

}

__device__ void search11(int m, float sum, float* s_d)

{

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	}

}

__device__ void search10(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search11(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search9(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search10(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search8(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search9(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search7(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search8(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search6(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search7(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search5(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search6(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search4(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search5(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search3(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search4(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search2(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search3(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search2(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__global__ void VecAdd(int* a_d, int* b_d, float* s_d, int n) {

//int i = blockDim.x * blockIdx.x + threadIdx.x;

nc=n;

minsum = INF;

for (int i=0; i<nc; i++) {

	p[i] = i;

	for (int j=0; j<nc; j++) {

		mat[i][j] = distanza(i,j,a_d,b_d);

	}

}

search(nc-1,0,s_d);

}

/* ------------------------------------------------------------*/

int main(int argc, char** argv)

{	int i, start;

	float secs;

	if (argc != 2){

		fprintf(stderr,"Numero parametri errato. Digitare il numero di citta.\n");

		exit(1);

	}

	n = atoi(argv[1]);

	if (errno != 0 || n < 2 || n > MAXN) {

		fprintf(stderr,"Numero citta non corretto; mettere un numero da 2 a %d.\n", MAXN);

		exit(2);

	}

	getgeom(n-1);

	printf("-------------------------------------\n");

	printf(" Travel Salesman problem			 \n");

	printf("\n");

	printf(" Num. Citta	   = %d			   \n", n);

	printf("-------------------------------------\n");

	// allocate arrays on host

	a_h = (int *)malloc(sizeof(int)*MAXN);

	b_h = (int *)malloc(sizeof(int)*MAXN);

	s_h = (float *)malloc(sizeof(float)*(MAXN+1));

	for (i=0; i<MAXN; i++) {

		a_h[i] = c1[i];

		b_h[i] = c2[i];

		s_h[i] = 0;

	}

		s_h[MAXN] = 0;

	// allocate arrays on device

	cudaMalloc((void **) &a_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &b_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &s_d, sizeof(float)*(MAXN+1));

	// send data from host to device

	cudaMemcpy(a_d, a_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(b_d, b_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(s_d, s_h, sizeof(float)*(MAXN+1), cudaMemcpyHostToDevice);

	start = clock();

	int threadsPerBlock = 1;

	int blocksPerGrid = 8;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, s_d, n);

	// retrieve data from device

	//cudaMemcpy(a_h, a_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	//cudaMemcpy(b_h, b_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	cudaMemcpy(s_h, s_d, sizeof(float)*(MAXN+1), cudaMemcpyDeviceToHost);

	secs = ((float) clock() - (float) start) / (float) CLOCKS_PER_SEC;

	//printf("-------------------------------------\n");

	printf("sequenza originaria				  \n");

	for (i = 0; i < n; i++) {

			   	printf(" %d: %d %d \n", i+1, c1[i],  c2[i]);  

	}

	printf("-------------------------------------\n");

	printf(" tempo (sec) = %f					\n", secs);

	printf(" distanza	= %f					\n", s_h[0]);

	printf("-------------------------------------\n");

	printf("sequenza minima					  \n");

	for (i = 0; i < n; i++) {

	int a = (int) (s_h[i+1]+1);

			printf(" %d: %d %d \n", a, c1[a-1],  c2[a-1]);   

	}

	printf("-------------------------------------\n");

	// clean

	free(a_h); free(b_h); free(s_h);

	cudaFree(a_d); cudaFree(b_d); cudaFree(s_d);

}

Tips on how to improve it?

Thank You :)

ilbona82 · October 28, 2010, 3:35pm

Hello, does anyone have a program on the TSP written in CUDA to give me?

I’ve written it, but it is too slow compared to the CPU, and I was faster than the CPU.

This is my Code:

#include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#include <errno.h>

#define MAXN 12

#define FNAME "file.txt"

#define INF 1e20

int c1[MAXN];

int c2[MAXN];

int n;

__shared__ float minsum;

__shared__ int nc;

__shared__ int p[MAXN];

__shared__ float mat[MAXN][MAXN];

int *a_h, *b_h;	 // pointers to host memory

int *a_d, *b_d;	 // pointers to device memory

float *s_h;

float *s_d;

void getgeom(int z)

{	FILE *fp;

	fp = fopen(FNAME, "r");

	for (n = 0; n <= z; n++) {

	fscanf(fp, "%d %d", &c1[n], &c2[n]);

	}

}

__device__ float sqr(float x)

{  return x*x;

}

__device__ float distanza(int i, int j,int* a_d, int* b_d)

{

	return (sqrt(sqr(a_d[i]-a_d[j])+sqr(b_d[i]-b_d[j])));

}

__device__ void save(float sum, float* s_d)

{

	if (sum < minsum) {

		minsum = sum;

		for (int i = 0; i < nc; i++) {

			s_d[i+1] = p[i];

		}

				s_d[0]=minsum;

//__syncthreads();

	}

}

/* ------------------------------------------------------------*/

__device__ void swap(int i, int j)

{	int t = p[i];

	p[i] = p[j];

	p[j] = t;

//__syncthreads();

}

__device__ void check(float sum, float* s_d)

{	sum += mat[p[0]][p[nc-1]];

//__syncthreads();

	save(sum,s_d);

}

__device__ void search11(int m, float sum, float* s_d)

{

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	}

}

__device__ void search10(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search11(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search9(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search10(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search8(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search9(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search7(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search8(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search6(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search7(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search5(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search6(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search4(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search5(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search3(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search4(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search2(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search3(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search2(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__global__ void VecAdd(int* a_d, int* b_d, float* s_d, int n) {

//int i = blockDim.x * blockIdx.x + threadIdx.x;

nc=n;

minsum = INF;

for (int i=0; i<nc; i++) {

	p[i] = i;

	for (int j=0; j<nc; j++) {

		mat[i][j] = distanza(i,j,a_d,b_d);

	}

}

search(nc-1,0,s_d);

}

/* ------------------------------------------------------------*/

int main(int argc, char** argv)

{	int i, start;

	float secs;

	if (argc != 2){

		fprintf(stderr,"Numero parametri errato. Digitare il numero di citta.\n");

		exit(1);

	}

	n = atoi(argv[1]);

	if (errno != 0 || n < 2 || n > MAXN) {

		fprintf(stderr,"Numero citta non corretto; mettere un numero da 2 a %d.\n", MAXN);

		exit(2);

	}

	getgeom(n-1);

	printf("-------------------------------------\n");

	printf(" Travel Salesman problem			 \n");

	printf("\n");

	printf(" Num. Citta	   = %d			   \n", n);

	printf("-------------------------------------\n");

	// allocate arrays on host

	a_h = (int *)malloc(sizeof(int)*MAXN);

	b_h = (int *)malloc(sizeof(int)*MAXN);

	s_h = (float *)malloc(sizeof(float)*(MAXN+1));

	for (i=0; i<MAXN; i++) {

		a_h[i] = c1[i];

		b_h[i] = c2[i];

		s_h[i] = 0;

	}

		s_h[MAXN] = 0;

	// allocate arrays on device

	cudaMalloc((void **) &a_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &b_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &s_d, sizeof(float)*(MAXN+1));

	// send data from host to device

	cudaMemcpy(a_d, a_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(b_d, b_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(s_d, s_h, sizeof(float)*(MAXN+1), cudaMemcpyHostToDevice);

	start = clock();

	int threadsPerBlock = 1;

	int blocksPerGrid = 8;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, s_d, n);

	// retrieve data from device

	//cudaMemcpy(a_h, a_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	//cudaMemcpy(b_h, b_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	cudaMemcpy(s_h, s_d, sizeof(float)*(MAXN+1), cudaMemcpyDeviceToHost);

	secs = ((float) clock() - (float) start) / (float) CLOCKS_PER_SEC;

	//printf("-------------------------------------\n");

	printf("sequenza originaria				  \n");

	for (i = 0; i < n; i++) {

			   	printf(" %d: %d %d \n", i+1, c1[i],  c2[i]);  

	}

	printf("-------------------------------------\n");

	printf(" tempo (sec) = %f					\n", secs);

	printf(" distanza	= %f					\n", s_h[0]);

	printf("-------------------------------------\n");

	printf("sequenza minima					  \n");

	for (i = 0; i < n; i++) {

	int a = (int) (s_h[i+1]+1);

			printf(" %d: %d %d \n", a, c1[a-1],  c2[a-1]);   

	}

	printf("-------------------------------------\n");

	// clean

	free(a_h); free(b_h); free(s_h);

	cudaFree(a_d); cudaFree(b_d); cudaFree(s_d);

}

Tips on how to improve it?

Thank You :)

happyjack272 · October 28, 2010, 5:48pm

first off, without commenting on the code itself, do you need the best solution or just a very good solution? because there are metaheuristic methods of finding very close to optimal solutions in a much lower computational complexity time than NP. for instance, you could use a genetic algorithm with repairing of invalid solutions (e.g. breaking cyclic paths). you just need to design the genome so as to represent a flat/unbiased search space. e.g. the genome could consist of one data point per destination(node): the next destination. then calculating the total path length would just be a matter of looking up values in a 2-dimensional array ( surf2Dread(&distance, dtable, from*4, to); ) and adding them up, and checking for cyclicy(sp?) would be a matter of checking off each node you’ve visited and when you encounter a checked off node it’s cyclic. it seems a program like this would approach the solution exponentially faster than exhaustive search.

happyjack272 · October 28, 2010, 5:48pm

first off, without commenting on the code itself, do you need the best solution or just a very good solution? because there are metaheuristic methods of finding very close to optimal solutions in a much lower computational complexity time than NP. for instance, you could use a genetic algorithm with repairing of invalid solutions (e.g. breaking cyclic paths). you just need to design the genome so as to represent a flat/unbiased search space. e.g. the genome could consist of one data point per destination(node): the next destination. then calculating the total path length would just be a matter of looking up values in a 2-dimensional array ( surf2Dread(&distance, dtable, from*4, to); ) and adding them up, and checking for cyclicy(sp?) would be a matter of checking off each node you’ve visited and when you encounter a checked off node it’s cyclic. it seems a program like this would approach the solution exponentially faster than exhaustive search.

happyjack272 · October 28, 2010, 6:03pm

your’re using the gpu as if it was single-threaded. you have to assign different data elements to different threads, like so:

__global__ void createDistanceTable(int n) {

	int i0 = threadIdx.x+blockIdx.x*blockDim.x;

	int is = blockDim.x*gridDim.x;

	int j0 = threadIdx.y+blockIdx.y*blockDim.y;

	int js = blockDim.y*gridDim.y;

	for (int j=j0; j<n; j+=js) {

		float x1 = sourceData_x[j];

		float y1 = sourceData_y[j];

		for (int i=i0; i<n; i+=is) {

			float x2 = sourceData_x[i];

			float y2 = sourceData_y[i];

			distanceTable[j][i] = sqrt(sqr(x1-x2)+sqr(y1-y2));

		}

	}

}

then make sure blockDim.x is a multiple of 32. and the algorithm could benefit from the sourceData and distanceTable being textures or surfaces.

(described in section 3.4.2 of the programming guide, i believe)

happyjack272 · October 28, 2010, 6:03pm

your’re using the gpu as if it was single-threaded. you have to assign different data elements to different threads, like so:

__global__ void createDistanceTable(int n) {

	int i0 = threadIdx.x+blockIdx.x*blockDim.x;

	int is = blockDim.x*gridDim.x;

	int j0 = threadIdx.y+blockIdx.y*blockDim.y;

	int js = blockDim.y*gridDim.y;

	for (int j=j0; j<n; j+=js) {

		float x1 = sourceData_x[j];

		float y1 = sourceData_y[j];

		for (int i=i0; i<n; i+=is) {

			float x2 = sourceData_x[i];

			float y2 = sourceData_y[i];

			distanceTable[j][i] = sqrt(sqr(x1-x2)+sqr(y1-y2));

		}

	}

}

then make sure blockDim.x is a multiple of 32. and the algorithm could benefit from the sourceData and distanceTable being textures or surfaces.

(described in section 3.4.2 of the programming guide, i believe)

SPWorley · October 28, 2010, 6:11pm

Your code uses 8 threads total on the GPU. This will never be efficient… you typically want many thousands of threads in flight at once.
Try a better starting configuration, something like 128 threads per block with 1000 blocks.

Your code also has your 8 threads all repeating the exact same calculation… there’s a commented out line like you were thinking of having the threads do different work based on blockIdx.x and threadIdx.x, which is what you must do, or you’d be wasting nearly all your work.

SPWorley · October 28, 2010, 6:11pm

Your code uses 8 threads total on the GPU. This will never be efficient… you typically want many thousands of threads in flight at once.
Try a better starting configuration, something like 128 threads per block with 1000 blocks.

Your code also has your 8 threads all repeating the exact same calculation… there’s a commented out line like you were thinking of having the threads do different work based on blockIdx.x and threadIdx.x, which is what you must do, or you’d be wasting nearly all your work.

ilbona82 · October 29, 2010, 9:25am

Thanks for the replies. I need just a good solution for TSP, i try with a genetic algorithm to use much many thread if I able :P

ilbona82 · October 29, 2010, 9:25am

Thanks for the replies. I need just a good solution for TSP, i try with a genetic algorithm to use much many thread if I able :P

Topic		Replies	Views
The Travelling Salesman Problem My Numerical Analysis Homework CUDA Programming and Performance	5	1910	November 15, 2010
TSP Brute Force CUDA Programming and Performance	0	3582	September 10, 2011
Getting strange result difference executing code on K40 GPU with CUDA 8 CUDA Programming and Performance	12	1370	March 9, 2017
Vehicle Routing Problem with CUDA CUDA Programming and Performance	18	4406	January 14, 2010
Is this a task in which CUDA could speed up things? CUDA Programming and Performance	11	11827	September 29, 2009
Is GPU worth it? GPU currently too slow. CUDA Programming and Performance	16	6080	December 8, 2008
2600x speedup for GA? Is this fake? CUDA Programming and Performance	19	15039	March 12, 2010
Effective way to solve small TSP in a single CUDA core? CUDA Programming and Performance	0	469	January 11, 2019
System just halts for simple CUDA program CUDA Programming and Performance	40	4398	September 20, 2010
Does workload have an effect on speed of CUDA ? CUDA Programming and Performance	1	2337	March 3, 2010

TSP (Travelling Salesman Problem) on CUDA

Related topics