TSP (Travelling Salesman Problem) on CUDA

Hello, does anyone have a program on the TSP written in CUDA to give me?

I’ve written it, but it is too slow compared to the CPU, and I was faster than the CPU.

This is my Code:

#include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#include <errno.h>

#define MAXN 12

#define FNAME "file.txt"

#define INF 1e20

int c1[MAXN];

int c2[MAXN];

int n;

__shared__ float minsum;

__shared__ int nc;

__shared__ int p[MAXN];

__shared__ float mat[MAXN][MAXN];

int *a_h, *b_h;	 // pointers to host memory

int *a_d, *b_d;	 // pointers to device memory

float *s_h;

float *s_d;

void getgeom(int z)

{	FILE *fp;

	fp = fopen(FNAME, "r");

	for (n = 0; n <= z; n++) {

	fscanf(fp, "%d %d", &c1[n], &c2[n]);

	}

}

__device__ float sqr(float x)

{  return x*x;

}

__device__ float distanza(int i, int j,int* a_d, int* b_d)

{

	return (sqrt(sqr(a_d[i]-a_d[j])+sqr(b_d[i]-b_d[j])));

}

__device__ void save(float sum, float* s_d)

{

	if (sum < minsum) {

		minsum = sum;

		for (int i = 0; i < nc; i++) {

			s_d[i+1] = p[i];

		}

				s_d[0]=minsum;

//__syncthreads();

	}

}

/* ------------------------------------------------------------*/

__device__ void swap(int i, int j)

{	int t = p[i];

	p[i] = p[j];

	p[j] = t;

//__syncthreads();

}

__device__ void check(float sum, float* s_d)

{	sum += mat[p[0]][p[nc-1]];

//__syncthreads();

	save(sum,s_d);

}

__device__ void search11(int m, float sum, float* s_d)

{

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	}

}

__device__ void search10(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search11(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search9(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search10(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search8(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search9(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search7(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search8(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search6(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search7(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search5(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search6(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search4(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search5(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search3(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search4(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search2(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search3(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search2(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__global__ void VecAdd(int* a_d, int* b_d, float* s_d, int n) {

//int i = blockDim.x * blockIdx.x + threadIdx.x;

nc=n;

minsum = INF;

for (int i=0; i<nc; i++) {

	p[i] = i;

	for (int j=0; j<nc; j++) {

		mat[i][j] = distanza(i,j,a_d,b_d);

	}

}

search(nc-1,0,s_d);

}

/* ------------------------------------------------------------*/

int main(int argc, char** argv)

{	int i, start;

	float secs;

	if (argc != 2){

		fprintf(stderr,"Numero parametri errato. Digitare il numero di citta.\n");

		exit(1);

	}

	n = atoi(argv[1]);

	if (errno != 0 || n < 2 || n > MAXN) {

		fprintf(stderr,"Numero citta non corretto; mettere un numero da 2 a %d.\n", MAXN);

		exit(2);

	}

	getgeom(n-1);

	printf("-------------------------------------\n");

	printf(" Travel Salesman problem			 \n");

	printf("\n");

	printf(" Num. Citta	   = %d			   \n", n);

	printf("-------------------------------------\n");

	// allocate arrays on host

	a_h = (int *)malloc(sizeof(int)*MAXN);

	b_h = (int *)malloc(sizeof(int)*MAXN);

	s_h = (float *)malloc(sizeof(float)*(MAXN+1));

	for (i=0; i<MAXN; i++) {

		a_h[i] = c1[i];

		b_h[i] = c2[i];

		s_h[i] = 0;

	}

		s_h[MAXN] = 0;

	// allocate arrays on device

	cudaMalloc((void **) &a_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &b_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &s_d, sizeof(float)*(MAXN+1));

	// send data from host to device

	cudaMemcpy(a_d, a_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(b_d, b_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(s_d, s_h, sizeof(float)*(MAXN+1), cudaMemcpyHostToDevice);

	start = clock();

	int threadsPerBlock = 1;

	int blocksPerGrid = 8;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, s_d, n);

	// retrieve data from device

	//cudaMemcpy(a_h, a_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	//cudaMemcpy(b_h, b_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	cudaMemcpy(s_h, s_d, sizeof(float)*(MAXN+1), cudaMemcpyDeviceToHost);

	secs = ((float) clock() - (float) start) / (float) CLOCKS_PER_SEC;

	//printf("-------------------------------------\n");

	printf("sequenza originaria				  \n");

	for (i = 0; i < n; i++) {

			   	printf(" %d: %d %d \n", i+1, c1[i],  c2[i]);  

	}

	printf("-------------------------------------\n");

	printf(" tempo (sec) = %f					\n", secs);

	printf(" distanza	= %f					\n", s_h[0]);

	printf("-------------------------------------\n");

	printf("sequenza minima					  \n");

	for (i = 0; i < n; i++) {

	int a = (int) (s_h[i+1]+1);

			printf(" %d: %d %d \n", a, c1[a-1],  c2[a-1]);   

	}

	printf("-------------------------------------\n");

	// clean

	free(a_h); free(b_h); free(s_h);

	cudaFree(a_d); cudaFree(b_d); cudaFree(s_d);

}

Tips on how to improve it?

Thank You :)

Hello, does anyone have a program on the TSP written in CUDA to give me?

I’ve written it, but it is too slow compared to the CPU, and I was faster than the CPU.

This is my Code:

#include <stdio.h>

#include <time.h>

#include <math.h>

#include <stdlib.h>

#include <errno.h>

#define MAXN 12

#define FNAME "file.txt"

#define INF 1e20

int c1[MAXN];

int c2[MAXN];

int n;

__shared__ float minsum;

__shared__ int nc;

__shared__ int p[MAXN];

__shared__ float mat[MAXN][MAXN];

int *a_h, *b_h;	 // pointers to host memory

int *a_d, *b_d;	 // pointers to device memory

float *s_h;

float *s_d;

void getgeom(int z)

{	FILE *fp;

	fp = fopen(FNAME, "r");

	for (n = 0; n <= z; n++) {

	fscanf(fp, "%d %d", &c1[n], &c2[n]);

	}

}

__device__ float sqr(float x)

{  return x*x;

}

__device__ float distanza(int i, int j,int* a_d, int* b_d)

{

	return (sqrt(sqr(a_d[i]-a_d[j])+sqr(b_d[i]-b_d[j])));

}

__device__ void save(float sum, float* s_d)

{

	if (sum < minsum) {

		minsum = sum;

		for (int i = 0; i < nc; i++) {

			s_d[i+1] = p[i];

		}

				s_d[0]=minsum;

//__syncthreads();

	}

}

/* ------------------------------------------------------------*/

__device__ void swap(int i, int j)

{	int t = p[i];

	p[i] = p[j];

	p[j] = t;

//__syncthreads();

}

__device__ void check(float sum, float* s_d)

{	sum += mat[p[0]][p[nc-1]];

//__syncthreads();

	save(sum,s_d);

}

__device__ void search11(int m, float sum, float* s_d)

{

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	}

}

__device__ void search10(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search11(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search9(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search10(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search8(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search9(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search7(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search8(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search6(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search7(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search5(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search6(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search4(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search5(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search3(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search4(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search2(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search3(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__device__ void search(int m, float sum, float* s_d)

{	int i;

	if (m == 1) {

		check(sum + mat[p[0]][p[1]],s_d);

	} else {

		for (i = m-1; i >= 0; i--) {

			swap(i, m-1);

			search2(m-1, sum + mat[p[m-1]][p[m]],s_d);

			swap(i, m-1);

		}

	}

}

__global__ void VecAdd(int* a_d, int* b_d, float* s_d, int n) {

//int i = blockDim.x * blockIdx.x + threadIdx.x;

nc=n;

minsum = INF;

for (int i=0; i<nc; i++) {

	p[i] = i;

	for (int j=0; j<nc; j++) {

		mat[i][j] = distanza(i,j,a_d,b_d);

	}

}

search(nc-1,0,s_d);

}

/* ------------------------------------------------------------*/

int main(int argc, char** argv)

{	int i, start;

	float secs;

	if (argc != 2){

		fprintf(stderr,"Numero parametri errato. Digitare il numero di citta.\n");

		exit(1);

	}

	n = atoi(argv[1]);

	if (errno != 0 || n < 2 || n > MAXN) {

		fprintf(stderr,"Numero citta non corretto; mettere un numero da 2 a %d.\n", MAXN);

		exit(2);

	}

	getgeom(n-1);

	printf("-------------------------------------\n");

	printf(" Travel Salesman problem			 \n");

	printf("\n");

	printf(" Num. Citta	   = %d			   \n", n);

	printf("-------------------------------------\n");

	// allocate arrays on host

	a_h = (int *)malloc(sizeof(int)*MAXN);

	b_h = (int *)malloc(sizeof(int)*MAXN);

	s_h = (float *)malloc(sizeof(float)*(MAXN+1));

	for (i=0; i<MAXN; i++) {

		a_h[i] = c1[i];

		b_h[i] = c2[i];

		s_h[i] = 0;

	}

		s_h[MAXN] = 0;

	// allocate arrays on device

	cudaMalloc((void **) &a_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &b_d, sizeof(int)*MAXN);

	cudaMalloc((void **) &s_d, sizeof(float)*(MAXN+1));

	// send data from host to device

	cudaMemcpy(a_d, a_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(b_d, b_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);

	cudaMemcpy(s_d, s_h, sizeof(float)*(MAXN+1), cudaMemcpyHostToDevice);

	start = clock();

	int threadsPerBlock = 1;

	int blocksPerGrid = 8;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, s_d, n);

	// retrieve data from device

	//cudaMemcpy(a_h, a_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	//cudaMemcpy(b_h, b_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);

	cudaMemcpy(s_h, s_d, sizeof(float)*(MAXN+1), cudaMemcpyDeviceToHost);

	secs = ((float) clock() - (float) start) / (float) CLOCKS_PER_SEC;

	//printf("-------------------------------------\n");

	printf("sequenza originaria				  \n");

	for (i = 0; i < n; i++) {

			   	printf(" %d: %d %d \n", i+1, c1[i],  c2[i]);  

	}

	printf("-------------------------------------\n");

	printf(" tempo (sec) = %f					\n", secs);

	printf(" distanza	= %f					\n", s_h[0]);

	printf("-------------------------------------\n");

	printf("sequenza minima					  \n");

	for (i = 0; i < n; i++) {

	int a = (int) (s_h[i+1]+1);

			printf(" %d: %d %d \n", a, c1[a-1],  c2[a-1]);   

	}

	printf("-------------------------------------\n");

	// clean

	free(a_h); free(b_h); free(s_h);

	cudaFree(a_d); cudaFree(b_d); cudaFree(s_d);

}

Tips on how to improve it?

Thank You :)

first off, without commenting on the code itself, do you need the best solution or just a very good solution? because there are metaheuristic methods of finding very close to optimal solutions in a much lower computational complexity time than NP. for instance, you could use a genetic algorithm with repairing of invalid solutions (e.g. breaking cyclic paths). you just need to design the genome so as to represent a flat/unbiased search space. e.g. the genome could consist of one data point per destination(node): the next destination. then calculating the total path length would just be a matter of looking up values in a 2-dimensional array ( surf2Dread(&distance, dtable, from*4, to); ) and adding them up, and checking for cyclicy(sp?) would be a matter of checking off each node you’ve visited and when you encounter a checked off node it’s cyclic. it seems a program like this would approach the solution exponentially faster than exhaustive search.

first off, without commenting on the code itself, do you need the best solution or just a very good solution? because there are metaheuristic methods of finding very close to optimal solutions in a much lower computational complexity time than NP. for instance, you could use a genetic algorithm with repairing of invalid solutions (e.g. breaking cyclic paths). you just need to design the genome so as to represent a flat/unbiased search space. e.g. the genome could consist of one data point per destination(node): the next destination. then calculating the total path length would just be a matter of looking up values in a 2-dimensional array ( surf2Dread(&distance, dtable, from*4, to); ) and adding them up, and checking for cyclicy(sp?) would be a matter of checking off each node you’ve visited and when you encounter a checked off node it’s cyclic. it seems a program like this would approach the solution exponentially faster than exhaustive search.

your’re using the gpu as if it was single-threaded. you have to assign different data elements to different threads, like so:

__global__ void createDistanceTable(int n) {

	int i0 = threadIdx.x+blockIdx.x*blockDim.x;

	int is = blockDim.x*gridDim.x;

	int j0 = threadIdx.y+blockIdx.y*blockDim.y;

	int js = blockDim.y*gridDim.y;

	for (int j=j0; j<n; j+=js) {

		float x1 = sourceData_x[j];

		float y1 = sourceData_y[j];

		for (int i=i0; i<n; i+=is) {

			float x2 = sourceData_x[i];

			float y2 = sourceData_y[i];

			distanceTable[j][i] = sqrt(sqr(x1-x2)+sqr(y1-y2));

		}

	}

}

then make sure blockDim.x is a multiple of 32. and the algorithm could benefit from the sourceData and distanceTable being textures or surfaces.

(described in section 3.4.2 of the programming guide, i believe)

your’re using the gpu as if it was single-threaded. you have to assign different data elements to different threads, like so:

__global__ void createDistanceTable(int n) {

	int i0 = threadIdx.x+blockIdx.x*blockDim.x;

	int is = blockDim.x*gridDim.x;

	int j0 = threadIdx.y+blockIdx.y*blockDim.y;

	int js = blockDim.y*gridDim.y;

	for (int j=j0; j<n; j+=js) {

		float x1 = sourceData_x[j];

		float y1 = sourceData_y[j];

		for (int i=i0; i<n; i+=is) {

			float x2 = sourceData_x[i];

			float y2 = sourceData_y[i];

			distanceTable[j][i] = sqrt(sqr(x1-x2)+sqr(y1-y2));

		}

	}

}

then make sure blockDim.x is a multiple of 32. and the algorithm could benefit from the sourceData and distanceTable being textures or surfaces.

(described in section 3.4.2 of the programming guide, i believe)

Your code uses 8 threads total on the GPU. This will never be efficient… you typically want many thousands of threads in flight at once.
Try a better starting configuration, something like 128 threads per block with 1000 blocks.

Your code also has your 8 threads all repeating the exact same calculation… there’s a commented out line like you were thinking of having the threads do different work based on blockIdx.x and threadIdx.x, which is what you must do, or you’d be wasting nearly all your work.

Your code uses 8 threads total on the GPU. This will never be efficient… you typically want many thousands of threads in flight at once.
Try a better starting configuration, something like 128 threads per block with 1000 blocks.

Your code also has your 8 threads all repeating the exact same calculation… there’s a commented out line like you were thinking of having the threads do different work based on blockIdx.x and threadIdx.x, which is what you must do, or you’d be wasting nearly all your work.

Thanks for the replies. I need just a good solution for TSP, i try with a genetic algorithm to use much many thread if I able :P

Thanks for the replies. I need just a good solution for TSP, i try with a genetic algorithm to use much many thread if I able :P