Hello, does anyone have a program on the TSP written in CUDA to give me?
I’ve written it, but it is too slow compared to the CPU, and I was faster than the CPU.
This is my Code:
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <errno.h>
#define MAXN 12
#define FNAME "file.txt"
#define INF 1e20
int c1[MAXN];
int c2[MAXN];
int n;
__shared__ float minsum;
__shared__ int nc;
__shared__ int p[MAXN];
__shared__ float mat[MAXN][MAXN];
int *a_h, *b_h; // pointers to host memory
int *a_d, *b_d; // pointers to device memory
float *s_h;
float *s_d;
void getgeom(int z)
{ FILE *fp;
fp = fopen(FNAME, "r");
for (n = 0; n <= z; n++) {
fscanf(fp, "%d %d", &c1[n], &c2[n]);
}
}
__device__ float sqr(float x)
{ return x*x;
}
__device__ float distanza(int i, int j,int* a_d, int* b_d)
{
return (sqrt(sqr(a_d[i]-a_d[j])+sqr(b_d[i]-b_d[j])));
}
__device__ void save(float sum, float* s_d)
{
if (sum < minsum) {
minsum = sum;
for (int i = 0; i < nc; i++) {
s_d[i+1] = p[i];
}
s_d[0]=minsum;
//__syncthreads();
}
}
/* ------------------------------------------------------------*/
__device__ void swap(int i, int j)
{ int t = p[i];
p[i] = p[j];
p[j] = t;
//__syncthreads();
}
__device__ void check(float sum, float* s_d)
{ sum += mat[p[0]][p[nc-1]];
//__syncthreads();
save(sum,s_d);
}
__device__ void search11(int m, float sum, float* s_d)
{
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
}
}
__device__ void search10(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search11(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search9(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search10(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search8(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search9(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search7(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search8(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search6(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search7(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search5(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search6(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search4(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search5(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search3(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search4(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search2(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search3(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__device__ void search(int m, float sum, float* s_d)
{ int i;
if (m == 1) {
check(sum + mat[p[0]][p[1]],s_d);
} else {
for (i = m-1; i >= 0; i--) {
swap(i, m-1);
search2(m-1, sum + mat[p[m-1]][p[m]],s_d);
swap(i, m-1);
}
}
}
__global__ void VecAdd(int* a_d, int* b_d, float* s_d, int n) {
//int i = blockDim.x * blockIdx.x + threadIdx.x;
nc=n;
minsum = INF;
for (int i=0; i<nc; i++) {
p[i] = i;
for (int j=0; j<nc; j++) {
mat[i][j] = distanza(i,j,a_d,b_d);
}
}
search(nc-1,0,s_d);
}
/* ------------------------------------------------------------*/
int main(int argc, char** argv)
{ int i, start;
float secs;
if (argc != 2){
fprintf(stderr,"Numero parametri errato. Digitare il numero di citta.\n");
exit(1);
}
n = atoi(argv[1]);
if (errno != 0 || n < 2 || n > MAXN) {
fprintf(stderr,"Numero citta non corretto; mettere un numero da 2 a %d.\n", MAXN);
exit(2);
}
getgeom(n-1);
printf("-------------------------------------\n");
printf(" Travel Salesman problem \n");
printf("\n");
printf(" Num. Citta = %d \n", n);
printf("-------------------------------------\n");
// allocate arrays on host
a_h = (int *)malloc(sizeof(int)*MAXN);
b_h = (int *)malloc(sizeof(int)*MAXN);
s_h = (float *)malloc(sizeof(float)*(MAXN+1));
for (i=0; i<MAXN; i++) {
a_h[i] = c1[i];
b_h[i] = c2[i];
s_h[i] = 0;
}
s_h[MAXN] = 0;
// allocate arrays on device
cudaMalloc((void **) &a_d, sizeof(int)*MAXN);
cudaMalloc((void **) &b_d, sizeof(int)*MAXN);
cudaMalloc((void **) &s_d, sizeof(float)*(MAXN+1));
// send data from host to device
cudaMemcpy(a_d, a_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, sizeof(int)*MAXN, cudaMemcpyHostToDevice);
cudaMemcpy(s_d, s_h, sizeof(float)*(MAXN+1), cudaMemcpyHostToDevice);
start = clock();
int threadsPerBlock = 1;
int blocksPerGrid = 8;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, s_d, n);
// retrieve data from device
//cudaMemcpy(a_h, a_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);
//cudaMemcpy(b_h, b_d, sizeof(int)*MAXN, cudaMemcpyDeviceToHost);
cudaMemcpy(s_h, s_d, sizeof(float)*(MAXN+1), cudaMemcpyDeviceToHost);
secs = ((float) clock() - (float) start) / (float) CLOCKS_PER_SEC;
//printf("-------------------------------------\n");
printf("sequenza originaria \n");
for (i = 0; i < n; i++) {
printf(" %d: %d %d \n", i+1, c1[i], c2[i]);
}
printf("-------------------------------------\n");
printf(" tempo (sec) = %f \n", secs);
printf(" distanza = %f \n", s_h[0]);
printf("-------------------------------------\n");
printf("sequenza minima \n");
for (i = 0; i < n; i++) {
int a = (int) (s_h[i+1]+1);
printf(" %d: %d %d \n", a, c1[a-1], c2[a-1]);
}
printf("-------------------------------------\n");
// clean
free(a_h); free(b_h); free(s_h);
cudaFree(a_d); cudaFree(b_d); cudaFree(s_d);
}
Tips on how to improve it?
Thank You :)