Executable cuda code:
#include"stdio.h"
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include"math.h"
#include <ctype.h>
device int distD(int x,int y,int N,floatdt)
{
int id;
if(x>y){x=x+y;y=x-y;x=x-y;}
id=x(N-1)+(y-1)-(x*(x+1)/2);
return(dt[id]);
}
global void tsp(int *rt,int *dst,int *tid,int *city,float *dt)
{
int i,j,cost;
register int change=0;
register int cit=*city;
int sol=(cit)*(cit-1)/2;
int id=threadIdx.x+blockIdx.x*blockDim.x;
cost=*dst;
if(id<sol)
{
i=cit-2-floorf((sqrtf(8*(sol-id-1)+1)-1)/2);
j=id-i*(cit-1)+(i*(i+1)/2)+1;
change=distD(rt[i],rt[j],cit,dt)+distD(rt[(i+1)%cit],rt[(j+1)%cit],cit,dt)-distD(rt[i],rt[(i+1)%cit],cit,dt)-distD(rt[j],rt[(j+1)%cit],cit,dt);
cost+=change;
__syncthreads();
atomicMin(dst,cost);
__syncthreads();
if(cost== (*dst) && change < 0)
{
*tid=id;
}
}
}
int DistH(introute,float dist_mat,int N)
{
int i,j,k,id,cost=0;
for(k=0;k<N-1;k++)
{
i=route[k];
j=route[k+1];
if(i>j){i=i+j;j=i-j;i=i-j;}
id=id=i(N-1)+(j-1)-(i(i+1)/2);
cost+=dist_mat[id];
}
i=route[k];
j=route[0];
if(i>j){i=i+j;j=i-j;i=i-j;}
id=i*(N-1)+(j-1)-(i*(i+1)/2);
cost+=dist_mat[id];
return cost;
}
int *twoOpt(int x,int y,int *route,int city)
{
int *tmp;
tmp=(int *)malloc(sizeof(int )*city);
int i,j;
for (i = 0; i <=x; ++i)
{
tmp[i] = route[i];
}
for (i = x+1, j = y; i <= y; ++i, --j)
{
tmp[i] = route[j];
}
for (i = y+1; i < city; ++i)
{
tmp[i] = route[i];
}
return tmp;
}
int main(int argc, char *argv)
{
int ch, cnt, in1, cities;
float in2, in3;
FILE *f;
float *posx, *posy;
char str[256];
int *r, *route;
int dst,d,*d_dst,tid,*d_tid,*d_city;
int sol, x,y;
int blk,thrd;
clock_t start,end;
f = fopen(argv[1], "r");
if (f == NULL) {fprintf(stderr, "could not open file \n"); exit(-1);}
ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f);
ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f);
ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f);
ch = getc(f); while ((ch != EOF) && (ch != ':')) ch = getc(f);
fscanf(f, "%s\n", str);
cities = atoi(str);
if (cities <= 2) {fprintf(stderr, "only %d cities\n", cities); exit(-1);}
sol=cities*(cities-1)/2;
posx = (float *)malloc(sizeof(float) * cities); if (posx == NULL) {fprintf(stderr, "cannot allocate posx\n"); exit(-1);}
posy = (float *)malloc(sizeof(float) * cities); if (posy == NULL) {fprintf(stderr, "cannot allocate posy\n"); exit(-1);}
r = (int *)malloc(sizeof(int) * cities);
route = (int *)malloc(sizeof(int) * cities);
//posy = (float *)malloc(sizeof(float) * cities);
for(int i=0;i<cities;i++)
{r[i]=i;}
ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f);
fscanf(f, "%s\n", str);
if (strcmp(str, "NODE_COORD_SECTION") != 0) {fprintf(stderr, "wrong file format\n"); exit(-1);}
cnt = 0;
while (fscanf(f, "%d %f %f\n", &in1, &in2, &in3))
{
posx[cnt] = in2;
posy[cnt] = in3;
cnt++;
if (cnt > cities) {fprintf(stderr, "input too long\n"); exit(-1);}
if (cnt != in1) {fprintf(stderr, "input line mismatch: expected %d instead of %d\n", cnt, in1); exit(-1);}
}
float *dist_mat=(float*)malloc(sizeof(float)*sol);
int k=0;
for (int i = 0; i < cities; ++i)
{
for (int j = i+1; j < cities; ++j)
{
dist_mat[k] = sqrtf(pow(posx[i] - posx[j], 2)
+powf(posy[i] - posy[j], 2));
k++;
}
}
if (cnt != cities) {fprintf(stderr, "read %d instead of %d cities\n", cnt, cities); exit(-1);}
fscanf(f, "%s", str);
if (strcmp(str, "EOF") != 0) {fprintf(stderr, "didn't see 'EOF' at end of file\n"); exit(-1);}
if(sol<=50000)
{
blk=(sol-1)/512+1;
thrd=512;
}
else
{
blk=(sol-1)/1024+1;
thrd=1024;
}
start = clock();
dst=DistH(r,dist_mat,cities);
printf("\ninitial cost : %d\n",dst);
int *d_r;
float *d_mt;
if(cudaSuccess!=cudaMalloc((void**)&d_dst,sizeof(int)))printf("\nAllocating memory for dst on GPU");
if(cudaSuccess!=cudaMalloc((void**)&d_tid,sizeof(int)))printf("\nAllocating memory for thread id on GPU");
if(cudaSuccess!=cudaMalloc((void**)&d_city,sizeof(int)))printf("\nAllocating memory for thread id on GPU");
if(cudaSuccess!=cudaMalloc((void**)&d_mt,sizeof(float)*sol))printf("\nAllocating memory for thread id on GPU");
if(cudaSuccess!=cudaMalloc((void**)&d_r,sizeof(int)*cities))printf("\nAllocating memory for thread id on GPU");
if(cudaSuccess!=cudaMemcpy(d_dst,&dst,sizeof(int),cudaMemcpyHostToDevice))printf("\ntransfer on GPU");
if(cudaSuccess!=cudaMemcpy(d_city,&cities,sizeof(int),cudaMemcpyHostToDevice))printf("\ntransfer on GPU 1");
if(cudaSuccess!=cudaMemcpy(d_mt,dist_mat,sizeof(float)*sol,cudaMemcpyHostToDevice))printf("\ntransfer on GPU 1");
if(cudaSuccess!=cudaMemcpy(d_r,r,sizeof(int)*cities,cudaMemcpyHostToDevice))printf("\ntransfer on GPU 1");
tsp<<<blk,thrd>>>(d_r,d_dst,d_tid,d_city,d_mt);
if(cudaSuccess!=cudaMemcpy(&d,d_dst,sizeof(int),cudaMemcpyDeviceToHost))printf("\nCan't transfer minimal cost back to CPU");
if(cudaSuccess!=cudaMemcpy(&tid,d_tid,sizeof(int),cudaMemcpyDeviceToHost))printf("\nCan't transfer thread ID back to CPU");
x=cities-2-floorf((sqrtf(8*(sol-tid-1)+1)-1)/2);
y=tid-x*(cities-1)+(x*(x+1)/2)+1;
route=twoOpt(x,y,r,cities);
int count =0;
while( d < dst )
{
dst=d;
if(cudaSuccess!=cudaMemcpy(d_r,route,sizeof(int)*cities,cudaMemcpyHostToDevice))printf("\ntransfer on GPU 1");
if(cudaSuccess!=cudaMemcpy(d_dst,&dst,sizeof(int),cudaMemcpyHostToDevice))printf("\ntransfer on GPU");
tsp<<<blk,thrd>>>(d_r,d_dst,d_tid,d_city,d_mt);
if(cudaSuccess!=cudaMemcpy(&d,d_dst,sizeof(int),cudaMemcpyDeviceToHost))
printf("\nCan't transfer minimal cost back to CPU");
if(cudaSuccess!=cudaMemcpy(&tid,d_tid,sizeof(int),cudaMemcpyDeviceToHost))
printf("\nCan't transfer thread ID back to CPU");
x=cities-2-floorf((sqrtf(8*(sol-tid-1)+1)-1)/2);
y=tid-x*(cities-1)+(x*(x+1)/2)+1;
route=twoOpt(x,y,route,cities);
count++;
}
printf(“\nMinimal Distance :%d\n”,d);
printf(“\nnumber of time climbed %d\n”,count);
end = clock();
printf(“\ntime : %f\n”,((double) (end - start)) / CLOCKS_PER_SEC);
cudaFree(d_dst);
cudaFree(d_tid);
free(posx);
free(posy);
return 0;
}
This cuda code works fine till 300 TSPLIB instances on K40 GPU whereas on quadro 2000 GPU, I can run any TSPLIB instances. Once I go beyond 300 instance,I am getting strange results on K40.I compiles code along with -arch=sm_35 option and while running,I need to pass TSPLIB instance like ./a.out lin318.tsp. The link to collect TSPLIB instance is Teaching