Getting strange result difference executing code on K40 GPU with CUDA 8

I am getting strange results executing the same CUDA code on two different GPUs. First, I executed my code with CUDA 7.5 on a Quadro 2000 (Fermi architecture) and got the following results:

initial cost : 119771

Minimal Distance :45721

time : 0.164938

When I execute same code on cuda 8 with K40(kepler architecture) GPU. I am getting following result

initial cost : 119771

Minimal Distance :-382326

time : 1.730000

The minimal distance must never become negative. To trace the problem I ran memcheck and racecheck; no errors were found. I am clueless — please help me get to the bottom of this issue. Does the GPU micro-architecture affect the result of the calculation?

“Objection, your honor! Calls for speculation.” :-)

Could you post a minimal, complete, and verifiable example (http://stackoverflow.com/help/mcve) that demonstrates the issue?

Here is actual code:

// Kernel: evaluates every candidate 2-opt move of the current tour in
// parallel, one thread per (i, j) pair. pox/poy are city coordinates;
// distances are computed on the fly by distD.
// NOTE(review): `id` (the global thread index) is not defined in this
// snippet -- presumably a macro such as
//   #define id (blockIdx.x * blockDim.x + threadIdx.x)
// was lost when the code was pasted; confirm against the original source.
// WARNING: mixing atomicMin(dst, cost) with the direct read of *dst below
// is racy across blocks -- this is exactly the bug discussed later in the
// thread. `cost = *dst` is also read before other blocks finish updating.
__global__ void tsp(float *pox, float *poy, int *dst, int *tid, int *city)
{
    int i, j, cost;
    register int change = 0;
    register int cit = *city;            // number of cities
    int sol = (cit) * (cit - 1) / 2;     // number of candidate moves
    cost = *dst;                         // racy read of the running best
    if (id < sol)
    {
        // Invert the flat thread index into the (i, j) pair of positions.
        i = cit - 2 - floorf((sqrtf(8 * (sol - id - 1) + 1) - 1) / 2);
        j = id - i * (cit - 1) + (i * (i + 1) / 2) + 1;

        // Cost delta of reversing the tour segment between i and j.
        change = distD(i, j, pox, poy) + distD((i + 1) % cit, (j + 1) % cit, pox, poy)
               - distD(i, (i + 1) % cit, pox, poy) - distD(j, (j + 1) % cit, pox, poy);
        cost += change;
        atomicMin(dst, cost);
        // Racy: *dst may have been lowered by another thread in between.
        if (cost == (*dst) && change < 0)
        {
            *tid = id;
        }
    }
}

Here distD is a device function that computes the distance between indices i and j using the Euclidean distance formula; pox and poy are simply the x,y coordinates of the TSP instance. I call the kernel function repeatedly until the minimal distance is found.

Please note the “complete” part of MCVE. The code provided in #2 is not a complete program I can run to reproduce the difference in behavior. For better readability, it is a good idea to use the “code” markup provided by the forum software (right-most button on the edit menu) when posting code.

Executable cuda code:

#include"stdio.h"
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include"math.h"
#include <ctype.h>

// Device helper: look up the precomputed distance between cities x and y.
// dt is the upper-triangular distance matrix of N cities linearised row by
// row; the index formula maps the unordered pair (x, y), x < y, onto it.
// Note the float distance is truncated to int on return (integer costs
// are used throughout this program).
__device__ int distD(int x, int y, int N, float *dt)
{
    int id;
    if (x > y) { x = x + y; y = x - y; x = x - y; }  // swap so that x < y
    id = x * (N - 1) + (y - 1) - (x * (x + 1) / 2);
    return (dt[id]);
}

// Kernel: one thread per candidate 2-opt move of the current tour `rt`.
// dt is the linearised upper-triangular distance matrix (see distD).
// NOTE(review): `id` (the global thread index) is not defined in this
// posting -- presumably a macro such as
//   #define id (blockIdx.x * blockDim.x + threadIdx.x)
// was lost in transcription; confirm against the original source.
// WARNING: atomicMin(dst, cost) mixed with the direct read *dst below is
// racy across blocks (the bug this thread is about): *dst can change
// between the atomic and the comparison, and `cost = *dst` is read before
// other blocks have finished updating it.
__global__ void tsp(int *rt, int *dst, int *tid, int *city, float *dt)
{
    int i, j, cost;
    register int change = 0;
    register int cit = *city;            // number of cities
    int sol = (cit) * (cit - 1) / 2;     // number of candidate moves
    cost = *dst;                         // racy read of the running best
    if (id < sol)
    {
        // Invert the flat thread index into the (i, j) pair of positions.
        i = cit - 2 - floorf((sqrtf(8 * (sol - id - 1) + 1) - 1) / 2);
        j = id - i * (cit - 1) + (i * (i + 1) / 2) + 1;

        // Cost delta of reversing the tour segment between i and j.
        change = distD(rt[i], rt[j], cit, dt) + distD(rt[(i + 1) % cit], rt[(j + 1) % cit], cit, dt)
               - distD(rt[i], rt[(i + 1) % cit], cit, dt) - distD(rt[j], rt[(j + 1) % cit], cit, dt);
        cost += change;
        atomicMin(dst, cost);
        // Racy: another thread may have lowered *dst after our atomicMin.
        if (cost == (*dst) && change < 0)
        {
            *tid = id;
        }
    }
}

/*
 * Host-side tour cost: sums the distances along `route` and closes the
 * loop back to route[0]. N is the number of cities; dist_mat is the
 * linearised upper-triangular distance matrix (same layout as distD).
 * Note: float distances are accumulated into an int, truncating fractions.
 */
int DistH(int *route, float *dist_mat, int N)
{
    int i, j, k, id, cost = 0;
    for (k = 0; k < N - 1; k++)
    {
        i = route[k];
        j = route[k + 1];
        if (i > j) { i = i + j; j = i - j; i = i - j; }  /* swap so i < j */
        id = i * (N - 1) + (j - 1) - (i * (i + 1) / 2);
        cost += dist_mat[id];
    }
    /* Closing edge: last city back to the first. */
    i = route[k];
    j = route[0];
    if (i > j) { i = i + j; j = i - j; i = i - j; }
    id = i * (N - 1) + (j - 1) - (i * (i + 1) / 2);
    cost += dist_mat[id];
    return cost;
}

/*
 * Return a freshly allocated tour equal to `route` with the segment
 * (x, y] reversed -- the classic 2-opt reconnection. The caller owns
 * (and must free) the returned buffer.
 */
int *twoOpt(int x, int y, int *route, int city)
{
    int *out = (int *)malloc(sizeof(int) * city);
    int k;

    /* Prefix [0, x] is copied unchanged. */
    for (k = 0; k <= x; ++k)
        out[k] = route[k];

    /* Segment (x, y] is written in reverse order. */
    for (k = x + 1; k <= y; ++k)
        out[k] = route[y - (k - x - 1)];

    /* Suffix (y, city) is copied unchanged. */
    for (k = y + 1; k < city; ++k)
        out[k] = route[k];

    return out;
}

/*
 * Host driver: reads a TSPLIB instance, builds the distance matrix, then
 * repeatedly launches the tsp kernel and applies the best 2-opt move
 * found until no further improvement is reported.
 *
 * NOTE(review): the kernel's atomicMin on *d_dst combined with direct
 * device reads of *d_dst is racy across blocks -- that is the bug this
 * thread discusses; the driver itself is kept structurally as posted.
 */
int main(int argc, char *argv[])
{
    int ch, cnt, in1, cities;
    float in2, in3;
    FILE *f;
    float *posx, *posy;
    char str[256];
    int *r, *route;
    int dst, d, *d_dst, tid, *d_tid, *d_city;
    int sol, x, y;
    int blk, thrd;

    clock_t start, end;

    if (argc < 2) { fprintf(stderr, "usage: %s <tsplib-file>\n", argv[0]); exit(-1); }

    f = fopen(argv[1], "r");
    if (f == NULL) { fprintf(stderr, "could not open file \n"); exit(-1); }

    /* Skip the first three header lines of the TSPLIB file. */
    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);
    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);
    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);

    /* DIMENSION : <number of cities> */
    ch = getc(f);  while ((ch != EOF) && (ch != ':')) ch = getc(f);
    fscanf(f, "%s\n", str);
    cities = atoi(str);
    if (cities <= 2) { fprintf(stderr, "only %d cities\n", cities); exit(-1); }

    sol = cities * (cities - 1) / 2;   /* number of city pairs = candidate moves */
    posx = (float *)malloc(sizeof(float) * cities);  if (posx == NULL) { fprintf(stderr, "cannot allocate posx\n"); exit(-1); }
    posy = (float *)malloc(sizeof(float) * cities);  if (posy == NULL) { fprintf(stderr, "cannot allocate posy\n"); exit(-1); }
    r = (int *)malloc(sizeof(int) * cities);
    route = (int *)malloc(sizeof(int) * cities);
    for (int i = 0; i < cities; i++)   /* initial tour: 0,1,...,n-1 */
    { r[i] = i; }

    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);
    fscanf(f, "%s\n", str);
    if (strcmp(str, "NODE_COORD_SECTION") != 0) { fprintf(stderr, "wrong file format\n"); exit(-1); }

    cnt = 0;

    /* fscanf stops matching at the trailing "EOF" line, ending the loop. */
    while (fscanf(f, "%d %f %f\n", &in1, &in2, &in3))
    {
        posx[cnt] = in2;
        posy[cnt] = in3;
        cnt++;
        if (cnt > cities) { fprintf(stderr, "input too long\n"); exit(-1); }
        if (cnt != in1) { fprintf(stderr, "input line mismatch: expected %d instead of %d\n", cnt, in1); exit(-1); }
    }
    /* Linearised upper-triangular distance matrix (layout shared with distD). */
    float *dist_mat = (float *)malloc(sizeof(float) * sol);
    int k = 0;
    for (int i = 0; i < cities; ++i)
    {
        for (int j = i + 1; j < cities; ++j)
        {
            /* powf (not pow) keeps the computation in single precision. */
            dist_mat[k] = sqrtf(powf(posx[i] - posx[j], 2)
                          + powf(posy[i] - posy[j], 2));
            k++;
        }
    }
    if (cnt != cities) { fprintf(stderr, "read %d instead of %d cities\n", cnt, cities); exit(-1); }
    fscanf(f, "%s", str);
    if (strcmp(str, "EOF") != 0) { fprintf(stderr, "didn't see 'EOF' at end of file\n"); exit(-1); }
    fclose(f);

    /* One thread per candidate move; cap threads per block. */
    if (sol <= 50000)
    {
        blk = (sol - 1) / 512 + 1;
        thrd = 512;
    }
    else
    {
        blk = (sol - 1) / 1024 + 1;
        thrd = 1024;
    }
    start = clock();
    dst = DistH(r, dist_mat, cities);
    printf("\ninitial cost : %d\n", dst);
    int *d_r;
    float *d_mt;
    /* NOTE(review): these diagnostics fire when the call FAILS. */
    if (cudaSuccess != cudaMalloc((void**)&d_dst, sizeof(int))) printf("\nAllocating memory for dst on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_tid, sizeof(int))) printf("\nAllocating memory for thread id on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_city, sizeof(int))) printf("\nAllocating memory for city count on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_mt, sizeof(float) * sol)) printf("\nAllocating memory for distance matrix on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_r, sizeof(int) * cities)) printf("\nAllocating memory for route on GPU");

    /* The kernel only writes *d_tid when an improving move exists; zero it
       so the first host read never sees uninitialized device memory. */
    if (cudaSuccess != cudaMemset(d_tid, 0, sizeof(int))) printf("\ninitializing tid on GPU");

    if (cudaSuccess != cudaMemcpy(d_dst, &dst, sizeof(int), cudaMemcpyHostToDevice)) printf("\ntransfer on GPU");
    if (cudaSuccess != cudaMemcpy(d_city, &cities, sizeof(int), cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");
    if (cudaSuccess != cudaMemcpy(d_mt, dist_mat, sizeof(float) * sol, cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");
    if (cudaSuccess != cudaMemcpy(d_r, r, sizeof(int) * cities, cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");

    tsp<<<blk, thrd>>>(d_r, d_dst, d_tid, d_city, d_mt);

    if (cudaSuccess != cudaMemcpy(&d, d_dst, sizeof(int), cudaMemcpyDeviceToHost)) printf("\nCan't transfer minimal cost back to CPU");
    if (cudaSuccess != cudaMemcpy(&tid, d_tid, sizeof(int), cudaMemcpyDeviceToHost)) printf("\nCan't transfer thread ID back to CPU");
    /* Recover the (x, y) pair of the winning move from its thread id. */
    x = cities - 2 - floorf((sqrtf(8 * (sol - tid - 1) + 1) - 1) / 2);
    y = tid - x * (cities - 1) + (x * (x + 1) / 2) + 1;
    if (d < dst)                       /* only apply a move if one was found */
    {
        int *nr = twoOpt(x, y, r, cities);
        free(route);                   /* drop the unused initial buffer */
        route = nr;
    }
    int count = 0;

    /* Hill-climb: repeat while the kernel keeps finding a lower cost. */
    while (d < dst)
    {
        dst = d;
        if (cudaSuccess != cudaMemcpy(d_r, route, sizeof(int) * cities, cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");
        if (cudaSuccess != cudaMemcpy(d_dst, &dst, sizeof(int), cudaMemcpyHostToDevice)) printf("\ntransfer on GPU");

        tsp<<<blk, thrd>>>(d_r, d_dst, d_tid, d_city, d_mt);

        if (cudaSuccess != cudaMemcpy(&d, d_dst, sizeof(int), cudaMemcpyDeviceToHost))
            printf("\nCan't transfer minimal cost back to CPU");
        if (cudaSuccess != cudaMemcpy(&tid, d_tid, sizeof(int), cudaMemcpyDeviceToHost))
            printf("\nCan't transfer thread ID back to CPU");
        x = cities - 2 - floorf((sqrtf(8 * (sol - tid - 1) + 1) - 1) / 2);
        y = tid - x * (cities - 1) + (x * (x + 1) / 2) + 1;
        if (d < dst)                   /* tid is stale when nothing improved */
        {
            int *nr = twoOpt(x, y, route, cities);
            free(route);               /* fix leak: release superseded tour */
            route = nr;
        }

        count++;
    }

    printf("\nMinimal Distance :%d\n", d);
    printf("\nnumber of time climbed %d\n", count);
    end = clock();
    printf("\ntime : %f\n", ((double)(end - start)) / CLOCKS_PER_SEC);

    cudaFree(d_dst);
    cudaFree(d_tid);
    cudaFree(d_city);
    cudaFree(d_mt);
    cudaFree(d_r);
    free(posx);
    free(posy);
    free(r);
    free(route);
    free(dist_mat);
    return 0;
}

This CUDA code works fine up to 300-city TSPLIB instances on the K40 GPU, whereas on the Quadro 2000 I can run any TSPLIB instance. Once I go beyond 300 cities, I get strange results on the K40. I compile the code with the -arch=sm_35 option, and at run time a TSPLIB instance must be passed, e.g. ./a.out lin318.tsp. TSPLIB instances can be downloaded from http://comopt.ifi.uni-heidelberg.de/software/TSPLIB95/tsp/

I would not call this a “minimal” example, it seems you simply dumped your code in the current state here. When I run the code with lin318.tsp on a Quadro K2200, I get different outputs on consecutive runs:

``````C:\Users\Norbert\My Programs>tsp lin318.tsp
initial cost : 119771
Minimal Distance :-1043842
number of time climbed 223
time : 0.187000

C:\Users\Norbert\My Programs>tsp lin318.tsp
initial cost : 119771
Minimal Distance :-991709
number of time climbed 207
time : 0.179000

C:\Users\Norbert\My Programs>tsp lin318.tsp
initial cost : 119771
Minimal Distance :-1048894
number of time climbed 214
time : 0.201000
``````

I get different outputs still if I use a debug build instead of a release build. Likely causes: (1) Incorrect data access picking up uninitialized data (2) Race condition. Given that every run seems to produce a different, random, result, I am inclined to think there is definitely a race condition in the code. Negative distances could be due to integer overflow in addition to picking up incorrect data.

Assuming distances can never be negative at any point, one way of finding out where things go wrong may be to stop execution the moment a negative distance is observed, and work backwards from there.

I don’t know how your test cases differ other than by size, so I would be hesitant to state a hypothesis that the failures are related to problem size. For your own sanity during debugging, you would definitely want to pare down the code to a minimum, and use the smallest data set that reproduces the issue.

By the way, what is the correct result for the lin318.tsp data set? Is there a list of reference results somewhere?

Sir, this code works fine. Until now, I have executed several TSP instances correctly on the Quadro 2000, such as pcb442, rat783, pcb3038, and fnl4461. But when I execute the same code with the same instances on the K40, I get negative results.
The optimal cost of lin318.tsp is 42029. As I used an approximation method, I get a near-optimal solution, i.e. 45721.

pramod@pramod:~/Desktop/cuda/TSP2.0\$ ./a.out lin318.tsp

initial cost : 119771

Minimal Distance :45721

number of time climbed 187

time : 0.156213
pramod@pramod:~/Desktop/cuda/TSP2.0\$ ./a.out rat783.tsp

initial cost : 72130

Minimal Distance :9477

number of time climbed 691

time : 2.988084
pramod@pramod:~/Desktop/cuda/TSP2.0\$ ./a.out rl1304.tsp

initial cost : 3231515

Minimal Distance :278666

number of time climbed 1024

time : 12.720769

pramod@pramod:~/Desktop/cuda/TSP2.0\$ ./a.out lin105.tsp

initial cost : 36446

Minimal Distance :14941

number of time climbed 65

time : 0.024524

When I execute the same code on the K40 GPU, I get the following results:
[sparklab@localhost TSP2.0]\$ ./a.out lin318.tsp

initial cost : 119771

Minimal Distance :-289890

number of time climbed 195

time : 0.930000
[sparklab@localhost TSP2.0]\$ ./a.out rat783.tsp

initial cost : 72130

Minimal Distance :-3193718

number of time climbed 968

time : 1.090000
[sparklab@localhost TSP2.0]\$ ./a.out rl1304.tsp

initial cost : 3231515

Minimal Distance :-841225202

number of time climbed 2860

time : 2.330000

[sparklab@localhost TSP2.0]\$ ./a.out lin105.tsp

initial cost : 36446

Minimal Distance :14941

number of time climbed 65

time : 0.930000
In the above results, it can be observed that the lin105.tsp instance returns the near-optimal minimal distance correctly on both the K40 and the Quadro 2000 (there may be slight variation in the minimal distance since I used an approximation method). But when solving the lin318.tsp instance, the K40 gives unexpected results whereas the Quadro 2000 works fine.

The following link has optimal distances of TSPLIB instances.
http://comopt.ifi.uni-heidelberg.de/software/TSPLIB95/STSP.html

From what I have seen so far, my hypothesis is that your code “happened to work”: that is, it worked by luck (e.g. implementation artifacts), not by design, on the Quadro 2000. In other words, I think the difference in program behavior with different GPUs that you observed is a bit of a red herring.

I have a Quadro 2000 available to try, but am not currently able to install it as I am running four rather long-running jobs (should be finished some time next week).

Is there a possibility that your code is correct by design, but the CUDA tool chain is broken? Yes, but it seems extremely slim at present. With debug builds, all optimizations are turned off (in addition, I specified -Xptxas -O0 to turn off all optimizations in the machine-specific code generation) and yet I still see random results with every run. All code generation bugs I have encountered with the CUDA tool chain (quite a few over the years) were related to optimizing code transformations, I don’t recall any affecting non-optimized code.

[Later:] I have now built the code (debug and release) with the CUDA 6.5 as well as the CUDA 7.5 tool chains, and still see random negative results when running `tsp lin318.tsp`.

I would suggest critically examining the premise that the code is correct because it used to deliver correct results on one platform. If you plan to file a bug with NVIDIA, you would have to present some plausible evidence that points in the direction of a bug in any of NVIDIA’s software components.

I’d strongly support njuffa’s suspicion that the code might produce correct results for smaller problems by luck rather than by design:

Your use of atomicMin(&dst,…), mixed with direct accesses to dst, is unsafe, and the __syncthreads() calls in between are not sufficient to fix that if you have more than one block.

Your code would (probably) work if __syncthreads() were a grid-wide barrier, rather than per-block. However no such per-grid synchronization methods exist in the CUDA programming model.

Your code happens to work for small problem sizes, because blocks will initially stay approximate in sync purely by means of similar execution speed. However they will inevitably diverge after a while, by which time your code will be comparing distances for salespersons that have travelled different numbers of cities.

The CUDA Free Bug Fixing Service™ is back with a version of your code quickly hacked to remove the obvious bugs. I presume the expected result for the lin318 problem is a minimal distance of 46069?

``````#include"stdio.h"
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include"math.h"
#include <ctype.h>

// Look up the precomputed distance between cities x and y in the
// linearised upper-triangular matrix dt (N cities total). The float
// distance is truncated to int, matching the integer cost arithmetic
// used throughout this program.
__device__ int distD(int x, int y, int N, float *dt)
{
    if (x > y) {                       // canonicalise so that x < y
        int t = x; x = y; y = t;
    }
    int idx = x * (N - 1) + (y - 1) - (x * (x + 1) / 2);
    return (int)dt[idx];
}

// Kernel (fixed version): one thread per candidate 2-opt move.
// The current tour cost is passed BY VALUE, eliminating the racy device
// read of the running minimum that the original code had. Each improving
// thread packs its candidate cost and thread index into one 64-bit word
// (cost in the high 32 bits, id in the low 32 bits) and applies a single
// atomicMin, so the winning word simultaneously carries the smallest cost
// and the move that produced it. 64-bit atomicMin requires compute
// capability 3.5+ (fine on the K40 this thread targets).
// NOTE(review): `id` (the global thread index) is not defined in this
// posting -- presumably a macro lost in transcription; confirm against
// the original source.
// NOTE(review): parameter `tid` is no longer written by this kernel; it
// is kept only so the launch signature stays close to the original.
__global__ void tsp(int *rt,int cost,unsigned long long *dst_tid,int *tid,int *city,float *dt)
{

int i,j;
register int change=0;
register int cit=*city;   // number of cities
int sol=(cit)*(cit-1)/2;  // number of candidate moves
if(id<sol)
{
// Invert the flat thread index into the (i, j) pair of tour positions.
i=cit-2-floorf((sqrtf(8*(sol-id-1)+1)-1)/2);
j=id-i*(cit-1)+(i*(i+1)/2)+1;

// Cost delta of reversing the tour segment between positions i and j.
change=distD(rt[i],rt[j],cit,dt)+distD(rt[(i+1)%cit],rt[(j+1)%cit],cit,dt)-distD(rt[i],rt[(i+1)%cit],cit,dt)-distD(rt[j],rt[(j+1)%cit],cit,dt);
cost+=change;
// Only improving moves compete; pack (cost, id) and take the global min.
if (change < 0)
atomicMin(dst_tid, ((unsigned long long)cost << 32) | id);
}

}

/*
 * Host-side tour cost: sums the distances along `route` and closes the
 * loop back to route[0]. N is the number of cities; dist_mat is the
 * linearised upper-triangular distance matrix (same layout as distD).
 * Fixes the harmless but sloppy duplicate assignment `id=id=...` of the
 * posted version. Note: float distances are accumulated into an int,
 * truncating fractions.
 */
int DistH(int *route, float *dist_mat, int N)
{
    int i, j, k, id, cost = 0;
    for (k = 0; k < N - 1; k++)
    {
        i = route[k];
        j = route[k + 1];
        if (i > j) { i = i + j; j = i - j; i = i - j; }  /* swap so i < j */
        id = i * (N - 1) + (j - 1) - (i * (i + 1) / 2);
        cost += dist_mat[id];
    }
    /* Closing edge: last city back to the first. */
    i = route[k];
    j = route[0];
    if (i > j) { i = i + j; j = i - j; i = i - j; }
    id = i * (N - 1) + (j - 1) - (i * (i + 1) / 2);
    cost += dist_mat[id];
    return cost;
}

/*
 * Produce a new tour that equals `route` with the segment (x, y]
 * reversed -- the standard 2-opt reconnection. The returned buffer is
 * heap-allocated; ownership passes to the caller.
 */
int *twoOpt(int x, int y, int *route, int city)
{
    int *rev = (int *)malloc(sizeof(int) * city);
    int a, b;

    /* Unchanged prefix [0, x]. */
    memcpy(rev, route, sizeof(int) * (x + 1));

    /* Reversed middle segment: rev[x+1..y] = route[y..x+1]. */
    for (a = x + 1, b = y; a <= y; ++a, --b)
        rev[a] = route[b];

    /* Unchanged suffix (y, city). */
    for (a = y + 1; a < city; ++a)
        rev[a] = route[a];

    return rev;
}

/*
 * Host driver (fixed version): reads a TSPLIB instance, builds the
 * distance matrix, and repeatedly launches the tsp kernel. The kernel
 * reports the best improving move as a packed 64-bit (cost, thread-id)
 * word, which this driver unpacks and applies via twoOpt until no
 * further improvement is found.
 */
int main(int argc, char *argv[])
{
    int ch, cnt, in1, cities;
    float in2, in3;
    FILE *f;
    float *posx, *posy;
    char str[256];
    int *r, *route;
    int dst, d, tid, *d_tid, *d_city;
    unsigned long long *d_dst_tid;
    int sol, x, y;
    int blk, thrd;

    clock_t start, end;

    if (argc < 2) { fprintf(stderr, "usage: %s <tsplib-file>\n", argv[0]); exit(-1); }

    f = fopen(argv[1], "r");
    if (f == NULL) { fprintf(stderr, "could not open file \n"); exit(-1); }

    /* Skip the first three header lines of the TSPLIB file. */
    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);
    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);
    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);

    /* DIMENSION : <number of cities> */
    ch = getc(f);  while ((ch != EOF) && (ch != ':')) ch = getc(f);
    fscanf(f, "%s\n", str);
    cities = atoi(str);
    if (cities <= 2) { fprintf(stderr, "only %d cities\n", cities); exit(-1); }

    sol = cities * (cities - 1) / 2;   /* number of city pairs = candidate moves */
    posx = (float *)malloc(sizeof(float) * cities);  if (posx == NULL) { fprintf(stderr, "cannot allocate posx\n"); exit(-1); }
    posy = (float *)malloc(sizeof(float) * cities);  if (posy == NULL) { fprintf(stderr, "cannot allocate posy\n"); exit(-1); }
    r = (int *)malloc(sizeof(int) * cities);
    route = (int *)malloc(sizeof(int) * cities);
    for (int i = 0; i < cities; i++)   /* initial tour: 0,1,...,n-1 */
    { r[i] = i; }

    ch = getc(f);  while ((ch != EOF) && (ch != '\n')) ch = getc(f);
    fscanf(f, "%s\n", str);
    if (strcmp(str, "NODE_COORD_SECTION") != 0) { fprintf(stderr, "wrong file format\n"); exit(-1); }

    cnt = 0;

    /* fscanf stops matching at the trailing "EOF" line, ending the loop. */
    while (fscanf(f, "%d %f %f\n", &in1, &in2, &in3))
    {
        posx[cnt] = in2;
        posy[cnt] = in3;
        cnt++;
        if (cnt > cities) { fprintf(stderr, "input too long\n"); exit(-1); }
        if (cnt != in1) { fprintf(stderr, "input line mismatch: expected %d instead of %d\n", cnt, in1); exit(-1); }
    }
    /* Linearised upper-triangular distance matrix (layout shared with distD). */
    float *dist_mat = (float *)malloc(sizeof(float) * sol);
    int k = 0;
    for (int i = 0; i < cities; ++i)
    {
        for (int j = i + 1; j < cities; ++j)
        {
            /* powf (not pow) keeps the computation in single precision. */
            dist_mat[k] = sqrtf(powf(posx[i] - posx[j], 2)
                          + powf(posy[i] - posy[j], 2));
            k++;
        }
    }
    if (cnt != cities) { fprintf(stderr, "read %d instead of %d cities\n", cnt, cities); exit(-1); }
    fscanf(f, "%s", str);
    if (strcmp(str, "EOF") != 0) { fprintf(stderr, "didn't see 'EOF' at end of file\n"); exit(-1); }
    fclose(f);

    /* One thread per candidate move; cap threads per block. */
    if (sol <= 50000)
    {
        blk = (sol - 1) / 512 + 1;
        thrd = 512;
    }
    else
    {
        blk = (sol - 1) / 1024 + 1;
        thrd = 1024;
    }
    start = clock();
    dst = DistH(r, dist_mat, cities);
    /* Sentinel word: current cost in the high 32 bits, all-ones thread id
       in the low 32 bits, so any improving (cost, id) word compares
       smaller under atomicMin. The shift must be done in a 64-bit type:
       `long` is only 32 bits on LLP64 platforms (e.g. Windows), where
       `((long)dst+1) << 32` is undefined behavior. */
    unsigned long long dst_tid = (((unsigned long long)dst + 1) << 32) - 1;
    unsigned long long dtid;
    printf("\ninitial cost : %d\n", dst);
    int *d_r;
    float *d_mt;
    /* NOTE(review): these diagnostics fire when the call FAILS. */
    if (cudaSuccess != cudaMalloc((void**)&d_dst_tid, sizeof(unsigned long long))) printf("\nAllocating memory for dst_tid on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_tid, sizeof(int))) printf("\nAllocating memory for thread id on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_city, sizeof(int))) printf("\nAllocating memory for city count on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_mt, sizeof(float) * sol)) printf("\nAllocating memory for distance matrix on GPU");
    if (cudaSuccess != cudaMalloc((void**)&d_r, sizeof(int) * cities)) printf("\nAllocating memory for route on GPU");

    if (cudaSuccess != cudaMemcpy(d_dst_tid, &dst_tid, sizeof(unsigned long long), cudaMemcpyHostToDevice)) printf("\ntransfer on GPU");
    if (cudaSuccess != cudaMemcpy(d_city, &cities, sizeof(int), cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");
    if (cudaSuccess != cudaMemcpy(d_mt, dist_mat, sizeof(float) * sol, cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");
    if (cudaSuccess != cudaMemcpy(d_r, r, sizeof(int) * cities, cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");

    tsp<<<blk, thrd>>>(d_r, dst, d_dst_tid, d_tid, d_city, d_mt);

    if (cudaSuccess != cudaMemcpy(&dtid, d_dst_tid, sizeof(unsigned long long), cudaMemcpyDeviceToHost)) printf("\nCan't transfer minimal cost back to CPU");

    d = dtid >> 32;                    /* best cost found this launch */
    tid = dtid & ((1ull << 32) - 1);   /* thread id of the winning move */
    x = cities - 2 - floorf((sqrtf(8 * (sol - tid - 1) + 1) - 1) / 2);
    y = tid - x * (cities - 1) + (x * (x + 1) / 2) + 1;
    /* When no move improved, tid is the all-ones sentinel and (x, y) are
       garbage -- calling twoOpt then would write far out of bounds. */
    if (d < dst)
    {
        int *nr = twoOpt(x, y, r, cities);
        free(route);                   /* drop the unused initial buffer */
        route = nr;
    }
    int count = 0;

    /* Hill-climb: repeat while the kernel keeps finding a lower cost. */
    while (d < dst)
    {
        dst = d;
        unsigned long long next_dst_tid = (((unsigned long long)dst + 1) << 32) - 1;
        if (cudaSuccess != cudaMemcpy(d_r, route, sizeof(int) * cities, cudaMemcpyHostToDevice)) printf("\ntransfer on GPU 1");
        if (cudaSuccess != cudaMemcpy(d_dst_tid, &next_dst_tid, sizeof(unsigned long long), cudaMemcpyHostToDevice)) printf("\ntransfer on GPU");

        tsp<<<blk, thrd>>>(d_r, dst, d_dst_tid, d_tid, d_city, d_mt);

        if (cudaSuccess != cudaMemcpy(&dtid, d_dst_tid, sizeof(unsigned long long), cudaMemcpyDeviceToHost))
            printf("\nCan't transfer minimal cost back to CPU");
        d = dtid >> 32;
        tid = dtid & ((1ull << 32) - 1);
        x = cities - 2 - floorf((sqrtf(8 * (sol - tid - 1) + 1) - 1) / 2);
        y = tid - x * (cities - 1) + (x * (x + 1) / 2) + 1;
        if (d < dst)                   /* only apply a real improving move */
        {
            int *nr = twoOpt(x, y, route, cities);
            free(route);               /* fix leak: release superseded tour */
            route = nr;
        }

        count++;
    }

    printf("\nMinimal Distance :%d\n", d);
    printf("\nnumber of time climbed %d\n", count);
    end = clock();
    printf("\ntime : %f\n", ((double)(end - start)) / CLOCKS_PER_SEC);

    cudaFree(d_dst_tid);
    cudaFree(d_tid);
    cudaFree(d_city);
    cudaFree(d_mt);
    cudaFree(d_r);
    free(posx);
    free(posy);
    free(r);
    free(route);
    free(dist_mat);
    return 0;
}
``````

P.S.: Norbert, you can leave your Quadro 2000 in the cupboard. :)

The list of reference results pointed to earlier says:

lin318 : 42029

If you are offering a Free Bug Fixing Service™, be prepared to attract many more programmers looking to have their bugs fixed :-)

At Stackoverflow the vast majority of questions related to CUDA now seem to be of the “my installation is broken” and “my code doesn’t work” variety. As a consequence I rarely look at those questions anymore. I see these forums heading down the same path …

Unfortunately I see it the same way. It was more fun a couple of years ago.

To be honest, I had originally passed over this post completely for not bothering to format the code (not even after your hint with detailed instructions). Only when I saw you putting in some effort and even offering to literally open up your computer, I had a look at the code.

I have to admit I have not tried to figure out how the algorithm is supposed to work. I have removed the original “racy” read of the minimal distance so far and replaced it with passing in the value at the time of the kernel launch. While that is one step to make the results reproducible, it might lead to better solutions being missed that otherwise may be found with a bit of luck.

P.S.: The Free Bug Fixing Service™ does not come with any guarantee of a specific service level, other than “at some times, there may or may not be some service that may or may not be helpful. In fact, it may even be unhelpful.”

Thanks to njuffa and tera for your valuable suggestion.I will thoroughly trace code with suggested things prior to ask any doubt.