TSP Brute Force

Hi,

I’m currently developing a code to solve TSP using brute force. What I do in the code is assign each thread calculate the “tid” permutation and after that, calculate the total distance of the cities included in the calculated permutation. For example if I have 3 cities, total permutations are 6. So starting the index from 0, tid=5 will calculate the last lexicographic permutation which is 3 2 1. I’ve developed a code which seems to run correcty for number of cities smaller that 11. After that I get wrong results. What the cuda kernel does is this. After finishing the calculations above I’m performing parallel reduction so that the cache matrix contains 32 tids of permutations with the smallest distances. For numbers smaller than 11 I get normal results. After that, especially from 12 and above the matrix returned is full of 0s. I tried executing the code using visual studio 2010, and when I entered 12 the screen flashed and got a message from the nvidia driver that the kernel had to recover. This is the code:

#include <iostream>

#include <algorithm>

#include <time.h>

#include <cuda.h>

using namespace std;

void randomize(float *a,float *b, int counter);

float distance(float *cordx,float *cordy, int *matrix,int counter);

__device__ float dev_distance(float *cordx,float *cordy, int *matrix,int counter)

{

	float temp,dist;

	

	

	

	

	temp=0;

	dist=0;

	for(int i=0;i<counter-1;i++)

	temp+=(cordx[matrix[i+1]]-cordx[matrix[i]])*(cordx[matrix[i+1]]-cordx[matrix[i]]) + (cordy[matrix[i+1]]-cordy[matrix[i]])*(cordy[matrix[i+1]]-cordy[matrix[i]]);

	

	dist=temp+(cordx[matrix[0]]-cordx[matrix[counter-1]])*(cordx[matrix[0]]-cordx[matrix[counter-1]]) + (cordy[matrix[0]]-cordy[matrix[counter-1]])*(cordy[matrix[0]]-cordy[matrix[counter-1]]);

	

	return dist;

}

__device__ unsigned long long fact(int n)

  {

unsigned long long f = 1;

      int i;

      for(i = 1; i <= n; i++)

     {

          f *= i;

      }

      return f;

  }

__device__ float perm(int *a,unsigned long long number,int index,int length,float *x,float *y)

{

    unsigned long long p;

    int i;

    float dist;

    int num=length;

	int temp[20],b[20];

	for(int i=0;i<length;i++) temp[i]=a[i];

	while(length!=0)

	{

	

		p=(number/fact(length-1))%length;

        b[index]=temp[p];

        index++;

        for(i = p; i<length-1;i++) temp[i] = temp[i+1];

        number=number%fact(length-1);

        length=length-1;

     }

	dist=dev_distance(x,y,b,num);

	

	return dist;	

}

__global__ void kernel(int *cities,unsigned long long factor,int num,float *x,float *y,unsigned long long *c)

{

	__shared__ unsigned long long cache[256];

	

	float temp=RAND_MAX;

	float dist,temp1,temp2;

	

	

	unsigned long long tid=threadIdx.x + blockIdx.x*blockDim.x;

	

	int cacheIndex=threadIdx.x;

	

	cache[cacheIndex]=0;

	

	while(tid<factor)

	{

		dist=perm(cities,tid,0,num,x,y);

		if(dist<temp)

		{

			temp=dist;

			cache[cacheIndex]=tid;

		}

	tid+=blockDim.x*gridDim.x;

	}

	

	

	

	__syncthreads();

	

	

	

	int i=blockDim.x/2;

	while(i!=0)

	{

		if(cacheIndex<i)

		{

			temp1=perm(cities,cache[cacheIndex],0,num,x,y);

			temp2=perm(cities,cache[cacheIndex+i],0,num,x,y);

			if(temp1>temp2)

			cache[cacheIndex]=cache[cacheIndex+i];

		}

		__syncthreads();

		i/=2;

	}

	

	if(cacheIndex==0) c[blockIdx.x]=cache[0];

	

	

	

}	

int main(void)

{

	int num,*cities,*path;

	float *x,*y,dist;

	

	int *dev_cities;

	float *dev_x,*dev_y;

	

	unsigned long long *dev_c,*c;

	

	int best=RAND_MAX;

	

	

	

	unsigned long long factorial;

	cout<<"Please enter number of cities: ";

	cin>>num;

	cities=(int*)malloc(num*sizeof(int));

	path=(int*)malloc(num*sizeof(int));

	for(int i=0;i<num;i++) cities[i]=i+1;

	x=(float*)malloc(num*sizeof(float));

	y=(float*)malloc(num*sizeof(float));

	c=(unsigned long long*)malloc(32*sizeof(unsigned long long));

	

	

	randomize(x,y,num);

	

	

	factorial=1;

	for(int i = 1; i <= num; i++)

     {

          factorial*= i;

     }

	

	

	cudaMalloc((void**)&dev_cities,num*sizeof(int));

	cudaMalloc((void**)&dev_x,num*sizeof(float));

	cudaMalloc((void**)&dev_y,num*sizeof(float));

	cudaMalloc((void**)&dev_c,32*sizeof(unsigned long long));

	

	

	cudaMemcpy(dev_cities,cities,num*sizeof(int),cudaMemcpyHostToDevice);

	cudaMemcpy(dev_x,x,num*sizeof(float),cudaMemcpyHostToDevice);

	cudaMemcpy(dev_y,y,num*sizeof(float),cudaMemcpyHostToDevice);

	

	kernel<<<32,256>>>(dev_cities,factorial,num,dev_x,dev_y,dev_c);

	

	cudaMemcpy(c,dev_c,32*sizeof(unsigned long long),cudaMemcpyDeviceToHost);

	

	for(int i=0;i<32;i++) cout<<c[i]<<endl;

	

	cout<<endl<<endl;

	

	

	

	

	

	do

	{

	dist=distance(x,y,cities,num);

	//cout<<dist<<endl;

		if(dist<best)

		{

			best=dist;

			for(int i=0;i<num;i++) path[i]=cities[i];

		}

	}while(next_permutation(cities,cities+num));

	

	

	cout<<"Best path is: ";

	for(int i=0;i<num;i++) cout<<path[i]<<" ";

	cout<<endl;

		

		

	

	

	return 0;

}

	

void randomize(float *a,float *b, int counter)

{

	int i;

	

	

	srand(time(NULL));

	for(i=0;i<counter;i++)

	{

	

		

		a[i]=(float)rand()/RAND_MAX*11;

		b[i]=(float)rand()/RAND_MAX*23;

	}

}

		float distance(float *cordx,float *cordy, int *matrix,int counter)

{

	float temp,dist;

	

	

	

	temp=0;

	dist=0;

	for(int i=0;i<counter-1;i++)

	temp+=(cordx[matrix[i+1]]-cordx[matrix[i]])*(cordx[matrix[i+1]]-cordx[matrix[i]]) + (cordy[matrix[i+1]]-cordy[matrix[i]])*(cordy[matrix[i+1]]-cordy[matrix[i]]);

	

	dist=temp+(cordx[matrix[0]]-cordx[matrix[counter-1]])*(cordx[matrix[0]]-cordx[matrix[counter-1]]) + (cordy[matrix[0]]-cordy[matrix[counter-1]])*(cordy[matrix[0]]-cordy[matrix[counter-1]]);

	

	return dist;

}