Cuda unknown error on the shaded line.can anyone help?

#include <stdlib.h>
#include <stdio.h>
#include<math.h>
#include<cuda.h>
#include<time.h>
#define MAX_CITIES 280
#define MAX_ANT 2
#define BETA 5.0
#define RHO 0.5

#define gpuErrchk(ans) { gpuAssert((ans), FILE, LINE); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,“GPUassert: %s %s %d\n”, cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

struct antType{

int curCity;
int visited[MAX_CITIES];
int path[MAX_CITIES];
float tourLength;
};

//to calculate square
device float sqr(float x)
{
return x*x;
}

device float distance(int i,int j,int *a_d, int *b_d)
{
return (sqrt(sqr(a_d[i]-a_d[j])+sqr(b_d[i]-b_d[j])));
}

//calculate distance between cities and pheromone
global void calc_dist(float *dev_dist,int *a_d, int *b_d,float *dev_phero)
{

int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;
int grid_width = gridDim.x * blockDim.x;
int index = index_y * grid_width + index_x;
dev_dist[index]=distance(index_x,index_y,a_d,b_d);
dev_phero[index]=(1.0/MAX_CITIES);
}

device int isfree(int ant,struct antType *dev_ant)
{
int i;
for(i=0;i<MAX_CITIES;i++)
{
if(dev_ant[ant].visited[i]==0)
return 1;
}
return 0;
}

device int selectNextCity(int start,int ant,float *dev_dist,float *dev_phero,struct antType dev_ant)
{
dev_ant[ant].visited[start]=1;
int c=start;
float max=0.0;
float num_value,v,v2;
int i,next,l;
int index1,index2;
float power,power2;
for(i=0;i<MAX_CITIES;i++)
{
if(i!=c&&dev_ant[ant].visited[i]==0)
{
index1=(c
MAX_CITIES+i);
v=1/dev_dist[index1];
power=(float)pow((double)v,BETA);
num_value=dev_phero[index1]power;
float den_value=0.0;
for(l=0;l<MAX_CITIES;l++)
{
index2=c
MAX_CITIES+l;
if(l!=i&&l!=c)
{
v2=1/dev_dist[index2];
power2=(float)pow((double)v2,BETA);
den_value+=(dev_phero[index2]*power2);
}
}

		if((num_value/den_value)>max)
		  {
			max=num_value/den_value;
			next=i;			  
		  }
              }
}

return next;
}

global void simulatedev_ant(struct antType *dev_ant,float *dev_phero,float *dev_dist)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int next;
float value=(1.0 - RHO);
int k=0;
dev_ant[tid].path[0]=dev_ant[tid].curCity;
int start=dev_ant[tid].curCity;
int remaining=MAX_CITIES-1;

while((remaining–)>0)
{
next=selectNextCity(start,tid,dev_dist,dev_phero,dev_ant);
dev_ant[tid].visited[next]=1;
int index = startMAX_CITIES+next;
dev_ant[tid].path[++k]=next;
dev_ant[tid].tourLength +=dev_dist[index];
dev_phero[index]=value
dev_phero[index];
start=next;
}
}

int main(void)
{

cudaFree(0);

int num_elements_x = 32;
int num_elements_y = 32;
int sno[MAX_CITIES];
int num_bytes = num_elements_x * num_elements_y * sizeof(float);
float *prob;
float *dist,*dev_dist;
float *phero,*dev_phero;
clock_t begin,end;
float timeSpent;
float optimal[MAX_ANT];

begin=clock();
dist = (float*)malloc(num_bytes);
gpuErrchk(cudaMalloc((void**)&dev_dist, num_bytes));
phero = (float*)malloc(num_bytes);
gpuErrchk(cudaMalloc((void**)&dev_phero, num_bytes));
prob = (float*)malloc(num_bytes);

int *a_h,*b_h,*a_d,*b_d;
FILE *f1;
int n=0;
a_h = (int *)malloc(sizeof(int)*MAX_CITIES);
b_h = (int *)malloc(sizeof(int)*MAX_CITIES);
f1=fopen(“co_32.txt”,“r”);// input file

while(!feof(f1))
{

fscanf(f1,"%d",&a_h[n]);
fscanf(f1,"%d",&b_h[n]);
n++;

}
fclose(f1);

struct antType *dev_ant,host_ant;
int dev_ant_size=MAX_ANT
sizeof(struct antType);

printf("\nsize%d\n",dev_ant_size);

host_ant=(struct antType*)malloc(MAX_ANT*sizeof(struct antType));
gpuErrchk(cudaMalloc((void **) &dev_ant, dev_ant_size));
int i,j;
for(i=0;i<MAX_ANT;i++)
{
host_ant[i].curCity=rand()%MAX_CITIES;
host_ant[i].tourLength=0.0;
for(j=0;j<MAX_CITIES;j++)
{
host_ant[i].visited[j]=0;
host_ant[i].path[j]=-1;
}

}

gpuErrchk(cudaMalloc((void **) &a_d, sizeof(int)*MAX_CITIES));
gpuErrchk(cudaMalloc((void **) &b_d, sizeof(int)*MAX_CITIES));
gpuErrchk(cudaMemcpy(a_d, a_h, sizeof(int)*MAX_CITIES, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(b_d, b_h, sizeof(int)*MAX_CITIES, cudaMemcpyHostToDevice));

dim3 block_size;
block_size.x = 8;
block_size.y = 4;

dim3 grid_size;
grid_size.x = num_elements_x / block_size.x;
grid_size.y = num_elements_y / block_size.y;

calc_dist<<<grid_size,block_size>>>(dev_dist,a_d,b_d,dev_phero);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );

gpuErrchk(cudaMemcpy(dev_ant,host_ant,dev_ant_size,cudaMemcpyHostToDevice));

simulatedev_ant<<<1,MAX_ANT>>>(dev_ant,dev_phero,dev_dist);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk(cudaMemcpy(host_ant,dev_ant,dev_ant_size,cudaMemcpyDeviceToHost));

printf("\n\n");

for(i=0;i<MAX_ANT;i++)
{
printf(“ANT %d details\n\n”,i+1);
optimal[i]=host_ant[i].tourLength;
printf("%f",host_ant[i].tourLength);

printf("\nPath\n");
for(j=0;j<MAX_CITIES;j++)
printf("%d\t",host_ant[i].path[j]);

printf("\n");

}
float smallest = optimal[0];
for (i = 0; i < MAX_ANT; i++)
{
if (optimal[i] < smallest) {
smallest = optimal[i];
}
}
printf("\nSmallest Element : %f", smallest);

end=clock();
timeSpent=(double)((end-begin)/(CLOCKS_PER_SEC/1000));
printf(“Time spent=%lf\n”,timeSpent);

// deallocate memory
free(dist);
free(host_ant);
cudaFree(dev_ant);
cudaFree(dev_dist);
free(phero);
cudaFree(dev_phero);
}

Have you tried running this with cuda-memcheck? It will likely be able to point out where the issues are with the code. The most likely reason for the error is that you have an out-of-bounds memory access in the simulatedev_ant() kernel, causing the kernel to fail.

Side-remark: For the distance computations, consider using the standard math function hypot().

When I try CUDA memcheck it shows following error

========= CUDA-MEMCHECK

size4496
========= Invalid global read of size 4
========= at 0x00000130 in simulatedev_ant(antType*, float*, float*)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x002820a0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so (cuLaunchKernel + 0x2c5) [0x14ad95]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.5.5 [0xe108]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.5.5 (cudaLaunch + 0x143) [0x2cb53]
========= Host Frame:./a.out [0x1a6d]
========= Host Frame:./a.out [0x18ac]
========= Host Frame:./a.out [0x18d9]
========= Host Frame:./a.out [0x142c]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./a.out [0xce9]

========= Program hit error 30 on CUDA API call to cudaDeviceSynchronize
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so [0x2ef613]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.5.5 (cudaDeviceSynchronize + 0x166) [0x2a986]
========= Host Frame:./a.out [0x144c]
GPUassert: unknown error sample.cu 219
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./a.out [0xce9]

========= ERROR SUMMARY: 2 errors

so your simulatedev_ant kernel is accessing memory out-of-bounds.

You can find out which line of code is actually causing the illegal access using the method described in the answer here:

http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy

basically add -lineinfo to your compile command, and then cuda-memcheck will indicate the line that is causing the illegal access