I am calculating the distance between two points without using threads, but I need to calculate the distance using threads.

#include<stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <errno.h>
#define MAX_CITIES 5

/*
 * Returns x squared.
 *
 * Declared __host__ __device__ so it can be called both from kernels and
 * from ordinary host code (for example to compute a CPU reference result).
 */
__host__ __device__ double sqr(double x)
{
    return x * x;
}

/*
 * Euclidean distance between city i and city j.
 *
 * i, j : indices into the coordinate arrays
 * a_d  : first coordinate of every city
 * b_d  : second coordinate of every city
 *
 * Returns sqrt((a_d[i]-a_d[j])^2 + (b_d[i]-b_d[j])^2) as a double.
 *
 * The coordinates are converted to double before subtracting so that the
 * difference of two large ints cannot overflow, and hypot() is used so the
 * intermediate squares cannot overflow either.
 */
__host__ __device__ double distance(int i, int j, const int *a_d, const int *b_d)
{
    const double dx = (double)a_d[i] - (double)a_d[j];
    const double dy = (double)b_d[i] - (double)b_d[j];
    return hypot(dx, dy);
}

/*
 * Fills the MAX_CITIES x MAX_CITIES distance matrix.
 *
 * dev_dist : output, row-major; dev_dist[i*MAX_CITIES + j] receives the
 *            distance between city i and city j
 * a_d, b_d : per-city coordinate arrays, MAX_CITIES entries each
 *
 * Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so every
 * matrix entry is written exactly once whether the kernel is launched with a
 * single thread (<<<1,1>>>) or with many. No shared memory is used.
 */
__global__ void calc_dist(double *dev_dist, int *a_d, int *b_d)
{
    const int total  = MAX_CITIES * MAX_CITIES;
    const int stride = gridDim.x * blockDim.x;

    for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < total; k += stride) {
        /* Recover the (row, column) pair from the flat row-major index. */
        const int i = k / MAX_CITIES;
        const int j = k % MAX_CITIES;
        dev_dist[k] = distance(i, j, a_d, b_d);
    }
}
/* Evaluates a CUDA runtime call and aborts with a diagnostic if it failed. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Reads MAX_CITIES coordinate pairs from "co.txt", computes the full
 * pairwise distance matrix on the GPU and prints it.
 *
 * Returns 0 on success and 1 if the input file cannot be opened, holds fewer
 * than MAX_CITIES pairs, or host memory cannot be allocated. Any CUDA
 * runtime failure terminates the program through CUDA_CHECK.
 */
int main()
{
    int c1[MAX_CITIES];
    int c2[MAX_CITIES];
    double *host_dist = NULL;
    double *dev_dist = NULL;
    int *a_d = NULL;
    int *b_d = NULL;
    const size_t coord_bytes = sizeof(int) * MAX_CITIES;
    const size_t dist_bytes = sizeof(double) * MAX_CITIES * MAX_CITIES;
    int n = 0;
    int b;

    FILE *f1 = fopen("co.txt", "r");
    if (f1 == NULL) {
        perror("co.txt");
        return 1;
    }

    /* Stop at MAX_CITIES pairs or at the first failed conversion; testing
       feof() before reading would run one iteration too many and could
       write past the end of c1/c2. */
    while (n < MAX_CITIES && fscanf(f1, "%d %d", &c1[n], &c2[n]) == 2)
        n++;
    fclose(f1);

    if (n < MAX_CITIES) {
        fprintf(stderr, "co.txt: expected %d coordinate pairs, read %d\n",
                MAX_CITIES, n);
        return 1;
    }

    for (n = 0; n < MAX_CITIES; n++)
        printf("\n%d %d", c1[n], c2[n]);
    for (n = 0; n < MAX_CITIES; n++)
        printf("\n hai%d%d", c1[n], c2[n]);

    host_dist = (double *)malloc(dist_bytes);
    if (host_dist == NULL) {
        fprintf(stderr, "out of host memory\n");
        return 1;
    }

    CUDA_CHECK(cudaMalloc((void **)&a_d, coord_bytes));
    CUDA_CHECK(cudaMalloc((void **)&b_d, coord_bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_dist, dist_bytes));

    /* The coordinates are copied straight from the stack arrays; no
       intermediate heap copies are needed. */
    CUDA_CHECK(cudaMemcpy(a_d, c1, coord_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, c2, coord_bytes, cudaMemcpyHostToDevice));

    /* Launch configuration: one block of one thread. */
    calc_dist<<<1, 1>>>(dev_dist, a_d, b_d);
    CUDA_CHECK(cudaGetLastError());

    /* The blocking copy also waits for the kernel to finish and reports any
       error raised while it was running. */
    CUDA_CHECK(cudaMemcpy(host_dist, dev_dist, dist_bytes, cudaMemcpyDeviceToHost));

    for (b = 0; b < (MAX_CITIES * MAX_CITIES); b++)
        printf("%lf\ndist ", host_dist[b]);

    free(host_dist);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(b_d));
    CUDA_CHECK(cudaFree(dev_dist));

    return 0;
}

You may find the function hypot() useful in calculating distance in 2D space and the function norm3d() useful in calculating the distance in 3D space.

1. use code blocks, rather than just dumping unstructured code

2. Start with more basic programs in CUDA, rather than immediately jumping to converting a serial portion of code to a parallel implementation

3. It appears you are trying to implement an inherently serial algorithm, which relies on previously calculated values to calculate the current value. Keep in mind that threads in a launch do not execute in a consistent, ordered, serial fashion, and any algorithm which relies on this will be a bit more tricky to convert to CUDA

4. You need to learn to walk before you can run, start with a basic vector add kernel and work your way up

5. hypot() is indeed the more efficient way to calculate the distance between two points, as njuffa mentioned;

http://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g2880a4ebf5500aeb74fb01340ea91215

6. Keep in mind that 64-bit performance on GPUs is a fraction of 32-bit performance. If you have any GPU other than a Titan, a Tesla, or a high-end Quadro, you would be better off casting to float

I would like to emphasize that the potential large performance differential CudaaduC warns about in point (6) applies to 64-bit vs 32-bit floating-point operands, that is, ‘double’ vs ‘float’. The performance differential between 64-bit and 32-bit integer operations should generally be about 2x to 4x in favor of 32-bit integer operations (since 64-bit integer operations are mostly emulated).

Thanks guyz