Hello everybody,

I am trying to calculate the Euclidean distance between two images. For that purpose I partition the images so that the distance of each part is handled by one block of threads. In the end I get a vector with the distance of each block.

Now my problem is: how can I ensure that all blocks have finished, so that I can calculate the final overall distance?

The still unoptimized code:

```
/*
 * Computes, per thread block, the sum of squared pixel differences between
 * two images and writes one partial sum per block to dsumvector.
 *
 * Launch layout (as implied by the indexing below):
 *   - 2D grid of 2D blocks, one thread per pixel.
 *   - Image row length is taken to be gridDim.x * blockDim.x; there is no
 *     bounds check, so the grid has to cover the images exactly.
 *   - Dynamic shared memory: (blockDim.x * blockDim.y + blockDim.y) floats.
 *
 * dimage1, dimage2  input images, read at row * rowLength + column
 * dsumvector        output, one float per block, written at
 *                   blockIdx.x + blockIdx.y * gridDim.x
 */
__global__ void computeDistance(float *dimage1, float *dimage2, float *dsumvector)
{
    extern __shared__ float shared[];

    // Shared memory layout:
    //   [ blockDim.x * blockDim.y squared differences | blockDim.y row sums ]
    float *distancevec = shared;
    float *sumvec = shared + blockDim.x * blockDim.y;

    // Plain multiplication on size_t instead of __mul24: __mul24 only uses the
    // low 24 bits of each operand, and a 32-bit int index overflows for very
    // large images.
    const size_t linelength = (size_t)gridDim.x * blockDim.x;
    const size_t column = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t position = row * linelength + column;

    // Start of this thread's row inside the block-local distance tile
    const unsigned int yoffset = threadIdx.y * blockDim.x;

    // Squared difference of the two pixels, stored in shared memory
    const float difference = dimage2[position] - dimage1[position];
    distancevec[yoffset + threadIdx.x] = difference * difference;

    // All squared differences of the block must be written before summing
    __syncthreads();

    // The first thread of each row sums up its row
    if (threadIdx.x == 0) {
        float sum = 0.0f;
        for (unsigned int i = 0; i < blockDim.x; ++i) {
            sum += distancevec[yoffset + i];
        }
        sumvec[threadIdx.y] = sum;
    }

    // All row sums must be written before they are combined
    __syncthreads();

    // A single thread adds the row sums and publishes the block's result
    if (threadIdx.x == 0 && threadIdx.y == 0) {
        float sum = 0.0f;
        for (unsigned int i = 0; i < blockDim.y; ++i) {
            sum += sumvec[i];
        }
        dsumvector[(size_t)blockIdx.y * gridDim.x + blockIdx.x] = sum;
    }

    // Open question: what goes here to calculate the overall distance?
    // The __syncthreads() calls above only wait for the threads of this block,
    // so nothing in this kernel guarantees that the other blocks have already
    // written their entry of dsumvector.
}
```

Thanks in advance!