I’m trying to perform the distance transform of an image on GPU my code is:

```
//This kernel is just to prepare the matrix containing the distance transform, I put 0 (in distTransform matrix) to the points from where I want to compute the distances
__global__ void kernelPrepareDistTransf(u_char *matrix, float *distTransform, int row, int col)
{
int idxI = blockIdx.y * blockDim.y + threadIdx.y;
int idxJ = blockIdx.x * blockDim.x + threadIdx.x;
while(idxI < row)
{
while(idxJ < col)
{
if ( matrix[idxI*col+idxJ] == 255 )
{
distTransform[idxI*col+idxJ] = 0;
}
else
{
distTransform[idxI*col+idxJ] = 255;
}
idxJ += blockDim.x*gridDim.x;
}
idxI += blockDim.y*gridDim.y;
idxJ = blockIdx.x * blockDim.x + threadIdx.x;
}
}
//Here is where the real distance is computed, for each pixel in the image that isn't equal to 0
__global__ void kernelDistanceTransform(u_char *matrix, float *distTransform, int row, int col)
{
int idxI = blockIdx.y * blockDim.y + threadIdx.y;
int idxJ = blockIdx.x * blockDim.x + threadIdx.x;
int idxK, idxL;
float distance;
while(idxI < row)
{
while(idxJ < col)
{
if ( distTransform[idxI*col+idxJ] == 0 )
{
for (idxK = 0; idxK < row; idxK++)
{
for (idxL = 0; idxL < col; idxL++)
{
if ( distTransform[idxI*col+idxJ] != 0 ){
distance = abs(idxK-idxI) + abs(idxL-idxJ);
if ( distTransform[idxK*col+idxL] > distance )
{
distTransform[idxK*col+idxL] = distance;
}
}
}
}
}
idxJ += blockDim.x*gridDim.x;
}
idxI += blockDim.y*gridDim.y;
idxJ = blockIdx.x * blockDim.x + threadIdx.x;
}
}
void pprDistanceTransform(pprMatrix *matrix, pprMatrixf *distTransform, u_char type)
{
u_char *d_matrix;
float *d_transform;
cudaEvent_t start, stop;
float elapsedTime;
dim3 blocks(3,3);
dim3 threads(10,10);
distTransform->row = matrix->row;
distTransform->col = matrix->col;
pprMatrixfMem(distTransform);
//Create start, stop events.
CHECK_ERROR( cudaEventCreate( &start ) );
CHECK_ERROR( cudaEventCreate( &stop ) );
//Launch the start event.
CHECK_ERROR( cudaEventRecord( start, 0 ) );
//Allocate memory on the GPU.
CHECK_ERROR( cudaMalloc((void**)&d_matrix, sizeof(u_char)*matrix->row*matrix->col) );
CHECK_ERROR( cudaMalloc((void**)&d_transform, sizeof(float)*matrix->row*matrix->col) );
//Copy information from CPU(host) to GPU(device).
CHECK_ERROR( cudaMemcpy( d_matrix, matrix->data, sizeof(u_char)*matrix->row*matrix->col, cudaMemcpyHostToDevice ) );
//Launch the kernel computation to create the distance transform.
kernelPrepareDistTransf<<<blocks,threads>>>(d_matrix, d_transform, matrix->row, matrix->col);
CHECK_ERROR( cudaThreadSynchronize() );
kernelDistanceTransform<<<blocks,threads>>>(d_matrix, d_transform, matrix->row, matrix->col);
CHECK_ERROR( cudaThreadSynchronize() );
//Copy information from GPU(device) to CPU(host).
CHECK_ERROR( cudaMemcpy( distTransform->data, d_transform, sizeof(float)*matrix->row*matrix->col, cudaMemcpyDeviceToHost ) );
//Launch the stop event.
CHECK_ERROR( cudaEventRecord( stop, 0 ) );
CHECK_ERROR( cudaEventSynchronize( stop ) );
//Print the elapsed time.
CHECK_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
printf( "%3.1f ms\n", elapsedTime );
//Destroy the events.
CHECK_ERROR( cudaEventDestroy( start ) );
CHECK_ERROR( cudaEventDestroy( stop ) );
//Free the allocated memory on GPU.
cudaFree(d_matrix);
cudaFree(d_transform);
}
```

But the problem is that the kernel **kernelDistanceTransform** seems not to be working as the result I have is the same as if I was launching just the **kernelPrepareDistTransf**

I need to compute the distance transform on GPU since on CPU its taking too much time … on images of 2 MP it takes 20 min so I hope with GPU’s help I can achieved it faster.