Hello guys,

I am trying to accelerate an image filter with OpenACC. The accelerated code is shown as follows:

```
#include <algorithm>
#include <cmath>
void DNLM_OpenACC(const float* pSrcBorder, int stepBytesSrcBorder, const float* pSqrIntegralImage, int stepBytesSqrIntegral, float* pDst, int stepBytesDst, int windowRadius, int neighborRadius, int imageWidth, int imageHeight, int windowWidth, int windowHeight, int neighborWidth, int neighborHeight, float sigma_r)
{
//Array to store window matrix for euclidean distance
float * restrict pEuclDist = (float*) malloc(windowHeight * windowWidth * sizeof(float));
float * restrict pWindowIJCorr = (float*) malloc(windowHeight * windowWidth * sizeof(float));
#pragma acc data deviceptr(pSrcBorder[(windowHeight+2*(windowRadius+neighborRadius))*(windowWidth+2*(windowRadius+neighborRadius))], pSqrIntegralImage[(windowHeight+2*(windowRadius+neighborRadius)+1)*(windowWidth+2*(windowRadius+neighborRadius)+1)], pDst[imageHeight*imageWidth]) create(pEuclDist[0:windowHeight*windowWidth], pWindowIJCorr[windowHeight*windowWidth])
{
#pragma acc parallel vector_length(32) num_workers(2)
{
#pragma acc loop gang collapse(2) private(pEuclDist[0:windowHeight*windowWidth], pWindowIJCorr[0:windowHeight*windowWidth])
for(int j = 0; j < imageHeight; j++)
{
for (int i = 0; i < imageWidth; i++)
{
//Compute base address for each array
const int indexPdstBase = j * (stepBytesDst/sizeof(float));
const int indexWindowStartBase = j * stepBytesSrcBorder/sizeof(float);
const int indexNeighborIJBase = (j + windowRadius) * stepBytesSrcBorder/sizeof(float);
const int indexIINeighborIJBase = (j + windowRadius) * stepBytesSqrIntegral/sizeof(float);
const int indexIINeighborIJBaseWOffset = (j + windowRadius + neighborWidth) * stepBytesSqrIntegral/sizeof(float);
//Get sum area of neighborhood IJ
const float sqrSumIJNeighborhood = pSqrIntegralImage[indexIINeighborIJBaseWOffset + (i + windowRadius + neighborWidth)]
+ pSqrIntegralImage[indexIINeighborIJBase + (i + windowRadius)]
- pSqrIntegralImage[indexIINeighborIJBase + (i + windowRadius + neighborWidth)]
- pSqrIntegralImage[indexIINeighborIJBaseWOffset + (i + windowRadius)];
//Compute start pointer for each array
const float * restrict pWindowStart = (float*) &pSrcBorder[indexWindowStartBase + i];
const float * restrict pNeighborhoodStartIJ = (float*) &pSrcBorder[indexNeighborIJBase + i + windowRadius];
//Compute window correlation with IJ neighborhood
#pragma acc loop worker tile(8,8)
for(int row_w = 0; row_w < windowHeight; row_w++)
{
for(int col_w = 0; col_w < windowWidth; col_w++)
{
float neighborCorrSum = 0;
#pragma acc loop vector collapse(2) reduction(+:neighborCorrSum)
for(int row_n = 0; row_n < neighborHeight; row_n++)
{
for(int col_n = 0; col_n < neighborWidth; col_n++)
{
neighborCorrSum += pWindowStart[(col_w+col_n)+((row_w+row_n)*stepBytesSrcBorder/sizeof(float))] * pNeighborhoodStartIJ[col_n + (row_n * stepBytesSrcBorder/sizeof(float))];
}
}
pWindowIJCorr[col_w + row_w * windowWidth] = neighborCorrSum;
}
}
#pragma acc loop collapse(2)
for (int n = 0; n < windowHeight; n++)
{
for (int m = 0; m < windowWidth; m++)
{
//Compute base address for each array
const int indexIINeighborMNBase = (j + n) * stepBytesSqrIntegral/sizeof(float);
const int indexIINeighborMNBaseWOffset = (j + n + neighborWidth) * stepBytesSqrIntegral/sizeof(float);
const float sqrSumMNNeighborhood = pSqrIntegralImage[indexIINeighborMNBaseWOffset + (i + m + neighborWidth)]
+ pSqrIntegralImage[indexIINeighborMNBase + (i + m )]
- pSqrIntegralImage[indexIINeighborMNBase + (i + m + neighborWidth)]
- pSqrIntegralImage[indexIINeighborMNBaseWOffset + (i + m )];
pEuclDist[n*windowWidth + m]= sqrSumIJNeighborhood + sqrSumMNNeighborhood -2*pWindowIJCorr[n*windowWidth + m];
}
}
float sumExpTerm = 0;
#pragma acc loop collapse(2) reduction(+:sumExpTerm)
for(int row = 0; row < windowHeight; row++)
{
for(int col = 0; col < windowWidth; col++)
{
float filterWeight = expf(pEuclDist[col + row * windowWidth] * 1/(-2*(sigma_r * sigma_r)));
pEuclDist[col + row * windowWidth] = filterWeight;
sumExpTerm += filterWeight;
}
}
float filterResult = 0;
//Reduce(+)
#pragma acc loop collapse(2) reduction(+:filterResult)
for(int row = 0; row < windowHeight; row++)
{
for(int col = 0; col < windowWidth; col++)
{
float filterRes = pEuclDist[col + row * windowWidth] * pWindowStart[(col+neighborRadius) + (row+neighborRadius) * stepBytesSrcBorder/sizeof(float)];
filterResult += filterRes;
}
}
pDst[indexPdstBase + i] = filterResult/sumExpTerm;
}
}
}
}
}
```

As you can see the filter has 6 nested for-loops, the first two iterate through each image pixel, the next two iterate over a search window and the remaining two iterate over a pixel neighborhood. The problem I am facing is that when I use a tile clause over the search window loop the filter result has an MSE error of 1 compared to using a collapse(2) clause instead, which has a 3.51e-06 error (as expected). The tiled loop over the search window has a speedup of 10x compared to the collapsed version. I use the collapsed version as a reference.

I am working with a K40c with CUDA 10.1, PGI 14.04 and driver version 418.87.01.

Here is the compiler output for each version:

Collapsed version

```
DNLM_OpenACC(const float *, int, const float *, int, float *, int, int, int, int, int, int, int, int, int, float):
15, Generating create(pWindowIJCorr[:windowWidth*windowHeight],pEuclDist[:windowWidth*windowHeight])
17, Generating Tesla code
19, #pragma acc loop gang collapse(2) /* blockIdx.x */
21, /* blockIdx.x collapsed */
40, #pragma acc loop worker(2) collapse(2) /* threadIdx.y */
42, /* threadIdx.y collapsed */
46, #pragma acc loop vector(32) collapse(2) /* threadIdx.x */
48, /* threadIdx.x collapsed */
Generating reduction(+:neighborCorrSum)
50, Vector barrier inserted for vector loop reduction
51, Vector barrier inserted due to potential dependence out of a vector loop
58, #pragma acc loop worker(2), vector(32) collapse(2) /* threadIdx.
60, /* threadIdx.y threadIdx.x collapsed */
76, #pragma acc loop worker(2), vector(32) collapse(2) /* threadIdx.y threadIdx.x */
78, /* threadIdx.y threadIdx.x collapsed */
Generating reduction(+:sumExpTerm)
89, #pragma acc loop worker(2), vector(32) collapse(2) /* threadIdx.y threadIdx.x */
91, /* threadIdx.y threadIdx.x collapsed */
Generating reduction(+:filterResult)
40, Loop is parallelizable
42, Loop is parallelizable
46, Loop is parallelizable
48, Loop is parallelizable
58, Loop is parallelizable
60, Loop is parallelizable
76, Loop is parallelizable
78, Loop is parallelizable
89, Loop is parallelizable
91, Loop is parallelizable
```

Tiled version

```
DNLM_OpenACC(const float *, int, const float *, int, float *, int, int, int, int, int, int, int, int, int, float):
15, Generating create(pWindowIJCorr[:windowWidth*windowHeight],pEuclDist[:windowWidth*windowHeight])
17, Generating Tesla code
19, #pragma acc loop gang collapse(2) /* blockIdx.x */
21, /* blockIdx.x collapsed */
40, #pragma acc loop worker(64) tile(8,8) /* threadIdx.y */
42, /* threadIdx.y tiled */
46, #pragma acc loop vector(32) collapse(2) /* threadIdx.x */
48, /* threadIdx.x collapsed */
Generating reduction(+:neighborCorrSum)
50, Vector barrier inserted for vector loop reduction
51, Vector barrier inserted due to potential dependence out of a vector loop
58, #pragma acc loop worker(2), vector(32) collapse(2) /* threadIdx.
60, /* threadIdx.y threadIdx.x collapsed */
76, #pragma acc loop worker(2), vector(32) collapse(2) /* threadIdx.y threadIdx.x */
78, /* threadIdx.y threadIdx.x collapsed */
Generating reduction(+:sumExpTerm)
89, #pragma acc loop worker(2), vector(32) collapse(2) /* threadIdx.y threadIdx.x */
91, /* threadIdx.y threadIdx.x collapsed */
Generating reduction(+:filterResult)
40, Loop is parallelizable
42, Loop is parallelizable
46, Loop is parallelizable
48, Loop is parallelizable
58, Loop is parallelizable
60, Loop is parallelizable
76, Loop is parallelizable
78, Loop is parallelizable
89, Loop is parallelizable
91, Loop is parallelizable
```

How could you explain the effect of each clause in the mapping of the accelerated code on the GPU?

What could be the source of error? Tiled loops may be increasing race conditions?

Thanks.