I am trying to accelerate my function with shared memory. This approach worked for me with hand-written CUDA, but it does not work with OpenACC:

First, I noticed that the shared-memory allocation does not happen with 1D-allocated memory.

here is my code

```
// Disparity-matching kernel (block-matching cost aggregation) offloaded with OpenACC.
// For each interior pixel (v, u) it scans the candidate disparity range
// [D1min[v], D1max[v]], accumulates absolute differences of horizontal (hgL/hgR)
// and vertical (vgL/vgR) gradient images between left and right views plus a
// prior term P[|d - d_plane|], and writes the winning disparity (or -1 if no
// candidate was valid) into D1[v*w + u].
//
// NOTE(review): issues visible in this snippet —
//  * OpenACC array sections are written [start:LENGTH], not [start:end].
//    So hgL[v-2:v+2][u-2:u+2] in the cache clause and tmp[h][bpl] in copyin
//    are almost certainly not the sections intended (tmp presumably wants
//    tmp[0:h][0:bpl]). This is the most likely reason nvprof reports zero
//    shared_load_transactions: the cache request is malformed / ignored.
//  * The acc cache directive generally requires a block whose size the
//    compiler can resolve at compile time and is normally placed at the top
//    of the loop it applies to; per-iteration variable bounds are typically
//    not honored (the compiler may silently fall back to global memory).
//  * delete[] is C++, not C — this translation unit cannot be plain C.
//  * The brace count in this paste is unbalanced (one extra closing '}' at
//    the end) — presumably a copy/paste artifact.
//  * P and disp_num are declared in the elided initialization section ("-----").
void function(int32_t bpl, int32_t h, int32_t w,
const uint8_t*restrict dispEstL,
const uint8_t*restrict vgL,
const uint8_t*restrict vgR,
const uint8_t*restrict hgL,
const uint8_t*restrict hgR,
const uint8_t*restrict D1min,
const uint8_t*restrict D1max,
const uint8_t**restrict tmp,
float * D1
)
{
// some initializations
-----
// NOTE(review): tmp[h][bpl] is a single element reference, not a section;
// OpenACC data clauses need [start:length] per dimension, e.g. tmp[0:h][0:bpl].
#pragma acc kernels copyin(tmp[h][bpl], dispEstL[0:h*w], vgL[0:h*bpl], vgR[0:h*bpl], hgL[0:h*bpl], hgR[0:h*bpl], D1max[0:h], D1min[0:h], P[0:disp_num]) copy(D1[0:h*w])
{
//#pragma acc parallel present(tmp[h][bpl], dispEstL[0:h*w], vgL[0:h*bpl], vgR[0:h*bpl], hgL[0:h*bpl], hgR[0:h*bpl], D1max[0:h], D1min[0:h], P[0:disp_num], D1[0:h*w])
{
// Rows 2 .. h-3: a 2-pixel border is skipped because the cost below reads
// up to +/-2 rows/columns around (v, u).
#pragma acc loop independent gang vector //gang(12) vector(32)
for(int32_t v=2; v<h-2; v++)
{
#pragma acc loop independent gang vector //gang(39) vector(32) private(v)
for( int32_t u=2; u<w-2; u++)
{
// NOTE(review): these ranges read as [start:end] but OpenACC interprets the
// second expression as a LENGTH; e.g. hgL[v-2:v+2] means "v+2 elements
// starting at v-2". Also, cache bounds that vary per iteration are usually
// rejected or ignored — the directive wants a compile-time-sized tile.
#pragma acc cache(hgL[v-2:v+2][u-2:u+2], vgL[v-1:v+1][u-1:u+1], hgR[v-2:v+2][u-2:u+2], vgR[v-1:v+1][u-1:u+1])
{
int32_t v_tmp = v;
int32_t u_tmp = u;
// Per-row disparity search bounds.
int32_t line_max = (int32_t)D1max[v_tmp];
int32_t line_min = (int32_t)D1min[v_tmp];
// Prior disparity estimate for this pixel (used to index the penalty table P).
uint8_t d_plane = dispEstL[v_tmp*w + u_tmp];
int32_t min_val= 10000;
int32_t min_d = -1;
int32_t val, u_warp;
int32_t a, b;
// a = flat index of (v, u) in the gradient images (row pitch = bpl).
a = v_tmp*bpl + u_tmp;
//#pragma acc loop seq
for ( int32_t d_curr=line_min; d_curr<=line_max; d_curr++)
{
// Corresponding column in the right image; skip if it falls in the border.
u_warp = u_tmp-d_curr;
if (u_warp<2 || u_warp>=w-2)
continue;
b = a - d_curr;
// Prior penalty grows with distance from the plane estimate.
val = *(P+abs(d_curr-d_plane));
// Sum of absolute gradient differences over a sparse 5x5 neighborhood.
val += abs(*(hgL + (a-2*bpl)) - *(hgR + (b-2*bpl))) ;
val += abs(*(hgL + (a-bpl -2)) - *(hgR + (b-bpl -2))) ;
val += abs(*(hgL + (a-bpl )) - *(hgR + (b-bpl))) ;
val += abs(*(hgL + (a-bpl +2)) - *(hgR + (b-bpl +2))) ;
val += abs(*(hgL + (a-1)) - *(hgR + (b-1))) ;
val += abs(*(hgL + (a)) - *(hgR + (b))) ;
// NOTE(review): this line duplicates the one above (same offset `a`/`b`);
// possibly a copy/paste slip for a different offset — worth confirming
// against the original CPU implementation.
val += abs(*(hgL + (a)) - *(hgR + (b))) ;
val += abs(*(hgL + (a+1)) - *(hgR + (b+1))) ;
val += abs(*(hgL + (a+bpl -2)) - *(hgR + (b+bpl -2))) ;
val += abs(*(hgL + (a+bpl )) - *(hgR + (b+bpl ))) ;
val += abs(*(hgL + (a+bpl +2)) - *(hgR + (b+bpl +2))) ;
val += abs(*(hgL + (a+2*bpl )) - *(hgR + (b+2*bpl ))) ;
val += abs(*(vgL + (a-bpl )) - *(vgR + (b-bpl ))) ;
val += abs(*(vgL + (a-1)) - *(vgR + (b-1))) ;
val += abs(*(vgL + (a+1)) - *(vgR + (b+1))) ;
val += abs(*(vgL + (a+bpl )) - *(vgR + (b+bpl ))) ;
// Keep the disparity with the lowest aggregated cost.
if (val<min_val) {
min_val = val;
min_d = d_curr;
}
}
if (min_d>=0) *(D1+v*w + u) = (float)min_d; // MAP value (min neg-Log probability)
else *(D1+v*w + u) = -1; // invalid disparity
}
}
}
}
}
// NOTE(review): delete[] is C++; in C this would be free(P). P is allocated
// in the elided initialization section above.
delete[] P;
}
}
```

Even though the compiler reports that it has referenced the above arrays in the cache, the execution time did not change.

Then I profiled with `nvprof -m shared_load_transactions,shared_store_transactions` and found that no loads or stores were performed on shared memory.

I modified the above function to work on 2D allocated arrays, (uint8_t**) …

The execution time increased, even without shared memory.

Then, with shared memory, the compiler performs the allocation for some of the arrays but not for all of them.

I did not find any doc concerning the shared memory with OpenACC.

Could you please help me and explain the above results?

Why it works only on 2D allocated arrays ?

why it works only on few arrays ?

Thank you

/Djamila