Hello,

I am trying to implement a shared memory optimization on a piece of code but I see inconsistent results.

In this code below I am looping on the z dimension for every XY plane in a block, iteratively loading values in the shared memory as i go up in Z.

Over the ex, ey and ez arrays updates, you can see commented out a global memory access to those same values previously loaded into shared memory. Well, that works perfectly.

The problem arises when I use shared memory. I get completely different results at every run.

All the compute sanitizer checks return 0 error/warning/info.

The shared arrays should hold the same exact values as those in gblbal memory, but they do not.

Also, I noted that using less than 4 of these shared arrays does not produce erroneous results (in release mode). Any combination.

At this point I am not even sure how to debug this.

Any help is super appreciated.

```
#include<stdio.h>
#include "constantsGPU.h"
#define assert( X , i, j, k, str) if ( !(X)) { printf("%s failed at %d, %d, %d \n", str, i, j, k ); return; }
#define BLOCKSIZE 16
__global__ void EStepPMLNew( float* ex, float* ey, float* ez,
float* hx,float* hy, float* hz,
bool dbg)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = bx*blockDim.x + tx;
int j = by*blockDim.y + ty;
int index = i + j*wE;
__shared__ float currHx[BLOCKSIZE+1][BLOCKSIZE+1];
__shared__ float currHy[BLOCKSIZE+1][BLOCKSIZE+1];
__shared__ float currHz[BLOCKSIZE+1][BLOCKSIZE+1];
__shared__ float aboveHx[BLOCKSIZE][BLOCKSIZE];
__shared__ float aboveHy[BLOCKSIZE][BLOCKSIZE];
if(i < Nx && j < Ny){
// load the first plane of H
currHx[tx][ty] = hx[i+j*wHx];
currHy[tx][ty] = hy[i+j*wHy];
currHz[tx][ty] = hz[i+j*wHz];
if(tx == BLOCKSIZE-1){
currHy[tx+1][ty] = hy[i+1+j*wHy];
currHz[tx+1][ty] = hz[i+1+j*wHz];
}
if(ty == BLOCKSIZE-1){
currHx[tx][ty+1] = hx[i+(j+1)*wHx];
currHz[tx][ty+1] = hz[i+(j+1)*wHz];
}
}
__syncthreads();
for(int z = 0; z < Nz; z++){
if (i < Nx && j < Ny) {
aboveHx[tx][ty] = hx[i+j*wHx+(z+1)*whHx];
aboveHy[tx][ty] = hy[i+j*wHy+(z+1)*whHy];
}
__syncthreads();
if (i < Nx && j < Ny) {
// assert shared memory is correct
int k = z;
// ex[index + z*whE] -= d_dtmu*(d_invdZ*(hy[i+j*wHy+(k+1)*whHy]-hy[i+j*wHy+k*whHy])
// -d_invdY*(hz[i+(j+1)*wHz+k*whHz]-hz[i+j*wHz+k*whHz]));
ex[index + z*whE] -= d_dtmu*(d_invdZ*(aboveHy[tx][ty]-currHy[tx][ty])
-d_invdY*(currHz[tx][ty+1]-currHz[tx][ty]));
// ey[index + z*whE] -= d_dtmu*(d_invdX*(hz[i+1+j*wHz+k*whHz]-hz[i+j*wHz+k*whHz])
// -d_invdZ*(hx[i+j*wHx+(k+1)*whHx]-hx[i+j*wHx+k*whHx]));
ey[index + z*whE] -= d_dtmu*(d_invdX*(currHz[tx+1][ty]-currHz[tx][ty])
-d_invdZ*(aboveHx[tx][ty]-currHx[tx][ty]));
// ez[index + z*whE] -= d_dtmu*(d_invdY*(hx[i+(j+1)*wHx+k*whHx]-hx[i+j*wHx+k*whHx])
// -d_invdX*(hy[i+1+j*wHy+k*whHy]-hy[i+j*wHy+k*whHy]));
ez[index + z*whE] -= d_dtmu*(d_invdY*(currHx[tx][ty+1]-currHx[tx][ty])
-d_invdX*(currHy[tx+1][ty]-currHy[tx][ty]));
}
__syncthreads();
if (i < Nx && j < Ny) {
if (z < Nz-1) {
if(tx == BLOCKSIZE-1){
currHy[tx+1][ty] = hy[i+1+j*wHy+(z+1)*whHy];
currHz[tx+1][ty] = hz[i+1+j*wHz+(z+1)*whHz];
}
if(ty == BLOCKSIZE-1){
currHx[tx][ty+1] = hx[i+(j+1)*wHx+(z+1)*whHx];
currHz[tx][ty+1] = hz[i+(j+1)*wHz+(z+1)*whHz];
}
currHx[tx][ty] = aboveHx[tx][ty];
currHy[tx][ty] = aboveHy[tx][ty];
currHz[tx][ty] = hz[i+j*wHz+(z+1)*whHz];
}
}
__syncthreads();
}
}
```