Hi

There is probably something I am still missing with the shared memory approach.

I have the following device function which works fine

```
__global__ void DiffX_GPU(float* d_U, float* d_Ux, int Nx, int alpha , float* d_stencils, int rank )
{
//indices
const int b_i = blockIdx.x*blockDim.x + threadIdx.x;
int row = b_i;
float value=0.0;
// Compute dot-product between FDM stencil weights and input vector U
int diff = 0; // diff is used for automatically taking one-sided difference near boundaries
if (row<alpha)
diff = alpha - row;
else if (row>Nx-1-alpha) // row > Nx-3 Nx-2 Nx-1
diff = Nx-1-alpha-row;
int tmp = (alpha-diff)*rank+alpha;
int tmp2 = row + diff;
int i;
for (i = -alpha; i<alpha+1; ++i)
value += d_U[tmp2+i]*d_stencils[tmp+i]; ;//(d_U[tmp2+i]);//*d_stencils[tmp+i]) ;
// Store computed approximation
d_Ux[row] = value;
}
```

I need to make a shared version. But first of all I only manage to go down from 0.23 to 0.19 milliseconds each time I call the function. Moreover, the results are wrong.

Any Idea what I am messing up :

```
__global__ void DiffX_GPU(float* d_U, float* d_Ux, int Nx, int alpha , float* d_stencils, int rank )
{
//indices
const int b_i = blockIdx.x*blockDim.x + threadIdx.x;
int row = b_i;
float value=0.0;
// Compute dot-product between FDM stencil weights and input vector U
int diff = 0; // diff is used for automatically taking one-sided difference near boundaries
if (row<alpha)
diff = alpha - row;
else if (row>Nx-1-alpha) // row > Nx-3 Nx-2 Nx-1
diff = Nx-1-alpha-row;
int tmp = (alpha-diff)*rank+alpha;
int tmp2 = row + diff;
__shared__ float s_U [Shared_block];
for( int b=0; b < Nx; b++)
s_U[threadIdx.x] = d_U[tmp2+threadIdx.x];
int i;
for (i = -alpha; i<alpha+1; ++i)
value += s_U[tmp2+i]*d_stencils[tmp+i]; ;//(d_U[tmp2+i]);//*d_stencils[tmp+i]) ;
// Store computed approximation
d_Ux[row] = value;
// synchronize
__syncthreads ();
}
```

Thanx in advance