Not quite. What I tried was the following (this is inside the kernel):

// Global 2-D thread coordinates: j is taken from the x dimension of the
// launch, i from the y dimension.
j = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;

// Each thread executes all MAX iterations of the loop itself.
//
// NOTE(review): inside one iteration the second phase reads Vx/Vz at (i+1, j)
// and (i, j+1), i.e. elements that are written by *other* threads in the
// first phase, and the next iteration's first phase reads Txx/Txz/Tzz at
// (i-1, j) and (i, j-1) written by other threads in the second phase. Nothing
// visible in this fragment orders those accesses between threads, which is a
// likely reason for the mismatch with the CPU result. __syncthreads() only
// orders threads of the same block; if the grid has more than one block,
// consider moving the loop over n to the host and launching the two phases as
// separate kernels per step (or using a cooperative-groups grid sync) --
// confirm against the actual launch configuration.
for (int n = 0; n < MAX; n++)
{
    // Phase 1: update Vx and Vz for 2 <= i, j <= IE.
    // Vx/Vz are indexed with IE as the third INDEX argument, all other
    // arrays with IB (INDEX is defined outside this fragment).
    if (j >= 2 && j <= IE && i >= 2 && i <= IE)
    {
        Vx[INDEX(i,j,IE)] = Vx[INDEX(i,j,IE)] + c1[INDEX(i,j,IB)] * ( (Txx[INDEX(i,j,IB)]-Txx[INDEX(i-1,j,IB)]) + (Txz[INDEX(i,j,IB)]-Txz[INDEX(i,j-1,IB)]) );

        Vz[INDEX(i,j,IE)] = Vz[INDEX(i,j,IE)] + c1[INDEX(i,j,IB)] * ( (Txz[INDEX(i,j,IB)]-Txz[INDEX(i-1,j,IB)]) + (Tzz[INDEX(i,j,IB)]-Tzz[INDEX(i,j-1,IB)]) );
    }

    // Phase 2: update Txx, Tzz and Txz for 2 <= i, j <= IE-1, using the
    // forward differences of Vx/Vz towards (i+1, j) and (i, j+1).
    if (j >= 2 && j <= IE-1 && i >= 2 && i <= IE-1)
    {
        Txx[INDEX(i,j,IB)] = Txx[INDEX(i,j,IB)] + c2[INDEX(i,j,IB)] *(Vx[INDEX(i+1,j,IE)]-Vx[INDEX(i,j,IE)]) + c3[INDEX(i,j,IB)] *(Vz[INDEX(i,j+1,IE)]-Vz[INDEX(i,j,IE)]);

        Tzz[INDEX(i,j,IB)] = Tzz[INDEX(i,j,IB)] + c2[INDEX(i,j,IB)] *(Vz[INDEX(i,j+1,IE)]-Vz[INDEX(i,j,IE)]) + c3[INDEX(i,j,IB)] *(Vx[INDEX(i+1,j,IE)]-Vx[INDEX(i,j,IE)]);

        Txz[INDEX(i,j,IB)] = Txz[INDEX(i,j,IB)] + c4[INDEX(i,j,IB)] * ( (Vx[INDEX(i,j+1,IE)]-Vx[INDEX(i,j,IE)]) + (Vz[INDEX(i+1,j,IE)]-Vz[INDEX(i,j,IE)]) );
    }

    // For the first dd iterations, overwrite the element at (IE/2, JE/2) of
    // Tzz and Txx with source[n]. This is not guarded by i/j, so every
    // thread performs the same two stores.
    if (n < dd)
    {
        Tzz[INDEX((IE/2),(JE/2),IB)] = source[n];
        Txx[INDEX((IE/2),(JE/2),IB)] = source[n];
    }
}

This is how I implemented my kernel, and it gave me wrong results when I compared them with the CPU results.