2D jacobi CUDA problem

I once found this code and topic on the internet. But when I read it, I found some problems, at least my understanding for this code so far. Hopefully anyone could give me some hints or help on this. Thx!

Code below:

#include <cuda_runtime.h>

#include <cutil.h>

#include <stdlib.h>

#include <stdio.h>

#define BLOCK_SIZE_X 16

#define BLOCK_SIZE_Y 16

global void JacobiRelaxationGPU(float *u_d, float *f_d, int ArraySizeX, int ArraySizeY, float h)

{

    int tx = threadIdx.x;

    int ty = threadIdx.y;

    int bx = blockIdx.x*(BLOCK_SIZE_X-2)+1;

    int by = blockIdx.y*(BLOCK_SIZE_X-2)+1;

    int x = tx + bx;

    int y = ty + by;

shared float u_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];

u_sh[tx][ty] = u_d[x+y*ArraySizeX]; // I don’t understand here. Since x: [1, 1024] and y: [1:1024], if x=1024 and y=1024, and ArraySizeX=1024,

                                                           // the index at this moment exceeds the boundary of the matrix u_d upperbound 1024*1024!

__syncthreads();

if (tx>0 && tx<BLOCK_SIZE_X-1 && ty>0 && ty<BLOCK_SIZE_Y-1)

    {

        u_d[x+y*ArraySizeX] = (1.0/4.0)*(u_sh[tx+1][ty]+u_sh[tx-1][ty]+u_sh[tx][ty+1]+u_sh[tx][ty-1]-f_d[x+y*ArraySizeX]*h*h);

    }

}

int main()

{

    double nResult = 0;

float *u_h;

    float *f_h;

    float *u_d;

    float *f_d;

    int ArraySizeX = 1024*16;

    int ArraySizeY = 1024*16;

    long size = ArraySizeX * ArraySizeY * sizeof(float);

    long mysize = ArraySizeX * ArraySizeY;

//allocate arrays on host

    u_h = (float*)malloc(size);

    f_h = (float*)malloc(size);

//allocate arrays on device

    cudaMalloc((void**)&u_d, size);

    cudaMalloc((void**)&f_d, size);

long i=0;

    for (i=0; i<mysize; i++)  // matrix initialization

    {

        u_h[i] = 3;

        f_h[i] = 4;

    }

cudaMemcpy(u_d, u_h, size, cudaMemcpyHostToDevice);

    cudaMemcpy(f_d, f_h, size, cudaMemcpyHostToDevice);

gettimeofday(&afterLoadtp1, NULL);

int nBlocksX = ArraySizeX/(BLOCK_SIZE_X-2);

    int nBlocksY = ArraySizeY/(BLOCK_SIZE_Y-2);

dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);

    dim3 dimGrid(nBlocksX, nBlocksY);

gettimeofday(&exetp1, NULL);

    JacobiRelaxationGPU<<<dimGrid, dimBlock>>>(u_d, f_d, ArraySizeX, ArraySizeY, 1);

    gettimeofday(&beforeLoadtp2, NULL);

cudaMemcpy(u_h, u_d, size, cudaMemcpyDeviceToHost);

free(u_h);

    free(f_h);

    cudaFree(u_d);

    cudaFree(f_d);

    printf("**********Execuation is done*********!!!\n");