I once found this code and topic on the internet. But when I read it, I found some problems, at least my understanding for this code so far. Hopefully anyone could give me some hints or help on this. Thx!
Code below:
#include <cuda_runtime.h>
#include <cutil.h>
#include <stdlib.h>
#include <stdio.h>
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
global void JacobiRelaxationGPU(float *u_d, float *f_d, int ArraySizeX, int ArraySizeY, float h)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x*(BLOCK_SIZE_X-2)+1;
int by = blockIdx.y*(BLOCK_SIZE_X-2)+1;
int x = tx + bx;
int y = ty + by;
shared float u_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
u_sh[tx][ty] = u_d[x+y*ArraySizeX]; // I don’t understand here. Since x: [1, 1024] and y: [1:1024], if x=1024 and y=1024, and ArraySizeX=1024,
// the index at this moment exceeds the boundary of the matrix u_d upperbound 1024*1024!
__syncthreads();
if (tx>0 && tx<BLOCK_SIZE_X-1 && ty>0 && ty<BLOCK_SIZE_Y-1)
{
u_d[x+y*ArraySizeX] = (1.0/4.0)*(u_sh[tx+1][ty]+u_sh[tx-1][ty]+u_sh[tx][ty+1]+u_sh[tx][ty-1]-f_d[x+y*ArraySizeX]*h*h);
}
}
int main()
{
double nResult = 0;
float *u_h;
float *f_h;
float *u_d;
float *f_d;
int ArraySizeX = 1024*16;
int ArraySizeY = 1024*16;
long size = ArraySizeX * ArraySizeY * sizeof(float);
long mysize = ArraySizeX * ArraySizeY;
//allocate arrays on host
u_h = (float*)malloc(size);
f_h = (float*)malloc(size);
//allocate arrays on device
cudaMalloc((void**)&u_d, size);
cudaMalloc((void**)&f_d, size);
long i=0;
for (i=0; i<mysize; i++) // matrix initialization
{
u_h[i] = 3;
f_h[i] = 4;
}
cudaMemcpy(u_d, u_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(f_d, f_h, size, cudaMemcpyHostToDevice);
gettimeofday(&afterLoadtp1, NULL);
int nBlocksX = ArraySizeX/(BLOCK_SIZE_X-2);
int nBlocksY = ArraySizeY/(BLOCK_SIZE_Y-2);
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid(nBlocksX, nBlocksY);
gettimeofday(&exetp1, NULL);
JacobiRelaxationGPU<<<dimGrid, dimBlock>>>(u_d, f_d, ArraySizeX, ArraySizeY, 1);
gettimeofday(&beforeLoadtp2, NULL);
cudaMemcpy(u_h, u_d, size, cudaMemcpyDeviceToHost);
free(u_h);
free(f_h);
cudaFree(u_d);
cudaFree(f_d);
printf("**********Execuation is done*********!!!\n");