Red black relaxation

Hi,

We implemented a multi-grid solver in CUDA using red-black relaxation (RB). We tried to optimize the RB sweep using shared memory, but the code ran more slowly.

RB means that we run a 3x3 mask (convolution) for smoothing a matrix in a checkers pattern, first the red squares, then the black.

Since each cell needs its one-ring neighbors, we padded the matrix with a border of thickness 1. We can see the first problem at a glance: the block we work on is 32x16, but with the padding it’s 34x18, which ruins the byte alignment. So first we should probably try a smaller working block of 30x14, so that the padded read from global memory is exactly 32x16 (do we still need cudaMallocPitch in that case?). But even this doesn’t explain why the results got worse when we used shared memory — after all, we reduced global-memory reads by roughly a factor of 4.

Here are a few versions that we tried.

Not using shared memory:

// One red-black Gauss-Seidel smoothing pass over the interior of u.
//
// Updates only cells whose parity (i+j) % 2 matches `color`; boundary
// (frame) cells are never written. Stencil:
//   U(i,j) = [F(i,j)*hx^2*hy^2 + hx^2*(U(i,j-1)+U(i,j+1))
//             + hy^2*(U(i-1,j)+U(i+1,j))] / (2*(hx^2+hy^2))
//
// u, rhs  : sizeX x sizeY arrays, row index i strided by sizeY
// dx, dy  : grid spacings (unused here; kept for a uniform kernel signature)
// dxdx/dydy: precomputed dx*dx and dy*dy
// color   : 0 or 1, selects the red or the black sub-lattice
//
// NOTE(review): consecutive threads vary threadIdx.x, hence i, and the
// linear index i*sizeY + j then strides by sizeY — so a warp's global
// loads look uncoalesced. Worth confirming against the launch config.
__global__ void dev_relaxGS (REAL* u, REAL* rhs, int sizeX, int sizeY, REAL dx, REAL dy, REAL dxdx, REAL dydy, int color) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: interior cells of the requested color only.
    const bool interior = (i > 0) && (i < sizeX - 1) && (j > 0) && (j < sizeY - 1);
    if (!interior || ((i + j) % 2 != color))
        return;

    const int idx = i * sizeY + j;

    const REAL f    = rhs[idx];          // rhs[i][j]
    const REAL u_jm = u[idx - 1];        // u[i][j-1]
    const REAL u_jp = u[idx + 1];        // u[i][j+1]
    const REAL u_im = u[idx - sizeY];    // u[i-1][j]
    const REAL u_ip = u[idx + sizeY];    // u[i+1][j]

    u[idx] = (f * dxdx * dydy + dxdx * (u_jm + u_jp) + dydy * (u_im + u_ip))
             / (2 * (dxdx + dydy));
}

Ran for 105.960000s.

Using shared memory for the reads and calculation, but writing the result back directly to global memory:

// Red-black Gauss-Seidel pass that stages the block's tile of u (plus a
// 1-cell halo ring) in shared memory, then writes results straight back
// to global memory.
//
// Expected launch: 2D grid of 2D blocks, blockDim.x <= 32 and
// blockDim.y <= 16 (the static tile below is sized for a 32x16 block).
// u and rhs are sizeX x sizeY, row index i strided by sizeY.
//
// Fixes vs. the original:
//  - the tile load was unconditional, so any thread past the domain edge
//    (grid overshooting sizeX/sizeY) read u out of bounds; all loads are
//    now guarded by an in-domain test;
//  - halo conditions hard-coded the 32x16 block shape (indices 33 / 17);
//    they now use blockDim, which is identical for a 32x16 launch but no
//    longer silently wrong for smaller blocks.
__global__ void dev_sharedDirectWrite_RelaxGS (REAL* u, REAL* rhs, int sizeX, int sizeY, REAL dx, REAL dy, REAL dxdx, REAL dydy, int color) {
    __shared__ REAL sharedU[34][18]; // block tile + halo; requires blockDim <= 32x16

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;

    // Tile coordinates, shifted by 1 to leave room for the halo ring.
    int I = 1 + threadIdx.x;
    int J = 1 + threadIdx.y;

    // Load the tile: each in-domain thread loads its own cell; threads on
    // the block's edge additionally fetch the adjacent halo cell (corner
    // halo cells are never read by the 5-point stencil, so they are not
    // loaded).
    if (i < sizeX && j < sizeY) {
        sharedU[I][J] = u[i*sizeY + j];

        if (threadIdx.x == 0 && i - 1 >= 0) {
            sharedU[0][J] = u[(i-1)*sizeY + j];
        }
        if (threadIdx.x == blockDim.x - 1 && i + 1 <= sizeX - 1) {
            sharedU[I+1][J] = u[(i+1)*sizeY + j];
        }
        if (threadIdx.y == 0 && j - 1 >= 0) {
            sharedU[I][0] = u[i*sizeY + j-1];
        }
        if (threadIdx.y == blockDim.y - 1 && j + 1 <= sizeY - 1) {
            sharedU[I][J+1] = u[i*sizeY + j+1];
        }
    }

    // Barrier is outside all conditionals: every thread of the block,
    // including out-of-domain ones, reaches it.
    __syncthreads();

    // Update interior cells of the requested color only.
    // U(i,j) = [F*hx^2*hy^2 + hx^2*(U(i,j-1)+U(i,j+1))
    //           + hy^2*(U(i-1,j)+U(i+1,j))] / (2*(hx^2+hy^2))
    if ((0 < i && i < sizeX-1) && (0 < j && j < sizeY-1) && ((i+j) % 2 == color)) {
        REAL rhs_ij = rhs [i*sizeY + j]; //rhs [i,j]
        REAL u_ijm  = sharedU [I][J-1];  //u [i,j-1]
        REAL u_ijp  = sharedU [I][J+1];  //u [i,j+1]
        REAL u_imj  = sharedU [I-1][J];  //u [i-1,j]
        REAL u_ipj  = sharedU [I+1][J];  //u [i+1,j]

        u [i*sizeY + j] = (rhs_ij*dxdx*dydy + dxdx*(u_ijm + u_ijp) + dydy*(u_imj + u_ipj)) / (2 * (dxdx + dydy));
    }
}

Ran for 122.884000s.

Reading and writing to shared memory:

// Variant of the shared-memory red-black pass that also stages the RESULT
// in shared memory before copying it back to global memory.
//
// Expected launch: 2D grid of 2D blocks, blockDim.x <= 32 and
// blockDim.y <= 16 (the static tile below is sized for a 32x16 block).
// u and rhs are sizeX x sizeY, row index i strided by sizeY.
//
// Fixes vs. the original:
//  - the tile load was unconditional, so threads past the domain edge read
//    u out of bounds; loads are now guarded;
//  - halo conditions hard-coded the 32x16 block shape (33 / 17); they now
//    use blockDim (identical behavior for a 32x16 launch);
//  - the __syncthreads() the original placed between the stencil update
//    and the copy-back was removed: each thread reads back only the
//    sharedU[I][J] value it wrote itself, so there is no cross-thread
//    dependency to order, and the extra barrier was pure overhead.
__global__ void dev_sharedRelaxGS (REAL* u, REAL* rhs, int sizeX, int sizeY, REAL dx, REAL dy, REAL dxdx, REAL dydy, int color) {
    __shared__ REAL sharedU[34][18]; // block tile + halo; requires blockDim <= 32x16

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;

    // Tile coordinates, shifted by 1 to leave room for the halo ring.
    int I = 1 + threadIdx.x;
    int J = 1 + threadIdx.y;

    // Load the tile: each in-domain thread loads its own cell; threads on
    // the block's edge additionally fetch the adjacent halo cell (corner
    // halo cells are never read by the 5-point stencil).
    if (i < sizeX && j < sizeY) {
        sharedU[I][J] = u[i*sizeY + j];

        if (threadIdx.x == 0 && i - 1 >= 0) {
            sharedU[0][J] = u[(i-1)*sizeY + j];
        }
        if (threadIdx.x == blockDim.x - 1 && i + 1 <= sizeX - 1) {
            sharedU[I+1][J] = u[(i+1)*sizeY + j];
        }
        if (threadIdx.y == 0 && j - 1 >= 0) {
            sharedU[I][0] = u[i*sizeY + j-1];
        }
        if (threadIdx.y == blockDim.y - 1 && j + 1 <= sizeY - 1) {
            sharedU[I][J+1] = u[i*sizeY + j+1];
        }
    }

    // Barrier is outside all conditionals: every thread of the block,
    // including out-of-domain ones, reaches it.
    __syncthreads();

    // Update interior cells of the requested color only, staging the new
    // value in shared memory and then copying it out to global memory.
    // U(i,j) = [F*hx^2*hy^2 + hx^2*(U(i,j-1)+U(i,j+1))
    //           + hy^2*(U(i-1,j)+U(i+1,j))] / (2*(hx^2+hy^2))
    if ((0 < i && i < sizeX-1) && (0 < j && j < sizeY-1) && ((i+j) % 2 == color)) {
        REAL rhs_ij = rhs [i*sizeY + j]; //rhs [i,j]
        REAL u_ijm  = sharedU [I][J-1];  //u [i,j-1]
        REAL u_ijp  = sharedU [I][J+1];  //u [i,j+1]
        REAL u_imj  = sharedU [I-1][J];  //u [i-1,j]
        REAL u_ipj  = sharedU [I+1][J];  //u [i+1,j]

        sharedU [I][J] = (rhs_ij*dxdx*dydy + dxdx*(u_ijm + u_ijp) + dydy*(u_imj + u_ipj)) / (2 * (dxdx + dydy));

        u [i*sizeY + j] = sharedU [I][J]; // store the staged result back into u
    }
}

Ran for 135.103000s.

We suspect it’s related to the global-memory access pattern: in all versions the index is u[i*sizeY + j], where i comes from threadIdx.x — so consecutive threads of a warp access addresses sizeY elements apart and the loads are not coalesced.

I think the shared-memory versions lose their expected benefit for the same reason: the initial loads that fill the shared tile pay that same uncoalesced cost, on top of the extra __syncthreads() overhead.