CUDA Jacobi Method Using Shared memory

I have a question about Jacobi iteration in shared memory. Basically, I have a global memory kernel and using a constant memory to optimize the kernel, the size of 4K, the result from 220s down to 30s by using constant memory. However, when I try to use shared and the constant together but the result is like 28s, I am not sure why. Here are the kernels below:

Given a konwn, diagonally dominat matrix A and a Known vector b, we aim to find the vector x that satisfies the following equation:
//
// Ax = b
//
// first split the matrix A into the diagonal D and the remainder R:
//
// (D + R)x = b
// then rearrange to form an iterative solution:
//
// x' = (b - Rx) / D
// In this program

// sigma += A[j][j] * x[j]
// x_next = (b - sigma * x_prev) / A[i][i]
// so the basic idea are the same as (D + R)x = b,
// and x' = (b- Rx) / D.

Here is the basic principle of the Jacobi method Here is the basic principle of the Jacobi method

void jacobiOnHost(float* x_next, float* A, float* x_now, float* b_h, int Ni, int Nj)
{
    int i,j;
    float sigma;
    
    for (i=0; i<Ni; i++)
    {
        sigma = 0.0;
        for (j=0; j<Nj; j++)
        {
            if (i != j)
                sigma += A[i*Nj + j] * x_now[j]; // From the
            // argothum sigma is the Rx, and also matrix A is
            // seperated into the Nj + j and Ni + i
        }
        x_next[i] = (b_h[i] - sigma) / A[i*Ni + i];
    }
}

This is the serial program This is the serial program

// Device version of the Jacobi method
__global__ void jacobiUnOptimizedOnDevice(float* x_next_u, float* A_u, float* x_now_u, float* b_u, int Ni, int Nj)
{
    // Optimization step 1: tiling
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    
    if (idx < Ni)
    {
        float sigma = 0.0;
        
        int idx_Ai = idx*Nj;
        
        for (int j=0; j<Nj; j++)
            if (idx != j)
                sigma += A_u[idx_Ai + j] * x_now_u[j];
        x_next_u[idx] = (b_u[idx] - sigma) / A_u[idx_Ai + idx];
    }
}

This is a normal kernel using global memory This is the normal kernel using global memory

__constant__ float b[512];
__global__ void jacobiConstantOnDevice(float* d_x_next, float* d_A, float* d_x_now,  int Ni, int Nj)
{
    // Optimization step 1: tiling
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    
    if (i < Ni)
    {
        float sigma = 0.0;
        
        int idx_Aci = i*Nj;
        
        for (int j=0; j<Nj; j++)
            if (i != j)
                sigma += d_A[idx_Aci + j] * d_x_now[j];
        d_x_next[i] = (b[i] - sigma) / d_A[idx_Aci + i];
        
    }
}

I put the vector b in the constant memory I put the vector b in the constant memory and the performance is increasing much.

#define Tile_Width 32
__constant__ float b_s[1024];
__global__ void jacobiOptimizedOnDevice(float* d_x_next, float* d_A, float* d_x_now,  int Ni, int Nj)
{
    __shared__ float xdsn[Tile_Width];
    __shared__ float xdsx[Tile_Width];
    // Optimization step 1: tiling
    //read the matrix tile into shared memory
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int xIndex = bx * Tile_Width + tx;
    
    int idx = xIndex * Tile_Width + threadIdx.x;
    if (idx < Ni) {
        float sigma = 0.0;
        int idx_Ai = idx * Nj;
        for (int j=0; j<Tile_Width; j++) {
            if (idx != j) {
                xdsn[tx] = d_x_now[idx*Tile_Width];
                __syncthreads();
                sigma += d_A[idx_Ai + j] * xdsn[tx];
                
                xdsx[tx] = d_x_next[idx * Tile_Width];
                
                xdsx[tx] = (b_s[idx] - sigma) / d_A[idx_Ai + idx];
                
                
                
            }
            
        }
        for (int k=0; k<Ni; k++) {
            d_x_next[k* Ni] = xdsx[tx];
        }
        
    }
    __syncthreads();
    
    
}

Put two x vectors into shared memory When I put the two x vectors into the shared memory the execution time is not decrease much just 2s, I am not sure why.

The size of the kernel I use the kernel size as below, I am not sure of this part. And if someone has any advice please tell me. Thank you so much.

#include<stdio.h>
#include<stdlib.h>
#include<getopt.h>
#include <assert.h>
#include <cuda.h>
#include <time.h>
#define Tile_Width 32
///////////////////////////////////////////////////////////////////////////////////////
// This program made for Optimized the Jacobi Method by using the CUDA GPU Computing //
///////////////////////////////////////////////////////////////////////////////////////
// ************* Kernel 1 is unoptimized kernel, Kernel 2 is optimized kernel.*********
// The basic idea of this program is using the CUDA global kernel to setup a Jacobi Method solver.
// 1. Implementation of the iterative Jacobi Method.
//      Given a konwn, diagonally dominat matrix A and a Known vector b, we aim to find the vector x that satisfies the following equation:
//
// Ax = b
//
// first split the matrix A into the diagonal D and the remainder R:
//
// (D + R)x = b
// then rearrange to form an iterative solution:
//
// x' = (b - Rx) / D
// In this program

// sigma += A[j][j] * x[j]
// x_next = (b - sigma * x_prev) / A[i][i]
// so the basic idea are the same as (D + R)x = b,
// and x' = (b- Rx) / D.



static char* program_name;

// 2. This part is for a spcific matrix and read files from the local storage.
// However, for simplified the program and I skip the reading file part and
// set all the values to the random.

void print_usage (FILE* stream, int exit_code)
{
    fprintf (stream, "Usage:  %s options\n", program_name);
    fprintf (stream,
             "  -h  --help             Display this usage information.\n"
             "  -f  --file filename    File containing coefficient matrix.\n"
             "  -i  --Ni int           Number of elements in Y direction (default=512).\n"
             "  -j  --Nj int           Number of elements in X direction (default=512).\n"
             "  -n  --iterations int   Number of iterations (default=10000).\n"
             "  -k  --kernel [1,2]     1: unoptimized, 2: optimized kernel (default).\n"
             "  -t  --tilesize int     Size of each thread block in kernel 2 (default=4).\n");
    exit (exit_code);
}
// 3. This part is the control group this is a serial programming by using this
// method. It's running with the parallel programming together, to see the
// differences of execution time between the host and the device.

// Host version of the Jacobi method
void jacobiOnHost(float* x_next, float* A, float* x_now, float* b_h, int Ni, int Nj)
{
    int i,j;
    float sigma;
    
    for (i=0; i<Ni; i++)
    {
        sigma = 0.0;
        for (j=0; j<Nj; j++)
        {
            if (i != j)
                sigma += A[i*Nj + j] * x_now[j]; // From the
            // argothum sigma is the Rx, and also matrix A is
            // seperated into the Nj + j and Ni + i
        }
        x_next[i] = (b_h[i] - sigma) / A[i*Ni + i];
    }
}



// Device version of the Jacobi method
__global__ void jacobiUnOptimizedOnDevice(float* x_next_u, float* A_u, float* x_now_u, float* b_u, int Ni, int Nj)
{
    // Optimization step 1: tiling
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    
    if (idx < Ni)
    {
        float sigma = 0.0;
        
        int idx_Ai = idx*Nj;
        
        for (int j=0; j<Nj; j++)
            if (idx != j)
                sigma += A_u[idx_Ai + j] * x_now_u[j];
        x_next_u[idx] = (b_u[idx] - sigma) / A_u[idx_Ai + idx];
    }
}

// Optimized device version of the Jacobi method
//
// 4. This is the method of Jacobi Method under the computing of the GPU.
// I named the Jacobi Optimized On Devece.
// However, I tried to put the matrix into the Constant and Shared Memory, and
// the matrix is all random values, I skiped the constant memory, and directly
// using the global memory.
#define Tile_Width 32
__constant__ float b_s[1024];
__global__ void jacobiOptimizedOnDevice(float* d_x_next, float* d_A, float* d_x_now,  int Ni, int Nj)
{
    __shared__ float xdsn[Tile_Width];
    __shared__ float xdsx[Tile_Width];
    // Optimization step 1: tiling
    //read the matrix tile into shared memory
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int xIndex = bx * Tile_Width + tx;
    
    int idx = xIndex * Tile_Width + threadIdx.x;
    if (idx < Ni) {
        float sigma = 0.0;
        int idx_Ai = idx * Nj;
        for (int j=0; j<Tile_Width; j++) {
            if (idx != j) {
                xdsn[tx] = d_x_now[idx*Tile_Width];
                __syncthreads();
                sigma += d_A[idx_Ai + j] * xdsn[tx];
                
                xdsx[tx] = d_x_next[idx * Tile_Width];
                
                xdsx[tx] = (b_s[idx] - sigma) / d_A[idx_Ai + idx];
                
                
                
            }
            
        }
        for (int k=0; k<Ni; k++) {
            d_x_next[k* Ni] = xdsx[tx];
        }
        
    }
    __syncthreads();
    
    
}
// For the agorithum,



int main(int argc, char *argv[])
{
    // initialize timing variables
    time_t start, end, start_h, end_h, start_d, end_d;
    float t_full, t_host, t_dev;
    
    start=clock();
    
    // initialize data variables
    float *x_now, *x_next, *A, *b_h, *x_h, *x_d;
    //the data in un_optimize kernel
    float *x_next_u, *x_now_u, *A_u, *b_u;
    // the data in optimized kernel
    float *d_x_now, *d_x_next, *d_A, *b;
    
    // initialize parameter variables
    int N, Ni, Nj, iter, kernel, tileSize;
    int ch;
    int i,k;
    char* fname; // For the specific matrix and the matrix from the Files
    // FILE* file;
    
    // Argument parsing
    static struct option long_options[] =
    {
        {"file", required_argument, NULL, 'f'},
        {"Ni", optional_argument, NULL, 'i'},
        {"Nj", optional_argument, NULL, 'j'},
        {"iterations", optional_argument, NULL, 'n'},
        {"kernel", optional_argument, NULL, 'k'},
        {"tilesize", optional_argument, NULL, 't'},
        {"help", optional_argument, NULL, 'h'},
        {NULL, 0, NULL, 0}
    };
    // 5. Setup the size of the Matrix, and the maximum iteration times and the
    // kernel and the tileSize
    //
    program_name = argv[0];
    Ni=1024, Nj=1024, iter=10000, kernel=2, tileSize=4;
    ch=0;
    
    while ((ch = getopt_long(argc, argv,"f:i:j:n:k:h", long_options, NULL)) != -1) {
        switch (ch) {
            case 'f' : fname = optarg;
                break;
            case 'i' : Ni = atoi(optarg);
                break;
            case 'j' : Nj = atoi(optarg);
                break;
            case 'n' : iter = atoi(optarg);
                break;
            case 'k' : kernel = atoi(optarg);
                break;
            case 't' : tileSize = atoi(optarg);
                break;
            case 'h': print_usage(stderr, 1);
                exit(EXIT_FAILURE);
            case '?': print_usage(stderr, 1);
                exit(EXIT_FAILURE);
            default:
                abort();
        }
    }
    
    N = Ni * Nj;
    
    
    printf("\nRunning Jacobi method:\n");
    printf("======================\n\n");
    //printf("Coefficient matrix given in file: \n%s\n\n", fname);
    printf("Parameters:\n");
    printf("N=%d, Ni=%d, Nj=%d, ", N, Ni, Nj);
    printf("iterations=%d, kernel=%d, tilesize=%d\n", iter,kernel,tileSize);
    
    
    // Allocate memory on host
    x_next = (float *) malloc(Ni*sizeof(float));
    A = (float *) malloc(N*sizeof(float));
    x_now = (float *) malloc(Ni*sizeof(float));
    b_h = (float *) malloc(Ni*sizeof(float));
    x_h = (float *) malloc(Ni*sizeof(float));
    x_d = (float *) malloc(Ni*sizeof(float));
    
    // Initialize result vector x
    for (i=0; i<Ni; i++)
    {
        x_now[i] = 0;
        x_next[i] = 0;
    }
    
    // 6. Read coefficient matrix from file, but here I made it
    // like the random value
    /* file = fopen(fname, "r");
     if (file == NULL)
     exit(EXIT_FAILURE);
     char *line;
     size_t len = 0;
     i=0;
     while ((getline(&line, &len, file)) != -1)
     {
     if (i<N)
     A[i] = atof(line);
     else
     b[i-N] = atof(line);
     i++;
     }*/
    
    
    start_h = clock();
    
    // Run "iter" iterations of the Jacobi method on HOST
    for (k=0; k<iter; k++)
    {
        if (k%2)
            jacobiOnHost(x_now, A, x_next, b_h, Ni, Nj);
        else
            jacobiOnHost(x_next, A, x_now, b_h, Ni, Nj);
        for (i=0; i<Nj; i++)
            x_now[i] = x_next[i];
    }
    
    end_h = clock();
    
    // Save result from host in x_h
    for (i=0; i<Nj; i++)
        x_h[i] = x_next[i];
    
    
    // Re-initialize result vector x for device computation
    for (i=0; i<Ni; i++)
    {
        x_now[i] = 0;
        x_next[i] = 0;
        //b_h[i] = rand();
    }
    for (int i=0; i < Ni; i++)
    {
        b_h[i]=rand();
        for (int j=0; j < Nj; j++)
            A[i*Ni + j] = rand();
    }
    
    
    // Allocate memory on the device
    // Unoptimize
    assert(cudaSuccess == cudaMalloc((void **) &x_next_u, Ni*sizeof(float)));
    assert(cudaSuccess == cudaMalloc((void **) &A_u, N*sizeof(float)));
    assert(cudaSuccess == cudaMalloc((void **) &x_now_u, Ni*sizeof(float)));
    assert(cudaSuccess == cudaMalloc((void **) &b_u, Ni*sizeof(float)));
    // Optimized
    assert(cudaSuccess == cudaMalloc((void **) &d_x_next, Ni*sizeof(float)));
    assert(cudaSuccess == cudaMalloc((void **) &d_A, N*sizeof(float)));
    assert(cudaSuccess == cudaMalloc((void **) &d_x_now, Ni*sizeof(float)));
    assert(cudaSuccess == cudaMalloc((void **) &b, Ni*sizeof(float)));
    //assert(cudaSuccess == cudaMalloc((void **) &b_s, Ni*sizeof(float)));
    
    
    // Copy data -> device
    // 1.Un-Optimize
    cudaMemcpy(x_next_u, x_next, sizeof(float)*Ni, cudaMemcpyHostToDevice);
    cudaMemcpy(A_u, A, sizeof(float)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(x_now_u, x_now, sizeof(float)*Ni, cudaMemcpyHostToDevice);
    cudaMemcpy(b_u, b_h, sizeof(float)*Ni, cudaMemcpyHostToDevice);
    
    
    //2. Optimized
    cudaMemcpy(d_x_next, x_next, sizeof(float)*Ni, cudaMemcpyHostToDevice);
    cudaMemcpy(d_A, A, sizeof(float)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_x_now, x_now, sizeof(float)*Ni, cudaMemcpyHostToDevice);
    cudaMemcpy(b, b_h, sizeof(float)*Ni, cudaMemcpyHostToDevice);
    
    // Compute grid and block size.
    // Un-optimized kernel
    //int blockSize = Ni;
    // int nBlocks = 1;
    
    // Optimized kernel
    int nTiles = Ni/tileSize + (Ni%tileSize == 0?0:1);
    int gridHeight = Nj/tileSize + (Nj%tileSize == 0?0:1);
    int gridWidth = Ni/tileSize + (Ni%tileSize == 0?0:1);
    printf("w=%d, h=%d\n",gridWidth,gridHeight);
    dim3 dGrid(gridHeight, gridWidth),
    dBlock(tileSize, tileSize);
    
    dim3 dimGrid(16,4);
    dim3 dimBlock(4,1);
    
    // 8. Start counting the running time for kernel
    start_d = clock();
    
    // Run "iter" iterations of the Jacobi method on DEVICE
    if (kernel == 1)
    {
        printf("Using un-optimized kernel.\n");
        for (k=0; k<iter; k++)
        {
            if (k%2)
                jacobiUnOptimizedOnDevice <<< nTiles, tileSize >>> (x_now_u, A_u, x_next_u, b_u, Ni, Nj);
            else
                
                jacobiUnOptimizedOnDevice <<< nTiles, tileSize >>> (x_now_u, A_u, x_next_u, b_u, Ni, Nj);
            cudaMemcpy(x_now_u, x_next_u, sizeof(float)*Ni, cudaMemcpyDeviceToDevice);
        }
    }
    else
    {
        printf("Using Shared Memory to optimize kernel.\n");
        for (k=0; k<iter; k++)
        {
            if (k%2)
                jacobiOptimizedOnDevice <<< dimGrid, dimBlock >>> (d_x_now,  d_A, d_x_next, Ni, Nj);
            else
                jacobiOptimizedOnDevice <<< dimGrid, dimBlock >>> (d_x_now,  d_A, d_x_next, Ni, Nj);
            //cudaMemcpy(d_x_now, d_x_next, sizeof(float)*Ni, cudaMemcpyDeviceToDevice);
        }
    }
    
    end_d = clock();
    
    
    // Data <- device
    cudaMemcpy(x_d, d_x_next, sizeof(float)*Ni, cudaMemcpyDeviceToHost);
    
    // Free memory
    free(x_next); free(A); free(x_now); free(b_h);
    cudaFree(d_x_next); cudaFree(d_A); cudaFree(d_x_now); cudaFree(b);
    cudaFree(x_now_u); cudaFree(x_next_u); cudaFree(A_u); cudaFree(b_u);
    
    end=clock();
    
    
    
    
    // 9. Check the mismatch of the x_now and the x_prev,
    // and print the result.
    printf("\nResult after %d iterations:\n",iter);
    float err = 0.0;
    for (i=0; i < Ni; i++)
    {
        /*printf("x_h[%d]=%f\n",i,x_h[i]);
         printf("x_d[%d]=%f\n",i,x_d[i]);*/
        // *************************************
        //These two printing the iteration results on each points.
        // For example, if the matrix size is 512 * 512
        // then it's gonna print all 0 - 511 for both x_h
        // and x_d.
        // And here we need to do a 4k * 4k, but the result is
        // too long and cover all the imformation, so I just
        // skip this part.
        
        
        
        err += abs(x_h[i] - x_d[i]) / Ni;
    }
    
    //
    printf("x_h[%d]=%f\n",0,x_h[0]);
    printf("x_d[%d]=%f\n",0,x_d[0]);
    t_full = ((float)end - (float)start) / CLOCKS_PER_SEC;
    t_host = ((float)end_h - (float)start_h) / CLOCKS_PER_SEC;
    t_dev = ((float)end_d - (float)start_d) / CLOCKS_PER_SEC;
    printf("\nTiming:\nTotal: %f\nSerial: %f\nParallel: %f\n\n", t_full, t_host, t_dev);
    printf("Relative error: %f\n", err);
    
    printf("\nProgram terminated successfully.\n");
    return 0;
}

// CUDA Finalized :)
// All this programming shows the results for different size of
// the input matrix A and the vector b.
// From the results, the size increasing the execution time
// increased. I made 11 points to verify the results, and from
// 8 * 8 to 4k * 4k, from the very beginning the serial
// programming is faster than the parallel programming, due to
// the communication in the GPU. However, the size increasing
// the advantage of GPU processing has arisen. At the very end
// the size of 4K * 4K the execution time for the serial is three
// times bigger than parallel. The results are reasonable.

// The reason why there is a warning because I didn't use read
// file function, so I just leave it.

Here is all my code please let me know if there is any problem thank you so much

Referring to the complete code, this:

dim3 dimGrid(16,4);
    dim3 dimBlock(4,1);

is too small to effectively utilize any GPU.

Not that its very important in the overall scheme here, but what is the purpose of the first line here:

xdsx[tx] = d_x_next[idx * Tile_Width];
                
                xdsx[tx] = (b_s[idx] - sigma) / d_A[idx_Ai + idx];

You are writing a value and then immediately overwriting it with something else?

Here is an example of jacobi on CUDA:

http://developer.download.nvidia.com/compute/developertrainingmaterials/samples/cuda_c/Jacobi_Optimization.zip

Thanks for your reply, and I rewrite the code like this

#define Tile_Width 32
__constant__ float b_s[512];
__global__ void jacobiOptimizedOnDevice(float* d_x_next, float* d_A, float* d_x_now,  int Ni, int Nj)
{
    __shared__ float xdsn[Tile_Width];
    __shared__ float xdsx[Tile_Width];
    // Optimization step 1: tiling
    //read the matrix tile into shared memory
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int xIndex = bx * Tile_Width + tx;
    
    int idx = xIndex * Tile_Width + threadIdx.x;
    if (idx < Ni) {
        float sigma = 0.0;
        int idx_Ai = idx * Nj;
        for (int j=0; j<Tile_Width; j++) {
            if (idx != j) {
                xdsn[tx] = d_x_now[idx*Tile_Width];
                xdsx[tx] = d_x_next[idx * Tile_Width];

                sigma += d_A[idx_Ai + j] * xdsn[tx];
                
                
                
                xdsx[tx] = (b_s[idx] - sigma) / d_A[idx_Ai + idx];
                
                
                
            }
            
        }
        for (int k=0; k<Ni; k++) {
            d_x_next[k* Ni] = xdsx[tx];
        }
        
    }
    
    
    
}

My basic idea is set up the vectors in the shared memory, then after computing and give back to the global memory to print the result. But I am not sure do I need to give the value back to global memory?