I have a question about Jacobi iteration in shared memory. I have a global-memory kernel, and I used constant memory to optimize it: at a 4K problem size, the runtime went from 220 s down to 30 s. However, when I try to use shared memory and constant memory together, the result is only about 28 s, and I am not sure why. Here are the kernels below:
Given a known, diagonally dominant matrix A and a known vector b, we aim to find the vector x that satisfies the following equation:
//
// Ax = b
//
// first split the matrix A into the diagonal D and the remainder R:
//
// (D + R)x = b
// then rearrange to form an iterative solution:
//
// x' = (b - Rx) / D
// In this program,
// sigma += A[i][j] * x[j]   (for j != i)
// x_next[i] = (b[i] - sigma) / A[i][i]
// so the basic idea is the same as (D + R)x = b
// and x' = (b - Rx) / D.
Here is the basic principle of the Jacobi method.
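To make the update concrete, here is one sweep on a small 2 x 2 example (illustrative numbers only, not taken from my program):
// A = [ 4  1 ]    b = [  9 ]    starting guess x = [0, 0]
//     [ 2  5 ]        [ 12 ]
//
// x'[0] = (b[0] - A[0][1]*x[1]) / A[0][0] = ( 9 - 1*0) / 4 = 2.25
// x'[1] = (b[1] - A[1][0]*x[0]) / A[1][1] = (12 - 2*0) / 5 = 2.40
//
// Repeating the sweep with x = [2.25, 2.40] gives x = [1.65, 1.50],
// which keeps moving toward the exact solution x = [11/6, 5/3].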
void jacobiOnHost(float* x_next, float* A, float* x_now, float* b_h, int Ni, int Nj)
{
int i,j;
float sigma;
for (i=0; i<Ni; i++)
{
sigma = 0.0;
for (j=0; j<Nj; j++)
{
if (i != j)
sigma += A[i*Nj + j] * x_now[j]; // in the algorithm,
// sigma accumulates Rx; A is stored row-major,
// so element (i,j) of A is at A[i*Nj + j]
}
x_next[i] = (b_h[i] - sigma) / A[i*Nj + i];
}
}
This is the serial program.
// Device version of the Jacobi method
__global__ void jacobiUnOptimizedOnDevice(float* x_next_u, float* A_u, float* x_now_u, float* b_u, int Ni, int Nj)
{
// baseline: one thread per row, all data read from global memory
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < Ni)
{
float sigma = 0.0;
int idx_Ai = idx*Nj;
for (int j=0; j<Nj; j++)
if (idx != j)
sigma += A_u[idx_Ai + j] * x_now_u[j];
x_next_u[idx] = (b_u[idx] - sigma) / A_u[idx_Ai + idx];
}
}
This is the normal kernel using global memory.
__constant__ float b[512];
__global__ void jacobiConstantOnDevice(float* d_x_next, float* d_A, float* d_x_now, int Ni, int Nj)
{
// same per-row structure, but b is read from constant memory
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < Ni)
{
float sigma = 0.0;
int idx_Aci = i*Nj;
for (int j=0; j<Nj; j++)
if (i != j)
sigma += d_A[idx_Aci + j] * d_x_now[j];
d_x_next[i] = (b[i] - sigma) / d_A[idx_Aci + i];
}
}
I put the vector b in constant memory, and the performance improved a lot.
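For completeness, the constant array gets its data from the host with cudaMemcpyToSymbol before the launch. Here is a minimal sketch against the __constant__ float b[512] declared above (b_h is the host copy of the vector, as in the host code, and it assumes Ni <= 512; the block count is just an example):
// Host-side sketch: fill the __constant__ symbol b, then launch the kernel.
cudaMemcpyToSymbol(b, b_h, Ni * sizeof(float));
jacobiConstantOnDevice<<<(Ni + 255) / 256, 256>>>(d_x_next, d_A, d_x_now, Ni, Nj);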
#define Tile_Width 32
__constant__ float b_s[1024];
__global__ void jacobiOptimizedOnDevice(float* d_x_next, float* d_A, float* d_x_now, int Ni, int Nj)
{
__shared__ float xdsn[Tile_Width];
__shared__ float xdsx[Tile_Width];
// Optimization step 1: tiling
//read a tile of the x vectors into shared memory
int bx = blockIdx.x;
int tx = threadIdx.x;
int xIndex = bx * Tile_Width + tx;
int idx = xIndex * Tile_Width + threadIdx.x;
if (idx < Ni) {
float sigma = 0.0;
int idx_Ai = idx * Nj;
for (int j=0; j<Tile_Width; j++) {
if (idx != j) {
xdsn[tx] = d_x_now[idx*Tile_Width];
__syncthreads();
sigma += d_A[idx_Ai + j] * xdsn[tx];
xdsx[tx] = d_x_next[idx * Tile_Width];
xdsx[tx] = (b_s[idx] - sigma) / d_A[idx_Ai + idx];
}
}
for (int k=0; k<Ni; k++) {
d_x_next[k* Ni] = xdsx[tx];
}
}
__syncthreads();
}
When I put the two x vectors into shared memory, the execution time decreases by only about 2 s, and I am not sure why.
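For reference, the pattern I was aiming for is the usual cooperative tile load: each block stages a Tile_Width-wide chunk of x_now in shared memory, syncs, and then every thread in the block reuses it. Below is a minimal sketch of that general pattern, not the kernel above; it assumes blockDim.x == Tile_Width, Nj a multiple of Tile_Width, and b could equally be the constant array b_s.
__global__ void jacobiSharedSketch(float* x_next, const float* A,
                                   const float* x_now, const float* b,
                                   int Ni, int Nj)
{
    __shared__ float x_tile[Tile_Width];
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // one row per thread
    float sigma = 0.0f;
    for (int t = 0; t < Nj; t += Tile_Width) {
        // every thread loads one element of the current chunk of x_now
        x_tile[threadIdx.x] = x_now[t + threadIdx.x];
        __syncthreads();
        if (i < Ni) {
            for (int j = 0; j < Tile_Width; j++)
                if (i != t + j)
                    sigma += A[i * Nj + t + j] * x_tile[j];
        }
        __syncthreads();   // don't overwrite the tile while others still read it
    }
    if (i < Ni)
        x_next[i] = (b[i] - sigma) / A[i * Nj + i];
}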
For the kernel launch size, I use the configuration shown below, and I am not sure about this part either. If anyone has any advice, please tell me. Thank you so much.
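The rule of thumb I know for sizing a one-thread-per-row kernel such as jacobiUnOptimizedOnDevice above is to round the block count up so every row is covered; a sketch, using the same names as my code:
int threadsPerBlock = Tile_Width;                                  // 32 threads per block
int blocksPerGrid = (Ni + threadsPerBlock - 1) / threadsPerBlock;  // ceil(Ni / Tile_Width)
jacobiUnOptimizedOnDevice<<<blocksPerGrid, threadsPerBlock>>>(x_next_u, A_u, x_now_u, b_u, Ni, Nj);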
#include<stdio.h>
#include<stdlib.h>
#include<getopt.h>
#include <assert.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#define Tile_Width 32
///////////////////////////////////////////////////////////////////////////////////////
// This program optimizes the Jacobi method by using CUDA GPU computing           //
///////////////////////////////////////////////////////////////////////////////////////
// ************* Kernel 1 is the unoptimized kernel, Kernel 2 is the optimized kernel. *********
// The basic idea of this program is to use CUDA kernels to set up a Jacobi method solver.
// 1. Implementation of the iterative Jacobi Method.
// Given a known, diagonally dominant matrix A and a known vector b, we aim to find the vector x that satisfies the following equation:
//
// Ax = b
//
// first split the matrix A into the diagonal D and the remainder R:
//
// (D + R)x = b
// then rearrange to form an iterative solution:
//
// x' = (b - Rx) / D
// In this program,
// sigma += A[i][j] * x[j]   (for j != i)
// x_next[i] = (b[i] - sigma) / A[i][i]
// so the basic idea is the same as (D + R)x = b
// and x' = (b - Rx) / D.
static char* program_name;
// 2. This part is for a specific matrix read from files in local storage.
// However, to simplify the program I skip the file-reading part and
// set all the values randomly.
void print_usage (FILE* stream, int exit_code)
{
fprintf (stream, "Usage: %s options\n", program_name);
fprintf (stream,
" -h --help Display this usage information.\n"
" -f --file filename File containing coefficient matrix.\n"
" -i --Ni int Number of elements in Y direction (default=512).\n"
" -j --Nj int Number of elements in X direction (default=512).\n"
" -n --iterations int Number of iterations (default=10000).\n"
" -k --kernel [1,2] 1: unoptimized, 2: optimized kernel (default).\n"
" -t --tilesize int Size of each thread block in kernel 2 (default=4).\n");
exit (exit_code);
}
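// Example invocation (illustrative; the binary name depends on how it is built):
//   ./jacobi -i 512 -j 512 -n 1000 -k 2
// runs kernel 2 on a 512 x 512 system for 1000 iterations.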
// 3. This part is the control group: a serial implementation of the same
// method. It runs together with the parallel version, to see the
// difference in execution time between the host and the device.
// Host version of the Jacobi method
void jacobiOnHost(float* x_next, float* A, float* x_now, float* b_h, int Ni, int Nj)
{
int i,j;
float sigma;
for (i=0; i<Ni; i++)
{
sigma = 0.0;
for (j=0; j<Nj; j++)
{
if (i != j)
sigma += A[i*Nj + j] * x_now[j]; // in the algorithm,
// sigma accumulates Rx; A is stored row-major,
// so element (i,j) of A is at A[i*Nj + j]
}
x_next[i] = (b_h[i] - sigma) / A[i*Nj + i];
}
}
// Device version of the Jacobi method
__global__ void jacobiUnOptimizedOnDevice(float* x_next_u, float* A_u, float* x_now_u, float* b_u, int Ni, int Nj)
{
// baseline: one thread per row, all data read from global memory
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < Ni)
{
float sigma = 0.0;
int idx_Ai = idx*Nj;
for (int j=0; j<Nj; j++)
if (idx != j)
sigma += A_u[idx_Ai + j] * x_now_u[j];
x_next_u[idx] = (b_u[idx] - sigma) / A_u[idx_Ai + idx];
}
}
// Optimized device version of the Jacobi method
//
// 4. This is the Jacobi method computed on the GPU; I named it
// jacobiOptimizedOnDevice.
// I tried to put the matrix into constant and shared memory, but since
// the matrix is all random values I skipped constant memory for the
// matrix and read it directly from global memory.
#define Tile_Width 32
__constant__ float b_s[1024];
__global__ void jacobiOptimizedOnDevice(float* d_x_next, float* d_A, float* d_x_now, int Ni, int Nj)
{
__shared__ float xdsn[Tile_Width];
__shared__ float xdsx[Tile_Width];
// Optimization step 1: tiling
//read a tile of the x vectors into shared memory
int bx = blockIdx.x;
int tx = threadIdx.x;
int xIndex = bx * Tile_Width + tx;
int idx = xIndex * Tile_Width + threadIdx.x;
if (idx < Ni) {
float sigma = 0.0;
int idx_Ai = idx * Nj;
for (int j=0; j<Tile_Width; j++) {
if (idx != j) {
xdsn[tx] = d_x_now[idx*Tile_Width];
__syncthreads();
sigma += d_A[idx_Ai + j] * xdsn[tx];
xdsx[tx] = d_x_next[idx * Tile_Width];
xdsx[tx] = (b_s[idx] - sigma) / d_A[idx_Ai + idx];
}
}
for (int k=0; k<Ni; k++) {
d_x_next[k* Ni] = xdsx[tx];
}
}
__syncthreads();
}
// Main program: set up the data, run the host and device solvers, and compare the results.
int main(int argc, char *argv[])
{
// initialize timing variables
time_t start, end, start_h, end_h, start_d, end_d;
float t_full, t_host, t_dev;
start=clock();
// initialize data variables
float *x_now, *x_next, *A, *b_h, *x_h, *x_d;
// the data for the unoptimized kernel
float *x_next_u, *x_now_u, *A_u, *b_u;
// the data in optimized kernel
float *d_x_now, *d_x_next, *d_A, *b;
// initialize parameter variables
int N, Ni, Nj, iter, kernel, tileSize;
int ch;
int i,k;
char* fname; // for reading a specific matrix from a file
// FILE* file;
// Argument parsing
static struct option long_options[] =
{
{"file", required_argument, NULL, 'f'},
{"Ni", optional_argument, NULL, 'i'},
{"Nj", optional_argument, NULL, 'j'},
{"iterations", optional_argument, NULL, 'n'},
{"kernel", optional_argument, NULL, 'k'},
{"tilesize", optional_argument, NULL, 't'},
{"help", optional_argument, NULL, 'h'},
{NULL, 0, NULL, 0}
};
// 5. Set up the size of the matrix, the maximum number of iterations,
// the kernel choice, and the tileSize
//
program_name = argv[0];
Ni=1024, Nj=1024, iter=10000, kernel=2, tileSize=4;
ch=0;
while ((ch = getopt_long(argc, argv,"f:i:j:n:k:t:h", long_options, NULL)) != -1) {
switch (ch) {
case 'f' : fname = optarg;
break;
case 'i' : Ni = atoi(optarg);
break;
case 'j' : Nj = atoi(optarg);
break;
case 'n' : iter = atoi(optarg);
break;
case 'k' : kernel = atoi(optarg);
break;
case 't' : tileSize = atoi(optarg);
break;
case 'h': print_usage(stderr, 1);
exit(EXIT_FAILURE);
case '?': print_usage(stderr, 1);
exit(EXIT_FAILURE);
default:
abort();
}
}
N = Ni * Nj;
printf("\nRunning Jacobi method:\n");
printf("======================\n\n");
//printf("Coefficient matrix given in file: \n%s\n\n", fname);
printf("Parameters:\n");
printf("N=%d, Ni=%d, Nj=%d, ", N, Ni, Nj);
printf("iterations=%d, kernel=%d, tilesize=%d\n", iter,kernel,tileSize);
// Allocate memory on host
x_next = (float *) malloc(Ni*sizeof(float));
A = (float *) malloc(N*sizeof(float));
x_now = (float *) malloc(Ni*sizeof(float));
b_h = (float *) malloc(Ni*sizeof(float));
x_h = (float *) malloc(Ni*sizeof(float));
x_d = (float *) malloc(Ni*sizeof(float));
// Initialize result vector x
for (i=0; i<Ni; i++)
{
x_now[i] = 0;
x_next[i] = 0;
}
// 6. Read the coefficient matrix from a file; here I generate
// random values instead
/* file = fopen(fname, "r");
if (file == NULL)
exit(EXIT_FAILURE);
char *line;
size_t len = 0;
i=0;
while ((getline(&line, &len, file)) != -1)
{
if (i<N)
A[i] = atof(line);
else
b[i-N] = atof(line);
i++;
}*/
start_h = clock();
// Run "iter" iterations of the Jacobi method on HOST
for (k=0; k<iter; k++)
{
if (k%2)
jacobiOnHost(x_now, A, x_next, b_h, Ni, Nj);
else
jacobiOnHost(x_next, A, x_now, b_h, Ni, Nj);
for (i=0; i<Nj; i++)
x_now[i] = x_next[i];
}
end_h = clock();
// Save result from host in x_h
for (i=0; i<Nj; i++)
x_h[i] = x_next[i];
// Re-initialize result vector x for device computation
for (i=0; i<Ni; i++)
{
x_now[i] = 0;
x_next[i] = 0;
//b_h[i] = rand();
}
for (int i=0; i < Ni; i++)
{
b_h[i]=rand();
for (int j=0; j < Nj; j++)
A[i*Nj + j] = rand();
}
// Allocate memory on the device
// Unoptimize
assert(cudaSuccess == cudaMalloc((void **) &x_next_u, Ni*sizeof(float)));
assert(cudaSuccess == cudaMalloc((void **) &A_u, N*sizeof(float)));
assert(cudaSuccess == cudaMalloc((void **) &x_now_u, Ni*sizeof(float)));
assert(cudaSuccess == cudaMalloc((void **) &b_u, Ni*sizeof(float)));
// Optimized
assert(cudaSuccess == cudaMalloc((void **) &d_x_next, Ni*sizeof(float)));
assert(cudaSuccess == cudaMalloc((void **) &d_A, N*sizeof(float)));
assert(cudaSuccess == cudaMalloc((void **) &d_x_now, Ni*sizeof(float)));
assert(cudaSuccess == cudaMalloc((void **) &b, Ni*sizeof(float)));
//assert(cudaSuccess == cudaMalloc((void **) &b_s, Ni*sizeof(float)));
// Copy data -> device
// 1.Un-Optimize
cudaMemcpy(x_next_u, x_next, sizeof(float)*Ni, cudaMemcpyHostToDevice);
cudaMemcpy(A_u, A, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaMemcpy(x_now_u, x_now, sizeof(float)*Ni, cudaMemcpyHostToDevice);
cudaMemcpy(b_u, b_h, sizeof(float)*Ni, cudaMemcpyHostToDevice);
//2. Optimized
cudaMemcpy(d_x_next, x_next, sizeof(float)*Ni, cudaMemcpyHostToDevice);
cudaMemcpy(d_A, A, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaMemcpy(d_x_now, x_now, sizeof(float)*Ni, cudaMemcpyHostToDevice);
cudaMemcpy(b, b_h, sizeof(float)*Ni, cudaMemcpyHostToDevice);
// Compute grid and block size.
// Un-optimized kernel
//int blockSize = Ni;
// int nBlocks = 1;
// Optimized kernel
int nTiles = Ni/tileSize + (Ni%tileSize == 0?0:1);
int gridHeight = Nj/tileSize + (Nj%tileSize == 0?0:1);
int gridWidth = Ni/tileSize + (Ni%tileSize == 0?0:1);
printf("w=%d, h=%d\n",gridWidth,gridHeight);
dim3 dGrid(gridHeight, gridWidth),
dBlock(tileSize, tileSize);
dim3 dimGrid(16,4);
dim3 dimBlock(4,1);
// 8. Start timing the kernel runs
start_d = clock();
// Run "iter" iterations of the Jacobi method on DEVICE
if (kernel == 1)
{
printf("Using un-optimized kernel.\n");
for (k=0; k<iter; k++)
{
if (k%2)
jacobiUnOptimizedOnDevice <<< nTiles, tileSize >>> (x_now_u, A_u, x_next_u, b_u, Ni, Nj);
else
jacobiUnOptimizedOnDevice <<< nTiles, tileSize >>> (x_now_u, A_u, x_next_u, b_u, Ni, Nj);
cudaMemcpy(x_now_u, x_next_u, sizeof(float)*Ni, cudaMemcpyDeviceToDevice);
}
}
else
{
printf("Using Shared Memory to optimize kernel.\n");
for (k=0; k<iter; k++)
{
if (k%2)
jacobiOptimizedOnDevice <<< dimGrid, dimBlock >>> (d_x_now, d_A, d_x_next, Ni, Nj);
else
jacobiOptimizedOnDevice <<< dimGrid, dimBlock >>> (d_x_now, d_A, d_x_next, Ni, Nj);
//cudaMemcpy(d_x_now, d_x_next, sizeof(float)*Ni, cudaMemcpyDeviceToDevice);
}
}
end_d = clock();
// Data <- device
cudaMemcpy(x_d, d_x_next, sizeof(float)*Ni, cudaMemcpyDeviceToHost);
// Free memory
free(x_next); free(A); free(x_now); free(b_h);
cudaFree(d_x_next); cudaFree(d_A); cudaFree(d_x_now); cudaFree(b);
cudaFree(x_now_u); cudaFree(x_next_u); cudaFree(A_u); cudaFree(b_u);
end=clock();
// 9. Check the mismatch between the host result x_h and the device result x_d,
// and print the result.
printf("\nResult after %d iterations:\n",iter);
float err = 0.0;
for (i=0; i < Ni; i++)
{
/*printf("x_h[%d]=%f\n",i,x_h[i]);
printf("x_d[%d]=%f\n",i,x_d[i]);*/
// *************************************
// These two lines print the result at each point.
// For example, if the matrix size is 512 * 512,
// they print indices 0 - 511 for both x_h and x_d.
// Here we need a 4K * 4K run, and the output would be
// too long and bury all the other information, so I
// skip this part.
err += fabs(x_h[i] - x_d[i]) / Ni;
}
//
printf("x_h[%d]=%f\n",0,x_h[0]);
printf("x_d[%d]=%f\n",0,x_d[0]);
t_full = ((float)end - (float)start) / CLOCKS_PER_SEC;
t_host = ((float)end_h - (float)start_h) / CLOCKS_PER_SEC;
t_dev = ((float)end_d - (float)start_d) / CLOCKS_PER_SEC;
printf("\nTiming:\nTotal: %f\nSerial: %f\nParallel: %f\n\n", t_full, t_host, t_dev);
printf("Relative error: %f\n", err);
printf("\nProgram terminated successfully.\n");
return 0;
}
// CUDA Finalized :)
// This program shows the results for different sizes of the input
// matrix A and the vector b.
// From the results, as the size increases the execution time
// increases. I measured 11 data points to verify the results, from
// 8 * 8 up to 4K * 4K. At the smallest sizes the serial program is
// faster than the parallel one, due to the communication overhead
// of the GPU. However, as the size increases, the advantage of GPU
// processing shows up. At the largest size, 4K * 4K, the serial
// execution time is about three times longer than the parallel one.
// The results are reasonable.
// The compiler warning appears because I don't use the file-reading
// function, so I just leave it.
Here is all my code. Please let me know if there is any problem. Thank you so much.