# How to modify with matrix

(1) Apply CPU and GPU to compute a matrix multiplication AB=C, where A, B and C
are either 1000
1000 or others, such as 500500, 2020, etc. The requirements
are to show a maximal error, an average error and GFLOPS.

(2) To reduce the errors, please apply Kahan’s Summation Formula and show all the results mentioned above.

(3) To raise the GFLOPS, please apply the Pitch function of copying matrix and the shared memory function
in a block from main memory to global memory and show all the results mentioned above again.

my Cuda sample -

``````#include <cuda_runtime.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCK_SIZE 16
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
const int bid = blockIdx.x;
const int idx = bid * blockDim.x + tid;
const int row = idx / n;
const int column = idx % n;
int i;

if (row < n && column < n)
{
float t = 0;
for (i = 0; i < n; i++)
{
t += a[row * lda + i] * b[i * ldb + column];
}
c[row * ldc + column] = t;
}
}

clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;

start = clock();
cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

matMultCUDA << <blocks * n, NUM_THREADS >> >(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);
cudaFree(bc);
cudaFree(cc);

end = clock();

return end - start;
}

__global__ static void matMultCUDA_2(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
const int bid = blockIdx.x;
const int idx = bid * blockDim.x + tid;
const int row = idx / n;
const int column = idx % n;
int i;

if (row < n && column < n)
{
float t = 0;
float y = 0;
for (i = 0; i < n; i++)
{
float r;
y -= a[row * lda + i] * b[i * ldb + column];
r = t - y;
y = (r - t) + y;
t = r;
}
c[row * ldc + column] = t;
}
}

clock_t matmultCUDA_2(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;

start = clock();
cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

matMultCUDA_2 << <blocks * n, NUM_THREADS >> >(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);
cudaFree(bc);
cudaFree(cc);

end = clock();

return end - start;
}

__global__ static void matMultCUDA_3(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
extern __shared__ float data[];
const int row = blockIdx.x;
int i, j;

for (i = tid; i < n; i += blockDim.x)
{
data[i] = a[row * lda + i];
}

for (j = tid; j < n; j += blockDim.x)
{
float t = 0;
float y = 0;
for (i = 0; i < n; i++) {
float r;
y -= data[i] * b[i * ldb + j];
r = t - y;
y = (r - t) + y;
t = r;
}
c[row * ldc + j] = t;
}
}

clock_t matmultCUDA_3(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;

start = clock();
cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

matMultCUDA_3 << <n, NUM_THREADS, sizeof(float)* n >> >(ac, n, bc, n, cc, n, n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);
cudaFree(bc);
cudaFree(cc);

end = clock();

return end - start;
}

__global__ static void matMultCUDA_4(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
extern __shared__ float data[];
const int row = blockIdx.x;
int i, j;

for (i = tid; i < n; i += blockDim.x)
{
data[i] = a[row * lda + i];
}

for (j = tid; j < n; j += blockDim.x)
{
float t = 0;
float y = 0;
for (i = 0; i < n; i++)
{
float r;
y -= data[i] * b[i * ldb + j];
r = t - y;
y = (r - t) + y;
t = r;
}
c[row * ldc + j] = t;
}
}

clock_t matmultCUDA_4(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;

start = clock();

size_t pitch_a, pitch_b, pitch_c;
cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

matMultCUDA_4 << <n, NUM_THREADS, sizeof(float)* n >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);
cudaFree(bc);
cudaFree(cc);

end = clock();

return end - start;
}

__global__ static void matMultCUDA_5(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
const int bidc = blockIdx.x * BLOCK_SIZE;
const int bidr = blockIdx.y * BLOCK_SIZE;
int i, j;

float results = 0;
float comp = 0;

for (j = 0; j < n; j += BLOCK_SIZE)
{
if (tidr + bidr < n && tidc + j < n)
{
matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
}
else
{
matA[tidr][tidc] = 0;
}

if (tidr + j < n && tidc + bidc < n)
{
matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
}
else
{
matB[tidr][tidc] = 0;
}

for (i = 0; i < BLOCK_SIZE; i++)
{
float t;
comp -= matA[tidr][i] * matB[i][tidc];
t = results - comp;
comp = (t - results) + comp;
results = t;
}

}

if (tidr + bidr < n && tidc + bidc < n)
{
c[(tidr + bidr) * ldc + tidc + bidc] = results;
}
}

clock_t matmultCUDA_5(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;

start = clock();

size_t pitch_a, pitch_b, pitch_c;
cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);

cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blocks(bx, bx);
matMultCUDA_5 << <blocks, threads >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

cudaFree(ac);
cudaFree(bc);
cudaFree(cc);

end = clock();
return end - start;

}
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
int i, j, k;

for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
double t = 0;
for (k = 0; k < n; k++)
{
t += a[i * lda + k] * b[k * ldb + j];
}
c[i * ldc + j] = t;
}
}
}

void matgen(float* a, int lda, int n)
{
int i, j;

for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++) {
a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / (RAND_MAX * RAND_MAX);
}
}
}

void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
float max_err = 0;
float average_err = 0;
int i, j;

for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
if (b[i * ldb + j] != 0)
{
float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);
if (max_err < err) max_err = err;
average_err += err;
}
}
}

printf("Max error: %g  Average error: %g\n", max_err, average_err / (n * n));
}

bool InitCUDA()
{
int count;

cudaGetDeviceCount(&count);
if (count == 0)
{
fprintf(stderr, "There is no device.\n");
return false;
}

int i;
for (i = 0; i < count; i++)
{
if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
{
if (prop.major >= 1)
{
break;
}
}
}

if (i == count)
{
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}

cudaSetDevice(i);

return true;
}

int main()
{
int n;
scanf("%d",&n);
float *a, *b, *c, *d;
if (!InitCUDA())
{
return 0;
}

a = (float*)malloc(sizeof(float)* n * n);
b = (float*)malloc(sizeof(float)* n * n);
c = (float*)malloc(sizeof(float)* n * n);
d = (float*)malloc(sizeof(float)* n * n);

srand(0);

matgen(a, n, n);

LARGE_INTEGER timeStart;
LARGE_INTEGER timeEnd;
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&timeStart);

matmult(a, n, b, n, d, n, n);

QueryPerformanceCounter(&timeEnd);
printf("\n(CPU) Time used is: %f ms\n", elapsed);

clock_t time = matmultCUDA(a, n, b, n, c, n, n);
double sec = (double)time / CLOCKS_PER_SEC;
printf("\n(1) Before reduce errors:");
printf("\n(GPU) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
compare_mat(c, n, d, n, n);

time = matmultCUDA_2(a, n, b, n, c, n, n);
sec = (double)time / CLOCKS_PER_SEC;
printf("\n(2) After reduce errors:");
printf("\n(GPU) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

time = matmultCUDA_3(a, n, b, n, c, n, n);
sec = (double)time / CLOCKS_PER_SEC;
printf("\n(GPU) Time used: %.f sec  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

time = matmultCUDA_4(a, n, b, n, c, n, n);
sec = (double)time / CLOCKS_PER_SEC;
printf("\n(GPU) Time used: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
compare_mat(c, n, d, n, n);

QueryPerformanceFrequency(&frequency);
QueryPerformanceCounter(&timeStart);
time = matmultCUDA_5(a, n, b, n, c, n, n);
QueryPerformanceCounter(&timeEnd);
printf("\n(CPU) Time used: %f ms\n", elapsed);
sec = ((double)elapsed / CLOCKS_PER_SEC)*1000 ;

printf("\n(3) Apply pitch and shard memory:");
printf("\n(GPU) Time used is: %f ms (%f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

free(a);
free(b);
free(c);
free(d);

system("\n pause");
return 0;
}
``````

Here how to modify the time units above???