(1) Use both the CPU and the GPU to compute the matrix multiplication AB = C, where A, B, and C are 1000×1000, or another size such as 500×500, 20×20, etc. The requirements are to report the maximum error, the average error, and the GFLOPS (the operation count used for GFLOPS is noted below).
(2) To reduce the errors, apply Kahan's summation formula and show all of the results mentioned above (a CPU sketch of the formula follows below).
(3) To raise the GFLOPS, use pitched copies (cudaMallocPitch/cudaMemcpy2D) when moving the matrices from main memory to global memory, use shared memory within each block, and show all of the results mentioned above again (a pitched-copy sketch also follows below).
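For the GFLOPS figure: an n×n multiplication performs 2n³ floating-point operations (one multiply and one add per inner-loop step), so GFLOPS = 2n³ / (elapsed time in seconds × 10⁹). This is the formula used in main() below, which is why the elapsed time must be in seconds there.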
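For reference, here is a minimal CPU sketch of the compensated-summation pattern that the kernels below use (kahan_sum is a hypothetical helper, not part of the sample):

float kahan_sum(const float* x, int n)
{
float t = 0.0f; /* running sum */
float y = 0.0f; /* compensation carrying the lost low-order bits */
for (int i = 0; i < n; i++)
{
float r;
y -= x[i];       /* fold the next term into the compensation */
r = t - y;       /* corrected running sum */
y = (r - t) + y; /* recover the rounding error introduced by r */
t = r;
}
return t;
}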
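And a minimal sketch of a pitched host-to-device copy (copy_to_device_pitched is a hypothetical helper; the wrappers matmultCUDA_4/_5 below follow the same pattern inline), assuming an n×n host matrix with leading dimension lda in elements:

float* copy_to_device_pitched(const float* host, int lda, int n, size_t* pitch)
{
float* dev;
cudaMallocPitch((void**)&dev, pitch, sizeof(float) * n, n); /* pitch = row stride in bytes, chosen by the driver */
cudaMemcpy2D(dev, *pitch, host, sizeof(float) * lda, sizeof(float) * n, n, cudaMemcpyHostToDevice);
return dev; /* kernels then use *pitch / sizeof(float) as the leading dimension */
}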
My CUDA sample:
#include <cuda_runtime.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h> /* LARGE_INTEGER, QueryPerformanceCounter/Frequency */
#define BLOCK_SIZE 16
#define NUM_THREADS 256
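/* Naive kernel: each thread computes one element of C with a plain float accumulation. */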
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int idx = bid * blockDim.x + tid;
const int row = idx / n;
const int column = idx % n;
int i;
if (row < n && column < n)
{
float t = 0;
for (i = 0; i < n; i++)
{
t += a[row * lda + i] * b[i * ldb + column];
}
c[row * ldc + column] = t;
}
}
clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;
start = clock();
cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);
cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);
int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
matMultCUDA<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);
cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);
cudaFree(ac);
cudaFree(bc);
cudaFree(cc);
end = clock();
return end - start;
}
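/* Same element-per-thread mapping as matMultCUDA, but with Kahan-compensated accumulation. */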
__global__ static void matMultCUDA_2(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int idx = bid * blockDim.x + tid;
const int row = idx / n;
const int column = idx % n;
int i;
if (row < n && column < n)
{
float t = 0; /* running sum */
float y = 0; /* compensation carrying the lost low-order bits */
for (i = 0; i < n; i++)
{
float r;
y -= a[row * lda + i] * b[i * ldb + column]; /* fold the next product into the compensation */
r = t - y; /* corrected running sum */
y = (r - t) + y; /* recover the rounding error introduced by r */
t = r;
}
c[row * ldc + column] = t;
}
}
clock_t matmultCUDA_2(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;
start = clock();
cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);
cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);
int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
matMultCUDA_2<<<blocks * n, NUM_THREADS>>>(ac, n, bc, n, cc, n, n);
cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);
cudaFree(ac);
cudaFree(bc);
cudaFree(cc);
end = clock();
return end - start;
}
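/* One block per row of C: the block caches that row of A in shared memory, and each thread computes columns j, j + blockDim.x, ... with Kahan compensation. */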
__global__ static void matMultCUDA_3(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
extern __shared__ float data[];
const int tid = threadIdx.x;
const int row = blockIdx.x;
int i, j;
for (i = tid; i < n; i += blockDim.x)
{
data[i] = a[row * lda + i];
}
__syncthreads();
for (j = tid; j < n; j += blockDim.x)
{
float t = 0;
float y = 0;
for (i = 0; i < n; i++) {
float r;
y -= data[i] * b[i * ldb + j];
r = t - y;
y = (r - t) + y;
t = r;
}
c[row * ldc + j] = t;
}
}
clock_t matmultCUDA_3(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;
start = clock();
cudaMalloc((void**)&ac, sizeof(float)* n * n);
cudaMalloc((void**)&bc, sizeof(float)* n * n);
cudaMalloc((void**)&cc, sizeof(float)* n * n);
cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);
matMultCUDA_3<<<n, NUM_THREADS, sizeof(float)* n>>>(ac, n, bc, n, cc, n, n);
cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);
cudaFree(ac);
cudaFree(bc);
cudaFree(cc);
end = clock();
return end - start;
}
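/* Identical kernel to matMultCUDA_3; only the host wrapper below changes, switching to pitched device allocations. */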
__global__ static void matMultCUDA_4(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
extern __shared__ float data[];
const int tid = threadIdx.x;
const int row = blockIdx.x;
int i, j;
for (i = tid; i < n; i += blockDim.x)
{
data[i] = a[row * lda + i];
}
__syncthreads();
for (j = tid; j < n; j += blockDim.x)
{
float t = 0;
float y = 0;
for (i = 0; i < n; i++)
{
float r;
y -= data[i] * b[i * ldb + j];
r = t - y;
y = (r - t) + y;
t = r;
}
c[row * ldc + j] = t;
}
}
clock_t matmultCUDA_4(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;
start = clock();
size_t pitch_a, pitch_b, pitch_c;
cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);
cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);
matMultCUDA_4<<<n, NUM_THREADS, sizeof(float)* n>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);
cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);
cudaFree(ac);
cudaFree(bc);
cudaFree(cc);
end = clock();
return end - start;
}
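/* Tiled kernel: BLOCK_SIZE x BLOCK_SIZE tiles of A and B are staged through shared memory, with Kahan compensation in the inner product. */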
__global__ static void matMultCUDA_5(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
const int tidc = threadIdx.x;
const int tidr = threadIdx.y;
const int bidc = blockIdx.x * BLOCK_SIZE;
const int bidr = blockIdx.y * BLOCK_SIZE;
int i, j;
float results = 0;
float comp = 0;
for (j = 0; j < n; j += BLOCK_SIZE)
{
if (tidr + bidr < n && tidc + j < n)
{
matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
}
else
{
matA[tidr][tidc] = 0;
}
if (tidr + j < n && tidc + bidc < n)
{
matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
}
else
{
matB[tidr][tidc] = 0;
}
__syncthreads();
for (i = 0; i < BLOCK_SIZE; i++)
{
float t;
comp -= matA[tidr][i] * matB[i][tidc];
t = results - comp;
comp = (t - results) + comp;
results = t;
}
__syncthreads();
}
if (tidr + bidr < n && tidc + bidc < n)
{
c[(tidr + bidr) * ldc + tidc + bidc] = results;
}
}
clock_t matmultCUDA_5(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
float *ac, *bc, *cc;
clock_t start, end;
start = clock();
size_t pitch_a, pitch_b, pitch_c;
cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);
cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);
int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 blocks(bx, bx);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
matMultCUDA_5<<<blocks, threads>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);
cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);
cudaFree(ac);
cudaFree(bc);
cudaFree(cc);
end = clock();
return end - start;
}
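/* CPU reference: accumulates in double precision and serves as the baseline for the error comparison. */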
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
int i, j, k;
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
double t = 0;
for (k = 0; k < n; k++)
{
t += a[i * lda + k] * b[k * ldb + j];
}
c[i * ldc + j] = t;
}
}
}
void matgen(float* a, int lda, int n)
{
int i, j;
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++) {
a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / ((float)RAND_MAX * RAND_MAX);
}
}
}
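/* Reports the maximum and average relative error of a against the reference b. */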
void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
float max_err = 0;
float average_err = 0;
int i, j;
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
if (b[i * ldb + j] != 0)
{
float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);
if (max_err < err) max_err = err;
average_err += err;
}
}
}
printf("Max error: %g Average error: %g\n", max_err, average_err / (n * n));
}
bool InitCUDA()
{
int count;
cudaGetDeviceCount(&count);
if (count == 0)
{
fprintf(stderr, "There is no device.\n");
return false;
}
int i;
for (i = 0; i < count; i++)
{
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
{
if (prop.major >= 1)
{
break;
}
}
}
if (i == count)
{
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}
int main()
{
int n;
printf("please input maxter number :");
scanf("%d",&n);
float *a, *b, *c, *d;
if (!InitCUDA())
{
return 0;
}
a = (float*)malloc(sizeof(float)* n * n);
b = (float*)malloc(sizeof(float)* n * n);
c = (float*)malloc(sizeof(float)* n * n);
d = (float*)malloc(sizeof(float)* n * n);
srand(0);
matgen(a, n, n);
matgen(b, n, n); /* both inputs must be filled before the CPU reference multiply */
LARGE_INTEGER timeStart;
LARGE_INTEGER timeEnd;
LARGE_INTEGER frequency;
QueryPerformanceFrequency(&frequency);
double quadpart = (double)frequency.QuadPart;
QueryPerformanceCounter(&timeStart);
matmult(a, n, b, n, d, n, n);
QueryPerformanceCounter(&timeEnd);
double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
printf("\n(CPU) Time used is: %f ms\n", elapsed);
clock_t time = matmultCUDA(a, n, b, n, c, n, n);
double sec = (double)time / CLOCKS_PER_SEC; /* clock() ticks -> seconds */
printf("\n(1) Before reducing errors:");
printf("\n(GPU) Time used: %.2f ms (%.2f GFLOPS)\n", sec * 1000.0, 2.0 * n * n * n / (sec * 1E9));
compare_mat(c, n, d, n, n);
time = matmultCUDA_2(a, n, b, n, c, n, n);
sec = (double)time / CLOCKS_PER_SEC;
printf("\n(2) After reducing errors:");
printf("\n(GPU) Time used: %.2f ms (%.2f GFLOPS)\n", sec * 1000.0, 2.0 * n * n * n / (sec * 1E9));
compare_mat(c, n, d, n, n);
time = matmultCUDA_3(a, n, b, n, c, n, n);
sec = (double)time / CLOCKS_PER_SEC;
printf("\n(GPU) Time used: %.2f ms (%.2f GFLOPS)\n", sec * 1000.0, 2.0 * n * n * n / (sec * 1E9));
time = matmultCUDA_4(a, n, b, n, c, n, n);
sec = (double)time / CLOCKS_PER_SEC;
printf("\n(GPU) Time used: %.2f ms (%.2f GFLOPS)\n", sec * 1000.0, 2.0 * n * n * n / (sec * 1E9));
compare_mat(c, n, d, n, n);
QueryPerformanceFrequency(&frequency);
quadpart = (double)frequency.QuadPart;
QueryPerformanceCounter(&timeStart);
time = matmultCUDA_5(a, n, b, n, c, n, n);
QueryPerformanceCounter(&timeEnd);
elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
printf("\n(CPU) Time used: %f ms\n", elapsed);
sec = ((double)elapsed / CLOCKS_PER_SEC)*1000 ;
printf("\n(3) Apply pitch and shard memory:");
printf("\n(GPU) Time used is: %f ms (%f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
free(a);
free(b);
free(c);
free(d);
system("\n pause");
return 0;
}
How should I modify the time units above so that the printed times and the GFLOPS figures are consistent?