How to correct the time units on the results

(1) Apply CPU and GPU to compute a matrix multiplication AB=C, where A, B and C
are either 1000
1000 or others, such as 500500, 2020, etc. The requirements
are to show a maximal error, an average error and GFLOPS.

(2) To reduce the errors, please apply Kahan’s Summation Formula and show all the results mentioned above.

(3) To raise the GFLOPS, please apply the Pitch function of copying matrix and the shared memory function
in a block from main memory to global memory and show all the results mentioned above again.


My Cuda programme below can run but the output is not so good -

#include <cuda_runtime.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h>  

#define BLOCK_SIZE 16
#define NUM_THREADS 256

__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int idx = bid * blockDim.x + tid;
	const int row = idx / n;
	const int column = idx % n;
	int i;

	if (row < n && column < n) 
	{
	float t = 0;
	for (i = 0; i < n; i++) 		
	{
	t += a[row * lda + i] * b[i * ldb + column];
	}
	c[row * ldc + column] = t;
	}
}

clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();
	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA << <blocks * n, NUM_THREADS >> >(ac, n, bc, n, cc, n, n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_2(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int idx = bid * blockDim.x + tid;
	const int row = idx / n;
	const int column = idx % n;
	int i;

	if (row < n && column < n) 
	{
	float t = 0;
	float y = 0;
	for (i = 0; i < n; i++) 
	{
	float r;
	y -= a[row * lda + i] * b[i * ldb + column];
	r = t - y;		
	y = (r - t) + y;
	t = r;
	}
	c[row * ldc + column] = t;
	}
}

clock_t matmultCUDA_2(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();
	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA_2 << <blocks * n, NUM_THREADS >> >(ac, n, bc, n, cc, n, n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_3(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	extern __shared__ float data[];
	const int tid = threadIdx.x;
	const int row = blockIdx.x;
	int i, j;

	for (i = tid; i < n; i += blockDim.x) 
	{
	data[i] = a[row * lda + i];
	}

	__syncthreads();

	for (j = tid; j < n; j += blockDim.x) 
	{
	float t = 0;
	float y = 0;		
	for (i = 0; i < n; i++) 
                 {
	float r;
	y -= data[i] * b[i * ldb + j];
	r = t - y;
	y = (r - t) + y;
	t = r;
	}
	c[row * ldc + j] = t;
	}
}

clock_t matmultCUDA_3(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();
	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA_3 << <n, NUM_THREADS, sizeof(float)* n >> >(ac, n, bc, n, cc, n, n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_4(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	extern __shared__ float data[];
	const int tid = threadIdx.x;
	const int row = blockIdx.x;
	int i, j;

	for (i = tid; i < n; i += blockDim.x) 
	{
	data[i] = a[row * lda + i];
	}

	__syncthreads();

	for (j = tid; j < n; j += blockDim.x) 
	{
	float t = 0;
	float y = 0;
	for (i = 0; i < n; i++) 
	{
	float r;
	y -= data[i] * b[i * ldb + j];
	r = t - y;
	y = (r - t) + y;
	t = r;
	}
	c[row * ldc + j] = t;
	}
}

clock_t matmultCUDA_4(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();

	size_t pitch_a, pitch_b, pitch_c;
	cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
	cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
	cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA_4 << <n, NUM_THREADS, sizeof(float)* n >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_5(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
	const int tidc = threadIdx.x;
	const int tidr = threadIdx.y;
	const int bidc = blockIdx.x * BLOCK_SIZE;
	const int bidr = blockIdx.y * BLOCK_SIZE;
	int i, j;

	float results = 0;
	float comp = 0;

	for (j = 0; j < n; j += BLOCK_SIZE) 
	{
		if (tidr + bidr < n && tidc + j < n) 
		{
		matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
		}
		else 
		{
		matA[tidr][tidc] = 0;
		}

		if (tidr + j < n && tidc + bidc < n) 
		{
		matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
		}
		else 
		{
		matB[tidr][tidc] = 0;
		}

		__syncthreads();

		for (i = 0; i < BLOCK_SIZE; i++) 
		{
		float t;
		comp -= matA[tidr][i] * matB[i][tidc];
		t = results - comp;			
		comp = (t - results) + comp;
		results = t;
		}

		__syncthreads();
	}

	if (tidr + bidr < n && tidc + bidc < n) 
	{
	c[(tidr + bidr) * ldc + tidc + bidc] = results;
	}
}

clock_t matmultCUDA_5(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();

	size_t pitch_a, pitch_b, pitch_c;
	cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
	cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
	cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
	dim3 blocks(bx, bx);
	dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
	matMultCUDA_5 << <blocks, threads >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();
	
                  return end - start;

}
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	int i, j, k;

	for (i = 0; i < n; i++) 
	{
	for (j = 0; j < n; j++) 
	{
	double t = 0;
	for (k = 0; k < n; k++) 
	{
	t += a[i * lda + k] * b[k * ldb + j];
	}
	c[i * ldc + j] = t;
	}
	}
}

void matgen(float* a, int lda, int n)
{
	int i, j;

	for (i = 0; i < n; i++)
	{
	for (j = 0; j < n; j++) {
	a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / (RAND_MAX * RAND_MAX);
	}
	}
}

void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
	float max_err = 0;
	float average_err = 0;
	int i, j;

	for (i = 0; i < n; i++) 
	{
	for (j = 0; j < n; j++) 
	{
	if (b[i * ldb + j] != 0) 
	{
	float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);
	if (max_err < err) max_err = err;
	average_err += err;
	}
	}
	}

	printf("Max error: %g  Average error: %g\n", max_err, average_err / (n * n));
}

bool InitCUDA()
{
	int count;

	cudaGetDeviceCount(&count);
	if (count == 0) 
	{
	fprintf(stderr, "There is no device.\n");
	return false;
	}

	int i;
	for (i = 0; i < count; i++) 
	{
	cudaDeviceProp prop;
	if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) 
	{
	if (prop.major >= 1) 
	{
	break;
	}
    }
	}

	if (i == count) 
	{
	fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
	return false;
	}

	cudaSetDevice(i);

	return true;
}

int main()
{
	int n;
	printf("Please input maxter number:");
	scanf("%d",&n);
	float *a, *b, *c, *d;
	if (!InitCUDA()) 
	{
	return 0;
	}

	a = (float*)malloc(sizeof(float)* n * n);
	b = (float*)malloc(sizeof(float)* n * n);
	c = (float*)malloc(sizeof(float)* n * n);
	d = (float*)malloc(sizeof(float)* n * n);

	srand(0);

	matgen(a, n, n);

	LARGE_INTEGER timeStart;
	LARGE_INTEGER timeEnd;
	LARGE_INTEGER frequency;
	QueryPerformanceFrequency(&frequency);
	double quadpart = (double)frequency.QuadPart;
	QueryPerformanceCounter(&timeStart);

	matmult(a, n, b, n, d, n, n);

	QueryPerformanceCounter(&timeEnd);
	double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
	printf("\n(CPU) Time used is: %f ms\n", elapsed);

	clock_t time = matmultCUDA(a, n, b, n, c, n, n);
	double sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(1) Before reduce errors:");
	printf("\n(GPU) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
	compare_mat(c, n, d, n, n);

	time = matmultCUDA_2(a, n, b, n, c, n, n);
	sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(2) After reduce errors:");
	printf("\n(GPU2) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
        compare_mat(c, n, d, n, n);

	time = matmultCUDA_3(a, n, b, n, c, n, n);
	sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(GPU3) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
        compare_mat(c, n, d, n, n);

	time = matmultCUDA_4(a, n, b, n, c, n, n);
	sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(GPU4) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
        compare_mat(c, n, d, n, n);
	
        QueryPerformanceFrequency(&frequency);
	quadpart = (double)frequency.QuadPart;
	QueryPerformanceCounter(&timeStart);

        time = matmultCUDA_5(a, n, b, n, c, n, n);
        QueryPerformanceCounter(&timeEnd);
        elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
        printf("\n(CPU5) Time used is: %f ms\n", elapsed);
        compare_mat(c, n, d, n, n);
        sec = ((double)elapsed / CLOCKS_PER_SEC)*1000 ;
	
        printf("\n(3) Apply pitch and shard memory:");
        printf("\n(GPU) Time used is: %f ms (%f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

        free(a);
        free(b);
        free(c);
        free(d);

        system("\n pause");
        return 0;
}

Kindly please help for the modification such as -

printf(“\n(GPU) Time used is: %.f ms (%.f GFLOPS)\n”, sec, 2.0 * n * n * n / (sec * 1E9));

Some mistake above?

You should only time this step and not the cudaMalloc, cudaMemcpy2D, cudaFree. Then you get more accurate results.

matMultCUDA_5 << <blocks, threads >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);
cudaDeviceSyncronize(); // required to time the execution time of the kernel, and not just the asynchronous launch

Much more accurate timings could be achieved by using the CUDA event APIs as outlined here. Again, the author is Mark Harris. ;)

https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/

For more accuracy it helps to time about 10 invocations of the kernel and then to drop outliers, e.g. by taking the median value of the result.

To compute an estimate of Flop/s you simply divide the expected number of floating point operations (mul, add) for your problem size by the run time in seconds. Divide by 1e9 to get the Gigaflops value.

Can you please show more detail information such as write some examples over here - it will be good for understanding!