How to correct the time units on the results

vokoyov · May 11, 2018, 8:13am

(1) Apply CPU and GPU to compute a matrix multiplication AB=C, where A, B and C
are either 10001000 or others, such as 500500, 2020, etc. The requirements
are to show a maximal error, an average error and GFLOPS.

(2) To reduce the errors, please apply Kahan’s Summation Formula and show all the results mentioned above.

(3) To raise the GFLOPS, please apply the Pitch function of copying matrix and the shared memory function
in a block from main memory to global memory and show all the results mentioned above again.

My Cuda programme below can run but the output is not so good -

#include <cuda_runtime.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h>  

#define BLOCK_SIZE 16
#define NUM_THREADS 256

__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int idx = bid * blockDim.x + tid;
	const int row = idx / n;
	const int column = idx % n;
	int i;

	if (row < n && column < n) 
	{
	float t = 0;
	for (i = 0; i < n; i++) 		
	{
	t += a[row * lda + i] * b[i * ldb + column];
	}
	c[row * ldc + column] = t;
	}
}

clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();
	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA << <blocks * n, NUM_THREADS >> >(ac, n, bc, n, cc, n, n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_2(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int idx = bid * blockDim.x + tid;
	const int row = idx / n;
	const int column = idx % n;
	int i;

	if (row < n && column < n) 
	{
	float t = 0;
	float y = 0;
	for (i = 0; i < n; i++) 
	{
	float r;
	y -= a[row * lda + i] * b[i * ldb + column];
	r = t - y;		
	y = (r - t) + y;
	t = r;
	}
	c[row * ldc + column] = t;
	}
}

clock_t matmultCUDA_2(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();
	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA_2 << <blocks * n, NUM_THREADS >> >(ac, n, bc, n, cc, n, n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_3(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	extern __shared__ float data[];
	const int tid = threadIdx.x;
	const int row = blockIdx.x;
	int i, j;

	for (i = tid; i < n; i += blockDim.x) 
	{
	data[i] = a[row * lda + i];
	}

	__syncthreads();

	for (j = tid; j < n; j += blockDim.x) 
	{
	float t = 0;
	float y = 0;		
	for (i = 0; i < n; i++) 
                 {
	float r;
	y -= data[i] * b[i * ldb + j];
	r = t - y;
	y = (r - t) + y;
	t = r;
	}
	c[row * ldc + j] = t;
	}
}

clock_t matmultCUDA_3(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();
	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, sizeof(float)* n, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, sizeof(float)* n, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA_3 << <n, NUM_THREADS, sizeof(float)* n >> >(ac, n, bc, n, cc, n, n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, sizeof(float)* n, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_4(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	extern __shared__ float data[];
	const int tid = threadIdx.x;
	const int row = blockIdx.x;
	int i, j;

	for (i = tid; i < n; i += blockDim.x) 
	{
	data[i] = a[row * lda + i];
	}

	__syncthreads();

	for (j = tid; j < n; j += blockDim.x) 
	{
	float t = 0;
	float y = 0;
	for (i = 0; i < n; i++) 
	{
	float r;
	y -= data[i] * b[i * ldb + j];
	r = t - y;
	y = (r - t) + y;
	t = r;
	}
	c[row * ldc + j] = t;
	}
}

clock_t matmultCUDA_4(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();

	size_t pitch_a, pitch_b, pitch_c;
	cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
	cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
	cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
	matMultCUDA_4 << <n, NUM_THREADS, sizeof(float)* n >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();

	return end - start;
}

__global__ static void matMultCUDA_5(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
	__shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
	const int tidc = threadIdx.x;
	const int tidr = threadIdx.y;
	const int bidc = blockIdx.x * BLOCK_SIZE;
	const int bidr = blockIdx.y * BLOCK_SIZE;
	int i, j;

	float results = 0;
	float comp = 0;

	for (j = 0; j < n; j += BLOCK_SIZE) 
	{
		if (tidr + bidr < n && tidc + j < n) 
		{
		matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
		}
		else 
		{
		matA[tidr][tidc] = 0;
		}

		if (tidr + j < n && tidc + bidc < n) 
		{
		matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
		}
		else 
		{
		matB[tidr][tidc] = 0;
		}

		__syncthreads();

		for (i = 0; i < BLOCK_SIZE; i++) 
		{
		float t;
		comp -= matA[tidr][i] * matB[i][tidc];
		t = results - comp;			
		comp = (t - results) + comp;
		results = t;
		}

		__syncthreads();
	}

	if (tidr + bidr < n && tidc + bidc < n) 
	{
	c[(tidr + bidr) * ldc + tidc + bidc] = results;
	}
}

clock_t matmultCUDA_5(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	float *ac, *bc, *cc;
	clock_t start, end;

	start = clock();

	size_t pitch_a, pitch_b, pitch_c;
	cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float)* n, n);
	cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float)* n, n);
	cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float)* n, n);

	cudaMalloc((void**)&ac, sizeof(float)* n * n);
	cudaMalloc((void**)&bc, sizeof(float)* n * n);
	cudaMalloc((void**)&cc, sizeof(float)* n * n);

	cudaMemcpy2D(ac, pitch_a, a, sizeof(float)* lda, sizeof(float)* n, n, cudaMemcpyHostToDevice);
	cudaMemcpy2D(bc, pitch_b, b, sizeof(float)* ldb, sizeof(float)* n, n, cudaMemcpyHostToDevice);

	int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
	dim3 blocks(bx, bx);
	dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
	matMultCUDA_5 << <blocks, threads >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

	cudaMemcpy2D(c, sizeof(float)* ldc, cc, pitch_c, sizeof(float)* n, n, cudaMemcpyDeviceToHost);

	cudaFree(ac);
	cudaFree(bc);
	cudaFree(cc);

	end = clock();
	
                  return end - start;

}
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
	int i, j, k;

	for (i = 0; i < n; i++) 
	{
	for (j = 0; j < n; j++) 
	{
	double t = 0;
	for (k = 0; k < n; k++) 
	{
	t += a[i * lda + k] * b[k * ldb + j];
	}
	c[i * ldc + j] = t;
	}
	}
}

void matgen(float* a, int lda, int n)
{
	int i, j;

	for (i = 0; i < n; i++)
	{
	for (j = 0; j < n; j++) {
	a[i * lda + j] = (float)rand() / RAND_MAX + (float)rand() / (RAND_MAX * RAND_MAX);
	}
	}
}

void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
	float max_err = 0;
	float average_err = 0;
	int i, j;

	for (i = 0; i < n; i++) 
	{
	for (j = 0; j < n; j++) 
	{
	if (b[i * ldb + j] != 0) 
	{
	float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);
	if (max_err < err) max_err = err;
	average_err += err;
	}
	}
	}

	printf("Max error: %g  Average error: %g\n", max_err, average_err / (n * n));
}

bool InitCUDA()
{
	int count;

	cudaGetDeviceCount(&count);
	if (count == 0) 
	{
	fprintf(stderr, "There is no device.\n");
	return false;
	}

	int i;
	for (i = 0; i < count; i++) 
	{
	cudaDeviceProp prop;
	if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) 
	{
	if (prop.major >= 1) 
	{
	break;
	}
    }
	}

	if (i == count) 
	{
	fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
	return false;
	}

	cudaSetDevice(i);

	return true;
}

int main()
{
	int n;
	printf("Please input maxter number:");
	scanf("%d",&n);
	float *a, *b, *c, *d;
	if (!InitCUDA()) 
	{
	return 0;
	}

	a = (float*)malloc(sizeof(float)* n * n);
	b = (float*)malloc(sizeof(float)* n * n);
	c = (float*)malloc(sizeof(float)* n * n);
	d = (float*)malloc(sizeof(float)* n * n);

	srand(0);

	matgen(a, n, n);

	LARGE_INTEGER timeStart;
	LARGE_INTEGER timeEnd;
	LARGE_INTEGER frequency;
	QueryPerformanceFrequency(&frequency);
	double quadpart = (double)frequency.QuadPart;
	QueryPerformanceCounter(&timeStart);

	matmult(a, n, b, n, d, n, n);

	QueryPerformanceCounter(&timeEnd);
	double elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
	printf("\n(CPU) Time used is: %f ms\n", elapsed);

	clock_t time = matmultCUDA(a, n, b, n, c, n, n);
	double sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(1) Before reduce errors:");
	printf("\n(GPU) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
	compare_mat(c, n, d, n, n);

	time = matmultCUDA_2(a, n, b, n, c, n, n);
	sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(2) After reduce errors:");
	printf("\n(GPU2) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
        compare_mat(c, n, d, n, n);

	time = matmultCUDA_3(a, n, b, n, c, n, n);
	sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(GPU3) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
        compare_mat(c, n, d, n, n);

	time = matmultCUDA_4(a, n, b, n, c, n, n);
	sec = (double)time / CLOCKS_PER_SEC;
	printf("\n(GPU4) Time used is: %.f ms  (%.f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));
        compare_mat(c, n, d, n, n);
	
        QueryPerformanceFrequency(&frequency);
	quadpart = (double)frequency.QuadPart;
	QueryPerformanceCounter(&timeStart);

        time = matmultCUDA_5(a, n, b, n, c, n, n);
        QueryPerformanceCounter(&timeEnd);
        elapsed = ((timeEnd.QuadPart - timeStart.QuadPart) / quadpart) * 1000;
        printf("\n(CPU5) Time used is: %f ms\n", elapsed);
        compare_mat(c, n, d, n, n);
        sec = ((double)elapsed / CLOCKS_PER_SEC)*1000 ;
	
        printf("\n(3) Apply pitch and shard memory:");
        printf("\n(GPU) Time used is: %f ms (%f GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

        free(a);
        free(b);
        free(c);
        free(d);

        system("\n pause");
        return 0;
}

Kindly please help for the modification such as -

printf(“\n(GPU) Time used is: %.f ms (%.f GFLOPS)\n”, sec, 2.0 * n * n * n / (sec * 1E9));

Some mistake above?

cbuchner1 · May 11, 2018, 8:33am

You should only time this step and not the cudaMalloc, cudaMemcpy2D, cudaFree. Then you get more accurate results.

matMultCUDA_5 << <blocks, threads >> >(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);
cudaDeviceSyncronize(); // required to time the execution time of the kernel, and not just the asynchronous launch

Much more accurate timings could be achieved by using the CUDA event APIs as outlined here. Again, the author is Mark Harris. ;)

https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/

For more accuracy it helps to time about 10 invocations of the kernel and then to drop outliers, e.g. by taking the median value of the result.

To compute an estimate of Flop/s you simply divide the expected number of floating point operations (mul, add) for your problem size by the run time in seconds. Divide by 1e9 to get the Gigaflops value.

vokoyov · May 11, 2018, 9:03am

Can you please show more detail information such as write some examples over here - it will be good for understanding!

Topic		Replies	Views
How to modify with matrix CUDA Programming and Performance	0	519	May 5, 2018
Cuda program taking more time. CUDA Programming and Performance	15	7059	November 21, 2010
How to modify the Cuda output CUDA Programming and Performance	0	447	May 21, 2018
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	10004	January 18, 2012
GPU running time is not stable CUDA Programming and Performance	5	3056	April 24, 2010
matrix multiply reduction CUDA Programming and Performance	41	35548	January 15, 2011
GPU/CPU precision comparison and Kernel instructions question CUDA Programming and Performance	5	679	April 4, 2017
Cuda matrix multiplication too slow CUDA Programming and Performance	5	13332	February 17, 2010
need help for cudaMemcpy2D() CUDA Programming and Performance	5	4558	December 8, 2009
Why cannot run this program CUDA Programming and Performance	7	951	May 11, 2018

How to correct the time units on the results

Related topics