Time consuming comparison between 820M GPU combination code and pure C++ on CPU

Hello.

Firstly, I wrapped some CUDA 8.0 code (kernel1.cu) into the dll and lib file for later invoking. The following
is the complete code in kernel1.cu. N.B. To distinguish the time used for the vector-add operation from the time used for memory transfers, I constructed two nearly identical functions; the only difference between them is whether the GPU kernel function (addKernel in this context) is launched or not.

#include "cuda_runtime.h"  
#include "device_launch_parameters.h"    
#include "cudadll1.h"

// Element-wise vector addition c[i] = a[i] + b[i] for i in [0, size).
// Uses a grid-stride loop, so any <<<grid, block>>> configuration covers
// all `size` elements regardless of how the launch is sized.
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	int gridSize = blockDim.x * gridDim.x;
	for (int i = tid; i < size; i += gridSize) {
		c[i] = a[i] + b[i];
	}
}


 
// Measures only the host<->device transfer cost: identical to
// vectorAddCABsize except that the kernel launch is deliberately omitted.
// Returns 0 on success, or a positive step code (1..9) identifying the
// first failing CUDA call.
// N.B. since no kernel runs, the data copied back into c[] is the
// uninitialized contents of dev_c — callers must not rely on c[].
CUDADLL1_API int vectorCopyConsuming(int c[], int a[], int b[], int size)
{
	int result = -1;
	cudaError_t cudaStatus;
	int *dev_a = 0;
	int *dev_b = 0;
	int *dev_c = 0;

	// Select the GPU to run on.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		result = 1;
		goto Error;
	}

	// Allocate device memory for dev_a, dev_b, dev_c.
	cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 2;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 3;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 4;
		goto Error;
	}

	// Copy the input data from host to device.
	cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 5;
		goto Error;
	}
	cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 6;
		goto Error;
	}

	// Kernel launch intentionally omitted — this function times the
	// memory traffic only:
	//int blockSize = 256;
	//int numBlocks = (size + blockSize - 1) / blockSize;
	//addKernel <<<numBlocks, blockSize >>>(dev_c, dev_a, dev_b, size);

	// Wait for any outstanding device work before touching results on host.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		result = 7;
		goto Error;
	}

	// Copy the (here: uninitialized) output back from device to host.
	cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		result = 8;
		goto Error;
	}

	result = 0;

Error:
	// Free the DEVICE allocations. The original code called
	// cudaFree(c/a/b) on the host arrays, which are not CUDA
	// allocations — that leaked the device memory and passed invalid
	// pointers to cudaFree. cudaFree(0) is safe, so early-exit paths
	// where some pointers were never allocated are fine.
	cudaFree(dev_c);
	cudaFree(dev_a);
	cudaFree(dev_b);

	// Reset the device AFTER freeing (the original reset first and then
	// fell through into the frees with the context already destroyed).
	if (result == 0) {
		cudaStatus = cudaDeviceReset();
		if (cudaStatus != cudaSuccess) {
			result = 9;
		}
	}

	return result;
}


// Full GPU vector add: copies a[] and b[] to the device, launches
// addKernel to compute c[i] = a[i] + b[i], and copies the result back.
// Returns 0 on success, or a positive step code (1..9) identifying the
// first failing CUDA call.
CUDADLL1_API int vectorAddCABsize(int c[], int a[], int b[], int size)
{
	int result = -1;
	cudaError_t cudaStatus;
	int *dev_a = 0;
	int *dev_b = 0;
	int *dev_c = 0;

	// Select the GPU to run on.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		result = 1;
		goto Error;
	}

	// Allocate device memory for dev_a, dev_b, dev_c.
	cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 2;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 3;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 4;
		goto Error;
	}

	// Copy the input data from host to device.
	cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 5;
		goto Error;
	}
	cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 6;
		goto Error;
	}

	// Launch the kernel with enough blocks to cover all `size` elements
	// (ceil-div); the grid-stride loop in addKernel tolerates any config.
	int blockSize = 256;
	int numBlocks = (size + blockSize - 1) / blockSize;
	addKernel<<<numBlocks, blockSize>>>(dev_c, dev_a, dev_b, size);

	// Kernel launches return no status directly; check for
	// launch-configuration errors explicitly.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		result = 7;
		goto Error;
	}

	// Wait for the kernel to finish (and surface async execution errors)
	// before reading the result on the host.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		result = 7;
		goto Error;
	}

	// Copy the result back from device to host.
	cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		result = 8;
		goto Error;
	}

	result = 0;

Error:
	// Free the DEVICE allocations. The original code called
	// cudaFree(c/a/b) on the host arrays, which are not CUDA
	// allocations — that leaked the device memory and passed invalid
	// pointers to cudaFree. cudaFree(0) is safe, so early-exit paths
	// where some pointers were never allocated are fine.
	cudaFree(dev_c);
	cudaFree(dev_a);
	cudaFree(dev_b);

	// Reset the device AFTER freeing (the original reset first and then
	// fell through into the frees with the context already destroyed).
	if (result == 0) {
		cudaStatus = cudaDeviceReset();
		if (cudaStatus != cudaSuccess) {
			result = 9;
		}
	}

	return result;
}

Secondly, a new VS2015 project was created to test the CUDA functionality and the efficiency of the parallel computation. The following is the main code, which shows how I invoked the functions in the DLL file.

// Benchmark driver: times (1) GPU add including transfers, (2) GPU
// transfers only, (3) a plain CPU loop, over 2^20-element int vectors.
int main()
{
	const int arraySize = 1 << 20;

	// Three independent input/output triples so each timed run works on
	// untouched data: (a,b,c) for the GPU add, (g,h,k) for the GPU
	// copy-only run, (d,e,f) for the CPU loop.
	int *a = new int[arraySize];
	int *b = new int[arraySize];
	int *c = new int[arraySize];

	int *d = new int[arraySize];
	int *e = new int[arraySize];
	int *f = new int[arraySize];

	int *g = new int[arraySize];
	int *h = new int[arraySize];
	int *k = new int[arraySize];

	for (int i = 0; i < arraySize; i++) {
		a[i] = 1;
		b[i] = 2;
		c[i] = 0;
		d[i] = 1;
		e[i] = 2;
		f[i] = 0;
		g[i] = 1;
		h[i] = 2;
		k[i] = 0;
	}

	printf("Data length : %d \n Vector Add operation ... \n \n", arraySize);

	// NOTE(review): the very first CUDA call pays for driver/context
	// initialization, so this first timed region also includes that
	// one-time startup cost — add an untimed warm-up call (e.g. a dummy
	// vectorAddCABsize) before timing for a fairer comparison.
	LARGE_INTEGER t1, t2, tc;
	QueryPerformanceFrequency(&tc);
	QueryPerformanceCounter(&t1);
	// Add vectors in parallel (kernel + transfers).
	int number1 = vectorAddCABsize(c, a, b, arraySize);
	QueryPerformanceCounter(&t2);
	printf(" CUDA GPU calculation+memory copy consuming   Time:%f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
	printf("invoke CUDA dll  status code = %d\n\n", number1);

	LARGE_INTEGER t5, t6, te;
	QueryPerformanceFrequency(&te);
	QueryPerformanceCounter(&t5);
	// Same call sequence but without the kernel launch (transfers only).
	int number2 = vectorCopyConsuming(k, g, h, arraySize);
	QueryPerformanceCounter(&t6);
	printf(" CUDA GPU memory copy consuming    Time:%f s\n", (t6.QuadPart - t5.QuadPart)*1.0 / te.QuadPart);
	printf("invoke CUDA dll  status code = %d\n\n", number2);

	LARGE_INTEGER t3, t4, td;
	QueryPerformanceFrequency(&td);
	QueryPerformanceCounter(&t3);
	// Reference: serial CPU add.
	for (int i = 0; i != arraySize; i++) {
		f[i] = d[i] + e[i];
	}
	QueryPerformanceCounter(&t4);
	printf(" C++  calculation  consuming  Time:%f s\n", (t4.QuadPart - t3.QuadPart)*1.0 / td.QuadPart);

	// Release the host buffers (the original leaked all nine arrays).
	delete[] a;
	delete[] b;
	delete[] c;
	delete[] d;
	delete[] e;
	delete[] f;
	delete[] g;
	delete[] h;
	delete[] k;

	system("Pause");
	return 0;
}

The console output text is below.

Data length : 1048576
Vector Add operation …

CUDA GPU calculation+memory copy consuming Time:0.386232 s
invoke CUDA dll status code = 0

CUDA GPU memory copy consuming Time:0.065450 s
invoke CUDA dll status code = 0

C++ calculation consuming Time:0.003581 s

Therefore, one issue has confused me for a long time.

  • Why is the time consumed by the CUDA code even larger than that of pure C++ code running only on the CPU? In particular, why does the addKernel function alone need more time than the pure C++ loop executing the same operation?
  • I would really appreciate it if you could help me. Thank you in advance.

    Simple vector addition on modern architectures is usually limited solely by memory bandwidth.
    Since you also measure the time which is required to copy data from host to device and back, using CUDA will be slower.
    Not only are you getting slowed by the PCIe bandwidth, but also by the associated latency.

    If you measure only the pure kernel runtime you might see an improvement, provided that your GPU memory bandwidth is higher than the CPU memory bandwidth, which might not be true on a mobile platform.

    thank you, underscore. you enlightened me. I will try other advanced operations on my GPU facilities later.