Time consuming comparison between 820M GPU combination code and pure C++ on CPU

Hello.

Firstly, I wrapped some CUDA 8.0 code (kernel1.cu) into the dll and lib file for later invoking. The following
is the complete code in kernel1.cu. N.B. To distinguish the time used for the vector-add operation from the time used for memory transfers, I constructed two nearly identical functions; the only difference between them is whether the GPU kernel function (addKernel in this context) is launched or not.

#include "cuda_runtime.h"  
#include "device_launch_parameters.h"    
#include "cudadll1.h"

// Element-wise vector addition c[i] = a[i] + b[i] for i in [0, size).
// Uses a grid-stride loop, so any <<<grid, block>>> configuration covers
// all `size` elements regardless of how the launch is sized.
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	int gridSize = blockDim.x * gridDim.x;
	for (int i = tid; i < size; i += gridSize) {
		c[i] = a[i] + b[i];
	}
}


 
// Measures only the host<->device transfer cost: identical to
// vectorAddCABsize except that the kernel launch is deliberately omitted.
// Returns 0 on success, or a positive step code (1..9) identifying the
// first failing CUDA call.
// N.B. since no kernel runs, the data copied back into c[] is the
// uninitialized contents of dev_c — callers must not rely on c[].
CUDADLL1_API int vectorCopyConsuming(int c[], int a[], int b[], int size)
{
	int result = -1;
	cudaError_t cudaStatus;
	int *dev_a = 0;
	int *dev_b = 0;
	int *dev_c = 0;

	// Select the GPU to run on.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		result = 1;
		goto Error;
	}

	// Allocate device memory for dev_a, dev_b, dev_c.
	cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 2;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 3;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 4;
		goto Error;
	}

	// Copy the input data from host to device.
	cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 5;
		goto Error;
	}
	cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 6;
		goto Error;
	}

	// Kernel launch intentionally omitted — this function times the
	// memory traffic only:
	//int blockSize = 256;
	//int numBlocks = (size + blockSize - 1) / blockSize;
	//addKernel <<<numBlocks, blockSize >>>(dev_c, dev_a, dev_b, size);

	// Wait for any outstanding device work before touching results on host.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		result = 7;
		goto Error;
	}

	// Copy the (here: uninitialized) output back from device to host.
	cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		result = 8;
		goto Error;
	}

	result = 0;

Error:
	// Free the DEVICE allocations. The original code called
	// cudaFree(c/a/b) on the host arrays, which are not CUDA
	// allocations — that leaked the device memory and passed invalid
	// pointers to cudaFree. cudaFree(0) is safe, so early-exit paths
	// where some pointers were never allocated are fine.
	cudaFree(dev_c);
	cudaFree(dev_a);
	cudaFree(dev_b);

	// Reset the device AFTER freeing (the original reset first and then
	// fell through into the frees with the context already destroyed).
	if (result == 0) {
		cudaStatus = cudaDeviceReset();
		if (cudaStatus != cudaSuccess) {
			result = 9;
		}
	}

	return result;
}


// Full GPU vector add: copies a[] and b[] to the device, launches
// addKernel to compute c[i] = a[i] + b[i], and copies the result back.
// Returns 0 on success, or a positive step code (1..9) identifying the
// first failing CUDA call.
CUDADLL1_API int vectorAddCABsize(int c[], int a[], int b[], int size)
{
	int result = -1;
	cudaError_t cudaStatus;
	int *dev_a = 0;
	int *dev_b = 0;
	int *dev_c = 0;

	// Select the GPU to run on.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		result = 1;
		goto Error;
	}

	// Allocate device memory for dev_a, dev_b, dev_c.
	cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 2;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 3;
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		result = 4;
		goto Error;
	}

	// Copy the input data from host to device.
	cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 5;
		goto Error;
	}
	cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		result = 6;
		goto Error;
	}

	// Launch the kernel with enough blocks to cover all `size` elements
	// (ceil-div); the grid-stride loop in addKernel tolerates any config.
	int blockSize = 256;
	int numBlocks = (size + blockSize - 1) / blockSize;
	addKernel<<<numBlocks, blockSize>>>(dev_c, dev_a, dev_b, size);

	// Kernel launches return no status directly; check for
	// launch-configuration errors explicitly.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		result = 7;
		goto Error;
	}

	// Wait for the kernel to finish (and surface async execution errors)
	// before reading the result on the host.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		result = 7;
		goto Error;
	}

	// Copy the result back from device to host.
	cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		result = 8;
		goto Error;
	}

	result = 0;

Error:
	// Free the DEVICE allocations. The original code called
	// cudaFree(c/a/b) on the host arrays, which are not CUDA
	// allocations — that leaked the device memory and passed invalid
	// pointers to cudaFree. cudaFree(0) is safe, so early-exit paths
	// where some pointers were never allocated are fine.
	cudaFree(dev_c);
	cudaFree(dev_a);
	cudaFree(dev_b);

	// Reset the device AFTER freeing (the original reset first and then
	// fell through into the frees with the context already destroyed).
	if (result == 0) {
		cudaStatus = cudaDeviceReset();
		if (cudaStatus != cudaSuccess) {
			result = 9;
		}
	}

	return result;
}

Secondly, a new VS2015 project was created to test the CUDA functionality and the efficiency of the parallel computation. The following is the main code, which shows how I invoked the functions in the DLL file.

// Benchmark driver: times (1) GPU add including transfers, (2) GPU
// transfers only, (3) a plain CPU loop, over 2^20-element int vectors.
int main()
{
	const int arraySize = 1 << 20;

	// Three independent input/output triples so each timed run works on
	// untouched data: (a,b,c) for the GPU add, (g,h,k) for the GPU
	// copy-only run, (d,e,f) for the CPU loop.
	int *a = new int[arraySize];
	int *b = new int[arraySize];
	int *c = new int[arraySize];

	int *d = new int[arraySize];
	int *e = new int[arraySize];
	int *f = new int[arraySize];

	int *g = new int[arraySize];
	int *h = new int[arraySize];
	int *k = new int[arraySize];

	for (int i = 0; i < arraySize; i++) {
		a[i] = 1;
		b[i] = 2;
		c[i] = 0;
		d[i] = 1;
		e[i] = 2;
		f[i] = 0;
		g[i] = 1;
		h[i] = 2;
		k[i] = 0;
	}

	printf("Data length : %d \n Vector Add operation ... \n \n", arraySize);

	// NOTE(review): the very first CUDA call pays for driver/context
	// initialization, so this first timed region also includes that
	// one-time startup cost — add an untimed warm-up call (e.g. a dummy
	// vectorAddCABsize) before timing for a fairer comparison.
	LARGE_INTEGER t1, t2, tc;
	QueryPerformanceFrequency(&tc);
	QueryPerformanceCounter(&t1);
	// Add vectors in parallel (kernel + transfers).
	int number1 = vectorAddCABsize(c, a, b, arraySize);
	QueryPerformanceCounter(&t2);
	printf(" CUDA GPU calculation+memory copy consuming   Time:%f s\n", (t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart);
	printf("invoke CUDA dll  status code = %d\n\n", number1);

	LARGE_INTEGER t5, t6, te;
	QueryPerformanceFrequency(&te);
	QueryPerformanceCounter(&t5);
	// Same call sequence but without the kernel launch (transfers only).
	int number2 = vectorCopyConsuming(k, g, h, arraySize);
	QueryPerformanceCounter(&t6);
	printf(" CUDA GPU memory copy consuming    Time:%f s\n", (t6.QuadPart - t5.QuadPart)*1.0 / te.QuadPart);
	printf("invoke CUDA dll  status code = %d\n\n", number2);

	LARGE_INTEGER t3, t4, td;
	QueryPerformanceFrequency(&td);
	QueryPerformanceCounter(&t3);
	// Reference: serial CPU add.
	for (int i = 0; i != arraySize; i++) {
		f[i] = d[i] + e[i];
	}
	QueryPerformanceCounter(&t4);
	printf(" C++  calculation  consuming  Time:%f s\n", (t4.QuadPart - t3.QuadPart)*1.0 / td.QuadPart);

	// Release the host buffers (the original leaked all nine arrays).
	delete[] a;
	delete[] b;
	delete[] c;
	delete[] d;
	delete[] e;
	delete[] f;
	delete[] g;
	delete[] h;
	delete[] k;

	system("Pause");
	return 0;
}

The console output text is below.

Data length : 1048576
Vector Add operation …

CUDA GPU calculation+memory copy consuming Time:0.386232 s
invoke CUDA dll status code = 0

CUDA GPU memory copy consuming Time:0.065450 s
invoke CUDA dll status code = 0

C++ calculation consuming Time:0.003581 s

Therefore, one issue has confused me for a long time.

  • Why is the time consumed by the CUDA code even larger than that of pure C++ code running only on the CPU? In particular, why does the addKernel function alone need more time than the pure C++ loop executing the same operation?
  • I would really appreciate it if you could help me. Thank you in advance.

    Simple vector addition on modern architectures is usually limited solely by memory bandwidth.
    Since you also measure the time which is required to copy data from host to device and back, using CUDA will be slower.
    Not only are you getting slowed by the PCIe bandwidth, but also by the associated latency.

    If you measure only the pure kernel runtime you might see an improvement, provided that your GPU memory bandwidth is higher than the CPU memory bandwidth, which might not be true on a mobile platform.

    thank you, underscore. you enlightened me. I will try other advanced operations on my GPU facilities later.