How to calculate CPU time and GPU time

I am trying to write the CUDA C program below, which queries the CUDA GPU properties and compares a CPU and a GPU calculation.

However, I am getting some mistakes in the CPU and GPU timing results.

Please kindly provide your opinion and suggestions so that I can improve my computing skills.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>
#include <math.h>

#define DATA_SIZE 1048576

int data[DATA_SIZE];

bool InitCUDA()
{
	int count;

	cudaGetDeviceCount(&count);
	if (count == 0)
	{
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	int i;
	for (i = 0; i < count; i++)
	{
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, i) == cudaSuccess)
		{
			printf("Device Name: %s\n", prop.name);
			printf("Total global mem: %zu bytes\n", prop.totalGlobalMem);
			printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
			printf("Clock rate: %.2f GHz\n", prop.clockRate * 1e-6f);	/* clockRate is reported in kHz */
			printf("\n");
			if (prop.major >= 1)
			{
				break;	/* use the first device with compute capability 1.x or higher */
			}
		}
	}
	if (i == count)
	{
		fprintf(stderr, "There is no device supporting CUDA.\n");
		return false;
	}
	cudaSetDevice(i);
	return true;
}

/* This kernel deliberately uses a single thread: that one thread loops over the entire array.
   clock() in device code reads a per-multiprocessor cycle counter, so *time ends up holding
   the elapsed GPU clock cycles, not seconds. */
__global__ static void sumOfSquares(int *num, int* result, clock_t* time)
{
	int sum = 0;
	int i;
	clock_t start = clock();
	for (i = 0; i < DATA_SIZE; i++)
	{
		sum += num[i] * num[i];
	}

	*result = sum;
	*time = clock() - start;
}

int sumOfSquares_CPU(int *data)
{
	int sum = 0;
	for (int i = 0; i< DATA_SIZE; i++)
	{
		sum += data[i] * data[i];
	}
	return sum;
}

void GenerateNumbers(int *number, int size)
{
	for (int i = 0; i< size; i++)
	{
		number[i] = rand() % 10;
	}
}

int main()
{
	if (!InitCUDA())
	{
		return 0;
	}

	printf("CUDA initialized.\n");

	GenerateNumbers(data, DATA_SIZE);
	int* gpudata, *result;
	clock_t* time;
	cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
	cudaMalloc((void**)&result, sizeof(int));
	cudaMalloc((void**)&time, sizeof(clock_t));
	cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);

	sumOfSquares<<<1, 1, 0>>>(gpudata, result, time);	/* launch with 1 block, 1 thread, 0 bytes of dynamic shared memory */

	int sum, sum1;
	clock_t time_used;
	cudaMemcpy(&sum, result, sizeof(int), cudaMemcpyDeviceToHost);	/* sum holds the GPU result */
	cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	clock_t start_cpu = clock();
	clock_t cpu_time;
	sum1 = sumOfSquares_CPU(data);	/* sum1 holds the CPU result */
	cpu_time = clock() - start_cpu;
	printf("sum (CPU): %d\n", sum1);
	printf("sum (GPU): %d\n", sum);
	printf("sum (CPU) - sum (GPU) = %d\n", sum1 - sum);
	printf("(CPU) time: %.1f us\n", (double)cpu_time / CLOCKS_PER_SEC * 1.0e6);

	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);

	printf("(GPU) time:%.1f us\n", (time_used/prop.clockRate)*pow(10.0,6)/1000));
	system("pause");
	return 0;
}

Please use the Code tag (the last button above the edit box) to format your code. You can edit your message.

Some problems are in this part of main(): from GenerateNumbers() down to the two timing printf calls at the end.

Please kindly suggest how to modify the code, for example the handling of sum and sum1.

Any time you are having trouble with a CUDA code, you should use proper CUDA error checking. If you’re not sure what that is, google “proper CUDA error checking” and start reading.
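If it helps, one widely used pattern is a small macro that wraps every runtime call and reports the file and line on failure. The names gpuErrchk and gpuAssert below are illustrative, not something from your code, and this is only a sketch:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Example error-checking helper: wraps a cudaError_t and aborts with file/line on failure */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		exit(code);
	}
}

/* Usage: wrap every runtime API call, and check kernel launches explicitly, e.g.
   gpuErrchk(cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE));
   sumOfSquares<<<1, 1, 0>>>(gpudata, result, time);
   gpuErrchk(cudaGetLastError());        // catches launch configuration errors
   gpuErrchk(cudaDeviceSynchronize());   // catches errors that occur while the kernel runs
*/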

Your code in your first posting appears to run correctly for me, after fixing a few typos (extra parentheses on the printf statements).

It seems fairly evident to me that you are running on Windows. In that case, one possibility is that you are hitting a kernel timeout. If you are building a debug project, you may want to try building a release project, as the kernel code will generally run more quickly. Your kernel is poorly written (it uses only 1 thread), but that doesn't mean you cannot get correct results; a sketch of a multi-threaded variant follows after the output. Here is the output I get when I run your code:

$ ./t85
CUDA initialized.
sum (CPU): 29909398
sum (GPU): 29909398
sum (CPU) - sum (GPU) = 0
(CPU) time: 0.0 us
(GPU) time:13000.0 us
$
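On the "only 1 thread" point: a minimal sketch of what a multi-threaded variant could look like is below. THREAD_NUM, the shared-memory array, and the name sumOfSquares_mt are illustrative additions (not part of your posted code), and the sketch assumes a single block:

#define THREAD_NUM 256

/* Illustrative multi-threaded variant: each thread sums a strided slice of the array,
   then thread 0 adds up the per-thread partial sums. */
__global__ static void sumOfSquares_mt(int *num, int *result)
{
	__shared__ int partial[THREAD_NUM];
	const int tid = threadIdx.x;

	int sum = 0;
	for (int i = tid; i < DATA_SIZE; i += THREAD_NUM)
	{
		sum += num[i] * num[i];
	}
	partial[tid] = sum;
	__syncthreads();

	if (tid == 0)
	{
		int total = 0;
		for (int i = 0; i < THREAD_NUM; i++)
		{
			total += partial[i];
		}
		*result = total;
	}
}

/* Launch with one block of THREAD_NUM threads:
   sumOfSquares_mt<<<1, THREAD_NUM>>>(gpudata, result);  */

Your single-thread version still computes the correct sum; it is just much slower, which also makes a timeout more likely.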

You may be hitting a WDDM timeout.

https://docs.nvidia.com/gameworks/content/developertools/desktop/nsight/timeout_detection_recovery.htm

When I run the code using nvprof, I get 14 ms for the kernel duration, so your measurement of 13000 us is reasonably accurate. The host-side clock() function is a very coarse measurement (at least on Linux, which is what I am using), so it returns a result of 0.
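If you want a host-side measurement of just the kernel time, a common alternative to device-side clock() is cudaEvent timing. The fragment below is only a sketch; it reuses gpudata, result and time from your code and omits error checking:

	cudaEvent_t start, stop;
	float kernel_ms = 0.0f;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);
	sumOfSquares<<<1, 1, 0>>>(gpudata, result, time);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);                      /* wait until the kernel and the stop event complete */
	cudaEventElapsedTime(&kernel_ms, start, stop);   /* elapsed time between the events, in milliseconds */

	printf("(GPU) time via events: %.1f us\n", kernel_ms * 1000.0f);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);

On the CPU side, std::chrono::high_resolution_clock (C++11) or QueryPerformanceCounter on Windows gives much finer resolution than clock().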

I think you are right

Thank you for the helpful comment

I greatly appreciate your kind attention.