Issues with measuring speedup (timing analysis) in CUDA

Hello! I wrote a very basic CUDA program to add two vectors with N components. Now I want to measure the speedup of my code. However, I have a few doubts/issues.

1. Below, I have created a function for the CPU and a kernel for the GPU that perform the same addition. Do I use the same timing functions to measure both the CPU function and the GPU kernel?

2. Since the time to allocate memory and copy the numbers to the GPU should also be included in the calculation, can I simply start the timer before the cudaMalloc() call and stop it once my kernel has run? Or do I have to use the other method of starting and stopping events and then taking the time difference (which would reduce my precision and make my calculation operating-system dependent)?

3. Can I use the Visual Profiler to measure time? Although I am getting some kind of timing analysis from it, I am not sure how to compare it with my CPU function.

Thanks a lot!

[codebox]// Vector : Defines the entry point for the console application.


#include “stdafx.h”

#include <stdio.h>

#include <cuda.h>

#include “cutil.h”

#include <cuda_runtime_api.h>

#include <string.h>

/*
 * Element-wise integer vector addition: d_C[i] = d_A[i] + d_B[i].
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * Parameters:
 *   d_A, d_B - input vectors, each holding at least N ints
 *   d_C      - output vector with room for at least N ints
 *   N        - number of elements to add
 *
 * The caller launches ceil(N / threadsPerBlock) blocks, so the last block
 * may contain threads whose index is >= N; those threads do nothing.
 */
__global__ void add(int *d_A, int *d_B, int *d_C, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: without this check, threads past the end of the
    // vectors would read and write out of bounds whenever N is not a
    // multiple of the block size.
    if (i < N) {
        d_C[i] = d_A[i] + d_B[i];
    }
}


int main(int argc, char *argv)


#if __DEVICE_EMULATION__  // Check if running emulation mode

	printf("\nRunning in Emulation Mode\n");


	printf("\nRunning in non-Emulation Mode\n");


int N, i, deviceCount, threadsPerBlock, blocksPerGrid, ll, ul; //Declarations

int *A, *B, *C, *d_A, *d_B, *d_C; 

unsigned int timer = 0;

cudaDeviceProp deviceProp;



	puts("Improper number of arguments");



cudaGetDeviceCount(&deviceCount); // Check number of CUDA enabled devices present

printf("\nDevice Count: %d\n", deviceCount);

if (deviceCount == 0)

    printf("There is no device supporting CUDA\n");

int dev;

for (dev = 0; dev < deviceCount; ++dev) {

	cudaGetDeviceProperties(&deviceProp, dev);

if (dev == 0) {

		if (deviceProp.major == 9999 && deviceProp.minor == 9999)

            printf("\nThere is no device supporting CUDA.\n");

        else if (deviceCount == 1)

            printf("\nThere is 1 device supporting CUDA\n");


            printf("\nThere are %d devices supporting CUDA\n", deviceCount);


	 printf("\nDevice %d: \"%s\"\n", dev,;


N = atoi(argv[1]);

/*printf("\nEnter N: ");





cudaMalloc(&d_A, N*sizeof(int));

cudaMalloc(&d_B, N*sizeof(int));

cudaMalloc(&d_C, N*sizeof(int));

ll = atoi(argv[2]);

ul = atoi(argv[3]);

/*printf("Enter the lower limit and the upper limit: ");

scanf("%d %d", &ll, &ul);*/

for(i=0; i<N; i++)


	A[i]=(rand()%(ul-ll) + ll);

	B[i]=(rand()%(ul-ll) + ll);


threadsPerBlock = 256;

blocksPerGrid = (N + threadsPerBlock - 1)/threadsPerBlock;




cudaMemcpy(d_A, A, N*sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_B, B, N*sizeof(int), cudaMemcpyHostToDevice);


add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);



cudaMemcpy(C, d_C, N*sizeof(int), cudaMemcpyDeviceToHost);

printf("The vector A, B & C: ");

for(i=0; i<N; i++)

	printf("(%d %d %d) ",A[i],B[i],C[i]);

printf("\n The time taken for the Kernel to execute is: %f ms",cutGetTimerValue(timer));



for(i=0; i<N; i++)

	C[i] = A[i] + B[i];


printf("\n The time taken for the CPU function to execute is: %f ms",cutGetTimerValue(timer));