Problem with timing GPU work

I have written a program to be executed on the GPU. The problem, as far as I can tell, is in the timing process. My code is as follows:

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>

#define NumOfThreads 1024 // min is 1 and max is 1024
#define numOfChuncks 1 // min is 1 and max is 512
#define global_Size (NumOfThreads*numOfChuncks)
#define SHARED_SIZE_LIMIT 256
#define N (SHARED_SIZE_LIMIT*global_Size)

#define half_shared_size (SHARED_SIZE_LIMIT/2)

// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
	cudaError err = cudaGetLastError();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}

	// More careful checking. However, this will affect performance.
	// Comment away if needed.
	err = cudaDeviceSynchronize();
	if (cudaSuccess != err)
	{
		fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
			file, line, cudaGetErrorString(err));
		exit(-1);
	}
#endif

	return;
}
//****************************************************************************************

//*****************************************************************************************
// bitonic sort code downloaded from http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/Data-Parallel_Algorithms.html#bitonic
__device__ inline void swap(int & a, int & b)
{
	// Alternative swap doesn't use a temporary register:
	// a ^= b;
	// b ^= a;
	// a ^= b;

	int tmp = a;
	a = b;
	b = tmp;
}

__device__  void mergeSort(int *a, int *temp, int mergedSize, int rec_count, int d,int threadsRange){

	int threadId = threadIdx.x;
	int BlockStart = blockIdx.x*(NumOfThreads*SHARED_SIZE_LIMIT);

	if (0 <= threadId && threadId < (threadsRange / (2 * d))){
		int index1 = BlockStart + mergedSize * (threadId);
		int endIndex1 = index1 + (mergedSize / 2) - 1;
		int index2 = endIndex1 + 1;
		int endIndex2 = index2 + (mergedSize / 2) - 1;
		int targetIndex = index1;
		int sortedSize = 0;
		//printf("th= %d ", threadId);
		while (index1 <= endIndex1 && index2 <= endIndex2){

			if (a[index1] <= a[index2]){
				temp[targetIndex] = a[index1];
				++sortedSize;
				++index1;
				++targetIndex;
			}
			else{
				temp[targetIndex] = a[index2];
				++index2;
				++sortedSize;
				++targetIndex;
			}
		}

		if (index1 < endIndex1 && index2 == endIndex2){
			if (sortedSize < mergedSize){
				int bb = 0;
				while (bb < (mergedSize / 2)){
					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++bb;
				}
			}
		}

		if (index2 > endIndex2 && index1 <= endIndex1){
			if (sortedSize < mergedSize){
				int bb = 0;
				while (bb < (mergedSize / 2)){
					temp[targetIndex] = a[index1];
					++sortedSize;
					++targetIndex;
					++index1;
					++bb;
					if (index1 > endIndex1)
						break;
				}
			}
		}

		if (index1 > endIndex1 &&index2 <= endIndex2){
			if (sortedSize < mergedSize){
				int bb = 0;
				while (bb < mergedSize / 2){

					temp[targetIndex] = a[index2];
					++sortedSize;
					++targetIndex;
					++index2;
					++bb;
					if (index2 > endIndex2)
						break;
				}
			}
		}


	}
	__syncthreads();

	if (rec_count > 1){
		mergeSort(temp, a, 2 * mergedSize, --rec_count, 2 * d, threadsRange);
	}
}

__global__ void merge_starter(int *values, int *temp, int mergedSize, int x, int threadsRange){
	mergeSort(values, temp, mergedSize, x, 1, threadsRange);
}

__global__ static void bitonicSort(int * values)
{
	__shared__ int shared[SHARED_SIZE_LIMIT];

	const int tid = threadIdx.x;

	// Copy input to shared mem.
	shared[tid] = values[tid + blockIdx.x * SHARED_SIZE_LIMIT];
	//	shared[(tid)+(SHARED_SIZE_LIMIT)+blockIdx.x *SHARED_SIZE_LIMIT] = values[tid + (SHARED_SIZE_LIMIT)];
	__syncthreads();

	// Parallel bitonic sort.
	for (int k = 2; k <= SHARED_SIZE_LIMIT; k *= 2)
	{
		// Bitonic merge:
		for (int j = k / 2; j>0; j /= 2)
		{
			int ixj = tid ^ j;

			if (ixj > tid)
			{
				if ((tid & k) == 0)
				{
					if (shared[tid] > shared[ixj])
					{
						swap(shared[tid], shared[ixj]);
					}
				}
				else
				{
					if (shared[tid] < shared[ixj])
					{
						swap(shared[tid], shared[ixj]);
					}
				}
			}

			__syncthreads();
		}
	}

	// Write result.
	values[tid + blockIdx.x * SHARED_SIZE_LIMIT] = shared[tid];


}

//*******************************************************************************************

//****************************************************************************************************************************************
int main()
{
	int *a, *dev_a, *dev_temp1;


	a = (int *)malloc(N*sizeof(int)); //allocate memory on host
	cudaMalloc((void **)&dev_a, N*sizeof(int));
	cudaMalloc((void **)&dev_temp1, N*sizeof(int));//allocate memory on device

	//	srand(time(NULL));
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 20;
		a[i] = num;

	}

	
		//*********************************************************
		//transfer data from host memory to device memory, where dev_a is the destination
		cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
		//*********************************************************
		cudaDeviceSynchronize();
		int x = log2(NumOfThreads);
		int z = log2(numOfChuncks);

		//*************************************************************
		// calculating elapsed time, phase1

		cudaEvent_t start, stop; float time;
		cudaEventCreate(&start); cudaEventCreate(&stop);
		cudaEventRecord(start, 0);
		//**************************************************************


		//start execution
		//*********************************************************************************

		
		printf("   start sorting\n");

		bitonicSort<<<global_Size, SHARED_SIZE_LIMIT>>>(dev_a);
		//cudaDeviceSynchronize();
		///////////////////////////////////////////////////////////////////////////////////////////////////////////
		if (x % 2 == 0){
			if (numOfChuncks == 1){
				merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
				//********************************************************************************
				cudaDeviceSynchronize();
				cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
				//***********************************************************************
				// calculating elapsed time, phase2
				cudaEventRecord(stop, 0);
				cudaEventSynchronize(stop);
				cudaEventElapsedTime(&time, start, stop);
				cudaEventDestroy(start);
				cudaEventDestroy(stop);

				printf("kernel time in ms:\t%.7f\n", time);
				//**********************************************************************
			
			}
			if (2 <= numOfChuncks&&numOfChuncks < 1024){
				if (z % 2 == 0){
					merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
					merge_starter<<<1, numOfChuncks / 2>>>(dev_a, dev_temp1, NumOfThreads * 512, z, numOfChuncks);
					//********************************************************************************
					cudaDeviceSynchronize();
					cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
					//***********************************************************************
					// calculating elapsed time, phase2
					cudaEventRecord(stop, 0);
					cudaEventSynchronize(stop);
					cudaEventElapsedTime(&time, start, stop);
					cudaEventDestroy(start);
					cudaEventDestroy(stop);

					printf("kernel time in ms:\t%.7f\n", time);
					//**********************************************************************
				
				}
				else{
					merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
					merge_starter<<<1, numOfChuncks / 2>>>(dev_a, dev_temp1, NumOfThreads * 512, z, numOfChuncks);
					//********************************************************************************
					cudaDeviceSynchronize();

					//***********************************************************************
					// calculating elapsed time, phase2
					cudaEventRecord(stop, 0);
					cudaEventSynchronize(stop);
					cudaEventElapsedTime(&time, start, stop);
					cudaEventDestroy(start);
					cudaEventDestroy(stop);

					printf("kernel time in ms:\t%.7f\n", time);
					//**********************************************************************
					cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost);
				}
			
			}
		}
		////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		else {
			if (numOfChuncks == 1){
				merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
				//********************************************************************************
				cudaDeviceSynchronize();

				//***********************************************************************
				// calculating elapsed time, phase2
				cudaEventRecord(stop, 0);
				cudaEventSynchronize(stop);
				cudaEventElapsedTime(&time, start, stop);
				cudaEventDestroy(start);
				cudaEventDestroy(stop);

				printf("kernel time in ms:\t%.7f\n", time);
				//**********************************************************************
				cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost);
			}
			if (2 <= numOfChuncks&&numOfChuncks < 1024){
				if (z % 2 == 0){
					merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
					merge_starter<<<1, numOfChuncks / 2>>>(dev_temp1, dev_a, NumOfThreads * 512, z, numOfChuncks);
					//********************************************************************************
					cudaDeviceSynchronize();

					//***********************************************************************
					// calculating elapsed time, phase2
					cudaEventRecord(stop, 0);
					cudaEventSynchronize(stop);
					cudaEventElapsedTime(&time, start, stop);
					cudaEventDestroy(start);
					cudaEventDestroy(stop);

					printf("kernel time in ms:\t%.7f\n", time);
					//**********************************************************************
					cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost);
				}
				else{
					merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
					merge_starter<<<1, numOfChuncks / 2>>>(dev_temp1, dev_a, NumOfThreads * 512, z, numOfChuncks);
					//********************************************************************************
					cudaDeviceSynchronize();

					//***********************************************************************
					// calculating elapsed time, phase2
					cudaEventRecord(stop, 0);
					cudaEventSynchronize(stop);
					cudaEventElapsedTime(&time, start, stop);
					cudaEventDestroy(start);
					cudaEventDestroy(stop);

					printf("kernel time in ms:\t%.7f\n", time);
					//**********************************************************************
					cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
				}
			}
			
		}
		


		
		printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
		for (int i = 1; i < N; ++i){
			if (a[i - 1] <= a[i])
			{
				//printf("true");

			}
			else {
				printf("false in %d ", i);
			}
		}
		printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
		

		cudaFree(dev_a); //free memory on device
		cudaFree(dev_temp1);

		free(a); //free memory on host
		return 0;
}

As you can see, whether I put the code for transferring data from host to device (or the other way) inside or outside the timed region, I can't see any difference.
Is there something wrong in the timing code, or is the time I get when executing the program real and correct?

Since you are using a synchronous API to copy, i.e. cudaMemcpy(), you don’t need calls to cudaDeviceSynchronize().
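
To illustrate the effect you are asking about, here is a minimal sketch (scaleKernel is just a stand-in kernel, not your sort) in which both copies and the kernel lie between the two event records. If you move the two cudaMemcpy() calls outside the start/stop records, the reported time should drop noticeably for a large enough N:

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Stand-in kernel: doubles every element, just so there is something to time.
__global__ void scaleKernel(int *data, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) data[i] *= 2;
}

int main()
{
	const int n = 1 << 20;
	int *h = (int *)malloc(n * sizeof(int));
	for (int i = 0; i < n; i++) h[i] = i;

	int *d;
	cudaMalloc((void **)&d, n * sizeof(int));

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// 'start' is recorded BEFORE the host-to-device copy, so both copies
	// and the kernel are part of the measured interval.
	cudaEventRecord(start, 0);
	cudaMemcpy(d, h, n * sizeof(int), cudaMemcpyHostToDevice);
	scaleKernel<<<(n + 255) / 256, 256>>>(d, n);
	cudaMemcpy(h, d, n * sizeof(int), cudaMemcpyDeviceToHost);
	cudaEventRecord(stop, 0);

	cudaEventSynchronize(stop); // wait until 'stop' has actually been reached
	float ms = 0.0f;
	cudaEventElapsedTime(&ms, start, stop);
	printf("copies + kernel: %.3f ms\n", ms);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d);
	free(h);
	return 0;
}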

I cannot tell where in the posted code the two modes of timing you mention are reflected. What compile-time or run-time switch selects between the two modes? Can you show sample output for both modes?

Generally, I do not recommend using CUDA events for this kind of profiling; I would suggest using a high-resolution host timer for reporting wallclock elapsed time.
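
If you go the host-timer route, a minimal sketch could look like the following (dummyKernel is a placeholder for the work you actually want to time; the cudaDeviceSynchronize() after the launch is what makes the wallclock measurement meaningful, since kernel launches return to the host immediately):

#include <chrono>
#include <cstdio>
#include "cuda_runtime.h"

// Placeholder for whatever GPU work you actually want to time.
__global__ void dummyKernel() { }

int main()
{
	cudaFree(0);             // force CUDA context creation before timing starts

	cudaDeviceSynchronize(); // make sure no earlier GPU work is still pending
	auto t0 = std::chrono::high_resolution_clock::now();

	dummyKernel<<<1, 1>>>();
	cudaDeviceSynchronize(); // kernel launches are asynchronous, so wait here

	auto t1 = std::chrono::high_resolution_clock::now();
	double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
	printf("wallclock elapsed: %.3f ms\n", ms);
	return 0;
}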

The CUDA profiler should be more than adequate for showing the time taken by kernels and data transfers; what is the reason for creating a custom framework?

My suggestion would be to create a simpler example. Your question seems to be about timing with cudaMemcpy inside or outside the timing region. I don't think you need a 400-line example to demonstrate that. It's tedious just trying to follow all the if paths through your code.

Please look at this simpler example:

int main()
{
	int *a, *dev_a, *dev_temp1;


	a = (int *)malloc(N*sizeof(int)); //allocate memory on host
	cudaMalloc((void **)&dev_a, N*sizeof(int));
	cudaMalloc((void **)&dev_temp1, N*sizeof(int));//allocate memory on device

	
	for (int i = 0; i < N; i++)
	{
		int num = rand() % 20;
		a[i] = num;

	}
	
		//*********************************************************
		//transfer data from host memory to device memory, where dev_a is the destination
		cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);

		cudaDeviceSynchronize();

		int x = log2(NumOfThreads); // needed by merge_starter below; defined in the full program but missing from this cut-down version
		//*************************************************************
		// calculating elapsed time, phase1

		cudaEvent_t start, stop; float time;
		cudaEventCreate(&start); cudaEventCreate(&stop);
		cudaEventRecord(start, 0);
		//**************************************************************


		//start execution
		//*********************************************************************************

		//sort up to N=1024*1024
		printf("   start sorting\n");

		bitonicSort<<<global_Size, SHARED_SIZE_LIMIT>>>(dev_a);
		merge_starter<<<numOfChuncks, NumOfThreads / 2>>>(dev_a, dev_temp1, 512, x, NumOfThreads);
		//cudaDeviceSynchronize();


		// calculating elapsed time, phase2
		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&time, start, stop);
		cudaEventDestroy(start);
		cudaEventDestroy(stop);

		printf("kernel time in ms:\t%.7f\n", time);
		//**********************************************************************

		//********************************************************************************
		cudaDeviceSynchronize();
		cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
		//***********************************************************************

		cudaFree(dev_a); //free memory on device
		cudaFree(dev_temp1);

		free(a); //free memory on host
		return 0;
}

My question is: is my code correct for measuring the time it takes to execute the two methods (bitonicSort and mergeSort)?

Whether I put the code for transferring data to and from the device inside or outside the timed region, it has no effect on the reported time, yet as far as I know the transfers between host and device do consume time.

Please help me.

Your original question spoke of two modes used for timing, and your problem of reconciling the results from these two modes. I don’t see those two modes reflected in the simplified code above, am I overlooking something?

Certainly I did not mean to suggest that you create a simplified example that nobody could run. Please provide a compilable, runnable sample. As njuffa points out, you could use conditional compilation or another method to demonstrate the two different timing methods you are asking about. Finally, include your actual results for both cases, and the GPU that you are running on as well as the CUDA version, and your operating system.
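
For instance, something along these lines would let you build both modes from one file (just a sketch; the macro name TIME_INCLUDES_COPY and the kernel work() are my own placeholders, not your sorting code):

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Build mode 1 (copies inside the timed region):  nvcc -DTIME_INCLUDES_COPY timing.cu
// Build mode 2 (copies outside the timed region): nvcc timing.cu
__global__ void work(int *d, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) d[i] += 1;
}

int main()
{
	const int n = 1 << 22;
	int *h = (int *)malloc(n * sizeof(int));
	for (int i = 0; i < n; i++) h[i] = i;
	int *d;
	cudaMalloc((void **)&d, n * sizeof(int));

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

#ifndef TIME_INCLUDES_COPY
	cudaMemcpy(d, h, n * sizeof(int), cudaMemcpyHostToDevice);   // copy before 'start' -> not timed
#endif

	cudaEventRecord(start, 0);
#ifdef TIME_INCLUDES_COPY
	cudaMemcpy(d, h, n * sizeof(int), cudaMemcpyHostToDevice);   // copy after 'start' -> timed
#endif

	work<<<(n + 255) / 256, 256>>>(d, n);

#ifdef TIME_INCLUDES_COPY
	cudaMemcpy(h, d, n * sizeof(int), cudaMemcpyDeviceToHost);   // copy before 'stop' -> timed
#endif
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

#ifndef TIME_INCLUDES_COPY
	cudaMemcpy(h, d, n * sizeof(int), cudaMemcpyDeviceToHost);   // copy after 'stop' -> not timed
#endif

	float ms = 0.0f;
	cudaEventElapsedTime(&ms, start, stop);

	const char *label =
#ifdef TIME_INCLUDES_COPY
		"kernel + copies";
#else
		"kernel only";
#endif
	printf("%s: %.3f ms\n", label, ms);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d);
	free(h);
	return 0;
}

Running both builds on the same machine and posting the two printed times, together with your GPU, CUDA version, and operating system, would make it much easier to see how much the copies contribute in your case.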