i had written a program to be executed on GPU, the problem as I’m notice is in the timing process, my code is as follows
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>
#define NumOfThreads 1024 // min is 1 and max is 1024
#define numOfChuncks 1 // min is 1 and max is 512
#define global_Size NumOfThreads*numOfChuncks
#define SHARED_SIZE_LIMIT 256
#define N SHARED_SIZE_LIMIT*global_Size
#define half_shared_size SHARED_SIZE_LIMIT/2
// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
//****************************************************************************************
//*****************************************************************************************
// code of bitonic sort downloded from http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/Data-Parallel_Algorithms.html#bitonic
__device__ inline void swap(int & a, int & b)
{
// Alternative swap doesn't use a temporary register:
// a ^= b;
// b ^= a;
// a ^= b;
int tmp = a;
a = b;
b = tmp;
}
__device__ void mergeSort(int *a, int *temp, int mergedSize, int rec_count, int d,int threadsRange){
int threadId = threadIdx.x;
int BlockStart = blockIdx.x*(NumOfThreads*SHARED_SIZE_LIMIT);
if (0 <= threadId && threadId < (threadsRange / (2 * d))){
int index1 = BlockStart + mergedSize * (threadId);
int endIndex1 = index1 + (mergedSize / 2) - 1;
int index2 = endIndex1 + 1;
int endIndex2 = index2 + (mergedSize / 2) - 1;
int targetIndex = index1;
int sortedSize = 0;
//printf("th= %d ", threadId);
while (index1 <= endIndex1 && index2 <= endIndex2){
if (a[index1] <= a[index2]){
temp[targetIndex] = a[index1];
++sortedSize;
++index1;
++targetIndex;
}
else{
temp[targetIndex] = a[index2];
++index2;
++sortedSize;
++targetIndex;
}
}
if (index1 < endIndex1 &&index2 == endIndex2){
if (sortedSize < mergedSize){
int bb = 0;
while (bb < (mergedSize / 2)){
temp[targetIndex] = a[index1];
++sortedSize;
++targetIndex;
++index1;
++bb;
}
}
}
if (index2 > endIndex2 &&index1 <= endIndex1){
if (sortedSize < mergedSize){
int bb = 0;
while (bb < (mergedSize / 2)){
temp[targetIndex] = a[index1];
++sortedSize;
++targetIndex;
++index1;
++bb;
if (index1 > endIndex1)
break;
}
}
}
if (index1 > endIndex1 &&index2 <= endIndex2){
if (sortedSize < mergedSize){
int bb = 0;
while (bb < mergedSize / 2){
temp[targetIndex] = a[index2];
++sortedSize;
++targetIndex;
++index2;
++bb;
if (index2 > endIndex2)
break;
}
}
}
}
__syncthreads();
if (rec_count > 1){
mergeSort(temp, a, 2 * mergedSize, --rec_count, 2 * d,threadsRange);
}
}
__global__ void merge_starter(int *values, int *temp, int mergedSize, int x, int threadsRange){
mergeSort(values, temp, mergedSize, x, 1, threadsRange);
}
__global__ static void bitonicSort(int * values)
{
__shared__ int shared[SHARED_SIZE_LIMIT];
const int tid = threadIdx.x;
// Copy input to shared mem.
shared[tid] = values[tid + blockIdx.x * SHARED_SIZE_LIMIT];
// shared[(tid)+(SHARED_SIZE_LIMIT)+blockIdx.x *SHARED_SIZE_LIMIT] = values[tid + (SHARED_SIZE_LIMIT)];
__syncthreads();
// Parallel bitonic sort.
for (int k = 2; k <= SHARED_SIZE_LIMIT; k *= 2)
{
// Bitonic merge:
for (int j = k / 2; j>0; j /= 2)
{
int ixj = tid ^ j;
if (ixj > tid)
{
if ((tid & k) == 0)
{
if (shared[tid] > shared[ixj])
{
swap(shared[tid], shared[ixj]);
}
}
else
{
if (shared[tid] < shared[ixj])
{
swap(shared[tid], shared[ixj]);
}
}
}
__syncthreads();
}
}
// Write result.
values[tid + blockIdx.x * SHARED_SIZE_LIMIT] = shared[tid];
}
//*******************************************************************************************
//****************************************************************************************************************************************
int main()
{
int *a, *dev_a, *dev_temp1;
a = (int *)malloc(N*sizeof(int)); //allocate memory on host
cudaMalloc((void **)&dev_a, N*sizeof(int));
cudaMalloc((void **)&dev_temp1, N*sizeof(int));//allocate memory on device
// srand(time(NULL));
for (int i = 0; i < N; i++)
{
int num = rand() % 20;
a[i] = num;
}
//*********************************************************
//transfer data from host memory to device memory, where dev_a is the destination
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
//*********************************************************
cudaDeviceSynchronize();
int x = log2(NumOfThreads);
int z = log2(numOfChuncks);
//*************************************************************
// calculating elapsed time, phase1
cudaEvent_t start, stop; float time;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//**************************************************************
//start execution
//*********************************************************************************
printf(" start sorting\n");
bitonicSort << <global_Size, SHARED_SIZE_LIMIT >> >(dev_a);
//cudaDeviceSynchronize();
///////////////////////////////////////////////////////////////////////////////////////////////////////////
if (x % 2 == 0){
if (numOfChuncks == 1){
merge_starter << <numOfChuncks, NumOfThreads / 2 >> > (dev_a, dev_temp1, 512, x, NumOfThreads);
//********************************************************************************
cudaDeviceSynchronize();
cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
}
if (2 <= numOfChuncks&&numOfChuncks < 1024){
if (z % 2 == 0){
merge_starter << <numOfChuncks, NumOfThreads / 2 >> > (dev_a, dev_temp1, 512, x, NumOfThreads);
merge_starter << <1, numOfChuncks / 2 >> > (dev_a, dev_temp1, NumOfThreads * 512, z, numOfChuncks);
//********************************************************************************
cudaDeviceSynchronize();
cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
}
else{
merge_starter << <numOfChuncks, NumOfThreads / 2 >> > (dev_a, dev_temp1, 512, x, NumOfThreads);
merge_starter << <1, numOfChuncks / 2 >> > (dev_a, dev_temp1, NumOfThreads * 512, z, numOfChuncks);
//********************************************************************************
cudaDeviceSynchronize();
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost);
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
else {
if (numOfChuncks == 1){
merge_starter << <numOfChuncks, NumOfThreads / 2 >> > (dev_a, dev_temp1, 512, x, NumOfThreads);
//********************************************************************************
cudaDeviceSynchronize();
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost);
}
if (2 <= numOfChuncks&&numOfChuncks < 1024){
if (z % 2 == 0){
merge_starter << <numOfChuncks, NumOfThreads / 2 >> > (dev_a, dev_temp1, 512, x, NumOfThreads);
merge_starter << <1, numOfChuncks / 2 >> > (dev_temp1, dev_a, NumOfThreads * 512, z, numOfChuncks);
//********************************************************************************
cudaDeviceSynchronize();
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
cudaMemcpy(a, dev_temp1, N*sizeof(int), cudaMemcpyDeviceToHost);
}
else{
merge_starter << <numOfChuncks, NumOfThreads / 2 >> > (dev_a, dev_temp1, 512, x, NumOfThreads);
merge_starter << <1, numOfChuncks / 2 >> > (dev_temp1, dev_a, NumOfThreads * 512, z, numOfChuncks);
//********************************************************************************
cudaDeviceSynchronize();
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost);
}
}
}
printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
for (int i = 1; i < N; ++i){
if (a[i - 1] <= a[i])
{
//printf("true");
}
else {
printf("false in %d ", i);
}
}
printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
cudaFree(dev_a);//free memory on device
free(a);//free memory on host
}
as you can see, when I put the ode for transferring data from host to device or the opposite inside the code of timing or outside I can’t see any difference
Is there is wrong in the code of timing or is it the real and correct time that I get when executing the program???