// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


// includes, project
#include <cutil_inline.h>
// Forward declarations of the host-side stages defined later in this file.
// main() calls them in a fixed order: device query, allocation, CPU reference,
// host->device copy, device->device copy, kernel launch, device->host copy,
// result check, cleanup.
void d_Info(void);
void mem_Alloc(void);
void mem_Free(void);
void mem_hd_transfer(void);
void mem_dd_transfer(void);
void mem_dh_transfer(void);
void CUDA_kernel_exe(void);
void CUDA_check_result(float *arry_A, float *arry_B, int arry_Dim_check);
void CPU_kernel_exe(float *arry_A, float *arry_B, int arry_Dim_check, int element_inc);

// Host buffers (allocated with malloc in mem_Alloc, released in mem_Free).
float *h_arry_A,*h_arry_B;
// Device buffers (allocated with cudaMalloc in mem_Alloc, released in mem_Free).
float *d_arry_A,*d_arry_B;
// Number of float elements in each of the four buffers; set in main().
int arry_Dim;

// Handle for the cutil timer; created and deleted around each timed section.
unsigned int hTimer;
// Last timer readings for the GPU and CPU runs; both are printed as "msec".
double gpuTime, cpuTime;

// Adds the scalar `b` to each of the first `arry_pass_Dim` elements of
// `d_arry_B`, in place.
//
// Launch layout: 1D grid of 1D blocks, no shared memory. Each thread starts at
// its flat global index and then advances by the total number of launched
// threads, so every element is processed even when the grid holds fewer
// threads than there are elements. With a grid that already covers the whole
// array each thread handles exactly one element, as before.
//
// Parameters:
//   d_arry_B       device pointer to at least `arry_pass_Dim` floats
//   b              integer increment (converted to float by the addition)
//   arry_pass_Dim  number of elements to update; values <= 0 do nothing
__global__ void CUDAKernel(float *d_arry_B, int b, int arry_pass_Dim)
{
    // Reject non-positive counts before widening to an unsigned type,
    // otherwise a negative count would turn into a huge bound.
    if (arry_pass_Dim <= 0)
        return;

    const size_t count  = (size_t)arry_pass_Dim;
    // Total number of threads in the grid; computed in size_t so that
    // gridDim.x * blockDim.x cannot wrap in 32-bit arithmetic.
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride)
    {
        d_arry_B[idx] = d_arry_B[idx] + b;
    }
}


//Main-------------------------------------------------------------------------*

int main( int argc, char** argv) 
{
    // Value added to every element by the CPU reference run.
    const int increment = 1;

    // Element count of every buffer. 4194240 == 65535 * 64, i.e. 65535 blocks
    // at the 64 threads per block that CUDA_kernel_exe launches with.
    arry_Dim = 4194240;

    // 1. Report the available CUDA devices, then allocate host/device buffers.
    d_Info();
    mem_Alloc();

    // 2. CPU reference: h_arry_B = h_arry_A + increment.
    CPU_kernel_exe(h_arry_A, h_arry_B, arry_Dim, increment);

    // 3. GPU path: upload h_arry_A, duplicate it on the device, run the
    //    kernel on the duplicate, and download the result into h_arry_A.
    mem_hd_transfer();
    mem_dd_transfer();
    CUDA_kernel_exe();
    mem_dh_transfer();

    // 4. Compare the GPU result (now in h_arry_A) with the CPU result.
    CUDA_check_result(h_arry_A, h_arry_B, arry_Dim);

    // 5. Release everything.
    mem_Free();

    // cutil exit helper; its exact behavior is defined in cutil_inline.h.
    cutilExit(argc, argv);
    return 0;
}


//CUDA Devices Count and Properties----------------------------------------------BEGIN*
// Prints the number of CUDA devices and, for each device, a summary of its
// cudaDeviceProp fields to stdout.
//
// Both runtime queries are wrapped in cutilSafeCall, the cutil error-checking
// macro. When no device is reported, only the notice and the closing separator
// are printed.
//
// The size_t-typed properties (memory sizes, pitch, alignment) are cast to
// unsigned long long and printed with %llu. Passing a size_t to %u is a
// format/argument mismatch and prints garbage on platforms where size_t is
// wider than unsigned int.
void d_Info(void)
{
    int d_Count;
    cutilSafeCall(cudaGetDeviceCount(&d_Count));
    if (d_Count == 0)
        printf("No CUDA capable device presented\n");

    int dev;
    for (dev = 0; dev < d_Count; ++dev) {
        cudaDeviceProp d_Props;
        cutilSafeCall(cudaGetDeviceProperties(&d_Props, dev));

        // The device-count banner is printed once, before the first device.
        if (dev == 0) {
            // NOTE(review): 9999.9999 is presumably the revision reported by
            // the emulation device when no real GPU exists - confirm against
            // the toolkit version in use.
            if (d_Props.major == 9999 && d_Props.minor == 9999)
                printf("No CUDA capable device presented\n");
            else if (d_Count == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", d_Count);
        }
        printf ("==================================================================\n\n");
        printf("%d - %s\n\n", dev, d_Props.name);
        printf("  major revision number:                         %d\n",
               d_Props.major);
        printf("  Minor revision number:                         %d\n",
               d_Props.minor);
    #if CUDART_VERSION >= 2000
        printf("  Number of multiprocessors:                     %d\n",
               d_Props.multiProcessorCount);
        // NOTE(review): assumes 8 cores per multiprocessor, which does not
        // hold for every GPU architecture - confirm for the target hardware.
        printf("  Number of cores:                               %d\n\n",
               8 * d_Props.multiProcessorCount);
    #endif
        // clockRate is scaled by 1e-6 to obtain the value labelled GHz.
        printf("  Clock rate:                                    %.2f GHz\n\n",
               d_Props.clockRate * 1e-6f);
        printf("  Total amount of global memory:                 %llu bytes\n",
               (unsigned long long)d_Props.totalGlobalMem);
        printf("  Total amount of constant memory:               %llu bytes\n",
               (unsigned long long)d_Props.totalConstMem);
        printf("  Total amount of shared memory per block:       %llu bytes\n",
               (unsigned long long)d_Props.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n\n",
               d_Props.regsPerBlock);
        printf("  Maximum memory pitch:                          %llu bytes\n",
               (unsigned long long)d_Props.memPitch);
        printf("  Warp size:                                     %d\n",
               d_Props.warpSize);
        printf("  Maximum number of threads per block:           %d\n",
               d_Props.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               d_Props.maxThreadsDim[0],
               d_Props.maxThreadsDim[1],
               d_Props.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n\n",
               d_Props.maxGridSize[0],
               d_Props.maxGridSize[1],
               d_Props.maxGridSize[2]);

        printf("  Texture alignment:                             %llu bytes\n",
               (unsigned long long)d_Props.textureAlignment);

    #if CUDART_VERSION >= 2000
        printf("  Concurrent copy and execution:                 %s\n\n",
              d_Props.deviceOverlap ? "Yes" : "No");
    #endif
    }
    printf ("==================================================================\n\n");
}
//CUDA Devices Count and Properties----------------------------------------------END**

//Host and Device Memory Allocation--------------------------------------------BEGIN**
// Allocates the two host buffers (h_arry_A, h_arry_B) and the two device
// buffers (d_arry_A, d_arry_B), each holding arry_Dim floats, and zero-fills
// the host buffers.
//
// Failure handling: a failed host allocation prints a message to stderr and
// terminates the program instead of letting the zero-fill loop dereference a
// null pointer. The device allocations are wrapped in cutilSafeCall, matching
// the error-checking style used in d_Info.
void mem_Alloc(void)
{
    int i;
    // A negative arry_Dim converts to a huge size here, so malloc fails and
    // the error path below reports it.
    const size_t memSize = (size_t)arry_Dim * sizeof(float);

    //Host memory allocation and array initialization
    printf ("Host Memory Allocation");
    h_arry_A = (float *) malloc(memSize);
    h_arry_B = (float *) malloc(memSize);
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // that comes back NULL counts as a failure.
    if (memSize > 0 && (h_arry_A == NULL || h_arry_B == NULL))
    {
        fprintf(stderr, "\nHost memory allocation of %llu bytes failed\n",
                (unsigned long long)memSize);
        free(h_arry_A);
        free(h_arry_B);
        h_arry_A = NULL;
        h_arry_B = NULL;
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < arry_Dim; i++)
    {
        h_arry_A[i] = 0.0f;
        h_arry_B[i] = 0.0f;
    }
    printf ("->DONE\n\n");

    //Device memory allocation
    printf ("Device Memory Allocation");
    cutilSafeCall(cudaMalloc( (void**)&d_arry_A, memSize));
    cutilSafeCall(cudaMalloc( (void**)&d_arry_B, memSize));
    printf ("->DONE\n\n");
}
//Host and Device Memory Allocation-----------------------------------------------END**

//CPU Kernel---------------------------------------------------------------------BEGIN**
// CPU reference implementation: writes arry_A[k] + element_inc into arry_B[k]
// for every k in [0, arry_Dim_check), timing the loop with the cutil timer.
//
// Side effects: uses the global timer handle hTimer, stores the measured time
// in the global cpuTime, and prints progress plus the timing to stdout.
void CPU_kernel_exe(float *arry_A, float *arry_B, int arry_Dim_check, int element_inc)
{
    printf ("CPU Kernel");

    // Start a fresh timer just before the loop being measured.
    cutCreateTimer(&hTimer);
    cutResetTimer(hTimer);
    cutStartTimer(hTimer);

    int k = 0;
    while (k < arry_Dim_check)
    {
        arry_B[k] = arry_A[k] + element_inc;
        ++k;
    }

    // Stop, read the elapsed value, and release the timer.
    cutStopTimer(hTimer);
    cpuTime = cutGetTimerValue(hTimer);
    cutDeleteTimer(hTimer);

    printf ("->DONE\n\n");
    printf("CPU time : %f msec \n\n", cpuTime);
}
//CPU Kernel----------------------------------------------------------------------END**

//Host->Device Memory Transfer--------------------------------------------------BEGIN**
// Copies the arry_Dim floats of host buffer h_arry_A into device buffer
// d_arry_A. The copy is wrapped in cutilSafeCall so a failed transfer is
// reported instead of being silently ignored.
void mem_hd_transfer(void)
{
    printf ("Host to Device Transfer");
    const size_t memSize = (size_t)arry_Dim * sizeof(float);
    cutilSafeCall(cudaMemcpy(d_arry_A, h_arry_A, memSize, cudaMemcpyHostToDevice));
    printf ("->DONE\n\n");
}
//Host->Device Memory Transfer----------------------------------------------------END**

//Device->Device Memory Transfer--------------------------------------------------BEGIN**
// Copies the arry_Dim floats of device buffer d_arry_A into device buffer
// d_arry_B, which is the buffer the kernel later updates in place. The copy is
// wrapped in cutilSafeCall so a failed transfer is reported instead of being
// silently ignored.
void mem_dd_transfer(void)
{
    printf ("Device to Device Transfer");
    const size_t memSize = (size_t)arry_Dim * sizeof(float);
    cutilSafeCall(cudaMemcpy(d_arry_B, d_arry_A, memSize, cudaMemcpyDeviceToDevice));
    printf ("->DONE\n\n");
}
//Device->Device Memory Transfer----------------------------------------------------END**

//CUDA Kernel---------------------------------------------------------------------BEGIN**
// Launches CUDAKernel on d_arry_B with an increment of 1 over arry_Dim
// elements, waits for it to finish, and reports the elapsed time.
//
// Launch layout: 1D grid of 64-thread blocks, with enough blocks to cover
// arry_Dim elements (rounded up). Nothing is launched when arry_Dim <= 0,
// because a zero-block grid is an invalid launch configuration.
//
// Side effects: uses the global timer handle hTimer, stores the measured time
// in the global gpuTime, and prints progress plus the timing to stdout.
// Launch and execution errors are surfaced through cutilSafeCall.
void CUDA_kernel_exe(void)
{
   const int numbThreads = 64;

   // Ceiling division written as divide-then-adjust so it cannot overflow
   // for large arry_Dim; the remainder uses numbThreads, not a literal 64.
   int numbBlocks = arry_Dim / numbThreads;
   if ((arry_Dim % numbThreads) > 0)
        numbBlocks = numbBlocks + 1;

   printf ("CUDA Kernel");
   cutCreateTimer(&hTimer);
   cutResetTimer(hTimer);
   cutStartTimer(hTimer);

   if (numbBlocks > 0)
   {
       dim3 dimGrid(numbBlocks);
       dim3 dimBlock(numbThreads);
       CUDAKernel<<< dimGrid, dimBlock >>>( d_arry_B, 1, arry_Dim );
       // A kernel launch returns no status itself; configuration errors
       // only show up through cudaGetLastError().
       cutilSafeCall(cudaGetLastError());
   }

   // Block until the kernel has finished so the timer covers its execution;
   // errors raised while the kernel ran are returned here.
   // NOTE(review): cudaThreadSynchronize is deprecated in newer toolkits in
   // favor of cudaDeviceSynchronize; kept to match the cutil-era API.
   cutilSafeCall(cudaThreadSynchronize());

   cutStopTimer(hTimer);
   gpuTime = cutGetTimerValue(hTimer);
   printf ("->DONE\n\n");
   printf("GPU time : %f msec \n\n", gpuTime);
   cutDeleteTimer(hTimer);
}
//CUDA Kernel---------------------------------------------------------------------END**

//Device->Host Memory Transfer----------------------------------------------------BEGIN**
// Copies the arry_Dim floats of device buffer d_arry_B back into host buffer
// h_arry_A. The destination is h_arry_A, not h_arry_B: main() afterwards
// compares h_arry_A (GPU result) against h_arry_B (CPU result). The copy is
// wrapped in cutilSafeCall so a failed transfer is reported instead of being
// silently ignored.
void mem_dh_transfer(void)
{
    printf ("Device to Host Transfer");
    const size_t memSize = (size_t)arry_Dim * sizeof(float);
    cutilSafeCall(cudaMemcpy(h_arry_A, d_arry_B, memSize, cudaMemcpyDeviceToHost));
    printf ("->DONE\n\n");
}
//Device->Host Memory Transfer------------------------------------------------------END**

//Check Result CUDA----------------------------------------------------------------BEGIN**
// Compares the first arry_Dim_check elements of arry_A and arry_B and prints
// "->Correct" when all of them are equal, "->Incorrect" otherwise.
//
// Parameters:
//   arry_A, arry_B   host arrays with at least arry_Dim_check floats each
//   arry_Dim_check   number of elements to compare; <= 0 reports "->Correct"
//
// The comparison is exact float equality, as in the original code.
// NOTE(review): this suits the small integer-valued data used in this file;
// a tolerance would be needed for general floating-point results.
void CUDA_check_result(float *arry_A, float *arry_B, int arry_Dim_check)
{
    int i;
    // Must start at 0: it is only set inside the loop on a mismatch, so
    // without initialization the verdict for matching arrays was undefined.
    int result = 0;

    printf ("Checking Result");
    for (i = 0; i < arry_Dim_check; i++)
    {
        if (arry_B[i] != arry_A[i])
        {
            result = 1;
            // Stop at the first mismatch (the original `exit;` statement
            // was an expression with no effect).
            break;
        }
    }

    if (result == 1)
        printf ("->Incorrect");
    else
        printf ("->Correct");

    printf("\n\n");
}
//Check Result CUDA------------------------------------------------------END**

//Free Host and Device Memory---------------------------------------------------BEGIN**
// Releases the two device buffers and the two host buffers allocated by
// mem_Alloc.
//
// The cudaFree calls are wrapped in cutilSafeCall so a failing release (which
// can also surface an earlier asynchronous error) is reported. Each global
// pointer is reset to NULL afterwards so that a stale pointer cannot be
// reused or freed a second time.
void mem_Free(void)
{
    printf ("Free Device Memory");
    cutilSafeCall(cudaFree(d_arry_A));
    d_arry_A = NULL;
    cutilSafeCall(cudaFree(d_arry_B));
    d_arry_B = NULL;
    printf ("->DONE\n\n");

    printf ("Free Host Memory");
    free(h_arry_A);
    h_arry_A = NULL;
    free(h_arry_B);
    h_arry_B = NULL;
    printf ("->DONE\n\n");
}
//Free Host and Device Memory-----------------------------------------------------END**