// includes, system #include #include #include // includes, project #include void d_Info(void); void mem_Alloc(void); void mem_Free(void); void mem_hd_transfer(void); void mem_dd_transfer(void); void mem_dh_transfer(void); void CUDA_kernel_exe(void); void CUDA_check_result(float *arry_A, float *arry_B, int arry_Dim_check); void CPU_kernel_exe(float *arry_A, float *arry_B, int arry_Dim_check, int element_inc); float *h_arry_A,*h_arry_B; float *d_arry_A,*d_arry_B; int arry_Dim; unsigned int hTimer; double gpuTime, cpuTime; __global__ void CUDAKernel(float *d_arry_B, int b, int arry_pass_Dim) { int idx = blockIdx.x*blockDim.x + threadIdx.x; if(idx= 2000 printf(" Number of multiprocessors: %d\n", d_Props.multiProcessorCount); printf(" Number of cores: %d\n\n", 8 * d_Props.multiProcessorCount); #endif printf(" Clock rate: %.2f GHz\n\n", d_Props.clockRate * 1e-6f); printf(" Total amount of global memory: %u bytes\n", d_Props.totalGlobalMem); printf(" Total amount of constant memory: %u bytes\n", d_Props.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", d_Props.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n\n", d_Props.regsPerBlock); printf(" Maximum memory pitch: %u bytes\n", d_Props.memPitch); printf(" Warp size: %d\n", d_Props.warpSize); printf(" Maximum number of threads per block: %d\n", d_Props.maxThreadsPerBlock); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", d_Props.maxThreadsDim[0], d_Props.maxThreadsDim[1], d_Props.maxThreadsDim[2]); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n\n", d_Props.maxGridSize[0], d_Props.maxGridSize[1], d_Props.maxGridSize[2]); printf(" Texture alignment: %u bytes\n", d_Props.textureAlignment); #if CUDART_VERSION >= 2000 printf(" Concurrent copy and execution: %s\n\n", d_Props.deviceOverlap ? 
"Yes" : "No"); #endif } printf ("==================================================================\n\n"); } //CUDA Devices Count and Properties----------------------------------------------END** //Host and Device Memory Allocation--------------------------------------------BEGIN** void mem_Alloc(void) { int i; //Host memory allocation and array initialization printf ("Host Memory Allocation"); h_arry_A = (float *) malloc(arry_Dim*sizeof(float)); for (i=0; iDONE\n\n"); //Device memory allocation printf ("Device Memory Allocation"); size_t memSize = arry_Dim*sizeof(float); cudaMalloc( (void**)&d_arry_A, memSize); cudaMalloc( (void**)&d_arry_B, memSize); printf ("->DONE\n\n"); } //Host and Device Memory Allocation-----------------------------------------------END** //CPU Kernel---------------------------------------------------------------------BEGIN** void CPU_kernel_exe(float *arry_A, float *arry_B, int arry_Dim_check, int element_inc) { int i; printf ("CPU Kernel"); cutCreateTimer(&hTimer); cutResetTimer(hTimer); cutStartTimer(hTimer); for(i=0; iDONE\n\n"); printf("CPU time : %f msec \n\n", cpuTime); } //CPU Kernel----------------------------------------------------------------------END** //Host->Device Memory Transfer--------------------------------------------------BEGIN** void mem_hd_transfer(void) { printf ("Host to Device Transfer"); size_t memSize = arry_Dim*sizeof(float); cudaMemcpy(d_arry_A, h_arry_A, memSize, cudaMemcpyHostToDevice); printf ("->DONE\n\n"); } //Host->Device Memory Transfer----------------------------------------------------END** //Device->Device Memory Transfer--------------------------------------------------BEGIN** void mem_dd_transfer(void) { printf ("Device to Device Transfer"); size_t memSize = arry_Dim*sizeof(float); cudaMemcpy(d_arry_B, d_arry_A, memSize, cudaMemcpyDeviceToDevice); printf ("->DONE\n\n"); } //Device->Device Memory Transfer----------------------------------------------------END** //CUDA 
//Kernel---------------------------------------------------------------------BEGIN**
/*
 * Launch CUDAKernel on the device array d_arry_B and time it.
 *
 * Uses a 1-D launch of 64 threads per block with enough blocks to cover
 * arry_Dim elements. The kernel receives d_arry_B, the constant 1 as its
 * second argument and arry_Dim as its third argument.
 *
 * Side effects: writes the elapsed time (as reported by the cutil timer)
 * to the global gpuTime, reuses the global hTimer handle, and prints
 * progress to stdout. Launch or execution errors are reported on stderr.
 */
void CUDA_kernel_exe(void)
{
    const int numbThreads = 64;
    cudaError_t err;

    printf ("CUDA Kernel");

    // A grid with zero blocks is an invalid launch configuration, so an
    // empty (or negative-sized) array is handled without touching the device.
    if (arry_Dim <= 0) {
        gpuTime = 0.0;
        printf ("->SKIPPED (empty array)\n\n");
        return;
    }

    // Ceiling division written without "arry_Dim + numbThreads - 1" so it
    // cannot overflow, and tied to numbThreads instead of a repeated literal.
    const int numbBlocks = arry_Dim / numbThreads
                         + ((arry_Dim % numbThreads) != 0 ? 1 : 0);

    dim3 dimGrid(numbBlocks);
    dim3 dimBlock(numbThreads);

    cutCreateTimer(&hTimer);
    cutResetTimer(hTimer);
    cutStartTimer(hTimer);

    CUDAKernel<<< dimGrid, dimBlock >>>( d_arry_B, 1, arry_Dim );

    // A kernel launch returns nothing: configuration errors are picked up
    // with cudaGetLastError(), execution errors at the synchronizing call.
    err = cudaGetLastError();
    if (err == cudaSuccess) {
        // NOTE(review): cudaThreadSynchronize() is the legacy spelling of
        // cudaDeviceSynchronize(); kept because the file targets the
        // cutil-era toolkit - confirm before modernizing.
        err = cudaThreadSynchronize();
    }

    cutStopTimer(hTimer);
    gpuTime = cutGetTimerValue(hTimer);
    cutDeleteTimer(hTimer);

    if (err != cudaSuccess) {
        printf ("->FAILED\n\n");
        fprintf(stderr, "CUDA kernel error: %s\n", cudaGetErrorString(err));
        return;
    }

    printf ("->DONE\n\n");
    printf("GPU time : %f msec \n\n", gpuTime);
}
//CUDA Kernel---------------------------------------------------------------------END**

//Device->Host Memory Transfer----------------------------------------------------BEGIN**
/*
 * Copy arry_Dim floats from the device array d_arry_B into the host
 * array h_arry_A (the previous contents of h_arry_A are overwritten).
 * Prints progress to stdout; a failed copy is reported on stderr.
 */
void mem_dh_transfer(void)
{
    printf ("Device to Host Transfer");

    size_t memSize = arry_Dim*sizeof(float);
    cudaError_t err = cudaMemcpy(h_arry_A, d_arry_B, memSize, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        printf ("->FAILED\n\n");
        fprintf(stderr, "Device to host copy error: %s\n", cudaGetErrorString(err));
        return;
    }

    printf ("->DONE\n\n");
}
//Device->Host Memory Transfer------------------------------------------------------END**

//Check Result CUDA----------------------------------------------------------------BEGIN**
/*
 * Compare two host arrays element by element and print the verdict.
 *
 * arry_A, arry_B   : arrays to compare; both must hold at least
 *                    arry_Dim_check floats.
 * arry_Dim_check   : number of elements to compare.
 *
 * Prints "->Correct" when every pair is exactly equal (including the
 * case of zero elements), "->Incorrect" otherwise.
 */
void CUDA_check_result(float *arry_A, float *arry_B, int arry_Dim_check)
{
    int i;
    int result = 0;   // 0 = all equal so far; must start defined for the no-mismatch case

    printf ("Checking Result");

    for (i = 0; i < arry_Dim_check; i++) {
        // Exact comparison is kept on purpose: the original contract is
        // bit-for-bit equality between the two arrays.
        if (arry_B[i] != arry_A[i]) {
            result = 1;
            break;    // first mismatch decides the verdict; no need to scan further
        }
    }

    if (result == 1)
        printf ("->Incorrect");
    else
        printf ("->Correct");

    printf("\n\n");
}
//Check Result CUDA------------------------------------------------------END**

//Free Host and Device Memory---------------------------------------------------BEGIN**
/*
 * Release the device arrays (d_arry_A, d_arry_B) and the host arrays
 * (h_arry_A, h_arry_B), then reset all four global pointers to NULL so
 * that a repeated call or a late access does not touch freed memory.
 * Prints progress to stdout; cudaFree failures are reported on stderr.
 */
void mem_Free(void)
{
    printf ("Free Device Memory");

    cudaError_t errA = cudaFree(d_arry_A);
    cudaError_t errB = cudaFree(d_arry_B);
    d_arry_A = NULL;
    d_arry_B = NULL;

    if (errA != cudaSuccess || errB != cudaSuccess) {
        printf ("->FAILED\n\n");
        fprintf(stderr, "Device free error: %s\n",
                cudaGetErrorString(errA != cudaSuccess ? errA : errB));
    } else {
        printf ("->DONE\n\n");
    }

    printf ("Free Host Memory");

    free(h_arry_A);
    free(h_arry_B);
    h_arry_A = NULL;
    h_arry_B = NULL;

    printf ("->DONE\n\n");
}
//Free Host and Device Memory-----------------------------------------------------END**