#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>
#include <windows.h>
#include <stdlib.h>
#include <tchar.h>
//#include <cutil.h>
/* Edge length of the square thread block: main() builds dimBlock as
 * (blksize, blksize), i.e. blksize * blksize threads per block. */
#define blksize (30)
/* NOTE(review): trdsize is not referenced anywhere in the visible code. */
#define trdsize (12)
/* Image width in pixels; also the row length used to flatten (i, j) indices. */
#define x_size (1920)
/* Image height in pixels (number of rows). */
#define y_size (1080)
/* Repetition count, used both for the loop inside the kernel and for the
 * number of kernel launches issued from main(). */
#define LOOPCOUNT (10)
/*
 * test - per-pixel kernel used as the GPU workload.
 *
 * Expected launch layout: a 2D grid of 2D blocks. Each thread handles one
 * pixel, where the x launch dimension selects the column and the y launch
 * dimension selects the row. Pixels are stored row-major with a row length
 * of x_size.
 *
 * result1   - output buffer; must be device-accessible and hold at least
 *             x_size * y_size ints.
 * input_img - input buffer; same size requirement as result1.
 *
 * Uses no shared memory. Threads that fall outside the x_size by y_size
 * image do nothing, so the grid does not have to divide the image evenly.
 */
__global__ void test(int *result1, int *input_img)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   /* column */
    int j = blockIdx.y * blockDim.y + threadIdx.y;   /* row */

    /* Guard against threads past the right or bottom edge of the image. */
    if (i >= x_size || j >= y_size)
        return;

    int index = i + (j * x_size);

    /* The same element is recomputed and stored LOOPCOUNT times; the loop
     * exists only to repeat the work, it does not change which element is
     * written. */
    for (int n = 0; n < LOOPCOUNT; n++)
        result1[index] = (input_img[index] * 11 + 11) / 11;
}
/* Print a readable message and terminate when a CUDA runtime call fails.
 * 'what' names the operation being checked and is only used in the message. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Builds an x_size by y_size test image on the host, copies it to the GPU,
 * launches the 'test' kernel LOOPCOUNT times, copies the result back and
 * prints timing information.
 *
 * Three timings are printed:
 *   - clock() from before the allocations to after the device-to-host copy,
 *   - clock() from after the host-to-device copy to after the device-to-host
 *     copy,
 *   - the high-resolution counter around the kernel launches only (ended
 *     after the device has finished executing them).
 *
 * Returns 0 on success and 1 when a precondition or host allocation fails;
 * a failing CUDA call terminates the process through checkCuda().
 */
int main()
{
    const size_t pixelCount = (size_t)x_size * (size_t)y_size;
    const size_t imageBytes = pixelCount * sizeof(int);

    int *host_Result = NULL;
    int *device_Result1 = NULL;
    int *device_input = NULL;
    int *input = NULL;
    int n;

    /* The grid below uses truncating division, so the image must be an exact
     * multiple of the block edge or the border pixels would be skipped. */
    if ((x_size % blksize) != 0 || (y_size % blksize) != 0) {
        fprintf(stderr, "x_size and y_size must be multiples of blksize\n");
        return 1;
    }

    /* blksize * blksize threads per block. If the device cannot run a block
     * of that size, the launch fails and the check after the launch reports
     * it instead of the failure passing silently. */
    dim3 dimBlock(blksize, blksize);
    dim3 dimGrid(x_size / dimBlock.x, y_size / dimBlock.y);

    LARGE_INTEGER freq, startTime, endTime;
    if (QueryPerformanceFrequency(&freq) == 0) {
        _tprintf(TEXT("high-resolution performance count 'No'\n"));
        return 1;
    }

    /* Create the HD input image: one int per pixel, every pixel set to 33. */
    input = new int[pixelCount];
    for (size_t p = 0; p < pixelCount; p++)
        input[p] = 33;
    printf("Input pixel value: %d\n", input[0]);

    clock_t start1 = clock();
    printf("start1: %f\n", (double)start1);

    /* Pageable host buffer for the result. Pinned memory (cudaMallocHost)
     * is the usual way to speed up the host<->device copies. */
    host_Result = (int *)malloc(imageBytes);
    if (host_Result == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)imageBytes);
        delete[] input;
        return 1;
    }

    checkCuda(cudaMalloc((void **)&device_Result1, imageBytes), "cudaMalloc(device_Result1)");
    checkCuda(cudaMalloc((void **)&device_input, imageBytes), "cudaMalloc(device_input)");

    /* Copy the input image to device memory. */
    checkCuda(cudaMemcpy(device_input, input, imageBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    clock_t start2 = clock();
    printf("start2: %f\n", (double)start2);

    /* Kernel-only timing. Launches return immediately, so the end timestamp
     * is taken only after the device has finished all queued kernels. */
    QueryPerformanceCounter(&startTime);
    for (n = 0; n < LOOPCOUNT; n++) {
        test<<<dimGrid, dimBlock>>>(device_Result1, device_input);
        checkCuda(cudaGetLastError(), "test kernel launch");
    }
    checkCuda(cudaDeviceSynchronize(), "test kernel execution");
    QueryPerformanceCounter(&endTime);

    /* Copy the result back to the host. */
    checkCuda(cudaMemcpy(host_Result, device_Result1, imageBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    clock_t finish = clock();
    printf("finish: %f\n", (double)finish);

    double duration1 = (double)(finish - start1);
    double duration2 = (double)(finish - start2);
    printf("GPU Processing time (including memory allocation):\t %10.6f\n", duration1 / (double)CLOCKS_PER_SEC);
    printf("GPU Processing time:\t %10.6f\n", duration2 / (double)CLOCKS_PER_SEC);

    double elapsedTime = (double)(endTime.QuadPart - startTime.QuadPart) / (double)freq.QuadPart;
    // elapsedTime *= 1000;
    _tprintf(TEXT("GPU Processing time:\t %lf s\n\n\n"), elapsedTime);

    /* Release device and host buffers. */
    checkCuda(cudaFree(device_input), "cudaFree(device_input)");
    checkCuda(cudaFree(device_Result1), "cudaFree(device_Result1)");
    free(host_Result);
    delete[] input;

    return 0;
}
Hi, guys.
I want to measure the processing time to check that the GPU is faster than the CPU:
for( n = 0; n < LOOPCOUNT; n++ )
test <<<dimGrid, dimBlock>>>(device_Result1,device_input);
But I think it does not work.
I changed the value of LOOPCOUNT, but the processing time did not change.
I want to know what I am doing wrong, and how I can reduce the memcpy running time.
I'll wait for your advice. Thanks.