processing time check

#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>

#include <windows.h>
#include <stdlib.h>
#include <tchar.h>

//#include <cutil.h>

#define blksize (30)
#define trdsize (12)

#define x_size (1920)
#define y_size (1080)

#define LOOPCOUNT (10)

__global__ void test(int *result1, int *input_img )
{
    int n;
    int i, j;
    int index;

    // Global pixel coordinates of this thread
    i = blockIdx.x * blockDim.x + threadIdx.x;
    j = blockIdx.y * blockDim.y + threadIdx.y;

    index = i + (j * x_size);

    for( n = 0; n < LOOPCOUNT; n++ )
    {
        result1[index] = (input_img[index] * 11 + 11) / 11;
    }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

int main()
{
int *host_Result;
int *device_Result1;
int *device_input;

int i=0, j=0; 
int n;

dim3 dimBlock(blksize, blksize);											// 30 x 30 = 900 threads per block (exceeds the 512-thread limit of compute capability 1.x devices)
dim3 dimGrid(x_size/dimBlock.x, y_size/dimBlock.y);							// grid covers the full 1920 x 1080 image

double duration1 = 0.0;
double duration2 = 0.0;

clock_t start1, start2, finish;

LARGE_INTEGER  freq, startTime, endTime;

BOOL bSupported = QueryPerformanceFrequency(&freq);	// counts per second
if (bSupported == 0)
{
	_tprintf(TEXT("High-resolution performance counter not supported\n"));
	return 1;
}


int* input=NULL;
input = new int[x_size*y_size];												// element count, not bytes
memset(input, 0, x_size*y_size*sizeof(int));

// Create HD image
for(j=0; j<y_size; j++)
{
	for(i=0; i<x_size; i++)
	{
		input[i+ (x_size*j)]= 33;
	}
}																			// 1920 * 1080 input image

printf("GPU\n");

printf("Input pixel value: %d\n", input[0]);


start1 = clock();
printf("start1: %f\n", (double)start1);

host_Result = (int *)malloc( x_size * y_size* sizeof(int) );				//cpu 
cudaMalloc( (void**) &device_Result1, sizeof(int) * x_size * y_size);		//gpu 

cudaMalloc( (void**) &device_input, sizeof(int) * x_size * y_size);		// Memory allocation for 'device_input'

// Copy input image data to CUDA memory
cudaMemcpy( device_input, input, sizeof(int) * x_size * y_size, cudaMemcpyHostToDevice ); 

start2 = clock();
printf("start2: %f\n", (double)start2);

//start time.
QueryPerformanceCounter(&startTime);

for( n = 0; n < LOOPCOUNT; n++ )
{
test <<<dimGrid, dimBlock>>>(device_Result1,device_input);

}
//end time.
QueryPerformanceCounter(&endTime);

cudaMemcpy( host_Result, device_Result1, sizeof(int) * x_size * y_size, cudaMemcpyDeviceToHost );  // memcpy device to host
finish = clock();




printf("finish: %f\n", (double)finish);

duration1 = finish - start1; 
duration2 = finish - start2; 
printf("GPU Processing time (including memory allocation):\t %10.6f\n", (double)duration1 / (double)CLOCKS_PER_SEC);
printf("GPU Processing time:\t %10.6f\n", (double)duration2 / (double)CLOCKS_PER_SEC);


double elapsedTime = ((double)(endTime.QuadPart - startTime.QuadPart) / (freq.QuadPart) );
// elapsedTime *= 1000;
_tprintf(TEXT("GPU Processing time:\t %lf s\n\n\n"), elapsedTime);

free(host_Result);
delete[] input;
cudaFree(device_Result1);
cudaFree(device_input);


    return 0;

}

Hi, guys.

I want to measure the processing time to show that the GPU is faster than the CPU.

///////////////////////////////////////////////////////////////////
for( n = 0; n < LOOPCOUNT; n++ )
{
test <<<dimGrid, dimBlock>>>(device_Result1,device_input);

}

///////////////////////////////////////////////////////////////////
But I don't think it works.

I changed the value of LOOPCOUNT, but the processing time does not change.

I want to know what I am doing wrong, and how I can reduce the memcpy running time.

I'll wait for your advice. Thanks.

I'm not sure of exactly the behavior of QueryPerformanceCounter, but your placement of that line is not going to measure the GPU runtime. CUDA kernels are launched asynchronously, which means that as soon as the kernel is queued up by the driver, control returns to your CPU code. When you reach QueryPerformanceCounter, not all of the queued kernels will have finished running. For benchmarking purposes, you should add a call to cudaThreadSynchronize() before QueryPerformanceCounter(&endTime). (Note that you do not need cudaThreadSynchronize() in actual code because cudaMemcpy() has an implicit synchronization.)
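
For example, a minimal sketch of the timing section with that synchronization added (same variables and launch configuration as in your code) would be:

///////////////////////////////////////////////////////////////////
QueryPerformanceCounter(&startTime);

for( n = 0; n < LOOPCOUNT; n++ )
{
    test <<<dimGrid, dimBlock>>>(device_Result1, device_input);
}

// Block the CPU until every queued kernel has finished,
// so that endTime really measures the GPU work.
cudaThreadSynchronize();   // cudaDeviceSynchronize() on newer CUDA versions

QueryPerformanceCounter(&endTime);
///////////////////////////////////////////////////////////////////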

Thank you, seibert.

The running-time problem is fixed now. Next I have to find some way to see the output of the kernel function.

Anyway, your advice was very helpful to me.
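
For now, a simple check (a minimal sketch based on the code above) is to print a few values of host_Result after the device-to-host cudaMemcpy and compare them against the expected value (33 * 11 + 11) / 11 = 34:

///////////////////////////////////////////////////////////////////
cudaMemcpy( host_Result, device_Result1, sizeof(int) * x_size * y_size, cudaMemcpyDeviceToHost );

// Verify a few output pixels; every pixel should be 34 for the all-33 input image.
printf("Output pixel [first]: %d\n", host_Result[0]);
printf("Output pixel [last]:  %d\n", host_Result[x_size * y_size - 1]);
///////////////////////////////////////////////////////////////////

(On devices of compute capability 2.0 or later, printf() can also be called directly inside the kernel to inspect values, as long as the code is compiled for that architecture.)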
