processing time check

#include <stdio.h>
#include <malloc.h>
#include <cuda_runtime.h>

#include <windows.h>
#include <stdlib.h>
#include <tchar.h>

//#include <cutil.h>

#define blksize (30)
#define trdsize (12)

#define x_size (1920)
#define y_size (1080)

#define LOOPCOUNT (10)

__global__ void test(int *result1, int *input_img )
{
    int n;
    int i, j;
    int index;

    // Global pixel coordinates of this thread
    i = blockIdx.x * blockDim.x + threadIdx.x;
    j = blockIdx.y * blockDim.y + threadIdx.y;

    index = i + (j * x_size);

    for( n = 0; n < LOOPCOUNT; n++ )
    {
        result1[index] = (input_img[index] * 11 + 11) / 11;
    }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

int main()
{
int *host_Result;
int *device_Result1;
int *device_input;

int i=0, j=0; 
int n;

dim3 dimBlock(blksize, blksize);											// 30 x 30 = 900 threads per block (exceeds the 512-thread limit of compute capability 1.x devices)
dim3 dimGrid(x_size/dimBlock.x, y_size/dimBlock.y);							// grid covers the full 1920 x 1080 image

double duration1 = 0.0;
double duration2 = 0.0;

clock_t start1, start2, finish;

LARGE_INTEGER  freq, startTime, endTime;

BOOL bSupported = QueryPerformanceFrequency(&freq);	// counts per second
if (bSupported == 0)
{
	_tprintf(TEXT("High-resolution performance counter not supported\n"));
	return 1;
}


int* input=NULL;
input = new int[x_size*y_size];												// element count, not bytes
memset(input, 0, x_size*y_size*sizeof(int));

// Create HD image
for(j=0; j<y_size; j++)
{
	for(i=0; i<x_size; i++)
	{
		input[i+ (x_size*j)]= 33;
	}
}																			// 1920 * 1080 input image

printf("GPU\n");

printf("Input pixel value: %d\n", input[0]);


start1 = clock();
printf("start1: %f\n", (double)start1);

host_Result = (int *)malloc( x_size * y_size* sizeof(int) );				//cpu 
cudaMalloc( (void**) &device_Result1, sizeof(int) * x_size * y_size);		//gpu 

cudaMalloc( (void**) &device_input, sizeof(int) * x_size * y_size);		// Memory allocation for 'device_input'

// Copy input image data to CUDA memory
cudaMemcpy( device_input, input, sizeof(int) * x_size * y_size, cudaMemcpyHostToDevice ); 

start2 = clock();
printf("start2: %f\n", (double)start2);

//start time.
QueryPerformanceCounter(&startTime);

for( n = 0; n < LOOPCOUNT; n++ )
{
test <<<dimGrid, dimBlock>>>(device_Result1,device_input);

}
//end time.
QueryPerformanceCounter(&endTime);

cudaMemcpy( host_Result, device_Result1, sizeof(int) * x_size * y_size, cudaMemcpyDeviceToHost );  // memcpy device to host
finish = clock();




printf("finish: %f\n", (double)finish);

duration1 = finish - start1; 
duration2 = finish - start2; 
printf("GPU Processing time (including memory allocation):\t %10.6f\n", (double)duration1 / (double)CLOCKS_PER_SEC);
printf("GPU Processing time:\t %10.6f\n", (double)duration2 / (double)CLOCKS_PER_SEC);


double elapsedTime = ((double)(endTime.QuadPart - startTime.QuadPart) / (freq.QuadPart) );
// elapsedTime *= 1000;
_tprintf(TEXT("GPU Processing time:\t %lf s\n\n\n"), elapsedTime);

free(host_Result);
delete[] input;
cudaFree(device_Result1);
cudaFree(device_input);


    return 0;

}

Hi, guys.

I want to measure the processing time to show that the GPU is faster than the CPU.

///////////////////////////////////////////////////////////////////
for( n = 0; n < LOOPCOUNT; n++ )
{
test <<<dimGrid, dimBlock>>>(device_Result1,device_input);

}

///////////////////////////////////////////////////////////////////
But I don't think it works.

I changed the value of LOOPCOUNT, but the processing time does not change.

I want to know what I am doing wrong, and how I can reduce the memcpy running time.

I'll wait for your advice. Thanks.

I'm not sure of exactly the behavior of QueryPerformanceCounter, but your placement of that line is not going to measure the GPU runtime. CUDA kernels are launched asynchronously, which means that as soon as the kernel is queued up by the driver, control returns to your CPU code. When you reach QueryPerformanceCounter, not all of the queued kernels will have finished running. For benchmarking purposes, you should add a call to cudaThreadSynchronize() before QueryPerformanceCounter(&endTime). (Note that you do not need cudaThreadSynchronize() in actual code because cudaMemcpy() has an implicit synchronization.)
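
For example, a minimal sketch of the timing section with that synchronization added (same variables and launch configuration as in your code) would be:

///////////////////////////////////////////////////////////////////
QueryPerformanceCounter(&startTime);

for( n = 0; n < LOOPCOUNT; n++ )
{
    test <<<dimGrid, dimBlock>>>(device_Result1, device_input);
}

// Block the CPU until every queued kernel has finished,
// so that endTime really measures the GPU work.
cudaThreadSynchronize();   // cudaDeviceSynchronize() on newer CUDA versions

QueryPerformanceCounter(&endTime);
///////////////////////////////////////////////////////////////////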

Thank you, seibert.

The running-time problem is fixed now. Next I have to find some way to see the output of the kernel function.

Anyway, your advice was very helpful to me.
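
For now, a simple check (a minimal sketch based on the code above) is to print a few values of host_Result after the device-to-host cudaMemcpy and compare them against the expected value (33 * 11 + 11) / 11 = 34:

///////////////////////////////////////////////////////////////////
cudaMemcpy( host_Result, device_Result1, sizeof(int) * x_size * y_size, cudaMemcpyDeviceToHost );

// Verify a few output pixels; every pixel should be 34 for the all-33 input image.
printf("Output pixel [first]: %d\n", host_Result[0]);
printf("Output pixel [last]:  %d\n", host_Result[x_size * y_size - 1]);
///////////////////////////////////////////////////////////////////

(On devices of compute capability 2.0 or later, printf() can also be called directly inside the kernel to inspect values, as long as the code is compiled for that architecture.)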
