Kernel function makes it slower than a normal for loop

While writing a program to convert a 24-bit BMP picture to an 8-bit grayscale one, I added a timer to record the processing time. The result made me laugh out loud:
with a normal for loop it took about 15-17 clicks/sec,
but with the CUDA kernel it could be 1600 clicks/sec.
My sample picture is 800 x 600, and I run it in Visual Studio 2013 + CUDA 7.5 on an NVIDIA GeForce 940M in a ThinkPad T550.
What is the matter???

__global__ void doGray(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h)
{
	int col = blockIdx.x*blockDim.x + threadIdx.x;
	int row = blockIdx.y*blockDim.y + threadIdx.y;
	int h8_idx = col + row*sz_w; // index into the 8-bit output
	int h24_idx = h8_idx * 3;    // index into the 24-bit input
	if (h8_idx < sz_w*sz_h)
	{
		// Y = 0.299R + 0.587G + 0.114B
		float blue = m_data[h24_idx];
		blue *= 0.114;
		float green = m_data[h24_idx + 1];
		green *= 0.587;
		float red = m_data[h24_idx + 2];
		red *= 0.299;
		float val = blue + green + red;
		data[h8_idx] = val;
	}
}
void run(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h, int szImg_tHdr, int szImg_mHdr)
{
	unsigned char *d_data;
	unsigned char *d_m_data;
	cudaError_t er = cudaMalloc(&d_data, szImg_tHdr); //tInfoHdr.biSizeImage
	if (er != cudaSuccess){
		return;
	}
	er = cudaMalloc(&d_m_data, szImg_mHdr); //m_infoHeader.biSizeImage
	if (er != cudaSuccess){
		return;
	}
	er = cudaMemset(d_data, 0, szImg_tHdr);
	if (er != cudaSuccess){
		return;
	}
	//calculate grid size: ceil(sz_w/32) x (sz_h/2) blocks of 32 x 2 threads
	int g_m_x = (sz_w + 31) >> 5;
	int g_m_y = sz_h/2;
	dim3 gridDim(g_m_x, g_m_y);
	dim3 blockDim(32, 2);
	er = cudaMemcpy(d_m_data, m_data, szImg_mHdr, cudaMemcpyHostToDevice);
	if (er != cudaSuccess){
		return;
	}
	doGray<<<gridDim, blockDim>>>(d_data, d_m_data, sz_w, sz_h);
	//normal loop
	//for (int h = 0; h < m_infoHeader.biHeight; h++){
	//	int h8_idx = h*m_infoHeader.biWidth;
	//	int h24_idx = h*m_infoHeader.biWidth*3;
	//	for (int w = 0; w < m_infoHeader.biWidth; w++)
	//	{
	//		BYTE blue = m_data[h24_idx + w * 3];
	//		BYTE green = m_data[h24_idx + w * 3 + 1];
	//		BYTE red = m_data[h24_idx + w * 3 + 2];
	//		//Y = 0.299R+0.587G+0.114B
	//		data[h8_idx + w] = red * 0.299 + green * 0.587 + blue * 0.114;
	//	}
	//}
	er = cudaGetLastError();
	if (er != cudaSuccess){
		const char *errorStr = cudaGetErrorString(er);
		return;
	}
	else
	{
		cudaMemcpy(data, d_data, szImg_tHdr, cudaMemcpyDeviceToHost);
	}

	//cudaMemcpy(m_data, d_m_data, m_infoHeader.biSizeImage, cudaMemcpyDeviceToHost);
	cudaFree(d_data);
	cudaFree(d_m_data);
}

Please use code-block markup when posting code. What are “clicks”? One obvious issue: the code needlessly forces double-precision computation (on a device with low double-precision throughput) by using double-precision literal constants like 0.114. You would want to use ‘float’ constants instead, e.g. 0.114f.
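For illustration, here is roughly what the difference looks like (a sketch based on one line of your kernel, not a drop-in patch):

	// double literal: the operand is promoted to double for the multiply,
	// then converted back to float, which is slow on GPUs with weak FP64 throughput
	float blue = m_data[h24_idx] * 0.114;

	// float literal: the whole computation stays in single precision
	float blue2 = m_data[h24_idx] * 0.114f;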

On a tangent, please note that your error checking is not effective: cudaGetLastError() directly after a kernel launch will only capture launch errors, or errors already present before the kernel launch, since it does not synchronize.
To check for errors during kernel execution, you need to check the return code of the cudaMemcpy() call after the kernel launch.
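For example, the tail of your run() could look like this (a sketch using the names from your code):

	doGray<<<gridDim, blockDim>>>(d_data, d_m_data, sz_w, sz_h);
	er = cudaGetLastError(); // only reports launch errors (bad configuration, etc.)
	if (er != cudaSuccess){
		return;
	}
	// cudaMemcpy synchronizes with the kernel, so its return code
	// also reports errors that occurred during kernel execution
	er = cudaMemcpy(data, d_data, szImg_tHdr, cudaMemcpyDeviceToHost);
	if (er != cudaSuccess){
		return;
	}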

Thank you for your response. @njuffa: I used float constants and the speed really did improve a bit; it is now 1100-1200 milliseconds. But there is still a big gap compared with the normal loop. Any suggestions?

__global__ void doGray(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h)
{
	int col = blockIdx.x*blockDim.x + threadIdx.x;
	int row = blockIdx.y*blockDim.y + threadIdx.y;
	int h8_idx = col + row*sz_w;
	int h24_idx = h8_idx * 3;
	if (h8_idx < sz_w*sz_h)
	{
		float blue = m_data[h24_idx];
		blue *= 0.114f;
		float green = m_data[h24_idx + 1];
		green *= 0.587f;
		float red = m_data[h24_idx + 2];
		red *= 0.299f;
		float val = blue + green + red;
		data[h8_idx] = val;
		//data[h8_idx] = m_data[h24_idx] * 0.114f + m_data[h24_idx + 1] * 0.587f + m_data[h24_idx + 2] * 0.299f;
	}
}
void run(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h, int szImg_tHdr, int szImg_mHdr)
{
	unsigned char * d_data;
	unsigned char * d_m_data;
	cudaError_t er = cudaMalloc(&d_data, szImg_tHdr); //tInfoHdr.biSizeImage
	if (er!=cudaSuccess){
		return;
	}
	er = cudaMalloc(&d_m_data, szImg_mHdr);//m_infoHeader.biSizeImage
	if (er != cudaSuccess){
		return;
	}
	er = cudaMemset(d_data, 0, szImg_tHdr);
	if (er != cudaSuccess){
		return;
	}
	//calculate
	int g_m_x = (sz_w + 31) >> 5;
	int g_m_y = sz_h/2;
	dim3 gridDim(g_m_x,g_m_y);
	dim3 blockDim(32,2);
	er = cudaMemcpy(d_m_data, m_data, szImg_mHdr, cudaMemcpyHostToDevice);
	if (er != cudaSuccess){
		return;
	}
	doGray<<<gridDim, blockDim>>>(d_data, d_m_data, sz_w, sz_h);
	//normal loop
	//for (int h = 0; h < m_infoHeader.biHeight; h++){
	//	int h8_idx = h*m_infoHeader.biWidth;
	//	int h24_idx = h*m_infoHeader.biWidth*3;
	//	for (int w = 0; w < m_infoHeader.biWidth; w++)
	//	{
	//	BYTE blue = m_data[h24_idx + w * 3];
	//	BYTE green = m_data[h24_idx + w * 3 + 1];
	//	BYTE red = m_data[h24_idx + w * 3 + 2];
	//	//Y = 0.299R+0.587G+0.114B
	//	data[h8_idx + w] = red * 0.299 + green * 0.587 + blue * 0.114;
	//	}
	//}
	er = cudaGetLastError();
	if (er != cudaSuccess){
		const char* errorStr = cudaGetErrorString(er);
		return;
	}
	else
	{
		cudaMemcpy(data, d_data, szImg_tHdr, cudaMemcpyDeviceToHost);
	}
	cudaFree(d_data);
	cudaFree(d_m_data);
}

And my timer is:

	const clock_t begin = clock();
	//start CUDA
	run(data, m_data, m_infoHeader.biWidth, m_infoHeader.biHeight, tInfoHdr.biSizeImage, m_infoHeader.biSizeImage);
	TRACE("time consumption is %d\n", clock() - begin);