Kernel function makes it slower than a normal for loop

While writing a program to convert a 24-bit BMP picture to an 8-bit grayscale one, I added a timer to record the processing time. The result made me laugh out loud:
with a normal for loop it took about 15-17 clicks/sec,
but with the CUDA kernel it could be 1600 clicks/sec.
My sample picture is 800 x 600, and I run it in Visual Studio 2013 + CUDA 7.5 on an NVIDIA GeForce 940M in a ThinkPad T550.
What is the matter???

__global__ void doGray(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h)
{
	int col = blockIdx.x*blockDim.x + threadIdx.x;
	int row = blockIdx.y*blockDim.y + threadIdx.y;
	int h8_idx = col + row*sz_w; // index into the 8-bit output
	int h24_idx = h8_idx * 3;    // index into the 24-bit input
	if (h8_idx < sz_w*sz_h)
	{
		// Y = 0.299R + 0.587G + 0.114B
		float blue = m_data[h24_idx];
		blue *= 0.114;
		float green = m_data[h24_idx + 1];
		green *= 0.587;
		float red = m_data[h24_idx + 2];
		red *= 0.299;
		float val = blue + green + red;
		data[h8_idx] = val;
	}
}
void run(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h, int szImg_tHdr, int szImg_mHdr)
{
	unsigned char *d_data;
	unsigned char *d_m_data;
	cudaError_t er = cudaMalloc(&d_data, szImg_tHdr); //tInfoHdr.biSizeImage
	if (er != cudaSuccess){
		return;
	}
	er = cudaMalloc(&d_m_data, szImg_mHdr); //m_infoHeader.biSizeImage
	if (er != cudaSuccess){
		return;
	}
	er = cudaMemset(d_data, 0, szImg_tHdr);
	if (er != cudaSuccess){
		return;
	}
	//calculate grid size: ceil(sz_w/32) x (sz_h/2) blocks of 32 x 2 threads
	int g_m_x = (sz_w + 31) >> 5;
	int g_m_y = sz_h/2;
	dim3 gridDim(g_m_x, g_m_y);
	dim3 blockDim(32, 2);
	er = cudaMemcpy(d_m_data, m_data, szImg_mHdr, cudaMemcpyHostToDevice);
	if (er != cudaSuccess){
		return;
	}
	doGray<<<gridDim, blockDim>>>(d_data, d_m_data, sz_w, sz_h);
	//normal loop
	//for (int h = 0; h < m_infoHeader.biHeight; h++){
	//	int h8_idx = h*m_infoHeader.biWidth;
	//	int h24_idx = h*m_infoHeader.biWidth*3;
	//	for (int w = 0; w < m_infoHeader.biWidth; w++)
	//	{
	//		BYTE blue = m_data[h24_idx + w * 3];
	//		BYTE green = m_data[h24_idx + w * 3 + 1];
	//		BYTE red = m_data[h24_idx + w * 3 + 2];
	//		//Y = 0.299R+0.587G+0.114B
	//		data[h8_idx + w] = red * 0.299 + green * 0.587 + blue * 0.114;
	//	}
	//}
	er = cudaGetLastError();
	if (er != cudaSuccess){
		const char *errorStr = cudaGetErrorString(er);
		return;
	}
	else
	{
		cudaMemcpy(data, d_data, szImg_tHdr, cudaMemcpyDeviceToHost);
	}

	//cudaMemcpy(m_data, d_m_data, m_infoHeader.biSizeImage, cudaMemcpyDeviceToHost);
	cudaFree(d_data);
	cudaFree(d_m_data);
}

Please use code-block markup when posting code. What are “clicks”? One obvious issue: the code needlessly forces double-precision computation (on a device with low double-precision throughput) by using double-precision literal constants like 0.114. You would want to use ‘float’ constants instead, e.g. 0.114f.
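For illustration, here is roughly what the difference looks like (a sketch based on one line of your kernel, not a drop-in patch):

	// double literal: the operand is promoted to double for the multiply,
	// then converted back to float, which is slow on GPUs with weak FP64 throughput
	float blue = m_data[h24_idx] * 0.114;

	// float literal: the whole computation stays in single precision
	float blue2 = m_data[h24_idx] * 0.114f;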

On a tangent, please note that your error checking is not effective: cudaGetLastError() directly after a kernel launch will only capture launch errors, or errors already present before the kernel launch, since it does not synchronize.
To check for errors during kernel execution, you need to check the return code of the cudaMemcpy() call after the kernel launch.
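For example, the tail of your run() could look like this (a sketch using the names from your code):

	doGray<<<gridDim, blockDim>>>(d_data, d_m_data, sz_w, sz_h);
	er = cudaGetLastError(); // only reports launch errors (bad configuration, etc.)
	if (er != cudaSuccess){
		return;
	}
	// cudaMemcpy synchronizes with the kernel, so its return code
	// also reports errors that occurred during kernel execution
	er = cudaMemcpy(data, d_data, szImg_tHdr, cudaMemcpyDeviceToHost);
	if (er != cudaSuccess){
		return;
	}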

Thank you for your response. @njuffa: I used float constants and the speed really did improve a bit; it is now 1100-1200 milliseconds. But there is still a big gap compared with the normal loop. Any suggestions?

__global__ void doGray(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h)
{
	int col = blockIdx.x*blockDim.x + threadIdx.x;
	int row = blockIdx.y*blockDim.y + threadIdx.y;
	int h8_idx = col + row*sz_w;
	int h24_idx = h8_idx * 3;
	if (h8_idx < sz_w*sz_h)
	{
		float blue = m_data[h24_idx];
		blue *= 0.114f;
		float green = m_data[h24_idx + 1];
		green *= 0.587f;
		float red = m_data[h24_idx + 2];
		red *= 0.299f;
		float val = blue + green + red;
		data[h8_idx] = val;
		//data[h8_idx] = m_data[h24_idx] * 0.114f + m_data[h24_idx + 1] * 0.587f + m_data[h24_idx + 2] * 0.299f;
	}
}
void run(unsigned char *data, unsigned char *m_data, int sz_w, int sz_h, int szImg_tHdr, int szImg_mHdr)
{
	unsigned char * d_data;
	unsigned char * d_m_data;
	cudaError_t er = cudaMalloc(&d_data, szImg_tHdr); //tInfoHdr.biSizeImage
	if (er!=cudaSuccess){
		return;
	}
	er = cudaMalloc(&d_m_data, szImg_mHdr);//m_infoHeader.biSizeImage
	if (er != cudaSuccess){
		return;
	}
	er = cudaMemset(d_data, 0, szImg_tHdr);
	if (er != cudaSuccess){
		return;
	}
	//calculate
	int g_m_x = (sz_w + 31) >> 5;
	int g_m_y = sz_h/2;
	dim3 gridDim(g_m_x,g_m_y);
	dim3 blockDim(32,2);
	er = cudaMemcpy(d_m_data, m_data, szImg_mHdr, cudaMemcpyHostToDevice);
	if (er != cudaSuccess){
		return;
	}
	doGray<<<gridDim, blockDim>>>(d_data, d_m_data, sz_w, sz_h);
	//normal loop
	//for (int h = 0; h < m_infoHeader.biHeight; h++){
	//	int h8_idx = h*m_infoHeader.biWidth;
	//	int h24_idx = h*m_infoHeader.biWidth*3;
	//	for (int w = 0; w < m_infoHeader.biWidth; w++)
	//	{
	//	BYTE blue = m_data[h24_idx + w * 3];
	//	BYTE green = m_data[h24_idx + w * 3 + 1];
	//	BYTE red = m_data[h24_idx + w * 3 + 2];
	//	//Y = 0.299R+0.587G+0.114B
	//	data[h8_idx + w] = red * 0.299 + green * 0.587 + blue * 0.114;
	//	}
	//}
	er = cudaGetLastError();
	if (er != cudaSuccess){
		const char* errorStr = cudaGetErrorString(er);
		return;
	}
	else
	{
		cudaMemcpy(data, d_data, szImg_tHdr, cudaMemcpyDeviceToHost);
	}
	cudaFree(d_data);
	cudaFree(d_m_data);
}

And my timer is:

	const clock_t begin = clock();
	//start CUDA
	run(data, m_data, m_infoHeader.biWidth, m_infoHeader.biHeight, tInfoHdr.biSizeImage, m_infoHeader.biSizeImage);
	TRACE("time consumption is %d\n", clock() - begin);