Hi,
Need some help (again) hahaha.
I’ve been trying to make an RGB to Grayscale conversion.
I have a sequential algorithm and a parallel one, but the sequential version still seems to perform better than the parallel one. I know something is wrong, but I don’t know what.
My code is the following (the sequential version first, then the CUDA kernel and its host wrapper):
/*
 * Sequential (CPU) RGB -> grayscale conversion.
 *
 * Copies the row/col dimensions from imageRGB into imageGray, calls
 * pprMatrixMem(imageGray), and then writes one gray value per pixel computed
 * as 0.3*R + 0.59*G + 0.11*B from the three planes imageRGB->data[0..2].
 *
 * imageRGB  - source image; data[0], data[1], data[2] are read as the R, G
 *             and B planes, each indexed row-major (row * col + column).
 * imageGray - destination image; its row, col and data fields are written.
 *
 * NOTE(review): pprMatrixMem is defined elsewhere; presumably it allocates
 * imageGray->data for row*col elements - confirm, and confirm who frees it.
 */
void pprRGB2Gray(pprColorImage *imageRGB, pprGrayImage *imageGray)
{
    u_int idxI, idxJ, rowStart, pixel;
    float gray;

    imageGray->row = imageRGB->row;
    imageGray->col = imageRGB->col;
    pprMatrixMem(imageGray);

    for (idxI = 0; idxI < imageRGB->row; idxI++) {
        /* The row offset is constant for the whole inner loop. */
        rowStart = idxI*imageRGB->col;
        for (idxJ = 0; idxJ < imageRGB->col; idxJ++) {
            pixel = rowStart + idxJ;
            /* The weights are double literals, so the weighted sum is
             * evaluated in double and narrowed to float on assignment.
             * Kept as-is so the output matches the original exactly. */
            gray = ((float)imageRGB->data[0][pixel])*0.3 +
                   ((float)imageRGB->data[1][pixel]*0.59) +
                   ((float)imageRGB->data[2][pixel]*0.11);
            /* Implicit conversion to the element type of imageGray->data. */
            imageGray->data[pixel] = gray;
        }
    }
}
/*
 * Converts three separate channel planes (R, G, B) into one grayscale plane:
 *     gray[i] = 0.3f*R[i] + 0.59f*G[i] + 0.11f*B[i]
 * The float result is implicitly converted to u_char on store.
 *
 * imageR, imageG, imageB - device pointers to the input planes, `size`
 *                          elements each.
 * imageGray              - device pointer to the output plane, `size` elements.
 * size                   - number of pixels to convert.
 *
 * Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so
 * any grid/block size produces a complete result; no shared memory is used.
 */
__global__ void kernelRGB2Gray(u_char *imageR, u_char *imageG, u_char *imageB, u_char *imageGray, u_int size)
{
    // The index and stride are size_t so that the comparison against the
    // unsigned `size` is not a signed/unsigned mix and the index cannot wrap
    // around when `size` is close to the top of the u_int range.
    const size_t stride = (size_t)blockDim.x * gridDim.x;

    for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; tid < size; tid += stride) {
        imageGray[tid] = (float)imageR[tid]*0.3f + (float)imageG[tid]*0.59f + (float)imageB[tid]*0.11f;
    }
}
/*
 * GPU version of the RGB -> grayscale conversion.
 *
 * Copies the row/col dimensions from imageRGB into imageGray, calls
 * pprMatrixMem(imageGray), uploads the three channel planes
 * imageRGB->data[0..2], runs kernelRGB2Gray on them and downloads the result
 * into imageGray->data.
 *
 * imageRGB  - source image; data[0], data[1], data[2] are copied to the
 *             device as row*col u_char elements each.
 * imageGray - destination image; its row, col and data fields are written.
 *
 * Errors from the CUDA runtime are passed to CHECK_ERROR (defined elsewhere).
 *
 * NOTE(review): every call allocates and frees four device buffers and does
 * four blocking copies around a kernel that performs three multiplies per
 * pixel. That fixed overhead is outside the kernel and is likely what makes
 * this slower than the CPU loop - time the kernel alone (cudaEvent) to
 * confirm, and consider allocating the device buffers once and reusing them.
 * NOTE(review): pprMatrixMem presumably allocates imageGray->data - confirm.
 */
void pprRGB2Gray(pprColorImage *imageRGB, pprGrayImage *imageGray)
{
    // Launch shape kept from the original (the author chose it empirically).
    // The kernel's grid-stride loop makes it valid for any image size.
    const u_int numBlocks = 128;
    const u_int numThreads = 256;

    u_int size;
    size_t bytes;
    u_char *d_imageR = NULL, *d_imageG = NULL, *d_imageB = NULL, *d_imageGray = NULL;

    size = imageRGB->row*imageRGB->col;
    imageGray->row = imageRGB->row;
    imageGray->col = imageRGB->col;
    pprMatrixMem(imageGray);

    // Nothing to convert: skip the zero-byte allocations and the launch.
    if (size == 0) {
        return;
    }
    bytes = sizeof(u_char)*size;

    //Allocate memory on GPU for each channel of the RGB image and for the Grayscale image.
    CHECK_ERROR( cudaMalloc( (void**)&d_imageR, bytes ) );
    CHECK_ERROR( cudaMalloc( (void**)&d_imageG, bytes ) );
    CHECK_ERROR( cudaMalloc( (void**)&d_imageB, bytes ) );
    CHECK_ERROR( cudaMalloc( (void**)&d_imageGray, bytes ) );

    //Copy each channel of the RGB image to the GPU.
    CHECK_ERROR( cudaMemcpy( d_imageR, imageRGB->data[0], bytes, cudaMemcpyHostToDevice ) );
    CHECK_ERROR( cudaMemcpy( d_imageG, imageRGB->data[1], bytes, cudaMemcpyHostToDevice ) );
    CHECK_ERROR( cudaMemcpy( d_imageB, imageRGB->data[2], bytes, cudaMemcpyHostToDevice ) );

    //Launch kernel with each channel of RGB plus an empty Grayscale array.
    kernelRGB2Gray<<<numBlocks, numThreads>>>(d_imageR, d_imageG, d_imageB, d_imageGray, size);
    // A kernel launch returns no status itself; an invalid launch
    // configuration is only visible through cudaGetLastError().
    CHECK_ERROR( cudaGetLastError() );

    //Copy the result of Grayscale back to the CPU and fill the pprGrayImage structure with the new computed data.
    // This blocking copy also waits for the kernel, so errors raised while
    // the kernel executes are reported here.
    CHECK_ERROR( cudaMemcpy( imageGray->data, d_imageGray, bytes, cudaMemcpyDeviceToHost ) );

    //Free the allocated memory on GPU.
    CHECK_ERROR( cudaFree(d_imageR) );
    CHECK_ERROR( cudaFree(d_imageG) );
    CHECK_ERROR( cudaFree(d_imageB) );
    CHECK_ERROR( cudaFree(d_imageGray) );
}
I should add a few things:
I am working on a MacBook Pro with an NVIDIA 320M.
The number of blocks and threads was set empirically