Hello everyone,
I would be grateful for any help, because I have no one to guide me with CUDA programming.
I am trying to convert a BMP image from RGB colour format to YUV format. The image size is 640x480. The code works, but it takes 46 ms to process one frame.
I also have another version of the code, in which I read the whole image in 25 iterations, each iteration reading 12800 pixels and passing those 12800 pixels from the CPU to the GPU. In that version I calculate each YUV component separately. If I add up the kernel elapsed time of every iteration, the total is around 0.514 ms per component, i.e. only about 1.5 ms in total.
I wrote the following code with the help of various topics on this forum. Can this code, which takes 46 ms per frame, be made more efficient?
This is my host-side C code:
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return false;
}

// Loads one BMP frame, converts its pixel data on the GPU with rgbToyuv,
// prints the kernel time and the first converted pixels.
// Returns 0 on success and 1 on any file, allocation or CUDA failure.
int main()
{
    // Upper bound (in bytes) of the debug dump printed after the conversion.
    const size_t kDumpBytes = 62000;

    FILE *fp = NULL;                    // must be a pointer: fopen() returns FILE*
    bmpfileinfoheader *hp = NULL;       // must be a pointer: it is malloc'ed and used with ->
    bmpfileinfoheader *d_hp = NULL;
    unsigned char *data = NULL;
    unsigned char *d_data = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    int height = 0, width = 0;
    float elapsedMs = 0.0f;
    int status = 1;                     // set to 0 only when every step succeeded

    // Single-exit structure: any failure breaks out and falls through to the
    // shared cleanup below, so nothing leaks on an early error.
    do {
        if (!cudaSucceeded(cudaEventCreate(&start), "cudaEventCreate(start)"))
            break;
        if (!cudaSucceeded(cudaEventCreate(&stop), "cudaEventCreate(stop)"))
            break;

        //Open input file:
        fp = fopen("C:\\Input_Frames\\Minutes\\1\\0000.bmp", "rb");
        if (fp == NULL) {
            fprintf(stderr, "Cannot open input file\n");
            break;
        }

        //Read the file headers
        hp = (bmpfileinfoheader*)malloc(sizeof(bmpfileinfoheader));
        if (hp == NULL) {
            fprintf(stderr, "Out of host memory (header)\n");
            break;
        }
        if (fread(hp, sizeof(bmpfileinfoheader), 1, fp) != 1) {
            fprintf(stderr, "Cannot read the BMP headers\n");
            break;
        }
        if (!cudaSucceeded(cudaMalloc(&d_hp, sizeof(bmpfileinfoheader)), "cudaMalloc(d_hp)"))
            break;
        if (!cudaSucceeded(cudaMemcpy(d_hp, hp, sizeof(bmpfileinfoheader), cudaMemcpyHostToDevice),
                           "cudaMemcpy(header to device)"))
            break;

        //Read the image data
        const size_t imageBytes = hp->biSizeImage;
        printf("\nImage size : %ld",hp->biSizeImage);
        getch();

        height = hp->biHeight;
        width = hp->biWidth;
        if (height <= 0 || width <= 0) {
            fprintf(stderr, "Unsupported image dimensions %d x %d\n", width, height);
            break;
        }
        // The kernel touches 3 bytes for each of width*height pixels; refuse to
        // launch it on a buffer that is smaller than that.
        if ((size_t)width * (size_t)height * 3 > imageBytes) {
            fprintf(stderr, "Image data is smaller than width*height*3 bytes\n");
            break;
        }

        data = (unsigned char*)malloc(imageBytes);
        if (data == NULL) {
            fprintf(stderr, "Out of host memory (pixels)\n");
            break;
        }
        if (!cudaSucceeded(cudaMalloc(&d_data, imageBytes), "cudaMalloc(d_data)"))
            break;
        if (fseek(fp, hp->fileheader.bfOffBits, SEEK_SET) != 0 ||
            fread(data, sizeof(char), imageBytes, fp) != imageBytes) {
            fprintf(stderr, "Cannot read the pixel data\n");
            break;
        }
        if (!cudaSucceeded(cudaMemcpy(d_data, data, imageBytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(pixels to device)"))
            break;

        // One block per pixel, one thread per block.
        // NOTE(review): a single thread per block leaves most of each warp idle
        // and is the most likely reason for the long kernel time. Only raise the
        // block size (e.g. 256 threads and ceil(width*height/256) blocks) if
        // rgbToyuv derives its pixel index from threadIdx as well as blockIdx;
        // verify against the kernel before changing this launch.
        dim3 grid(height,width);
        cudaEventRecord(start, 0);
        rgbToyuv<<<grid,1>>>(d_hp, d_data, height, width);
        // Launch-configuration errors are only visible through cudaGetLastError().
        const cudaError_t launchErr = cudaGetLastError();
        cudaEventRecord(stop, 0);
        if (!cudaSucceeded(launchErr, "rgbToyuv launch"))
            break;
        // Waiting on `stop` also surfaces faults raised while the kernel ran.
        if (!cudaSucceeded(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)"))
            break;
        if (!cudaSucceeded(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime"))
            break;
        printf("Kernel elapsed time: %3.3f ms \n", elapsedMs);
        getch();

        if (!cudaSucceeded(cudaMemcpy(data, d_data, imageBytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(pixels to host)"))
            break;

        //Printing some pixels
        // Three bytes per printed line; stops at the end of the buffer so small
        // images are not read out of bounds.
        int pixelNo = 0;
        for (size_t i = 0; i < kDumpBytes && i + 2 < imageBytes; i += 3, ++pixelNo)
        {
            printf("\n%d : %d ",pixelNo,data[i]);
            printf("%d ",data[i + 1]);
            printf("%d ",data[i + 2]);
        }
        getch();

        status = 0;
    } while (0);

    // Shared cleanup; free()/cudaFree() accept null pointers.
    free(hp);
    free(data);
    cudaFree(d_data);
    cudaFree(d_hp);
    if (stop != NULL)
        cudaEventDestroy(stop);
    if (start != NULL)
        cudaEventDestroy(start);
    if (fp != NULL)
        fclose(fp);
    if (status == 0)
        getch();
    return status;
}
Kernel function:
// Saturates a float to the 0..255 range of an unsigned char.
// Without this, a value of 256 (reachable as ceil(127.5) + 128) would not fit
// in the output byte.
static __device__ __forceinline__ unsigned char clampToByte(float v)
{
    return (unsigned char)fminf(fmaxf(v, 0.0f), 255.0f);
}

/*
 * Converts packed 3-byte pixels to YUV, in place.
 *
 * Input layout per pixel : byte 0 is weighted as blue, byte 1 as green,
 *                          byte 2 as red.
 * Output layout per pixel: byte 0 = Y, byte 1 = U + 128, byte 2 = V + 128.
 *
 * hp     : unused; kept so existing callers keep working.
 * data   : device buffer of at least height * width * 3 bytes.
 * height : image height in pixels; nothing is done when it is <= 0.
 * width  : image width in pixels; nothing is done when it is <= 0.
 *
 * Launch: any grid/block shape is accepted. Every thread starts at its
 * flattened global index and advances by the total number of launched
 * threads, so a one-thread-per-block grid of height*width blocks and a
 * grid of 256-thread blocks both cover each pixel exactly once. No shared
 * memory is used.
 *
 * NOTE(review): pixels are addressed as one contiguous run of 3-byte
 * triples. That matches BMP data only when width * 3 is a multiple of 4
 * (true for width = 640); padded rows are not skipped - confirm for other
 * image sizes.
 */
__global__ void rgbToyuv(bmpfileinfoheader* hp, unsigned char* data, int height, int width)
{
    (void)hp;

    if (height <= 0 || width <= 0)
        return;

    const size_t pixelCount = (size_t)height * (size_t)width;

    // Flatten the (up to 3-D) block and grid coordinates into one linear id.
    // With one thread per block and a 2-D grid this reduces to
    // blockIdx.x + blockIdx.y * gridDim.x.
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y * blockDim.z;
    const size_t blockId = blockIdx.x
                         + (size_t)gridDim.x * (blockIdx.y + (size_t)gridDim.y * blockIdx.z);
    const size_t threadInBlock = threadIdx.x
                               + (size_t)blockDim.x * (threadIdx.y + (size_t)blockDim.y * threadIdx.z);
    const size_t firstPixel = blockId * threadsPerBlock + threadInBlock;
    const size_t stride = (size_t)gridDim.x * gridDim.y * gridDim.z * threadsPerBlock;

    for (size_t pixel = firstPixel; pixel < pixelCount; pixel += stride)
    {
        unsigned char* px = data + 3 * pixel;

        // Read all three inputs before the first store: the conversion is in place.
        const float b = px[0];
        const float g = px[1];
        const float r = px[2];

        // Single-precision literals keep the arithmetic out of double precision.
        // A black pixel needs no special case: the formulas give (0, 128, 128).
        const float yv = ceilf( 0.2990f * r + 0.5870f * g + 0.1140f * b);
        const float uv = ceilf(-0.1684f * r - 0.3316f * g + 0.5000f * b) + 128.0f;
        const float vv = ceilf( 0.5000f * r - 0.4187f * g - 0.0813f * b) + 128.0f;

        px[0] = clampToByte(yv); // assuming 3 byte output format
        px[1] = clampToByte(uv);
        px[2] = clampToByte(vv);
    }
}