Hi,
I’m using CUDA for image proccessing. The images are usually in RGB or BGR format (uchar3), which is useless for effective memory access (coalescing). At first I was converting images into RGBA format on CPU before sending them on graphic card. Baut this way is realy slow for big images (50-100ms). So I decided to use GPU for this. But the GPU have limit size of global memory and I want this algorithm to be in place.
I’m using cudaMemcpy2D function for copy RGB image on RGBA. RGB image is copied into RGBA memory size as RGBRGBRGBRGB…RGB000…000. I’m posting my code and the result image I’m geting. There you can see horizontal lines. I don;t understand why this is happening. I think that my process of converting RGB2RGBA and RGBA2RGB is ok. Why is this happening? THX
__global__ void kernel_rgb2rgbaI(uchar4 * image, const int width, const int height)
{
__shared__ uchar4 shared[256];
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width)
{
unsigned char * row = (unsigned char *)&image[y*width];
x = width - 1 - x; // odzadu
int idx = 3*x;
//uchar4 rgba = make_uchar4(row[idx], row[idx+1], row[idx+2], 0);
shared[threadIdx.x] = make_uchar4(row[idx], row[idx+1], row[idx+2], 0);
__syncthreads();
image[y*width + x] = shared[threadIdx.x];
}
} // kernel_rgb2rgbaI()
__global__ void kernel_rgba2rgbI(uchar4 * image, const int width, const int height)
{
__shared__ uchar4 shared[1024];
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width)
{
unsigned char * row = (unsigned char *)&image[y*width];
int idx = 3*x;
//uchar4 rgba = image[y*width + x];
shared[threadIdx.x] = image[y*width + x];
__syncthreads();
//row[idx] = rgba.x;
//row[idx+1] = rgba.y;
//row[idx+2] = rgba.z;
row[idx] = shared[threadIdx.x].x;
row[idx+1] = shared[threadIdx.x].y;
row[idx+2] = shared[threadIdx.x].z;
}
} // kernel_rgba2rgbI()
extern "C" void rgb2rgbaI(uchar4 * image, const int width, const int height)
{
//cudaFuncAttributes attr;
//cudaError error = cudaFuncGetAttributes(&attr, kernel_rgb2rgba);
int num_threads_per_block = 256;//attr.maxThreadsPerBlock;
dim3 block_size = dim3(num_threads_per_block, 1, 1);
dim3 num_blocks = dim3((width + num_threads_per_block - 1) / num_threads_per_block, height, 1);
kernel_rgb2rgbaI<<<num_blocks, block_size>>>(image, width, height);
cudaThreadSynchronize();
} // rgb2rgbaI()
extern "C" void rgba2rgbI(uchar4 * image, const int width, const int height)
{
//cudaFuncAttributes attr;
//cudaError error = cudaFuncGetAttributes(&attr, kernel_rgba2rgb);
int num_threads_per_block = 256;//attr.maxThreadsPerBlock;
dim3 block_size = dim3(num_threads_per_block, 1, 1);
dim3 num_blocks = dim3((width + num_threads_per_block - 1) / num_threads_per_block, height, 1);
kernel_rgba2rgbI<<<num_blocks, block_size>>>(image, width, height);
cudaThreadSynchronize();
} // rgb2rgbaI()
uchar4 * input_d;
cudaMalloc(&input_d, imagesize);
cudaMemcpy2D(input_d, 4*width, image->data(), 3*width, 3*width, height, cudaMemcpyHostToDevice);
rgb2rgbaI(input_d, width, height);
// some image processing
rgba2rgbI(input_d, width, height);
cudaMemcpy2D(image->data(), 3*width, input_d, 4*width, 3*width, height, cudaMemcpyDeviceToHost);