I’m afraid I’m a bit short on time this week, so I hope this is good enough; if not, I’ll try to post something more complete in a week.
In any case, I’m guessing that the input image does not have an alpha channel if it is the input of OpenCV
JPEG never has an alpha channel; PNG can have one.
I didn’t test the 4 channel output image, don’t know if OpenCV supports it
I’m also writing this ad-hoc so I hope that there are not too many compiler errors
[codebox]
// Texture reference used to read the input image one byte at a time.
// The host code binds it with a width of (image width * nChannels) texels,
// so texel (x * nChannels + c, y) is channel c of pixel (x, y).
// NOTE: texture references are a legacy API (deprecated in CUDA 11, removed
// in CUDA 12); newer toolkits need texture objects instead.
texture<unsigned char, 2, cudaReadModeElementType> texRef;

/**
 * Expands an interleaved 8-bit image with 1-4 channels into a 4-channel
 * (uchar4) pitched output image.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
 * that fall outside width x height exit immediately.
 *
 * @param gpuOut       Base pointer of the pitched uchar4 output buffer.
 * @param gpuoutStride Row pitch of gpuOut in BYTES (as returned by cudaMallocPitch).
 * @param width        Image width in pixels.
 * @param height       Image height in pixels.
 * @param nChannels    Number of interleaved channels in the bound input texture.
 *
 * Channels that are not present in the input are written as 0 (this includes
 * the 4th component when the input has fewer than 4 channels).
 */
__global__ void CopyKernel(uchar4 *gpuOut, size_t gpuoutStride, int width, int height, int nChannels)
{
    int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // The grid is rounded up to whole blocks, so guard the right/bottom edges.
    if (x >= width || y >= height)
        return;

    // Byte (texel) column of the first channel of pixel x.
    int ix = x * nChannels;

    uchar4 val = make_uchar4(0, 0, 0, 0);
    val.x = tex2D(texRef, ix, y);
    if (nChannels >= 2) val.y = tex2D(texRef, ix + 1, y);
    if (nChannels >= 3) val.z = tex2D(texRef, ix + 2, y);
    if (nChannels >= 4) val.w = tex2D(texRef, ix + 3, y);

    // Pitched addressing: step to row y in bytes, then index pixel x as uchar4.
    uchar4 *row = (uchar4 *)((char *)gpuOut + y * gpuoutStride);
    row[x] = val;
}
// Host-side driver: load an image with OpenCV, upload it to a pitched device
// buffer, expand it to 4 channels with CopyKernel, and download the result.
// NOTE(review): return codes of the CUDA calls are not checked and nothing is
// released here (cudaUnbindTexture / cudaFree / cvReleaseImage); add both once
// this snippet is placed inside its final function.
IplImage* img = cvLoadImage("img.jpg", CV_LOAD_IMAGE_ANYCOLOR);
// Only images whose depth field equals 8 are handled.
if (!img || img->depth != 8)
    return;

// Create an rgba output image
IplImage* oimg = cvCreateImage(cvSize(img->width, img->height), img->depth, 4);

// Input buffer: one row is width * nChannels bytes of interleaved channel data.
unsigned char *gpuIn;
size_t gpuinStride;
cudaMallocPitch((void **)&gpuIn, &gpuinStride, img->width*img->nChannels, img->height);

// 4 channel output
uchar4 *gpuOut;
size_t gpuoutStride;
cudaMallocPitch((void **)&gpuOut, &gpuoutStride, oimg->width*4, oimg->height);

// Upload the source rows; widthStep is the host row pitch, gpuinStride the device one.
cudaMemcpy2D(gpuIn, gpuinStride, img->imageData, img->widthStep, img->width*img->nChannels, img->height, cudaMemcpyHostToDevice);

// The texture is read byte-wise, so the channel descriptor must match unsigned char
// and the bound width is expressed in bytes (width * nChannels texels).
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaBindTexture2D (NULL, &texRef, gpuIn, &desc, img->width*img->nChannels, img->height, gpuinStride);

// One thread per pixel; the grid is rounded up so partial edge blocks are covered.
dim3 dimBlock(16, 16);
dim3 dimGrid((img->width + dimBlock.x - 1)/dimBlock.x, (img->height + dimBlock.y - 1)/dimBlock.y);
CopyKernel <<< dimGrid, dimBlock >>> (gpuOut, gpuoutStride, img->width, img->height, img->nChannels);

// Download the 4-channel result: the source is device memory, the destination is the host image.
cudaMemcpy2D(oimg->imageData, oimg->widthStep, gpuOut, gpuoutStride, oimg->width*oimg->nChannels, oimg->height, cudaMemcpyDeviceToHost);
[/codebox]
If the input is RGBA, i.e. img->nChannels == 4, you can simplify this by using a uchar4 texture (bound with a uchar4 channel descriptor and a width in pixels rather than bytes), e.g.:
[codebox]
// Texture reference for a 4-channel input: each texel is one whole uchar4 pixel.
// NOTE: texture references are a legacy API (deprecated in CUDA 11, removed
// in CUDA 12); newer toolkits need texture objects instead.
texture<uchar4, 2, cudaReadModeElementType> texRef;

/**
 * Copies a 4-channel image from the bound uchar4 texture into a pitched
 * uchar4 output buffer, one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks; threads outside width x height exit.
 *
 * @param gpuOut       Base pointer of the pitched uchar4 output buffer.
 * @param gpuoutStride Row pitch of gpuOut in BYTES (as returned by cudaMallocPitch).
 * @param width        Image width in pixels.
 * @param height       Image height in pixels.
 * @param nChannels    Unused in this variant; kept so the signature matches
 *                     the generic 1-4 channel kernel.
 */
__global__ void CopyKernel(uchar4 *gpuOut, size_t gpuoutStride, int width, int height, int nChannels)
{
    int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // The grid is rounded up to whole blocks, so guard the right/bottom edges.
    if (x >= width || y >= height)
        return;

    // Pitched addressing: step to row y in bytes, then index pixel x as uchar4.
    uchar4 *row = (uchar4 *)((char *)gpuOut + y * gpuoutStride);
    row[x] = tex2D(texRef, x, y);
}
[/codebox]
If you post the input image you are using, I’ll try to make a working example.