Averaging N OpenCV images of type CV_16UC1 with CUDA: the image produced by the CUDA kernel is always black

Hi All,

I want to compute the per-pixel mean of N grayscale images of type CV_16UC1.
As a first step, I have written sample code that takes one input grayscale image and simply copies it to the output, just to make sure that the data flow is correct and that no data corruption is happening.

Here is the code:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda/common.hpp>
#include <iostream>
#include <stdio.h>
using namespace std;
void MatType(cv::Mat inputMat);

/**
 * Prints a human-readable description of a matrix's element type.
 *
 * Writes one line to stdout of the form
 * "Mat is of type <depth>C<channels> and should be accessed with <accessor>",
 * e.g. "16UC1" together with "Mat.at<ushort>(y,x)".
 *
 * @param inputMat Matrix whose type() is decoded. Only the type is read; the
 *                 pixel data is not touched.
 */
void MatType(cv::Mat inputMat)
{
	const int inttype = inputMat.type();

	string r, a;
	const int depth = inttype & CV_MAT_DEPTH_MASK;
	// Kept as int and formatted with std::to_string below: adding '0' to a
	// uchar only yields a digit for 1..9 channels and prints garbage for
	// ten or more.
	const int chans = 1 + (inttype >> CV_CN_SHIFT);
	switch (depth) {
	case CV_8U:  r = "8U";   a = "Mat.at<uchar>(y,x)"; break;
	case CV_8S:  r = "8S";   a = "Mat.at<schar>(y,x)"; break;
	case CV_16U: r = "16U";  a = "Mat.at<ushort>(y,x)"; break;
	case CV_16S: r = "16S";  a = "Mat.at<short>(y,x)"; break;
	case CV_32S: r = "32S";  a = "Mat.at<int>(y,x)"; break;
	case CV_32F: r = "32F";  a = "Mat.at<float>(y,x)"; break;
	case CV_64F: r = "64F";  a = "Mat.at<double>(y,x)"; break;
	default:     r = "User"; a = "Mat.at<UKNOWN>(y,x)"; break;
	}
	r += "C";
	r += std::to_string(chans);
	cout << "Mat is of type " << r << " and should be accessed with " << a << endl;

}

/**
 * Copies an image byte-for-byte from `input` to `output`, one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks covering at least width x height
 * threads; thread (x, y) handles pixel (x, y). No shared memory is used.
 *
 * @param input   Source image bytes (device pointer).
 * @param output  Destination image bytes (device pointer).
 * @param width   Image width in pixels.
 * @param height  Image height in pixels.
 * @param inStep  Bytes per source row.
 * @param outStep Bytes per destination row.
 *
 * The number of bytes per pixel is derived as step / width, which is only
 * exact when rows carry no padding.
 */
__global__ void vectorAdd10(unsigned char* input, unsigned char* output, int width, int height, int inStep, int outStep) {
	// Pixel coordinates handled by this thread
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads past the image edge do nothing. Rejecting width <= 0 here also
	// keeps the divisions below from dividing by zero.
	if (width <= 0 || x >= width || y >= height) {
		return;
	}

	// Bytes per pixel on each side (channels * bytes per channel)
	const int in_c = inStep / width;
	const int out_c = outStep / width;

	// Never write more bytes than one output pixel holds: otherwise a thread
	// would spill into its neighbour's pixel (a data race) and the last
	// thread would write past the end of the output buffer.
	const int copy_c = (in_c < out_c) ? in_c : out_c;

	// Byte offsets of the pixel; size_t so large images cannot overflow int
	const size_t in_tid = (size_t)y * (size_t)inStep + (size_t)in_c * (size_t)x;
	const size_t out_tid = (size_t)y * (size_t)outStep + (size_t)out_c * (size_t)x;

	// Copy the pixel
	for (int i = 0; i < copy_c; ++i) {
		output[out_tid + i] = input[in_tid + i];
	}
}

/**
 * Round-trips one image through the GPU: loads "Frame1.png", uploads it,
 * runs the vectorAdd10 copy kernel, downloads the result and displays it.
 *
 * @return 0 on success, 1 if the input image could not be read. CUDA
 *         failures are reported through cudaSafeCall.
 */
int main() { //Umesh: replace this function with other name
	//Read Image
	// NOTE: IMREAD_GRAYSCALE on its own converts the file to 8-bit. To keep a
	// 16-bit file as CV_16UC1, OR in cv::IMREAD_ANYDEPTH.
	cv::Mat input = cv::imread("Frame1.png", cv::IMREAD_GRAYSCALE);
	if (input.empty()) {
		std::cerr << "Could not read Frame1.png" << endl;
		return 1;
	}

	// The output mirrors the input's size and type so that both buffers share
	// the same layout; zero-filled so a copy that never happened shows up as
	// black rather than as uninitialised memory.
	cv::Mat result = cv::Mat::zeros(input.size(), input.type());

	std::cout << "Number of channel=" << input.channels() << endl;
	cout << " " << cv::typeToString(input.type()) << endl;
	cv::imshow("input image", input);
	cv::waitKey(3000);

	const size_t inStep = input.step[0];
	const size_t outStep = result.step[0];
	const size_t inBytes = inStep * static_cast<size_t>(input.rows);
	const size_t outBytes = outStep * static_cast<size_t>(result.rows);

	// Device buffers only: the kernel must never be handed host memory, so
	// no host-side malloc is needed here.
	unsigned char* b_input = nullptr;
	unsigned char* b_output = nullptr;
	std::cout << "step 1" << std::endl;
	cout << "step 21" << endl;
	cudaSafeCall(cudaMalloc<unsigned char>(&b_input, inBytes));
	cudaSafeCall(cudaMalloc<unsigned char>(&b_output, outBytes));
	cout << "step 2" << endl;

	cudaSafeCall(cudaMemcpy(b_input, input.data, inBytes, cudaMemcpyHostToDevice));
	cout << "step 3" << endl;

	// Threadblock size
	const dim3 block(16, 16);

	// Grid size: enough blocks to cover every pixel
	const dim3 grid(cv::cuda::device::divUp(input.cols, block.x), cv::cuda::device::divUp(input.rows, block.y));

	vectorAdd10<<<grid, block>>>(b_input, b_output, input.cols, input.rows, static_cast<int>(inStep), static_cast<int>(outStep));
	// A kernel launch returns no status itself; a launch that failed (bad
	// configuration, no kernel image for this GPU, ...) is only visible here.
	cudaSafeCall(cudaGetLastError());
	cout << "step 4" << endl;

	cudaSafeCall(cudaDeviceSynchronize());
	// Copy the result from device to host
	cudaSafeCall(cudaMemcpy(result.data, b_output, outBytes, cudaMemcpyDeviceToHost));
	cout << "step 5" << endl;
	cout << cv::typeToString(result.type()) << endl;

	cudaSafeCall(cudaFree(b_input));
	cudaSafeCall(cudaFree(b_output));

	cv::Mat result2;
	result.convertTo(result2, CV_8UC1);
	cout << cv::typeToString(result2.type()) << endl;
	cv::imshow("Result2", result2);
	cv::waitKey(3000);
	cout << "step 6" << endl;


	printf("COMPLETED SUCCESFULLY\n");
	return 0;
}

However, the result of the CUDA kernel is always a black image. Can you please help? I suspect I am doing something wrong in the data handling.