How can I make a pixel parallel access with cuda?

Hello, developer!
I am beginner of cuda programming but i am trying to make myself.
But I found that i have a limited knowledge of mine with Cuda, Graphics or else.
In this case, I want to make up GPU image subtraction with OpenCV in Cuda 10.0 with Visual Studio 2017 to show GPU parallel process is much faster than CPU sequence process.

BUT i have really trouble with parallel access with images.
Using a pixel from each of images access to GPU is very hard to find a way.
PLEASE check my code and tell me what is wrong or tell me what should i fix in my code.

// basic header
#include <iostream>
#include <cstdlib>
#include <ctime>

// Cuda header
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// OpenCV header
#include <opencv.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/core/cuda/common.hpp>

using namespace std;
using namespace cv;

void GPUimageSubtract(Mat *img1, Mat *img2, int row, int col, int row2, int col2) {
	int x = threadIdx.x + blockIdx.x*blockDim.x;
	int y = threadIdx.y + blockIdx.y*blockDim.y;
	int offset = x + y * blockDim.x*gridDim.x;  // <-- I don't know what is meaning for?

		I don't know about this part.
                Give me a hint for understanding


int main(){
	Mat image1 = imread("sky.jpg");
	Mat image2 = imread("sky1.jpg");
	Mat CPU_res, GPU_res;
	if (image1.empty() || image2.empty()) {
		cout << "Cannot open the image file." << endl;
		return 0;

	clock_t begin = clock();		// Time Start

	cv::subtract(image1, image2, CPU_res);	// CPU operation

	clock_t end = clock();			// Time End
	double esec = double(end - begin) / CLOCKS_PER_SEC;

	Mat *dev_image1, *dev_image2;
	if ( cudaMalloc((void**)dev_image1, sizeof(Mat)) != cudaSuccess) {
		cout << "Error with dev_image1" << endl;
		return 0;
	if (cudaMalloc((void**)dev_image2, sizeof(Mat)) != cudaSuccess) {
		cout << "Error with dev_image2" << endl;
		return 0;

	cudaMemcpy(dev_image1, &image1, sizeof(image1), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_image2, &image2, sizeof(image1), cudaMemcpyHostToDevice);
	cudaEvent_t start, stop;
	float esec2;
	cudaEventRecord(start, 0);		// cuda time start

	GPUimageSubtract <<< >> > (dev_image1, dev_image2, image1.rows, image1.cols, image2.rows, image2.cols);
        // How can i know about fitting blocks, threads?

	cudaEventRecord(stop, 0);		
	cudaEventSynchronize(stop);		// cuda time end
	cudaEventElapsedTime(&esec2, start, stop);

	cudaMemcpy(&GPU_res, dev_image1, sizeof(dev_image1), cudaMemcpyDeviceToHost);

	cout << fixed;
	cout << "CPU = " << esec << endl;
	cout << "GPU = " << esec2 << endl;

	imshow("CPU_res", CPU_res);
	imshow("GPU_res", GPU_res);


	return 0;

I know this code is really stupid and silly but this is what i can do best now.
BUT I want to know why and how very well!

Thank you!

sky.jpg (2.31 KB)