Hello, developer!
I am beginner of cuda programming but i am trying to make myself.
But I found that i have a limited knowledge of mine with Cuda, Graphics or else.
In this case, I want to make up GPU image subtraction with OpenCV in Cuda 10.0 with Visual Studio 2017 to show GPU parallel process is much faster than CPU sequence process.
BUT i have really trouble with parallel access with images.
Using a pixel from each of images access to GPU is very hard to find a way.
PLEASE check my code and tell me what is wrong or tell me what should i fix in my code.
// basic header
#include <iostream>
#include <cstdlib>
#include <ctime>
// Cuda header
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// OpenCV header
#include <opencv.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/core/cuda/common.hpp>
using namespace std;
using namespace cv;
void GPUimageSubtract(Mat *img1, Mat *img2, int row, int col, int row2, int col2) {
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
int offset = x + y * blockDim.x*gridDim.x; // <-- I don't know what is meaning for?
/*
I don't know about this part.
Give me a hint for understanding
*/
}
int main(){
Mat image1 = imread("sky.jpg");
Mat image2 = imread("sky1.jpg");
Mat CPU_res, GPU_res;
if (image1.empty() || image2.empty()) {
cout << "Cannot open the image file." << endl;
return 0;
}
clock_t begin = clock(); // Time Start
cv::subtract(image1, image2, CPU_res); // CPU operation
clock_t end = clock(); // Time End
double esec = double(end - begin) / CLOCKS_PER_SEC;
Mat *dev_image1, *dev_image2;
if ( cudaMalloc((void**)dev_image1, sizeof(Mat)) != cudaSuccess) {
cout << "Error with dev_image1" << endl;
return 0;
}
if (cudaMalloc((void**)dev_image2, sizeof(Mat)) != cudaSuccess) {
cout << "Error with dev_image2" << endl;
return 0;
}
cudaMemcpy(dev_image1, &image1, sizeof(image1), cudaMemcpyHostToDevice);
cudaMemcpy(dev_image2, &image2, sizeof(image1), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
float esec2;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0); // cuda time start
GPUimageSubtract <<< >> > (dev_image1, dev_image2, image1.rows, image1.cols, image2.rows, image2.cols);
// How can i know about fitting blocks, threads?
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // cuda time end
cudaEventElapsedTime(&esec2, start, stop);
cudaMemcpy(&GPU_res, dev_image1, sizeof(dev_image1), cudaMemcpyDeviceToHost);
cout << fixed;
cout.precision(10);
cout << "CPU = " << esec << endl;
cout << "GPU = " << esec2 << endl;
imshow("CPU_res", CPU_res);
imshow("GPU_res", GPU_res);
waitKey(0);
cudaFree(dev_image1);
cudaFree(dev_image2);
return 0;
}
I know this code is really stupid and silly but this is what i can do best now.
BUT I want to know why and how very well!
Thank you!
program3.cu (2.31 KB)