Opencv+cuda to do to do inverse polar coordinate transformation,I can get ture result

I had some trouble when I tried to rewrite this program as CUDA.

include
include “opencv2/opencv.hpp”
include
using namespace std;
using namespace cv;

int main(int argc, char** argv)
{
Mat src;
src = imread(“…/3.png”, 0);

namedWindow("InputImages", WINDOW_NORMAL);
imshow("InputImages", src);

int src_height = src.rows;
int src_width = src.cols;

Size dstSize = Size(2 * src_height, 2 * src_height);
Mat dst = Mat::zeros(dstSize, CV_8UC1);

double scale_r = 2 * src_height / (dstSize.width);
double scale_theta = src_width / CV_2PI;

for (int i = 0; i < dstSize.height; ++i) {
    for (int j = 0; j < dstSize.width; ++j) {
       
        Point2d center(dstSize.width / 2, dstSize.width / 2);
        double distance = sqrt(pow(i - center.y, 2) + pow(j - center.x, 2));
        if (distance < dstSize.width / 2) {
            double Rec_Y = distance; 
            if (Rec_Y < 0) {
                Rec_Y = 0;
            }
            if (Rec_Y > dstSize.width / 2) {
                Rec_Y = dstSize.width / 2;
            }
            double line_theta = atan2(i - center.y, j - center.x);
            if (line_theta < 0) {
                line_theta += CV_2PI;
            }
            if (line_theta < 0) {
                cout << "still<0" << endl;
            }
            double Rec_X = line_theta * scale_theta;
            dst.at<uchar>(i, j) = src.at<uchar>((int)Rec_Y, (int)Rec_X);
        }
    }
}
namedWindow("OutputImages", WINDOW_NORMAL);
imshow("OutputImages",dst);
imwrite("C:/Users/Liuru/Desktop/result.jpg", dst);
printf("%d", dstSize.width);
waitKey();
return 0; 

}

The following is the nth version of CUDA program I have rewritten:

include
include"opencv2/opencv.hpp"

#include<cuda_runtime.h>
#include<device_launch_parameters.h>
using namespace std;
using namespace cv;

global void polartrans(const uchar3* d_in, uchar3* d_out, int dstheight, int dstwidth, Point2d center,int width) {

double scale_theta = width / CV_2PI;

for (int i = blockDim.y * blockIdx.y + threadIdx.y; i < dstheight; i += gridDim.y * blockDim.y)
    for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < dstwidth; j += gridDim.x * blockDim.x)
    {
        double distance = sqrt(((i - center.y) * (i - center.y)) + ((j - center.x) * (j - center.x)));
        //printf("distance:%f i = %d\n" ,distance,i);
        if (distance < dstwidth / 2) {
            double Rec_Y = distance; 

            if (Rec_Y < 0) {
                Rec_Y = 0;
            }
            if (Rec_Y > dstwidth / 2) {
                Rec_Y = dstwidth / 2;
            }
            double line_theta = atan2(i - center.y, j - center.x);
            if (line_theta < 0) {
                line_theta += CV_2PI;
            }
            if (line_theta < 0) {
                //cout << "still<0" << endl;
            }
            double Rec_X = line_theta * scale_theta;

            //dst.at<uchar>(i, j) = src.at<uchar>((int)Rec_Y, (int)Rec_X);
            //d_out[i * dstwidth + j] = d_in[(int)Rec_Y * dstwidth + (int)Rec_X];
            //d_out[i * dstwidth + j].x = d_in[(int)Rec_Y * dstwidth + (int)Rec_X].x;
            //d_out[i * dstwidth + j].y = d_in[(int)Rec_Y * dstwidth + (int)Rec_X].y;
            //d_out[i * dstwidth + j].z = d_in[(int)Rec_Y * dstwidth + (int)Rec_X].z;
           
            d_out[i * width + j].x = d_in[(int)Rec_Y * width + (int)Rec_X].x;
            d_out[i * width + j].y = d_in[(int)Rec_Y * width + (int)Rec_X].y;
            d_out[i * width + j].z = d_in[(int)Rec_Y * width + (int)Rec_X].z;
        }

    }

}

int main()
{
Mat src;
src = imread(“…/3.png”);

namedWindow("InputImages", WINDOW_NORMAL);
imshow("InputImages", src);

int height = src.rows;
int width = src.cols;
size_t src_size = sizeof(uchar3) * height * width;

Size dstSize = Size(2 * height, 2 * height);
Mat dst = Mat::zeros(dstSize, CV_8UC1);
int dstheight = dstSize.height;
int dstwidth = dstSize.width;
size_t dst_size = sizeof(uchar3) * height * width*4;

uchar3* d_in = NULL;
uchar3* d_out = NULL;
uchar3* h_out = (uchar3*)dst.data+1;

cudaMalloc((void**)&d_in, src_size);
cudaMalloc((void**)&d_out, dst_size);
cudaMemcpy(d_in, (uchar3*)src.data, src_size, cudaMemcpyHostToDevice);

Point2d center(dstSize.width / 2, dstSize.width / 2);

dim3 dimGrid(8, 8, 1);
dim3 dimBlock(32, 32, 1);

polartrans << <dimGrid, dimBlock >> > (d_in, d_out,dstheight,dstwidth, center,width);
cudaMemcpy(h_out, d_out, dst_size, cudaMemcpyDeviceToHost);

cv::imshow("polartocart", dst);

cv::waitKey();
cudaFree(d_in);
cudaFree(d_out);

return 0;

}

I have made many attempts when the input and output array sizes are inconsistent. If you can solve this problem, I would be grateful.