Hello,
I have one question on the code below:
vector< vector <int> > kernel;
Mat dill(image.rows,image.cols,CV_8UC1,Scalar::all(0));
auto *dillstart=dill.data;
vector< vector < int> > shouldBeZeroImage(drows, vector<int>(dcols, 0));
dilate(image, dill, element,Point(-1, -1), 1);
int drows=dill.rows;
int dcols=dill.cols;
int n = kernel.size();
int m = kernel[0].size();
#pragma acc enter data copyin(dillstart[:drows*dcols],shouldBeZeroImage[:drows][:dcols],kernel[:n][:m])
#pragma acc parallel loop collapse(2) default(present)
for(int i = n / 2; i < drows - n / 2; i++) {
for(int j = m / 2; j < dcols - m / 2; j++) {
if( (int)dillstart[i*dcols+j] == ONE) {
//bool shouldBeZero = false;
int shouldBeZero = 0;
//#pragma acc parallel loop
for(int crtX = i - n / 2, x = 0; crtX <= i + n / 2; crtX++, x++) {
// #pragma omp parallel for
for(int crtY = j - m / 2, y = 0; crtY <= j + m / 2; crtY++, y++) {
if((int)dillstart[crtX*dcols+crtY] == ZERO && kernel[x][y] == 1) {
// shouldBeZero = true;
shouldBeZero=1;
break;
}
}
}
if(shouldBeZero) {
// shouldBeZeroImage[i][j] = true;
shouldBeZeroImage[i][j]=1;
}
}
}
}
#pragma acc exit data copyout(dillstart[:drows*dcols],shouldBeZeroImage[:drows][:dcols],kernel[:n][:m])
Related information:
WatershedAlg::erosion(cv::Mat, std::vector<std::vector<int, std::allocator<int>>, std::allocator<std::vector<int, std::allocator<int>>>>):
21, Generating enter data copyin(dillstart[:dcols*drows],shouldBeZeroImage,kernel[:m])
Generating NVIDIA GPU code
24, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
25, /* blockIdx.x threadIdx.x collapsed */
30, #pragma acc loop seq
21, Generating default present(dillstart[:],shouldBeZeroImage,kernel[:])
30, Scalar last value needed after loop for shouldBeZero at line 41
Complex loop carried dependence of dillstart->,kernel-> prevents parallelization
32, Complex loop carried dependence of kernel->,dillstart-> prevents parallelization
47, Generating exit data copyout(dillstart[:dcols*drows],shouldBeZeroImage)
Generating enter data copyin(shouldBeZeroImage,dillstart[:dcols*drows])
Generating exit data copyout(kernel[:m])
Generating NVIDIA GPU code
81, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
82, /* blockIdx.x threadIdx.x collapsed */
I don't know why the parallelization was blocked by dillstart and kernel. Could anyone provide any hint or suggestion? Thanks in advance!