Error for "call to cuStreamSynchronize returned error 7

Hi all,

I would like to apply acc routine at this time, but it give “call to cuStreamSynchronize returned error 700: Illegal address during kernel execution” error.

What I am trying to do is just converting image(int*) to integral image(int*).

My codes are followings:

#include <opencv2/opencv.hpp>
#include <boost/filesystem.hpp>
using namespace cv;
using namespace std;

const int I_WIDTH = 5;
const int I_HEIGHT = 5;
const int I_LENGTH = 25;

const int II_WIDTH = 6;
const int II_HEIGHT = 6;
const int II_LENGTH = 36;

const int NUM_IMG = 3;

void Mat2Arr(Mat in, int* &img_data, int img_length)
{
	uchar* temp = in.data;
	for (int i = 0; i < img_length; i++)
	{
		img_data[i] = (int) temp[i];
//		cout << img_data[i] << " ";
	}
//	cout << endl;
}

#pragma void routine seq
void Img2II(int* img, int* &ii, int ii_width, int ii_height)
{
#pragma acc loop seq
	for (int i = 0; i < ii_height; i++)
	{
		for (int j = 0; j < ii_width; j++)
		{
			if (i == 0 || j == 0)
			{
				ii[ii_width * i + j] = 0;
			}
			else
			{
				int val = img[(ii_width - 1) * (i - 1) + (j - 1)];
				ii[ii_width * i + j] = val + ii[ii_width * (i - 1) + j]
						+ ii[ii_width * i + (j - 1)]
						- ii[ii_width * (i - 1) + (j - 1)];
			}
//			cout << ii[ii_width * i + j] << " ";
		}
//		cout << endl;
	}
}

void MatSet2Arr(string dir, int** &imgs, int num_img, int img_length)
{

}

int main(void)
{
	Mat img1 = imread("/home/dongyoung/Desktop/test1.jpg",
			CV_LOAD_IMAGE_GRAYSCALE);
	Mat img2 = imread("/home/dongyoung/Desktop/test2.jpg",
			CV_LOAD_IMAGE_GRAYSCALE);
	Mat img3 = imread("/home/dongyoung/Desktop/test3.jpg",
			CV_LOAD_IMAGE_GRAYSCALE);

	int** imgs = (int**) malloc(sizeof(int*) * NUM_IMG);
	for (int i = 0; i < NUM_IMG; i++)
	{
		imgs[i] = (int*) malloc(sizeof(int) * I_LENGTH);
		stringstream ss;
		ss << "/home/dongyoung/Desktop/test" << i << ".jpg";
		string filename = ss.str();
		Mat temp = imread(filename, CV_LOAD_IMAGE_GRAYSCALE);
		Mat2Arr(temp, imgs[i], I_LENGTH);
	}

	int** iis = (int**) malloc(sizeof(int*) * NUM_IMG);

	for (int i = 0; i < NUM_IMG; i++)
	{
		iis[i] = (int*) malloc(sizeof(int) * II_LENGTH);
	}

#pragma acc parallel loop
	for (int i = 0; i < NUM_IMG; i++)
	{
		Img2II(imgs[i], iis[i], II_WIDTH, II_HEIGHT);
	}



	//Test
	for(int i = 0; i<NUM_IMG; i++)
	{
		int* curII = iis[i];
		for(int j = 0; j<II_HEIGHT; j++)
		{
			for(int k = 0; k<II_WIDTH; k++)
			{
				cout<<curII[II_WIDTH*j+k]<<" ";
			}
			cout<<endl;
		}
		cout<<endl;
	}
}

The output:

call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

Please someone insight me to solve this problem.

Thanks in advance,

DK

Hi DK,

Typically when you see error 700, there’s some type of memory error. Since you’re code uses external packages (openCV, boost) and files I’m not able to build or run your example so don’t know exactly what’s wrong. Though in looking through your code, I see two issues.

First, you have “#pragma void routine seq” which should be “#pragma acc routine seq”. Hence, I doubt your code actually generated an OpenACC device routine. What does the compiler feedback messages show (i.e. -Minfo=accel)? I suspect you wont see any messages for this routine.

Second, you haven’t copied any data to the device. The compiler’s not going to be able to do this automatically here so you need to add in data regions. Try following the example I wrote for you in your first post: about FATAL ERROR (c++, PGI 14.7)

  • Mat

Dear Mat,

I assigned some data regions for my array stuffs. It compiles and runs okay, but there was no acceleration here.

As far as I concerned, calculating integral image should be independent, but using the the function(calculating integral image for each image) can be parallelized.

Could you please tell me some tips for accelerating the code?

btw, my integral image function doesn’t use any OpenCV and Boost Libraries. It uses only primitive stuffs.

#include <opencv2/opencv.hpp>
#include <boost/filesystem.hpp>
using namespace cv;
using namespace std;

const int CHN = 1;
const int I_WIDTH = 64;
const int I_HEIGHT = 128;
const int I_LENGTH = I_WIDTH * I_HEIGHT;

const int II_WIDTH = I_WIDTH + 1;
const int II_HEIGHT = I_HEIGHT + 1;
const int II_LENGTH = II_WIDTH * II_HEIGHT;

void Mat2Arr(Mat in, int* img_data, int img_length)
{
	uchar* temp = in.data;
	for (int i = 0; i < img_length; i++)
	{
		img_data[i] = (int) temp[i];
		cout << img_data[i] << " ";
	}
}

#pragma acc routine seq
void Img2II(int* img, int* ii, int ii_width, int ii_height)
{
	int temp = 0;
#pragma acc data create(temp) pcopyin(img[0:(ii_width-1)*(ii_height-1)]) pcopyout(ii[0:(ii_width)*(ii_height)])
	for (int i = 0; i < ii_height; i++)
	{
		for (int j = 0; j < ii_width; j++)
		{
			if (i == 0 || j == 0)
			{
				temp = 0;
			}
			else
			{
				long val = img[(ii_width - 1) * (i - 1) + (j - 1)];
				temp = val + ii[ii_width * (i - 1) + j]
						+ ii[ii_width * i + (j - 1)]
						- ii[ii_width * (i - 1) + (j - 1)];
			}
			ii[ii_width * i + j] = temp;
		}
	}
}
vector<string> defaultListFile(string dir, int &numImg)
{
	boost::filesystem::path path(dir);
	vector<string> sortedString;
	if (!boost::filesystem::exists(path))
		throw invalid_argument("Invalid input path!");

	if (boost::filesystem::is_directory(path))
	{
		boost::filesystem::recursive_directory_iterator it(path);
		boost::filesystem::recursive_directory_iterator endit;

		while (it != endit)
		{
			if (boost::filesystem::is_regular_file(*it))
			{
				string img_path = path.string() + "/"
						+ it->path().filename().string();
				sortedString.push_back(img_path);
			}
			++it;
		}
	}
	numImg = sortedString.size();
	sort(sortedString.begin(), sortedString.end());

	return sortedString;
}

int main(void)
{
int numPos = 0;
	int numNeg = 0;
	vector<string> posImg = defaultListFile(
			"/home/dongyoung/Desktop/Dataset/INRIA/Pedestrian/mini2/pos",
			numPos);
	vector<string> negImg = defaultListFile(
			"/home/dongyoung/Desktop/Dataset/INRIA/Pedestrian/mini2/neg",
			numNeg);
	vector<string> simg;
	for (int i = 0; i < numPos; i++)
	{
		simg.push_back(posImg[i]);
	}
	for (int i = 0; i < numNeg; i++)
	{
		simg.push_back(negImg[i]);
	}
	int numImgs = simg.size();

	int* labels = (int*) malloc(sizeof(int) * numImgs);
	for (int i = 0; i < numPos; i++)
	{
		labels[i] = 1;
	}
	for (int i = numPos; i < numImgs; i++)
	{
		labels[i] = 0;
	}

	int*** imgs = (int***) malloc(sizeof(int**) * CHN);
	for (int i = 0; i < CHN; i++)
	{
		imgs[i] = (int**) malloc(sizeof(int*) * numImgs);
		for (int j = 0; j < numImgs; j++)
		{
			imgs[i][j] = (int*) malloc(sizeof(int) * I_LENGTH);
		}
	}
	for (int j = 0; j < numImgs; j++)
	{
		Mat temp = imread(simg[j]);
		Mat gray;
		cvtColor(temp, gray, CV_BGR2GRAY);
		Mat2Arr(gray, imgs[0][j], I_LENGTH);
}

	int*** iis = (int***) malloc(sizeof(int**) * CHN);
	for (int i = 0; i < CHN; i++)
	{
		iis[i] = (int**) malloc(sizeof(int*) * numImgs);
		for (int j = 0; j < numImgs; j++)
		{
			iis[i][j] = (int*) malloc(sizeof(int) * II_LENGTH);
		}
	}

#pragma acc data copy(imgs[0:CHN][0:numImgs][0:I_LENGTH], II_WIDTH, II_HEIGHT) create(iis[0:CHN][0:numImgs][0:II_LENGTH])
	for (int i = 0; i < CHN; i++)
	{
#pragma acc loop seq
		for (int j = 0; j < numImgs; j++)
		{
			Img2II(imgs[i][j], iis[i][j], II_WIDTH, II_HEIGHT);
		}
	}

	//Test
	for (int i = 0; i < numImgs; i++)
	{
		int* curII = iis[0][i];
		for (int j = 0; j < II_HEIGHT; j++)
		{
			for (int k = 0; k < II_WIDTH; k++)
			{
				cout << curII[II_WIDTH * j + k] << " ";
			}
			cout << endl;
		}
		cout << endl;
	}
}

Followings are the parts of the entire codes. This part is what I would like to give some speed up here.

#pragma acc routine seq
void Img2II(int* img, int* ii, int ii_width, int ii_height)
{
	int temp = 0;
#pragma acc data create(temp) pcopyin(img[0:(ii_width-1)*(ii_height-1)]) pcopyout(ii[0:(ii_width)*(ii_height)])
	for (int i = 0; i < ii_height; i++)
	{
		for (int j = 0; j < ii_width; j++)
		{
			if (i == 0 || j == 0)
			{
				temp = 0;
			}
			else
			{
				long val = img[(ii_width - 1) * (i - 1) + (j - 1)];
				temp = val + ii[ii_width * (i - 1) + j]
						+ ii[ii_width * i + (j - 1)]
						- ii[ii_width * (i - 1) + (j - 1)];
			}
			ii[ii_width * i + j] = temp;
		}
	}
}

#pragma acc data copy(imgs[0:CHN][0:numImgs][0:I_LENGTH], II_WIDTH, II_HEIGHT) create(iis[0:CHN][0:numImgs][0:II_LENGTH])
	for (int i = 0; i < CHN; i++)
	{
#pragma acc loop seq
		for (int j = 0; j < numImgs; j++)
		{
			Img2II(imgs[i][j], iis[i][j], II_WIDTH, II_HEIGHT);
		}
	}

When I compile it, I got below messages:


Img2II(int *, int *, int, int):
     34, Generating acc routine seq
         Generating Tesla code
main:
    324, Generating copy(imgs[:1][:numImgs][:8192])
         Generating copy(II_WIDTH)
         Generating copy(II_HEIGHT)
         Generating create(iis[:1][:numImgs][:8385])
  Timing stats:
    init                    16 millisecs     1%
    parser                 484 millisecs    35%
    expand                 399 millisecs    29%
    vectorize              250 millisecs    18%
    optimize                34 millisecs     2%
    schedule               134 millisecs     9%
    unroll                  33 millisecs     2%
    Total time            1350 millisecs

Many Thanks,
DK

Hi DK,

Looks like you removed the “#pragma acc parallel loop” in the main routine. Try adding it back.

Also, the data regions in the Img2II routine are being ignored since you can’t have host/device data movement within a compute region.

  • Mat

Thank you so much :D

You are my life saver!

Thanks again,

DK