How to run Caffe on GPU (TX2)

Hi,
I’m using a Caffe-trained network in my application for classifying patterns. When I use OpenCV (which only uses the CPU) I get 26ms for each patch, but when I use Caffe (GPU mode) to improve the time, the time unexpectedly increases to 32ms! I couldn’t check GPU usage with gpustat [https://github.com/wookayin/gpustat], but by checking CPU usage it’s obvious I’m still using the CPU with Caffe, even when I set the mode to GPU!
jetpack 3.0 is installed on my jetsonTX2,
here is part of my code

/// Loads the classification network.
/// In GPU mode, selects CUDA device 0 and loads the Caffe network;
/// otherwise loads the model through OpenCV's DNN module (CPU only).
/// @throws utility::Exception on any load/initialization failure.
void Network::load() throw (utility::Exception)
{
	int		gpuCount = 0;
	cudaDeviceProp	deviceProperties;

	try {
		if (useGPU_) {
			// Fail early if no CUDA device is actually visible.
			if (cudaGetDeviceCount(&gpuCount) != cudaSuccess || gpuCount == 0)
				throw utility::Exception("No CUDA-capable GPU found");
			cudaGetDeviceProperties(&deviceProperties, 0);
std::cout << "GPUs: " << gpuCount << "\nGPU device name: " << deviceProperties.name << std::endl;
			// Caffe defaults to CPU mode; without these two calls
			// Forward() silently runs on the CPU even when CUDA is
			// available, which explains GPU-mode being slower than
			// the OpenCV CPU path.
			caffe::Caffe::SetDevice(0);
			caffe::Caffe::set_mode(caffe::Caffe::GPU);
			caffeNetwork_ = std::make_shared< caffe::Net<float> >(
									prototypeFile_,
									caffe::TEST
									);
			caffeNetwork_->CopyTrainedLayersFrom(modelFile_);
		} else
			network_ = cv::dnn::readNetFromCaffe(prototypeFile_, modelFile_);
		loadMeanFile();
		loadLabelsFile();
	} catch (cv::Exception& exception) {
		// Re-wrap OpenCV errors in the project's exception type.
		throw utility::Exception(exception.what());
	}
}

/// Classifies a single image patch.
/// @param image      input BGR image (any size; resized to imageSize).
/// @param imageSize  side length the network expects (square input).
/// @return           top-1 label and its probability.
/// @throws utility::Exception on inference failure.
/// NOTE(review): data still round-trips through host memory
/// (blobFromImage -> cpu blob -> forward); for real GPU speed the input
/// should be written via mutable_gpu_data() instead — confirm with nvprof.
const Result Network::classifier(
				const cv::Mat& image,
				const size_t imageSize
				) throw (utility::Exception)
{
	caffe::Blob<float>*	caffeInput;
	caffe::Blob<float>*	caffeOutput;
	cv::Mat			blob;
	cv::Mat			caffeInputMatrix;
	cv::Mat			probabilities;
	Class			_class;
	Result			result;
caffe::Timer	forwardTimer;

	// Convert image to a 4-D NCHW batch of one, mean-subtracted.
	blob = cv::dnn::blobFromImage(
				image,
				1.0f,
				cv::Size(imageSize, imageSize),
				cv::Scalar(meanPixels[0], meanPixels[1], meanPixels[2]),
				false
				);
	if (useGPU_) {
		// Run the model through Caffe.
		caffeInput = caffeNetwork_->input_blobs()[0];
		// Wrap Caffe's input blob in a cv::Mat header (no copy).
		// Use mutable_cpu_data(): we WRITE into this buffer below, and
		// cpu_data() returns a const pointer — the old (char*) cast was
		// silently stripping const before mutating the blob.
		caffeInputMatrix = cv::Mat(
					caffeInput->shape(),
					CV_32F,
					caffeInput->mutable_cpu_data()
					);
		blob.copyTo(caffeInputMatrix);
// forwardTimer.Start();
		caffeOutput = caffeNetwork_->Forward()[0];
// std::cout << "Forward Time: " << forwardTimer.MilliSeconds() << std::endl;
		// Read-only wrap of the output; const_cast is confined here
		// because cv::Mat's constructor takes void*, and we never
		// write through this matrix.
		probabilities = cv::Mat(
					caffeOutput->shape(),
					CV_32F,
					const_cast<float*>(caffeOutput->cpu_data())
					);
	} else {
		// OpenCV DNN (CPU) path.
		network_.setInput(blob, "data");
		probabilities = network_.forward("softmax");
	}
	_class = getClass(probabilities);
	result.label(labels[_class.first]);
	result.probability(_class.second);

	return result;
}

Hi,

Have you built Caffe from source with GPU support?
If not, please check this tutorial for details:
https://github.com/jetsonhacks/installCaffeJTX2

Thanks.

Yes, of course — I have built Caffe from source with GPU support.
But if there is still a way to verify this, I’ll be glad to know.

Hi,

There is some memory copying inside your program.
Ideally, you should use GPU data directly to avoid CPU <-> GPU memory copies.

float* input = input_layer->mutable_gpu_data();

Could you run your program with nvprof to check execution time of each function call first?

sudo ./nvprof -o data.nvvp [your program]

Thanks.