Using multiple streams in npp

astroboykhw · January 25, 2022, 7:39am

I want to make a code rotates images with concurrncy. First I checked the code worked well in default stream version using nppiRotate_8u_C1R, and I’m changing it to 4-stream version. 32 images are copied into 4 streams and rotated using nppiRotate_8u_C1R_Ctx.

The size of input image is 2048*2048, but the result image looks weird unlike default stream version.
I profiled both codes with Nsight system and found the kernel launch configuration is different each other.

In default stream version, the number of threads is equal to the number of pixels (left). However, the number of threads is 128128, far less than the size of input image (right). The rotated image appears in 128128 upper-left region only.

How can I solve it? Or is there anyone experienced same problem to mine?

OS : Windows 10
GPU : RTX 3090
CUDA version : 11.1

#include <iostream>
#include <stdio.h>

#include "cuda_runtime.h"
#include "helper_cuda.h"
#include "helper_functions.h"

#include <opencv2/opencv.hpp>

#include <nppi.h>
#include <nppcore.h>
#include <nppdefs.h>

using namespace std;
using namespace cv;

Mat read_BMP_opencv(char* filename, int& w, int& h);

int main()
{
	int f_width, f_height;
	char buf[256];

	cudaEvent_t start, stop;
	float  elapsedTime;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	int img_num = 32; 
	const int stream_num = 4;
	int n_iter = 50; 

	cudaStream_t stream[stream_num];
	NppStreamContext nppStreamContext[stream_num];
	cudaDeviceProp oDeviceProperties;

	for (int n = 0; n < stream_num; n++)
	{
		cudaStreamCreate(&stream[n]);
		nppStreamContext[n].hStream = stream[n];

		cudaDeviceGetAttribute(&nppStreamContext[n].nCudaDevAttrComputeCapabilityMajor,
			cudaDevAttrComputeCapabilityMajor,
			nppStreamContext[n].nCudaDeviceId);

		cudaDeviceGetAttribute(&nppStreamContext[n].nCudaDevAttrComputeCapabilityMinor,
			cudaDevAttrComputeCapabilityMinor,
			nppStreamContext[n].nCudaDeviceId);

		cudaStreamGetFlags(nppStreamContext[n].hStream, &nppStreamContext[n].nStreamFlags);

		cudaGetDevice(&nppStreamContext[n].nCudaDeviceId);
		cudaGetDeviceProperties(&oDeviceProperties, nppStreamContext[n].nCudaDeviceId);

		nppStreamContext[n].nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
		nppStreamContext[n].nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
		nppStreamContext[n].nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
		nppStreamContext[n].nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
	}

	cv::Mat::setDefaultAllocator(cv::cuda::HostMem::getAllocator(cv::cuda::HostMem::AllocType::PAGE_LOCKED));

	double angle = 3.8;

	///////////////////////////*********************** Memory Allocation ***********************/////////////////////////

	NppiSize srcSize = { f_width, f_height };
	NppiRect srcROI = { 0, 0, f_width, f_height };
	NppiRect dstROI = { 0, 0, f_width, f_height };

	Mat* img = new Mat[img_num];
	Mat* result = new Mat[img_num];

	Npp8u* *data = new Npp8u*[img_num];
	Npp8u* *d_data = new Npp8u*[img_num];
	Npp8u* *d_rotated = new Npp8u*[img_num];
	Npp8u* *h_result = new Npp8u*[img_num];



	for (int i = 0; i < img_num; i++)
	{
		img[i] = read_BMP_opencv("input_images/test_2048_2.bmp", f_width, f_height);
		data[i] = img[i].data;
	}

	for (int i = 0; i < img_num; i++)
	{
		cudaMalloc((void**)&d_data[i], sizeof(Npp8u)*f_width*f_height);
		cudaMalloc((void**)&d_rotated[i], sizeof(Npp8u)*f_width*f_height);
		cudaMallocHost((void**)&h_result[i], sizeof(Npp8u)*f_width*f_height);

	}

	///////////////////////////*********************** NPP rotation ***********************/////////////////////////
	cudaEventRecord(start, 0);
	for (int n = 0; n < n_iter; n++)
	{
		for (int i = 0; i < int(img_num / stream_num); i++)
		{
			for (int j = 0; j < stream_num; j++)
			{
				cudaMemcpyAsync(d_data[i*stream_num+j], data[i*stream_num + j], sizeof(Npp8u)*f_width*f_height, cudaMemcpyHostToDevice, stream[j]);

				// NPP 10.2 and beyond contain an additional element in the NppStreamContext structure
				nppiRotate_8u_C1R_Ctx(d_data[i*stream_num + j], srcSize, f_width, srcROI, d_rotated[i*stream_num + j], f_width, dstROI, angle, 0, 0, NPPI_INTER_LINEAR, nppStreamContext[j]);

				cudaMemcpyAsync(h_result[i*stream_num + j], d_rotated[i*stream_num + j], sizeof(Npp8u)*f_width*f_height, cudaMemcpyDeviceToHost, stream[j]);
			}
		}
	}
	cudaDeviceSynchronize();
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);

	printf("Average Rotation Time: %3.1f ms\n", elapsedTime / n_iter);

	for (int i = 0; i < img_num; i++)
	{
		result[i] = Mat(f_height, f_width, CV_8UC1);
		result[i].data = h_result[i];

		sprintf(buf, "output_images/test2048_rotated_nppi_%d.bmp", i);
		imwrite(buf, result[i]);
	}

	return 0;

}

Mat read_BMP_opencv(char* filename, int& w, int& h)
{
	Mat input_img = imread(filename, 0);
	if (input_img.empty())
		throw "Argument Exception";

	// extract image height and width from header
	int width = input_img.cols;
	int height = input_img.rows;

	w = width;
	h = height;

	return input_img;
}

Topic		Replies	Views
Npp with multiple Streams GPU-Accelerated Libraries	1	2025	August 31, 2016
NPP Stream crash GPU-Accelerated Libraries	5	2556	March 21, 2017
using npp on multiple stream CUDA Programming and Performance	2	1461	July 12, 2013
nppiResize_8u_C1R function CUDA Programming and Performance	2	1595	May 19, 2015
NPP & stream problems? GPU-Accelerated Libraries npp	1	1714	October 12, 2021
Problem using cuda streams CUDA Programming and Performance	13	2139	March 4, 2015
NppStreamContext usage for nppi_Ctx functions GPU-Accelerated Libraries	0	496	April 23, 2020
NPP row and column filters GPU-Accelerated Libraries	7	2820	December 3, 2015
NppStreamContext usage for nppi"Name"_Ctx functions CUDA Programming and Performance	0	932	April 22, 2020
Using nppiMean_StdDev_8u_C1R after setNppStream returns NPP_RANGE_ERROR GPU-Accelerated Libraries	2	1727	March 20, 2018

Using multiple streams in npp

Related topics