NPP vs Thrust performance

Hi,

I just put together a quick comparison of some NPP functions with Thrust equivalents. In particular, I compared a scalar multiplication of a vector (an image stored as linear memory), and a reduction.

I was expecting NPP to be faster; however, Thrust turned out to be about twice as fast in both cases.

I’d be happy to hear whether this was an unfair comparison (for example, I might not have been using the best-suited NPP functions), or any other thoughts on it.
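
One specific thing I wasn’t sure about: the NPP call divides by 10 while the Thrust functor multiplies by 0.1, so maybe nppiMulC_32f_C1IR would be the closer equivalent. I haven’t tested whether it changes the numbers; it would look roughly like this, using the same d_vec, width and roi as in the code below:

	// Untested variation: in-place multiply-by-constant instead of divide-by-constant.
	nppiMulC_32f_C1IR(
			0.1f,                                 // constant factor
			thrust::raw_pointer_cast(&d_vec[0]),  // in-place image data, one float per pixel
			width*4,                              // line step in bytes
			roi
			);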

From my perspective, Thrust is a lot nicer to use (C++ interface, abstracted memory management, etc.), and if it is also faster, that seems like a win-win!
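
To illustrate what I mean by abstracted memory management (not part of the benchmark, just the pattern; h_vec, d_data and d_copy are made-up names, and elements is the same count as in the code below):

	// Plain CUDA runtime: explicit allocation, explicit copy, explicit free.
	thrust::host_vector<float> h_vec(elements, 2.0f);
	float* d_data = NULL;
	cudaMalloc(&d_data, elements * sizeof(float));
	cudaMemcpy(d_data, &h_vec[0], elements * sizeof(float), cudaMemcpyHostToDevice);
	// ... launch kernels on d_data ...
	cudaFree(d_data);

	// Thrust: the device_vector owns its memory, construction does the
	// host-to-device copy, and the memory is freed automatically on scope exit.
	thrust::device_vector<float> d_copy = h_vec;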

Has anyone else compared the performance of these libraries?

Source code is below.

Cheers,
Alex

This is an example output:

NPP scalar took: 0.239168ms
Thrust scalar took: 0.128064ms
-----------
NPP sum took: 0.315712ms
Thrust sum took: 0.145184ms

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <npp.h>

class CudaTimer
{
	public:
		CudaTimer()
		{
			cudaEventCreate(&m_start);
			cudaEventCreate(&m_stop);
		}
		~CudaTimer()
		{
			cudaEventDestroy(m_start);
			cudaEventDestroy(m_stop);
		}
		void start()
		{
			cudaEventRecord(m_start);
		}
		/** @return Time in milliseconds. */
		float stop()
		{
			cudaEventRecord(m_stop);
			cudaEventSynchronize(m_stop);
			float elapsed;
			cudaEventElapsedTime(&elapsed, m_start, m_stop);
			return elapsed;
		}

	private:
		cudaEvent_t m_start, m_stop;
};

// Unary functor that multiplies each element by a constant factor.
struct scalar : public thrust::unary_function<float, float>
{
	public:
		scalar(float factor) : m_factor(factor) {}

		__host__ __device__
		float operator()(float x) const { return m_factor * x; }

	private:
		float m_factor;
};

int main(int argc, char* argv[])
{
	const int width = 1024;
	const int height = 768;
	const int elements = width*height;

	thrust::device_vector<float> d_vec(elements);

	// fill the vector with the number 2
	thrust::fill(d_vec.begin(), d_vec.end(), 2);
	CudaTimer timer;

	// NPP reductions need a device scratch buffer; query its required size
	int bufferSize = 0;
	nppsSumGetBufferSize_32f(elements, &bufferSize);
	thrust::device_vector<unsigned char> d_buffer(bufferSize);

	// compare scalar operation (NPP divides in place by 10, Thrust multiplies by 1/10)
	NppiSize roi = {width, height};
	timer.start();
	NppStatus ret = nppiDivC_32f_C1IR(
			10.0f,
			thrust::raw_pointer_cast(&d_vec[0]),
			width*4, // line step in bytes (width * sizeof(float))
			roi
			);
	std::cout << "NPP scalar took: " << timer.stop() << "ms" << std::endl;
	if (ret != NPP_NO_ERROR)
		std::cerr << "nppiDivC_32f_C1IR failed with status " << ret << std::endl;

	timer.start();
	thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), scalar(1.0/10.0));
	std::cout << "Thrust scalar took: " << timer.stop() << "ms" << std::endl;

	cudaDeviceSynchronize();

	std::cout << "-----------" << std::endl;

	// compare reductions (the cudaMalloc for the result and the copy back to
	// the host are included in the NPP timing)
	timer.start();
	float* d_sum = NULL;
	cudaMalloc(&d_sum, sizeof(float));
	nppsSum_32f(
			thrust::raw_pointer_cast(&d_vec[0]),
			elements,
			d_sum,
			thrust::raw_pointer_cast(&d_buffer[0])
			);
	float sum = 0;
	cudaMemcpy(&sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);
	std::cout << "NPP sum took: " << timer.stop() << "ms" << std::endl;

	timer.start();
	sum = thrust::reduce(d_vec.begin(), d_vec.end());
	std::cout << "Thrust sum took: " << timer.stop() << "ms" << std::endl;;

	return 0;
}

Compiled with:

nvcc -g -o nppVthrust -lnpp main.cu
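
One refinement I might try next (not reflected in the numbers above): move the cudaMalloc out of the timed region, do an untimed warm-up launch, and then average over a number of iterations. Roughly like this, reusing timer, d_vec, d_buffer, d_sum, sum and elements from the code above (the iteration count is arbitrary):

	const int iterations = 100;

	// Untimed warm-up launch so one-off initialisation cost is not measured.
	nppsSum_32f(thrust::raw_pointer_cast(&d_vec[0]), elements, d_sum,
			thrust::raw_pointer_cast(&d_buffer[0]));
	cudaDeviceSynchronize();

	// Average the NPP reduction (including the copy of the result back to the host).
	timer.start();
	for (int i = 0; i < iterations; ++i)
	{
		nppsSum_32f(thrust::raw_pointer_cast(&d_vec[0]), elements, d_sum,
				thrust::raw_pointer_cast(&d_buffer[0]));
		cudaMemcpy(&sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);
	}
	std::cout << "NPP sum (average): " << timer.stop() / iterations << "ms" << std::endl;

	// Same pattern for the Thrust reduction.
	timer.start();
	for (int i = 0; i < iterations; ++i)
	{
		sum = thrust::reduce(d_vec.begin(), d_vec.end());
	}
	std::cout << "Thrust sum (average): " << timer.stop() / iterations << "ms" << std::endl;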