CUB Prefix Sum


I am wondering whether I am using the CUB library correctly to compute a prefix scan over a sequence of n integers. In the code below, the file “input” contains the sequence of n integers:

#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <vector>

#include <cuda.h>
#include <cub/cub.cuh>
using namespace std;

// Addition functor: the binary operator handed to cub::DeviceScan::InclusiveScan
// so that the scan computes a prefix sum.
struct Add
{
	// Returns a + b. Callable from both host and device code, for any type T
	// that supports operator+.
	template <typename T>
	__host__ __device__ T operator()(const T &a, const T &b) const {
		return a + b;
	}
};

// Prints a readable message and terminates the program when a CUDA runtime
// or CUB call did not return cudaSuccess.
static void checkCuda (cudaError_t status, const char * what) {
	if (status != cudaSuccess) {
		cerr << what << " failed: " << cudaGetErrorString (status) << endl;
		exit (EXIT_FAILURE);
	}
}

/*
 * Reads n integers from the file "input", computes their inclusive prefix sum
 * on the GPU with cub::DeviceScan::InclusiveScan, and writes the n results,
 * space separated, to the file "par".
 *
 * argv[1] is the number of elements n (must be a positive integer).
 * Returns 0 on success and 1 on a usage, file or input error; a failing CUDA
 * call terminates the program with EXIT_FAILURE.
 *
 * The reported time spans the whole program (file I/O, device allocations,
 * host<->device copies and the scan itself), not only the scan.
 */
int main (int argc, char ** argv) {
	clock_t start_t = clock ();
	if (argc < 2) {
		cerr << "Number of elements missing!" << endl;
		return 1;
	}
	int n = atoi (argv[1]);
	if (n <= 0) {
		cerr << "Number of elements must be a positive integer!" << endl;
		return 1;
	}
	const size_t bytes = (size_t) n * sizeof(int);

	// Host buffers are owned by vectors so they are released on every return path.
	vector<int> a (n);
	vector<int> b (n);

	ifstream in ("input");
	if (!in) {
		cerr << "Cannot open file \"input\"!" << endl;
		return 1;
	}
	for (int i = 0; i < n; i ++) {
		if (!(in >> a[i])) {
			cerr << "File \"input\" holds fewer than " << n << " integers!" << endl;
			return 1;
		}
	}

	int * dev_a = NULL, * dev_b = NULL;
	Add add_op;

	checkCuda (cudaMalloc ((void**) & dev_a, bytes), "cudaMalloc (dev_a)");
	checkCuda (cudaMalloc ((void**) & dev_b, bytes), "cudaMalloc (dev_b)");
	checkCuda (cudaMemcpy (dev_a, a.data (), bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");

	// First call with a NULL workspace only reports the required temporary
	// storage size in temp_storage_bytes; no scan is performed yet.
	void     *d_temp_storage = NULL;
	size_t   temp_storage_bytes = 0;
	checkCuda (cub::DeviceScan::InclusiveScan (d_temp_storage, temp_storage_bytes, dev_a, dev_b, add_op, n),
	           "InclusiveScan (size query)");

	// Allocate temporary storage for inclusive prefix scan, then run it.
	checkCuda (cudaMalloc (&d_temp_storage, temp_storage_bytes), "cudaMalloc (temp storage)");
	checkCuda (cub::DeviceScan::InclusiveScan (d_temp_storage, temp_storage_bytes, dev_a, dev_b, add_op, n),
	           "InclusiveScan");

	// The blocking copy also waits for the scan to finish.
	checkCuda (cudaMemcpy (b.data (), dev_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");

	ofstream par ("par");
	if (!par) {
		cerr << "Cannot open file \"par\"!" << endl;
		return 1;
	}
	for (int i = 0; i < n; i ++)
		par << b[i] << " ";
	par << endl;

	checkCuda (cudaFree (d_temp_storage), "cudaFree (temp storage)");
	checkCuda (cudaFree (dev_a), "cudaFree (dev_a)");
	checkCuda (cudaFree (dev_b), "cudaFree (dev_b)");

	clock_t end_t = clock ();
	cout << "time = " << (double)(end_t - start_t) / CLOCKS_PER_SEC << endl;

	return 0;
}


CUB scanning turns out to be slower than sequential scanning on my computer.
Thanks a lot.

You’re likely capturing CUDA start-up time with your timing method.

The timing commences at the very start of main() and ends just before main() returns. So it is capturing not only CUDA startup overhead, but CUDA allocations and deallocations, and copying of data from host to GPU and GPU to host. The actual kernel run-time is likely no more than 10% of the total run time, and probably less.

A simple scan is a memory-bound operation, so it makes sense to perform it on the GPU when the data is already resident on the GPU, as the GPU memory throughput can easily be 5x the system memory throughput (note that this is highly dependent on the GPU used; some low-end GPUs offer no memory throughput advantage at all). Copying data across PCIe (max throughput only about 12 GB/sec; maybe a quarter of system memory throughput) solely to perform a scan completely negates the GPU’s advantage of higher processing speed.