Hello,
I am wondering whether I am using the CUB library correctly to compute an inclusive scan (prefix sum) over a sequence of n integers. In the code below, the file “input” contains the sequence of n integers:
#include <cuda.h>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <vector>
#include <cub/cub.cuh>
using namespace std;
// Addition functor passed as the scan operator to
// cub::DeviceScan::InclusiveScan; with it the scan is an inclusive prefix sum.
struct Add
{
    // Returns a + b.
    // The operator is marked __host__ __device__ so that it can be called
    // both from device code (where the scan runs) and from host code.
    template <typename T>
    __host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};
// Terminates the program with a diagnostic when a CUDA runtime or CUB call
// did not return cudaSuccess. `what` names the failing call in the message.
static void checkCuda (cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        cerr << what << " failed: " << cudaGetErrorString (status) << endl;
        exit (EXIT_FAILURE);
    }
}

// Reads n integers (n = argv[1]) from the file "input", computes their
// inclusive prefix sum on the GPU with cub::DeviceScan::InclusiveScan and
// writes the result, space separated, to the file "par".
//
// Prints two timings:
//   "scan time (ms) = ..." - the scan alone, measured with CUDA events;
//   "time = ..."           - clock() over the whole program, which also
//                            covers file I/O, allocations and host<->device
//                            copies, so it is not a measure of the scan.
//
// Returns 0 on success and 1 on a usage or input error.
int main (int argc, char ** argv) {
    clock_t start_t = clock ();
    if (argc < 2) {
        cout << "Number of elements missing!" << endl;
        return 1;
    }
    int n = atoi (argv[1]);
    if (n <= 0) {
        cout << "Number of elements must be a positive integer!" << endl;
        return 1;
    }

    // Host buffers; vectors release their memory on every return path.
    vector<int> a (n), b (n);

    ifstream in ("input");
    if (!in) {
        cout << "Cannot open file \"input\"!" << endl;
        return 1;
    }
    for (int i = 0; i < n; i ++) {
        if (!(in >> a[i])) {
            cout << "File \"input\" holds fewer than " << n << " integers!" << endl;
            return 1;
        }
    }

    const size_t bytes = (size_t) n * sizeof(int);
    int * dev_a = NULL, * dev_b = NULL;
    Add add_op;
    checkCuda (cudaMalloc ((void**) & dev_a, bytes), "cudaMalloc(dev_a)");
    checkCuda (cudaMalloc ((void**) & dev_b, bytes), "cudaMalloc(dev_b)");
    checkCuda (cudaMemcpy (dev_a, &a[0], bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(host -> device)");

    // First call with a NULL workspace only reports the required size in
    // temp_storage_bytes; no scan is performed yet.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    checkCuda (cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
                                              dev_a, dev_b, add_op, n),
               "InclusiveScan(size query)");
    // Allocate temporary storage for inclusive prefix scan
    checkCuda (cudaMalloc(&d_temp_storage, temp_storage_bytes),
               "cudaMalloc(d_temp_storage)");

    // Bracket only the scan itself with events so that its cost can be
    // told apart from setup, I/O and transfers.
    cudaEvent_t scan_begin, scan_end;
    checkCuda (cudaEventCreate (&scan_begin), "cudaEventCreate(scan_begin)");
    checkCuda (cudaEventCreate (&scan_end), "cudaEventCreate(scan_end)");
    checkCuda (cudaEventRecord (scan_begin), "cudaEventRecord(scan_begin)");
    checkCuda (cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
                                              dev_a, dev_b, add_op, n),
               "InclusiveScan");
    checkCuda (cudaEventRecord (scan_end), "cudaEventRecord(scan_end)");
    checkCuda (cudaEventSynchronize (scan_end), "cudaEventSynchronize(scan_end)");
    float scan_ms = 0.0f;
    checkCuda (cudaEventElapsedTime (&scan_ms, scan_begin, scan_end),
               "cudaEventElapsedTime");

    checkCuda (cudaMemcpy (&b[0], dev_b, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(device -> host)");

    ofstream par ("par");
    if (!par) {
        cout << "Cannot open file \"par\" for writing!" << endl;
        return 1;
    }
    for (int i = 0; i < n; i ++)
        par << b[i] << " ";
    par << endl;

    // Release every device resource, including the scan workspace.
    checkCuda (cudaEventDestroy (scan_begin), "cudaEventDestroy(scan_begin)");
    checkCuda (cudaEventDestroy (scan_end), "cudaEventDestroy(scan_end)");
    checkCuda (cudaFree (d_temp_storage), "cudaFree(d_temp_storage)");
    checkCuda (cudaFree (dev_a), "cudaFree(dev_a)");
    checkCuda (cudaFree (dev_b), "cudaFree(dev_b)");

    clock_t end_t = clock ();
    cout << "scan time (ms) = " << scan_ms << endl;
    cout << "time = " << (double)(end_t - start_t) / CLOCKS_PER_SEC << endl;
    return 0;
}
CUB scanning turns out to be slower than sequential scanning on my computer.
Thanks a lot.