Hi All,
I am having a problem using cudppMultiScan. The code is posted below for reference.
This code works when dim is a power of two, but not otherwise. Is there something
obvious that I am missing? Please help me use cudppMultiScan properly so that
it also works for a non-power-of-two dim.
Best Regards,
Sachin
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cudpp.h>
int main (int argc, char **argv)
{
int *d_in, *d_out; // device pointers
int dim = atoi(argv[1]); // dimension of the array
// For timer measures
struct timeval tv_begin, tv_end; // local timer (1 iteration)
float time_gpu;
// DIM x DIM input array
int source[dim*dim];
for (int i = 0; i < dim*dim; i++)
source[i] = 1;
size_t mem_size = dimdimsizeof(int);
// device buffers
cudaMalloc( (void**) &d_in, mem_size );
cudaMalloc( (void**) &d_out, mem_size );
// copy source array from host to device
cudaMemcpy( (void*) d_in, (void*) source, mem_size, cudaMemcpyHostToDevice );
// scanPlan
CUDPPHandle scanPlan;
CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_INT, CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE };
cudppPlan(&scanPlan, config, dim*dim, dim, dim);
gettimeofday(&tv_begin, NULL);
// do the scan
cudppMultiScan(scanPlan, d_out, d_in, dim, dim);
gettimeofday(&tv_end, NULL);
time_gpu = (tv_end.tv_sec-tv_begin.tv_sec)*1e6 + tv_end.tv_usec-tv_begin.tv_usec;
// copy data back
cudaThreadSynchronize();
cudaMemcpy( (void*) source, (void*) d_out, mem_size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
// display it
#if 1
printf("\n");
for (int i = 0; i < dim; i++)
{
for (int j = 0; j < dim; j++)
{
if(source[i*dim+j] != (j+1))
{
printf("source[%d][%d] %d \n",i,j,source[i*dim+j]);
break;
}
}
}
printf("Time taken : %f\n",time_gpu/1e3);
return 0;
}
[/codebox]