cudppMultiScan not working for non power of 2

Hi All,

  I am facing problem in using cudppMultiScan. The code posted below for reference.

This code is working, when dim is power of two and not otherwise. Is there something

obvious that I am missing. Please help me in using cudppMultiScan properly so that

it also works from non power of two dim.

Best Regards,

Sachin

[codebox]

#include <stdio.h>

#include <cudpp.h>

#include <sys/time.h>

int main (int argc, char **argv)

{

    int *d_in, *d_out;                // device pointers

    int dim = atoi(argv[1]);        // dimension of the array

// For timer measures

    struct timeval tv_begin, tv_end;  // local timer (1 iteration)

    float time_gpu;

// DIM x DIM input array

    int source[dim*dim];

    for (int i = 0; i < dim*dim; i++)

            source[i] = 1;

size_t mem_size = dimdimsizeof(int);

// device buffers

    cudaMalloc( (void**) &d_in, mem_size );

    cudaMalloc( (void**) &d_out, mem_size );

// copy source array from host to device

    cudaMemcpy( (void*) d_in, (void*) source, mem_size, cudaMemcpyHostToDevice );

// scanPlan

    CUDPPHandle scanPlan;

    CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_INT, CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE };

    cudppPlan(&scanPlan, config, dim*dim, dim, dim);

gettimeofday(&tv_begin, NULL);

// do the scan

    cudppMultiScan(scanPlan, d_out, d_in, dim, dim);        

gettimeofday(&tv_end, NULL);

    time_gpu = (tv_end.tv_sec-tv_begin.tv_sec)*1e6 + tv_end.tv_usec-tv_begin.tv_usec;

// copy data back

    cudaThreadSynchronize();

cudaMemcpy( (void*) source, (void*) d_out, mem_size, cudaMemcpyDeviceToHost);

    cudaThreadSynchronize();

// display it

#if 1

    printf("\n");

    for (int i = 0; i < dim; i++)

    {

            for (int j = 0; j < dim; j++)

            {

                if(source[i*dim+j] != (j+1))

                {

                    printf("source[%d][%d] %d \n",i,j,source[i*dim+j]);

                    break;

                }

            }

    }

#endif

    printf("Time taken : %f\n",time_gpu/1e3);

    return 0;

}

[/codebox]