Actually my general view is that for operations more complicated than a reduction, I would encourage general CUDA programmers to avoid trying to write their own, if a library function is available. In the case of prefix sums, reductions, and various other “hard-to-write” parallel operations, there are library functions in thrust, cub, and other libraries we could use.
Here’s a worked example of:
- Your original test function
- the recurrence algorithm proposed by Vectorizer in ordinary serial C-code
- a parallel realization of 2 in thrust
$ cat t12.cu
#include <stdio.h>
#include <math.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#define N 10
#define B 2
#define A 3
typedef int mytype;
void TestFunc(mytype alpha, mytype beta, mytype *in, mytype *out, int numPoints)
{
for (int i = 1; i < numPoints; i++) {
out[i] = alpha * in[i] + beta * out[i-1];
}
}
struct my_pow_func
{
mytype r;
my_pow_func(mytype _r) : r(_r) {}
__host__ __device__
mytype operator()(mytype p){
return powf(r,p);
}
};
struct my_scale_func
{
mytype r;
my_scale_func(mytype _r) : r(_r) {}
__host__ __device__
mytype operator()(mytype p){
return p*r;
}
};
int main(){
int in[N], out[N], test[N], aux[N];
// test
for (int i = 0; i < N; i++)
in[i] = 1;
thrust::device_vector<mytype> d_in(in, in+N);
test[0] = A*in[0];
TestFunc(A, B, in, test, N);
//serial method
aux[0] = 1;
for (int i = 1; i < N; i++)
aux[i] = aux[i-1]*B;
for (int i = 0; i < N; i++)
in[i] = aux[i]*in[i];
for (int i = 0; i < N; i++)
in[i] = A*in[i];
out[0] = in[0];
for (int i = 1; i < N; i++)
out[i] = out[i-1] + in[i];
//using thrust
thrust::device_vector<mytype> d_aux(N);
thrust::device_vector<mytype> d_out(N);
thrust::sequence(d_aux.begin(), d_aux.end());
thrust::transform(d_aux.begin(), d_aux.end(), d_aux.begin(), my_pow_func(B));
thrust::transform(d_aux.begin(), d_aux.end(), d_in.begin(), d_in.begin(), thrust::multiplies<mytype>());
thrust::transform(d_in.begin(), d_in.end(), d_in.begin(), my_scale_func(A));
thrust::inclusive_scan(d_in.begin(), d_in.end(), d_out.begin());
for (int i=0; i < N; i++){
mytype tres = d_out[i];
printf("%d: %f, %f, %f\n", i, (float)test[i], (float)out[i], (float)tres);}
}
$ nvcc -o t12 t12.cu
$ ./t12
0: 3.000000, 3.000000, 3.000000
1: 9.000000, 9.000000, 9.000000
2: 21.000000, 21.000000, 21.000000
3: 45.000000, 45.000000, 45.000000
4: 93.000000, 93.000000, 93.000000
5: 189.000000, 189.000000, 189.000000
6: 381.000000, 381.000000, 381.000000
7: 765.000000, 765.000000, 765.000000
8: 1533.000000, 1533.000000, 1533.000000
9: 3069.000000, 3069.000000, 3069.000000
$
(thrust: https://github.com/thrust/thrust/wiki/Quick-Start-Guide )