There are some inconsistencies in your posted code. Leaving those aside, here is a simple example, lifted almost directly from link #2 above.
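To recap the idea: the loop in your code computes the first-order linear recurrence x[i] = a[i]*x[i-1] + b[i] (here a[i] = -w[i], and x overwrites b). Per Blelloch, such a recurrence can be evaluated in parallel as an inclusive scan over the pairs (a[i], b[i]) using the operator (a1, b1) op (a2, b2) = (a1*a2, b1*a2 + b2); this operator is associative, and the second component of the i-th scan output is x[i].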
$ cat t2238.cu
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
const int N = 32;  // problem size (also the block size of the verification kernel)
// straightforward serial computation of the recurrence, for verification:
// b[i] = b[i] - b[i-1]*w[i]
__global__ void forward_pass(double* w, double* b){
  int idx = threadIdx.x;
  __shared__ double w_buffer[N];
  __shared__ double b_buffer[N];
  w_buffer[idx] = w[idx];
  b_buffer[idx] = b[idx];
  __syncthreads();
  // thread 0 runs the recurrence serially in shared memory
  if (idx == 0) {
    for (int i = 1; i < N; i++) {
      b_buffer[i] = b_buffer[i] - b_buffer[i - 1] * w_buffer[i];
    }
  }
  __syncthreads();
  b[idx] = b_buffer[idx];
}
// scan operator on (a, b) pairs, as per Blelloch (1.7):
// (a1, b1) op (a2, b2) = (a1*a2, b1*a2 + b2)
struct scan_op
{
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<1>(t1)*thrust::get<0>(t2)+thrust::get<1>(t2);
    return ret;
  }
};
using mt = double;
using namespace thrust::placeholders;
const int disp_s = 8;  // number of results to display
int main(){
  mt *h_w, *d_w, *h_b, *d_b, *h_r;
  h_w = new mt[N];
  h_r = new mt[N];
  h_b = new mt[N];
  cudaMalloc(&d_b, N*sizeof(d_b[0]));
  cudaMalloc(&d_w, N*sizeof(d_w[0]));
  // fill w and b with random values in [0, 1)
  for (int i = 0; i < N; i++) {h_w[i] = rand()/(double)RAND_MAX; h_b[i] = rand()/(double)RAND_MAX;}
  cudaMemcpy(d_b, h_b, N*sizeof(d_b[0]), cudaMemcpyHostToDevice);
  cudaMemcpy(d_w, h_w, N*sizeof(d_w[0]), cudaMemcpyHostToDevice);
  // serial reference computation
  forward_pass<<<1,N>>>(d_w, d_b);
  cudaMemcpy(h_r, d_b, N*sizeof(d_b[0]), cudaMemcpyDeviceToHost);
  for (int i = 0; i < disp_s; i++) std::cout << h_r[i] << ",";
  std::cout << std::endl;
  // scan method: we will use Blelloch 1.5 with b = db = d_b, a = da = -d_w
  thrust::device_vector<mt> db(h_b, h_b+N);
  thrust::device_vector<mt> da(h_w, h_w+N);
  thrust::transform(da.begin(), da.end(), da.begin(), _1*(-1));  // a = -w
  thrust::device_vector<mt> dy(N);  // receives the cumulative products of a (unused)
  thrust::device_vector<mt> dx(N);  // receives the recurrence results x
  thrust::inclusive_scan(thrust::make_zip_iterator(thrust::make_tuple(da.begin(), db.begin())),
                         thrust::make_zip_iterator(thrust::make_tuple(da.end(),   db.end())),
                         thrust::make_zip_iterator(thrust::make_tuple(dy.begin(), dx.begin())),
                         scan_op());
  thrust::host_vector<mt> hx = dx;
  thrust::copy_n(hx.begin(), disp_s, std::ostream_iterator<mt>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t2238 t2238.cu
$ compute-sanitizer ./t2238
========= COMPUTE-SANITIZER
0.394383,0.489599,-0.24879,0.85163,0.317409,0.477341,0.339274,0.593128,
0.394383,0.489599,-0.24879,0.85163,0.317409,0.477341,0.339274,0.593128,
========= ERROR SUMMARY: 0 errors
$
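As an aside, rather than eyeballing the first few values of each output, you could compare all N results programmatically. A minimal sketch (my addition, not part of the program above; the 1e-10 tolerance is an arbitrary choice, and it assumes <cmath> is included and the code is appended to the end of main()):

// hypothetical check: compare scan results (hx) against the
// sequential kernel results (h_r), element by element
bool match = true;
for (int i = 0; i < N; i++)
  if (std::abs(hx[i] - h_r[i]) > 1e-10) { match = false; break; }
std::cout << (match ? "results match" : "results MISMATCH") << std::endl;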
Unfortunately, for a very small problem size such as 1000 or 2000 elements, I don't think there is much, if any, performance benefit: that amount of work is too small to keep a modern GPU busy, so fixed overheads dominate.
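If you want to find the crossover point on your own hardware, a timing sketch along these lines could be dropped into main() after the device vectors are populated (my addition; the event names are mine, and a warm-up call before the timed region is advisable, since the first Thrust call can incur one-time setup cost):

// bracket the scan with CUDA events and report elapsed milliseconds
cudaEvent_t t_start, t_stop;
cudaEventCreate(&t_start); cudaEventCreate(&t_stop);
cudaEventRecord(t_start);
thrust::inclusive_scan(thrust::make_zip_iterator(thrust::make_tuple(da.begin(), db.begin())),
                       thrust::make_zip_iterator(thrust::make_tuple(da.end(),   db.end())),
                       thrust::make_zip_iterator(thrust::make_tuple(dy.begin(), dx.begin())),
                       scan_op());
cudaEventRecord(t_stop);
cudaEventSynchronize(t_stop);
float scan_ms = 0.0f;
cudaEventElapsedTime(&scan_ms, t_start, t_stop);
std::cout << "scan time: " << scan_ms << " ms" << std::endl;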