Slow sequential loops on CUDA

One of my kernels looks like this (adapted to avoid IP leak):

// Computes the first-order recurrence b[i] = b[i] - b[i-1] * w[i] in place.
// Expected launch: <<<1, N>>> — one block, one thread per element, so each
// thread stages exactly one element into shared memory.
// NOTE(review): N is stated to be ~2000, but a single block is capped at
// 1024 threads, so one-element-per-thread staging cannot cover the whole
// buffer with this launch shape — confirm the real launch configuration.
__global__ void forward_pass(double* w, double* b){
    int idx = threadIdx.x;
    __shared__ double w_buffer[N];  // N is about 2000
    __shared__ double b_buffer[N];
    // Stage inputs into shared memory, one element per thread.
    w_buffer[idx] = w[idx];
    b_buffer[idx] = b[idx];
    __syncthreads();  // buffers fully populated before thread 0 reads them
    // BUG FIX: the original tested the undeclared name `worker_idx`; the
    // thread index declared above is `idx`. The recurrence is inherently
    // sequential, so a single thread performs it.
    if (idx == 0) {
      for (int i = 1; i < N; i++) {
        b_buffer[i] = b_buffer[i] - b_buffer[i - 1] * w_buffer[i];
      }
    }
    __syncthreads();  // results visible to all threads before write-back

    b[idx] = b_buffer[idx];
}

this code works reasonably well, but I still do not quite like it. In particular, I think this code has some issues:

  1. only one thread is actively doing the calculation, while all the other threads in the block are just idling;
  2. the for loop accesses shared memory, which, to my understanding, is slower than working with registers.

However I have a hard time improving this code, because it appears to me that this code is inherently sequential, like each b[i] depends on b[i-1].

Is there any way to make this code more CUDA-friendly/faster?
Thanks in advance!

The calculation here falls into a category of first-order recurrence relations. Here are some treatments:

1
2
3

there are some inconsistencies in your posted code. Leaving those aside, here is a simple example, lifted almost directly from the link #2 above:

$ cat t2238.cu
// Problem size: one warp's worth of elements, small enough to verify by eye.
const int N = 32;
#include <cstdlib>
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/iterator/zip_iterator.h>

// Reference (sequential) implementation of the first-order recurrence
//   b[i] = b[i] - b[i-1] * w[i],  i = 1..N-1   (in place, on b)
// Expected launch: <<<1, N>>> — one block, exactly N threads, so each
// thread stages one element. Static shared memory: 2*N*sizeof(double).
__global__ void forward_pass(double* w, double* b){
    int idx = threadIdx.x;
    __shared__ double w_buffer[N];
    __shared__ double b_buffer[N];
    // Each thread copies its own element into shared memory.
    w_buffer[idx] = w[idx];
    b_buffer[idx] = b[idx];
    __syncthreads();  // buffers fully populated before thread 0 reads them
    // The recurrence is inherently sequential, so one thread runs the loop
    // while the rest of the block waits at the barrier below.
    if (idx == 0) {
      for (int i = 1; i < N; i++) {
        b_buffer[i] = b_buffer[i] - b_buffer[i - 1] * w_buffer[i];
      }
    }
    __syncthreads();  // results visible to all threads before write-back

    b[idx] = b_buffer[idx];
}

struct scan_op // as per blelloch (1.7)
{
  // Associative combiner for solving the recurrence x[i] = a[i]*x[i-1] + b[i]
  // with an inclusive scan (Blelloch, "Prefix Sums and Their Applications",
  // eq. 1.7). Each tuple (a, b) represents the affine map x -> a*x + b; this
  // operator composes two such maps:
  //   (a1, b1) ∘ (a2, b2) = (a1*a2, b1*a2 + b2)
  // NOTE: non-commutative — the argument order below must be preserved.
  template <typename T1, typename T2>
  __host__ __device__
  T1 operator()(const T1 &t1, const T2 &t2){
    T1 ret;
    thrust::get<0>(ret) = thrust::get<0>(t1)*thrust::get<0>(t2);
    thrust::get<1>(ret) = thrust::get<1>(t1)*thrust::get<0>(t2)+thrust::get<1>(t2);
    return ret;
    }
};

using mt = double;                     // element type used throughout
using namespace thrust::placeholders;  // provides the _1 placeholder used below
const int disp_s = 8;                  // number of leading results to print
// Driver: runs the sequential in-kernel recurrence, then the thrust
// scan-based formulation, and prints the first disp_s results of each
// (the two printed lines must match).
int main(){

  // Host/device buffers: w = coefficients, b = recurrence input/output.
  mt *h_w, *d_w, *h_b, *d_b, *h_r;
  h_w = new mt[N];
  h_r = new mt[N];
  h_b = new mt[N];
  // Minimal runtime-API error check, kept local to avoid a macro.
  auto chk = [](cudaError_t e){
    if (e != cudaSuccess){
      std::cerr << "CUDA error: " << cudaGetErrorString(e) << std::endl;
      std::exit(1);}
  };
  chk(cudaMalloc(&d_b, N*sizeof(d_b[0])));
  chk(cudaMalloc(&d_w, N*sizeof(d_w[0])));
  for (int i = 0; i < N; i++) {h_w[i] = rand()/(double)RAND_MAX; h_b[i] = rand()/(double)RAND_MAX;}
  chk(cudaMemcpy(d_b, h_b, N*sizeof(d_b[0]), cudaMemcpyHostToDevice));
  chk(cudaMemcpy(d_w, h_w, N*sizeof(d_w[0]), cudaMemcpyHostToDevice));
  // Reference path: sequential recurrence in the kernel (one block, N threads).
  forward_pass<<<1,N>>>(d_w, d_b);
  chk(cudaGetLastError());  // catch launch-configuration errors
  // Blocking copy also surfaces any asynchronous execution error.
  chk(cudaMemcpy(h_r, d_b, N*sizeof(d_b[0]), cudaMemcpyDeviceToHost));
  for (int i = 0; i < disp_s; i++) std::cout << h_r[i] << ",";
  std::cout << std::endl;

  // we will use blelloch 1.5 with b=db = d_b,  a=da = -d_w
  {
    thrust::device_vector<mt> db(h_b, h_b+N);
    thrust::device_vector<mt> da(h_w, h_w+N);
    thrust::transform(da.begin(), da.end(), da.begin(), _1*(-1)); // a = -w
    thrust::device_vector<mt> dy(N);  // scanned a-component (not inspected)
    thrust::device_vector<mt> dx(N);  // scanned b-component = recurrence result
    thrust::inclusive_scan(thrust::make_zip_iterator(thrust::make_tuple(da.begin(), db.begin())), thrust::make_zip_iterator(thrust::make_tuple(da.end(), db.end())), thrust::make_zip_iterator(thrust::make_tuple(dy.begin(), dx.begin())), scan_op());
    thrust::host_vector<mt> hx = dx;
    thrust::copy_n(hx.begin(), disp_s, std::ostream_iterator<mt>(std::cout, ","));
    std::cout << std::endl;
  } // device_vectors released here, before the raw buffers below
  // Release the resources the original version leaked.
  chk(cudaFree(d_b));
  chk(cudaFree(d_w));
  delete [] h_w;
  delete [] h_r;
  delete [] h_b;
}

$ nvcc -o t2238 t2238.cu
$ compute-sanitizer ./t2238
========= COMPUTE-SANITIZER
0.394383,0.489599,-0.24879,0.85163,0.317409,0.477341,0.339274,0.593128,
0.394383,0.489599,-0.24879,0.85163,0.317409,0.477341,0.339274,0.593128,
========= ERROR SUMMARY: 0 errors
$

Unfortunately, for a very small problem size such as 1000 or 2000 elements, I don’t think there is much if any performance benefit.

Thank you Robert! That is definitely helpful. I will give that a try, and I agree that my problem size is a little awkward here since it is just too small for the GPU.