I am getting the following warning when compiling my code, which uses a two-stage cuda::pipeline:
warning #20054-D: dynamic initialization is not supported for a function-scope static __shared__ variable within a __device__/__global__ function
__attribute__((shared)) cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, 2> pipeline_state;
My kernel is as follows:
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
#include <cuda/pipeline> // only supported on sm_70 and above
namespace cg = cooperative_groups;
#define NUM_STAGES 2 // number of stages in the pipeline
__global__ void pipeline_example(int *global1, int *global2, size_t count) {
    extern __shared__ int s[]; // size controlled by the host launch config
    cg::thread_block group = cg::this_thread_block();

    // Two-stage ring of shared-memory buffers; each stage holds two chunks
    // (one from each global array), i.e. 2 * group.size() ints per stage.
    int *shared[NUM_STAGES] = {s, s + 2 * group.size()};

    // Create the pipeline shared state and a block-scoped pipeline over it.
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, NUM_STAGES> pipeline_state;
    auto pipeline = cuda::make_pipeline(group, &pipeline_state);

    size_t fetch, subset;
    for (subset = fetch = 0; subset < count; ++subset) {
        // Fetch data up to NUM_STAGES chunks ahead of the compute step.
        for (; fetch < subset + NUM_STAGES && fetch < count; ++fetch) {
            // Copy one chunk from each global array into the current stage.
            pipeline.producer_acquire();
            cuda::memcpy_async(group, shared[fetch % NUM_STAGES],
                               global1 + fetch * group.size(), sizeof(int) * group.size(), pipeline);
            cuda::memcpy_async(group, shared[fetch % NUM_STAGES] + group.size(),
                               global2 + fetch * group.size(), sizeof(int) * group.size(), pipeline);
            pipeline.producer_commit();
        }
        pipeline.consumer_wait(); // wait for the oldest committed stage to arrive
        int *data = shared[subset % NUM_STAGES];
        global1[subset * group.size() + group.thread_rank()] =
            data[group.thread_rank()] + data[group.thread_rank() + group.size()];
        global2[subset * group.size() + group.thread_rank()] =
            data[group.thread_rank()] * data[group.thread_rank() + group.size()];
        pipeline.consumer_release();
    }
}
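For reference, a launch consistent with the kernel's shared-memory layout looks roughly like this (the block size and element count are placeholders, and a single block is assumed since the kernel never reads blockIdx):

// Hypothetical launch configuration, not part of the original code above.
// Each stage holds 2 chunks of `threads` ints, so the dynamic shared
// memory size must be NUM_STAGES * 2 * threads * sizeof(int).
int threads = 256;                                    // placeholder block size
size_t chunks = num_elements / threads;               // `count` = number of chunks
size_t smem = NUM_STAGES * 2 * threads * sizeof(int); // two chunks per stage
pipeline_example<<<1, threads, smem>>>(global1, global2, chunks);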
I am running on WSL2 with CUDA 12.1. The output is correct, so I am wondering what the warning means and how to work around it.
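The only workaround I have found so far is to suppress the diagnostic before the kernel, which I believe is what the libcudacxx pipeline example does (assuming the diagnostic name static_var_with_dynamic_init is stable across toolkit versions):

// Silences warning #20054-D for the __shared__ pipeline_shared_state below.
// nv_diag_suppress is an nvcc pragma; the diagnostic name here is taken
// from the libcudacxx pipeline example and may differ between toolkits.
#pragma nv_diag_suppress static_var_with_dynamic_init

__global__ void pipeline_example(int *global1, int *global2, size_t count) {
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, NUM_STAGES> pipeline_state;
    // ... rest of the kernel unchanged ...
}

I would still like to know whether suppressing it is actually safe here, i.e. whether pipeline_state is guaranteed to be initialized correctly despite the warning.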