I am following this blog to build a sample program that does an async pipelined copy from global to shared memory overlapped with compute. I was expecting some performance gain over a synchronous copy (global -> register -> shared), but the pipelined implementation turned out to be very slow, almost the same as not using shared memory at all.
The computation is a simple task that requires frequent reads from the arrays:
__device__ int compute(int *data1, int *data2, int *dst1, int *dst2) {
    cg::thread_block group = cg::this_thread_block();
    int x = 0, y = 1;
    for (int i = 0; i < group.size(); i++) {
        x += data2[i];
        y *= data1[i];
    }
    dst1[group.thread_rank()] = x;
    dst2[group.thread_rank()] = y;
    return 0;
}
The pipelined kernel is:
#define NUM_STAGES 2 // number of stages in the pipeline

__global__ void pipeline_example(int *global1, int *global2, size_t count) {
    global1 += blockIdx.x * blockDim.x;
    global2 += blockIdx.x * blockDim.x;
    extern __shared__ int s[]; // size controlled by the host launch config
    cg::thread_block group = cg::this_thread_block();
    int *shared[NUM_STAGES]; // each stage holds 2 global chunks
    for (int i = 0; i < NUM_STAGES; ++i) {
        shared[i] = s + i * group.size() * 2;
    }

    // create a pipeline shared state
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, NUM_STAGES> pipeline_state;
    auto pipeline = cuda::make_pipeline(group, &pipeline_state);

    size_t fetch, subset;
    for (subset = fetch = 0; subset < count; ++subset) {
        // fetch data up to NUM_STAGES chunks ahead
        for (; fetch < subset + NUM_STAGES && fetch < count; ++fetch) {
            // fetch data from global memory to shared memory
            pipeline.producer_acquire();
            cuda::memcpy_async(group, shared[fetch % NUM_STAGES],
                               global1 + fetch * group.size(), sizeof(int) * group.size(), pipeline);
            cuda::memcpy_async(group, shared[fetch % NUM_STAGES] + group.size(),
                               global2 + fetch * group.size(), sizeof(int) * group.size(), pipeline);
            pipeline.producer_commit();
        }
        pipeline.consumer_wait(); // wait for the data to be fetched
        compute(shared[subset % NUM_STAGES], shared[subset % NUM_STAGES] + group.size(), global1, global2);
        pipeline.consumer_release();
    }
}
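For reference, the two copies in the producer stage could also pass the size as cuda::aligned_size_t<16> to assert 16-byte alignment to memcpy_async. Below is a minimal sketch of just that variant; it assumes the shared and global pointers really are 16-byte aligned, which should hold here since each chunk is sizeof(int) * 1024 = 4 KB.

// Variant of the two producer-stage copies only, with an explicit 16-byte
// alignment hint so memcpy_async can select the hardware async-copy path.
// Assumes sizeof(int) * group.size() is a multiple of 16 and that both the
// shared and global pointers are 16-byte aligned (an assumption, not verified).
cuda::memcpy_async(group, shared[fetch % NUM_STAGES],
                   global1 + fetch * group.size(),
                   cuda::aligned_size_t<16>(sizeof(int) * group.size()), pipeline);
cuda::memcpy_async(group, shared[fetch % NUM_STAGES] + group.size(),
                   global2 + fetch * group.size(),
                   cuda::aligned_size_t<16>(sizeof(int) * group.size()), pipeline);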
The naive kernel is:
__global__ void serial_example(int *global1, int *global2, size_t count) {
    global1 += blockIdx.x * blockDim.x;
    global2 += blockIdx.x * blockDim.x;
    extern __shared__ int s[]; // size controlled by the host launch config
    cg::thread_block group = cg::this_thread_block();
    for (size_t subset = 0; subset < count; ++subset) {
        // stage one chunk of each array from global to shared through registers
        s[group.thread_rank()] = global1[subset * group.size() + group.thread_rank()];
        s[group.thread_rank() + group.size()] = global2[subset * group.size() + group.thread_rank()];
        group.sync();
        compute(s, s + group.size(), global1, global2);
        group.sync();
    }
}
I compared the times by launching each kernel as follows:
pipeline_example<<<64, 1024, sizeof(int)*1024*NUM_STAGES*2>>>(device_global1, device_global2, count);
serial_example<<<64, 1024, sizeof(int)*1024*2>>>(device_global1, device_global2, count);
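For completeness, here is a minimal sketch of a host harness that reproduces the measurement with cudaEvent timing; the chunk count and allocation size are assumptions derived from the indexing above, not the exact values I use.

#include <cstdio>
#include <cuda_runtime.h>

// Host-side sketch: pipeline_example / serial_example are the kernels above.
int main() {
    const size_t count = 1024;            // chunks per block (assumed value)
    const size_t n = (count + 64) * 1024; // enough ints to cover every block's accesses
    int *device_global1, *device_global2;
    cudaMalloc(&device_global1, n * sizeof(int));
    cudaMalloc(&device_global2, n * sizeof(int));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    pipeline_example<<<64, 1024, sizeof(int) * 1024 * NUM_STAGES * 2>>>(
        device_global1, device_global2, count);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("pipeline_example: %.3f ms\n", ms);

    cudaFree(device_global1);
    cudaFree(device_global2);
    return 0;
}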
The pipelined kernel takes about 60 ms (almost the same as using only global memory), while the serial kernel takes 37 ms.
Why is the supposedly more advanced implementation even slower? Is it because the feature is not supported on WSL2?