Why is a pipeline built with cuda::memcpy_async slower than the synchronous implementation?

I am following this blog post to build a sample program that uses an asynchronous pipeline to copy data from global to shared memory and then compute on it. I expected some performance gain over the synchronous copy (global -> register -> shared), but the pipelined implementation turned out to be very slow: almost as slow as using no shared memory at all.
The computation is a simple task that requires frequent reads from the arrays:

#include <cooperative_groups.h>
#include <cuda/pipeline>

namespace cg = cooperative_groups;

__device__ int compute(int *data1, int *data2, int *dst1, int *dst2) {
    cg::thread_block group = cg::this_thread_block();
    int x = 0, y = 1;
    // every thread walks the whole chunk, so each element is read blockDim.x times
    for (int i = 0; i < group.size(); i++) {
        x += data2[i];
        y *= data1[i];
    }
    dst1[group.thread_rank()] = x;
    dst2[group.thread_rank()] = y;
    return 0;
}

The pipelined kernel is:

#define NUM_STAGES 2 // number of stages in the pipeline
__global__ void pipeline_example(int *global1, int *global2, size_t count) {
    global1 += blockIdx.x * blockDim.x;
    global2 += blockIdx.x * blockDim.x;
    extern __shared__ int s[]; // size controlled by the host launch config
    cg::thread_block group = cg::this_thread_block();

    int *shared[NUM_STAGES]; // each stage holds two chunks, one from each global array
    for (int i = 0; i < NUM_STAGES; ++i) {
        shared[i] = s + i * group.size() * 2;
    }

    // create a pipeline shared state
    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, NUM_STAGES> pipeline_state;
    auto pipeline = cuda::make_pipeline(group, &pipeline_state);

    size_t fetch, subset;
    for (subset = fetch = 0; subset < count; ++subset) {
        // fetch data up to NUM_STAGES chunks ahead
        for (; fetch < subset + NUM_STAGES && fetch < count; ++fetch) {
            // fetch data from global memory to shared memory
            pipeline.producer_acquire();
            cuda::memcpy_async(group, shared[fetch % NUM_STAGES], 
                            global1 + fetch * group.size(), sizeof(int) * group.size(), pipeline);
            cuda::memcpy_async(group, shared[fetch % NUM_STAGES] + group.size(), 
                            global2 + fetch * group.size(), sizeof(int) * group.size(), pipeline);
            pipeline.producer_commit();
        }
        pipeline.consumer_wait(); // wait for the data to be fetched
        compute(shared[subset % NUM_STAGES], shared[subset % NUM_STAGES] + group.size(), global1, global2);
        pipeline.consumer_release();
    }
}

The naive (synchronous) kernel is:

__global__ void serial_example(int *global1, int *global2, size_t count) {
    global1 += blockIdx.x * blockDim.x;
    global2 += blockIdx.x * blockDim.x;
    extern __shared__ int s[]; // size controlled by the host launch config
    cg::thread_block group = cg::this_thread_block();

    for (size_t subset = 0; subset < count; ++subset) {
        // synchronous copy: global -> register -> shared
        s[group.thread_rank()] = global1[subset * group.size() + group.thread_rank()];
        s[group.thread_rank() + group.size()] = global2[subset * group.size() + group.thread_rank()];
        group.sync();
        compute(s, s + group.size(), global1, global2);
        group.sync();
    }
}
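
For context, "no shared memory" here means a baseline along these lines (a rough sketch, not the exact kernel I timed; global_only_example is just an illustrative name):

__global__ void global_only_example(int *global1, int *global2, size_t count) {
    global1 += blockIdx.x * blockDim.x;
    global2 += blockIdx.x * blockDim.x;
    cg::thread_block group = cg::this_thread_block();

    for (size_t subset = 0; subset < count; ++subset) {
        // no staging: compute() reads each chunk straight from global memory
        compute(global1 + subset * group.size(), global2 + subset * group.size(), global1, global2);
        group.sync();
    }
}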

I compared the two by launching each kernel as follows:

pipeline_example<<<64, 1024, sizeof(int)*1024*NUM_STAGES*2>>>(device_global1, device_global2, count);
serial_example<<<64, 1024, sizeof(int)*1024*2>>>(device_global1, device_global2, count);
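
For reference, a minimal timing harness along these lines (a sketch using cudaEvent timing; time_kernels is an illustrative name and error checking is omitted) reproduces the comparison:

#include <cstdio>

void time_kernels(int *device_global1, int *device_global2, size_t count) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms = 0.f;

    cudaEventRecord(start);
    pipeline_example<<<64, 1024, sizeof(int) * 1024 * NUM_STAGES * 2>>>(device_global1, device_global2, count);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("pipeline_example: %.2f ms\n", ms);

    cudaEventRecord(start);
    serial_example<<<64, 1024, sizeof(int) * 1024 * 2>>>(device_global1, device_global2, count);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("serial_example: %.2f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}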

The pipelined kernel takes about 60 ms (almost the same as the version using only global memory), while the serial kernel takes 37 ms.
Why is the supposedly more advanced implementation slower? Is it because the feature is not supported on WSL2?
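
(For what it's worth, a quick device query like the sketch below can at least confirm whether the GPU reports compute capability 8.0+, which is what hardware-accelerated cp.async requires; on older architectures cuda::memcpy_async falls back to a software copy path.)

#include <cstdio>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}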