I’m trying to use the pipeline feature with pipeline roles; however, the process seems to hang at a consumer barrier. It seems like this feature is fairly new and the documentation isn’t very clear about the expected behaviour in this case. Below is a simple 2 stage pipeline that demonstrates the problem I’m having.
The intention is to divide the threads equally between producer and consumer roles, with the producer threads storing data from global to shared and the consumer threads then using the data from shared. (Here just printing it out to verify the behaviour). Any insights into this feature would be great.
#include <stdio.h>
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
#include <cuda/pipeline>
namespace cg = cooperative_groups;
__global__ void
test_kernel(float* __restrict__ b)
{
extern __shared__ float a[];
auto block = cg::this_thread_block();
auto warp = cg::tiled_partition<32>(block);
auto tile = cg::tiled_partition<1>(warp);
auto thread = cg::this_thread();
auto role = ((block.thread_rank() % 2) == 0) ? cuda::pipeline_role::producer : cuda::pipeline_role::consumer;
constexpr size_t stages = 2;
constexpr auto scope = cuda::thread_scope::thread_scope_block;
__shared__ cuda::pipeline_shared_state<scope, stages> shared_state;
auto pipe = cuda::make_pipeline(block, &shared_state, role);
//
printf("%3d: before produce 1 => %d\n", threadIdx.x, role);
pipe.producer_acquire();
printf("%3d: in produce 1 => %d\n", threadIdx.x, role);
cuda::memcpy_async(tile, a + threadIdx.x, b + threadIdx.x, cuda::aligned_size_t<4>(sizeof(float)), pipe);
pipe.producer_commit();
printf("%3d: before consume 1 => %d\n", threadIdx.x, role);
pipe.consumer_wait();
printf("%3d: in consume 1 => %d\n", threadIdx.x, role);
printf("%d %f\n", threadIdx.x, a[threadIdx.x-1]);
pipe.consumer_release();
printf("%3d: before produce 2 => %d\n", threadIdx.x, role);
pipe.producer_acquire();
printf("%3d: in produce 2 => %d\n", threadIdx.x, role);
cuda::memcpy_async(tile, a + threadIdx.x + blockDim.x, b + threadIdx.x + blockDim.x, cuda::aligned_size_t<4>(sizeof(float)), pipe);
pipe.producer_commit();
printf("%3d: before consume 2 => %d\n", threadIdx.x, role);
pipe.consumer_wait();
printf("%3d: in consume 2 => %d\n", threadIdx.x, role);
printf("%d %f\n", threadIdx.x, a[threadIdx.x-1 + blockDim.x]);
pipe.consumer_release();
return;
}
int main()
{
int blk_dim = 32, grd_dim = 1;
int shr_size = 2*blk_dim*grd_dim*sizeof(float);
float *b, *b_d;
b = (float*)malloc(2*grd_dim*blk_dim*sizeof(float));
cudaMalloc(&b_d, 2*grd_dim*blk_dim*sizeof(float));
for(int i=0; i<2*grd_dim*blk_dim; ++i)
b[i] = i;
cudaMemcpy(b_d, b, 2*grd_dim*blk_dim*sizeof(float), cudaMemcpyHostToDevice);
test_kernel<<<grd_dim, blk_dim, shr_size>>>(b_d);
cudaError_t ierr = cudaDeviceSynchronize();
return ierr;
}
which can be compiled for a V100 or Titan V with $ nvcc pipe_role_test.cu --std=c++17 --gpu-code=compute_70 --gpu-architecture=compute_70 -o test
FYI: I am testing this on a 64-bit Ubuntu 20.04 system with a Titan V and Cuda V11.2.67.