it seems to me that there is a bug or that I made a mistake in my use of __pipeline memcpy async, there is no implementation online for the moment so it is complicated to implement
Code;
#include <stdio.h>
#include <cuda_pipeline.h>
__global__ void test_pipe_intr(float* input) {
__shared__ float smem[32];
__shared__ int idx4;
idx4 = threadIdx.x * 4;
// for tread 0 get it imput[0 - 4], for tread 1 get it imput[4 - 8]
__pipeline_memcpy_async(smem, input + idx4, 16);
__pipeline_commit();
__pipeline_wait_prior(0);
printf("% i = %f \n", threadIdx.x, smem[0]);
}
int main() {
float* h_C = (float*)malloc(32000*sizeof(float));
float* d_C;
// fill a array from 0 to 32000
for (int i = 0; i < 32000; i++) {
h_C[i] = i;
}
cudaMalloc(&d_C, 32000*sizeof(float));
cudaMemcpy(d_C, h_C, 32000*sizeof(float), cudaMemcpyHostToDevice);
test_pipe_intr<<<1, 32>>>(d_C);
cudaDeviceSynchronize();
}
venus@V:~/Pipeline$ compute-sanitizer ./a.out
========= COMPUTE-SANITIZER
0 = 96.000000
1 = 96.000000
2 = 96.000000
3 = 96.000000
4 = 96.000000
5 = 96.000000
6 = 96.000000
7 = 96.000000
8 = 96.000000
9 = 96.000000
10 = 96.000000
11 = 96.000000
12 = 96.000000
13 = 96.000000
14 = 96.000000
15 = 96.000000
16 = 96.000000
17 = 96.000000
18 = 96.000000
19 = 96.000000
20 = 96.000000
21 = 96.000000
22 = 96.000000
23 = 96.000000
24 = 96.000000
25 = 96.000000
26 = 96.000000
27 = 96.000000
28 = 96.000000
29 = 96.000000
30 = 96.000000
31 = 96.000000
========= ERROR SUMMARY: 0 errors
I expected to have 0 = 0, 1 = 4, 2 = 8… as a result
Thank for hell