Hi there,
As the title suggests, I am trying to copy data from shared memory to global memory on a hopper GPU.
I have tried this function:
template <class T>
__device__ __inline__ void memcpy_panel(T *dst, T *src, int m, unsigned int cols, int lda_src, int lda_dst) {
for (unsigned int idx = threadIdx.x; idx < m * cols; idx += blockDim.x) {
const unsigned int row = idx % m;
const unsigned int col = idx / m;
dst[row + col * lda_dst] = src[row + col * lda_src];
}
}
where src is shared memory and dst is global memory.
Upon using the above function:
memcpy_panel(A + panel_i_write * cols_per_panel * lda, A_s_i + panel_i_offset_write * m, m, cols_per_panel, m, lda);
memcpy_panel(A + panel_j_write * cols_per_panel * lda, A_s_j + panel_j_offset_write * m, m, cols_per_panel, m, lda);
*Note that lda=m=512 in my case.
This is not using any asynchronous manner.
template <class T>
__device__ __inline__ void async_memcpy_panel(cuda::pipeline<cuda::thread_scope_thread> & pipe, T *dst, T *src, int m, unsigned int cols, int lda_src, int lda_dst) {
/* for (unsigned int col = threadIdx.x; col < cols; col += blockDim.x) {
cuda::memcpy_async(&dst[col * lda_dst], &src[col * lda_src], sizeof(T) * m, pipe);
} */ // This approach is resulting in the worse numbers across all alternatives
const unsigned int copies_per_thread = (m * cols + blockDim.x - 1) / blockDim.x;
for (unsigned int i = 0; i < copies_per_thread; ++i) {
unsigned int idx = threadIdx.x + i * blockDim.x;
if (idx >= m * cols) break;
const unsigned int row = idx % m;
const unsigned int col = idx / m;
cuda::memcpy_async(&dst[row + col * lda_dst], &src[row + col * lda_src], sizeof(T), pipe);
} // This one and the one below are of similar numbers
/* for (unsigned int idx = threadIdx.x; idx < m * cols; idx += blockDim.x) {
const unsigned int row = idx % m;
const unsigned int col = idx / m;
cuda::memcpy_async(&dst[row + col * lda_dst], &src[row + col * lda_src], sizeof(T), pipe);
} */ // This one was slightly worse than the one above
}
auto pipe = cuda::make_pipeline();
async_memcpy_panel(pipe, A + panel_i_write * cols_per_panel * lda, A_s_i + panel_i_offset_write * m, m, cols_per_panel, m, lda);
async_memcpy_panel(pipe, A + panel_j_write * cols_per_panel * lda, A_s_j + panel_j_offset_write * m, m, cols_per_panel, m, lda);
When running the above asynchronous copying function, I am trying different appraoches, where the best one is the one uncommented in the middle.
The tricky part is that the best approach in the asynchronous copying function async_memcpy_panel have resulted in a result close to memcpy_panel with no noticable improvement.
Something that have come to my mind is that the pipeline I am using cuda::pipeline<cuda::thread_scope_thread> instead of having something across the thread block.
Would appreciate any help, thanks!