In a program I need to copy a char buffer of N elements from 4-byte aligned shared memory to 4-byte aligned global memory. For efficient copy, as many 4-byte copies (ints) as possible are performed. the remainder is copied byte per byte.
I want to rewrite this code with memcpy_async for readability. memcpy_async allows to specify a compile-time pointer alignment, so ideally the 4-byte alignment should be specified. However, with the alignment x specified, only (N / x) * x seem to be copied.
Is this intended? Am I using it wrong? The programming guide is not very clear to me on this topic (C.6.2.1) Programming Guide :: CUDA Toolkit Documentation
For example, usage 2 mentions a copy of N elements, and then uses N as a template parameter for aligned_size_t.
The following code shows the problem. 5 chars should be copied from smem to gmem. The kernel which uses aligned_size_t<4> (kernel1) only copies 4 chars.
#include <iostream>
#include <algorithm>
#include <cassert>
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
#include <cuda/barrier>
namespace cg = cooperative_groups;
__global__ void kernel0(char* a, const char* b, int N){
__shared__ char smem[1024];
auto g = cg::this_thread_block();
for(int i = g.thread_rank(); i < N; i += g.size()) {
smem[i] = b[i];
}
g.sync();
cg::memcpy_async(
g,
a,
smem,
N
);
cg::wait(g);
}
__global__ void kernel1(char* a, const char* b, int N){
__shared__ char smem[1024];
auto g = cg::this_thread_block();
for(int i = g.thread_rank(); i < N; i += g.size()) {
smem[i] = b[i];
}
g.sync();
cg::memcpy_async(
g,
a,
cuda::aligned_size_t<4>(N),
smem,
cuda::aligned_size_t<4>(N)
);
cg::wait(g);
}
__global__ void kernel2(char* a, const char* b, int N){
__shared__ char smem[1024];
__shared__ cuda::barrier<cuda::thread_scope_block> barrier;
auto g = cg::this_thread_block();
if(g.thread_rank() == 0) {
init(&barrier, g.size());
}
for(int i = g.thread_rank(); i < N; i += g.size()) {
smem[i] = b[i];
}
g.sync();
cuda::memcpy_async(
g,
a,
smem,
cuda::aligned_size_t<4>(N),
barrier
);
barrier.arrive_and_wait();
}
__global__ void kernel3(char* a, const char* b, int N){
__shared__ char smem[1024];
auto g = cg::this_thread_block();
for(int i = g.thread_rank(); i < N; i += g.size()) {
smem[i] = b[i];
}
g.sync();
const int fullInts = N / sizeof(int);
for(int i = g.thread_rank(); i < fullInts; i += g.size()) {
((int*)a)[i] = ((int*)smem)[i];
}
for(int i = g.thread_rank(); i < N - fullInts * sizeof(int); i += g.size()) {
a[fullInts * sizeof(int) + i]
= smem[fullInts * sizeof(int) + i];
}
}
int main(){
const int N = 5;
char* input; cudaMallocHost(&input, 1024);
char* output0; cudaMallocHost(&output0, 1024);
char* output1; cudaMallocHost(&output1, 1024);
char* output2; cudaMallocHost(&output2, 1024);
char* output3; cudaMallocHost(&output3, 1024);
std::fill(input, input + 1024, 'A');
std::fill(output0, output0 + 1024, 'B');
std::fill(output1, output1 + 1024, 'B');
std::fill(output2, output2 + 1024, 'B');
std::fill(output3, output3 + 1024, 'B');
kernel0<<<1, 256>>>(output0, input, N);
kernel1<<<1, 256>>>(output1, input, N);
kernel2<<<1, 256>>>(output2, input, N);
kernel3<<<1, 256>>>(output3, input, N);
auto status = cudaDeviceSynchronize();
assert(status == cudaSuccess);
std::cerr << "output0: \n";
for(int i = 0; i < N; i++){
std::cerr << output0[i];
}
std::cerr << "\n";
std::cerr << "output1: \n";
for(int i = 0; i < N; i++){
std::cerr << output1[i];
}
std::cerr << "\n";
std::cerr << "output2: \n";
for(int i = 0; i < N; i++){
std::cerr << output2[i];
}
std::cerr << "\n";
std::cerr << "output3: \n";
for(int i = 0; i < N; i++){
std::cerr << output3[i];
}
std::cerr << "\n";
}
Output on my machine with sm_70:
output0:
AAAAA
output1:
AAAAB
output2:
AAAAA
output3:
AAAAA