Sample code:
#include <cuda_runtime_api.h>
#include <cassert>
#include <cstdint>
extern "C" {
    int64_t count_ticks_per_second();
    int64_t get_ticks();
    long _InterlockedIncrement(long volatile *Addend);
    long _InterlockedDecrement(long volatile *Addend);
    void __stdcall Sleep(unsigned long dwMilliseconds);
    void create_thread(void *func, intptr_t x);
}
#define atomic32_dec _InterlockedDecrement
#define atomic32_inc _InterlockedIncrement
#define count_ticks_per_ms() ((double)count_ticks_per_second() / 1000.0)
enum {
    InputCount = 4096
};
template <int BlockCount, int ThreadCount> static __global__ void
_test(int *in, int *ot) {
    auto &inp = reinterpret_cast<int (&)[InputCount]>(*in);
    auto &out = reinterpret_cast<int (&)[BlockCount][ThreadCount]>(*ot);
    extern __shared__ int sharedmem[]; // Dynamic shared memory base pointer
    sharedmem[threadIdx.x] = 0; // zero the slot before accumulating
    for(int i = 0; i < InputCount * 256; ++i) {
        // +1 keeps the divisor nonzero for block 0, thread 0
        sharedmem[threadIdx.x] += inp[i % InputCount] % (BlockCount * blockIdx.x + threadIdx.x + 1);
    }
    out[blockIdx.x][threadIdx.x] = sharedmem[threadIdx.x];
}
#define ass(x) assert(x == 0)
static volatile bool running = false;
typedef struct _entry_s {
    struct {
        int *i;
        int *o;
    } hos;
    struct {
        int *i;
        int *o;
    } dev;
    cudaStream_t s;
    long volatile idle; // submission credits; replenished by the host-func callback
} entry_t;
template <int Streams> static entry_t data[Streams];
template <int Streams, int BlockCount, int ThreadCount>
struct Build {
    enum {
        InpSize = InputCount * sizeof(int),
        OutSize = BlockCount * ThreadCount * sizeof(int),
        SmemSize = 48 << 7, // 6144 bytes of dynamic shared memory per block
    };
    static void
    _callback(void *a) {
        // Runs on a CUDA-managed host thread once the preceding D2H copy has
        // finished; returns one submission credit to this stream's entry.
        intptr_t i = reinterpret_cast<intptr_t>(a);
        atomic32_inc(&data<Streams>[i].idle);
    }
    static void
    ctor(intptr_t i, bool create_stream=true) {
        entry_t &e = data<Streams>[i];
        cudaError_t err;
        err = cudaHostAlloc(&e.hos.i, InpSize, cudaHostAllocWriteCombined); ass(err); // WC memory: fast for H2D copies
        err = cudaHostAlloc(&e.hos.o, OutSize, 0); ass(err);
        err = cudaMalloc(&e.dev.i, InpSize); ass(err);
        err = cudaMalloc(&e.dev.o, OutSize); ass(err);
        if(create_stream){
            err = cudaStreamCreateWithFlags(&e.s, cudaStreamNonBlocking); ass(err);
        }else {
            e.s = cudaStreamPerThread;
        }
        e.idle = 6; // allow up to 6 batches in flight per stream
    }
    static void
    dtor(intptr_t i) {
        entry_t &e = data<Streams>[i];
        cudaError_t err;
        if (e.s != cudaStreamPerThread) { // the built-in per-thread stream must not be destroyed
            err = cudaStreamDestroy(e.s); ass(err);
        }
        err = cudaFreeHost(e.hos.i); ass(err);
        err = cudaFreeHost(e.hos.o); ass(err);
        err = cudaFree(e.dev.i); ass(err);
        err = cudaFree(e.dev.o); ass(err);
    }
    static void
    fire(intptr_t i) {
        entry_t &e = data<Streams>[i];
        if (atomic32_dec(&e.idle) < 0) { // no credit left: too many batches already queued
            atomic32_inc(&e.idle);
            return;
        }
        auto &s = e.s;
        cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, s);
        _test<BlockCount, ThreadCount><<<BlockCount, ThreadCount, SmemSize, s>>>(e.dev.i, e.dev.o);
        cudaMemcpyAsync(e.hos.o, e.dev.o, OutSize, cudaMemcpyDeviceToHost, s);
        cudaLaunchHostFunc(s, _callback, reinterpret_cast<void *>(i));
    }
    static void
    _thread_run(void *arg) {
        auto i = reinterpret_cast<intptr_t>(arg);
        ctor(i, false); // uses cudaStreamPerThread instead of an explicit stream
        while(running) {
            fire(i);
        }
        cudaStreamSynchronize(data<Streams>[i].s); // drain outstanding work before freeing
        dtor(i);
    }
    static void
    run(void) {
        auto ticks_per_s = count_ticks_per_second();
        Sleep(1000 * 3);
#if 1
        for (intptr_t i = 0; i < Streams; ctor(i), ++i);
        for (intptr_t i = 0; i < Streams; fire(i), ++i);
        auto period = ticks_per_s * 12; // run for 12 seconds
        for (auto start = get_ticks(); get_ticks() - start < period;) {
            for (int i = 0; i < Streams; ++i) {
                fire(i);
            }
        }
        cudaDeviceSynchronize(); // drain all in-flight work before tearing down
        for (intptr_t i = 0; i < Streams; dtor(i), ++i);
#else
        running = true;
        for(intptr_t i = 0; i < Streams; create_thread(reinterpret_cast<void *>(_thread_run), i), ++i);
        auto period = ticks_per_s * 12;
        for (auto start = get_ticks(); get_ticks() - start < period;) {
            Sleep(200);
        }
        running = false; // worker threads tear their own entries down on exit
#endif
    }
};
extern "C" void
cuda_simple_test(void) {
// does cudaDeviceProp.asyncEngineCount affect this test?
Build<6, 15, 256>::run();
}
With this code, on an RTX 3090 (Intel i7-3770, Win 10), I get the expected concurrency across the streams (profiler screenshot):
With the same code and the same executable on a machine with three RTX 3080s (the test runs on just one of them), an Intel W-3265, and Win 10, there is no concurrency at all.
But if I remove either the H2D or the D2H cudaMemcpyAsync, the concurrency comes back.
What did I do wrong? Or can the RTX 3080 simply not satisfy this need? Is there some configuration step I have missed?
Hardware-accelerated GPU scheduling is enabled on both machines, but turning it off does not help. Are there other options I can play with?
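For reference, here is a minimal diagnostic sketch for checking the device properties relevant to this question (the helper name print_async_caps and the use of device 0 are my own choices, not part of the test). asyncEngineCount is documented to be 2 when the device can overlap an H2D copy, a D2H copy, and kernel execution at the same time, and 1 when only one copy direction can overlap with kernels:

#include <cuda_runtime_api.h>
#include <cstdio>
extern "C" void
print_async_caps(void) {
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0); // assumes the GPU under test is device 0
    if (err != cudaSuccess) {
        printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return;
    }
    printf("%s\n", prop.name);
    printf("asyncEngineCount  = %d\n", prop.asyncEngineCount);  // number of copy engines usable concurrently with kernels
    printf("concurrentKernels = %d\n", prop.concurrentKernels); // 1 if kernels from different streams can run concurrently
}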