Can't get any concurrency with pattern H2D->K->D2H on several streams

Sample code:

#include <cuda_runtime_api.h> 
#include <cassert>
#include <cstdint>

extern "C" {
int64_t  count_ticks_per_second();
int64_t  get_ticks();
long    _InterlockedIncrement(long volatile *Addend);
long    _InterlockedDecrement(long volatile *Addend);
void __stdcall Sleep(unsigned long dwMilliseconds);
void    create_thread(void *func, intptr_t x);
}

#define atomic32_dec        _InterlockedDecrement
#define atomic32_inc        _InterlockedIncrement
#define count_ticks_per_ms() ((double)count_ticks_per_second() / 1000.0)

enum {
    InputCount = 4096
    };

template <int BlockCount, int ThreadCount> static __global__ void
_test(int *in, int *ot) {
    auto &inp = reinterpret_cast<int (&)[InputCount]>(*in);
    auto &out = reinterpret_cast<int (&)[BlockCount][ThreadCount]>(*ot);
    extern __shared__ int sharedmem[]; // Dynamic shared memory base pointer
    sharedmem[threadIdx.x] = 0;        // initialize; the kernel is only a dummy workload to consume time
    for(int i = 0; i < InputCount * 256; ++i) {
        // "+ 1" keeps the divisor nonzero for blockIdx.x == 0, threadIdx.x == 0
        sharedmem[threadIdx.x] += inp[i % InputCount] % (BlockCount * blockIdx.x + threadIdx.x + 1);
        }
    out[blockIdx.x][threadIdx.x] = sharedmem[threadIdx.x];
    }

#define ass(x) assert(x == 0)

static volatile bool running = false;

typedef struct _entry_s {
    struct {
        int *i;
        int *o;
        } hos;
    struct {
        int *i;
        int *o;
        } dev;
    cudaStream_t s;
    long volatile idle;
    } entry_t;

template <int Streams> static entry_t data[Streams];

template <int Streams, int BlockCount, int ThreadCount>
struct Build {

enum {
    InpSize = InputCount * sizeof(int),
    OutSize = BlockCount * ThreadCount * sizeof(int),
    SmemSize = 48 << 7,
    };

static void
_callback(void *a) {    
    intptr_t i = reinterpret_cast<intptr_t>(a);
    atomic32_inc(&data<Streams>[i].idle);    
    }

static void
ctor(intptr_t i, bool create_stream=true) {
    entry_t &e = data<Streams>[i];
    cudaError_t err; 
    err = cudaHostAlloc(&e.hos.i, InpSize, cudaHostAllocWriteCombined);  ass(err);
    err = cudaHostAlloc(&e.hos.o, OutSize, 0);  ass(err);
    err = cudaMalloc(&e.dev.i, InpSize);  ass(err);
    err = cudaMalloc(&e.dev.o, OutSize);  ass(err);
    if(create_stream){
    err = cudaStreamCreateWithFlags(&e.s, cudaStreamNonBlocking); ass(err);    
        }else {
        e.s = cudaStreamPerThread;
        }
    e.idle = 6;
    }

static void
dtor(intptr_t i) {
    entry_t &e = data<Streams>[i];
    cudaError_t err;     
    err = cudaStreamDestroy(e.s); ass(err);
    err = cudaFreeHost(e.hos.i); ass(err);
    err = cudaFreeHost(e.hos.o); ass(err);
    err = cudaFree(e.dev.i); ass(err);
    err = cudaFree(e.dev.o); ass(err);
    }

static void
fire(intptr_t i) {
    entry_t &e = data<Streams>[i];    
    if (atomic32_dec(&e.idle) < 0) {
        atomic32_inc(&e.idle);        
        return;
        }
    auto &s = e.s;    
    
    cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, s);
    _test<BlockCount, ThreadCount> << < BlockCount, ThreadCount, SmemSize, s >> > (e.dev.i, e.dev.o);
    cudaMemcpyAsync(e.hos.o, e.dev.o, OutSize, cudaMemcpyDeviceToHost, s);
    cudaLaunchHostFunc(s, _callback, reinterpret_cast<void *>(i));    
    }

static void
_thread_run(void *arg) {
    auto i = reinterpret_cast<intptr_t>(arg);
    ctor(i, false);
    while(running) {
        fire(i);
        }
    dtor(i);
    }

static void
run(void) {
    cudaError_t err; 
    auto ticks_per_s = count_ticks_per_second();
    
    Sleep(1000 * 3);    
#if 1
    for (intptr_t i = 0; i < Streams; ctor(i), ++i);
    for (intptr_t i = 0; i < Streams; fire(i), ++i);

    auto period = ticks_per_s * 12;    
    for (auto start = get_ticks(); get_ticks() - start < period;) {
        for (int i = 0; i < Streams; ++i) {            
            fire(i);
            }
        }
    for (intptr_t i = 0; i < Streams; dtor(i), ++i);
#else
    running = true;
    for(intptr_t i = 0; i < Streams; create_thread(_thread_run, i), ++i);
    auto period = ticks_per_s * 12;    
    for (auto start = get_ticks(); get_ticks() - start < period;) {
        Sleep(200);
        }
    running = false;
#endif
    }

    };

extern "C" void
cuda_simple_test(void) {
    // does cudaDeviceProp.asyncEngineCount affect this test?
    Build<6, 15, 256>::run();
    }

With this code, on an RTX 3090, an Intel i7 3770, and Win 10, I get the expected concurrency (profiler screenshot not shown here).

With the same code and executable on a machine with three RTX 3080s (running on one of them), an Intel W-3265, and Win 10: no concurrency at all.


But if we remove either of the cudaMemcpyAsync calls (the H2D or the D2H), the concurrency comes back.


What did I do wrong? Or can the RTX 3080 simply not do this? Is there some configuration step I have missed?

Hardware-accelerated GPU scheduling is enabled on both machines, but turning it off does not help. Are there other options I can play with?

deviceQuery.exe reports that the RTX 3080 has only one copy engine, while the RTX 3090 has two. Does that matter? (I don't think so, but I don't have any other clues.)
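
(For reference, a minimal sketch of how to read the same value programmatically through the runtime API, via cudaDeviceProp.asyncEngineCount; the function name print_async_engine_count is my own and is not part of the test program above.)

#include <cuda_runtime_api.h>
#include <cstdio>

// Print the async engine (copy engine) count the runtime reports for each GPU.
// asyncEngineCount == 1: the device claims it can overlap one copy direction with kernels.
// asyncEngineCount == 2: it claims it can overlap copies in both directions with kernels.
extern "C" void
print_async_engine_count(void) {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; ++dev) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess) {
            printf("device %d (%s): asyncEngineCount = %d\n",
                   dev, prop.name, prop.asyncEngineCount);
            }
        }
    }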

@Robert_Crovella, I did what you said in this post.

But I can't get a good result on the machine with three RTX 3080s. Could you give me some hints on this, please?

Code for get_ticks:

#include <stdint.h>
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>


int64_t 
count_ticks_per_second() { 
    LARGE_INTEGER li;
    QueryPerformanceFrequency(&li);
    return li.QuadPart; 
    }

int64_t 
get_ticks() { 
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return li.QuadPart;
    }

I don't generally get or try to demonstrate particular concurrency patterns on windows WDDM. The WDDM mechanism often gets in the way. Try using a GPU that can be (and is) placed in TCC mode on windows to get more predictable behavior. Or switch to linux.

Thanks for your response.

I agree with you about Windows being a troublemaker sometimes. But just telling my teammates we should switch to Linux won't get them to buy me a drink. I have tried to set these RTX 3080s to TCC mode without success, and purchasing new cards is not an option at the moment.

Back to my question: I think this time it's not WDDM making the trouble, but rather how the GPU driver treats the RTX 3080. I have made some new discoveries since the original post.

If we change the order of the async calls in the function 'fire', from

    cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, s); 
    _test<BlockCount, ThreadCount> << < BlockCount, ThreadCount, SmemSize, s >> > (e.dev.i, e.dev.o);
    cudaMemcpyAsync(e.hos.o, e.dev.o, OutSize, cudaMemcpyDeviceToHost, s);
    cudaLaunchHostFunc(s, _callback, reinterpret_cast<void *>(i));

to

    _test<BlockCount, ThreadCount> << < BlockCount, ThreadCount, SmemSize, s >> > (e.dev.i, e.dev.o);
    cudaMemcpyAsync(e.hos.o, e.dev.o, OutSize, cudaMemcpyDeviceToHost, s);
    cudaLaunchHostFunc(s, _callback, reinterpret_cast<void *>(i));
    cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, s); 

the execution overlap occurs again.
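
(A hedged side note from re-reading the code: with this ordering, the kernel in each fire() consumes the H2D issued by the previous fire(), so the very first launch in a stream has no preceding H2D at all. For this dummy workload that does not matter, but if the input data mattered, a one-time priming copy, e.g. right after ctor(), would be needed. Something like the sketch below; prime() is a hypothetical helper, not part of the original code.)

// Hypothetical one-time priming copy per stream, so that the first kernel launch of
// the reordered sequence has valid input waiting on the device. Could be called once
// per stream after ctor(i).
static void
prime(intptr_t i) {
    entry_t &e = data<Streams>[i];
    cudaError_t err;
    err = cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, e.s);  ass(err);
    err = cudaStreamSynchronize(e.s);  ass(err);
    }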


My thoughts: the number of copy engines matters, or rather, the number of hardware queues being used matters.

Assume we do H2D->K->D2H on some streams.

S1:  (H2D-1 KKK-1 D2H-1) (H2D-1 KKK-1 D2H-1) ...
S2:  (H2D-2 KKK-2 D2H-2) (H2D-2 KKK-2 D2H-2) ...

The RTX 3090 employs all three queues: one for kernels, one for D2H, one for H2D. Items in these queues are arranged like this:

Q1:  H2D-1 H2D-2 H2D-1 H2D-2 H2D-1 ....
Q2:  KKK-1 KKK-2 KKK-1 KKK-2 KKK-1 ....
Q3:  D2H-1 D2H-2 D2H-1 D2H-2 D2H-1 .....

Each stream x starts with H2D-x. Q1 invokes them one by one, and the streams do not affect each other, because an H2D only ever sits behind another H2D in Q1; it never waits for any other kind of operation.

The RTX 3080 with my configuration employs two queues: one for kernels, and one shared by D2H and H2D. Items are arranged like this:

Q1:  H2D-1         D2H-1 H2D-2        D2H-2 H2D-1        D2H-1 .....
Q2:         KKK-1               KKK-2              KKK-1 ...

Thus H2D-2 waits for D2H-1, and D2H-1 waits for KKK-1 to complete. So stream 2 won't start its next H2D->K->D2H until an H2D->K->D2H in stream 1 has completely finished; everything is serialized.
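
(As a hedged aside, not something I have verified on the RTX 3080: if a single transfer queue really is the cause, the classic mitigation from the days of a single hardware queue is breadth-first issue order: issue all the H2Ds across the streams, then all the kernels, then all the D2Hs, so that no H2D ever sits behind another stream's D2H. A sketch against the code above, with a hypothetical fire_all() that ignores the idle throttling for brevity:)

// Hypothetical breadth-first variant of fire(): the same work, but issued grouped by
// operation type across all streams, so that an H2D never queues behind a D2H.
static void
fire_all(void) {
    for (intptr_t i = 0; i < Streams; ++i) {
        entry_t &e = data<Streams>[i];
        cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, e.s);
        }
    for (intptr_t i = 0; i < Streams; ++i) {
        entry_t &e = data<Streams>[i];
        _test<BlockCount, ThreadCount> <<< BlockCount, ThreadCount, SmemSize, e.s >>> (e.dev.i, e.dev.o);
        }
    for (intptr_t i = 0; i < Streams; ++i) {
        entry_t &e = data<Streams>[i];
        cudaMemcpyAsync(e.hos.o, e.dev.o, OutSize, cudaMemcpyDeviceToHost, e.s);
        cudaLaunchHostFunc(e.s, _callback, reinterpret_cast<void *>(i));
        }
    }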

When we change the order of operations in the streams:

S1:  (KKK-1 D2H-1 H2D-1) (KKK-1 D2H-1 H2D-1) ...
S2:  (KKK-2 D2H-2 H2D-2) (KKK-2 D2H-2 H2D-2) ...

in queues:

Q1:          D2H-1 H2D-1           D2H-2 H2D-2           D2H-1 H2D-1 .....
Q2:  KKK-1                 KKK-2                 KKK-1 ...

In this order, KKK-1 still needs the previous H2D-1, but this time the required H2D-1 has already completed right after the previous D2H-1, without any blocking.

This blocked/serialized behavior is not right, and the RTX 3080 shouldn't have only one copy engine. After finding this, I did some searching. In this post, you lean toward this being a bug. And the poster says that on Linux the copy engines are correctly recognized, and then everything works fine.

And this post describes the problem better than I can, though it is different hardware:

What he calls "bi-directional data transfers overlapping with kernel execution" is important; cudaDeviceProp.asyncEngineCount is important, especially for the pattern H2D->K->D2H, but people rarely say that.

Though the manual operation-reordering trick works, not all jobs can be done this way (the sample above is just the simplest demo), and different jobs have different levels of difficulty depending on their conditions/situations/business logic. So I think there are two things to do:

  1. Clarify this in introductory material, instead of simply saying "H2D->K->D2H works well"; users should learn more about the details of the copy engines/queues. At least a side note is necessary.

  2. Fix the bug on some cards. With hardware-accelerated scheduling enabled, the RTX 3080 is reported with 1 copy engine. With hardware-accelerated scheduling disabled, the RTX 3080 is reported with 5 copy engines, but it still can't produce the overlap. Windows/WDDM has nothing to do with this; my guess is that the GPU driver is confusing itself.

Typing so much is not just to blame NVIDIA. There is little material about this topic online, so I pushed myself to write out my view/thoughts. And, more IMPORTANT, I'm not sure what I said in this post is right. Please correct my mistakes and give better answers. Any hints are appreciated. Let us make it clear, once and for all.

Sorry for my poor English. And to @Robert_Crovella: I have read many of your posts/replies over the years, and you have helped me a lot even though you don't know me. Taking this opportunity, I must say thank you very much!


That is one of the signatures of WDDM batching, in my experience.

I think the idea that copy engines are the issue is not even remotely supported by the data that suggests that a reordering of calls causes overlap. If the card truly had 1 and only 1 copy engine, you would never ever see a D->H transfer overlap with a H->D transfer. The hardware wouldn't support it. So the fact that you sometimes get overlap of H->D with D->H and sometimes you don't says to me that this issue is not because the card has only 1 copy engine.

It's OK if you don't believe me. I could be wrong. I've personally never been successful at making WDDM batching always behave the way I want it to.

You're welcome to file a bug if you wish. I don't have 3080 or 3090 cards to test on. I personally don't do much testing on windows, either.

I didn't. In my previous reply I confirmed this: H2D and D2H are still in one and the same queue after the change.

The original question is: with the original code, why are all operations serialized, resulting in no concurrency at all? The later question is: with the simple change, why can kernel executions now overlap with data transfers? My guess/explanation is also in that reply.

I don't know much about WDDM batching, but I think the single FIFO queue for transfers is the answer to my original question.

In the transfer queue, the H2D from the second stream sits behind the D2H from the first stream, so the kernel in the second stream must wait for its data. In a situation like this, even though the kernels belong to different streams, they are still implicitly forced into a sequence. I wrote the details in the previous reply.

So if we can explain these observations through the mechanics of the copy engines and the interaction between streams and queues, we don't need to look further or introduce other variables.

How many copy engines a card truly has is hard to know, but how many copy engines are actually exposed is enough to make things different. The fact that the RTX 3080 can use two copy engines on Linux is not important for the explanation, but it does suggest that WDDM is not the only drawback. Personally, I consider the lost engine/queue worse than WDDM, and this one is not on Microsoft; it's NVIDIA's responsibility.

When I run your code under compute-sanitizer, I observe that the kernel _test is making illegal accesses (invalid global read of size 4) on this line:

    sharedmem[threadIdx.x] += inp[i] % (BlockCount * blockIdx.x + threadIdx.x);

If it were me, I would never study the performance of code that has those kinds of defects.

Sorry, I forgot to update the first post; that is the initial version, something scribbled on the wall.

The kernel itself is not important (even with those kinds of defects); it is just a dummy to consume time.

I will update it immediately. Thank you for pointing that out.

Updated.

The main defect in the original version is an obvious array-bounds error. But it does run even with those defects; indeed it performs as well as the version without them :P

The previous update also included the change of operation order, so I modified it again to restore the order that causes the serialization.

There is "happens to work" (but can break at any moment) and "works by design". The guild of software engineers is in the possession of closely guarded secrets that allow one to separate one from the other :-)

WDDM and WDDM 2.0 (since Windows 10) refer to the Windows Display Driver Model, which replaced the device driver model used with Windows XP. It gives the operating system a high degree of control over devices, especially those critical to the OS, such as a GPU driving graphical output. For example, this means that Windows controls GPU memory allocations. The goal of WDDM is to increase system stability.

As far as I am aware, WDDM achieves the goal of significantly increased system stability, but at the expense of performance. In particular, the WDDM mechanisms impose a large overhead on every single kernel launch. Since performance is the key reason-for-being of CUDA, the CUDA driver tries to mitigate this overhead by batching up kernel launches, so the overhead is amortized across all the kernel launches in a batch. In general, this works quite well: For many use cases, there is no significant performance difference at application level between a Linux platform (that doesn't have this overhead issue) and a Windows platform using WDDM.

But occasionally, the impact is visible in the form of performance artifacts because GPU commands are not passed to the hardware immediately as they are issued by the application. The way around this headache is to use NVIDIA's TCC driver (TCC originally was an acronym for Tesla Compute Cluster), which makes Windows see the GPU as a 3D controller. Since such a device cannot drive the GUI, Windows does not want to control it tightly and the CUDA driver can interact with the GPU directly. Best I know, the TCC driver generally works only with professional GPUs, such as the Quadro line. Switching between TCC and WDDM drivers can be done with nvidia-smi and requires a reboot.
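
(An illustrative aside on the batching described above, my own sketch rather than anything guaranteed: a workaround that is sometimes suggested for WDDM is to issue a cheap cudaStreamQuery() after enqueuing a batch of work, which can prompt the driver to submit its pending command batch to the hardware. Applied to the body of fire() it would look like the snippet below; whether it changes the observed scheduling is driver-dependent, so treat it as an experiment only.)

    // Hypothetical variant of the tail of fire(): the extra cudaStreamQuery() is a
    // commonly suggested nudge for the WDDM driver to flush its current batch.
    cudaMemcpyAsync(e.dev.i, e.hos.i, InpSize, cudaMemcpyHostToDevice, s);
    _test<BlockCount, ThreadCount> <<< BlockCount, ThreadCount, SmemSize, s >>> (e.dev.i, e.dev.o);
    cudaMemcpyAsync(e.hos.o, e.dev.o, OutSize, cudaMemcpyDeviceToHost, s);
    cudaLaunchHostFunc(s, _callback, reinterpret_cast<void *>(i));
    cudaStreamQuery(s);   // return value (cudaSuccess or cudaErrorNotReady) is ignored on purpose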

So I guess what you are asking about is kernel concurrency?

That pretty much takes copy engines out of the discussion. Some theory around queues, or else my WDDM batching suggestion, seem like candidates.

When I run your code on a V100 on linux, I witness kernel concurrency, with at least 3 or more kernels overlapped with each other.

I don't have 3080 or 3090 cards to test on, and don't have a windows setup handy.

No, I'm not asking about kernel concurrency. I am asking why kernel concurrency disappears on Windows with an RTX 3080.

Copy engines, or more exactly the number of copy engines, can't be taken out of it. My theory for why we must talk about them is written above.

In my understanding, a copy engine works like a FIFO queue. When H2D and D2H are in the same queue, the engine must invoke them one by one. A D2H must wait for the data produced by its K, so when the next H2D->K->D2H starts, its H2D sits behind the previous D2H in the queue, and that makes every H2D->K->D2H serialized.

I can observe overlapping on my RTX 3090 too, because with two copy engines, H2D and D2H are in different queues.

And I have a prediction: this kind of overlapping/concurrency will never disappear on my Win 10 machine with the RTX 3090.

I think there is potential for misunderstandings here. The way I use the terminology, kernel concurrency refers to multiple kernels running concurrently. Copy engines are orthogonal to this. Note that copy engine is just a marketing name for what is technically a DMA mechanism.

PCIe is a full-duplex interconnect. Concurrent H2D and D2H transfers (one transfer per direction at any one time) are possible, if there is a sufficient number of DMA units or channels to drive the traffic. Note that concurrent copies are independent of potential execution of one or several kernels concurrently with these host<->device transfers.

Historically, the way NVIDIA counted the copy engines on GPUs was one DMA unit per copy engine where each DMA unit could handle one single stream of uni-directional traffic at any given time. But in recent years, the copy engine counts reported by device query seem to be all over the place in weird ways that I do not understand. If there is truly only one single-channel DMA unit in the RTX 3080, concurrent copies would not be possible.
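
(For anyone who wants to probe this empirically, here is a small self-contained sketch of a bidirectional-copy test: time each transfer direction alone, then both directions at once in two streams. If the "both at once" time is close to the larger of the two single-direction times rather than their sum, the transfers overlapped. The function name copy_overlap_probe and the 256 MiB size are arbitrary choices of mine.)

#include <cuda_runtime_api.h>
#include <chrono>
#include <cstdio>

// Rough probe for bidirectional transfer overlap using pinned host buffers.
// Timing is host-side and deliberately coarse; this is a sanity check, not a benchmark.
extern "C" void
copy_overlap_probe(void) {
    const size_t bytes = 256u << 20;               // 256 MiB per direction
    void *h_src = nullptr, *h_dst = nullptr, *d_a = nullptr, *d_b = nullptr;
    cudaHostAlloc(&h_src, bytes, 0);               // pinned, so the copies are truly async
    cudaHostAlloc(&h_dst, bytes, 0);
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    auto time_ms = [&](auto issue) {               // run `issue`, wait for the GPU, return elapsed ms
        auto beg = std::chrono::steady_clock::now();
        issue();
        cudaDeviceSynchronize();
        auto end = std::chrono::steady_clock::now();
        return std::chrono::duration<double, std::milli>(end - beg).count();
        };

    double serial = time_ms([&] { cudaMemcpyAsync(d_a, h_src, bytes, cudaMemcpyHostToDevice, s1); })
                  + time_ms([&] { cudaMemcpyAsync(h_dst, d_b, bytes, cudaMemcpyDeviceToHost, s2); });
    double both   = time_ms([&] {
        cudaMemcpyAsync(d_a, h_src, bytes, cudaMemcpyHostToDevice, s1);
        cudaMemcpyAsync(h_dst, d_b, bytes, cudaMemcpyDeviceToHost, s2);
        });
    printf("back-to-back: %.1f ms, both directions at once: %.1f ms\n", serial, both);

    cudaStreamDestroy(s1); cudaStreamDestroy(s2);
    cudaFree(d_a); cudaFree(d_b);
    cudaFreeHost(h_src); cudaFreeHost(h_dst);
    }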

Thank you for your explanation about WDDM batching and other things.

You are another friend whose posts I always found when I was googling :)

Actually, my terminology is the same as yours. A pure 'kernel concurrency' discussion has nothing to do with transfers.

What I am concerned with in this post is that enqueuing H2D and D2H into one queue destroys kernel concurrency in some cases.

Especially this widely used pattern: H2D->K->D2H

One queue → every H2D that sits behind a D2H can only be issued after that D2H → every kernel that waits on such an H2D can therefore only start after that D2H → and that D2H can only run after the kernel that feeds it → so those kernels are passively serialized → kernel concurrency is broken.

Streams are streams and the transfer queue is a queue. The streams do not break the rules of the queue; the queue destroys the streams' potential for overlapping.

And IMO, even if the RTX 3080 has more units, as long as NVIDIA does not expose them on Windows, it equals one for Windows users.

False dependencies caused by a shared queue delivering work to the GPU were last an issue with compute capability 2.x devices. Kepler (compute capability 3.x) was the first architecture sporting multiple hardware queues (eight, if memory serves), eliminating that issue.

To my knowledge there have been no false dependencies in delivering work to the GPU since then, other than those due to work batching in the CUDA driver for WDDM. As the details of this batching are an implementation artifact, and therefore undocumented and subject to change at any time, it is impossible to explain in detail specific instances of false dependencies when using the WDDM driver, such as those you appear to be encountering here.

The simple fact of life is this: To exploit the concurrency mechanisms of GPU to the fullest extent of the hardware capabilities, it is generally necessary to use either a Linux platform, or use the TCC driver on a Windows platform. The fact that NVIDIA does not support the TCC driver with all GPUs is a conscious decision on their part, but based on historical observation not one likely to change anytime soon.

I have learned that and more from you and @Robert_Crovella. Thanks to both of you.

I see what you are saying: my analysis/guess could be right or wrong; after all, we can't know what happens under the hood. I knew from the beginning that my theory is more like a working model to explain what I encountered, not the truth. If it can help me in my work for a while, these days and a few more, it has partially succeeded.

So I accept that. But I still hope for a better explanation, or even more, a real solution on Windows.

useless complaining:

I hope NVIDIA can do better. The RTX 3090 shows that concurrent D2H and H2D can be done with WDDM, and what you said about the history of the shared queue and the elimination of that issue has reinforced my view on this point.

We can switch to Linux for some of our projects, but I hope the company would make everything clear and definite: what is supported on Windows, what is supported on Linux, and what the future plan is. Otherwise, users who must work on both sides have a hard time making decisions for some projects.

Your observation shows that you get the behavior you desire in one particular hardware + software context. That is a "happens to work" scenario. Any minor change to any of the ingredients could change that tomorrow. You may find cases where you observe the behavior you desire with an RTX 3080, in a different set of circumstances.

The only things that are guaranteed are those explicitly specified in writing. If things don't work as specified (in a language standard or in vendor documentation), that is a reportable bug.

Other things may happen to work as desired by someone. They may even work that way for extended periods of time and people may become accustomed to them, until they suddenly no longer work. If one is brave enough to rely on implementation artifacts (something I myself have done a fair number of times over the decades; been bitten in the behind by it, too!), one needs to be aware that things could "break", in some way or other, at any moment. In NVIDIA's documentation I see no specific guarantees given regarding concurrency in the context laid out in this thread.

You could file an RFE (enhancement request) with NVIDIA requesting stronger guarantees regarding concurrency when using the WDDM driver. Be aware though that companies, like programming language standard writers, are usually reluctant to provide stronger / tighter guarantees. This is so for a variety of reasons, including the fact that certain factors are out of their control (in this case: WDDM is controlled by Microsoft, not NVIDIA) and potential future performance issues.

That is not to say filing an RFE on this issue is useless, just trying to set realistic expectations as to what the outcome might be based on my industry experience.

I agree with you, but not completely.

That's right, and that is why I said it "could be" done. This scenario gives us proof that there is a path to the destination even if we are in a big labyrinth full of traps.

The "labyrinth" is just a figure of speech. The designers, coders, and maintainers of WDDM are just men and women with their own logic, and they usually show their intentions rather than hide them.

All of us live in an uncertain world. Rewriting parts of our code, or refactoring whole designs, just because of changes made by Microsoft or NVIDIA: that is our job, and NVIDIA's developers are in the same position. So:

Even if NVIDIA decides to focus on Linux, or they don't have enough resources for Windows, at least do not play the same trick they used to play with 10-bit color depth, a "GeForce limitation" on Windows.

Why not just give users a clear statement and a plan with a medium level of reliability? That would be "better" too.

Back to the RTX 3090: I have been using them with Windows 10 on several machines since the release date. One works with an Intel i7 3770, which is an ancient CPU; another works with an AMD 5900X, which has endless bugs; etc. These cards have always worked well in similar use cases and have never surprised or failed me the way the RTX 3080 has.

So it "happens to work", but the "scenario" has lasted long enough. And these cards may be replaced by the next generation later this year. Some scientists say that we just happen to exist. To be honest, I don't care whether it was "meant to be".

As for the RTX 3080 and a solution to the problem I encountered, I already have some immature ideas. My goal is a "scenario" that is not "happens to work" but designed to work, whether or not an NVIDIA GPU can do H2D and D2H concurrently. Those ideas are built on only one assumption: that I can't find an official way to solve the problem, such as an environment variable, a global setting, a properly configured context, or some function calls made up front.

Maybe someone has already solved a similar problem and has more details than I do. What I need is to wait for him/her to find this post ^_^