Is this an edge-case with cuMemcpyHtoDAsync?

Does the sample code provided below behave as expected by the CUDA spec? The documentation is difficult to decipher in this area.

I would expect both options for mallocMode to produce the same result (3), but they do not. Why is that?

//usr/local/cuda/bin/nvcc $0 -lcuda -lcudart --run; exit
//This is a self-running source file.
//chmod +x thisFile.cpp && ./thisFile.cpp to run it.
#include "cuda.h"
#include "cuda_runtime.h"
#include <memory.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Selects how main() obtains its two host buffers:
//   cstd -> malloc()          (reported to print 4, i.e. the failing case)
//   cuda -> cuMemAllocHost()  (reported to print 3, i.e. the working case)
enum MallocMode { cstd, cuda };
MallocMode mallocMode = cstd;
// Argument bundle handed to hostFunc through its void* user-data pointer.
struct Params {
    uint8_t *pSrc;  // bytes hostFunc reads
    uint8_t *pDst;  // bytes hostFunc writes
    size_t size;    // number of bytes hostFunc processes
};
void hostFunc(void *pUserData) {
    Params *pParams = (Params*)pUserData;
    for (size_t i=0; i<pParams->size; i++)
        pParams->pDst[i] = pParams->pSrc[i] | 2;
int main() {
    CUdevice device;
    cuDeviceGet(&device, 0);
    CUcontext ctx;
    cuDevicePrimaryCtxRetain(&ctx, device);
    size_t bufferSize = 1<<20;
    uint8_t *hostSrc, *hostDst;
    switch(mallocMode) {
        case cstd:
            hostSrc = (uint8_t*)malloc(bufferSize);
            hostDst = (uint8_t*)malloc(bufferSize);
        case cuda:
            cuMemAllocHost((void**)&hostSrc, bufferSize);
            cuMemAllocHost((void**)&hostDst, bufferSize);
    uint8_t *devSrc;
    cuMemAlloc((CUdeviceptr*)&devSrc, bufferSize);
    cudaStream_t stream = nullptr;
    memset(hostSrc, 1, bufferSize);
    memset(hostDst, 4, bufferSize);
    Params hostParams {hostSrc, hostDst, bufferSize};
    cuLaunchHostFunc(stream, hostFunc, &hostParams);
    cuMemcpyHtoDAsync((CUdeviceptr)devSrc, hostDst, bufferSize, stream);
    cuMemcpyDtoHAsync(hostDst, (CUdeviceptr)devSrc, bufferSize, stream);
    printf("# %d\n", hostDst[0]); // 3 = success.  4 = failure.
    return 0;

It looks like a defect in CUDA to me. Stream semantics are not being followed in the failing case. You might wish to file a bug using the instructions linked at the top of this sub-forum.

Filed. BUG ID 3193744