cudaMemcpyAsync waiting for another unrelated cudaMemcpyAsync

Hello,
I stumbled upon some weird behavior. See the code:

#include <cstdio>
#include <cstdlib>
#include <thread>
#include <chrono>
#include <omp.h>
#include <cuda_runtime.h>



#define CHECK(status) do { _check((status), __FILE__, __LINE__); } while(false)
inline void _check(cudaError_t error_code, const char *file, int line)
{
    if (error_code != cudaSuccess)
    {
        fprintf(stderr, "CUDA Error %d %s: %s. In file '%s' on line %d\n", error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), file, line);
        fflush(stderr);
        exit(1);
    }
}





// Busy-loop kernel: a single thread iterates long enough to occupy the stream for several hundred ms
__global__ void long_kernel(double * xx)
{
    double & x = *xx;
    for(size_t i = 0; i < 100000000; i++)
    {
        x = x * x - 0.5;
    }
}





int main(int argc, char ** argv)
{
    cudaStream_t streamA;
    cudaStream_t streamB;
    CHECK(cudaStreamCreate(&streamA));
    CHECK(cudaStreamCreate(&streamB));

    double * d_dataA;
    CHECK(cudaMalloc(&d_dataA, sizeof(double)));
    double * d_dataB;
    CHECK(cudaMalloc(&d_dataB, sizeof(double)));
    double dataA;
    double dataB;



    for(int r = 0; r < 3; r++)
    {
        printf("Test #%d\n", r+1);

        double total_start = omp_get_wtime();
        #pragma omp parallel num_threads(2)
        {
            int i = omp_get_thread_num();
            if(i == 0)
            {
                printf("  Thread A           submitting kernel\n");
                double kernel_start = omp_get_wtime();
                long_kernel<<<1,1,0,streamA>>>(d_dataA);
                double kernel_stop = omp_get_wtime();
                printf("  Thread A           kernel submitted, %.3f ms\n", (kernel_stop-kernel_start)*1000.0);

                // printf("  Thread A           synchronizing stream\n");
                // double streamsync_start = omp_get_wtime();
                // CHECK(cudaStreamSynchronize(streamA));
                // double streamsync_stop = omp_get_wtime();
                // printf("  Thread A           stream synchronized, %.3f ms\n", (streamsync_stop-streamsync_start)*1000.0);

                printf("  Thread A           submitting memcpy\n");
                double memcpy_start = omp_get_wtime();
                CHECK(cudaMemcpyAsync(&dataA, d_dataA, sizeof(double), cudaMemcpyDeviceToHost, streamA));
                double memcpy_stop = omp_get_wtime();
                printf("  Thread A           memcpy submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
            }
            if(i == 1)
            {
                printf("            Thread B sleeping\n");
                double sleep_start = omp_get_wtime();
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                double sleep_stop = omp_get_wtime();
                printf("            Thread B slept, %.3f ms\n", (sleep_stop-sleep_start)*1000.0);

                printf("            Thread B submitting memcpy\n");
                double memcpy_start = omp_get_wtime();
                CHECK(cudaMemcpyAsync(&dataB, d_dataB, sizeof(double), cudaMemcpyDeviceToHost, streamB));
                double memcpy_stop = omp_get_wtime();
                printf("            Thread B memcpy submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
            }
        }
        double sync_start = omp_get_wtime();
        CHECK(cudaDeviceSynchronize());
        double sync_stop = omp_get_wtime();
        printf("  Synchronize: %.3f ms\n", (sync_stop-sync_start)*1000.0);
        double total_stop = omp_get_wtime();
        printf("  Total time: %.3f ms\n", (total_stop-total_start)*1000.0);
    }



    CHECK(cudaFree(d_dataA));
    CHECK(cudaFree(d_dataB));

    CHECK(cudaStreamDestroy(streamA));
    CHECK(cudaStreamDestroy(streamB));

    return 0;
}

I first create two CUDA streams and allocate two doubles in device memory. Then I launch two threads, each using one of the streams. The first thread submits a kernel that runs for ~470 ms, then submits a cudaMemcpyAsync that copies the result to the host. The second thread does completely unrelated things: it first sleeps for 100 ms, and then copies a different variable from device to host using cudaMemcpyAsync. The whole test is repeated 3 times to get rid of warmup overheads.

The host memory is pageable, so the cudaMemcpyAsync will actually be synchronous: the call will not return until the transfer has finished. I am OK with that.
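If I read the documented API synchronization behavior correctly, a device-to-pageable-host copy returns only once the copy has completed, so the copied value is already valid right after the call. A minimal sketch, reusing the CHECK macro from the code above:

double h = 0.0;                      // ordinary (pageable) host variable
double * d;
CHECK(cudaMalloc(&d, sizeof(double)));
double v = 42.0;
CHECK(cudaMemcpy(d, &v, sizeof(double), cudaMemcpyHostToDevice));
CHECK(cudaMemcpyAsync(&h, d, sizeof(double), cudaMemcpyDeviceToHost, 0));
printf("h = %f\n", h);               // prints 42.000000, no synchronization needed
CHECK(cudaFree(d));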

Compile with nvcc -g -O2 -Xcompiler -fopenmp source.cu -o program.x and run with ./program.x.

Expected output:

...
Test #3
            Thread B sleeping
  Thread A           submitting kernel
  Thread A           kernel submitted, 0.004 ms
  Thread A           submitting memcpy
            Thread B slept, 100.052 ms
            Thread B submitting memcpy
            Thread B memcpy submitted, 0.013 ms
  Thread A           memcpy submitted, 467.089 ms
  Synchronize: 0.003 ms
  Total time: 467.131 ms

That is, I expect the memcpy in thread B to execute and copy the data immediately, since nothing is blocking streamB and no other memory transfer is in flight at that moment (thread A's kernel is still executing at that time; its memcpy starts only after the kernel finishes).

Actual output:

...
Test #3
            Thread B sleeping
  Thread A           submitting kernel
  Thread A           kernel submitted, 0.004 ms
  Thread A           submitting memcpy
            Thread B slept, 100.052 ms
            Thread B submitting memcpy
  Thread A           memcpy submitted, 467.089 ms
            Thread B memcpy submitted, 367.062 ms
  Synchronize: 0.003 ms
  Total time: 467.131 ms

That is, thread B's memcpy is waiting for thread A's memcpy to finish for some reason.

Why is thread B's memcpy waiting for thread A's memcpy to finish? At the time when thread B wants to do its memcpy, streamA is executing the kernel on the device; no memcpy that could interfere with thread B's memcpy is actually being performed in streamA at that time.

Why is this happening? Is this behavior expected?

It seems that cudaMemcpyAsync operations with pageable host memory are executed in the order they were submitted, regardless of stream. They are not executed independently as their turn comes in their respective streams.

A simple workaround is to put a cudaStreamSynchronize(streamA) between the kernel and the memcpy in thread A's code (the commented-out lines in the code above). Then thread B's memcpy happens right away, and thread A's total runtime does not change. But why do I have to do that?

This was just a simple demonstration of the core of the problem. In my real application, thread A calls cusparseSpSM_analysis (which, according to Nsight, basically launches some kernels and performs pageable memcpys), and thread B is trying to submit a host function via cudaLaunchHostFunc, which I suspect uses a pageable memcpy internally (I am not sure, but the behavior in the first example is the same if I replace the memcpy with cudaLaunchHostFunc). The interleaving of the thread A/thread B execution does not match the original example exactly, but the cudaLaunchHostFunc takes almost as long as the cusparseSpSM_analysis, as if it were waiting for it to finish, despite the two being in unrelated streams. I cannot use the cudaStreamSynchronize workaround there, because I would need to modify the cusparseSpSM_analysis function, which I can't. Here is the code for that problem:

#include <cstdio>
#include <thread>
#include <chrono>
#include <vector>
#include <stdexcept>
#include <cstdlib>
#include <omp.h>
#include <cuda_runtime.h>
#include <cusparse.h>



#define CHECK(status) do { _check((status), __FILE__, __LINE__); } while(false)
inline void _check(cudaError_t error_code, const char *file, int line)
{
    if (error_code != cudaSuccess)
    {
        fprintf(stderr, "CUDA Error %d %s: %s. In file '%s' on line %d\n", error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), file, line);
        fflush(stderr);
        exit(1);
    }
}
inline void _check(cusparseStatus_t status, const char *file, int line)
{
    if (status != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "CUSPARSE Error %d %s: %s. In file '%s' on line %d\n", status, cusparseGetErrorName(status), cusparseGetErrorString(status), file, line);
        fflush(stderr);
        exit(1);
    }
}



struct trsm_system
{
    int size;
    int nrhs;
    int nnz;
    int * A_rowptrs = nullptr;
    int * A_colidxs = nullptr;
    double * A_vals = nullptr;
    double * B = nullptr;
    double * X = nullptr;
};

// Read a CSR triangular system from a text file (size, nrhs, nnz, then rowptrs, colidxs, vals) and upload it to the device
trsm_system load_system(const char * file)
{
    FILE * f = fopen(file, "r");
    if(f == nullptr) throw std::runtime_error("cannot open matrix file");

    trsm_system sys;
    fscanf(f, "%d%d%d", &sys.size, &sys.nrhs, &sys.nnz);
    std::vector<int> rowptrs(sys.size+1);
    std::vector<int> colidxs(sys.nnz);
    std::vector<double> vals(sys.nnz);
    for(int r = 0; r <= sys.size; r++) fscanf(f, "%d", &rowptrs[r]);
    for(int i = 0; i < sys.nnz; i++) fscanf(f, "%d", &colidxs[i]);
    for(int i = 0; i < sys.nnz; i++) fscanf(f, "%lf", &vals[i]);

    fclose(f);

    std::vector<double> B(sys.size * sys.nrhs);
    for(size_t i = 0; i < B.size(); i++) B[i] = (double)rand() / RAND_MAX;

    CHECK(cudaMalloc(&sys.A_rowptrs, (sys.size + 1) * sizeof(int)));
    CHECK(cudaMalloc(&sys.A_colidxs, sys.nnz * sizeof(int)));
    CHECK(cudaMalloc(&sys.A_vals, sys.nnz * sizeof(double)));
    CHECK(cudaMalloc(&sys.B, sys.size * sys.nrhs * sizeof(double)));
    CHECK(cudaMalloc(&sys.X, sys.size * sys.nrhs * sizeof(double)));

    CHECK(cudaMemcpy(sys.A_rowptrs, rowptrs.data(), (sys.size + 1) * sizeof(int), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(sys.A_colidxs, colidxs.data(), sys.nnz * sizeof(int), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(sys.A_vals, vals.data(), sys.nnz * sizeof(double), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(sys.B, B.data(), sys.size * sys.nrhs * sizeof(double), cudaMemcpyHostToDevice));

    return sys;
}

void clear_system(trsm_system & sys)
{
    CHECK(cudaFree(sys.A_rowptrs));
    CHECK(cudaFree(sys.A_colidxs));
    CHECK(cudaFree(sys.A_vals));
    CHECK(cudaFree(sys.B));
    CHECK(cudaFree(sys.X));
}









int main(int argc, char ** argv)
{
    trsm_system sys = load_system("matrix25.txt");

    cudaStream_t streamA;
    cudaStream_t streamB;
    CHECK(cudaStreamCreate(&streamA));
    CHECK(cudaStreamCreate(&streamB));

    for(int r = 0; r < 3; r++)
    {
        printf("Test #%d\n", r+1);

        cusparseHandle_t handle;
        CHECK(cusparseCreate(&handle));
        CHECK(cusparseSetStream(handle, streamA));

        cusparseSpMatDescr_t descr_A;
        CHECK(cusparseCreateCsr(&descr_A, sys.size, sys.size, sys.nnz, sys.A_rowptrs, sys.A_colidxs, sys.A_vals, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
        auto lower = CUSPARSE_FILL_MODE_LOWER;
        auto nonunit = CUSPARSE_DIAG_TYPE_NON_UNIT;
        CHECK(cusparseSpMatSetAttribute(descr_A, CUSPARSE_SPMAT_FILL_MODE, &lower, sizeof(lower)));
        CHECK(cusparseSpMatSetAttribute(descr_A, CUSPARSE_SPMAT_DIAG_TYPE, &nonunit, sizeof(nonunit)));

        cusparseDnMatDescr_t descr_B, descr_X;
        CHECK(cusparseCreateDnMat(&descr_B, sys.size, sys.nrhs, sys.nrhs, sys.B, CUDA_R_64F, CUSPARSE_ORDER_ROW));
        CHECK(cusparseCreateDnMat(&descr_X, sys.size, sys.nrhs, sys.nrhs, sys.X, CUDA_R_64F, CUSPARSE_ORDER_ROW));
    
        cusparseSpSMDescr_t descr_spsm;
        CHECK(cusparseSpSM_createDescr(&descr_spsm));

        cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
        cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
        double one = 1.0;
        size_t buffersize;
        void * buffer;

        CHECK(cusparseSpSM_bufferSize(handle, opA, opB, &one, descr_A, descr_B, descr_X, CUDA_R_64F, CUSPARSE_SPSM_ALG_DEFAULT, descr_spsm, &buffersize));
        CHECK(cudaDeviceSynchronize());
        CHECK(cudaMalloc(&buffer, buffersize));



        double total_start = omp_get_wtime();
        #pragma omp parallel num_threads(2)
        {
            int i = omp_get_thread_num();
            if(i == 0)
            {
                printf("  Thread A           analysis starting\n");
                double kernel_start = omp_get_wtime();
                CHECK(cusparseSpSM_analysis(handle, opA, opB, &one, descr_A, descr_B, descr_X, CUDA_R_64F, CUSPARSE_SPSM_ALG_DEFAULT, descr_spsm, buffer));
                double kernel_stop = omp_get_wtime();
                printf("  Thread A           analysis finished, %.3f ms\n", (kernel_stop-kernel_start)*1000.0);
            }
            if(i == 1)
            {
                printf("            Thread B sleeping\n");
                double sleep_start = omp_get_wtime();
                std::this_thread::sleep_for(std::chrono::milliseconds(10));
                double sleep_stop = omp_get_wtime();
                printf("            Thread B slept, %.3f ms\n", (sleep_stop-sleep_start)*1000.0);

                printf("            Thread B submitting host func\n");
                double memcpy_start = omp_get_wtime();
                CHECK(cudaLaunchHostFunc(streamB, [](void*){
                    printf("            Thread B host func exec\n");
                }, nullptr));
                double memcpy_stop = omp_get_wtime();
                printf("            Thread B host func submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
            }
        }
        double sync_start = omp_get_wtime();
        CHECK(cudaDeviceSynchronize());
        double sync_stop = omp_get_wtime();
        printf("  Synchronize: %.3f ms\n", (sync_stop-sync_start)*1000.0);
        double total_stop = omp_get_wtime();
        printf("  Total time: %.3f ms\n", (total_stop-total_start)*1000.0);


        
        CHECK(cudaFree(buffer));
        CHECK(cusparseSpSM_destroyDescr(descr_spsm));
        CHECK(cusparseDestroyDnMat(descr_B));
        CHECK(cusparseDestroyDnMat(descr_X));
        CHECK(cusparseDestroySpMat(descr_A));
        CHECK(cusparseDestroy(handle));
    }

    CHECK(cudaStreamDestroy(streamA));
    CHECK(cudaStreamDestroy(streamB));

    clear_system(sys);

    return 0;
}

Compile with nvcc -Wno-deprecated-declarations -g -O2 -Xcompiler -fopenmp source_realworld.cu -o program_real.x -lcusparse. Also here is the matrix file that is used in the program: matrix25.txt (98.0 MB)

I use CUDA 12.4.0 with its libraries, GPU driver 550.54.15, and an A100-SXM4-40GB GPU.

Will provide additional details or explanation if needed.

Thanks in advance for help,

Jakub

Some clarification questions:

(1) Is this a Linux or a Windows platform? What version?
(2) Are you setting any of the following environment variables, and if so, to what value: CUDA_DEVICE_MAX_CONNECTIONS, CUDA_DEVICE_MAX_COPY_CONNECTIONS
(3) If on Windows: Do you operate the GPU with the TCC driver? Is Hardware-accelerated GPU scheduling enabled in the Windows Graphics settings?

(1) Linux:
$ cat /etc/os-release
NAME="Rocky Linux"
VERSION="8.9 (Green Obsidian)"
ID="rocky"
ID_LIKE="rhel centos fedora"
VERSION_ID="8.9"
PLATFORM_ID="platform:el8"
PRETTY_NAME="Rocky Linux 8.9 (Green Obsidian)"
ANSI_COLOR="0;32"
LOGO="fedora-logo-icon"
CPE_NAME="cpe:/o:rocky:rocky:8:GA"
HOME_URL="https://rockylinux.org/"
BUG_REPORT_URL="https://bugs.rockylinux.org/"
SUPPORT_END="2029-05-31"
ROCKY_SUPPORT_PRODUCT="Rocky-Linux-8"
ROCKY_SUPPORT_PRODUCT_VERSION="8.9"
REDHAT_SUPPORT_PRODUCT="Rocky Linux"
REDHAT_SUPPORT_PRODUCT_VERSION="8.9"
(2) These variables are empty.

When posting code on these forums, please post inline, not as an attachment.

If you convert your host buffers to pinned buffers, I think you can get the behavior you are looking for. To see your exact desired print-out, you should put a cudaStreamSynchronize(...) inside the last timing region for each thread.

# cat t322.cu
#include <cstdio>
#include <thread>
#include <chrono>
#include <omp.h>
#include <cuda_runtime.h>



#define CHECK(status) do { _check((status), __FILE__, __LINE__); } while(false)
inline void _check(cudaError_t error_code, const char *file, int line)
{
    if (error_code != cudaSuccess)
    {
        fprintf(stderr, "CUDA Error %d %s: %s. In file '%s' on line %d\n", error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), file, line);
        fflush(stderr);
        exit(1);
    }
}





__global__ void long_kernel(double * xx)
{
    double & x = *xx;
    for(size_t i = 0; i < 100000000; i++)
    {
        x = x * x - 0.5;
    }
}





int main(int argc, char ** argv)
{
    cudaStream_t streamA;
    cudaStream_t streamB;
    CHECK(cudaStreamCreate(&streamA));
    CHECK(cudaStreamCreate(&streamB));

    double * d_dataA;
    CHECK(cudaMalloc(&d_dataA, sizeof(double)));
    double * d_dataB;
    CHECK(cudaMalloc(&d_dataB, sizeof(double)));
    double dataA;
    double dataB;
    double *p_dataA, *p_dataB;
    CHECK(cudaHostAlloc(&p_dataA, sizeof(double), cudaHostAllocDefault));
    CHECK(cudaHostAlloc(&p_dataB, sizeof(double), cudaHostAllocDefault));


    for(int r = 0; r < 3; r++)
    {
        printf("Test #%d\n", r+1);

        double total_start = omp_get_wtime();
        #pragma omp parallel num_threads(2)
        {
            int i = omp_get_thread_num();
            if(i == 0)
            {
                printf("  Thread A           submitting kernel\n");
                double kernel_start = omp_get_wtime();
                long_kernel<<<1,1,0,streamA>>>(d_dataA);
                double kernel_stop = omp_get_wtime();
                printf("  Thread A           kernel submitted, %.3f ms\n", (kernel_stop-kernel_start)*1000.0);

                // printf("  Thread A           synchronizing stream\n");
                // double streamsync_start = omp_get_wtime();
                // CHECK(cudaStreamSynchronize(streamA));
                // double streamsync_stop = omp_get_wtime();
                // printf("  Thread A           stream synchronized, %.3f ms\n", (streamsync_stop-streamsync_start)*1000.0);

                printf("  Thread A           submitting memcpy\n");
                double memcpy_start = omp_get_wtime();
                CHECK(cudaMemcpyAsync(p_dataA, d_dataA, sizeof(double), cudaMemcpyDeviceToHost, streamA));
                CHECK(cudaStreamSynchronize(streamA));
                double memcpy_stop = omp_get_wtime();
                printf("  Thread A           memcpy submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
            }
            if(i == 1)
            {
                printf("            Thread B sleeping\n");
                double sleep_start = omp_get_wtime();
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                double sleep_stop = omp_get_wtime();
                printf("            Thread B slept, %.3f ms\n", (sleep_stop-sleep_start)*1000.0);

                printf("            Thread B submitting memcpy\n");
                double memcpy_start = omp_get_wtime();
                CHECK(cudaMemcpyAsync(p_dataB, d_dataB, sizeof(double), cudaMemcpyDeviceToHost, streamB));
                CHECK(cudaStreamSynchronize(streamB));
                double memcpy_stop = omp_get_wtime();
                printf("            Thread B memcpy submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
            }
        }
        double sync_start = omp_get_wtime();
        CHECK(cudaDeviceSynchronize());
        double sync_stop = omp_get_wtime();
        printf("  Synchronize: %.3f ms\n", (sync_stop-sync_start)*1000.0);
        double total_stop = omp_get_wtime();
        printf("  Total time: %.3f ms\n", (total_stop-total_start)*1000.0);
    }



    CHECK(cudaFree(d_dataA));
    CHECK(cudaFree(d_dataB));

    CHECK(cudaStreamDestroy(streamA));
    CHECK(cudaStreamDestroy(streamB));

    return 0;
}

# nvcc -o t322 t322.cu -Xcompiler -fopenmp -lgomp
t322.cu(48): warning #177-D: variable "dataA" was declared but never referenced
      double dataA;
             ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

t322.cu(49): warning #177-D: variable "dataB" was declared but never referenced
      double dataB;
             ^

# ./t322
Test #1
  Thread A           submitting kernel
            Thread B sleeping
  Thread A           kernel submitted, 0.238 ms
  Thread A           submitting memcpy
            Thread B slept, 100.061 ms
            Thread B submitting memcpy
            Thread B memcpy submitted, 0.043 ms
  Thread A           memcpy submitted, 2352.955 ms
  Synchronize: 0.036 ms
  Total time: 2353.390 ms
Test #2
  Thread A           submitting kernel
            Thread B sleeping
  Thread A           kernel submitted, 0.039 ms
  Thread A           submitting memcpy
            Thread B slept, 100.060 ms
            Thread B submitting memcpy
            Thread B memcpy submitted, 0.030 ms
  Thread A           memcpy submitted, 2352.944 ms
  Synchronize: 0.005 ms
  Total time: 2353.020 ms
Test #3
            Thread B sleeping
  Thread A           submitting kernel
  Thread A           kernel submitted, 0.011 ms
  Thread A           submitting memcpy
            Thread B slept, 100.060 ms
            Thread B submitting memcpy
            Thread B memcpy submitted, 0.019 ms
  Thread A           memcpy submitted, 2352.963 ms
  Synchronize: 0.027 ms
  Total time: 2353.050 ms
#

Isn’t it nice to be able to look at the code I posted and changes I made without having to download a file? Plus it makes it searchable, which is generally a “good thing”.

When posting code on these forums, please post inline, not as an attachment.

Isn’t it nice to be able to look at the code I posted and changes I made without having to download a file? Plus it makes it searchable, which is generally a “good thing”.

Yes, it is better. Some forums have this policy, others have a policy of uploading files as attachments; I wasn't sure which kind this forum was, and for some reason assumed it was the “upload the file” type. I probably did not want to clutter the post with long code, but now I see that the code block is small and scrollable, so that is not an issue. I will know next time. I have edited the post to make the code inline.

Suggestion: when someone tries to upload a .cu/.cpp file here, instead of only displaying that the file extension is not supported (which was odd: not being able to upload a .cu file on a CUDA forum), also show a note that code should be inserted inline.

If you convert your host buffers to pinned buffers, I think you can get the behavior you are looking for. To see your exact desired print-out, you should put a cudaStreamSynchronize(...) inside the last timing region for each thread.

I know. But I cannot really do that in my actual code; the problem statement is given with pageable memory transfers. Anyway, to the question “why does X behave like this?”, the answer “just use Y, it behaves as you want” is not what I was looking for.

I cannot change the pageable host memory to pinned, because that code is not under my control. In my real application, the pageable memcpy happens inside the cusparseSpSM_analysis function, which I cannot modify, and I do not provide the host buffer that is used there. I reported this here on the general CUDA forum because the in-order execution of cudaMemcpyAsync to pageable memory is the underlying issue, IMO. Should I also post this to the HPC libraries subforum?

You can post however you wish. From a category perspective, I don’t think the demonstrator that uses cudaMemcpyAsync is a HPC libraries issue/topic.

Regarding the use with pageable memory, I don’t have a precise explanation for exactly why it works that way, but I don’t think you are doing anything wrong, and I don’t think you can “fix” the behavior (other than making a change to your code structure, which you don’t want to do or can’t do.)

IMO, what you are basically looking for is a form of asynchrony with pageable memory, and that has explicitly not worked in CUDA since day 1; it is a frequently discussed area. You can find many forum posts describing when you can get asynchrony with cudaMemcpyAsync and what the requirements are, and it is also documented in several places in the CUDA documentation.

The following is historical background information that may no longer be accurate as underlying implementation details likely have changed over time as the CUDA memory model has become more complex. Please see the CUDA API Reference for what is officially guaranteed in terms of synchronization behavior. Anything that is not officially guaranteed in documentation should not be relied upon by programmers.

Transfers between the device and pageable host memory involve two transfers: (1) system memory copy between pageable host memory and a pinned memory buffer owned by the driver (2) DMA transfer between the driver’s pinned memory buffer and device memory. The ordering that is being observed would appear to be a direct consequence of the use of this shared resource.

In essence, cudaMemcpyAsync operations involving pageable host memory fall back to plain cudaMemcpy behavior, with the possible exception of “small” host to device copies for which data could be sent through the GPU’s command queue.

So there is an ‘easy’ workaround for the OP: Do this manually.

Create an additional suitably sized pinned host buffer per stream, copy to it with host callbacks inserted into the stream and then use cudaMemcpyAsync from the pinned buffers.

The copies in question are device->host in the code I looked at. I agree that a cudaMemcpyAsync followed by a host callback (with an ordinary memcpy) may work. Everything up to the point of the host callback is already in the sample I provided.
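A sketch of that remaining piece, assuming one pinned staging buffer per stream and reusing the CHECK macro from above (the helper name d2h_via_staging and the CopyCtx struct are illustrative, not part of the posted code):

#include <cstring>   // memcpy

// Context handed to the host callback; the callback frees it.
struct CopyCtx { double * pinned; double * pageable; size_t bytes; };

// Copy device -> pinned staging buffer asynchronously, then move
// pinned -> pageable in a host callback, so the driver's shared
// pageable-copy path is never involved.
void d2h_via_staging(double * dst_pageable, const double * src_device,
                     size_t bytes, double * pinned_stage, cudaStream_t stream)
{
    CHECK(cudaMemcpyAsync(pinned_stage, src_device, bytes,
                          cudaMemcpyDeviceToHost, stream));
    CopyCtx * ctx = new CopyCtx{pinned_stage, dst_pageable, bytes};
    CHECK(cudaLaunchHostFunc(stream, [](void * p){
        CopyCtx * c = static_cast<CopyCtx *>(p);
        memcpy(c->pageable, c->pinned, c->bytes);   // ordinary host memcpy
        delete c;
    }, ctx));
    // NOTE: pinned_stage must not be reused until the callback has run
    // (synchronize the stream or record an event before the next copy into it).
}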

Thanks for the replies.

Transfers between the device and pageable host memory involve two transfers: (1) system memory copy between pageable host memory and a pinned memory buffer owned by the driver (2) DMA transfer between the driver’s pinned memory buffer and device memory. The ordering that is being observed would appear to be a direct consequence of the use of this shared resource.

Yes, this is how I imagined that memcpy with pageable memory would work - with a pinned intermediate buffer.

And that buffer seems to be shared across the whole CUDA context, and locked as a whole by each pageable D2H memcpy (not sure about H2D). I can now see in Nsight that there are some pthread mutex lock calls.

And the issue probably is that the lock is taken right away when the cudaMemcpyAsync is submitted, not later when the copy can actually start executing in its stream.

So there is an ‘easy’ workaround for the OP: Do this manually.

cudaMemcpyAsync [to pinned memory] followed by a host callback (with an ordinary memcpy )

Yes, that would work in the cudaMemcpyAsync example.

Anyway, I now think I understand better where the issue is: most probably not in my code, but in the way pageable memory transfers work internally. I will now contact official support and report back here.

No matter how the protection works in detail, if there is only a single staging buffer for staged transfers, once data has been deposited into it, nothing can be allowed to write to it until that data has been retrieved in its entirety, and this precludes out-of-order operation.
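To illustrate that argument, here is a toy model in plain C++, with a mutex standing in for exclusive ownership of the single staging buffer (an illustration of the reasoning above, not actual driver code):

#include <cstdio>
#include <mutex>
#include <thread>
#include <chrono>

std::mutex staging;   // stands in for the single pinned staging buffer

// A staged transfer must own the buffer for its entire duration:
// deposit, DMA, retrieve. Nothing else may write to it in between.
void staged_transfer(const char * who, int duration_ms)
{
    std::lock_guard<std::mutex> hold(staging);
    std::this_thread::sleep_for(std::chrono::milliseconds(duration_ms));
    printf("%s finished\n", who);
}

int main()
{
    std::thread a(staged_transfer, "long copy A", 400);
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    std::thread b(staged_transfer, "short copy B", 1);  // blocks ~300 ms behind A
    a.join();
    b.join();
    return 0;
}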