Why does cudaMemcpyAsync behave differently on different CPU platforms?

I wrote a simple asynchronous copy test program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <thread>
#include <cuda.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <npp.h>
#define MAX_LINEPOINTS  650
#define MAX_BEAMS       32
#define MAX_TXNUMBER    17
#define MAX_ENSEMBLE    40
#define MAX_FRAMESIZE   (MAX_LINEPOINTS * MAX_BEAMS * MAX_TXNUMBER)
// Prints a diagnostic on stderr when a CUDA runtime call fails.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Asynchronous host-to-device copy test.
//
// Fills one pinned host frame with random complex values, then enqueues
// MAX_ENSEMBLE copies of that frame into consecutive slots of a device buffer
// on a single non-blocking stream, pausing 5 ms after each enqueue so that a
// profiler timeline has room to show when each transfer actually begins.
// Finally waits for the stream to drain and releases every resource.
//
// Returns 0 on success, 1 if any CUDA runtime call reported an error.
int main()
{
    const size_t frameBytes = (size_t)MAX_FRAMESIZE * sizeof(cuDoubleComplex);

    cudaStream_t copyStream = NULL;   // stream that carries all the copies
    cuDoubleComplex *h_tmp = NULL;    // pinned host frame (copy source)
    cuDoubleComplex *d_tmp = NULL;    // device buffer holding MAX_ENSEMBLE frames

    // Select the device first so the stream and both allocations are created
    // on it; stop at the first failing call.
    bool ok = cudaOk(cudaSetDevice(0), "cudaSetDevice")
        && cudaOk(cudaStreamCreateWithFlags(&copyStream, cudaStreamNonBlocking),
                  "cudaStreamCreateWithFlags")
        && cudaOk(cudaMallocHost(&h_tmp, frameBytes), "cudaMallocHost")
        && cudaOk(cudaMalloc(&d_tmp, frameBytes * MAX_ENSEMBLE), "cudaMalloc");

    if (ok) {
        for (int i = 0; i < MAX_FRAMESIZE; i++) {
            // Two separate statements keep the real-then-imaginary draw order.
            const double re = rand() / (double)RAND_MAX;
            const double im = rand() / (double)RAND_MAX;
            h_tmp[i] = { re, im };
        }

        for (int i = 0; ok && i < MAX_ENSEMBLE; i++) {
            ok = cudaOk(cudaMemcpyAsync(d_tmp + (size_t)i * MAX_FRAMESIZE, h_tmp, frameBytes,
                                        cudaMemcpyHostToDevice, copyStream),
                        "cudaMemcpyAsync");
            // Portable replacement for the MSVC-only, deprecated _sleep(5).
            std::this_thread::sleep_for(std::chrono::milliseconds(5));
        }

        // Always drain the stream before freeing the buffers it may still be
        // reading from or writing to, even if an enqueue failed.
        ok = cudaOk(cudaStreamSynchronize(copyStream), "cudaStreamSynchronize") && ok;
    }

    // Release only what was actually created.
    if (h_tmp != NULL)
        ok = cudaOk(cudaFreeHost(h_tmp), "cudaFreeHost") && ok;
    if (d_tmp != NULL)
        ok = cudaOk(cudaFree(d_tmp), "cudaFree") && ok;
    if (copyStream != NULL)
        ok = cudaOk(cudaStreamDestroy(copyStream), "cudaStreamDestroy") && ok;
    ok = cudaOk(cudaDeviceReset(), "cudaDeviceReset") && ok;

    return ok ? 0 : 1;
}

When I run on different CPU platforms, I have different behaviors:
platform1: QuadCore Intel Xeon E5-1620 v4, 3800 MHz (38 x 100)
platform2: QuadCore Intel Xeon E3-1270 v5, 4000 MHz (40 x 100)
The two platforms have the same graphics card (NVIDIA Quadro RTX 4000), the same operating system, and the same drivers.
Use Nsight to observe the running status:
platform1:


download

Platform 1 behaves as expected: each asynchronous copy call is followed immediately by the actual transfer. Platform 2 does not: no transfer takes place after each asynchronous copy call is issued, and the transfers do not start until cudaStreamSynchronize is called.
I also found that the graphics card's asyncEngineCount attribute reports different values on the two platforms (2 on platform 1, 6 on platform 2).
I want to know why this is the result. If I want my program to execute in the way of platform1, what should I pay attention to when choosing CPU?

Which are?

Microsoft Windows 10 Pro 10.0.19044.2130 (Win10 21H2 November 2021 Update)
The graphics card driver is the latest version of nvidia : Quadro RTX 4000 (517.40) WHQL

Do both systems have the same setting for hardware accelerated GPU scheduling?

1 Like

Thank you for solving my problem. I checked this setting on both systems, and it really is different: the E3 platform did not have it turned on.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.