I wrote a simple asynchronous copy test program:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <npp.h>
#define MAX_LINEPOINTS 650
#define MAX_BEAMS 32
#define MAX_TXNUMBER 17
#define MAX_ENSEMBLE 40
#define MAX_FRAMESIZE (MAX_LINEPOINTS * MAX_BEAMS * MAX_TXNUMBER)
int main()
{
cudaError_t cudaStatus;
cudaStream_t cudaStream1 = NULL; // CUDA stream
cudaStream_t cudaStream2 = NULL; // CUDA stream
cudaStreamCreateWithFlags(&cudaStream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&cudaStream2, cudaStreamNonBlocking);
//
cudaStatus = cudaSetDevice(0);
//
cuDoubleComplex *h_tmp;
cudaStatus = cudaMallocHost(&h_tmp, MAX_FRAMESIZE * sizeof(cuDoubleComplex));
for (int i = 0; i < MAX_FRAMESIZE; i++)
h_tmp[i] = { rand() / (double)RAND_MAX, rand() / (double)RAND_MAX };
//
cuDoubleComplex *d_tmp;
cudaStatus = cudaMalloc(&d_tmp, MAX_FRAMESIZE * MAX_ENSEMBLE * sizeof(cuDoubleComplex));
//
for (int i = 0; i < MAX_ENSEMBLE; i++) {
cudaStatus = cudaMemcpyAsync(d_tmp + i * MAX_FRAMESIZE, h_tmp, MAX_FRAMESIZE * sizeof(cuDoubleComplex),
cudaMemcpyHostToDevice, cudaStream1);
_sleep(5);
}
cudaStatus = cudaStreamSynchronize(cudaStream1);
//
cudaFreeHost(h_tmp);
cudaFree(d_tmp);
cudaStatus = cudaDeviceReset();
return 0;
}
When I run it on two different CPU platforms, I observe different behavior:
platform1: QuadCore Intel Xeon E5-1620 v4, 3800 MHz (38 x 100)
platform2: QuadCore Intel Xeon E3-1270 v5, 4000 MHz (40 x 100)
The two platforms have the same graphics card (NVIDIA Quadro RTX 4000), the same operating system, and the same drivers.
I used Nsight to observe the execution timeline:
platform1:
download
On platform1 the program runs as expected: each asynchronous copy call is followed immediately by the actual transfer. On platform2, however, something is wrong: no transfer takes place after each asynchronous copy call is issued, and the transfers do not start until cudaStreamSynchronize is called.
I also found that the asyncEngineCount attribute reported for the graphics card differs between the two platforms (2 on platform1, 6 on platform2).
I would like to know why this happens. If I want my program to behave the way it does on platform1, what should I pay attention to when choosing a CPU?