Hi, I found a strange behavior that sometimes cudaMemPrefetchAsync seems to be synchronous with a simple test program below. This program issues API with a sequence Prefetch/Prefetch/Prefetch/Prefetch/KernelLaunch/KernelLaunch/KernelLaunch/KernelLaunch (PPPPKKKK).
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>
// sleep_cycles >= 1e7 or it would cause strange behavior
__global__ void gpu_sleep(long long *d_o, long long sleep_cycles) {
long long start = clock64();
long long cycles_elapsed;
do {
cycles_elapsed = clock64() - start;
} while (cycles_elapsed < sleep_cycles);
if (d_o != NULL) {
*d_o = cycles_elapsed;
}
}
cudaStream_t kernel_stream;
const long long GPU_SLEEP_TIME = (1 << 24);
const long long DATA_SIZE = (1 << 30); // large data
#define BLOCK_NUM 4
long long *d_ptr[BLOCK_NUM];
long long d_ptr_size[BLOCK_NUM] = {DATA_SIZE, DATA_SIZE, DATA_SIZE, DATA_SIZE};
int main() {
for (int i = 0; i < BLOCK_NUM; i++)
cudaMallocManaged(&d_ptr[i], d_ptr_size[i]);
cudaStreamCreate(&kernel_stream);
for (int i = 0; i < BLOCK_NUM; i++) {
cudaMemPrefetchAsync(d_ptr[i], d_ptr_size[i], 0, kernel_stream);
}
for (int i = 0; i < BLOCK_NUM; i++) {
gpu_sleep<<<1, 1, 0, kernel_stream>>>(d_ptr[i], GPU_SLEEP_TIME);
}
for (int i = 0; i < BLOCK_NUM; i++)
cudaFree(d_ptr[i]);
cudaDeviceSynchronize();
}
With nvvp this would look like,
https://ibb.co/QpzQJdC
I have tested this program on both GeForce RTX 2080 Ti and TESLA-V100 with CUDA 10.2.
If we profile this program with nvprof, we could see some of cudaMemPrefetchAsync seems to execute synchronously regarded to the host.
If we change the API issue sequence from PPPPKKKK to PKPKPKPK, we could found the first cudaMemPrefetchAsync would also seem to be synchronous.
// change 33~38 lines to
for (int i = 0; i < BLOCK_NUM; i++) {
cudaMemPrefetchAsync(d_ptr[i], d_ptr_size[i], 0, kernel_stream);
gpu_sleep<<<1, 1, 0, kernel_stream>>>(d_ptr[i], GPU_SLEEP_TIME);
}
With nvvp this would look like,
https://ibb.co/hD6DXWq
Why there would be this strange behavior?
Thanks a lot for your help.