Hello!
During our upgrade to Jetpack 4.6.1 from Jetpack 4.4 , we noticed something strange around cudaMemcpyAsync calls.
A H2D copy can be blocked by an earlier issued D2H copy from another non blocking Cuda stream.
This can cause big delays if D2H copy is last call in a long computation sequence.
Here is a small reproduction of the problem. It reproduces the bug on Jetson Xavier NX with 100% success rate.
#include <thread>
#include <iostream>
#include <cuda_runtime_api.h>
__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n) y[i] = a*x[i] + y[i];
}
void kernel_call()
{
int N = 1<<20;
float *d_x;
float *d_y;
float *h_y;
cudaMalloc(&d_x, N*sizeof(float));
cudaMalloc(&d_y, N*sizeof(float));
cudaMallocHost(&h_y, N*sizeof(float));
cudaStream_t stream;
cudaEvent_t event;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cudaEventCreateWithFlags(&event, cudaEventBlockingSync | cudaEventDisableTiming);
for(size_t i = 0; i < 10000; ++i)
{
for(size_t j = 0; j < 50000; ++j)
{
saxpy<<<(N+255)/256, 256,0, stream>>>(N, 2.0f, d_x, d_y);
if( cudaPeekAtLastError() != cudaSuccess)
std::exit(1);
}
cudaMemcpyAsync(h_y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaEventRecord(event, stream);
cudaEventSynchronize(event);
}
cudaFree(d_x);
cudaFree(d_y);
cudaFreeHost(h_y);
}
void upload_call(cudaStream_t stream)
{
constexpr size_t N = 20'000'000;
uint8_t *pinned_mem;
uint8_t *device_mem;
cudaMalloc(&device_mem, N*sizeof(uint8_t));
cudaMallocHost(&pinned_mem, N*sizeof(uint8_t));
cudaEvent_t event;
cudaEventCreateWithFlags(&event, cudaEventBlockingSync | cudaEventDisableTiming);
for(size_t i = 0; i < 10000; ++i)
{
auto t0 = std::chrono::steady_clock::now();
auto line_size = 1024 * 3 * sizeof(uint8_t);
auto transfer_size = 1024 * line_size;
auto device_mem_start = device_mem + (i % 4) * transfer_size;
auto pinned_mem_start = pinned_mem + (i % 4) * transfer_size;
auto err = cudaMemcpy2DAsync(device_mem_start, line_size, pinned_mem_start, line_size, line_size, 1024, cudaMemcpyHostToDevice, stream);
cudaEventRecord(event, stream);
cudaEventSynchronize(event);
auto t1 = std::chrono::steady_clock::now();
if(err != cudaSuccess)
std::exit(1);
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count() <<std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
cudaFree(device_mem);
cudaFreeHost(pinned_mem);
}
int main()
{
cudaStream_t upload_stream;
cudaStreamCreateWithFlags(&upload_stream, cudaStreamNonBlocking);
std::thread upload(upload_call, upload_stream);
std::thread kernel(kernel_call);
kernel.join();
upload.join();
cudaStreamDestroy(upload_stream);
}
I hope you can create some bug fix patch for this in Jatpack 4.6.
Thanks for your help,
János