Hi,
I’m calling cudaMemcpy2DAsync like so:
cudaStream_t _gpuCopyStream;
cudaStreamCreateWithFlags(&_gpuCopyStream, cudaStreamNonBlocking);
auto t_start_check1 = std::chrono::high_resolution_clock::now();
cudaMemcpy2DAsync(…, … , …, …, …, cudaMemcpyDeviceToDevice, _gpuCopyStream);
auto t_end_check1 = std::chrono::high_resolution_clock::now();
float total_check1 = std::chrono::duration<float, std::milli>(t_end_check1 - t_start_check1).count();
printf("Time taken to cross asynchronous function: %f ms.\n", total_check1);
cudaStreamSynchronize(_gpuCopyStream);
When I measure the host-side time taken by the cudaMemcpy2DAsync call itself, it occasionally spikes (as shown in the graph below). I could understand such spikes if the copy were a “cudaMemcpyHostToDevice” transfer. However, in this case it is a “cudaMemcpyDeviceToDevice” copy.
Any reason for the spikes? Thanks.