I’m currently writing code for a system that has multiple processes, some CPU and GPU intensive. I noticed the performance I achieve for CUDA kernel functions during runtime is noticeably different (dozens of percent slower) than the results when I simply run those functions by themselves. One of the reasons I found is that calls to functions such as “std::this_thread::sleep_for” and condition_variable::wait cause later runs of CUDA kernel functions to be much slower.
I created a code snippet for example:
CMakeLists.txt
cmake_minimum_required(VERSION 3.22)

# Prefer passing -DCMAKE_CUDA_COMPILER=... on the configure command line;
# kept here so existing builds keep finding nvcc in the default location.
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")

# Declaring CUDA in project() replaces the separate enable_language(CUDA).
project(CudaSlow LANGUAGES CXX CUDA)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(Threads REQUIRED)
# CUDAToolkit (CMake >= 3.17) is the modern replacement for the deprecated
# FindCUDA module; it provides the CUDA::cudart imported target, which
# carries both the include directories and the runtime library, so the
# manual ${CUDA_INCLUDE_DIRS}/${CUDA_LIBRARIES} plumbing is unnecessary.
find_package(CUDAToolkit 10.2 REQUIRED)

add_executable(CudaSlow main.cu)

target_link_libraries(CudaSlow PUBLIC
    Threads::Threads
    CUDA::cudart
)
main.cu
#include <cuda_device_runtime_api.h>
#include <cuda_runtime.h>

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <thread>
// Element-wise product: C[j] = A[j] * B[j] for j in [0, N).
//
// Grid-stride loop: valid for any <<<blocks, threads>>> configuration.
// For the single-block launches used below (gridDim.x == 1) this reduces
// exactly to the original threadIdx.x / blockDim.x stride, so behavior
// for existing callers is unchanged.
__global__ void Func(float* A, float* B, float* C, int N) {
    int stride = gridDim.x * blockDim.x;
    for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < N; j += stride) {
        C[j] = A[j] * B[j];
    }
}
int main() {
    // Benchmark: time the same kernel launch + sync with and without a 1 ms
    // host sleep between iterations, to demonstrate the slowdown the sleep
    // introduces.
    //
    // NOTE(review): the usual cause of this symptom is the GPU down-clocking
    // while idle during the sleep, so the next launch runs at reduced clocks;
    // locking application clocks (e.g. `nvidia-smi -lgc`) is worth testing --
    // confirm on the target machine.

    // Abort with a readable message on any CUDA runtime error. Kernel
    // launches report configuration errors only via cudaGetLastError(), and
    // execution errors only at the next synchronizing call, so both are
    // routed through this check.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Example function variables. Inputs are deliberately left uninitialized:
    // only launch/execution time is measured, the values do not matter.
    const int N = 100000;  // element count (1e5, written as an exact int)
    float* A = nullptr;
    float* B = nullptr;
    float* C = nullptr;
    check(cudaMalloc(&A, N * sizeof(float)), "cudaMalloc A");
    check(cudaMalloc(&B, N * sizeof(float)), "cudaMalloc B");
    check(cudaMalloc(&C, N * sizeof(float)), "cudaMalloc C");
    cudaStream_t stream;
    check(cudaStreamCreate(&stream), "cudaStreamCreate");

    const int iterations = 2000;

    // Warm-up launch so one-time costs (context creation, module load) are
    // excluded from the first measured iteration.
    Func<<<1, 512, 0, stream>>>(A, B, C, N);
    check(cudaGetLastError(), "warm-up launch");
    check(cudaStreamSynchronize(stream), "warm-up sync");

    // Run multiple iterations without std::this_thread::sleep_for.
    {
        double totalTime = 0.0;
        for (int i = 0; i < iterations; ++i) {
            // steady_clock is monotonic and the right clock for intervals;
            // system_clock can jump if the wall clock is adjusted.
            auto start = std::chrono::steady_clock::now();
            Func<<<1, 512, 0, stream>>>(A, B, C, N);
            check(cudaGetLastError(), "kernel launch");
            check(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
            totalTime += std::chrono::duration<double>(
                std::chrono::steady_clock::now() - start).count();
        }
        std::cout << "Average time without std::this_thread::sleep_for is "
                  << totalTime / iterations << std::endl;
    }

    // Run multiple iterations with std::this_thread::sleep_for.
    {
        double totalTime = 0.0;
        for (int i = 0; i < iterations; ++i) {
            auto start = std::chrono::steady_clock::now();
            Func<<<1, 512, 0, stream>>>(A, B, C, N);
            check(cudaGetLastError(), "kernel launch");
            check(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
            totalTime += std::chrono::duration<double>(
                std::chrono::steady_clock::now() - start).count();
            // Call to sleep that is outside the time calculation and after
            // synchronizing the stream.
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
        std::cout << "Average time with std::this_thread::sleep_for is "
                  << totalTime / iterations << std::endl;
    }

    // Release device resources (previously leaked).
    check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    check(cudaFree(A), "cudaFree A");
    check(cudaFree(B), "cudaFree B");
    check(cudaFree(C), "cudaFree C");
    return 0;
}
The output when I run this program on a GeForce RTX 3090 with CUDA 11.4:
Average time without std::this_thread::sleep_for is 7.91676e-05
Average time with std::this_thread::sleep_for is 0.000129487