I have a program in which, every 500 microseconds, a float array of size 709 is generated on the CPU; the GPU then computes a result from it, which is copied back to the CPU. The process looks roughly like this:
float inference(const std::vector<float> &input_h) {
    cudaMemcpy(input_d, input_h.data(), size * sizeof(float), cudaMemcpyHostToDevice);
    kernel<<<...>>>(input_d, output_d, size);
    cudaMemcpy(output_h, output_d, size * sizeof(float), cudaMemcpyDeviceToHost);
    return output_h[0];  // simplified: the result copied back to the host is returned
}
When running it, I found that the time spent in inference occasionally increases by more than a factor of ten. I profiled with nsys and discovered that the time spent in the cudaMemcpy calls fluctuates.
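For reference, a typical nsys invocation for this kind of trace is something like the following (./my_program is a placeholder for the actual binary):

nsys profile --trace=cuda -o report ./my_program
nsys stats report.nsys-rep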
To analyze this further, I measured the time spent on cudaMemcpy with the following program:
#include <cuda_runtime.h>
#include <iostream>
#include <random>

#define BLOCK_SIZE 1024

#define CHECK_CUDA_ERROR(call)                                                \
    do {                                                                      \
        cudaError_t err = call;                                               \
        if (err != cudaSuccess) {                                             \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \
                      << cudaGetErrorString(err) << std::endl;                \
            exit(1);                                                          \
        }                                                                     \
    } while (0)

__global__ void demo_kernel(float *input, float *output, size_t size) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] * 3;
    }
}
void inference(float *input_h, float *input_d, float *output_d, size_t size) {
    cudaMemcpy(input_d, input_h, size * sizeof(float), cudaMemcpyHostToDevice);
    size_t num_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    demo_kernel<<<num_blocks, BLOCK_SIZE>>>(input_d, output_d, size);
    // The D2H copy of the output is intentionally omitted in this test.
}
void testMemoryTransfer(size_t size) {
    float *input_h;
    float *input_d;
    float *output_d;
    CHECK_CUDA_ERROR(cudaMallocHost(&input_h, size * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&input_d, size * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&output_d, size * sizeof(float)));

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<> dis(0.0, 1.0);

    cudaEvent_t start, stop;
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));

    for (int i = 0; i < 10000; ++i) {
        // generate some random data
        for (size_t j = 0; j < size; ++j) {
            input_h[j] = dis(gen);
        }
        // test the inference time
        CHECK_CUDA_ERROR(cudaEventRecord(start, 0));
        inference(input_h, input_d, output_d, size);
        CHECK_CUDA_ERROR(cudaEventRecord(stop, 0));
        CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
        float milliseconds = 0;
        CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
        std::cout << milliseconds << std::endl;
    }

    // Clean up
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));
    CHECK_CUDA_ERROR(cudaFree(input_d));
    CHECK_CUDA_ERROR(cudaFree(output_d));
    CHECK_CUDA_ERROR(cudaFreeHost(input_h));
}
int main() {
    CHECK_CUDA_ERROR(cudaSetDevice(0));
    testMemoryTransfer(709);
    return 0;
}
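As an aside, the same copy could also be timed from the host side (this is not part of the test above, just a sketch of a possible cross-check). Replacing the event timing inside the loop with something like the lines below would show whether the slow iterations are also visible to the CPU; it requires #include <chrono> and reuses the same input_h/input_d buffers:

        // Host-side timing of only the H2D copy, using std::chrono
        auto t0 = std::chrono::steady_clock::now();
        CHECK_CUDA_ERROR(cudaMemcpy(input_d, input_h, size * sizeof(float), cudaMemcpyHostToDevice));
        auto t1 = std::chrono::steady_clock::now();
        std::cout << std::chrono::duration<double, std::milli>(t1 - t0).count() << std::endl;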
Run nvcc test.cu -o test && ./test > a.txt, and then use the following Python program to visualize the results:
import matplotlib.pyplot as plt
import numpy as np
data = np.loadtxt("a.txt")
plt.figure(figsize=(20, 6))
plt.plot(data)
plt.ylabel("time (ms)")
plt.yscale('log')
plt.show()
I observed that cudaMemcpy does indeed slow down significantly from time to time. Why is this happening, and how can I avoid these large fluctuations in data-transfer time?