Hi experts, I want to test the read/write bandwidth between device memory and host memory from inside a kernel. I tested on an L40S node and found that writing from device memory to host memory inside the kernel is very slow, while writing from host memory to device memory matches PCIe bandwidth. Below is the log of my program:
GPU0 : device memory write to host memory :3.52317 GB/s
GPU0 : host memory write to device memory :25.7685 GB/s
Below is my test code; you can compile it directly with nvcc. My question: is writing from device memory to host memory inside a kernel inherently slow, or is there something wrong with my program? (A colleague also tested on an A100 PCIe node and reached the same conclusion.)
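For reference, a build line along these lines should work (membw.cu is just the file name I assume here; pick -arch to match your GPU, e.g. sm_89 for L40S or sm_80 for A100):

nvcc -O3 -arch=sm_89 -o membw membw.cu
./membw

The full program: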
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>
#define CHECK_CUDA_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Grid-stride copy kernel. size_t indices avoid signed/unsigned mixing and
// overflow for element counts near or above INT_MAX.
__global__ void memcpy_kernel(float *dst, const float *src, size_t size) {
    size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = idx; i < size; i += stride) {
        dst[i] = src[i];
    }
}
int main(int argc, char **argv) {
    CHECK_CUDA_ERROR(cudaSetDevice(0));
    float *d_data;
    float *h_data;
    size_t mem_size = 1024ULL * 1024 * 1024; // 1 GiB
    float elapsed;
    cudaEvent_t start, stop;
    cudaStream_t stream;
    CHECK_CUDA_ERROR(cudaMalloc(&d_data, mem_size));
    // Pinned host memory, so the kernel can dereference h_data directly over PCIe.
    CHECK_CUDA_ERROR(cudaMallocHost(&h_data, mem_size));
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
    // Fill the host buffer with a deterministic pattern; size_t arithmetic
    // avoids the signed-int overflow that i*8 + i%8 + 3*i hits for large i.
    for (size_t i = 0; i < mem_size / sizeof(float); ++i) {
        h_data[i] = static_cast<float>(i * 8 + i % 8 + 3 * i);
    }
    // Initialize the full device buffer before benchmarking.
    CHECK_CUDA_ERROR(cudaMemcpyAsync(d_data, h_data, mem_size, cudaMemcpyHostToDevice, stream));
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < mem_size / sizeof(float); ++i) {
        h_data[i] = -1.0f;
    }
    // Warm up both directions before timing.
    for (int i = 0; i < 10; ++i) {
        memcpy_kernel<<<128, 1024, 0, stream>>>(h_data, d_data, mem_size / sizeof(float));
        memcpy_kernel<<<128, 1024, 0, stream>>>(d_data, h_data, mem_size / sizeof(float));
    }
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
    // Time 10 launches that read device memory and write pinned host memory.
    CHECK_CUDA_ERROR(cudaEventRecord(start, stream));
    for (int i = 0; i < 10; ++i) {
        memcpy_kernel<<<128, 1024, 0, stream>>>(h_data, d_data, mem_size / sizeof(float));
    }
    CHECK_CUDA_ERROR(cudaEventRecord(stop, stream));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsed, start, stop));
    // elapsed is in milliseconds, so bytes / ms / 1e6 yields GB/s.
    std::cout << "GPU0 : device memory write to host memory :" << 10 * mem_size / elapsed / 1e6 << " GB/s" << std::endl;
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
    // Time 10 launches that read pinned host memory and write device memory.
    CHECK_CUDA_ERROR(cudaEventRecord(start, stream));
    for (int i = 0; i < 10; ++i) {
        memcpy_kernel<<<128, 1024, 0, stream>>>(d_data, h_data, mem_size / sizeof(float));
    }
    CHECK_CUDA_ERROR(cudaEventRecord(stop, stream));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsed, start, stop));
    std::cout << "GPU0 : host memory write to device memory :" << 10 * mem_size / elapsed / 1e6 << " GB/s" << std::endl;
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
    CHECK_CUDA_ERROR(cudaFree(d_data));
    CHECK_CUDA_ERROR(cudaFreeHost(h_data));
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
    return 0;
}
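As a cross-check, I also want to compare against the DMA copy engines. Here is a minimal sketch of that baseline (my own sketch, using only the standard cudaMemcpyAsync/cudaEvent APIs, not part of the test above); if its device-to-host number reaches PCIe bandwidth while the kernel version stays around 3.5 GB/s, the gap would be specific to in-kernel stores to host memory:

#include <cstdio>
#include <cuda_runtime.h>

// Baseline sketch: time the same 1 GiB transfers with cudaMemcpyAsync, which
// uses the copy engines instead of SM load/store instructions over PCIe.
int main() {
    size_t bytes = 1024ULL * 1024 * 1024;
    float *d, *h;
    cudaMalloc(&d, bytes);
    cudaMallocHost(&h, bytes); // pinned, same as in the kernel test
    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    float ms;
    cudaEventRecord(t0);
    for (int i = 0; i < 10; ++i)
        cudaMemcpyAsync(h, d, bytes, cudaMemcpyDeviceToHost);
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    cudaEventElapsedTime(&ms, t0, t1);
    printf("cudaMemcpy D2H: %f GB/s\n", 10.0 * bytes / ms / 1e6);
    cudaEventRecord(t0);
    for (int i = 0; i < 10; ++i)
        cudaMemcpyAsync(d, h, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    cudaEventElapsedTime(&ms, t0, t1);
    printf("cudaMemcpy H2D: %f GB/s\n", 10.0 * bytes / ms / 1e6);
    cudaFree(d);
    cudaFreeHost(h);
    return 0;
}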