Hi,
I have a program that utilizes pinned memory allocated by cudaMallocHost to implement simultaneous H2D copy and kernel computation via cudaMemcpyAsync. However, some CPU code in it runs very slowly on TX1. The problem seems to be caused by zero-copy that disables memory cache on both CPU and GPU. This can be verified by the following program, which says reading from pinned memory achieves about 20% the bandwidth of reading a normal CPU buffer.
I think the best solution is to disable zero-copy on TX1; I want to use the same code for both TX1 and non-TX1 platforms.
Thanks for your help!
output on TX1:
pinned: memset: 8.567GiB/s
cpu buf: memset: 9.646GiB/s
cpu buf -> pinned: 3.639GiB/s
pinned -> cpu buf: 0.717GiB/s
cpu buf -> cpu buf: 3.519GiB/s
output on a normal linux PC:
pinned: memset: 6.724GiB/s
cpu buf: memset: 6.581GiB/s
cpu buf -> pinned: 8.910GiB/s
pinned -> cpu buf: 9.106GiB/s
cpu buf -> cpu buf: 8.907GiB/s
#include <cuda_runtime.h>
#include <vector>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
class Timer {
clock_t m_start;
public:
Timer():
m_start(clock())
{
}
double get_secs_reset() {
clock_t s = m_start;
m_start = clock();
return (m_start - s) / double(CLOCKS_PER_SEC);
}
};
#define CUDA_CHECK(expr) do { \
cudaError_t _err = (expr); \
if (_err != cudaSuccess) { \
fprintf(stderr, "%s failed: %s\n", #expr, cudaGetErrorString(_err)); \
} \
} while(0)
int main() {
constexpr size_t SIZE = 1024ull * 1024 * 1024 / 2;
constexpr double SIZE_G = SIZE / double(1024 * 1024 * 1024);
void *ptr;
CUDA_CHECK(cudaMallocHost(&ptr, SIZE));
std::vector<uint8_t> buf_cpu(SIZE), buf_cpu1(SIZE);
auto ptr1 = buf_cpu.data();
auto ptr2 = buf_cpu1.data();
Timer timer;
for (int i = 0; i < 3; ++ i) {
memset(ptr, i, SIZE);
printf("pinned: memset: %.3fGiB/s\n",
SIZE_G / timer.get_secs_reset());
memset(ptr1, i, SIZE);
printf("cpu buf: memset: %.3fGiB/s\n",
SIZE_G / timer.get_secs_reset());
memcpy(ptr, ptr1, SIZE);
printf("cpu buf -> pinned: %.3fGiB/s\n",
SIZE_G / timer.get_secs_reset());
memcpy(ptr1, ptr, SIZE);
printf("pinned -> cpu buf: %.3fGiB/s\n",
SIZE_G / timer.get_secs_reset());
memcpy(ptr2, ptr1, SIZE);
printf("cpu buf -> cpu buf: %.3fGiB/s\n",
SIZE_G / timer.get_secs_reset());
}
}