I don’t see a performance difference like you are suggesting (3 ms vs. 8 ms), but I do note that the performance seems to be better using cudaHostAlloc rather than a host allocation followed by cudaHostRegister, so my suggestion would be to use cudaHostAlloc:
# cat t198.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
size_t sz = 1048576ULL*32;
template <typename T>
__global__ void write_kernel(T *d, size_t s, T val){
  for (size_t i = blockIdx.x*blockDim.x+threadIdx.x; i < s; i+=gridDim.x*blockDim.x)
    d[i] = val;
}
template <typename T>
__global__ void read_kernel(const T *d, size_t s, T tval, T *r){
  T val = 0;
  for (size_t i = blockIdx.x*blockDim.x+threadIdx.x; i < s; i+=gridDim.x*blockDim.x)
    val += d[i];
  if (val == tval) *r = val;
}
using mt = float;
int main(){
#ifndef USE_HALLOC
  mt *d = new mt[sz];
  cudaHostRegister(d, sizeof(*d)*sz, /* cudaHostRegisterPortable | */ cudaHostAllocMapped);
#else
  mt *d;
  cudaHostAlloc(&d, sizeof(*d)*sz, cudaHostAllocDefault);
#endif
  mt *r;
  cudaMalloc(&r, sizeof(mt));
  memset(d, 0, sizeof(*d)*sz);
  cudaMemset(r, 0, sizeof(*r));
  // warm-up
  write_kernel<<<3*58, 512>>>(d, sz, 1.0f);
  cudaDeviceSynchronize();
  read_kernel<<<3*58, 512>>>(d, sz, 1.0f, r);
  cudaDeviceSynchronize();
  unsigned long long dt = dtime_usec(0);
  write_kernel<<<3*58, 512>>>(d, sz, 1.0f);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  std::cout << "write kernel time: " << dt/(float)USECPSEC << "s" << std::endl;
  dt = dtime_usec(0);
  read_kernel<<<3*58, 512>>>(d, sz, 1.0f, r);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  std::cout << "read kernel time: " << dt/(float)USECPSEC << "s" << std::endl;
}
# nvcc -o t198 t198.cu
# ./t198
write kernel time: 0.018488s
read kernel time: 0.016026s
# nvcc -o t198 t198.cu -DUSE_HALLOC
# ./t198
write kernel time: 0.011183s
read kernel time: 0.010902s
#
In the cudaHostAlloc case, there is about a 2.5% difference between read and write speeds for 128MB. In the cudaHostRegister case, there is about a 13% difference between read and write speeds, and compared to the cudaHostAlloc case it is slower by around 40% or more. (Adding the cudaHostRegisterPortable flag doesn’t seem to affect the data. And cudaHostAllocMapped is implied in any UVA-enabled setting, which for practical purposes means all modern CUDA setups.)
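If you want to confirm the UVA point on a particular setup, here is a minimal sketch (my addition, not part of the test program above) that just queries the relevant device properties; both fields are standard members of cudaDeviceProp:

#include <cstdio>
int main(){
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  // both report 1 on essentially any modern 64-bit CUDA setup
  printf("unifiedAddressing: %d\n", prop.unifiedAddressing);
  printf("canMapHostMemory:  %d\n", prop.canMapHostMemory);
}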
I don’t have an explanation for the various differences. Like I said, based on the data above, I would prefer cudaHostAlloc. Although both are methods to pin memory, I don’t know of any claims by NVIDIA that they result in identical behavior or circumstances, and the underlying mechanics of each are not documented or specified by NVIDIA.
CUDA 12.2, L4 GPU
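As a side note, the two methods also pair with different teardown calls. The test programs here never free their buffers, but a minimal sketch of the matching cleanup (hypothetical helper names, using float for the element type) would look like this:

// cleanup for the cudaHostRegister path: unpin first, then free the ordinary host allocation
void cleanup_registered(float *d){
  cudaHostUnregister(d);
  delete [] d;
}

// cleanup for the cudaHostAlloc path: a single call releases the pinned allocation
void cleanup_hostalloc(float *d){
  cudaFreeHost(d);
}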
I thought there could be an intersection with the cudaHostAllocPortable flag in a multi-GPU case, but when I ran the following code on a DGX-H100 I observed a somewhat similar scenario:
$ cat t4.cu
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
size_t sz = 1048576ULL*32;
template <typename T>
__global__ void write_kernel(T *d, size_t s, T val){
  for (size_t i = blockIdx.x*blockDim.x+threadIdx.x; i < s; i+=gridDim.x*blockDim.x)
    d[i] = val;
}
template <typename T>
__global__ void read_kernel(const T *d, size_t s, T tval, T *r){
  T val = 0;
  for (size_t i = blockIdx.x*blockDim.x+threadIdx.x; i < s; i+=gridDim.x*blockDim.x)
    val += d[i];
  if (val == tval) *r = val;
}
using mt = float;
int main(){
  cudaSetDevice(0);
  cudaSetDevice(1);
  cudaSetDevice(0);
#ifndef USE_HALLOC
  mt *d = new mt[sz];
  cudaHostRegister(d, sizeof(*d)*sz, cudaHostRegisterPortable | cudaHostAllocMapped);
#else
  mt *d;
  cudaHostAlloc(&d, sizeof(*d)*sz, cudaHostAllocDefault);
#endif
  mt *r;
  cudaMalloc(&r, sizeof(mt));
  memset(d, 0, sizeof(*d)*sz);
  cudaMemset(r, 0, sizeof(*r));
  // warm-up
  write_kernel<<<3*58, 512>>>(d, sz, 1.0f);
  cudaDeviceSynchronize();
  read_kernel<<<3*58, 512>>>(d, sz, 1.0f, r);
  cudaDeviceSynchronize();
  unsigned long long dt = dtime_usec(0);
  write_kernel<<<3*58, 512>>>(d, sz, 1.0f);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  std::cout << "write kernel time: " << dt/(float)USECPSEC << "s" << std::endl;
  dt = dtime_usec(0);
  read_kernel<<<3*58, 512>>>(d, sz, 1.0f, r);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  std::cout << "read kernel time: " << dt/(float)USECPSEC << "s" << std::endl;
}
$ nvcc -o t4 t4.cu
$ ./t4
write kernel time: 0.003816s
read kernel time: 0.004044s
$ nvcc -o t4 t4.cu -DUSE_HALLOC
$ ./t4
write kernel time: 0.002561s
read kernel time: 0.002675s
$
Overall it’s faster because this machine has a newer PCIe generation.
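To put that in rough perspective, here is a back-of-the-envelope conversion of the two cudaHostAlloc write-kernel times into effective bandwidth (my arithmetic from the timings printed above; the buffer is 32*1048576 floats = 128 MiB):

#include <iostream>
int main(){
  const double bytes = 1048576.0 * 32 * sizeof(float);            // 134217728 bytes
  std::cout << "L4:       " << bytes/0.011183/1e9 << " GB/s\n";   // roughly 12 GB/s
  std::cout << "DGX-H100: " << bytes/0.002561/1e9 << " GB/s\n";   // roughly 52 GB/s
}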
In the cudaHostAlloc case, the performance difference between read and write is very small, about 4%. In the cudaHostRegister case, the perf difference between read and write is larger, about 6%, and the cudaHostRegister case is overall slower than the cudaHostAlloc case by about 34%.