I thought I’d try using Unified Memory with cuBLAS, but I’m seeing extremely poor performance with it, as the code below illustrates.
The code multiplies two 4096x4096 matrices R=100 times. With Unified Memory this takes 56 s on a GTX 980 SC, while with explicit memory management it takes 5 s; moving the copies out of the loop (see the sketch after the code) would cut that further to 3 s.
Am I doing something wrong here, or do I just have unreasonable expectations for UM?
To compile the code on Linux, run
nvcc -O3 -arch=sm_35 cublas_um.cpp -lcublas
or, for explicit memory management,
nvcc -O3 -arch=sm_35 -DEXPLICIT cublas_um.cpp -lcublas
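To get wall-clock numbers like the ones above, it's enough to time the resulting binary, e.g.

time ./a.out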
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define N 4096
#define R 100
#define S (N*N*sizeof(float))
// Both cudaError_t and cublasStatus_t use 0 for success, so one macro covers both.
#define CHECK(x) do { if((x) != 0) { printf("%s %d\n", __FILE__, __LINE__); exit(1); } } while(false)
// Copy S bytes; cudaMemcpyDefault infers the direction from the pointers (requires UVA).
void cp(const float* src, float* dest) {
    CHECK(cudaMemcpy(dest, src, S, cudaMemcpyDefault));
}
// "Host" buffers: managed memory by default, pinned host memory with -DEXPLICIT.
void alloc_host(float** p) {
#ifdef EXPLICIT
    CHECK(cudaMallocHost(p, S));
#else
    CHECK(cudaMallocManaged(p, S));
#endif
    CHECK(cudaMemset(*p, 0, S));
}
// Device buffers; only used on the explicit path, but allocated in both builds.
void alloc_device(float** p) {
    CHECK(cudaMalloc(p, S));
    CHECK(cudaMemset(*p, 0, S));
}
int main()
{
    cublasHandle_t handle;
    float *a, *b, *c, *ad, *bd, *cd, alpha = 1, beta = 0;
    CHECK(cublasCreate(&handle));
    alloc_host(&a); alloc_device(&ad);
    alloc_host(&b); alloc_device(&bd);
    alloc_host(&c); alloc_device(&cd);
    for(int i = 0; i < R; ++i) {
#ifdef EXPLICIT
        // Explicit path: copy inputs in, run the GEMM on device buffers, copy the result out.
        cp(a, ad);
        cp(b, bd);
        CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                          &alpha, ad, N, bd, N, &beta, cd, N));
        cp(cd, c);
#else
        // Unified Memory path: cuBLAS works directly on the managed pointers.
        CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                          &alpha, a, N, b, N, &beta, c, N));
#endif
    }
    CHECK(cudaDeviceSynchronize());
    CHECK(cublasDestroy(handle));
    return 0;
}
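For reference, this is the "copies hoisted out of the loop" variant mentioned above, the one that gets the explicit version down to about 3 s. It's just a sketch using the same helpers and buffers; only the loop body of the EXPLICIT path changes:

// Copy the inputs once, run R back-to-back GEMMs, copy the result out once.
// (Replaces the per-iteration copies of the EXPLICIT path above.)
cp(a, ad);
cp(b, bd);
for(int i = 0; i < R; ++i) {
    CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                      &alpha, ad, N, bd, N, &beta, cd, N));
}
cp(cd, c);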