CUDA kernel 10x slower when operating on cudaMallocManaged memory even when prefetched

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <chrono>
#include <iostream>
using namespace std;

class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;

public:
    void startCounter() {
        start = std::chrono::system_clock::now();
    }

    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
    }

    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
    }

    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
                / 1000000.0;
    }
};

__global__
void HelloWorld()
{
  printf("Hello world\n");
}

volatile double dummy = 0;

__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
  int start = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  for (int i = start; i < N; i += stride) {
    output[i] = x[i] * y[i];
  }
}


int main()
{
  MyTimer timer;
  srand(time(NULL));
  HelloWorld<<<1,1>>>();
  cudaDeviceSynchronize(); // warm up: force context creation so it isn't timed below

  timer.startCounter();
  int N = 2000 * 2000;
  float* h_a = new float[N];
  float* h_b = new float[N];
  float* h_c = new float[N];
  float* h_res = new float[N];
  for (int i = 0; i < N; i++) {
    h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_c[i] = h_a[i] * h_b[i];
  }
  dummy = timer.getCounterMsPrecise();

  timer.startCounter();
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, N * sizeof(float));
  cudaMalloc(&d_b, N * sizeof(float));
  cudaMalloc(&d_c, N * sizeof(float));
  dummy = timer.getCounterMsPrecise();
  cout << "cudaMalloc cost = " << dummy << "\n";

  timer.startCounter();
  cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);  
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "H2D copy cost = " << dummy << "\n";
  
  timer.startCounter();
  constexpr int GRID_DIM = 256;
  constexpr int BLOCK_DIM = 256;
  multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "kernel cost = " << dummy << "\n";

  timer.startCounter();
  cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "D2H copy cost = " << timer.getCounterMsPrecise() << "\n";

  for (int i = 0; i < N; i++) {
    if (h_res[i] != h_c[i]) {
      cout << "error\n";
      exit(1);
    }
  }

  delete[] h_a; delete[] h_b; delete[] h_c; delete[] h_res;
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
  return 0;
}

If I use normal cudaMalloc, the result is

Hello world
cudaMalloc cost = 0.599463
H2D copy cost = 5.16785
kernel cost = 0.109068
D2H copy cost = 7.18768

but if I use cudaMallocManaged, it becomes

Hello world
cudaMalloc cost = 0.116722
H2D copy cost = 8.26673
kernel cost = 1.70356
D2H copy cost = 6.8841

Why is there such a big performance drop? The code has manually copied the data to the device side, so shouldn't it be exactly the same as regular cudaMalloc-ed device memory?

The use case is a matrix library, where the user can treat a matrix as a regular CPU matrix for convenience, but most heavy operations use the GPU to compute. Basically, it's guaranteed that before any GPU kernel is called, all the data has already been prefetched to the GPU side.

Thanks!

In what way have you prefetched it? I don't see any prefetching. I don't recommend using cudaMemcpy with a managed allocation. For managed memory questions, it's also important to know whether you are on Windows or Linux, and what device you are running on.
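
A quick way to check the relevant device capability is the concurrentManagedAccess attribute. A minimal sketch, assuming a single device (error checking omitted):

int device = 0, concurrentManagedAccess = 0;
cudaGetDevice(&device);
cudaDeviceGetAttribute(&concurrentManagedAccess, cudaDevAttrConcurrentManagedAccess, device);
// 1 on Linux with Pascal or newer GPUs; 0 on Windows and on pre-Pascal devices,
// where managed memory behaves quite differently.
printf("concurrentManagedAccess = %d\n", concurrentManagedAccess);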

Question has been answered on Stack Overflow, thanks @robert!

To summarize, it's because d_c isn't touched before the kernel is called, so the kernel takes page faults on its first writes to the output buffer. If cudaMemPrefetchAsync or cudaMemcpy is called on d_c before the kernel launch, then it runs at the expected speed. Is it because device memory allocated by cudaMalloc or cudaMallocManaged isn't actually allocated until it's touched?
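
A minimal sketch of that fix, assuming a single device (error checking omitted): prefetch the output buffer to the GPU before the launch.

int device = 0;
cudaGetDevice(&device);
// Migrate d_c's pages to the GPU before the launch so the kernel does not
// take page faults on its first writes to the output buffer.
cudaMemPrefetchAsync(d_c, N * sizeof(float), device);
cudaDeviceSynchronize();  // ensure the migration finishes before timing the kernel

multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
cudaDeviceSynchronize();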

Also, I should avoid using cudaMemcpy with managed memory.
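
For future readers, here is a sketch of what a cudaMemcpy-free managed workflow might look like, reusing N, multiply, GRID_DIM, and BLOCK_DIM from the listing above (Linux and a device with concurrentManagedAccess assumed; error checking omitted):

float *a, *b, *c;
cudaMallocManaged(&a, N * sizeof(float));
cudaMallocManaged(&b, N * sizeof(float));
cudaMallocManaged(&c, N * sizeof(float));

// Managed memory is directly host-accessible: fill the inputs in place,
// with no separate host arrays and no cudaMemcpy.
for (int i = 0; i < N; i++) {
  a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
  b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
}

// Migrate all three buffers, including the output, to the GPU up front.
int device = 0;
cudaGetDevice(&device);
cudaMemPrefetchAsync(a, N * sizeof(float), device);
cudaMemPrefetchAsync(b, N * sizeof(float), device);
cudaMemPrefetchAsync(c, N * sizeof(float), device);

multiply<<<GRID_DIM, BLOCK_DIM>>>(N, c, a, b);

// Prefetch the result back before reading it on the host.
cudaMemPrefetchAsync(c, N * sizeof(float), cudaCpuDeviceId);
cudaDeviceSynchronize();

cudaFree(a); cudaFree(b); cudaFree(c);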