cudaMallocManaged() not working

d.bergsma · November 17, 2018, 12:35pm

I’m running the latest version of the CUDA Toolkit (10.0.130) on Windows 8.1 (x64), with the latest version of MS Visual Studio 2017 (15.9.1) and the latest driver (416.94); I also tried driver version 411.31. My video card is a GeForce GT640M LE (Kepler) with Compute Capability 3.0.

When I try to run this example from https://devblogs.nvidia.com/even-easier-introduction-cuda/:

#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays: y[i] = x[i] + y[i]
// for every i in [0, n).
//
// Written as a grid-stride loop over a one-dimensional launch: each thread
// starts at its flat global index and advances by the total number of threads
// in the grid, so every element is updated by exactly one thread regardless
// of the launch configuration. With add<<<1, 1>>> the single thread still
// visits all n elements in order.
__global__
void add(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

// Adds two 1M-element float arrays held in Unified Memory on the GPU and
// prints the largest deviation from the expected value 3.0f.
// Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
  const int N = 1<<20;
  float *x = 0, *y = 0;
  cudaError_t status;

  // Allocate Unified Memory - accessible from CPU or GPU.
  // The result must be checked: on failure the pointer is not valid and the
  // host loop below would dereference it.
  status = cudaMallocManaged(&x, N*sizeof(float));
  if (status != cudaSuccess) {
    std::cerr << "cudaMallocManaged(x) failed: "
              << cudaGetErrorString(status) << std::endl;
    return 1;
  }
  status = cudaMallocManaged(&y, N*sizeof(float));
  if (status != cudaSuccess) {
    std::cerr << "cudaMallocManaged(y) failed: "
              << cudaGetErrorString(status) << std::endl;
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // A launch does not return an error itself: configuration errors show up
  // in cudaGetLastError(), execution errors at the next synchronizing call.
  status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Wait for GPU to finish before accessing on host
    status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess) {
    std::cerr << "add kernel failed: "
              << cudaGetErrorString(status) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmaxf(maxError, fabsf(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}

The program freezes at the first cudaMallocManaged() call (line 17 of the listing above), and I get 100% CPU usage on one of my CPU cores. After two minutes, the driver is automatically reset and I get an access violation at line 22 (the first write to x[i]). The cudaMallocManaged() call returns an unknown error.

I sometimes manage to run the program outside Visual Studio’s debug mode once (just running the executable from the command line). But when I run the program again, I get the same issues as described above.

The sample program created automatically when creating a new CUDA project runs with no problems:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Forward declaration of the host helper defined after main(): adds the
// `size`-element int vectors a and b on the GPU, stores the sums in c, and
// returns the status of the last CUDA call it made.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise integer vector addition: c[t] = a[t] + b[t].
// Each thread handles the one element selected by its threadIdx.x, so the
// kernel expects a single one-dimensional block with one thread per element.
// There is no bounds check; the caller must size the launch to the arrays.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;
    c[tid] = a[tid] + b[tid];
}

// Host entry point: adds two fixed five-element integer vectors through
// addWithCuda() and prints the element-wise sums.
// Returns 0 on success, 1 if the addition or the device reset fails.
int main()
{
    const int arraySize = 5;
    const int lhs[arraySize] = { 1, 2, 3, 4, 5 };
    const int rhs[arraySize] = { 10, 20, 30, 40, 50 };
    int sums[arraySize] = { 0 };

    // Run the vector addition on the GPU; bail out if any step failed.
    if (addWithCuda(sums, lhs, rhs, arraySize) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        sums[0], sums[1], sums[2], sums[3], sums[4]);

    // Profiling and tracing tools such as Nsight and Visual Profiler only
    // show complete traces if cudaDeviceReset is called before exiting.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Computes c[i] = a[i] + b[i] for i in [0, size) on GPU 0.
//   c    - host output buffer with room for `size` ints
//   a, b - host input buffers holding `size` ints each
//   size - number of elements; 0 is accepted and does nothing
//
// The kernel is launched as a single block with `size` threads, so `size`
// must not exceed the device's maximum threads per block; a larger value
// makes the launch fail, which is reported through cudaGetLastError().
//
// Returns cudaSuccess, or the error code of the first CUDA call that failed
// (a message is also written to stderr). Device buffers are released on
// every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Declared (and zeroed) up front so the gotos below never jump over an
    // initialization and so cudaFree() is safe on buffers never allocated.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Nothing to add: a launch with zero threads per block is an invalid
    // configuration, so leave through the shared cleanup path with
    // cudaStatus still equal to cudaSuccess.
    if (size == 0) {
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Shared cleanup: reached on success (by fall-through) and on every
    // failure above.
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return cudaStatus;
}

What is going wrong here?

d.bergsma · November 18, 2018, 2:28pm

I found the solution to my issue: after I downgraded to CUDA Toolkit 9.1 with its bundled driver 388.19 (and MSVC Toolset 14.11), everything works fine. When I upgraded to driver 398.75 (the driver from CUDA Toolkit 9.2), I got the same issues as described above. So it seems my problem is a driver issue, and I will stick to 388.19 (which unfortunately also means that I cannot upgrade to CUDA 9.2 or 10.0).

Topic		Replies	Views
Whole system freezes when using cudaMallocManaged CUDA Programming and Performance	18	2820	February 11, 2019
cudaMallocManaged error on my machine CUDA Programming and Performance	3	3950	October 23, 2014
Why does access cudaMallocManaged memory throw exception? CUDA Programming and Performance	2	127	June 17, 2025
Using unified memory causes system crash CUDA Programming and Performance	28	6321	February 4, 2019
Calling cudaMallocManaged always returns null, but only if a cu file CUDA Programming and Performance	2	833	November 9, 2017
My first CUDA project crash when executed (but it compiles) why ? CUDA Programming and Performance	2	846	December 6, 2017
Demo delivers segmentation fault System Management and Monitoring (NVML) cuda	2	719	August 26, 2023
cuda malloc managed fails Jetson TX2	4	1464	November 17, 2018
SOLVED (sort of): cudaMalloc fails where cudaMallocManaged succeeds CUDA Programming and Performance	1	651	July 7, 2019
cudaMallocManaged() clarification needed CUDA Programming and Performance	5	11594	November 20, 2018

cudaMallocManaged() not working

Related topics