I’ve been trying to use the cuda managed memory api to simplify my multi-gpu code. My test machine currently has two GPUs and the following simple code fails with the error “Fatal UVM CPU fault due to invalid operation”
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <cuda_runtime.h>
#include "kernel.h"
int main(int argc, char *argv[])
{
int myrank;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
cudaSetDevice(myrank);
int n = 256;
double* d_data_send, *d_data_recv;
cudaMallocManaged((void**)&d_data_send, sizeof(double) * n);
cudaMallocManaged((void**)&d_data_recv, sizeof(double) * n);
for(int i = 0; i < n; i++) d_data_send[i] = myrank;
MPI_Request requests[2];
MPI_Isend(d_data_send, n, MPI_DOUBLE, (myrank + 1) % 2, 0, MPI_COMM_WORLD, &requests[0]);
MPI_Irecv(d_data_recv, n, MPI_DOUBLE, (myrank + 1) % 2, 0, MPI_COMM_WORLD, &requests[1]);
double* d_data_processing;
cudaMalloc((void**)&d_data_processing, sizeof(double) * n);
set_seq(n, d_data_processing);
MPI_Waitall(2, requests, MPI_STATUSES_IGNORE);
printf("The deed is done\n");
MPI_Finalize();
return 0;
}
where set_seq is the following simple kernel:
__global__
void set_seq_kernel(int n, double* data)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n) data[i] = i;
}
void set_seq(int n, double* data)
{
int block_size = 256;
set_seq_kernel<<<(n + block_size - 1) / block_size, block_size>>>(n, data);
}
I’m basically trying to do some computations while data is being exchanged between the two GPUs. If I don’t execute the kernel then there are no issues and if I don’t do the exchange and only do the kernel, everything works as well. When I try to do both, it fails with the above error when I do a cuda-memcheck. Am I doing something I’m not supposed to?