Mmap() is slower than malloc() when use cudaMemcpy()

My code and results are below:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h> 
#include <fcntl.h>
#include <cuda.h>

__global__ void addVecOnDevice(char *d_in, char *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
 
    if (i < n)
        d_out[i] = d_in[i] << 1;
}

int main()
{
   int n = 1024 * 1024 * 1024;
   int segment = n/8;

   int fd_physicalAddr = open("/dev/mem", O_RDWR|O_SYNC);
   char* pt_physicalAddr_mmap = (char*) mmap(0,n, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0xd35000000);

   char * pt_malloc;
   pt_malloc = (char* )malloc(segment);

   for(int j = 0; j < segment; j++ )
      {
         pt_malloc[j] = 1;  //pt_malloc[j] or pt_physicalAddr_mmap[j]
      }

   char *d_out, *d_in;
   size_t bytes = segment * sizeof(char);
   cudaMalloc(&d_in, bytes);
   cudaMalloc(&d_out, bytes);

   dim3 blockSize(512);
   dim3 gridSize((n - 1) / blockSize.x + 1);
     
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   cudaEventRecord(start);
   cudaMemcpy(d_in, pt_malloc, bytes, cudaMemcpyHostToDevice); //pt_malloc or pt_physicalAddr_mmap
   addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
   cudaEventRecord(stop);
   cudaMemcpy(pt_physicalAddr_mmap+939524096, d_out, bytes, cudaMemcpyDeviceToHost);
   cudaEventSynchronize(stop);

   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

   int deviceSum = 0;
      for(int l = 0; l < segment; l++)
          deviceSum += pt_physicalAddr_mmap[l+939524096];
     
      printf("GPU: %d bytes\n", segment);
      printf("\tResult: %d\n", deviceSum/segment);
      printf("\tTime: %.3f ms \n", milliseconds);
      printf("___________________________________\n");

   printf("CPU pointer: %p \n", pt_physicalAddr_mmap);
   cudaFree(d_in);
   cudaFree(d_out);
   munmap(pt_physicalAddr_mmap,n);
   free(pt_malloc);
}

So the question is why when I copy data from a contiguous physical memory area which is created by mmap() function to GPU memory, it is slower than when I copy data from a incoherent physical memory area which is created by malloc() function to GPU memory? the difference is very large…

The program crashes on my machine.
You are not performing error checking for both the C API calls and CUDA API calls. Maybe something did not work.
Maybe the pages only get populated when accessed by cudaMemcpy. Try MAP_POPULATE. With normal malloc, memory pages are already set up.

On my system, the mmap request fails, even when run with root privilege. The request to map /dev/mem seems strange and unwise to me.

Not sure what the object of the exercise is. There are a number of choices that seem quite strange to me.

If I do something I consider “ordinary”, I witness no (significant) difference in time:

$ cat t2010.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <cuda.h>

__global__ void addVecOnDevice(char *d_in, char *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
        d_out[i] = d_in[i] << 1;
}

int main()
{
   int segment = 0x1000000; //16MB

   int fd_physicalAddr = open("./test", O_RDWR|O_SYNC);
   if (fd_physicalAddr == -1) {printf("oops 1\n"); return 0;}
   char* pt_physicalAddr_mmap = (char*) mmap(0,segment, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0); //16 MB
   if (pt_physicalAddr_mmap == (void *)-1) {printf("oops 2 \n"); return 0;}
   char * pt_malloc;
   pt_malloc = (char* )malloc(segment);

   for(int j = 0; j < segment; j++ )
      {
         pt_malloc[j] = 1;  //pt_malloc[j] or pt_physicalAddr_mmap[j]
      }

   char *d_out, *d_in;
   size_t bytes = segment * sizeof(char);
   cudaMalloc(&d_in, bytes);
   cudaMalloc(&d_out, bytes);

   dim3 blockSize(512);
   dim3 gridSize((segment-blockSize.x+1) / blockSize.x);

   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   cudaEventRecord(start);
   cudaMemcpy(d_in, pt_malloc, bytes, cudaMemcpyHostToDevice); //pt_malloc or pt_physicalAddr_mmap
   addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);

   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

      printf("GPU: %d bytes\n", segment);
      printf("\tTime: %.3f ms \n", milliseconds);
      printf("___________________________________\n");
   cudaMemcpy(d_in, pt_physicalAddr_mmap, bytes, cudaMemcpyHostToDevice); //warm-up
   cudaEventRecord(start);
   cudaMemcpy(d_in, pt_physicalAddr_mmap, bytes, cudaMemcpyHostToDevice);
   addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);

   cudaEventElapsedTime(&milliseconds, start, stop);

      printf("\tTime: %.3f ms \n", milliseconds);
      printf("___________________________________\n");

   cudaFree(d_in);
   cudaFree(d_out);
   munmap(pt_physicalAddr_mmap,segment);
   free(pt_malloc);
}
$ nvcc -o t2010 t2010.cu
$ truncate -s 16M test
$ cuda-memcheck ./t2010
========= CUDA-MEMCHECK
GPU: 16777216 bytes
        Time: 27.385 ms
___________________________________
        Time: 22.917 ms
___________________________________
========= ERROR SUMMARY: 0 errors
$ ./t2010
GPU: 16777216 bytes
        Time: 5.210 ms
___________________________________
        Time: 4.816 ms
___________________________________
$

The mmap() didn’t work in your PC because in my code, the line “mmap(0,n, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0xd35000000)” needs a contiguous physical RAM area from address 0xD35000000 to 0xD74FFFFFF (1024^3 addresses) which is exclusive for program and untouchable by Linux OS, if not, the program return “segment fault”. To fixed this issue, in the cmd line, typing “sudo vim /etc/default/grub” and enter, then go to the line GRUB_CMDLINE_LINUX=“something”, edit it to GRUB_CMDLINE_LINUX=“something crashkernel=auto rhgb quiet mem=4000M memmap=4000M@4096M” (the 4000M means your linux OS only use 4000Mib of RAM to operate, from the 4096M address, it means 4096 * 1024^2+4000 * 1024^2 = 0x1FA000000, so from the address 0x1FA000000, the RAM area (1024^3 bytes) is yours), save it, and type “sudo update-grub2” if use Ubuntu, type “sudo grub2-mkconfig -o /boot/grub2/grub.cfg” if use RHEL & Centos, then “sudo reboot”. After that, back to the program, edit 0xd35000000 to 0x1FA000000, then recompile and rerun the program. the program now can run correctly.


"4096.1024^2 + 4000.1024^2 = 0x1FA000000

Yeah, I’m not going to do all that. There’s no way I feel like taking my system and mapping/mounting /dev/mem. You can do that if you wish. Good luck!

Ordinary usage of mmap doesn’t run into the problem you suggest. I have already demonstrated that.

But in my project, FPGA use DMA protocol with descriptor bypass method, it write data to a countiguous physical memory area directly, and with speed 10 Gbytes/s, I need a 1024^3 large area in RAM so I must use “open (”/dev/mem",O_RDWR|O_SYNC)" and “mmap (0, 1024^3,…”. In your code, you just use open("./test", O_RDWR|O_SYNC) and mmap(0, 16 MB,…), it can not used for me :(

You could check if it also takes a long time to copy the mmap buffer into an ordinary malloc buffer.