Mmap() is slower than malloc() when use cudaMemcpy()

phuongothe · April 30, 2022, 9:14am

My code and results are below:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h> 
#include <fcntl.h>
#include <cuda.h>

__global__ void addVecOnDevice(char *d_in, char *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
 
    if (i < n)
        d_out[i] = d_in[i] << 1;
}

int main()
{
   int n = 1024 * 1024 * 1024;
   int segment = n/8;

   int fd_physicalAddr = open("/dev/mem", O_RDWR|O_SYNC);
   char* pt_physicalAddr_mmap = (char*) mmap(0,n, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0xd35000000);

   char * pt_malloc;
   pt_malloc = (char* )malloc(segment);

   for(int j = 0; j < segment; j++ )
      {
         pt_malloc[j] = 1;  //pt_malloc[j] or pt_physicalAddr_mmap[j]
      }

   char *d_out, *d_in;
   size_t bytes = segment * sizeof(char);
   cudaMalloc(&d_in, bytes);
   cudaMalloc(&d_out, bytes);

   dim3 blockSize(512);
   dim3 gridSize((n - 1) / blockSize.x + 1);
     
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   cudaEventRecord(start);
   cudaMemcpy(d_in, pt_malloc, bytes, cudaMemcpyHostToDevice); //pt_malloc or pt_physicalAddr_mmap
   addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
   cudaEventRecord(stop);
   cudaMemcpy(pt_physicalAddr_mmap+939524096, d_out, bytes, cudaMemcpyDeviceToHost);
   cudaEventSynchronize(stop);

   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

   int deviceSum = 0;
      for(int l = 0; l < segment; l++)
          deviceSum += pt_physicalAddr_mmap[l+939524096];
     
      printf("GPU: %d bytes\n", segment);
      printf("\tResult: %d\n", deviceSum/segment);
      printf("\tTime: %.3f ms \n", milliseconds);
      printf("___________________________________\n");

   printf("CPU pointer: %p \n", pt_physicalAddr_mmap);
   cudaFree(d_in);
   cudaFree(d_out);
   munmap(pt_physicalAddr_mmap,n);
   free(pt_malloc);
}

So the question is why when I copy data from a contiguous physical memory area which is created by mmap() function to GPU memory, it is slower than when I copy data from a incoherent physical memory area which is created by malloc() function to GPU memory? the difference is very large…

striker159 · May 2, 2022, 7:00pm

The program crashes on my machine.
You are not performing error checking for both the C API calls and CUDA API calls. Maybe something did not work.
Maybe the pages only get populated when accessed by cudaMemcpy. Try MAP_POPULATE. With normal malloc, memory pages are already set up.

Robert_Crovella · May 2, 2022, 8:02pm

On my system, the mmap request fails, even when run with root privilege. The request to map /dev/mem seems strange and unwise to me.

Not sure what the object of the exercise is. There are a number of choices that seem quite strange to me.

If I do something I consider “ordinary”, I witness no (significant) difference in time:

$ cat t2010.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <cuda.h>

__global__ void addVecOnDevice(char *d_in, char *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
        d_out[i] = d_in[i] << 1;
}

int main()
{
   int segment = 0x1000000; //16MB

   int fd_physicalAddr = open("./test", O_RDWR|O_SYNC);
   if (fd_physicalAddr == -1) {printf("oops 1\n"); return 0;}
   char* pt_physicalAddr_mmap = (char*) mmap(0,segment, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0); //16 MB
   if (pt_physicalAddr_mmap == (void *)-1) {printf("oops 2 \n"); return 0;}
   char * pt_malloc;
   pt_malloc = (char* )malloc(segment);

   for(int j = 0; j < segment; j++ )
      {
         pt_malloc[j] = 1;  //pt_malloc[j] or pt_physicalAddr_mmap[j]
      }

   char *d_out, *d_in;
   size_t bytes = segment * sizeof(char);
   cudaMalloc(&d_in, bytes);
   cudaMalloc(&d_out, bytes);

   dim3 blockSize(512);
   dim3 gridSize((segment-blockSize.x+1) / blockSize.x);

   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   cudaEventRecord(start);
   cudaMemcpy(d_in, pt_malloc, bytes, cudaMemcpyHostToDevice); //pt_malloc or pt_physicalAddr_mmap
   addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);

   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

      printf("GPU: %d bytes\n", segment);
      printf("\tTime: %.3f ms \n", milliseconds);
      printf("___________________________________\n");
   cudaMemcpy(d_in, pt_physicalAddr_mmap, bytes, cudaMemcpyHostToDevice); //warm-up
   cudaEventRecord(start);
   cudaMemcpy(d_in, pt_physicalAddr_mmap, bytes, cudaMemcpyHostToDevice);
   addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);

   cudaEventElapsedTime(&milliseconds, start, stop);

      printf("\tTime: %.3f ms \n", milliseconds);
      printf("___________________________________\n");

   cudaFree(d_in);
   cudaFree(d_out);
   munmap(pt_physicalAddr_mmap,segment);
   free(pt_malloc);
}
$ nvcc -o t2010 t2010.cu
$ truncate -s 16M test
$ cuda-memcheck ./t2010
========= CUDA-MEMCHECK
GPU: 16777216 bytes
        Time: 27.385 ms
___________________________________
        Time: 22.917 ms
___________________________________
========= ERROR SUMMARY: 0 errors
$ ./t2010
GPU: 16777216 bytes
        Time: 5.210 ms
___________________________________
        Time: 4.816 ms
___________________________________
$

phuongothe · May 3, 2022, 3:52am

The mmap() didn’t work in your PC because in my code, the line “mmap(0,n, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0xd35000000)” needs a contiguous physical RAM area from address 0xD35000000 to 0xD74FFFFFF (1024^3 addresses) which is exclusive for program and untouchable by Linux OS, if not, the program return “segment fault”. To fixed this issue, in the cmd line, typing “sudo vim /etc/default/grub” and enter, then go to the line GRUB_CMDLINE_LINUX=“something”, edit it to GRUB_CMDLINE_LINUX=“something crashkernel=auto rhgb quiet mem=4000M memmap=4000M@4096M” (the 4000M means your linux OS only use 4000Mib of RAM to operate, from the 4096M address, it means 4096 * 1024^2+4000 * 1024^2 = 0x1FA000000, so from the address 0x1FA000000, the RAM area (1024^3 bytes) is yours), save it, and type “sudo update-grub2” if use Ubuntu, type “sudo grub2-mkconfig -o /boot/grub2/grub.cfg” if use RHEL & Centos, then “sudo reboot”. After that, back to the program, edit 0xd35000000 to 0x1FA000000, then recompile and rerun the program. the program now can run correctly.

phuongothe · May 3, 2022, 3:56am

"4096.1024^2 + 4000.1024^2 = 0x1FA000000

Robert_Crovella · May 3, 2022, 6:47am

Yeah, I’m not going to do all that. There’s no way I feel like taking my system and mapping/mounting /dev/mem. You can do that if you wish. Good luck!

Ordinary usage of mmap doesn’t run into the problem you suggest. I have already demonstrated that.

phuongothe · May 3, 2022, 10:12am

But in my project, FPGA use DMA protocol with descriptor bypass method, it write data to a countiguous physical memory area directly, and with speed 10 Gbytes/s, I need a 1024^3 large area in RAM so I must use “open (”/dev/mem",O_RDWR|O_SYNC)" and “mmap (0, 1024^3,…”. In your code, you just use open(“./test”, O_RDWR|O_SYNC) and mmap(0, 16 MB,…), it can not used for me :(

striker159 · May 3, 2022, 6:20pm

You could check if it also takes a long time to copy the mmap buffer into an ordinary malloc buffer.