My code and results are below:
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <cuda.h>
__global__ void addVecOnDevice(char *d_in, char *d_out, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
d_out[i] = d_in[i] << 1;
}
int main()
{
int n = 1024 * 1024 * 1024;
int segment = n/8;
int fd_physicalAddr = open("/dev/mem", O_RDWR|O_SYNC);
char* pt_physicalAddr_mmap = (char*) mmap(0,n, PROT_READ|PROT_WRITE, MAP_SHARED, fd_physicalAddr, 0xd35000000);
char * pt_malloc;
pt_malloc = (char* )malloc(segment);
for(int j = 0; j < segment; j++ )
{
pt_malloc[j] = 1; //pt_malloc[j] or pt_physicalAddr_mmap[j]
}
char *d_out, *d_in;
size_t bytes = segment * sizeof(char);
cudaMalloc(&d_in, bytes);
cudaMalloc(&d_out, bytes);
dim3 blockSize(512);
dim3 gridSize((n - 1) / blockSize.x + 1);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaMemcpy(d_in, pt_malloc, bytes, cudaMemcpyHostToDevice); //pt_malloc or pt_physicalAddr_mmap
addVecOnDevice<<<gridSize, blockSize>>>(d_in, d_out, bytes);
cudaEventRecord(stop);
cudaMemcpy(pt_physicalAddr_mmap+939524096, d_out, bytes, cudaMemcpyDeviceToHost);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
int deviceSum = 0;
for(int l = 0; l < segment; l++)
deviceSum += pt_physicalAddr_mmap[l+939524096];
printf("GPU: %d bytes\n", segment);
printf("\tResult: %d\n", deviceSum/segment);
printf("\tTime: %.3f ms \n", milliseconds);
printf("___________________________________\n");
printf("CPU pointer: %p \n", pt_physicalAddr_mmap);
cudaFree(d_in);
cudaFree(d_out);
munmap(pt_physicalAddr_mmap,n);
free(pt_malloc);
}
So the question is why when I copy data from a contiguous physical memory area which is created by mmap() function to GPU memory, it is slower than when I copy data from a incoherent physical memory area which is created by malloc() function to GPU memory? the difference is very large…