If it helps, below is my code for the background process:
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda_runtime.h>
// Size in bytes of each benchmark buffer (1 GiB).
#define MEM_SIZE (1024 * 1024 * 1024)

// Evaluates a CUDA runtime call and hands its status to gpuAssert together
// with the source location of the call site. Wrapped in do/while(0) so the
// macro behaves as a single statement (safe inside unbraced if/else).
#define gpuErrchk(ans)                            \
    do {                                          \
        gpuAssert((ans), __FILE__, __LINE__);     \
    } while (0)
// Reports a failed CUDA runtime call on stderr and, by default, terminates.
//
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default), exit the process after reporting
//
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
    {
        return;
    }

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
    {
        // The numeric cudaError_t value is used as the process exit status.
        exit(code);
    }
}
// Begins a timed region: creates the two events pointed to by `start` and
// `stop`, then records `*start` on stream 0. `*stop` is only created here;
// it is recorded later by finalizeEvents(). Any CUDA failure is routed
// through gpuErrchk.
void initializeEvents(cudaEvent_t *start, cudaEvent_t *stop)
{
    // Both events must exist before the region opens.
    gpuErrchk(cudaEventCreate(start));
    gpuErrchk(cudaEventCreate(stop));

    // Mark the beginning of the measured interval.
    gpuErrchk(cudaEventRecord(*start, 0));
}
// Ends a timed region opened by initializeEvents().
//
// Checks for a pending CUDA error, records `stop` on stream 0, waits for it
// to complete, reads the elapsed time between `start` and `stop`, and then
// destroys both events. Returns the elapsed time as reported by
// cudaEventElapsedTime (milliseconds). Any CUDA failure is routed through
// gpuErrchk.
float finalizeEvents(cudaEvent_t start, cudaEvent_t stop)
{
    // Surface any error left behind by the work inside the timed region.
    gpuErrchk(cudaGetLastError());

    // Close the interval and block the host until the stop event is reached.
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));

    float elapsed_ms;
    gpuErrchk(cudaEventElapsedTime(&elapsed_ms, start, stop));

    // The events are single-use: release them before returning.
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    return elapsed_ms;
}
// Allocates `mem` bytes of CUDA managed memory, fills it with 0x01 bytes from
// the host, and prefetches it to the current device. The allocation and the
// prefetch are each timed with CUDA events and the timings printed to stdout.
//
//   mem - number of bytes to allocate
//
// Returns the managed pointer. Any CUDA failure is reported through gpuErrchk
// (which exits by default), so the function does not return on error.
void *allocate_gpu_contigous(size_t mem)
{
    int device = -1;
    void *gpu_mem = NULL;
    cudaEvent_t start, stop;
    float elapsed_ms;

    // The prefetch below targets whichever device is current for this thread.
    gpuErrchk(cudaGetDevice(&device));

    initializeEvents(&start, &stop);
    gpuErrchk(cudaMallocManaged(&gpu_mem, mem));
    elapsed_ms = finalizeEvents(start, stop);
    // %zx is the correct conversion for size_t (%lx is wrong on LLP64 targets).
    printf("CudaMallocManaged: Size:0x%zx, Time: %f ms\n", mem, elapsed_ms);

    // Host-side write to the managed allocation before it is prefetched.
    memset(gpu_mem, 0x1, mem);

    initializeEvents(&start, &stop);
    gpuErrchk(cudaMemPrefetchAsync(gpu_mem, mem, device, NULL));
    // finalizeEvents synchronizes on the stop event, so the measured interval
    // includes the asynchronous prefetch issued on the NULL stream.
    elapsed_ms = finalizeEvents(start, stop);
    printf("cudaMemPrefetchAsync To GPU: Size:0x%zx, Time: %f ms\n", mem, elapsed_ms);

    return gpu_mem;
}
// Copies `size` bytes from `src` to `dest` with cudaMemcpyHostToDevice.
//
//   dest - destination pointer
//   src  - source pointer
//   size - number of bytes to copy
//   eval - non-zero: time the copy with CUDA events and print the result;
//          zero: perform the copy only
//
// Any CUDA failure is reported through gpuErrchk.
void memcpy_to_device(void *dest, void *src, size_t size, int eval)
{
    if (!eval) {
        // Untimed path: just the blocking copy.
        gpuErrchk(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
        return;
    }

    cudaEvent_t start, stop;
    initializeEvents(&start, &stop);
    gpuErrchk(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
    const float elapsed_ms = finalizeEvents(start, stop);

    // %zx is the correct conversion for size_t (%lx is wrong on LLP64 targets).
    printf("CudaMemCpy (HostToDevice): Size:0x%zx, Time: %f ms\n", size, elapsed_ms);
}
// Copies `size` bytes from `src` to `dest` with cudaMemcpyDeviceToHost.
//
//   dest - destination pointer
//   src  - source pointer
//   size - number of bytes to copy
//   eval - non-zero: time the copy with CUDA events and print the result;
//          zero: perform the copy only
//
// Any CUDA failure is reported through gpuErrchk.
void memcpy_to_host(void *dest, void *src, size_t size, int eval)
{
    if (!eval) {
        // Untimed path: just the blocking copy.
        gpuErrchk(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
        return;
    }

    cudaEvent_t start, stop;
    initializeEvents(&start, &stop);
    gpuErrchk(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
    const float elapsed_ms = finalizeEvents(start, stop);

    // %zx is the correct conversion for size_t (%lx is wrong on LLP64 targets).
    printf("CudaMemCpy (DeviceToHost): Size:0x%zx, Time: %f ms\n", size, elapsed_ms);
}
// Background load generator: allocates one managed buffer and one host buffer
// of MEM_SIZE bytes each, performs a single timed host-to-device copy as a
// warmup, then repeats untimed host-to-device copies forever. The process is
// expected to be stopped externally.
int main()
{
    void *gpu_mem = allocate_gpu_contigous(MEM_SIZE);
    void *cpu_mem = malloc(MEM_SIZE);

    // Checked at runtime rather than with assert(): assert() is compiled out
    // under NDEBUG, which would let a failed malloc reach memset(NULL, ...).
    if (gpu_mem == NULL || cpu_mem == NULL) {
        fprintf(stderr, "Failed to allocate %zu-byte benchmark buffers\n",
                (size_t)MEM_SIZE);
        free(cpu_mem);
        return EXIT_FAILURE;
    }

    memset(cpu_mem, 0x1, MEM_SIZE);

    // Warmup
    memcpy_to_device(gpu_mem, cpu_mem, MEM_SIZE, true);

    // Intentionally endless: keeps issuing copies until the process is killed.
    while (1) {
        memcpy_to_device(gpu_mem, cpu_mem, MEM_SIZE, false);
    }
}