Hi everyone,
I have a question regarding CUDA memory copy operations.
I remember coming across an article (I can't recall which one) claiming that if multiple copy operations are issued back to back (for example, inside a for loop), CUDA will aggregate them into a single transfer to make better use of bandwidth.
However, when I tried this pattern, I saw no sign of any such aggregation.
Can anyone confirm whether CUDA actually aggregates multiple memory copy operations? If so, under what conditions does it happen, and how can I make sure it happens in my code?
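To be concrete, by "aggregation" I mean the driver turning a pattern like the first snippet below into something that performs like the second (a minimal sketch; N, CHUNK, d_dst, h_src, d_big, h_big, and stream are placeholder names, not the variables from my test code further down):

// N small host-to-device copies queued back to back on one stream...
for (int i = 0; i < N; i++)
    cudaMemcpyAsync(d_dst[i], h_src[i], CHUNK, cudaMemcpyHostToDevice, stream);

// ...being serviced as if I had issued one large contiguous copy:
cudaMemcpyAsync(d_big, h_big, N * CHUNK, cudaMemcpyHostToDevice, stream);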
Thanks in advance for your help!
Best regards,
Below is my code:
#include <iostream>
#include <iomanip>
#include <chrono>
#include <thread>
#include <assert.h>
#include <vector>
#include <stdio.h>
#include <stdlib.h>   // rand, exit, malloc
#include <string.h>
#include <time.h>     // clock_gettime
#include <cuda.h>
#include <cuda_runtime.h>
// Driver-API error check: prints file:line, the failing expression, and the error string
static inline void
checkDrvError(CUresult res, const char *tok, const char *file, unsigned line)
{
    if (res != CUDA_SUCCESS) {
        const char *errStr = NULL;
        (void)cuGetErrorString(res, &errStr);
        std::cerr << file << ':' << line << ' ' << tok
                  << " failed (" << (unsigned)res << "): " << errStr << std::endl;
    }
}
#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__)
// Runtime-API error check: aborts on the first failing call
#define CHECK_CUDA(ans) { check_cuda((ans), __FILE__, __LINE__); }
inline void check_cuda(int code, const char *file, int line, bool abort=true){
    if (code != 0) {
        fprintf(stderr, "[customHook][%s:%3d]: CUDA ERROR: %d\n", file, line, code);
        if (abort) exit(code);
    }
}
// Monotonic wall-clock time in seconds
double what_time_is_it_now()
{
    struct timespec time;
    if (clock_gettime(CLOCK_MONOTONIC, &time) == -1) exit(-1);
    return (double)time.tv_sec + (double)time.tv_nsec * 1e-9;
}
int main()
{
    cudaStream_t copy_stream;
    CHECK_CUDA(cudaStreamCreate(&copy_stream));

    size_t min_size  = 2*1024*1024;               // 2 MiB per chunk
    size_t copy_size = 100*1024*1024;             // 100 MiB total
    int arr_size = (int)(copy_size/min_size);     // 50 chunks

    int **host_arr = (int **)malloc(sizeof(int *)*arr_size);
    int **dev_arr  = (int **)malloc(sizeof(int *)*arr_size);
    for (int i = 0; i < arr_size; i++) {
        CHECK_CUDA(cudaHostAlloc(&host_arr[i], min_size, cudaHostAllocMapped)); // pinned host memory
        CHECK_CUDA(cudaMalloc(&dev_arr[i], min_size));
        for (size_t j = 0; j < min_size/sizeof(int); j++) {
            host_arr[i][j] = rand(); // fill with arbitrary data
        }
    }
    double start, end;

    // 1. single 2 MiB copy
    start = what_time_is_it_now();
    CHECK_CUDA(cudaMemcpyAsync(dev_arr[0], host_arr[0], min_size, cudaMemcpyHostToDevice, copy_stream));
    CHECK_CUDA(cudaDeviceSynchronize());
    end = what_time_is_it_now();
    printf("Single %dMB copy time: %.4fms\n", (int)(min_size/(1024*1024)), (end - start)*1000);

    // 2. iterative copies, all queued on one stream before a single sync
    start = what_time_is_it_now();
    for (int i = 0; i < arr_size; i++) {
        CHECK_CUDA(cudaMemcpyAsync(dev_arr[i], host_arr[i], min_size, cudaMemcpyHostToDevice, copy_stream));
    }
    CHECK_CUDA(cudaDeviceSynchronize());
    end = what_time_is_it_now();
    printf("Iterative copy time: %.4fms\n", (end - start)*1000);

    // 3. iterative copies with a sync after every chunk
    start = what_time_is_it_now();
    for (int i = 0; i < arr_size; i++) {
        CHECK_CUDA(cudaMemcpyAsync(dev_arr[i], host_arr[i], min_size, cudaMemcpyHostToDevice, copy_stream));
        CHECK_CUDA(cudaDeviceSynchronize());
    }
    end = what_time_is_it_now();
    printf("Iterative copy with sync time: %.4fms\n", (end - start)*1000);

    CHECK_CUDA(cudaStreamDestroy(copy_stream));
    return 0;
}
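For comparison, this is the contiguous single-transfer baseline I would measure against, reusing copy_size, copy_stream, CHECK_CUDA, and what_time_is_it_now from the code above (a sketch; big_host and big_dev are hypothetical buffers that don't appear in my test code):

    // 4. hypothetical baseline: one pinned 100 MiB buffer copied in a single call
    int *big_host = NULL, *big_dev = NULL;
    CHECK_CUDA(cudaHostAlloc(&big_host, copy_size, cudaHostAllocMapped));
    CHECK_CUDA(cudaMalloc(&big_dev, copy_size));

    start = what_time_is_it_now();
    CHECK_CUDA(cudaMemcpyAsync(big_dev, big_host, copy_size, cudaMemcpyHostToDevice, copy_stream));
    CHECK_CUDA(cudaDeviceSynchronize());
    end = what_time_is_it_now();
    printf("Contiguous %dMB copy time: %.4fms\n", (int)(copy_size/(1024*1024)), (end - start)*1000);

If the driver really did coalesce the 50 chunked copies in case 2, I would expect its time to approach this contiguous baseline.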