Aggregation of multiple copies into a single copy

Hi everyone,

I have a question regarding CUDA memory copy operations.
I remember coming across an article (I'm not sure which one anymore) that said that if multiple copy operations are issued in series, such as inside a for loop, CUDA will aggregate them into a single copy operation to optimize bandwidth utilization.

However, when I tried implementing this, I didn’t notice any signs of such aggregation.

Can anyone confirm if CUDA actually aggregates multiple memory copy operations? If yes, under what conditions does this happen, and how can I ensure it works in my code?

Thanks in advance for your help!

Best regards,

Below is my code

#include <iostream>
#include <iomanip>
#include <chrono>
#include <thread>
#include <assert.h>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <time.h>       // clock_gettime

#include <cuda.h>
#include <cuda_runtime.h>
static inline void
checkDrvError(CUresult res, const char *tok, const char *file, unsigned line)
{
    if (res != CUDA_SUCCESS) {
        const char *errStr = NULL;
        (void)cuGetErrorString(res, &errStr);
        std::cerr << file << ':' << line << ' ' << tok
                  << "failed (" << (unsigned)res << "): " << errStr << std::endl;
    }
}

#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__);

#define CHECK_CUDA(ans) {check_cuda((ans), __FILE__, __LINE__);}
inline void check_cuda(int code, const char *file, int line, bool abort=true){
    if(code != 0){
        fprintf(stderr, "[customHook][%s:%3d]:CUDAERROR: %d\n",file,line,code);
        if (abort) exit(code);
    }
}

double what_time_is_it_now()
{
    struct timespec time;
    if (clock_gettime(CLOCK_MONOTONIC, &time) == -1) exit(-1);
    
    return (double)time.tv_sec + (double)time.tv_nsec * 0.000000001;
}


int main()
{
    cudaStream_t copy_stream;
    CHECK_CUDA(cudaStreamCreate(&copy_stream));
    
    size_t min_size = 2*1024*1024;        // 2 MB per chunk
    size_t copy_size = 100*1024*1024;     // 100 MB total
    int arr_size = copy_size/min_size;    // 50 chunks

    int **host_arr, **dev_arr;
    host_arr = (int **)malloc(sizeof(int *)*arr_size);
    dev_arr = (int **)malloc(sizeof(int *)*arr_size);

    for(int i = 0; i < arr_size; i++){
        // pinned host buffer + matching device buffer for each 2 MB chunk
        CHECK_CUDA(cudaHostAlloc(&host_arr[i], min_size, cudaHostAllocMapped));
        CHECK_CUDA(cudaMalloc(&dev_arr[i], min_size));
        for(size_t j = 0; j < min_size/sizeof(int); j++){
            host_arr[i][j] = rand() % 100;   // fill with arbitrary values
        }
    }

    double start, end;
    // 1. single copy 
    start = what_time_is_it_now();

    CHECK_CUDA(cudaMemcpyAsync(dev_arr[0], host_arr[0], min_size, cudaMemcpyHostToDevice, copy_stream));
    CHECK_CUDA(cudaDeviceSynchronize());

    end = what_time_is_it_now();

    printf("Single %dMB copy time: %.4fms\n", (int)min_size/(1024*1024), (end - start)*1000);
    

    // 2. iterative copy

    start = what_time_is_it_now();
    for(int i = 0; i < arr_size; i++){
        CHECK_CUDA(cudaMemcpyAsync(dev_arr[i], host_arr[i], min_size, cudaMemcpyHostToDevice, copy_stream));
    }
    CHECK_CUDA(cudaDeviceSynchronize());

    end = what_time_is_it_now();
    
    printf("Iterative copy time: %.4fms\n",(end - start)*1000);


    // 3. iterative copy with sync
    
    start = what_time_is_it_now();
    for(int i = 0; i < arr_size; i++){
        CHECK_CUDA(cudaMemcpyAsync(dev_arr[i], host_arr[i], min_size, cudaMemcpyHostToDevice, copy_stream));
        CHECK_CUDA(cudaDeviceSynchronize());
    }
    end = what_time_is_it_now();
    
    printf("Iterative copy with sync time: %.4fms\n",(end - start)*1000);

    // release resources
    for(int i = 0; i < arr_size; i++){
        CHECK_CUDA(cudaFreeHost(host_arr[i]));
        CHECK_CUDA(cudaFree(dev_arr[i]));
    }
    free(host_arr);
    free(dev_arr);
    CHECK_CUDA(cudaStreamDestroy(copy_stream));

    return 0;
}

No, that doesn't happen. CUDA doesn't aggregate separate calls to cudaMemcpy or cudaMemcpyAsync into a single transfer, even when they are issued back-to-back in a loop; each call becomes its own transfer with its own per-call overhead.
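If you want the 100 MB to move as one transfer, you have to make the data contiguous yourself and issue a single copy; CUDA won't do this for you. Below is a minimal sketch of that idea (buffer names and sizes are illustrative, and error checking is omitted for brevity):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const size_t chunk    = 2ull * 1024 * 1024;   // 2 MB pieces, as in the original code
    const int    n_chunks = 50;                   // 100 MB total
    const size_t total    = chunk * n_chunks;

    int *h_buf = nullptr, *d_buf = nullptr;
    cudaHostAlloc(&h_buf, total, cudaHostAllocDefault);   // one pinned host buffer
    cudaMalloc(&d_buf, total);                            // one device buffer

    for (size_t i = 0; i < total / sizeof(int); ++i)
        h_buf[i] = (int)i;                                // fill host data

    cudaStream_t s;
    cudaStreamCreate(&s);

    // One call -> one host-to-device transfer for the whole 100 MB.
    cudaMemcpyAsync(d_buf, h_buf, total, cudaMemcpyHostToDevice, s);
    cudaStreamSynchronize(s);

    printf("copied %zu bytes in a single transfer\n", total);

    cudaStreamDestroy(s);
    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    return 0;
}

Profiling both versions with Nsight Systems (nsys profile ./app) should show one HtoD memcpy here versus 50 for the loop version, which is a quick way to confirm that no aggregation takes place.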
