P2P communication between GPU and FPGA

Hi, everyone.
I’m trying to implement peer-to-peer (P2P) data transfer between an NVIDIA GPU and a Xilinx FPGA.

The Xilinx references I am working from are as follows.

  1. Xilinx P2P Data Transfer between FPGA Card and NVMe Device example
    1. Create P2P buffer
    2. Map P2P buffer to the host space
    3. Access the SSD location through the Linux file system; the file needs to be opened with O_DIRECT
    4. Read/write through the Linux pread/pwrite functions (a minimal sketch follows this list)
  2. Comment from Xilinx
    • You can use XRT P2P feature to do FPGA to GPU (or GPU to FPGA) p2p communication. You can let GPU read/write directly from/to the PCIe Bar using the mapped pointer of the p2p buffer with GPU Memcpy APIs.
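
As a quick illustration of steps 1.1 through 1.4, here is a minimal sketch (my own, not from the Xilinx example; the device path is a placeholder, and the P2P buffer is assumed to be already created and mapped):

#include <fcntl.h>   // open(), O_DIRECT (needs _GNU_SOURCE with plain C/glibc)
#include <unistd.h>  // pread(), pwrite(), close()

// Hypothetical sketch: p2p_ptr is the host-mapped pointer of an XRT P2P
// buffer obtained via clEnqueueMapBuffer(), as in the code below.
static void nvme_p2p_sketch(void *p2p_ptr, size_t buf_size)
{
    // O_DIRECT lets the NVMe driver DMA straight to/from the FPGA's PCIe BAR
    int fd = open("/dev/nvme0n1", O_RDWR | O_DIRECT); // placeholder device path
    if (fd < 0) return;
    pread(fd, p2p_ptr, buf_size, 0);   // SSD -> FPGA P2P buffer
    pwrite(fd, p2p_ptr, buf_size, 0);  // FPGA P2P buffer -> SSD
    close(fd);
}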

Referring to the above, I wrote the following code (condensed; the full source is at the end of this post).

size_t buf_size = 1024 * 1024 * 1024; // 1GB

// GPU
void *gpu_device_memory;
cudaMalloc(&gpu_device_memory, buf_size);

// FPGA
cl_mem_ext_ptr_t mem_ext = {XCL_MEM_EXT_P2P_BUFFER, nullptr, 0};
cl_mem fpga_device_memory = clCreateBuffer(fpga_context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX, buf_size, &mem_ext, &err);
void *fpga_mapped_ptr = clEnqueueMapBuffer(queue, fpga_device_memory, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, buf_size, 0, NULL, NULL, NULL);

// GPU to FPGA peer-to-peer
cudaMemcpy(fpga_mapped_ptr, gpu_device_memory, buf_size, cudaMemcpyDeviceToHost);
// FPGA to GPU peer-to-peer
cudaMemcpy(gpu_device_memory, fpga_mapped_ptr, buf_size, cudaMemcpyHostToDevice);

Here are my questions.

  1. Does the code above actually perform a P2P transfer? Can the cudaMemcpy function invoke a P2P data transfer? Also, is there a way to confirm that the data was transferred through P2P rather than staged through host memory? (See the sketch after this list.)
  2. Is my understanding of P2P correct?
    1. When the computer boots up, the system assigns physical addresses to PCIe devices. After this PCIe bus enumeration, the PCIe system knows each device’s physical address.
    2. clEnqueueMapBuffer() maps the FPGA’s physical memory address (its PCIe BAR) into the host’s virtual address space; the MMU/page tables hold this mapping.
    3. The host calls cudaMemcpy(). When the command is delivered to the GPU, the host pointer is translated to the physical address of the FPGA.
    4. The GPU sends data to the FPGA. Because the PCIe system knows the physical address of the FPGA, data is sent directly to the FPGA without going through host memory.
  3. If I’m missing something, what should I learn? OS? PCIe system? Device driver?
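
Regarding the “is there a way to confirm” part of question 1, the best sanity check I could come up with (Linux-only; the helper name print_mapping_of is mine) is to look up which mapping backs the pointer: a genuine BAR mapping should appear in /proc/self/maps backed by the XRT device node rather than by anonymous host memory.

#include <cstdio>

// Hypothetical helper: print the /proc/self/maps entry containing 'ptr'.
// For a true P2P mapping, the last column should name the XRT device node.
static void print_mapping_of(const void *ptr)
{
    unsigned long target = (unsigned long)ptr;
    FILE *maps = fopen("/proc/self/maps", "r");
    if (!maps) return;
    char line[512];
    unsigned long lo, hi;
    while (fgets(line, sizeof(line), maps)) {
        if (sscanf(line, "%lx-%lx", &lo, &hi) == 2 && lo <= target && target < hi) {
            printf("%s", line);
            break;
        }
    }
    fclose(maps);
}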

Full source code:

// includes, system
#include <stdlib.h>
#include <stdio.h>

// CUDA includes
#include "cuda.h"
#include <cuda_runtime.h>

// FPGA includes
#include <string.h>
#include <CL/opencl.h>
#include <CL/cl_ext.h>
#include "xclhal2.h"
#include "CL/cl_ext_xilinx.h"

#include <iostream>
#include <unistd.h>
#include <chrono>
#include <iomanip>

char* convertData(double bytes)
{
    static char retBuf[256] = "";
    const char* dataUnit[] = { "B", "KB", "MB", "GB", "TB" };
    int i = 0;
    // Scale the value down by 1024 until it fits the unit, capping at TB
    while (bytes >= 1024.0 && i < 4) {
        bytes /= 1024.0;
        i++;
    }
    sprintf(retBuf, "%.2lf %s", bytes, dataUnit[i]);
    return retBuf;
}
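// Example: convertData(1536) returns "1.50 KB"; convertData(1073741824.0) returns "1.00 GB".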

// Returns the file size, or a negative value on failure (the return type must
// be signed, otherwise the caller's "< 0" error check can never trigger).
long load_file_to_memory(const char *filename, char **result)
{
    long size = 0;
    FILE *f = fopen(filename, "rb");
    if (f == NULL) {
        *result = NULL;
        return -1; // -1 means file opening fail
    }
    fseek(f, 0, SEEK_END);
    size = ftell(f);
    fseek(f, 0, SEEK_SET);
    *result = (char *)malloc(size + 1);
    if (size != (long)fread(*result, sizeof(char), size, f)) {
        free(*result);
        fclose(f);
        return -2; // -2 means file reading fail
    }
    fclose(f);
    (*result)[size] = 0;
    return size;
}

int main(int argc, char **argv) {

    size_t buf_size = 1024 * 1024 * 1024;

    //--------------------------------------------------------------------------
    //
    // Configure GPU
    //
    //--------------------------------------------------------------------------
    printf("* GPU\n");

    CUresult status = cuInit(0); // the driver API calls below require cuInit()
    if (status != CUDA_SUCCESS) {
        fprintf(stderr, "cuInit failed\n");
        return EXIT_FAILURE;
    }
    int gpu_n;
    cudaGetDeviceCount(&gpu_n);
    printf("CUDA-capable device count: %i\n", gpu_n);

    CUdevice device;
    cuDeviceGet(&device, 0);

    char name[256];
    cuDeviceGetName(name, 256, device);
    fprintf(stderr, "Select device: %s\n", name);

    size_t global_mem = 0;
    cuDeviceTotalMem(&global_mem, device);
    fprintf(stderr, "Global memory: %llu MB\n", (unsigned long long)(global_mem >> 20));
    if(global_mem > (unsigned long long)4*1024*1024*1024L) {
        fprintf(stderr, "64-bit Memory Address support\n");
    }

    // memory allocation
    uint8_t *gpu_device_ptr = 0;
    cudaError_t cudaError = cudaMalloc(&gpu_device_ptr, buf_size);
    if(cudaError != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed\n");
    }

    fprintf(stderr, "Allocate memory address: 0x%llx\n", (unsigned long long)gpu_device_ptr);

    printf("\n");

    //--------------------------------------------------------------------------
    //
    // Configure FPGA
    //
    //--------------------------------------------------------------------------
    printf("* FPGA\n");

    cl_int err = 0;
    cl_platform_id platform_id;     // platform id
    cl_device_id device_id;         // compute device id

    // Get all platforms and then select Xilinx platform
    cl_platform_id platforms[16];       // platform id
    cl_uint platform_count;
    cl_uint platform_found = 0;
    err = clGetPlatformIDs(16, platforms, &platform_count);
    if (err != CL_SUCCESS) {
        printf("ERROR: Failed to find an OpenCL platform!\n");
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }
    printf("INFO: Found %d platforms\n", platform_count);

    // Find Xilinx Platform
    char cl_platform_vendor[1001];
    for (cl_uint iplat=0; iplat<platform_count; iplat++) {
        err = clGetPlatformInfo(platforms[iplat], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL);
        if (err != CL_SUCCESS) {
            printf("ERROR: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
            printf("ERROR: Test failed\n");
            return EXIT_FAILURE;
        }
        if (strcmp(cl_platform_vendor, "Xilinx") == 0) {
            printf("INFO: Selected platform %d from %s\n", iplat, cl_platform_vendor);
            platform_id = platforms[iplat];
            platform_found = 1;
        }
    }
    if (!platform_found) {
        printf("ERROR: Platform Xilinx not found. Exit.\n");
        return EXIT_FAILURE;
    }

    // Get Accelerator compute device
    cl_uint num_devices;
    cl_uint device_found = 0;
    cl_device_id devices[16];  // compute device id
    char cl_device_name[1001];
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 16, devices, &num_devices);
    printf("INFO: Found %d devices\n", num_devices);
    if (err != CL_SUCCESS) {
        printf("ERROR: Failed to create a device group!\n");
        printf("ERROR: Test failed\n");
        return -1;
    }

    //iterate all devices to select the target device.
    char target_device_name[1001] = "xilinx_u50_gen3x16_xdma_base_5";
    for (cl_uint i=0; i<num_devices; i++) {
        err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(cl_device_name), cl_device_name, 0);
        if (err != CL_SUCCESS) {
            printf("ERROR: Failed to get device name for device %d!\n", i);
            printf("ERROR: Test failed\n");
            return EXIT_FAILURE;
        }
        printf("CL_DEVICE_NAME %s\n", cl_device_name);
        if(strcmp(cl_device_name, target_device_name) == 0) {
            device_id = devices[i];
            device_found = 1;
            printf("Selected %s as the target device\n", cl_device_name);
        }
    }

    if (!device_found) {
        printf("ERROR:Target device %s not found. Exit.\n", target_device_name);
        return EXIT_FAILURE;
    }

    // Create a compute context
    cl_context fpga_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!fpga_context) {
        printf("ERROR: Failed to create a compute context!\n");
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }

    // Create a command queue
    cl_command_queue commands = clCreateCommandQueue(fpga_context, device_id, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
    if (!commands) {
        printf("ERROR: Failed to create a command commands!\n");
        printf("ERROR: code %i\n",err);
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }

    // xclbin
    unsigned char *kernelbinary;
    const char *xclbin = "bandwidth.xclbin";
    printf("INFO: loading xclbin %s\n", xclbin);
    long n_i0 = load_file_to_memory(xclbin, (char **) &kernelbinary);
    if (n_i0 < 0) {
        printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }

    size_t n0 = n_i0;

    // Create the compute program from offline
    cl_int cl_status;
    cl_program program = clCreateProgramWithBinary(fpga_context, 1, &device_id, &n0,
                                                   (const unsigned char **) &kernelbinary, &cl_status, &err);
    free(kernelbinary);

    if ((!program) || (err!=CL_SUCCESS)) {
        printf("ERROR: Failed to create compute program from binary %d!\n", err);
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[2048];

        printf("ERROR: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }

    // Create the compute kernel in the program we wish to run
    //
    cl_kernel kernel = clCreateKernel(program, "bandwidth", &err);
    if (!kernel || err != CL_SUCCESS) {
        printf("ERROR: Failed to create compute kernel!\n");
        printf("ERROR: Test failed\n");
        return EXIT_FAILURE;
    }

    cl_mem_ext_ptr_t mem_ext = {XCL_MEM_EXT_P2P_BUFFER, nullptr, 0};
    cl_mem fpga_memory = clCreateBuffer(fpga_context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX, buf_size, &mem_ext, &err);
    if (err != CL_SUCCESS) {
        std::cout << "Return code for P2P clCreateBuffer: " << err << std::endl;
        return EXIT_FAILURE;
    }

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &fpga_memory);
    if (err != CL_SUCCESS) {
        printf("ERROR: clSetKernelArg failed (%d)\n", err);
        return EXIT_FAILURE;
    }

    uint8_t *fpga_mapped_ptr = (uint8_t*)clEnqueueMapBuffer(commands, fpga_memory, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                                                            0, buf_size, 0, NULL, NULL, &err);
    if (fpga_mapped_ptr == NULL || err != CL_SUCCESS) {
        printf("ERROR: clEnqueueMapBuffer failed (%d)\n", err);
        return EXIT_FAILURE;
    }

    //--------------------------------------------------------------------------
    //
    // GPU <-> FPGA
    //
    //--------------------------------------------------------------------------
    std::chrono::high_resolution_clock::time_point p2pReadStart, p2pReadEnd;
    unsigned long p2pReadTime;
    double dnsduration, dsduration, gbpersec;
    std::cout << "Total Data Transfer = " << convertData(buf_size);

    p2pReadStart = std::chrono::high_resolution_clock::now();
    cudaMemcpy(fpga_mapped_ptr, gpu_device_ptr, buf_size, cudaMemcpyDeviceToHost);
    p2pReadEnd = std::chrono::high_resolution_clock::now();
    p2pReadTime = std::chrono::duration_cast<std::chrono::microseconds>(p2pReadEnd - p2pReadStart).count();
    dnsduration = (double)p2pReadTime;
    dsduration = dnsduration / ((double)1000000);
    gbpersec = (buf_size / dsduration) / ((double)1024 * 1024 * 1024);
    std::cout << "\nGPU->FPGA\t" << std::setprecision(3) << std::fixed << gbpersec << "GB/s";

    p2pReadStart = std::chrono::high_resolution_clock::now();
    cudaMemcpy(gpu_device_ptr, fpga_mapped_ptr, buf_size, cudaMemcpyHostToDevice);
    p2pReadEnd = std::chrono::high_resolution_clock::now();
    p2pReadTime = std::chrono::duration_cast<std::chrono::microseconds>(p2pReadEnd - p2pReadStart).count();
    dnsduration = (double)p2pReadTime;
    dsduration = dnsduration / ((double)1000000);
    gbpersec = (buf_size / dsduration) / ((double)1024 * 1024 * 1024);
    std::cout << "\nFPGA->GPU\t" << std::setprecision(3) << std::fixed << gbpersec << "GB/s\n";

    //--------------------------------------------------------------------------
    //
    // Release
    //
    //--------------------------------------------------------------------------

    // GPU
    cudaFree(gpu_device_ptr);

    // FPGA
    clEnqueueUnmapMemObject(commands, fpga_memory, fpga_mapped_ptr, 0, nullptr, nullptr);
    clFinish(commands); // ensure the unmap completes before releasing objects
    clReleaseDevice(device_id);
    clReleaseMemObject(fpga_memory);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(commands);
    clReleaseContext(fpga_context);

    return 0;
}

Thanks and regards.

@joshua.kim You can monitor host memory usage/throughput to confirm that the transfer does not go through host memory.
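
One way to make this concrete (a rough sketch of mine, not an official method): explicitly stage the same transfer through pinned host memory and compare timings; if the direct cudaMemcpy into the mapped pointer is not noticeably faster, it is probably bouncing through host memory too.

#include <cuda_runtime.h>
#include <chrono>
#include <cstring>

// Hypothetical cross-check: time an explicitly staged GPU -> host -> FPGA
// copy for comparison against the direct cudaMemcpy into fpga_mapped_ptr.
static double staged_copy_seconds(void *fpga_mapped_ptr, const void *gpu_ptr, size_t n)
{
    void *staging = nullptr;
    cudaMallocHost(&staging, n);                              // pinned bounce buffer
    auto t0 = std::chrono::high_resolution_clock::now();
    cudaMemcpy(staging, gpu_ptr, n, cudaMemcpyDeviceToHost);  // GPU -> host
    std::memcpy(fpga_mapped_ptr, staging, n);                 // host -> FPGA BAR
    auto t1 = std::chrono::high_resolution_clock::now();
    cudaFreeHost(staging);
    return std::chrono::duration<double>(t1 - t0).count();
}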

Could you share which specific GPU and FPGA you are using, along with the driver/toolkit versions?
This information would help rule out capability issues up front.
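
For the CUDA side, the versions can be queried programmatically; cudaDriverGetVersion() and cudaRuntimeGetVersion() are standard runtime calls:

#include <cuda_runtime.h>
#include <cstdio>

// Print the CUDA driver/runtime versions and the GPU name (standard
// runtime API calls; version 11040 means CUDA 11.4, for example).
static void print_cuda_versions()
{
    int driverVer = 0, runtimeVer = 0;
    cudaDriverGetVersion(&driverVer);
    cudaRuntimeGetVersion(&runtimeVer);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("GPU: %s, driver: %d, runtime: %d\n", prop.name, driverVer, runtimeVer);
}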

Hi. I have some similar questions as well. Does your implementation above work as expected?