Hi, everyone.
I’m trying to implement Nvidia GPU and Xilinx FPGA peer-to-peer data transfer.
The references about Xilinx FPGA are as follows.
- Xilinx "P2P Data Transfer between FPGA Card and NVMe Device" example
- Create P2P buffer
- Map P2P buffer to the host space
- Access the SSD location through Linux File System, the file needs to be opened with O_DIRECT
- Read/Write through Linux pread/pwrite function
- Comment from Xilinx
- You can use XRT P2P feature to do FPGA to GPU (or GPU to FPGA) p2p communication. You can let GPU read/write directly from/to the PCIe Bar using the mapped pointer of the p2p buffer with GPU Memcpy APIs.
I wrote the following code by referring to the above.
// Minimal sketch of the intended GPU <-> FPGA P2P path (full program below).
size_t buf_size = 1024 * 1024 * 1024; // 1GB
// GPU
// Ordinary device allocation on the GPU; nothing P2P-specific here.
void *gpu_device_memory;
cudaMalloc(&gpu_device_memory, buf_size);
// FPGA
// XCL_MEM_EXT_P2P_BUFFER asks XRT to back this buffer with device memory
// exposed through the card's PCIe BAR instead of host-migratable memory.
cl_mem_ext_ptr_t mem_ext = {XCL_MEM_EXT_P2P_BUFFER, nullptr, 0};
cl_mem fpga_device_memory = clCreateBuffer(fpga_context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX, buf_size, &mem_ext, &err);
// Blocking map (CL_TRUE): the returned host-virtual pointer refers to the
// FPGA's BAR window, not to ordinary system RAM.
void *fpga_mapped_ptr = clEnqueueMapBuffer(queue, fpga_device_memory, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, buf_size, 0, NULL, NULL, NULL);
// GPU to FPGA peer-to-peer
// NOTE(review): cudaMemcpy treats fpga_mapped_ptr as a plain (pageable) host
// pointer here — cudaMemcpyDeviceToHost only describes the direction. Whether
// the GPU DMAs straight into the FPGA BAR or the copy bounces through system
// memory is up to the driver; confirm with PCIe traffic counters/profiling.
cudaMemcpy(fpga_mapped_ptr, gpu_device_memory, buf_size, cudaMemcpyDeviceToHost);
// FPGA to GPU peer-to-peer
cudaMemcpy(gpu_device_memory, fpga_mapped_ptr, buf_size, cudaMemcpyHostToDevice);
Here’s my question.
- Is it correct that the code above describes the P2P transfer? Can the cudaMemcpy function invoke the p2p data transfer? Also, is there a way to confirm that the data was transferred through P2P?
- Is my understanding of P2P correct?
- When the computer boots up, the system assigns physical addresses to PCIe devices. After this PCIe bus enumeration, the PCIe system knows each device’s physical address.
- Map the host pointer to the FPGA’s physical memory address with the clEnqueueMapBuffer() function. The MMU holds this mapping information.
- The host calls cudaMemcpy(). When the command is delivered to the GPU, the host pointer is translated to the physical address of the FPGA.
- The GPU sends data to the FPGA. Because the PCIe system knows the physical address of the FPGA, data is sent directly to the FPGA without going through host memory.
- If I’m missing something, what should I learn? OS? PCIe system? Device driver?
Full source code:
// includes, system
#include <stdlib.h>
#include <stdio.h>
// CUDA includes
#include "cuda.h"
#include <cuda_runtime.h>
// FPGA includes
#include <string.h>
#include <CL/opencl.h>
#include <CL/cl_ext.h>
#include "xclhal2.h"
#include "CL/cl_ext_xilinx.h"
#include <iostream>
#include <unistd.h>
#include <chrono>
#include <iomanip>
// Format a byte count as a human-readable string, e.g. "1.50 GB".
//
// bytes:  quantity to format (non-negative).
// return: pointer to a static buffer holding "<value> <unit>" with two
//         decimals. The buffer is overwritten on every call and shared by
//         all callers (not thread-safe, same as the original contract).
//
// Fix: the original tested `bytes > 1024` before scaling, so exactly
// 1024 bytes printed "1024.00 B" instead of "1.00 KB"; the loop below
// uses >= and also drops the redundant retBytes shadow variable.
char* convertData(double bytes)
{
    static char retBuf[256] = "";
    const char* dataUnit[] = { "B", "KB", "MB", "GB", "TB" };
    int i = 0;
    // Scale down by 1024 per step until below one unit, capping at "TB"
    // (i == 4) so we never index past the unit table.
    while (bytes >= 1024.0 && i < 4) {
        bytes /= 1024.0;
        i++;
    }
    sprintf(retBuf, "%.2lf %s", bytes, dataUnit[i]);
    return retBuf;
}
// Read the entire file `filename` into a freshly malloc'd, NUL-terminated
// buffer stored in *result. Returns the file size in bytes on success.
//
// NOTE(review): the return type is cl_uint (unsigned), so the -1 / -2 error
// codes wrap to huge positive values; callers must compare against
// (cl_uint)-1 / (cl_uint)-2 — a `< 0` test on the result is always false.
//
// Fixes vs original: the read-failure path leaked the FILE handle (no
// fclose), malloc was unchecked, and *result was left dangling after free.
cl_uint load_file_to_memory(const char *filename, char **result)
{
    cl_uint size = 0;
    FILE *f = fopen(filename, "rb");
    if (f == NULL) {
        *result = NULL;
        return -1; // -1 means file opening fail
    }
    fseek(f, 0, SEEK_END);
    size = ftell(f);
    fseek(f, 0, SEEK_SET);
    *result = (char *)malloc(size + 1);
    if (*result == NULL) {
        // Allocation failure: close the handle and report a read-class error.
        fclose(f);
        return -2;
    }
    if (size != fread(*result, sizeof(char), size, f)) {
        free(*result);
        *result = NULL; // don't leave a dangling pointer in the caller
        fclose(f);      // fix: original leaked the FILE handle here
        return -2; // -2 means file reading fail
    }
    fclose(f);
    (*result)[size] = 0; // NUL terminator so the buffer is usable as a C string
    return size;
}
// Entry point: allocates a 1 GiB buffer on the GPU and a 1 GiB P2P buffer on
// a Xilinx U50, maps the FPGA buffer into host address space, then times
// cudaMemcpy in both directions through the mapped pointer.
int main(int argc, char **argv) {
// 1 GiB transfer size used for both the GPU and FPGA buffers.
size_t buf_size = 1024 * 1024 * 1024;
//--------------------------------------------------------------------------
//
// Configure GPU
//
//--------------------------------------------------------------------------
printf("* GPU\n");
CUresult status; // NOTE(review): declared but never used
int gpu_n;
cudaGetDeviceCount(&gpu_n); // return code ignored; gpu_n is garbage on failure
printf("CUDA-capable device count: %i\n", gpu_n);
CUdevice device;
// NOTE(review): driver-API calls (cuDeviceGet etc.) with no explicit
// cuInit(); presumably the preceding runtime call initializes the driver
// context — confirm, as the driver API traditionally requires cuInit first.
cuDeviceGet(&device, 0);
char name[256];
cuDeviceGetName(name, 256, device);
fprintf(stderr, "Select device: %s\n", name);
size_t global_mem = 0;
cuDeviceTotalMem(&global_mem, device);
fprintf(stderr, "Global memory: %llu MB\n", (unsigned long long)(global_mem >> 20));
// > 4 GiB of device memory implies 64-bit addressing on this GPU.
if(global_mem > (unsigned long long)4*1024*1024*1024L) {
fprintf(stderr, "64-bit Memory Address support\n");
}
// memory allocation
uint8_t *gpu_device_ptr = 0;
cudaError_t cudaError = cudaMalloc(&gpu_device_ptr, buf_size);
if(cudaError != cudaSuccess) {
// NOTE(review): message says cudaMallocManaged but the call is cudaMalloc;
// also execution continues with a null device pointer after this failure.
fprintf(stderr, "cudaMallocManaged failed\n");
}
fprintf(stderr, "Allocate memory address: 0x%llx\n", (unsigned long long)gpu_device_ptr);
printf("\n");
//--------------------------------------------------------------------------
//
// Configure FPGA
//
//--------------------------------------------------------------------------
printf("* FPGA\n");
cl_int err = 0;
cl_platform_id platform_id; // platform id
cl_device_id device_id; // compute device id
// Get all platforms and then select Xilinx platform
cl_platform_id platforms[16]; // platform id
cl_uint platform_count;
cl_uint platform_found = 0;
err = clGetPlatformIDs(16, platforms, &platform_count);
if (err != CL_SUCCESS) {
printf("ERROR: Failed to find an OpenCL platform!\n");
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
printf("INFO: Found %d platforms\n", platform_count);
// Find Xilinx Platform (match on the CL_PLATFORM_VENDOR string)
char cl_platform_vendor[1001];
for (cl_uint iplat=0; iplat<platform_count; iplat++) {
err = clGetPlatformInfo(platforms[iplat], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL);
if (err != CL_SUCCESS) {
printf("ERROR: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
if (strcmp(cl_platform_vendor, "Xilinx") == 0) {
printf("INFO: Selected platform %d from %s\n", iplat, cl_platform_vendor);
platform_id = platforms[iplat];
platform_found = 1;
}
}
if (!platform_found) {
printf("ERROR: Platform Xilinx not found. Exit.\n");
return EXIT_FAILURE;
}
// Get Accelerator compute device
cl_uint num_devices;
cl_uint device_found = 0;
cl_device_id devices[16]; // compute device id
char cl_device_name[1001];
unsigned int num_dev; // NOTE(review): assigned below but never used afterwards
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 16, devices, &num_devices);
printf("INFO: Found %d devices\n", num_devices);
if (err != CL_SUCCESS) {
printf("ERROR: Failed to create a device group!\n");
printf("ERROR: Test failed\n");
return -1;
}
//iterate all devices to select the target device.
char target_device_name[1001] = "xilinx_u50_gen3x16_xdma_base_5";
for (cl_uint i=0; i<num_devices; i++) {
// NOTE(review): requests up to 1024 bytes into a 1001-byte buffer —
// potential overflow if a device name ever exceeds 1000 characters.
err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 1024, cl_device_name, 0);
if (err != CL_SUCCESS) {
printf("ERROR: Failed to get device name for device %d!\n", i);
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
printf("CL_DEVICE_NAME %s\n", cl_device_name);
if(strcmp(cl_device_name, target_device_name) == 0) {
device_id = devices[i];
device_found = 1;
num_dev = i;
printf("Selected %s as the target device\n", cl_device_name);
}
}
if (!device_found) {
printf("ERROR:Target device %s not found. Exit.\n", target_device_name);
return EXIT_FAILURE;
}
// Create a compute context
cl_context fpga_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!fpga_context) {
printf("ERROR: Failed to create a compute context!\n");
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
// Create a command queue (profiling + out-of-order execution enabled)
cl_command_queue commands = clCreateCommandQueue(fpga_context, device_id, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
if (!commands) {
printf("ERROR: Failed to create a command commands!\n");
printf("ERROR: code %i\n",err);
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
// Load the precompiled FPGA bitstream (xclbin) from disk.
unsigned char *kernelbinary;
const char *xclbin = "bandwidth.xclbin";
printf("INFO: loading xclbin %s\n", xclbin);
cl_uint n_i0 = load_file_to_memory(xclbin, (char **) &kernelbinary);
// NOTE(review): n_i0 is unsigned, so this `< 0` check is always false —
// a failed load returns (cl_uint)-1/-2 and slips through to the next call.
if (n_i0 < 0) {
printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
size_t n0 = n_i0;
// Create the compute program from offline
cl_int cl_status;
cl_program program = clCreateProgramWithBinary(fpga_context, 1, &device_id, &n0,
(const unsigned char **) &kernelbinary, &cl_status, &err);
free(kernelbinary);
if ((!program) || (err!=CL_SUCCESS)) {
printf("ERROR: Failed to create compute program from binary %d!\n", err);
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
// Build the program executable (a no-op compile step for binary programs)
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
size_t len;
char buffer[2048];
printf("ERROR: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
// Create the compute kernel in the program we wish to run
//
cl_kernel kernel = clCreateKernel(program, "bandwidth", &err);
if (!kernel || err != CL_SUCCESS) {
printf("ERROR: Failed to create compute kernel!\n");
printf("ERROR: Test failed\n");
return EXIT_FAILURE;
}
// XCL_MEM_EXT_P2P_BUFFER: allocate the buffer in FPGA device memory exposed
// through the card's PCIe BAR so other PCIe devices can address it.
cl_mem_ext_ptr_t mem_ext = {XCL_MEM_EXT_P2P_BUFFER, nullptr, 0};
cl_mem fpga_memory = clCreateBuffer(fpga_context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX, buf_size, &mem_ext, &err);
if (err != CL_SUCCESS) {
std::cout << "Return code for mem_src clCreateBuffer: " << err << std::endl;
}
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &fpga_memory);
// Blocking map: the returned pointer is a host-virtual view of the FPGA BAR.
uint8_t *fpga_mapped_ptr = (uint8_t*)clEnqueueMapBuffer(commands, fpga_memory, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, buf_size, 0, NULL, NULL, NULL);
//--------------------------------------------------------------------------
//
// GPU <-> FPGA
//
//--------------------------------------------------------------------------
std::chrono::high_resolution_clock::time_point p2pReadStart, p2pReadEnd;
unsigned long p2pReadTime;
double dnsduration, dsduration, gbpersec;
std::cout << "Total Data Transfer = " << convertData(buf_size);
// Time GPU -> FPGA. Wall-clock timing is valid here because cudaMemcpy with
// a pageable destination is synchronous with respect to the host.
// NOTE(review): the direction flag only says device->host; whether the data
// moves P2P over PCIe or staged through system RAM is driver-dependent.
p2pReadStart = std::chrono::high_resolution_clock::now();
cudaMemcpy(fpga_mapped_ptr, gpu_device_ptr, buf_size, cudaMemcpyDeviceToHost);
p2pReadEnd = std::chrono::high_resolution_clock::now();
p2pReadTime = std::chrono::duration_cast<std::chrono::microseconds>(p2pReadEnd - p2pReadStart).count();
dnsduration = (double)p2pReadTime;
dsduration = dnsduration / ((double)1000000); // microseconds -> seconds
gbpersec = (buf_size / dsduration) / ((double)1024 * 1024 * 1024);
std::cout << "\nGPU->FPGA\t" << std::setprecision(3) << std::fixed << gbpersec << "GB/s";
// Time FPGA -> GPU (reverse direction through the same mapped pointer).
p2pReadStart = std::chrono::high_resolution_clock::now();
cudaMemcpy(gpu_device_ptr, fpga_mapped_ptr, buf_size, cudaMemcpyHostToDevice);
p2pReadEnd = std::chrono::high_resolution_clock::now();
p2pReadTime = std::chrono::duration_cast<std::chrono::microseconds>(p2pReadEnd - p2pReadStart).count();
dnsduration = (double)p2pReadTime;
dsduration = dnsduration / ((double)1000000); // microseconds -> seconds
gbpersec = (buf_size / dsduration) / ((double)1024 * 1024 * 1024);
std::cout << "\nFPGA->GPU\t" << std::setprecision(3) << std::fixed << gbpersec << "GB/s\n";
//--------------------------------------------------------------------------
//
// Release
//
//--------------------------------------------------------------------------
// GPU
cudaFree(gpu_device_ptr);
// FPGA
// Unmap is enqueued but never flushed/finished before the objects below
// are released — NOTE(review): consider a clFinish(commands) here.
clEnqueueUnmapMemObject(commands, fpga_memory, fpga_mapped_ptr, 0, nullptr, nullptr);
// NOTE(review): clReleaseDevice is specified for sub-devices created via
// clCreateSubDevices; calling it on a root device id is a no-op at best.
clReleaseDevice(device_id);
clReleaseMemObject(fpga_memory);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commands);
clReleaseContext(fpga_context);
return 0;
}
Thanks and regards.