Hi,
I have been using GDS and the cuFile API on a DGX A100 for quite a few applications and am generally quite happy with the results.
However, I often run into problems with calls to “cuFileBufRegister”.
It fails for some of the buffer sizes I would like to register; e.g., it might work for 64 KiB (and multiples of that), but it will fail for 96 KiB.
The error is always the same: 5030, CU_FILE_INTERNAL_ERROR.
Here is an example using a slightly modified version of the official sample code (I only changed the offsets, the sizes, and the flags for open):
// https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#sample-program
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <cuda_runtime.h>
#include "cufile.h"
//#include "cufile_sample_utils.h"
using namespace std;
int main(void) {
int fd;
ssize_t ret;
void *devPtr_base;
off_t file_offset = 0x2000;
off_t devPtr_offset = 0;
ssize_t IO_size =(1UL << 16) + (1UL << 15) ;
size_t buff_size = IO_size;
// off_t devPtr_offset = 0x1000;
// ssize_t IO_size = 1UL << 24;
// size_t buff_size = IO_size + 0x1000;
CUfileError_t status;
// CUResult cuda_result;
int cuda_result;
CUfileDescr_t cf_descr;
CUfileHandle_t cf_handle;
char *testfn;
testfn=getenv("TESTFILE");
if (testfn==NULL) {
std::cerr << "No testfile defined via TESTFILE. Exiting." << std::endl;
return -1;
}
cout << std::endl;
cout << "Opening File " << testfn << std::endl;
fd = open(testfn, O_RDWR|O_DIRECT);
if(fd < 0) {
std::cerr << "file open " << testfn << "errno " << errno << std::endl;
return -1;
}
cout << "Opening cuFileDriver." << std::endl;
status = cuFileDriverOpen();
if (status.err != CU_FILE_SUCCESS) {
std::cerr << " cuFile driver failed to open " << std::endl;
close(fd);
return -1;
}
cout << "Registering cuFile handle to " << testfn << "." << std::endl;
memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
cf_descr.handle.fd = fd;
cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
status = cuFileHandleRegister(&cf_handle, &cf_descr);
if (status.err != CU_FILE_SUCCESS) {
std::cerr << "cuFileHandleRegister fd " << fd << " status " << status.err << std::endl;
close(fd);
return -1;
}
cout << "Allocating CUDA buffer of " << buff_size << " bytes." << std::endl;
cuda_result = cudaMalloc(&devPtr_base, buff_size);
if (cuda_result != CUDA_SUCCESS) {
std::cerr << "buffer allocation failed " << cuda_result << std::endl;
cuFileHandleDeregister(cf_handle);
close(fd);
return -1;
}
cout << "Registering Buffer of " << buff_size << " bytes." << std::endl;
status = cuFileBufRegister(devPtr_base, buff_size, 0);
if (status.err != CU_FILE_SUCCESS) {
std::cerr << "buffer registration failed " << status.err << std::endl;
cuFileHandleDeregister(cf_handle);
close(fd);
cudaFree(devPtr_base);
return -1;
}
// fill a pattern
cout << "Filling memory." << std::endl;
cudaMemset((void *) devPtr_base, 0xab, buff_size);
// perform write operation directly from GPU mem to file
cout << "Writing buffer to file." << std::endl;
ret = cuFileWrite(cf_handle, devPtr_base, IO_size, file_offset, devPtr_offset);
if (ret < 0 || ret != IO_size) {
std::cerr << "cuFileWrite failed " << ret << std::endl;
}
// release the GPU memory pinning
cout << "Releasing cuFile buffer." << std::endl;
status = cuFileBufDeregister(devPtr_base);
if (status.err != CU_FILE_SUCCESS) {
std::cerr << "buffer deregister failed" << std::endl;
cudaFree(devPtr_base);
cuFileHandleDeregister(cf_handle);
close(fd);
return -1;
}
cout << "Freeing CUDA buffer." << std::endl;
cudaFree(devPtr_base);
// deregister the handle from cuFile
cout << "Releasing file handle. " << std::endl;
(void) cuFileHandleDeregister(cf_handle);
close(fd);
// release all cuFile resources
cout << "Closing File Driver." << std::endl;
(void) cuFileDriverClose();
cout << std::endl;
return 0;
}
/*prints:
...
Allocating CUDA buffer of 98304 bytes.
Registering Buffer of 98304 bytes.
buffer registration failed 5030
*/
Here is the cufile log, including platform and library information:
14-02-2023 15:46:02:59 [pid=358055 tid=358055] INFO 0:136 Lib being used for urcup concurrency : liburcu-bp.so.6
14-02-2023 15:46:02:60 [pid=358055 tid=358055] INFO 0:151 Lib being used for concurrency : liburcu-cds.so.6
14-02-2023 15:46:02:60 [pid=358055 tid=358055] INFO cufio:320 Loaded successfully URCU library
14-02-2023 15:46:02:60 [pid=358055 tid=358055] INFO 0:147 nvidia_fs driver open invoked
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:381 GDS release version: 1.0.2.10
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:384 nvidia_fs version: 2.12 libcufile version: 2.4
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:276 NVMe: driver support OK
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:316 WekaFS: driver support OK
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:493 nvidia_fs driver version check ok
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:276 NVMe: driver support OK
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:316 WekaFS: driver support OK
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:184 ============
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:185 ENVIRONMENT:
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:186 ============
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:199 =====================
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:200 DRIVER CONFIGURATION:
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:201 =====================
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:204 NVMe : Supported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:205 NVMeOF : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:206 SCSI : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:207 ScaleFlux CSD : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:208 NVMesh : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:212 DDN EXAScaler : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:216 IBM Spectrum Scale : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-drv:220 NFS : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-rdma:1185 WekaFS : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-rdma:1187 Userspace RDMA : Unsupported
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-rdma:1196 --Mellanox PeerDirect : Enabled
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-rdma:1199 --rdma library : Not Loaded (libcufile_rdma.so)
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-rdma:1202 --rdma devices : Not configured
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-rdma:1205 --rdma_device_status : Up: 0 Down: 0
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio:455 =====================
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio:456 CUFILE CONFIGURATION:
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio:457 =====================
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1114 properties.use_compat_mode : false
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1116 properties.gds_rdma_write_support : true
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1118 properties.use_poll_mode : false
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1120 properties.poll_mode_max_size_kb : 4
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1122 properties.max_batch_io_timeout_msecs : 5
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1124 properties.max_direct_io_size_kb : 16384
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1126 properties.max_device_cache_size_kb : 1048576
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1128 properties.max_device_pinned_mem_size_kb : 33554432
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1130 properties.posix_pool_slab_size_kb : 4 1024 16384
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1132 properties.posix_pool_slab_count : 128 64 32
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1134 properties.rdma_peer_affinity_policy : RoundRobin
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1136 properties.rdma_dynamic_routing : 0
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1142 fs.generic.posix_unaligned_writes : false
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1145 fs.lustre.posix_gds_min_kb: 0
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1159 fs.weka.rdma_write_support: false
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1195 profile.nvtx : false
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1197 profile.cufile_stats : 1
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO 0:1200 miscellaneous.api_check_aggressive : false
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-plat:725 =========
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-plat:726 GPU INFO:
14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO cufio-plat:727 =========
14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO cufio-plat:368 GPU index 0 NVIDIA A100-SXM4-40GB bar:1 bar size (MiB):65536 supports GDS
14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO cufio-plat:381 Total GPUS supported on this platform 1
14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO cufio-plat:738 ==============
14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO cufio-plat:739 PLATFORM INFO:
14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO cufio-plat:740 ==============
14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO cufio-plat:490 ACS not enabled in GPU paths
14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO cufio-plat:631 IOMMU: enabled
14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO cufio-plat:781 Platform verification succeeded
14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO cufio-px-pool:443 POSIX pool buffer initialization complete
14-02-2023 15:46:02:81 [pid=358055 tid=358055] INFO curdma-ldbal:501 No RDMA devices configured,skipping RDMA load balancer initialization
14-02-2023 15:46:02:81 [pid=358055 tid=358055] INFO cufio:506 CUFile initialization complete
14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO cufio-fs:391 RAID member: nvme3n1 wwid: eui.345950304ec018740025384500000004
14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO cufio-fs:405 Block dev pci info: nvme3n1 numa node 1 pci bridge: 0000:40:01.1
14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO cufio-fs:391 RAID member: nvme4n1 wwid: eui.34595030522010810025384700000003
14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO cufio-fs:405 Block dev pci info: nvme4n1 numa node 7 pci bridge: 0000:80:01.1
14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO cufio-fs:391 RAID member: nvme5n1 wwid: eui.34595030522011210025384700000003
14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO cufio-fs:405 Block dev pci info: nvme5n1 numa node 5 pci bridge: 0000:b0:01.1
14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO cufio-fs:428 RAID Device /dev/md1 pci bridge 0000:40:01.1 cross QPI true has 3 devices nvme3n1,nvme4n1,nvme5n1
14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO cufio-udev:88 sysfs attribute not found device/transport md1
14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO cufio-udev:88 sysfs attribute not found wwid md1
14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR 0:390 R/W shadow buffer memory allocation failed, size : 98304
14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR 0:447 map failed
14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR cufio-obj:67 error allocating nvfs handle, size: 98304
14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR cufio:1155 cuFileBufRegister error, object allocation failed
14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR cufio:1201 cuFileBufRegister error internal error
14-02-2023 15:46:02:208 [pid=358055 tid=358055] INFO cufio:135 cuFile STATS VERSION : 4
GLOBAL STATS:
Total Files: 1
Total Read Errors : 0
Total Read Size (MiB): 0
Read BandWidth (GiB/s): 0
Avg Read Latency (us): 0
Total Write Errors : 0
Total Write Size (MiB): 0
Write BandWidth (GiB/s): 0
Avg Write Latency (us): 0
Is there a special necessary alignment or any other requirements that I am not keeping in mind here?
If anyone is experiencing similar behaviour or has any idea what might be the problem, I would be happy to read about it!
Thanks in advance!