"cuFileBufRegister" fails for some buffer sizes

Hi,
I have been using GDS and the cuFile API on a DGX A100 for quite a few applications and am generally quite happy with the results.
However, I often run into problems with calls to “cuFileBufRegister”.
It fails for some of the buffer sizes I would like to register; e.g., it might work for 64 KiB (and multiples of that), but will fail for 96 KiB.
The error is always the same: 5030, CU_FILE_INTERNAL_ERROR.
Here is an example using the slightly changed official sample code (only changed offsets, sizes and flags for open):

// https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#sample-program
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <cuda_runtime.h>
#include "cufile.h"

//#include "cufile_sample_utils.h"
using namespace std;

int main(void) {
        int fd;
        ssize_t ret;
        void *devPtr_base;
        off_t file_offset = 0x2000;
        off_t devPtr_offset = 0;
        ssize_t IO_size =(1UL << 16) + (1UL << 15) ;
        size_t buff_size = IO_size;
        // off_t devPtr_offset = 0x1000;
        // ssize_t IO_size = 1UL << 24;
        // size_t buff_size = IO_size + 0x1000;
        CUfileError_t status;
        // CUResult cuda_result;
        int cuda_result;
        CUfileDescr_t cf_descr;
        CUfileHandle_t cf_handle;
        char *testfn;
        
        testfn=getenv("TESTFILE");
        if (testfn==NULL) {
            std::cerr << "No testfile defined via TESTFILE.  Exiting." << std::endl;
            return -1;
        } 
       
        cout << std::endl; 
        cout << "Opening File " << testfn << std::endl;

        fd = open(testfn, O_RDWR|O_DIRECT);
        if(fd < 0) {
                std::cerr << "file open " << testfn << "errno " << errno << std::endl;
                return -1;
        }

        cout << "Opening cuFileDriver." << std::endl;
        status = cuFileDriverOpen();
        if (status.err != CU_FILE_SUCCESS) {
                std::cerr << " cuFile driver failed to open " << std::endl;
                close(fd);
                return -1;
        }

        cout << "Registering cuFile handle to " << testfn << "." << std::endl;

        memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
        cf_descr.handle.fd = fd;
        cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
        status = cuFileHandleRegister(&cf_handle, &cf_descr);
        if (status.err != CU_FILE_SUCCESS) {
                std::cerr << "cuFileHandleRegister fd " << fd << " status " << status.err << std::endl;
                close(fd);
                return -1;
        }

        cout << "Allocating CUDA buffer of " << buff_size << " bytes." << std::endl;

        cuda_result = cudaMalloc(&devPtr_base, buff_size);
        if (cuda_result != CUDA_SUCCESS) {
                std::cerr << "buffer allocation failed " << cuda_result << std::endl;
                cuFileHandleDeregister(cf_handle);
                close(fd);
                return -1;
        }

        cout << "Registering Buffer of " << buff_size << " bytes." << std::endl;
        status = cuFileBufRegister(devPtr_base, buff_size, 0);
        if (status.err != CU_FILE_SUCCESS) {
                std::cerr << "buffer registration failed " << status.err << std::endl;
                cuFileHandleDeregister(cf_handle);
                close(fd);
                cudaFree(devPtr_base);
                return -1;
        }

        // fill a pattern
        cout << "Filling memory." << std::endl;

        cudaMemset((void *) devPtr_base, 0xab, buff_size);

        // perform write operation directly from GPU mem to file
        cout << "Writing buffer to file." << std::endl;
        ret = cuFileWrite(cf_handle, devPtr_base, IO_size, file_offset, devPtr_offset);

        if (ret < 0 || ret != IO_size) {
                std::cerr << "cuFileWrite failed " << ret << std::endl;
        }

        // release the GPU memory pinning
        cout << "Releasing cuFile buffer." << std::endl;
        status = cuFileBufDeregister(devPtr_base);
        if (status.err != CU_FILE_SUCCESS) {
                std::cerr << "buffer deregister failed" << std::endl;
                cudaFree(devPtr_base);
                cuFileHandleDeregister(cf_handle);
                close(fd);
                return -1;
        }

        cout << "Freeing CUDA buffer." << std::endl;
        cudaFree(devPtr_base);
        // deregister the handle from cuFile
        cout << "Releasing file handle. " << std::endl;
        (void) cuFileHandleDeregister(cf_handle);
        close(fd);

        // release all cuFile resources
        cout << "Closing File Driver." << std::endl;
        (void) cuFileDriverClose();

        cout << std::endl; 

        return 0;
}
/*prints:
...
Allocating CUDA buffer of 98304 bytes.                                                                                                   
Registering Buffer of 98304 bytes.                                                                                                       
buffer registration failed 5030
*/

Here is the cufile log, including platform and library information:

 14-02-2023 15:46:02:59 [pid=358055 tid=358055] INFO   0:136 Lib being used for urcup concurrency :  liburcu-bp.so.6
 14-02-2023 15:46:02:60 [pid=358055 tid=358055] INFO   0:151 Lib being used for concurrency :  liburcu-cds.so.6
 14-02-2023 15:46:02:60 [pid=358055 tid=358055] INFO   cufio:320 Loaded successfully URCU library

 14-02-2023 15:46:02:60 [pid=358055 tid=358055] INFO   0:147 nvidia_fs driver open invoked
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:381 GDS release version: 1.0.2.10
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:384 nvidia_fs version:  2.12 libcufile version: 2.4
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:276 NVMe: driver support OK
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:316 WekaFS: driver support OK
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:493 nvidia_fs driver version check ok
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:276 NVMe: driver support OK
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:316 WekaFS: driver support OK
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:184 ============
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:185 ENVIRONMENT:
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:186 ============
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:199 =====================
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:200 DRIVER CONFIGURATION:
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:201 =====================
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:204 NVMe               : Supported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:205 NVMeOF             : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:206 SCSI               : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:207 ScaleFlux CSD      : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:208 NVMesh             : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:212 DDN EXAScaler      : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:216 IBM Spectrum Scale : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-drv:220 NFS                : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-rdma:1185 WekaFS             : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-rdma:1187 Userspace RDMA     : Unsupported
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-rdma:1196 --Mellanox PeerDirect : Enabled
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-rdma:1199 --rdma library        : Not Loaded (libcufile_rdma.so)
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-rdma:1202 --rdma devices        : Not configured
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-rdma:1205 --rdma_device_status  : Up: 0 Down: 0
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio:455 =====================
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio:456 CUFILE CONFIGURATION:
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio:457 =====================
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1114 properties.use_compat_mode : false
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1116 properties.gds_rdma_write_support : true
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1118 properties.use_poll_mode : false
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1120 properties.poll_mode_max_size_kb : 4
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1122 properties.max_batch_io_timeout_msecs : 5
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1124 properties.max_direct_io_size_kb : 16384
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1126 properties.max_device_cache_size_kb : 1048576
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1128 properties.max_device_pinned_mem_size_kb : 33554432
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1130 properties.posix_pool_slab_size_kb : 4 1024 16384 
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1132 properties.posix_pool_slab_count : 128 64 32 
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1134 properties.rdma_peer_affinity_policy : RoundRobin
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1136 properties.rdma_dynamic_routing : 0
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1142 fs.generic.posix_unaligned_writes : false
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1145 fs.lustre.posix_gds_min_kb: 0
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1159 fs.weka.rdma_write_support: false
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1195 profile.nvtx : false
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1197 profile.cufile_stats : 1
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   0:1200 miscellaneous.api_check_aggressive : false
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-plat:725 =========
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-plat:726 GPU INFO:
 14-02-2023 15:46:02:62 [pid=358055 tid=358055] INFO   cufio-plat:727 =========
 14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO   cufio-plat:368 GPU index 0 NVIDIA A100-SXM4-40GB bar:1 bar size (MiB):65536 supports GDS
 14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO   cufio-plat:381 Total GPUS supported on this platform 1
 14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO   cufio-plat:738 ==============
 14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO   cufio-plat:739 PLATFORM INFO:
 14-02-2023 15:46:02:63 [pid=358055 tid=358055] INFO   cufio-plat:740 ==============
 14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO   cufio-plat:490 ACS not enabled in GPU paths
 14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO   cufio-plat:631 IOMMU: enabled
 14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO   cufio-plat:781 Platform verification succeeded
 14-02-2023 15:46:02:65 [pid=358055 tid=358055] INFO   cufio-px-pool:443 POSIX pool buffer initialization complete
 14-02-2023 15:46:02:81 [pid=358055 tid=358055] INFO   curdma-ldbal:501 No RDMA devices configured,skipping RDMA load balancer initialization
 14-02-2023 15:46:02:81 [pid=358055 tid=358055] INFO   cufio:506 CUFile initialization complete
 14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO   cufio-fs:391 RAID member: nvme3n1 wwid: eui.345950304ec018740025384500000004
 14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO   cufio-fs:405 Block dev pci info: nvme3n1 numa node 1 pci bridge: 0000:40:01.1
 14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO   cufio-fs:391 RAID member: nvme4n1 wwid: eui.34595030522010810025384700000003
 14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO   cufio-fs:405 Block dev pci info: nvme4n1 numa node 7 pci bridge: 0000:80:01.1
 14-02-2023 15:46:02:85 [pid=358055 tid=358055] INFO   cufio-fs:391 RAID member: nvme5n1 wwid: eui.34595030522011210025384700000003
 14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO   cufio-fs:405 Block dev pci info: nvme5n1 numa node 5 pci bridge: 0000:b0:01.1
 14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO   cufio-fs:428 RAID Device /dev/md1 pci bridge 0000:40:01.1 cross QPI true has 3 devices nvme3n1,nvme4n1,nvme5n1
 14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO   cufio-udev:88 sysfs attribute not found device/transport md1
 14-02-2023 15:46:02:86 [pid=358055 tid=358055] INFO   cufio-udev:88 sysfs attribute not found wwid md1
 14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR  0:390 R/W shadow buffer memory allocation failed, size :  98304
 14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR  0:447 map failed

 14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR  cufio-obj:67 error allocating nvfs handle, size: 98304
 14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR  cufio:1155 cuFileBufRegister error, object allocation failed
 14-02-2023 15:46:02:207 [pid=358055 tid=358055] ERROR  cufio:1201 cuFileBufRegister error internal error
 14-02-2023 15:46:02:208 [pid=358055 tid=358055] INFO   cufio:135 cuFile STATS VERSION : 4
GLOBAL STATS:
Total Files: 1
Total Read Errors : 0
Total Read Size (MiB): 0
Read BandWidth (GiB/s): 0
Avg Read Latency (us): 0
Total Write Errors : 0
Total Write Size (MiB): 0
Write BandWidth (GiB/s): 0
Avg Write Latency (us): 0


Is there a special necessary alignment or any other requirements that I am not keeping in mind here?
If anyone is experiencing similar behaviour or has any idea what might be the problem, I would be happy to read about it!

Thanks in advance!

After some more testing, it seems that buffer sizes that are multiples of 64 KiB, as well as 64 KiB-aligned device pointers, are necessary for successful buffer registrations in my setup.

Although I can work around this, I would still be happy if somebody could point out to me why this is the case or where it is documented, so that I might avoid a problem like this in the future.

Thanks!

Hi there @nils123 and welcome back to the NVIDIA developer forums!

I am not certain whether this is a Hardware or rather a CUDA constraint, but I am sure the people over in the CUDA programming category will be able to answer that.

So I went ahead and moved your post over there, I hope that is ok with you.

Thanks!

1 Like