GPUDirect Storage: "Non-registered case, not yet implemented" error

I have the following CUDA code:

#include <fstream>
#include <iostream>
#include <vector>
#include <cstring>

#include <fcntl.h>
#include <sys/types.h>
#include <errno.h>
#include <unistd.h>

#include <thrust/device_vector.h>
#include <cufile.h>

// Minimal reproduction of a cuFile batch read.
//
// Steps:
//   1. Create "temp.bin" containing 4100096 zero bytes.
//   2. Open it with O_RDONLY | O_DIRECT.
//   3. Open the cuFile driver, register the file handle, and register an
//      8192-byte device buffer.
//   4. Submit a batch of two 4096-byte reads (file offsets 3948544 and 929792)
//      into the two halves of the device buffer.
//   5. Poll cuFileBatchIOGetStatus until both entries are reported.
//   6. Release everything in reverse order of acquisition.
//
// Returns 0 when every step succeeded, 1 otherwise.
//
// Every local that lives until the end of main is declared before the first
// `goto`, so no jump to a cleanup label bypasses an initialization.
int main(){
    // Create the input file; the scope closes (and flushes) the stream before
    // the file is reopened below.
    {
        std::ofstream fout("temp.bin", std::ios::binary);
        for(std::size_t i = 0; i < 4100096; ++i){
            fout.write("\0", 1);
        }
    }

    // Pessimistic default; only set to 0 once the whole sequence succeeded.
    int exit_status = 1;

    CUfileDescr_t desc{};
    CUfileHandle_t handle{};

    // Device-side destination for both reads (2 x 4096 bytes).
    thrust::device_vector<unsigned char> data(8192, 0);

    std::vector<CUfileIOParams_t> io_params;

    CUfileBatchHandle_t batch_handle;

    std::vector<CUfileIOEvents_t> io_events(2);

    unsigned total_completed = 0;

    int fd = ::open("temp.bin", O_RDONLY | O_DIRECT);
    if(fd == -1){
        auto err = errno;
        std::cerr << "::open errno=" << err << std::endl;
        return 1;
    }

    {
        CUfileError_t status = ::cuFileDriverOpen();
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileDriverOpen status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
            // The driver was never opened, so only the file descriptor has to
            // be released.
            goto do_close;
        }
    }

    desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    desc.handle.fd = fd;
    {
        CUfileError_t status = ::cuFileHandleRegister(&handle, &desc);
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileHandleRegister status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
            // The driver is open at this point and must still be closed; the
            // handle itself was not registered.
            goto do_driver_close;
        }
    }

    {
        CUfileError_t status = ::cuFileBufRegister(static_cast<const void *>(thrust::raw_pointer_cast(data.data())), 8192, 0);
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileBufRegister status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
            goto do_file_deregister;
        }
    }

    // First read: 4096 bytes from file offset 3948544 into buffer offset 0.
    io_params.push_back({
        .mode = cufileBatchMode::CUFILE_BATCH,
        .u = {
            .batch = {
                .devPtr_base = static_cast<void *>(thrust::raw_pointer_cast(data.data())),
                .file_offset = 3948544,
                .devPtr_offset = 0,
                .size = 4096
            }
        },
        .fh = handle,
        .opcode = CUfileOpcode::CUFILE_READ,
        .cookie = nullptr
    });
    // Second read: 4096 bytes from file offset 929792 into buffer offset 4096.
    io_params.push_back({
        .mode = cufileBatchMode::CUFILE_BATCH,
        .u = {
            .batch = {
                .devPtr_base = static_cast<void *>(thrust::raw_pointer_cast(data.data())),
                .file_offset = 929792,
                .devPtr_offset = 4096,
                .size = 4096
            }
        },
        .fh = handle,
        .opcode = CUfileOpcode::CUFILE_READ,
        .cookie = nullptr
    });
    // Cookies point back at their own entry. They are assigned only after the
    // last push_back, so the element addresses are not invalidated by a later
    // reallocation of the vector.
    for(auto &io_param : io_params){
        io_param.cookie = static_cast<void *>(&io_param);
    }

    {
        CUfileError_t status = ::cuFileBatchIOSetUp(&batch_handle, io_params.size());
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileBatchIOSetUp status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
            goto do_buf_deregister;
        }
    }
    {
        CUfileError_t status = ::cuFileBatchIOSubmit(batch_handle, io_params.size(), io_params.data(), 0);
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileBatchIOSubmit status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
            goto do_batch_destroy;
        }
    }

    // Start from an all-zero events array before polling.
    (void)std::memset(static_cast<void *>(io_events.data()), 0, io_events.size() * sizeof(CUfileIOEvents_t));

    // Poll until the number of reported entries reaches the number submitted.
    // NOTE(review): `nr` is treated as the count reported by this call; the
    // per-event status in io_events is not inspected here - confirm whether
    // failed entries should be detected.
    while(total_completed < io_params.size()){
        unsigned nr = io_params.size();
        CUfileError_t status = ::cuFileBatchIOGetStatus(batch_handle, io_params.size(), &nr, io_events.data(), nullptr);
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileBatchIOGetStatus status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
            goto do_batch_destroy;
        }
        total_completed += nr;
    }

    exit_status = 0;

    // Cleanup ladder: each label releases one resource and falls through to
    // the labels below it, i.e. reverse order of acquisition.
    do_batch_destroy:
    ::cuFileBatchIODestroy(batch_handle);
    do_buf_deregister:
    {
        CUfileError_t status = ::cuFileBufDeregister(static_cast<const void *>(thrust::raw_pointer_cast(data.data())));
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileBufDeregister status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
        }
    }
    do_file_deregister:
    ::cuFileHandleDeregister(handle);
    do_driver_close:
    {
        CUfileError_t status = ::cuFileDriverClose();
        if(status.err != CU_FILE_SUCCESS){
            std::cerr << "::cuFileDriverClose status={.err=" << status.err << ", .cu_err=" << status.cu_err << "}" << std::endl;
        }
    }
    do_close:
    {
        int ret = ::close(fd);
        if(ret == -1){
            auto err = errno;
            std::cerr << "::close errno=" << err << std::endl;
        }
    }
    return exit_status;
}

I compile it with the command nvcc -std=c++17 -g -o temp1_cu temp1.cu /usr/local/cuda-11.7/targets/x86_64-linux/lib/libcufile.so.

I would expect the program to run and exit normally when I execute it. However, I got the following output instead:

::cuFileBatchIOSubmit status={.err=5022, .cu_err=0}

And the generated cufile.log contains the following content:

 19-05-2023 18:52:43:699 [pid=4194 tid=4194] WARN   0:172 failed to open /proc/driver/nvidia-fs/devcount  error: No such file or directory
 19-05-2023 18:52:43:699 [pid=4194 tid=4194] NOTICE  cufio-drv:705 running in compatible mode
 19-05-2023 18:52:43:799 [pid=4194 tid=4194] ERROR  cufio_batch:218 Non-registered case, not yet implemented
 19-05-2023 18:52:43:799 [pid=4194 tid=4194] ERROR  cufio_batch:352 Error while submitting IO events 0 errno:  0
 19-05-2023 18:52:43:799 [pid=4194 tid=4194] ERROR  0:258 Batch Ctx state transition failed in cuFileBatchGetContextFromID
 19-05-2023 18:52:43:799 [pid=4194 tid=4194] ERROR  cufio_batch:597 Batch state is not in IDLE state, try destroy later

My question is: what kind of input to ::cuFileBatchIOSubmit would trigger such an error? I read the GDS API documentation but could not find any clues.

My OS is RHEL 8 with CUDA 11.7 installed through the .run file, along with NVIDIA driver 515.65.01 and GDS 1.3.1. My host compiler is GCC 11, provided by the gcc-toolset-11 system package. I don’t have any GDS-related environment variables set, and no /etc/cufile.json exists in my environment.