mlx5dv_devx_obj_create fails

I’m trying to create a completion queue using the DEVX API.

I get errno 121 (Remote I/O error). According to the man page, this means the command status and syndrome should be checked. The values I get are:

  • status (0x5)
  • syndrome (0xd544f)

The status seems to correspond to “BAD_RESOURCE Attempt to access reserved or unallocated resource, or resource in inappropriate status. for example, not existing CQ when creating SQ/RQ”

However I have been unable to find that syndrome.

I only get these errors if I use MPI elsewhere in my application. Does anyone have any suggestions about what the syndrome means or how to interpret the status?

int my_create_dv_cq(const struct ibv_context *ibctx, struct my_dv_cq *dvcq)
{
	uint32_t in[DEVX_ST_SZ_DW(create_cq_in)] = {};
	uint32_t out[DEVX_ST_SZ_DW(create_cq_out)] = {};
	void *cqc = DEVX_ADDR_OF(create_cq_in, in, cq_context);
	struct mlx5_cqe64 *cqe;
	uint32_t eqn;
	int i, err;
	int size;

	dvcq->cqe_sz = 64;
	dvcq->ncqe = 1 << PP_MAX_LOG_CQ_SIZE;

	err = mlx5dv_devx_query_eqn(ibctx, 0, &eqn);
	if (err) {
		fprintf(stderr, "devx_query_eqn failed: %d, errno %d\n", ret, errno);
		return err;
	}

	err = posix_memalign((void **)&dvcq->db, 8, 8);
	if (err) {
		fprintf(stderr,"cq.db posix_memalign(8) failed\n");
		return err;
	}

	dvcq->db[0] = 0;
	dvcq->db[1] = 0;

	dvcq->db_umem = mlx5dv_devx_umem_reg(ibctx, dvcq->db, 8, MY_ACCESS_FALGS);
	if (!dvcq->db_umem) {
		fprintf(stderr,"cq.db umem_reg() failed\n");
    return -1;
	}

	size = roundup_pow_of_two(dvcq->cqe_sz * dvcq->ncqe);
	dvcq->buflen = align(size, sysconf(_SC_PAGESIZE));
	err = posix_memalign(&dvcq->buf, sysconf(_SC_PAGESIZE), dvcq->buflen);
	if (err) {
		fprintf(stderr,"cq.buf posix_memalign(0x%lx) failed\n", dvcq->buflen);
    return err;
	}

	memset(dvcq->buf, 0, dvcq->buflen);
	dvcq->buff_umem = mlx5dv_devx_umem_reg(ibctx, dvcq->buf,
					                               dvcq->buflen, PP_ACCESS_FALGS);
	if (!dvcq->buff_umem) {
		fprintf(stderr,"cq.buf umem_reg(0x%lx) failed\n", dvcq->buflen);
    return -1;
	}

	dvcq->uar = mlx5dv_devx_alloc_uar(ibctx, MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC);
	if (!dvcq->uar) {
		fprintf(stderr,"mlx5dv_devc_alloc_uar failed\n");
    return -1;
	}

	DEVX_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
	DEVX_SET(create_cq_in, in, cq_umem_id, dvcq->buff_umem->umem_id);
	DEVX_SET(create_cq_in, in, cq_umem_valid, 1);

	DEVX_SET(cqc, cqc, log_cq_size, PP_MAX_LOG_CQ_SIZE);
	DEVX_SET(cqc, cqc, cqe_sz, 0);
	DEVX_SET(cqc, cqc, uar_page, dvcq->uar->page_id);
	DEVX_SET(cqc, cqc, c_eqn, eqn);

	DEVX_SET64(cqc, cqc, dbr_umem_id, dvcq->db_umem->umem_id);
	DEVX_SET64(cqc, cqc, dbr_umem_valid, 1);
	DEVX_SET64(cqc, cqc, log_page_size, 0);
	DEVX_SET64(cqc, cqc, page_offset, 0);

	dvcq->obj = mlx5dv_devx_obj_create(ibctx, in, sizeof(in), out, sizeof(out));
	if (!dvcq->obj) {
    uint32_t syndrome;
    uint8_t  status;
    uint16_t opcode;
    uint16_t op_mod;

    status = DEVX_GET(mbox_out, out, status);
    syndrome = DEVX_GET(mbox_out, out, syndrome);
    opcode = DEVX_GET(mbox_in, in, opcode);
    op_mod = DEVX_GET(mbox_in, in, op_mod);

		fprintf(stderr,"devx_obj_create(cq) failed: eqn %d\n mlx5_code(0x%x), op_mod(0x%x) failed, status (0x%x), syndrome (0x%x)\n",
        eqn, opcode, op_mod, status, syndrome);
    return -1;
	}

	dvcq->cqn = DEVX_GET(create_cq_out, out, cqn);
	printf("dv: CQ %d created, eqn %d, db@%p, buf@%p\n",
	       dvcq->cqn, eqn, dvcq->db, dvcq->buf);

	dvcq->cons_index = 0;
	for (i = 0; i < dvcq->ncqe; i++) {
		cqe = pp_dv_get_cqe(dvcq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4;
	}

	return 0;
}

Hello 12yht2,

Thank you for posting your inquiry to the NVIDIA Developer Forums.

It is likely that these details are not publicly available due to proprietary content.
Programming assistance is out of scope for Enterprise Support as well.

For this type of query, we recommend reaching out to our Sales and Solutions team. They will be able to direct you to resources within NVIDIA who can assist you with this inquiry:

https://www.nvidia.com/en-us/contact/sales/

Thanks, and best regards,
NVIDIA Enterprise Experience

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.