[libfabric-users] connection error using prov/verbs
Sergey Tatarintsev
s.tatarintsev at postgrespro.ru
Thu Jul 24 20:17:05 PDT 2025
Hello!
I'm trying to write a simple app that allows two RoCE hosts to
communicate using the verbs provider, but I'm getting an error during
connection:
libfabric:534529:1753411967::verbs:core:ofi_check_ep_attr():742<info>
Unsupported protocol version
I tried to debug it, and there are indeed differences between the prov and
user infos in ofi_check_ep_attr():
(gdb) p *user_info->ep_attr
$1 = {type = FI_EP_MSG, protocol = 0, protocol_version = 1, max_msg_size
= 1073741824, msg_prefix_size = 0, max_order_raw_size = 1073741824,
max_order_war_size = 0,
max_order_waw_size = 1073741824, mem_tag_format = 0, tx_ctx_cnt = 1,
rx_ctx_cnt = 1, auth_key_size = 0, auth_key = 0x0}
(gdb) p *prov_info->ep_attr
$2 = {type = FI_EP_MSG, protocol = 0, protocol_version = 0, max_msg_size
= 0, msg_prefix_size = 0, max_order_raw_size = 0, max_order_war_size = 0,
max_order_waw_size = 0, mem_tag_format = 0, tx_ctx_cnt = 0,
rx_ctx_cnt = 0, auth_key_size = 0, auth_key = 0x0}
Can anyone help me fix my mistake? The code is below.
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>
#include <netinet/in.h>
#include <pthread.h>
#include <sys/socket.h>
#include <unistd.h>

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_rma.h>
/*
 * report(res): when `res` is non-zero, print it together with its
 * fi_strerror() text and `return -1` from the enclosing function.
 *
 * The argument is evaluated exactly once.  The expansion has no trailing
 * semicolon, so `report(x);` is a single statement at the call site.
 */
#define report(res) \
    do { \
        int report_rc_ = (res); \
        if (report_rc_) { \
            /* pass the magnitude: libfabric calls fail with negative codes */ \
            printf("res=%i %s\n", report_rc_, \
                   fi_strerror(report_rc_ < 0 ? -report_rc_ : report_rc_)); \
            return -1; \
        } \
    } while (0)
/*
 * eval(exp): evaluate `exp` once, store the result in the variable named
 * `res` that is visible at the expansion site, and on a non-zero result
 * print the expression text, the code and its fi_strerror() text, then
 * `return -1` from the enclosing function.
 *
 * The expression text is passed as a "%s" argument rather than spliced
 * into the format string, so a '%' inside `exp` cannot be misread as a
 * conversion.  The expansion has no trailing semicolon.
 */
#define eval(exp) \
    do { \
        res = (exp); \
        if (res) { \
            /* pass the magnitude: libfabric calls fail with negative codes */ \
            printf("%s; res=%i %s\n", #exp, res, \
                   fi_strerror(res < 0 ? -res : res)); \
            return -1; \
        } \
    } while (0)
/*
 * File-scope state shared by main(), post_writemsg() and the two reader
 * threads.  Each object is declared exactly once; objects without an
 * explicit initializer are zero-initialized because they have static
 * storage duration.
 */

/* fi_getinfo() input (hints) and result list (info). */
struct fi_info *hints, *info;

/* Opened libfabric objects. */
struct fid_fabric *fabric;
struct fid_domain *domain;
struct fid_ep *ep;
struct fid_pep *pep;
struct fid_eq *eq;
struct fid_cq *cq;
struct fid_av *av;
struct fid_mr *mr;

/* Attribute blocks passed to the corresponding fi_*_open() calls. */
struct fi_eq_attr eq_attr = {0};
struct fi_cq_attr cq_attr = {0};
struct fi_av_attr av_attr = {0};

/* Output array for fi_av_insert(); entry 0 is used as the write target. */
fi_addr_t fi_addr[32];

/* Size in bytes of the registered data buffer `buf`. */
#define bufsize 32
char *buf;

/* Result slot written by the eval() macro. */
int res;

/* Peer address as parsed from the command line and as given to the AV. */
struct in_addr iaddr;
struct sockaddr_in addrs[2];

/* Handles of the CQ and EQ reader threads. */
pthread_t cqth, eqth;
void *
cq_thread(void *arg)
{
struct fi_cq_data_entry comp;
ssize_t ret;
struct fi_cq_err_entry err;
const char *err_str;
struct fi_eq_entry eq_entry;
uint32_t event;
while (1)
{
ret = fi_cq_sread(cq, &comp, 1, NULL, -1);
if (ret != 1) {
printf("fi_cq_sread() = %i", res);
continue;
}
printf("===> CQ=%lu (%lu)\n", comp.len, comp.flags);
}
return NULL;
}
void *
eq_thread(void *arg)
{
int ret;
struct fi_eq_cm_entry entry;
uint32_t event;
while(1)
{
ret = fi_eq_sread(eq, &event, &entry, sizeof (entry), -1, 0);
if (ret != sizeof (entry))
{
printf("\nfi_eq_sread err %i %s", ret, fi_strerror(ret));
continue;
}
printf("===> EQ=%i\n", event);
}
return NULL;
}
static void print_infos(struct fi_info *inf)
{
struct fi_info *cur;
int ret;
for (cur = inf; cur; cur = cur->next) {
printf("provider: %s\n", cur->fabric_attr->prov_name);
printf(" fabric: %s\n", cur->fabric_attr->name);
printf(" domain: %s\n", cur->domain_attr->name);
printf(" EP type: %i\n", cur->ep_attr->type);
printf(" proto: %i\n", cur->ep_attr->protocol);
printf(" proto version: %i\n", cur->ep_attr->protocol_version);
printf(" version: %d.%d\n",
FI_MAJOR(cur->fabric_attr->prov_version),
FI_MINOR(cur->fabric_attr->prov_version));
}
}
ssize_t post_writemsg(char *buf, size_t size, struct fi_rma_iov *remote)
{
struct fi_msg_rma msg;
struct iovec msg_iov;
struct fi_rma_iov rma_iov;
msg_iov.iov_base = buf;
msg_iov.iov_len = size;
msg.msg_iov = &msg_iov;
msg.desc = fi_mr_desc(mr);
msg.iov_count = 1;
rma_iov.addr = remote->addr;
rma_iov.len = size;
rma_iov.key = remote->key;
msg.rma_iov = &rma_iov;
msg.rma_iov_count = 1;
msg.addr = fi_addr[0];
msg.data = 5678;
msg.context = NULL;
return fi_writemsg(ep, &msg, FI_REMOTE_CQ_DATA);
}
int main(int argc, char *argv[])
{
char *host = argc>1?argv[1]:NULL;
buf = malloc(bufsize);
memset(buf, 'x', bufsize);
hints = fi_allocinfo();
hints->addr_format = FI_SOCKADDR;
hints->ep_attr->type = FI_EP_RDM;
hints->ep_attr->protocol = FI_PROTO_UNSPEC;
hints->domain_attr->mr_mode =
FI_MR_LOCAL|FI_MR_VIRT_ADDR|FI_MR_ALLOCATED|FI_MR_PROV_KEY;
hints->fabric_attr->prov_name = "verbs";
hints->caps = FI_RMA;
hints->mode = FI_CONTEXT | FI_RX_CQ_DATA;
eval(fi_getinfo(FI_VERSION(1, 8), host, "1234", 0, hints, &info));
print_infos(info);
eval(fi_fabric(info->fabric_attr, &fabric, NULL));
eq_attr.wait_obj = FI_WAIT_UNSPEC;
eval(fi_eq_open(fabric, &eq_attr, &eq, NULL));
eval(fi_domain(fabric, info, &domain, NULL));
cq_attr.format = FI_CQ_FORMAT_DATA;
cq_attr.wait_obj = FI_WAIT_UNSPEC;
cq_attr.wait_cond = FI_CQ_COND_NONE;
eval(fi_cq_open(domain, &cq_attr, &cq, NULL));
eval(fi_mr_reg(domain, buf, bufsize,
FI_REMOTE_READ|FI_REMOTE_WRITE|FI_SEND|FI_RECV, 0, 0, 0, &mr, NULL));
eval(fi_endpoint(domain, info, &ep, NULL));
eval(fi_ep_bind(ep, &eq->fid, 0));
eval(fi_ep_bind(ep, &cq->fid, FI_TRANSMIT|FI_RECV));
eval(fi_av_open(domain, &av_attr, &av, NULL));
if (host)
{
inet_aton(host, &iaddr);
addrs[0].sin_family = AF_INET;
addrs[0].sin_port = htons(1234);
addrs[0].sin_addr = iaddr;
av_attr.type = FI_AV_UNSPEC;
av_attr.count = 32;
if(fi_av_insert(av, addrs, 1, fi_addr, 0, NULL) !=1)
report(res);
}
eval(fi_ep_bind(ep, &av->fid, 0));
eval(fi_enable(ep));
printf("%s\n", host?"writemsg":"wait for client");
pthread_create(&cqth, NULL, cq_thread, NULL);
pthread_create(&eqth, NULL, eq_thread, NULL);
for (;;)
{
struct fi_rma_iov remote;
if (host)
{
remote.addr = (uint64_t)buf; // doesn't matter now
remote.key = fi_mr_key(mr); // doesn't matter now
remote.len = bufsize;
res = post_writemsg(buf, bufsize, &remote);
printf("(%i) %s\n",res, fi_strerror(res));
sleep(2);
}
}
}
--
With best regards,
Sergey Tatarintsev,
PostgresPro
More information about the Libfabric-users
mailing list