[libfabric-users] fi_cq_sread fails with "Resource temporarily unavailable"

Alisa Parashchenko ge24cuc at mytum.de
Wed May 1 05:28:56 PDT 2024


Hello,

I am new to Libfabric and trying to write some code that does RMAs. 
Currently, however, even reading from the completion queue after doing a 
regular fi_recv() is failing with "Resource temporarily unavailable".

Here is a minimal program that gets this error. Could someone tell me 
what I'm doing wrong? Setting FI_LOG_LEVEL=Debug didn't give any helpful 
information. I am on a regular Linux desktop, with Libfabric using its 
TCP provider, if that's relevant.

Regards,
Alisa

#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>

#define PANIC_NZ(a) if ((ret = a)) panic("" #a "", fi_strerror(ret));

static struct fi_info *info;
static struct fid_fabric *fabric;
static struct fid_domain *domain;
static struct fid_ep *ep;
static struct fi_av_attr av_attr = { 0 };
static struct fi_cq_attr cq_attr = { 0 };
static struct fi_eq_attr eq_attr = { 0 };
static struct fid_av *av;
static struct fid_cq *cq;
static struct fid_eq *eq;
int ret;

void panic(char *f, const char *msg) {
   fprintf(stderr, "%s failed: %s\n", f, msg);
   exit(1);
}

void hexdump(int len, void *buf) {
   for (int i = 0; i < len; i++) printf("%02hhx ", ((char*)buf)[i]);
   printf("\n");
}

int main(int argc, char **argv) {
   char *host = "localhost";
   int is_server = argc <= 1;
   char *port = is_server ? "1234" : "4321" ;

   /* Select fabric */
   struct fi_info *hints = fi_allocinfo();
   hints->ep_attr->type = FI_EP_RDM;
   hints->caps = FI_MSG | FI_RMA;
   PANIC_NZ(fi_getinfo(FI_VERSION(1,21), host, port, FI_SOURCE, hints, 
&info));
   printf("Selected fabric \"%s\", domain \"%s\"\n",
       info->fabric_attr->name, info->domain_attr->name);
   fi_freeinfo(hints);

   /* Set up address vector */
   PANIC_NZ(fi_fabric(info->fabric_attr, &fabric, NULL));
   PANIC_NZ(fi_domain(fabric, info, &domain, NULL));
   av_attr.type = FI_AV_TABLE;
   av_attr.count = 2;
   PANIC_NZ(fi_av_open(domain, &av_attr, &av, NULL));

   /* Open the endpoint, bind it to an EQ, CQ, and AV*/
   PANIC_NZ(fi_endpoint(domain, info, &ep, NULL));
   cq_attr.wait_obj = FI_WAIT_UNSPEC;
   PANIC_NZ(fi_cq_open(domain, &cq_attr, &cq, NULL));
   PANIC_NZ(fi_eq_open(fabric, &eq_attr, &eq, NULL));
   PANIC_NZ(fi_ep_bind(ep, &av->fid, 0));
   PANIC_NZ(fi_ep_bind(ep, &cq->fid, FI_TRANSMIT|FI_RECV));
   PANIC_NZ(fi_ep_bind(ep, &eq->fid, 0));
   PANIC_NZ(fi_enable(ep));

   /* Get the address of the endpoint */
   char fi_addr[160];
   size_t fi_addrlen = 160;
   PANIC_NZ(fi_getname(&ep->fid, fi_addr, &fi_addrlen));
   printf("Got libfabric EP addr of length %zu:\n", fi_addrlen);
   hexdump(fi_addrlen, fi_addr);

   /* Insert own address and peer's address into AV */
   ret = fi_av_insert(av, fi_addr, 1, NULL, 0, NULL);
   assert(ret == 1);
   /* Obviously not the right way to do this, but the shortest way */
   char *peer_port = is_server ? "\x10\xe1" : "\x04\xd2";
   memcpy(fi_addr + 2, peer_port, 2);
   ret = fi_av_insert(av, fi_addr, 1, NULL, 0, NULL);
   assert(ret == 1);

   /* Try to exchange a message */
   if (is_server) {
     char buf[6];
     char cq_buf[128];
     PANIC_NZ(fi_recv(ep, buf, 5, NULL, 1, NULL));
     ret = fi_cq_sread(cq, cq_buf, 1, NULL, 0);
     if (ret < 0) panic("fi_cq_sread", fi_strerror(ret));
     printf("Got message: %s\n", buf);
   } else {
     char buf[6] = "Hello";
     PANIC_NZ(fi_inject(ep, buf, 6, 1));
   }

   fi_close((struct fid *) ep);
   fi_close((struct fid *) av);
   fi_close((struct fid *) eq);
   fi_close((struct fid *) cq);
   fi_close((struct fid *) domain);
   fi_close((struct fid *) fabric);
   fi_freeinfo(info);
   return 0;
}



More information about the Libfabric-users mailing list