[libfabric-users] verbs provider completion queue: "fi_cq_sread:Resource temporarily unavailable"

Andreas webmaster at i-need-change.org
Sat Dec 8 07:00:46 PST 2018


Little update:
I extended the error handling for the CQ.
The process fails while establishing the final stage of the handshake, where the server hands its RDMA keys to the client.
On the client side:

ssize_t rret;
    rret = fi_recv(ep, ctrl_buff, sizeof(keys), fi_mr_desc(mr), 0, NULL);
    if (rret) {
        perror("fi_recv");
        return (int)rret;
    }


    ret = fi_connect(ep, fi->dest_addr, NULL, 0);
    if (ret) {
        perror("fi_connect");
        return ret;
    }

    struct fi_eq_cm_entry entry;
    uint32_t event;

    rret = fi_eq_sread(eq, &event, &entry, sizeof(entry), -1, 0);
    if (rret > 0){
        if (event != FI_CONNECTED) {
            fprintf(stderr, "invalid event %u\n", event);
            return -1;
        }
    }
    else if (rret != -FI_EAGAIN) {
        struct fi_eq_err_entry err_entry;
        fi_eq_readerr(eq, &err_entry, 0);
        printf("[%d] %s %s \n", thread, fi_strerror(err_entry.err), fi_eq_strerror(eq, err_entry.prov_errno, err_entry.err_data, NULL, 0));
        return ret;
    }



    struct fi_cq_msg_entry comp;
    ret = fi_cq_sread(cq, &comp, 1, NULL, -1);
    if (ret != 1) {
		struct fi_cq_err_entry err_entry;
        fi_cq_readerr(cq, &err_entry, 0);
        printf("[%d] %s %s \n", thread, fi_strerror(err_entry.err), fi_cq_strerror (cq, err_entry.prov_errno, err_entry.err_data, NULL, 0));
        perror("fi_cq_sread");
        return ret;
    }

It prints:
Input/output error,  local protection error

Meanwhile, on the server side:

ret = fi_accept(ep, NULL, 0);
        if (ret) {
            perror("fi_accept");
            return ret;
        }

        rret = fi_eq_sread(eq, &event, &entry, sizeof(entry), -1, 0);
        if (rret > 0) {
            if (event != FI_CONNECTED) {
                fprintf(stderr, "invalid event %u\n", event);
                return (int) rret;
            }
        } else if (rret != -FI_EAGAIN) {
            struct fi_eq_err_entry err_entry;
            fi_eq_readerr(eq, &err_entry, 0);
            printf("%s %s \n", fi_strerror(err_entry.err),
                   fi_eq_strerror(eq, err_entry.prov_errno, err_entry.err_data, NULL, 0));
            return (int) rret;
        }

        memcpy(ctrl_buff, &keys, sizeof(keys));

        rret = fi_send(ep, ctrl_buff, sizeof(keys), fi_mr_desc(mr), 0, NULL);
        if (rret) {
            printf("fi_send: %s\n", fi_strerror((int) rret));
            return (int) rret;
        }

        struct fi_cq_msg_entry comp;
		ret = fi_cq_sread(cq, &comp, 1, NULL, -1);
        if (ret < 1) {
            struct fi_cq_err_entry err_entry;
			fi_cq_readerr(cq, &err_entry, 0);
			printf("[%d] %s %s \n", thread, fi_strerror(err_entry.err), fi_cq_strerror(cq, err_entry.prov_errno, err_entry.err_data, NULL, 0));
			perror("fi_cq_sread");
			return ret;
        }

It prints:
Input/output error remote operation error

Von: Andreas
Gesendet: Saturday, December 8, 2018 2:57 PM
An: libfabric-users at lists.openfabrics.org
Betreff: [libfabric-users] verbs provider completion queue: "fi_cq_sread:Resource temporarily unavailable"


Hello everyone,

I could use some help with this issue. 
This is the little project I’m working on: https://github.com/germanafro/Libfabric_Benchmark

It is a benchmark that I hope will one day achieve bandwidths of up to 1 TB/s, though apparently I’m still struggling with the basic setup.

I started testing and implementing at home with the sockets provider. So far everything seemed to work just fine; once I moved to my university’s test cluster, the sockets provider still does what it’s supposed to do, but the transition to the verbs provider runs me into one bump after another :/
The test cluster itself consists of ~40 Linux nodes connected through Ethernet and InfiniBand switches.

The most recent issue is: fi_cq_sread: Resource temporarily unavailable
fi_cq_open(…) and fi_ep_bind(…) seem to be fine.

Could it be my config or is this actually a temporary network issue? I tested several clusters.

//Completion queue config
                cq_attr.size = 0;
                cq_attr.flags = 0;
                cq_attr.format = FI_CQ_FORMAT_MSG;
                cq_attr.wait_obj = FI_WAIT_UNSPEC;
                cq_attr.signaling_vector = 0;
                cq_attr.wait_cond = FI_CQ_COND_NONE;
                cq_attr.wait_set = NULL;


I appreciate the help and time,

Andreas


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/libfabric-users/attachments/20181208/2e574413/attachment-0001.html>


More information about the Libfabric-users mailing list