[ofa-general] [PATCH v3] iw_cxgb3: Support "iwarp-only" interfaces to avoid 4-tuple conflicts.
Sean Hefty
mshefty at ichips.intel.com
Thu Sep 27 11:38:39 PDT 2007
> The sysadmin creates "for iwarp use only" alias interfaces of the form
> "devname:iw*" where devname is the native interface name (eg eth0) for the
> iwarp netdev device. The alias label can be anything starting with "iw".
> The "iw" immediately after the ':' is the key used by the iw_cxgb3 driver.
I'm still not sure about this, but haven't come up with anything better
myself. And if there's a good chance of other rnic's needing the same
support, I'd rather see the common code separated out, even if just
encapsulated within this module for easy re-use.
As for the code, I have a couple of questions about whether deadlock and
a race condition are possible, plus a few minor comments.
> +static void insert_ifa(struct iwch_dev *rnicp, struct in_ifaddr *ifa)
> +{
> + struct iwch_addrlist *addr;
> +
> + addr = kmalloc(sizeof *addr, GFP_KERNEL);
> + if (!addr) {
> + printk(KERN_ERR MOD "%s - failed to alloc memory!\n",
> + __FUNCTION__);
> + return;
> + }
> + addr->ifa = ifa;
> + mutex_lock(&rnicp->mutex);
> + list_add_tail(&addr->entry, &rnicp->addrlist);
> + mutex_unlock(&rnicp->mutex);
> +}
Should this return success/failure?
> +static int nb_callback(struct notifier_block *self, unsigned long event,
> + void *ctx)
> +{
> + struct in_ifaddr *ifa = ctx;
> + struct iwch_dev *rnicp = container_of(self, struct iwch_dev, nb);
> +
> + PDBG("%s rnicp %p event %lx\n", __FUNCTION__, rnicp, event);
> +
> + switch (event) {
> + case NETDEV_UP:
> + if (netdev_is_ours(rnicp, ifa->ifa_dev->dev) &&
> + is_iwarp_label(ifa->ifa_label)) {
> + PDBG("%s label %s addr 0x%x added\n",
> + __FUNCTION__, ifa->ifa_label, ifa->ifa_address);
> + insert_ifa(rnicp, ifa);
> + iwch_listeners_add_addr(rnicp, ifa->ifa_address);
If insert_ifa() fails, what will iwch_listeners_add_addr() do? (I'm not
easily seeing the relationship between the address list and the listen
list at this point.)
> + }
> + break;
> + case NETDEV_DOWN:
> + if (netdev_is_ours(rnicp, ifa->ifa_dev->dev) &&
> + is_iwarp_label(ifa->ifa_label)) {
> + PDBG("%s label %s addr 0x%x deleted\n",
> + __FUNCTION__, ifa->ifa_label, ifa->ifa_address);
> + iwch_listeners_del_addr(rnicp, ifa->ifa_address);
> + remove_ifa(rnicp, ifa);
> + }
> + break;
> + default:
> + break;
> + }
> + return 0;
> +}
> +
> +static void delete_addrlist(struct iwch_dev *rnicp)
> +{
> + struct iwch_addrlist *addr, *tmp;
> +
> + mutex_lock(&rnicp->mutex);
> + list_for_each_entry_safe(addr, tmp, &rnicp->addrlist, entry) {
> + list_del(&addr->entry);
> + kfree(addr);
> + }
> + mutex_unlock(&rnicp->mutex);
> +}
> +
> +static void populate_addrlist(struct iwch_dev *rnicp)
> +{
> + int i;
> + struct in_device *indev;
> +
> + for (i = 0; i < rnicp->rdev.port_info.nports; i++) {
> + indev = in_dev_get(rnicp->rdev.port_info.lldevs[i]);
> + if (!indev)
> + continue;
> + for_ifa(indev)
> + if (is_iwarp_label(ifa->ifa_label)) {
> + PDBG("%s label %s addr 0x%x added\n",
> + __FUNCTION__, ifa->ifa_label,
> + ifa->ifa_address);
> + insert_ifa(rnicp, ifa);
> + }
> + endfor_ifa(indev);
> + }
> +}
> +
> static void rnic_init(struct iwch_dev *rnicp)
> {
> PDBG("%s iwch_dev %p\n", __FUNCTION__, rnicp);
> @@ -70,6 +187,12 @@ static void rnic_init(struct iwch_dev *r
> idr_init(&rnicp->qpidr);
> idr_init(&rnicp->mmidr);
> spin_lock_init(&rnicp->lock);
> + INIT_LIST_HEAD(&rnicp->addrlist);
> + INIT_LIST_HEAD(&rnicp->listen_eps);
> + mutex_init(&rnicp->mutex);
> + rnicp->nb.notifier_call = nb_callback;
> + populate_addrlist(rnicp);
> + register_inetaddr_notifier(&rnicp->nb);
>
> rnicp->attr.vendor_id = 0x168;
> rnicp->attr.vendor_part_id = 7;
> @@ -148,6 +271,8 @@ static void close_rnic_dev(struct t3cdev
> mutex_lock(&dev_mutex);
> list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
> if (dev->rdev.t3cdev_p == tdev) {
> + unregister_inetaddr_notifier(&dev->nb);
> + delete_addrlist(dev);
> list_del(&dev->entry);
> iwch_unregister_device(dev);
> cxio_rdev_close(&dev->rdev);
> diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
> index caf4e60..7fa0a47 100644
> --- a/drivers/infiniband/hw/cxgb3/iwch.h
> +++ b/drivers/infiniband/hw/cxgb3/iwch.h
> @@ -36,6 +36,8 @@ #include <linux/mutex.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> #include <linux/idr.h>
> +#include <linux/notifier.h>
> +#include <linux/inetdevice.h>
>
> #include <rdma/ib_verbs.h>
>
> @@ -101,6 +103,11 @@ struct iwch_rnic_attributes {
> u32 cq_overflow_detection;
> };
>
> +struct iwch_addrlist {
> + struct list_head entry;
> + struct in_ifaddr *ifa;
> +};
> +
> struct iwch_dev {
> struct ib_device ibdev;
> struct cxio_rdev rdev;
> @@ -111,6 +118,10 @@ struct iwch_dev {
> struct idr mmidr;
> spinlock_t lock;
> struct list_head entry;
> + struct notifier_block nb;
> + struct list_head addrlist;
> + struct list_head listen_eps;
The behavior of the listen lists is similar to what's done in the
rdma_cm: Wildcard listens are stored in a listen_any_list. When new
devices are added, associated listens are added to each device. This is
similar, except we're dealing with devices and addresses. I'm wondering
if we shouldn't mimic the same behavior and track listens in
iwch_addrlist directly. (I don't see anything wrong with this approach
though.)
What happens if an address changes between iwarp only and non-iwarp?
How are listens on specific addresses handled from an rdma_cm level?
Does the rdma_cm map the address to the device, call the iw_cm to
listen, which in turn calls the device listen function? The device then
checks that the address has been marked as iwarp only? (I'm being too
lazy to trace this through the code, but if you don't know off the top
of your head, I will do that.)
> + struct mutex mutex;
> };
>
> static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
> diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
> index 1cdfcd4..afc8a48 100644
> --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
> +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
> @@ -1127,23 +1127,149 @@ static int act_open_rpl(struct t3cdev *t
> return CPL_RET_BUF_DONE;
> }
>
> -static int listen_start(struct iwch_listen_ep *ep)
> +static int wait_for_reply(struct iwch_ep_common *epc)
> +{
> + PDBG("%s ep %p waiting\n", __FUNCTION__, epc);
> + wait_event(epc->waitq, epc->rpl_done);
> + PDBG("%s ep %p done waiting err %d\n", __FUNCTION__, epc, epc->rpl_err);
> + return epc->rpl_err;
> +}
What thread is being blocked here, and what sets the event?
> +
> +static struct iwch_listen_entry *alloc_listener(struct iwch_listen_ep *ep,
> + __be32 addr)
> +{
> + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device);
> + struct iwch_listen_entry *le;
> +
> + le = kmalloc(sizeof *le, GFP_KERNEL);
> + if (!le) {
> + printk(KERN_ERR MOD "%s - failed to alloc memory!\n",
> + __FUNCTION__);
> + return NULL;
> + }
> + le->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p,
> + &t3c_client, ep);
> + if (le->stid == -1) {
> + printk(KERN_ERR MOD "%s - cannot alloc stid.\n",
> + __FUNCTION__);
> + kfree(le);
> + return NULL;
> + }
> + le->addr = addr;
> + PDBG("%s stid %u addr %x port %x\n", __FUNCTION__, le->stid,
> + ntohl(le->addr), ntohs(ep->com.local_addr.sin_port));
> + return le;
> +}
> +
> +static void dealloc_listener(struct iwch_listen_ep *ep,
> + struct iwch_listen_entry *le)
> +{
> + PDBG("%s stid %u addr %x port %x\n", __FUNCTION__, le->stid,
> + ntohl(le->addr), ntohs(ep->com.local_addr.sin_port));
> + cxgb3_free_stid(ep->com.tdev, le->stid);
> + kfree(le);
> +}
> +
> +static void dealloc_listener_list(struct iwch_listen_ep *ep)
> +{
> + struct iwch_listen_entry *le, *tmp;
> + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device);
> +
> + mutex_lock(&h->mutex);
> + list_for_each_entry_safe(le, tmp, &ep->listeners, entry) {
> + list_del(&le->entry);
> + dealloc_listener(ep, le);
> + }
> + mutex_unlock(&h->mutex);
> +}
> +
> +static int alloc_listener_list(struct iwch_listen_ep *ep)
> +{
> + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device);
nit: in place of 'h' in several places, how about 'rnicp', which is also
used?
> + struct iwch_addrlist *addr;
> + struct iwch_listen_entry *le;
> + int err = 0;
> + int added=0;
> + mutex_lock(&h->mutex);
> + list_for_each_entry(addr, &h->addrlist, entry) {
> + if (ep->com.local_addr.sin_addr.s_addr == 0 ||
> + ep->com.local_addr.sin_addr.s_addr ==
> + addr->ifa->ifa_address) {
> + le = alloc_listener(ep, addr->ifa->ifa_address);
> + if (!le)
> + break;
> + list_add_tail(&le->entry, &ep->listeners);
> + added++;
> + }
> + }
> + mutex_unlock(&h->mutex);
> + if (ep->com.local_addr.sin_addr.s_addr != 0 && !added)
> + err = -EADDRNOTAVAIL;
> + if (!err && !added)
> + printk(KERN_WARNING MOD
> + "No RDMA interface found for device %s\n",
> + pci_name(h->rdev.rnic_info.pdev));
> + return err;
> +}
Adding some white space would improve readability.
> +
> +static int listen_stop_one(struct iwch_listen_ep *ep, unsigned int stid)
> {
> struct sk_buff *skb;
> - struct cpl_pass_open_req *req;
> + struct cpl_close_listserv_req *req;
> +
> + PDBG("%s stid %u\n", __FUNCTION__, stid);
> + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
> + if (!skb) {
> + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__);
> + return -ENOMEM;
> + }
> + req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req));
> + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
> + req->cpu_idx = 0;
> + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid));
> + skb->priority = 1;
> + ep->com.rpl_err = 0;
> + ep->com.rpl_done = 0;
> + cxgb3_ofld_send(ep->com.tdev, skb);
> + return wait_for_reply(&ep->com);
> +}
> +
> +static int listen_stop(struct iwch_listen_ep *ep)
> +{
> + struct iwch_listen_entry *le;
> + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device);
> + int err = 0;
>
> PDBG("%s ep %p\n", __FUNCTION__, ep);
> + mutex_lock(&h->mutex);
> + list_for_each_entry(le, &ep->listeners, entry) {
> + err = listen_stop_one(ep, le->stid);
This ends up blocking while holding a mutex, which looks like deadlock
potential.
> + if (err)
> + break;
> + }
> + mutex_unlock(&h->mutex);
> + return err;
> +}
> +
> +static int listen_start_one(struct iwch_listen_ep *ep, unsigned int stid,
> + __be32 addr, __be16 port)
> +{
> + struct sk_buff *skb;
> + struct cpl_pass_open_req *req;
> +
> + PDBG("%s stid %u addr %x port %x\n", __FUNCTION__, stid, ntohl(addr),
> + ntohs(port));
> skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
> if (!skb) {
> - printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n");
> + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__);
> return -ENOMEM;
> }
>
> req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req));
> req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
> - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid));
> - req->local_port = ep->com.local_addr.sin_port;
> - req->local_ip = ep->com.local_addr.sin_addr.s_addr;
> + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid));
> + req->local_port = port;
> + req->local_ip = addr;
> req->peer_port = 0;
> req->peer_ip = 0;
> req->peer_netmask = 0;
> @@ -1152,8 +1278,32 @@ static int listen_start(struct iwch_list
> req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
>
> skb->priority = 1;
> + ep->com.rpl_err = 0;
> + ep->com.rpl_done = 0;
> cxgb3_ofld_send(ep->com.tdev, skb);
> - return 0;
> + return wait_for_reply(&ep->com);
> +}
> +
> +static int listen_start(struct iwch_listen_ep *ep)
> +{
> + struct iwch_listen_entry *le;
> + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device);
> + int err = 0;
> +
> + PDBG("%s ep %p\n", __FUNCTION__, ep);
> + mutex_lock(&h->mutex);
> + list_for_each_entry(le, &ep->listeners, entry) {
> + err = listen_start_one(ep, le->stid, le->addr,
> + ep->com.local_addr.sin_port);
Similar to above - blocking while holding a mutex. There are a couple
of other places where this also occurs.
> + if (err)
> + goto fail;
> + }
> + mutex_unlock(&h->mutex);
> + return err;
> +fail:
> + mutex_unlock(&h->mutex);
> + listen_stop(ep);
> + return err;
> }
>
> static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
> @@ -1170,39 +1320,59 @@ static int pass_open_rpl(struct t3cdev *
> return CPL_RET_BUF_DONE;
> }
>
> -static int listen_stop(struct iwch_listen_ep *ep)
> -{
> - struct sk_buff *skb;
> - struct cpl_close_listserv_req *req;
> -
> - PDBG("%s ep %p\n", __FUNCTION__, ep);
> - skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
> - if (!skb) {
> - printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__);
> - return -ENOMEM;
> - }
> - req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req));
> - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
> - req->cpu_idx = 0;
> - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid));
> - skb->priority = 1;
> - cxgb3_ofld_send(ep->com.tdev, skb);
> - return 0;
> -}
> -
> static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb,
> void *ctx)
> {
> struct iwch_listen_ep *ep = ctx;
> struct cpl_close_listserv_rpl *rpl = cplhdr(skb);
>
> - PDBG("%s ep %p\n", __FUNCTION__, ep);
> + PDBG("%s ep %p stid %u\n", __FUNCTION__, ep, GET_TID(rpl));
> +
> ep->com.rpl_err = status2errno(rpl->status);
> ep->com.rpl_done = 1;
> wake_up(&ep->com.waitq);
> return CPL_RET_BUF_DONE;
> }
>
> +void iwch_listeners_add_addr(struct iwch_dev *rnicp, __be32 addr)
> +{
> + struct iwch_listen_ep *listen_ep;
> + struct iwch_listen_entry *le;
> +
> + mutex_lock(&rnicp->mutex);
> + list_for_each_entry(listen_ep, &rnicp->listen_eps, entry) {
> + if (listen_ep->com.local_addr.sin_addr.s_addr)
> + continue;
> + le = alloc_listener(listen_ep, addr);
> + if (le) {
> + list_add_tail(&le->entry, &listen_ep->listeners);
> + listen_start_one(listen_ep, le->stid, addr,
> + listen_ep->com.local_addr.sin_port);
> + }
> + }
> + mutex_unlock(&rnicp->mutex);
> +}
> +
> +void iwch_listeners_del_addr(struct iwch_dev *rnicp, __be32 addr)
> +{
> + struct iwch_listen_ep *listen_ep;
> + struct iwch_listen_entry *le, *tmp;
> +
> + mutex_lock(&rnicp->mutex);
> + list_for_each_entry(listen_ep, &rnicp->listen_eps, entry) {
> + if (listen_ep->com.local_addr.sin_addr.s_addr)
> + continue;
> + list_for_each_entry_safe(le, tmp, &listen_ep->listeners,
> + entry)
> + if (le->addr == addr) {
> + listen_stop_one(listen_ep, le->stid);
> + list_del(&le->entry);
> + dealloc_listener(listen_ep, le);
> + }
> + }
> + mutex_unlock(&rnicp->mutex);
> +}
> +
> static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb)
> {
> struct cpl_pass_accept_rpl *rpl;
> @@ -1767,8 +1937,7 @@ int iwch_accept_cr(struct iw_cm_id *cm_i
> goto err;
>
> /* wait for wr_ack */
> - wait_event(ep->com.waitq, ep->com.rpl_done);
> - err = ep->com.rpl_err;
> + err = wait_for_reply(&ep->com);
> if (err)
> goto err;
>
> @@ -1887,31 +2056,23 @@ int iwch_create_listen(struct iw_cm_id *
> ep->com.cm_id = cm_id;
> ep->backlog = backlog;
> ep->com.local_addr = cm_id->local_addr;
> + INIT_LIST_HEAD(&ep->listeners);
>
> - /*
> - * Allocate a server TID.
> - */
> - ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep);
> - if (ep->stid == -1) {
> - printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__);
> - err = -ENOMEM;
> + err = alloc_listener_list(ep);
> + if (err)
> goto fail2;
> - }
>
> state_set(&ep->com, LISTEN);
> err = listen_start(ep);
> - if (err)
> - goto fail3;
>
> - /* wait for pass_open_rpl */
> - wait_event(ep->com.waitq, ep->com.rpl_done);
> - err = ep->com.rpl_err;
> if (!err) {
> cm_id->provider_data = ep;
> + mutex_lock(&h->mutex);
> + list_add_tail(&ep->entry, &h->listen_eps);
> + mutex_unlock(&h->mutex);
Is there a race between listen_start() being called and inserting the ep
into the list? Could anything try to find the ep on the list after
listen_start returns?
> goto out;
> }
> -fail3:
> - cxgb3_free_stid(ep->com.tdev, ep->stid);
> + dealloc_listener_list(ep);
> fail2:
> cm_id->rem_ref(cm_id);
> put_ep(&ep->com);
> @@ -1923,18 +2084,20 @@ out:
> int iwch_destroy_listen(struct iw_cm_id *cm_id)
> {
> int err;
> + struct iwch_dev *h = to_iwch_dev(cm_id->device);
> struct iwch_listen_ep *ep = to_listen_ep(cm_id);
>
> PDBG("%s ep %p\n", __FUNCTION__, ep);
>
> might_sleep();
> + mutex_lock(&h->mutex);
> + list_del(&ep->entry);
> + mutex_unlock(&h->mutex);
> state_set(&ep->com, DEAD);
> ep->com.rpl_done = 0;
> ep->com.rpl_err = 0;
> err = listen_stop(ep);
> - wait_event(ep->com.waitq, ep->com.rpl_done);
> - cxgb3_free_stid(ep->com.tdev, ep->stid);
> - err = ep->com.rpl_err;
> + dealloc_listener_list(ep);
> cm_id->rem_ref(cm_id);
> put_ep(&ep->com);
> return err;
> diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
> index 6107e7c..23e5a22 100644
> --- a/drivers/infiniband/hw/cxgb3/iwch_cm.h
> +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
> @@ -162,10 +162,19 @@ struct iwch_ep_common {
> int rpl_err;
> };
>
> -struct iwch_listen_ep {
> - struct iwch_ep_common com;
> +struct iwch_listen_entry {
> + struct list_head entry;
> unsigned int stid;
> + __be32 addr;
> +};
> +
> +struct iwch_listen_ep {
> + struct iwch_ep_common com; /* Must be first entry! */
> + struct list_head entry;
> + struct list_head listeners;
> int backlog;
> + int listen_count;
I didn't notice where this was used.
> + int listen_rpls;
or this.
> };
>
> struct iwch_ep {
> @@ -222,6 +231,8 @@ int iwch_resume_tid(struct iwch_ep *ep);
> void __free_ep(struct kref *kref);
> void iwch_rearp(struct iwch_ep *ep);
> int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t);
> +void iwch_listeners_add_addr(struct iwch_dev *rnicp, __be32 addr);
> +void iwch_listeners_del_addr(struct iwch_dev *rnicp, __be32 addr);
>
> int __init iwch_cm_init(void);
> void __exit iwch_cm_term(void);
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>
More information about the general
mailing list