[ofa-general] Re: [PATCH 2/2] IB/IPoIB: Separate IB events to groups and handle each according to level of severity

Olga Shern (Voltaire) olga.shern at gmail.com
Thu May 22 08:28:19 PDT 2008


On 5/22/08, Hal Rosenstock <hrosenstock at xsigo.com> wrote:
>
> Moni,
>
> On Thu, 2008-05-22 at 16:58 +0300, Moni Shoua wrote:
> > Hal, Roland
> > Thanks for the comments. The patch below tries to address the issues that
> were
> > raised in its previous form. Please note that I'm only asking for opinion
> for now.
> > If the idea is acceptable then I will recreate more elegant patch with
> the required
> > fixes if any and with respect to previous comments (such as replacing 0,1
> and 2 with
> > textual names).
> >
> > The idea in few words is to flush only paths but keeping address handles
> in ipoib_neigh.
> > This will trigger a new path lookup when an ARP probe arrives and
> eventually an addess
> > handle renewal. In the meantime, the old address handle is kept and can
> be used. In most
> > cases this address handle is a valid address handle and when it is not
> than the situatio
> > is not worse than before.
>
> This part seems OK to me.
>
> > My tests show that this patch completes the improvement that was archived
> with patch #1
> > to zero packet loss (tested with ping flood) when SM change event occurs.
>
> Looks to me like SM change is still "level 0". I may have missed it but
> I don't see how this addresses the general architectural concerns
> previously raised. This patch may work in your test environment but I
> don't think that covers all the cases.


Hal,
You pointed out that we cannot rely on the assumption that on SM failover
there is not path change.
In the previous patch we only flush multicast.
What Moni changed in this patch is that on SM failover (SM change event),
we will flush not only multicast but also all paths but without destroying
ah.

Olga



-- Hal
>
> > thanks
> >
> >  MoniS
> >
> >
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
> b/drivers/infiniband/ulp/ipoib/ipoib.h
> > index ca126fc..8ef6573 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > @@ -276,10 +276,11 @@ struct ipoib_dev_priv {
> >
> >       struct delayed_work pkey_poll_task;
> >       struct delayed_work mcast_task;
> > -     struct work_struct flush_task;
> > +     struct work_struct flush_task0;
> > +     struct work_struct flush_task1;
> > +     struct work_struct flush_task2;
> >       struct work_struct restart_task;
> >       struct delayed_work ah_reap_task;
> > -     struct work_struct pkey_event_task;
> >
> >       struct ib_device *ca;
> >       u8                port;
> > @@ -423,11 +424,14 @@ void ipoib_send(struct net_device *dev, struct
> sk_buff *skb,
> >               struct ipoib_ah *address, u32 qpn);
> >  void ipoib_reap_ah(struct work_struct *work);
> >
> > +void ipoib_flush_paths_only(struct net_device *dev);
> >  void ipoib_flush_paths(struct net_device *dev);
> >  struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
> >
> >  int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int
> port);
> > -void ipoib_ib_dev_flush(struct work_struct *work);
> > +void ipoib_ib_dev_flush0(struct work_struct *work);
> > +void ipoib_ib_dev_flush1(struct work_struct *work);
> > +void ipoib_ib_dev_flush2(struct work_struct *work);
> >  void ipoib_pkey_event(struct work_struct *work);
> >  void ipoib_ib_dev_cleanup(struct net_device *dev);
> >
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > index f429bce..5a6bbe8 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > @@ -898,7 +898,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct
> ib_device *ca, int port)
> >       return 0;
> >  }
> >
> > -static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int
> pkey_event)
> > +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)
> >  {
> >       struct ipoib_dev_priv *cpriv;
> >       struct net_device *dev = priv->dev;
> > @@ -911,7 +911,7 @@ static void __ipoib_ib_dev_flush(struct
> ipoib_dev_priv *priv, int pkey_event)
> >        * the parent is down.
> >        */
> >       list_for_each_entry(cpriv, &priv->child_intfs, list)
> > -             __ipoib_ib_dev_flush(cpriv, pkey_event);
> > +             __ipoib_ib_dev_flush(cpriv, level);
> >
> >       mutex_unlock(&priv->vlan_mutex);
> >
> > @@ -925,7 +925,7 @@ static void __ipoib_ib_dev_flush(struct
> ipoib_dev_priv *priv, int pkey_event)
> >               return;
> >       }
> >
> > -     if (pkey_event) {
> > +     if (level == 2) {
> >               if (ib_find_pkey(priv->ca, priv->port, priv->pkey,
> &new_index)) {
> >                       clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> >                       ipoib_ib_dev_down(dev, 0);
> > @@ -943,11 +943,13 @@ static void __ipoib_ib_dev_flush(struct
> ipoib_dev_priv *priv, int pkey_event)
> >               priv->pkey_index = new_index;
> >       }
> >
> > -     ipoib_dbg(priv, "flushing\n");
> > -
> > -     ipoib_ib_dev_down(dev, 0);
> > +     ipoib_flush_paths_only(dev);
> > +     ipoib_mcast_dev_flush(dev);
> > +
> > +     if (level >= 1)
> > +             ipoib_ib_dev_down(dev, 0);
> >
> > -     if (pkey_event) {
> > +     if (level >= 2) {
> >               ipoib_ib_dev_stop(dev, 0);
> >               ipoib_ib_dev_open(dev);
> >       }
> > @@ -957,29 +959,36 @@ static void __ipoib_ib_dev_flush(struct
> ipoib_dev_priv *priv, int pkey_event)
> >        * we get here, don't bring it back up if it's not configured up
> >        */
> >       if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
> > -             ipoib_ib_dev_up(dev);
> > +             if (level >= 1)
> > +                     ipoib_ib_dev_up(dev);
> >               ipoib_mcast_restart_task(&priv->restart_task);
> >       }
> >  }
> >
> > -void ipoib_ib_dev_flush(struct work_struct *work)
> > +void ipoib_ib_dev_flush0(struct work_struct *work)
> >  {
> >       struct ipoib_dev_priv *priv =
> > -             container_of(work, struct ipoib_dev_priv, flush_task);
> > +             container_of(work, struct ipoib_dev_priv, flush_task0);
> >
> > -     ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
> >       __ipoib_ib_dev_flush(priv, 0);
> >  }
> >
> > -void ipoib_pkey_event(struct work_struct *work)
> > +void ipoib_ib_dev_flush1(struct work_struct *work)
> >  {
> >       struct ipoib_dev_priv *priv =
> > -             container_of(work, struct ipoib_dev_priv, pkey_event_task);
> > +             container_of(work, struct ipoib_dev_priv, flush_task1);
> >
> > -     ipoib_dbg(priv, "Flushing %s and restarting its QP\n",
> priv->dev->name);
> >       __ipoib_ib_dev_flush(priv, 1);
> >  }
> >
> > +void ipoib_ib_dev_flush2(struct work_struct *work)
> > +{
> > +     struct ipoib_dev_priv *priv =
> > +             container_of(work, struct ipoib_dev_priv, flush_task2);
> > +
> > +     __ipoib_ib_dev_flush(priv, 2);
> > +}
> > +
> >  void ipoib_ib_dev_cleanup(struct net_device *dev)
> >  {
> >       struct ipoib_dev_priv *priv = netdev_priv(dev);
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > index 2442090..c41798d 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > @@ -259,6 +259,21 @@ static int __path_add(struct net_device *dev, struct
> ipoib_path *path)
> >       return 0;
> >  }
> >
> > +static void path_free_only(struct net_device *dev, struct ipoib_path
> *path)
> > +{
> > +     struct ipoib_dev_priv *priv = netdev_priv(dev);
> > +     struct ipoib_neigh *neigh, *tn;
> > +     struct sk_buff *skb;
> > +     unsigned long flags;
> > +
> > +     while ((skb = __skb_dequeue(&path->queue)))
> > +             dev_kfree_skb_irq(skb);
> > +
> > +     if (path->ah)
> > +             ipoib_put_ah(path->ah);
> > +
> > +     kfree(path);
> > +}
> >  static void path_free(struct net_device *dev, struct ipoib_path *path)
> >  {
> >       struct ipoib_dev_priv *priv = netdev_priv(dev);
> > @@ -350,6 +365,34 @@ void ipoib_path_iter_read(struct ipoib_path_iter
> *iter,
> >
> >  #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
> >
> > +void ipoib_flush_paths_only(struct net_device *dev)
> > +{
> > +     struct ipoib_dev_priv *priv = netdev_priv(dev);
> > +     struct ipoib_path *path, *tp;
> > +     LIST_HEAD(remove_list);
> > +
> > +     spin_lock_irq(&priv->tx_lock);
> > +     spin_lock(&priv->lock);
> > +
> > +     list_splice_init(&priv->path_list, &remove_list);
> > +
> > +     list_for_each_entry(path, &remove_list, list)
> > +             rb_erase(&path->rb_node, &priv->path_tree);
> > +
> > +     list_for_each_entry_safe(path, tp, &remove_list, list) {
> > +             if (path->query)
> > +                     ib_sa_cancel_query(path->query_id, path->query);
> > +             spin_unlock(&priv->lock);
> > +             spin_unlock_irq(&priv->tx_lock);
> > +             wait_for_completion(&path->done);
> > +             path_free_only(dev, path);
> > +             spin_lock_irq(&priv->tx_lock);
> > +             spin_lock(&priv->lock);
> > +     }
> > +     spin_unlock(&priv->lock);
> > +     spin_unlock_irq(&priv->tx_lock);
> > +}
> > +
> >  void ipoib_flush_paths(struct net_device *dev)
> >  {
> >       struct ipoib_dev_priv *priv = netdev_priv(dev);
> > @@ -421,6 +464,8 @@ static void path_rec_completion(int status,
> >                       __skb_queue_tail(&skqueue, skb);
> >
> >               list_for_each_entry_safe(neigh, tn, &path->neigh_list,
> list) {
> > +                     if (neigh->ah)
> > +                             ipoib_put_ah(neigh->ah);
> >                       kref_get(&path->ah->ref);
> >                       neigh->ah = path->ah;
> >                       memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
> > @@ -989,9 +1034,10 @@ static void ipoib_setup(struct net_device *dev)
> >       INIT_LIST_HEAD(&priv->multicast_list);
> >
> >       INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
> > -     INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
> >       INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
> > -     INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
> > +     INIT_WORK(&priv->flush_task0,   ipoib_ib_dev_flush0);
> > +     INIT_WORK(&priv->flush_task1,   ipoib_ib_dev_flush1);
> > +     INIT_WORK(&priv->flush_task2,   ipoib_ib_dev_flush2);
> >       INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
> >       INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
> >  }
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> > index 8766d29..80c0409 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> > @@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,
> >       if (record->element.port_num != priv->port)
> >               return;
> >
> > -     if (record->event == IB_EVENT_PORT_ERR    ||
> > -         record->event == IB_EVENT_PORT_ACTIVE ||
> > -         record->event == IB_EVENT_LID_CHANGE  ||
> > -         record->event == IB_EVENT_SM_CHANGE   ||
> > -         record->event == IB_EVENT_CLIENT_REREGISTER) {
> > -             ipoib_dbg(priv, "Port state change event\n");
> > -             queue_work(ipoib_workqueue, &priv->flush_task);
> > +     ipoib_dbg(priv, "Event %d on device %s port %d\n",record->event,
> > +                     record->device->name, record->element.port_num);
> > +     if ( record->event == IB_EVENT_SM_CHANGE   ||
> > +          record->event == IB_EVENT_CLIENT_REREGISTER) {
> > +             queue_work(ipoib_workqueue, &priv->flush_task0);
> > +     } else if (record->event == IB_EVENT_PORT_ERR ||
> > +                record->event == IB_EVENT_PORT_ACTIVE ||
> > +                record->event == IB_EVENT_LID_CHANGE) {
> > +             queue_work(ipoib_workqueue, &priv->flush_task1);
> >       } else if (record->event == IB_EVENT_PKEY_CHANGE) {
> > -             ipoib_dbg(priv, "P_Key change event on port:%d\n",
> priv->port);
> > -             queue_work(ipoib_workqueue, &priv->pkey_event_task);
> > +             queue_work(ipoib_workqueue, &priv->flush_task2);
> >       }
> >  }
>
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit
> http://openib.org/mailman/listinfo/openib-general
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20080522/b8cacf51/attachment.html>


More information about the general mailing list