<br><br>
<div><span class="gmail_quote">On 5/22/08, <b class="gmail_sendername">Hal Rosenstock</b> <<a href="mailto:hrosenstock@xsigo.com">hrosenstock@xsigo.com</a>> wrote:</span>
<blockquote class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">Moni,<br><br>On Thu, 2008-05-22 at 16:58 +0300, Moni Shoua wrote:<br>> Hal, Roland<br>> Thanks for the comments. The patch below tries to address the issues that were<br>
> raised in its previous form. Please note that I'm only asking for opinion for now.<br>> If the idea is acceptable then I will recreate more elegant patch with the required<br>> fixes if any and with respect to previous comments (such as replacing 0,1 and 2 with<br>
> textual names).<br>><br>> The idea in few words is to flush only paths but keeping address handles in ipoib_neigh.<br>> This will trigger a new path lookup when an ARP probe arrives and eventually an addess<br>
> handle renewal. In the meantime, the old address handle is kept and can be used. In most<br>> cases this address handle is a valid address handle and when it is not than the situatio<br>> is not worse than before.<br>
<br>This part seems OK to me.<br><br>> My tests show that this patch completes the improvement that was archived with patch #1<br>> to zero packet loss (tested with ping flood) when SM change event occurs.<br><br>Looks to me like SM change is still "level 0". I may have missed it but<br>
I don't see how this addresses the general architectural concerns<br>previously raised. This patch may work in your test environment but I<br>don't think that covers all the cases.</blockquote>
<div> </div>
<div>Hal, </div>
<div>You pointed out that we cannot rely on the assumption that on SM failover there is not path change.</div>
<div>In the previous patch we only flush multicast.</div>
<div>What Moni changed in this patch is that on SM failover (SM change event), we will flush not only multicast but also all paths but without destroying ah.</div>
<div> </div>
<div>Olga</div>
<div> </div>
<div> </div><br>
<blockquote class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">-- Hal<br><br>> thanks<br>><br>>  MoniS<br>><br>><br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h<br>
> index ca126fc..8ef6573 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h<br>> @@ -276,10 +276,11 @@ struct ipoib_dev_priv {<br>><br>>       struct delayed_work pkey_poll_task;<br>
>       struct delayed_work mcast_task;<br>> -     struct work_struct flush_task;<br>> +     struct work_struct flush_task0;<br>> +     struct work_struct flush_task1;<br>> +     struct work_struct flush_task2;<br>
>       struct work_struct restart_task;<br>>       struct delayed_work ah_reap_task;<br>> -     struct work_struct pkey_event_task;<br>><br>>       struct ib_device *ca;<br>>       u8                port;<br>
> @@ -423,11 +424,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,<br>>               struct ipoib_ah *address, u32 qpn);<br>>  void ipoib_reap_ah(struct work_struct *work);<br>><br>> +void ipoib_flush_paths_only(struct net_device *dev);<br>
>  void ipoib_flush_paths(struct net_device *dev);<br>>  struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);<br>><br>>  int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);<br>
> -void ipoib_ib_dev_flush(struct work_struct *work);<br>> +void ipoib_ib_dev_flush0(struct work_struct *work);<br>> +void ipoib_ib_dev_flush1(struct work_struct *work);<br>> +void ipoib_ib_dev_flush2(struct work_struct *work);<br>
>  void ipoib_pkey_event(struct work_struct *work);<br>>  void ipoib_ib_dev_cleanup(struct net_device *dev);<br>><br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c<br>
> index f429bce..5a6bbe8 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c<br>> @@ -898,7 +898,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)<br>
>       return 0;<br>>  }<br>><br>> -static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>> +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)<br>>  {<br>
>       struct ipoib_dev_priv *cpriv;<br>>       struct net_device *dev = priv->dev;<br>> @@ -911,7 +911,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>>        * the parent is down.<br>
>        */<br>>       list_for_each_entry(cpriv, &priv->child_intfs, list)<br>> -             __ipoib_ib_dev_flush(cpriv, pkey_event);<br>> +             __ipoib_ib_dev_flush(cpriv, level);<br>><br>
>       mutex_unlock(&priv->vlan_mutex);<br>><br>> @@ -925,7 +925,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>>               return;<br>>       }<br>><br>
> -     if (pkey_event) {<br>> +     if (level == 2) {<br>>               if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {<br>>                       clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);<br>
>                       ipoib_ib_dev_down(dev, 0);<br>> @@ -943,11 +943,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>>               priv->pkey_index = new_index;<br>>       }<br>
><br>> -     ipoib_dbg(priv, "flushing\n");<br>> -<br>> -     ipoib_ib_dev_down(dev, 0);<br>> +     ipoib_flush_paths_only(dev);<br>> +     ipoib_mcast_dev_flush(dev);<br>> +<br>> +     if (level >= 1)<br>
> +             ipoib_ib_dev_down(dev, 0);<br>><br>> -     if (pkey_event) {<br>> +     if (level >= 2) {<br>>               ipoib_ib_dev_stop(dev, 0);<br>>               ipoib_ib_dev_open(dev);<br>>       }<br>
> @@ -957,29 +959,36 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>>        * we get here, don't bring it back up if it's not configured up<br>>        */<br>>       if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {<br>
> -             ipoib_ib_dev_up(dev);<br>> +             if (level >= 1)<br>> +                     ipoib_ib_dev_up(dev);<br>>               ipoib_mcast_restart_task(&priv->restart_task);<br>>       }<br>
>  }<br>><br>> -void ipoib_ib_dev_flush(struct work_struct *work)<br>> +void ipoib_ib_dev_flush0(struct work_struct *work)<br>>  {<br>>       struct ipoib_dev_priv *priv =<br>> -             container_of(work, struct ipoib_dev_priv, flush_task);<br>
> +             container_of(work, struct ipoib_dev_priv, flush_task0);<br>><br>> -     ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);<br>>       __ipoib_ib_dev_flush(priv, 0);<br>>  }<br>
><br>> -void ipoib_pkey_event(struct work_struct *work)<br>> +void ipoib_ib_dev_flush1(struct work_struct *work)<br>>  {<br>>       struct ipoib_dev_priv *priv =<br>> -             container_of(work, struct ipoib_dev_priv, pkey_event_task);<br>
> +             container_of(work, struct ipoib_dev_priv, flush_task1);<br>><br>> -     ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);<br>>       __ipoib_ib_dev_flush(priv, 1);<br>
>  }<br>><br>> +void ipoib_ib_dev_flush2(struct work_struct *work)<br>> +{<br>> +     struct ipoib_dev_priv *priv =<br>> +             container_of(work, struct ipoib_dev_priv, flush_task2);<br>> +<br>
> +     __ipoib_ib_dev_flush(priv, 2);<br>> +}<br>> +<br>>  void ipoib_ib_dev_cleanup(struct net_device *dev)<br>>  {<br>>       struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c<br>
> index 2442090..c41798d 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c<br>> @@ -259,6 +259,21 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path)<br>
>       return 0;<br>>  }<br>><br>> +static void path_free_only(struct net_device *dev, struct ipoib_path *path)<br>> +{<br>> +     struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> +     struct ipoib_neigh *neigh, *tn;<br>
> +     struct sk_buff *skb;<br>> +     unsigned long flags;<br>> +<br>> +     while ((skb = __skb_dequeue(&path->queue)))<br>> +             dev_kfree_skb_irq(skb);<br>> +<br>> +     if (path->ah)<br>
> +             ipoib_put_ah(path->ah);<br>> +<br>> +     kfree(path);<br>> +}<br>>  static void path_free(struct net_device *dev, struct ipoib_path *path)<br>>  {<br>>       struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
> @@ -350,6 +365,34 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,<br>><br>>  #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */<br>><br>> +void ipoib_flush_paths_only(struct net_device *dev)<br>> +{<br>
> +     struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> +     struct ipoib_path *path, *tp;<br>> +     LIST_HEAD(remove_list);<br>> +<br>> +     spin_lock_irq(&priv->tx_lock);<br>> +     spin_lock(&priv->lock);<br>
> +<br>> +     list_splice_init(&priv->path_list, &remove_list);<br>> +<br>> +     list_for_each_entry(path, &remove_list, list)<br>> +             rb_erase(&path->rb_node, &priv->path_tree);<br>
> +<br>> +     list_for_each_entry_safe(path, tp, &remove_list, list) {<br>> +             if (path->query)<br>> +                     ib_sa_cancel_query(path->query_id, path->query);<br>> +             spin_unlock(&priv->lock);<br>
> +             spin_unlock_irq(&priv->tx_lock);<br>> +             wait_for_completion(&path->done);<br>> +             path_free_only(dev, path);<br>> +             spin_lock_irq(&priv->tx_lock);<br>
> +             spin_lock(&priv->lock);<br>> +     }<br>> +     spin_unlock(&priv->lock);<br>> +     spin_unlock_irq(&priv->tx_lock);<br>> +}<br>> +<br>>  void ipoib_flush_paths(struct net_device *dev)<br>
>  {<br>>       struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> @@ -421,6 +464,8 @@ static void path_rec_completion(int status,<br>>                       __skb_queue_tail(&skqueue, skb);<br>><br>>               list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {<br>
> +                     if (neigh->ah)<br>> +                             ipoib_put_ah(neigh->ah);<br>>                       kref_get(&path->ah->ref);<br>>                       neigh->ah = path->ah;<br>
>                       memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,<br>> @@ -989,9 +1034,10 @@ static void ipoib_setup(struct net_device *dev)<br>>       INIT_LIST_HEAD(&priv->multicast_list);<br>
><br>>       INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);<br>> -     INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);<br>>       INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);<br>
> -     INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);<br>> +     INIT_WORK(&priv->flush_task0,   ipoib_ib_dev_flush0);<br>> +     INIT_WORK(&priv->flush_task1,   ipoib_ib_dev_flush1);<br>
> +     INIT_WORK(&priv->flush_task2,   ipoib_ib_dev_flush2);<br>>       INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);<br>>       INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);<br>
>  }<br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c<br>> index 8766d29..80c0409 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c<br>
> @@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,<br>>       if (record->element.port_num != priv->port)<br>>               return;<br>><br>> -     if (record->event == IB_EVENT_PORT_ERR    ||<br>
> -         record->event == IB_EVENT_PORT_ACTIVE ||<br>> -         record->event == IB_EVENT_LID_CHANGE  ||<br>> -         record->event == IB_EVENT_SM_CHANGE   ||<br>> -         record->event == IB_EVENT_CLIENT_REREGISTER) {<br>
> -             ipoib_dbg(priv, "Port state change event\n");<br>> -             queue_work(ipoib_workqueue, &priv->flush_task);<br>> +     ipoib_dbg(priv, "Event %d on device %s port %d\n",record->event,<br>
> +                     record->device->name, record->element.port_num);<br>> +     if ( record->event == IB_EVENT_SM_CHANGE   ||<br>> +          record->event == IB_EVENT_CLIENT_REREGISTER) {<br>> +             queue_work(ipoib_workqueue, &priv->flush_task0);<br>
> +     } else if (record->event == IB_EVENT_PORT_ERR ||<br>> +                record->event == IB_EVENT_PORT_ACTIVE ||<br>> +                record->event == IB_EVENT_LID_CHANGE) {<br>> +             queue_work(ipoib_workqueue, &priv->flush_task1);<br>
>       } else if (record->event == IB_EVENT_PKEY_CHANGE) {<br>> -             ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);<br>> -             queue_work(ipoib_workqueue, &priv->pkey_event_task);<br>
> +             queue_work(ipoib_workqueue, &priv->flush_task2);<br>>       }<br>>  }<br><br>_______________________________________________<br>general mailing list<br><a href="mailto:general@lists.openfabrics.org">general@lists.openfabrics.org</a><br>
<a href="http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general">http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general</a><br><br>To unsubscribe, please visit <a href="http://openib.org/mailman/listinfo/openib-general">http://openib.org/mailman/listinfo/openib-general</a><br>
</blockquote></div><br>