<br><br>
<div><span class="gmail_quote">On 5/22/08, <b class="gmail_sendername">Hal Rosenstock</b> <<a href="mailto:hrosenstock@xsigo.com">hrosenstock@xsigo.com</a>> wrote:</span>
<blockquote class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">Moni,<br><br>On Thu, 2008-05-22 at 16:58 +0300, Moni Shoua wrote:<br>> Hal, Roland<br>> Thanks for the comments. The patch below tries to address the issues that were<br>
> raised in its previous form. Please note that I'm only asking for opinion for now.<br>> If the idea is acceptable then I will recreate more elegant patch with the required<br>> fixes if any and with respect to previous comments (such as replacing 0,1 and 2 with<br>
> textual names).<br>><br>> The idea in few words is to flush only paths but keeping address handles in ipoib_neigh.<br>> This will trigger a new path lookup when an ARP probe arrives and eventually an addess<br>
> handle renewal. In the meantime, the old address handle is kept and can be used. In most<br>> cases this address handle is a valid address handle and when it is not than the situatio<br>> is not worse than before.<br>
<br>This part seems OK to me.<br><br>> My tests show that this patch completes the improvement that was archived with patch #1<br>> to zero packet loss (tested with ping flood) when SM change event occurs.<br><br>Looks to me like SM change is still "level 0". I may have missed it but<br>
I don't see how this addresses the general architectural concerns<br>previously raised. This patch may work in your test environment but I<br>don't think that covers all the cases.</blockquote>
<div> </div>
<div>Hal, </div>
<div>You pointed out that we cannot rely on the assumption that on SM failover there is not path change.</div>
<div>In the previous patch we only flush multicast.</div>
<div>What Moni changed in this patch is that on SM failover (SM change event), we will flush not only multicast but also all paths but without destroying ah.</div>
<div> </div>
<div>Olga</div>
<div> </div>
<div> </div><br>
<blockquote class="gmail_quote" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">-- Hal<br><br>> thanks<br>><br>> MoniS<br>><br>><br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h<br>
> index ca126fc..8ef6573 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h<br>> @@ -276,10 +276,11 @@ struct ipoib_dev_priv {<br>><br>> struct delayed_work pkey_poll_task;<br>
> struct delayed_work mcast_task;<br>> - struct work_struct flush_task;<br>> + struct work_struct flush_task0;<br>> + struct work_struct flush_task1;<br>> + struct work_struct flush_task2;<br>
> struct work_struct restart_task;<br>> struct delayed_work ah_reap_task;<br>> - struct work_struct pkey_event_task;<br>><br>> struct ib_device *ca;<br>> u8 port;<br>
> @@ -423,11 +424,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,<br>> struct ipoib_ah *address, u32 qpn);<br>> void ipoib_reap_ah(struct work_struct *work);<br>><br>> +void ipoib_flush_paths_only(struct net_device *dev);<br>
> void ipoib_flush_paths(struct net_device *dev);<br>> struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);<br>><br>> int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);<br>
> -void ipoib_ib_dev_flush(struct work_struct *work);<br>> +void ipoib_ib_dev_flush0(struct work_struct *work);<br>> +void ipoib_ib_dev_flush1(struct work_struct *work);<br>> +void ipoib_ib_dev_flush2(struct work_struct *work);<br>
> void ipoib_pkey_event(struct work_struct *work);<br>> void ipoib_ib_dev_cleanup(struct net_device *dev);<br>><br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c<br>
> index f429bce..5a6bbe8 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c<br>> @@ -898,7 +898,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)<br>
> return 0;<br>> }<br>><br>> -static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>> +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)<br>> {<br>
> struct ipoib_dev_priv *cpriv;<br>> struct net_device *dev = priv->dev;<br>> @@ -911,7 +911,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>> * the parent is down.<br>
> */<br>> list_for_each_entry(cpriv, &priv->child_intfs, list)<br>> - __ipoib_ib_dev_flush(cpriv, pkey_event);<br>> + __ipoib_ib_dev_flush(cpriv, level);<br>><br>
> mutex_unlock(&priv->vlan_mutex);<br>><br>> @@ -925,7 +925,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>> return;<br>> }<br>><br>
> - if (pkey_event) {<br>> + if (level == 2) {<br>> if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {<br>> clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);<br>
> ipoib_ib_dev_down(dev, 0);<br>> @@ -943,11 +943,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>> priv->pkey_index = new_index;<br>> }<br>
><br>> - ipoib_dbg(priv, "flushing\n");<br>> -<br>> - ipoib_ib_dev_down(dev, 0);<br>> + ipoib_flush_paths_only(dev);<br>> + ipoib_mcast_dev_flush(dev);<br>> +<br>> + if (level >= 1)<br>
> + ipoib_ib_dev_down(dev, 0);<br>><br>> - if (pkey_event) {<br>> + if (level >= 2) {<br>> ipoib_ib_dev_stop(dev, 0);<br>> ipoib_ib_dev_open(dev);<br>> }<br>
> @@ -957,29 +959,36 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)<br>> * we get here, don't bring it back up if it's not configured up<br>> */<br>> if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {<br>
> - ipoib_ib_dev_up(dev);<br>> + if (level >= 1)<br>> + ipoib_ib_dev_up(dev);<br>> ipoib_mcast_restart_task(&priv->restart_task);<br>> }<br>
> }<br>><br>> -void ipoib_ib_dev_flush(struct work_struct *work)<br>> +void ipoib_ib_dev_flush0(struct work_struct *work)<br>> {<br>> struct ipoib_dev_priv *priv =<br>> - container_of(work, struct ipoib_dev_priv, flush_task);<br>
> + container_of(work, struct ipoib_dev_priv, flush_task0);<br>><br>> - ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);<br>> __ipoib_ib_dev_flush(priv, 0);<br>> }<br>
><br>> -void ipoib_pkey_event(struct work_struct *work)<br>> +void ipoib_ib_dev_flush1(struct work_struct *work)<br>> {<br>> struct ipoib_dev_priv *priv =<br>> - container_of(work, struct ipoib_dev_priv, pkey_event_task);<br>
> + container_of(work, struct ipoib_dev_priv, flush_task1);<br>><br>> - ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);<br>> __ipoib_ib_dev_flush(priv, 1);<br>
> }<br>><br>> +void ipoib_ib_dev_flush2(struct work_struct *work)<br>> +{<br>> + struct ipoib_dev_priv *priv =<br>> + container_of(work, struct ipoib_dev_priv, flush_task2);<br>> +<br>
> + __ipoib_ib_dev_flush(priv, 2);<br>> +}<br>> +<br>> void ipoib_ib_dev_cleanup(struct net_device *dev)<br>> {<br>> struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c<br>
> index 2442090..c41798d 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c<br>> @@ -259,6 +259,21 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path)<br>
> return 0;<br>> }<br>><br>> +static void path_free_only(struct net_device *dev, struct ipoib_path *path)<br>> +{<br>> + struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> + struct ipoib_neigh *neigh, *tn;<br>
> + struct sk_buff *skb;<br>> + unsigned long flags;<br>> +<br>> + while ((skb = __skb_dequeue(&path->queue)))<br>> + dev_kfree_skb_irq(skb);<br>> +<br>> + if (path->ah)<br>
> + ipoib_put_ah(path->ah);<br>> +<br>> + kfree(path);<br>> +}<br>> static void path_free(struct net_device *dev, struct ipoib_path *path)<br>> {<br>> struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
> @@ -350,6 +365,34 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,<br>><br>> #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */<br>><br>> +void ipoib_flush_paths_only(struct net_device *dev)<br>> +{<br>
> + struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> + struct ipoib_path *path, *tp;<br>> + LIST_HEAD(remove_list);<br>> +<br>> + spin_lock_irq(&priv->tx_lock);<br>> + spin_lock(&priv->lock);<br>
> +<br>> + list_splice_init(&priv->path_list, &remove_list);<br>> +<br>> + list_for_each_entry(path, &remove_list, list)<br>> + rb_erase(&path->rb_node, &priv->path_tree);<br>
> +<br>> + list_for_each_entry_safe(path, tp, &remove_list, list) {<br>> + if (path->query)<br>> + ib_sa_cancel_query(path->query_id, path->query);<br>> + spin_unlock(&priv->lock);<br>
> + spin_unlock_irq(&priv->tx_lock);<br>> + wait_for_completion(&path->done);<br>> + path_free_only(dev, path);<br>> + spin_lock_irq(&priv->tx_lock);<br>
> + spin_lock(&priv->lock);<br>> + }<br>> + spin_unlock(&priv->lock);<br>> + spin_unlock_irq(&priv->tx_lock);<br>> +}<br>> +<br>> void ipoib_flush_paths(struct net_device *dev)<br>
> {<br>> struct ipoib_dev_priv *priv = netdev_priv(dev);<br>> @@ -421,6 +464,8 @@ static void path_rec_completion(int status,<br>> __skb_queue_tail(&skqueue, skb);<br>><br>> list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {<br>
> + if (neigh->ah)<br>> + ipoib_put_ah(neigh->ah);<br>> kref_get(&path->ah->ref);<br>> neigh->ah = path->ah;<br>
> memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,<br>> @@ -989,9 +1034,10 @@ static void ipoib_setup(struct net_device *dev)<br>> INIT_LIST_HEAD(&priv->multicast_list);<br>
><br>> INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);<br>> - INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);<br>> INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);<br>
> - INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);<br>> + INIT_WORK(&priv->flush_task0, ipoib_ib_dev_flush0);<br>> + INIT_WORK(&priv->flush_task1, ipoib_ib_dev_flush1);<br>
> + INIT_WORK(&priv->flush_task2, ipoib_ib_dev_flush2);<br>> INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);<br>> INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);<br>
> }<br>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c<br>> index 8766d29..80c0409 100644<br>> --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c<br>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c<br>
> @@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,<br>> if (record->element.port_num != priv->port)<br>> return;<br>><br>> - if (record->event == IB_EVENT_PORT_ERR ||<br>
> - record->event == IB_EVENT_PORT_ACTIVE ||<br>> - record->event == IB_EVENT_LID_CHANGE ||<br>> - record->event == IB_EVENT_SM_CHANGE ||<br>> - record->event == IB_EVENT_CLIENT_REREGISTER) {<br>
> - ipoib_dbg(priv, "Port state change event\n");<br>> - queue_work(ipoib_workqueue, &priv->flush_task);<br>> + ipoib_dbg(priv, "Event %d on device %s port %d\n",record->event,<br>
> + record->device->name, record->element.port_num);<br>> + if ( record->event == IB_EVENT_SM_CHANGE ||<br>> + record->event == IB_EVENT_CLIENT_REREGISTER) {<br>> + queue_work(ipoib_workqueue, &priv->flush_task0);<br>
> + } else if (record->event == IB_EVENT_PORT_ERR ||<br>> + record->event == IB_EVENT_PORT_ACTIVE ||<br>> + record->event == IB_EVENT_LID_CHANGE) {<br>> + queue_work(ipoib_workqueue, &priv->flush_task1);<br>
> } else if (record->event == IB_EVENT_PKEY_CHANGE) {<br>> - ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);<br>> - queue_work(ipoib_workqueue, &priv->pkey_event_task);<br>
> + queue_work(ipoib_workqueue, &priv->flush_task2);<br>> }<br>> }<br><br>_______________________________________________<br>general mailing list<br><a href="mailto:general@lists.openfabrics.org">general@lists.openfabrics.org</a><br>
<a href="http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general">http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general</a><br><br>To unsubscribe, please visit <a href="http://openib.org/mailman/listinfo/openib-general">http://openib.org/mailman/listinfo/openib-general</a><br>
</blockquote></div><br>