[ofa-general] Re: [PATCH 2/2] IB/IPoIB: Separate IB events to groups and handle each according to level of severity

Moni Shoua monis at Voltaire.COM
Thu May 22 06:58:33 PDT 2008


Hal, Roland 
Thanks for the comments. The patch below tries to address the issues that were 
raised against its previous version. Please note that for now I'm only asking for opinions.
If the idea is acceptable I will then recreate a more elegant patch with the required
fixes, if any, and address the previous comments (such as replacing 0, 1 and 2 with
textual names).

The idea, in a few words, is to flush only the paths while keeping the address handles in
ipoib_neigh. This triggers a new path lookup when an ARP probe arrives and, eventually, an
address handle renewal. In the meantime the old address handle is kept and can still be used.
In most cases it is still a valid address handle, and when it is not, the situation is no
worse than before.
My tests show that this patch completes the improvement achieved with patch #1, reaching zero
packet loss (tested with ping flood) when an SM change event occurs.


thanks

 MoniS


diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc..8ef6573 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -276,10 +276,11 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct flush_task0;
+	struct work_struct flush_task1;
+	struct work_struct flush_task2;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8		  port;
@@ -423,11 +424,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
 
+void ipoib_flush_paths_only(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush0(struct work_struct *work);
+void ipoib_ib_dev_flush1(struct work_struct *work);
+void ipoib_ib_dev_flush2(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f429bce..5a6bbe8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -898,7 +898,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
@@ -911,7 +911,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -925,7 +925,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == 2) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -943,11 +943,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
-
-	ipoib_ib_dev_down(dev, 0);
+	ipoib_flush_paths_only(dev);
+	ipoib_mcast_dev_flush(dev);
+
+	if (level >= 1)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level >= 2) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
@@ -957,29 +959,36 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= 1)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush0(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_task0);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
 	__ipoib_ib_dev_flush(priv, 0);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush1(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_task1);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
 	__ipoib_ib_dev_flush(priv, 1);
 }
 
+void ipoib_ib_dev_flush2(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_task2);
+
+	__ipoib_ib_dev_flush(priv, 2);
+}
+
 void ipoib_ib_dev_cleanup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090..c41798d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -259,6 +259,21 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path)
 	return 0;
 }
 
+/* Like path_free() but leave the neighbours and their address handles in
+ * place; an old handle is released once a new path record completes. */
+static void path_free_only(struct net_device *dev, struct ipoib_path *path)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&path->queue)))
+		dev_kfree_skb_irq(skb);
+
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+
+	kfree(path);
+}
+
 static void path_free(struct net_device *dev, struct ipoib_path *path)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -350,6 +365,34 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 
 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 
+void ipoib_flush_paths_only(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+	LIST_HEAD(remove_list);
+
+	spin_lock_irq(&priv->tx_lock);
+	spin_lock(&priv->lock);
+
+	list_splice_init(&priv->path_list, &remove_list);
+
+	list_for_each_entry(path, &remove_list, list)
+		rb_erase(&path->rb_node, &priv->path_tree);
+
+	list_for_each_entry_safe(path, tp, &remove_list, list) {
+		if (path->query)
+			ib_sa_cancel_query(path->query_id, path->query);
+		spin_unlock(&priv->lock);
+		spin_unlock_irq(&priv->tx_lock);
+		wait_for_completion(&path->done);
+		path_free_only(dev, path);
+		spin_lock_irq(&priv->tx_lock);
+		spin_lock(&priv->lock);
+	}
+	spin_unlock(&priv->lock);
+	spin_unlock_irq(&priv->tx_lock);
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -421,6 +464,8 @@ static void path_rec_completion(int status,
 			__skb_queue_tail(&skqueue, skb);
 
 		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+			if (neigh->ah)
+				ipoib_put_ah(neigh->ah);
 			kref_get(&path->ah->ref);
 			neigh->ah = path->ah;
 			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
@@ -989,9 +1034,10 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_WORK(&priv->flush_task0,   ipoib_ib_dev_flush0);
+	INIT_WORK(&priv->flush_task1,   ipoib_ib_dev_flush1);
+	INIT_WORK(&priv->flush_task2,   ipoib_ib_dev_flush2);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8766d29..80c0409 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR    ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE  ||
-	    record->event == IB_EVENT_SM_CHANGE   ||
-	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+			record->device->name, record->element.port_num);
+	if (record->event == IB_EVENT_SM_CHANGE ||
+	    record->event == IB_EVENT_CLIENT_REREGISTER) {
+		queue_work(ipoib_workqueue, &priv->flush_task0);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_task1);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_task2);
 	}
 }


