[ofa-general] [PATCH V3] IB/ipoib: refresh paths instead of fushing them on SM change event

Moni Shoua monis at Voltaire.COM
Tue Jul 1 07:39:23 PDT 2008


The patch tries to solve the problem of device going down and paths being
flushed on SM change event. The method is to mark the paths as candidates for
refresh (with a valid flag) and wait for ARP probe to start a flow of path
lookup that leads to patch query  which ends up in path (and address handle)
refresh. 
The solution requires a different and less intrusive handling of SM change
event. For that, the second argument of the flush function changed it's meaning
from boolean flag to a level.
In most cases, SM failover doesn't cause LID change so traffic won't stop. In
the rare cases of LID change, the remote host (the one that hadn't changed
its LID) will lose connectivity until paths are refreshed. This is no worse than
the current state. In fact, preventing the device from going down saves packets
that otherwise would be lost.

Signed-off-by: Moni Levy <monil at voltaire.com>
Signed-off-by: Moni Shoua <monis at voltaire.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h       |   17 +++++++--
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |   42 +++++++++++++++--------
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |   52 ++++++++++++++++++++++++++---
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |   19 +++++-----
 4 files changed, 98 insertions(+), 32 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc..bb24e34 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -55,6 +55,12 @@
 
 /* constants */
 
+enum ipoib_flush_level {
+	IPOIB_FLUSH_LIGHT,
+	IPOIB_FLUSH_NORMAL,
+	IPOIB_FLUSH_HEAVY
+};
+
 enum {
 	IPOIB_ENCAP_LEN		  = 4,
 
@@ -276,10 +282,11 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct flush_light;
+	struct work_struct flush_normal;
+	struct work_struct flush_heavy;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8		  port;
@@ -358,6 +365,7 @@ struct ipoib_path {
 	struct completion     done;
 
 	struct rb_node	      rb_node;
+	int  		      valid;
 	struct list_head      list;
 };
 
@@ -423,11 +431,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
 
+void ipoib_mark_paths_invalid(struct net_device *dev);
 void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f429bce..146ee31 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -898,7 +898,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+				enum ipoib_flush_level level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
@@ -911,7 +912,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -925,7 +926,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -943,11 +944,15 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
+	if (level == IPOIB_FLUSH_LIGHT) {
+		ipoib_mark_paths_invalid(dev);
+		ipoib_mcast_dev_flush(dev);
+	}
 
-	ipoib_ib_dev_down(dev, 0);
+	if (level >= IPOIB_FLUSH_NORMAL)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level == IPOIB_FLUSH_HEAVY) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
@@ -957,27 +962,34 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= IPOIB_FLUSH_NORMAL)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_light);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_normal);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 0);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_heavy);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
-	__ipoib_ib_dev_flush(priv, 1);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090..5cf5ffb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -350,6 +350,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 
 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+	spin_lock_irq(&priv->tx_lock);
+	spin_lock(&priv->lock);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+		ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n",
+			be16_to_cpu(path->pathrec.dlid),
+			IPOIB_GID_ARG(path->pathrec.dgid));
+		path->valid =  0;
+	}
+	spin_unlock(&priv->lock);
+	spin_unlock_irq(&priv->tx_lock);
+}
+
 void ipoib_flush_paths(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -386,6 +403,7 @@ static void path_rec_completion(int status,
 	struct net_device *dev = path->dev;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_ah *ah = NULL;
+	struct ipoib_ah *old_ah = NULL;
 	struct ipoib_neigh *neigh, *tn;
 	struct sk_buff_head skqueue;
 	struct sk_buff *skb;
@@ -409,6 +427,15 @@ static void path_rec_completion(int status,
 
 	spin_lock_irqsave(&priv->lock, flags);
 
+	/*
+	 * if path->ah is not NULL then it means that this is not the first
+	 * visit in path_rec_completion() for this path and we are sure
+	 * that there is at least one more reference held by a neighbour
+	 * in this path
+	 */
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+
 	path->ah = ah;
 
 	if (ah) {
@@ -421,6 +448,16 @@ static void path_rec_completion(int status,
 			__skb_queue_tail(&skqueue, skb);
 
 		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+			if (neigh->ah) {
+				/*
+				 * We don't need to remember all old_ah's but
+				 * only the last one that may hold the last
+				 * reference
+				 */
+				if (old_ah)
+					ipoib_put_ah(old_ah);
+				old_ah = neigh->ah;
+			}
 			kref_get(&path->ah->ref);
 			neigh->ah = path->ah;
 			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
@@ -443,13 +480,15 @@ static void path_rec_completion(int status,
 			while ((skb = __skb_dequeue(&neigh->queue)))
 				__skb_queue_tail(&skqueue, skb);
 		}
+		path->valid = 1;
 	}
 
 	path->query = NULL;
 	complete(&path->done);
 
 	spin_unlock_irqrestore(&priv->lock, flags);
-
+	if (old_ah)
+		ipoib_put_ah(old_ah);
 	while ((skb = __skb_dequeue(&skqueue))) {
 		skb->dev = dev;
 		if (dev_queue_xmit(skb))
@@ -471,6 +510,7 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 		return NULL;
 
 	path->dev = dev;
+	path->valid = 0;
 
 	skb_queue_head_init(&path->queue);
 
@@ -623,8 +663,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_lock(&priv->lock);
 
 	path = __path_find(dev, phdr->hwaddr + 4);
-	if (!path) {
-		path = path_rec_create(dev, phdr->hwaddr + 4);
+	if (!path || !path->valid) {
+		if (!path)
+			path = path_rec_create(dev, phdr->hwaddr + 4);
 		if (path) {
 			/* put pseudoheader back on for next time */
 			skb_push(skb, sizeof *phdr);
@@ -989,9 +1030,10 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8766d29..4ab06ea 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR    ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE  ||
-	    record->event == IB_EVENT_SM_CHANGE   ||
-	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+			record->device->name, record->element.port_num);
+	if (record->event == IB_EVENT_SM_CHANGE   ||
+	     record->event == IB_EVENT_CLIENT_REREGISTER) {
+		queue_work(ipoib_workqueue, &priv->flush_light);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_normal);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_heavy);
 	}
 }



More information about the general mailing list