[ofa-general] [PATCH 1/3 v4] ipoib: restart interfaces on pkey change events

Yosef Etigin yosefe at voltaire.com
Wed May 2 08:56:23 PDT 2007


This issue was found during partitioning & SM fail over testing. The fix was tested with pkey reshuffling, removal and addition every few seconds concurrent with OFED restart.

Changes from v1:
	* added flush flag to ipoib_ib_dev_stop(), ipoib_ib_dev_down() alike
	* fixed a bug in device extraction from the work struct
	* removed some warnings in case they are caused due to missing PKEY as 
	  this seems like a valid flow now.

Changes from v2:
	* less/fixed debug prints - (MST remark)
	* flush_restart_qp stuff renamed to just restart_qp (MST remark)
	* the patch now depends on Roland's "IPoIB: Only handle async events for one port"
	
Changed from v3:
	* We now reschedule that qp_restart_task in case the PKEY cache was not 
	  coherent.
	  
Changed from v4:
	* We do not reschedue qp_restart_task, but assume that the cache is coherent
	* Do not restart QP if iface is not iniliallized, but do restart if not ADMIN_UP
	* Restart child interfaces first, so if parent is down child still restarted
	* Remove the pkey polling thread and pkey dalyed initiallization
	* If an interface is brought up but pkey is not found, mark it with IPOIB_PKEY_NEEDED
	  and when a pkey event arrives, try to restart it

SM reconfiguration or failover possibly causes a shuffling of the values in the port pkey
table. The current implementation only queries for the index of the pkey once, when it creates
the device QP and after that moves it into working state, and hence does not address this
scenario. Fix this by using the PKEY_CHANGE event as a trigger to reconfigure the device QP.

When a interface is brought up, it many  


Signed-off-by: Moni Levy <monil at voltaire.com>
Signed-off-by: Yosef Etigin <yosefe at voltaire.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h           |   10 -
 drivers/infiniband/ulp/ipoib/ipoib_ib.c        |  142 ++++++++-----------------
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   11 -
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   11 +
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c     |   21 +--
 5 files changed, 74 insertions(+), 121 deletions(-)

Index: b/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- a/drivers/infiniband/ulp/ipoib/ipoib.h	2007-05-02 17:48:05.276713741 +0300
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h	2007-05-02 17:48:30.149283427 +0300
@@ -80,7 +80,7 @@ enum {
 	IPOIB_FLAG_INITIALIZED    = 1,
 	IPOIB_FLAG_ADMIN_UP 	  = 2,
 	IPOIB_PKEY_ASSIGNED 	  = 3,
-	IPOIB_PKEY_STOP 	  = 4,
+	IPOIB_PKEY_NEEDED		  = 4,
 	IPOIB_FLAG_SUBINTERFACE   = 5,
 	IPOIB_MCAST_RUN 	  = 6,
 	IPOIB_STOP_REAPER         = 7,
@@ -202,9 +202,9 @@ struct ipoib_dev_priv {
 	struct list_head multicast_list;
 	struct rb_root multicast_tree;
 
-	struct delayed_work pkey_task;
 	struct delayed_work mcast_task;
 	struct work_struct flush_task;
+	struct work_struct pkey_task;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
 
@@ -333,12 +333,13 @@ struct ipoib_dev_priv *ipoib_intf_alloc(
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
 int ipoib_ib_dev_open(struct net_device *dev);
 int ipoib_ib_dev_up(struct net_device *dev);
 int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
 
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
@@ -384,9 +385,6 @@ void ipoib_event(struct ib_event_handler
 int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
 int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
 
-void ipoib_pkey_poll(struct work_struct *work);
-int ipoib_pkey_dev_delay_open(struct net_device *dev);
-
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
 #define IPOIB_FLAGS_RC          0x80
Index: b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-05-02 17:48:05.276713741 +0300
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-05-02 18:04:16.512553724 +0300
@@ -422,14 +422,14 @@ int ipoib_ib_dev_open(struct net_device 
 	ret = ipoib_ib_post_receives(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-		ipoib_ib_dev_stop(dev);
+		ipoib_ib_dev_stop(dev, 1);
 		return -1;
 	}
 
 	ret = ipoib_cm_dev_open(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-		ipoib_ib_dev_stop(dev);
+		ipoib_ib_dev_stop(dev, 1);
 		return -1;
 	}
 
@@ -441,28 +441,10 @@ int ipoib_ib_dev_open(struct net_device 
 	return 0;
 }
 
-static void ipoib_pkey_dev_check_presence(struct net_device *dev)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	u16 pkey_index = 0;
-
-	if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
-		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
-	else
-		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
-}
-
 int ipoib_ib_dev_up(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	ipoib_pkey_dev_check_presence(dev);
-
-	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
-		ipoib_dbg(priv, "PKEY is not assigned.\n");
-		return 0;
-	}
-
 	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
 
 	return ipoib_mcast_start_thread(dev);
@@ -477,16 +459,6 @@ int ipoib_ib_dev_down(struct net_device 
 	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
 	netif_carrier_off(dev);
 
-	/* Shutdown the P_Key thread if still active */
-	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
-		mutex_lock(&pkey_mutex);
-		set_bit(IPOIB_PKEY_STOP, &priv->flags);
-		cancel_delayed_work(&priv->pkey_task);
-		mutex_unlock(&pkey_mutex);
-		if (flush)
-			flush_workqueue(ipoib_workqueue);
-	}
-
 	ipoib_mcast_stop_thread(dev, flush);
 	ipoib_mcast_dev_flush(dev);
 
@@ -508,7 +480,7 @@ static int recvs_pending(struct net_devi
 	return pending;
 }
 
-int ipoib_ib_dev_stop(struct net_device *dev)
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
@@ -581,7 +553,8 @@ timeout:
 	/* Wait for all AHs to be reaped */
 	set_bit(IPOIB_STOP_REAPER, &priv->flags);
 	cancel_delayed_work(&priv->ah_reap_task);
-	flush_workqueue(ipoib_workqueue);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
 
 	begin = jiffies;
 
@@ -622,14 +595,31 @@ int ipoib_ib_dev_init(struct net_device 
 	return 0;
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int restart_qp)
 {
-	struct ipoib_dev_priv *cpriv, *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
 
-	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) {
-		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
+	mutex_lock(&priv->vlan_mutex);
+
+	/* Flush any child interfaces */
+	list_for_each_entry(cpriv, &priv->child_intfs, list)
+		__ipoib_ib_dev_flush(cpriv, restart_qp);
+
+	mutex_unlock(&priv->vlan_mutex);
+
+	/*
+	 * If the device is not initiallized since it needs a pkey -
+	 * try to reopen it
+	 */
+	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
+		if (restart_qp && test_bit(IPOIB_PKEY_NEEDED, &priv->flags)
+		    && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+			/* if this iface needs pkey, try to assign it one */
+			ipoib_open(priv->dev);
+		}
+		else
+			ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
 		return;
 	}
 
@@ -642,6 +632,12 @@ void ipoib_ib_dev_flush(struct work_stru
 
 	ipoib_ib_dev_down(dev, 0);
 
+	if (restart_qp) {
+		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) )
+			ipoib_ib_dev_stop(dev, 0);
+		ipoib_ib_dev_open(dev);
+	}
+
 	/*
 	 * The device could have been brought down between the start and when
 	 * we get here, don't bring it back up if it's not configured up
@@ -650,14 +646,25 @@ void ipoib_ib_dev_flush(struct work_stru
 		ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
+}
 
-	mutex_lock(&priv->vlan_mutex);
+void ipoib_ib_dev_flush(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_task);
+	/* we only restart the QP in case of pkey change event */
+	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
+	__ipoib_ib_dev_flush(priv, 0);
+}
 
-	/* Flush any child interfaces too */
-	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		ipoib_ib_dev_flush(&cpriv->flush_task);
+void ipoib_pkey_event(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, pkey_task);
 
-	mutex_unlock(&priv->vlan_mutex);
+	/* restart the QP in case of pkey change event */
+	ipoib_dbg(priv, "Flushing %s and restarting it's QP\n", priv->dev->name);
+	__ipoib_ib_dev_flush(priv, 1);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
@@ -672,54 +679,3 @@ void ipoib_ib_dev_cleanup(struct net_dev
 	ipoib_transport_dev_cleanup(dev);
 }
 
-/*
- * Delayed P_Key Assigment Interim Support
- *
- * The following is initial implementation of delayed P_Key assigment
- * mechanism. It is using the same approach implemented for the multicast
- * group join. The single goal of this implementation is to quickly address
- * Bug #2507. This implementation will probably be removed when the P_Key
- * change async notification is available.
- */
-
-void ipoib_pkey_poll(struct work_struct *work)
-{
-	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_task.work);
-	struct net_device *dev = priv->dev;
-
-	ipoib_pkey_dev_check_presence(dev);
-
-	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
-		ipoib_open(dev);
-	else {
-		mutex_lock(&pkey_mutex);
-		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
-			queue_delayed_work(ipoib_workqueue,
-					   &priv->pkey_task,
-					   HZ);
-		mutex_unlock(&pkey_mutex);
-	}
-}
-
-int ipoib_pkey_dev_delay_open(struct net_device *dev)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-	/* Look for the interface pkey value in the IB Port P_Key table and */
-	/* set the interface pkey assigment flag                            */
-	ipoib_pkey_dev_check_presence(dev);
-
-	/* P_Key value not assigned yet - start polling */
-	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
-		mutex_lock(&pkey_mutex);
-		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
-		queue_delayed_work(ipoib_workqueue,
-				   &priv->pkey_task,
-				   HZ);
-		mutex_unlock(&pkey_mutex);
-		return 1;
-	}
-
-	return 0;
-}
Index: b/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-05-02 17:48:05.276713741 +0300
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-05-02 17:48:30.150283249 +0300
@@ -100,14 +100,11 @@ int ipoib_open(struct net_device *dev)
 
 	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
-	if (ipoib_pkey_dev_delay_open(dev))
-		return 0;
-
 	if (ipoib_ib_dev_open(dev))
-		return -EINVAL;
+		return test_bit(IPOIB_PKEY_NEEDED, &priv->flags) ? 0 : -EINVAL;
 
 	if (ipoib_ib_dev_up(dev)) {
-		ipoib_ib_dev_stop(dev);
+		ipoib_ib_dev_stop(dev, 1);
 		return -EINVAL;
 	}
 
@@ -152,7 +149,7 @@ static int ipoib_stop(struct net_device 
 	flush_workqueue(ipoib_workqueue);
 
 	ipoib_ib_dev_down(dev, 1);
-	ipoib_ib_dev_stop(dev);
+	ipoib_ib_dev_stop(dev, 1);
 
 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		struct ipoib_dev_priv *cpriv;
@@ -990,7 +987,7 @@ static void ipoib_setup(struct net_devic
 	INIT_LIST_HEAD(&priv->dead_ahs);
 	INIT_LIST_HEAD(&priv->multicast_list);
 
-	INIT_DELAYED_WORK(&priv->pkey_task,    ipoib_pkey_poll);
+	INIT_WORK(&priv->pkey_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
 	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
Index: b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2007-05-02 17:48:05.277713563 +0300
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2007-05-02 17:48:30.151283071 +0300
@@ -232,9 +232,10 @@ static int ipoib_mcast_join_finish(struc
 		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
 					 &mcast->mcmember.mgid);
 		if (ret < 0) {
-			ipoib_warn(priv, "couldn't attach QP to multicast group "
-				   IPOIB_GID_FMT "\n",
-				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+			if (ret != -ENXIO) /* No pkey found */
+				ipoib_warn(priv, "couldn't attach QP to multicast group "
+					   IPOIB_GID_FMT "\n",
+					   IPOIB_GID_ARG(mcast->mcmember.mgid));
 
 			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
 			return ret;
@@ -312,7 +313,7 @@ ipoib_mcast_sendonly_join_complete(int s
 		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
 
 	if (status) {
-		if (mcast->logcount++ < 20)
+		if (mcast->logcount++ < 20 && status != -ENXIO)
 			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
 					IPOIB_GID_FMT ", status %d\n",
 					IPOIB_GID_ARG(mcast->mcmember.mgid), status);
@@ -420,7 +421,7 @@ static int ipoib_mcast_join_complete(int
 					", status %d\n",
 					IPOIB_GID_ARG(mcast->mcmember.mgid),
 					status);
-		} else {
+		} else if (status != -ENXIO) {
 			ipoib_warn(priv, "multicast join failed for "
 				   IPOIB_GID_FMT ", status %d\n",
 				   IPOIB_GID_ARG(mcast->mcmember.mgid),
Index: b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-05-02 17:48:05.277713563 +0300
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-05-02 17:48:30.152282893 +0300
@@ -33,8 +33,6 @@
  * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
-#include <rdma/ib_cache.h>
-
 #include "ipoib.h"
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
@@ -49,12 +47,12 @@ int ipoib_mcast_attach(struct net_device
 	if (!qp_attr)
 		goto out;
 
-	if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
-		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+		clear_bit(IPOIB_PKEY_NEEDED, &priv->flags);
 		ret = -ENXIO;
 		goto out;
 	}
-	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	set_bit(IPOIB_PKEY_NEEDED, &priv->flags);
 
 	/* set correct QKey for QP */
 	qp_attr->qkey = priv->qkey;
@@ -103,12 +101,12 @@ int ipoib_init_qp(struct net_device *dev
 	 * The port has to be assigned to the respective IB partition in
 	 * advance.
 	 */
-	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
+	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
 	if (ret) {
-		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		set_bit(IPOIB_PKEY_NEEDED, &priv->flags);
 		return ret;
 	}
-	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	clear_bit(IPOIB_PKEY_NEEDED, &priv->flags);
 
 	qp_attr.qp_state = IB_QPS_INIT;
 	qp_attr.qkey = 0;
@@ -238,7 +236,7 @@ void ipoib_transport_dev_cleanup(struct 
 			ipoib_warn(priv, "ib_qp_destroy failed\n");
 
 		priv->qp = NULL;
-		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		clear_bit(IPOIB_PKEY_NEEDED, &priv->flags);
 	}
 
 	if (ib_destroy_cq(priv->cq))
@@ -260,7 +258,6 @@ void ipoib_event(struct ib_event_handler
 		container_of(handler, struct ipoib_dev_priv, event_handler);
 
 	if ((record->event == IB_EVENT_PORT_ERR    ||
-	     record->event == IB_EVENT_PKEY_CHANGE ||
 	     record->event == IB_EVENT_PORT_ACTIVE ||
 	     record->event == IB_EVENT_LID_CHANGE  ||
 	     record->event == IB_EVENT_SM_CHANGE   ||
@@ -268,5 +265,9 @@ void ipoib_event(struct ib_event_handler
 	    record->element.port_num == priv->port) {
 		ipoib_dbg(priv, "Port state change event\n");
 		queue_work(ipoib_workqueue, &priv->flush_task);
+	} else if (record->event == IB_EVENT_PKEY_CHANGE &&
+		record->element.port_num == priv->port) {
+		ipoib_dbg(priv, "pkey change event on port:%d\n", priv->port);
+		queue_work(ipoib_workqueue, &priv->pkey_task);
 	}
 }




More information about the general mailing list