[ofa-general] Re: [PATCH 3/6 v2] fix pkey change handling and remove the cahce

Michael S. Tsirkin mst at dev.mellanox.co.il
Mon May 7 08:30:25 PDT 2007


All in all, this patch tries to do many things at once.  I wonder whether you
can split the patch in 2: fix the pkey change case separately, and remove pkey
polling separately.

> Quoting Yosef Etigin <yosefe at voltaire.com>:
> Subject: [PATCH 3/6 v2] fix pkey change handling and remove the cahce
> 
> ipoib: handle pkey change events
> 
> This issue was found during partitioning & SM fail over testing.
> 
>  * added flush flag to ipoib_ib_dev_stop(), ipoib_ib_dev_down() alike
>  * fixed a bug in device extraction from the work struct
>  * removed some warnings in case they are caused due to missing PKEY as 
> 	  this seems like a valid flow now.

This seems to remove a useful tool for debugging invalid pkeys.
Why is this a valid flow now?

>  * Assume that the cache is coherent - do not retry on cache queries
>  * Restart child interfaces first before parent

Why? Is this related to pkey change somehow?

>  * Remove the pkey polling thread and pkey delayed initiallization
>  * If an interface is brought up but pkey is not found, mark it with
>    IPOIB_PKEY_NEEDED and when a pkey event arrives, try to restart it.
> 
> SM reconfiguration or failover possibly causes a shuffling of the values in the port
> pkey table. The current implementation only queries for the index of the pkey once,
> when it creates the device QP and after that moves it into working state, and hence
> does not address this scenario. Fix this by using the PKEY_CHANGE event as a trigger
> to reconfigure the device QP.
> 
> 
> Signed-off-by: Moni Levy <monil at voltaire.com>
> Signed-off-by: Yosef Etigin <yosefe at voltaire.com>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h           |   10 -
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c        |  144 ++++++++-----------------
>  drivers/infiniband/ulp/ipoib/ipoib_main.c      |   11 -
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   11 +
>  drivers/infiniband/ulp/ipoib/ipoib_verbs.c     |   21 +--
>  5 files changed, 76 insertions(+), 121 deletions(-)
> 
> Index: b/drivers/infiniband/ulp/ipoib/ipoib.h
> ===================================================================
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h	2007-05-06 09:26:08.000000000 +0300
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h	2007-05-06 09:26:18.000000000 +0300
> @@ -80,7 +80,7 @@ enum {
>  	IPOIB_FLAG_INITIALIZED    = 1,
>  	IPOIB_FLAG_ADMIN_UP 	  = 2,
>  	IPOIB_PKEY_ASSIGNED 	  = 3,
> -	IPOIB_PKEY_STOP 	  = 4,
> +	IPOIB_PKEY_NEEDED		  = 4,
>  	IPOIB_FLAG_SUBINTERFACE   = 5,
>  	IPOIB_MCAST_RUN 	  = 6,
>  	IPOIB_STOP_REAPER         = 7,
> @@ -202,9 +202,9 @@ struct ipoib_dev_priv {
>  	struct list_head multicast_list;
>  	struct rb_root multicast_tree;
>  
> -	struct delayed_work pkey_task;
>  	struct delayed_work mcast_task;
>  	struct work_struct flush_task;
> +	struct work_struct pkey_task;
>  	struct work_struct restart_task;
>  	struct delayed_work ah_reap_task;
>  
> @@ -333,12 +333,13 @@ struct ipoib_dev_priv *ipoib_intf_alloc(
>  
>  int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
>  void ipoib_ib_dev_flush(struct work_struct *work);
> +void ipoib_pkey_event(struct work_struct *work);
>  void ipoib_ib_dev_cleanup(struct net_device *dev);
>  
>  int ipoib_ib_dev_open(struct net_device *dev);
>  int ipoib_ib_dev_up(struct net_device *dev);
>  int ipoib_ib_dev_down(struct net_device *dev, int flush);
> -int ipoib_ib_dev_stop(struct net_device *dev);
> +int ipoib_ib_dev_stop(struct net_device *dev, int flush);
>  
>  int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
>  void ipoib_dev_cleanup(struct net_device *dev);
> @@ -384,9 +385,6 @@ void ipoib_event(struct ib_event_handler
>  int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
>  int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
>  
> -void ipoib_pkey_poll(struct work_struct *work);
> -int ipoib_pkey_dev_delay_open(struct net_device *dev);
> -
>  #ifdef CONFIG_INFINIBAND_IPOIB_CM
>  
>  #define IPOIB_FLAGS_RC          0x80
> Index: b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> ===================================================================
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-05-06 09:26:17.000000000 +0300
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-05-06 09:26:26.000000000 +0300
> @@ -422,14 +422,14 @@ int ipoib_ib_dev_open(struct net_device 
>  	ret = ipoib_ib_post_receives(dev);
>  	if (ret) {
>  		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
> -		ipoib_ib_dev_stop(dev);
> +		ipoib_ib_dev_stop(dev, 1);
>  		return -1;
>  	}
>  
>  	ret = ipoib_cm_dev_open(dev);
>  	if (ret) {
>  		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
> -		ipoib_ib_dev_stop(dev);
> +		ipoib_ib_dev_stop(dev, 1);
>  		return -1;
>  	}
>  
> @@ -441,28 +441,10 @@ int ipoib_ib_dev_open(struct net_device 
>  	return 0;
>  }
>  
> -static void ipoib_pkey_dev_check_presence(struct net_device *dev)
> -{
> -	struct ipoib_dev_priv *priv = netdev_priv(dev);
> -	u16 pkey_index = 0;
> -
> -	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
> -		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> -	else
> -		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> -}
> -
>  int ipoib_ib_dev_up(struct net_device *dev)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  
> -	ipoib_pkey_dev_check_presence(dev);
> -
> -	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
> -		ipoib_dbg(priv, "PKEY is not assigned.\n");
> -		return 0;
> -	}
> -
>  	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
>  
>  	return ipoib_mcast_start_thread(dev);
> @@ -477,16 +459,6 @@ int ipoib_ib_dev_down(struct net_device 
>  	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
>  	netif_carrier_off(dev);
>  
> -	/* Shutdown the P_Key thread if still active */
> -	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
> -		mutex_lock(&pkey_mutex);
> -		set_bit(IPOIB_PKEY_STOP, &priv->flags);
> -		cancel_delayed_work(&priv->pkey_task);
> -		mutex_unlock(&pkey_mutex);
> -		if (flush)
> -			flush_workqueue(ipoib_workqueue);
> -	}
> -
>  	ipoib_mcast_stop_thread(dev, flush);
>  	ipoib_mcast_dev_flush(dev);
>  
> @@ -508,7 +480,7 @@ static int recvs_pending(struct net_devi
>  	return pending;
>  }
>  
> -int ipoib_ib_dev_stop(struct net_device *dev)
> +int ipoib_ib_dev_stop(struct net_device *dev, int flush)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct ib_qp_attr qp_attr;
> @@ -581,7 +553,8 @@ timeout:
>  	/* Wait for all AHs to be reaped */
>  	set_bit(IPOIB_STOP_REAPER, &priv->flags);
>  	cancel_delayed_work(&priv->ah_reap_task);
> -	flush_workqueue(ipoib_workqueue);
> +	if (flush)
> +		flush_workqueue(ipoib_workqueue);
>  
>  	begin = jiffies;
>  
> @@ -622,14 +595,33 @@ int ipoib_ib_dev_init(struct net_device 
>  	return 0;
>  }
>  
> -void ipoib_ib_dev_flush(struct work_struct *work)
> +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int restart_qp)
>  {
> -	struct ipoib_dev_priv *cpriv, *priv =
> -		container_of(work, struct ipoib_dev_priv, flush_task);
> +	struct ipoib_dev_priv *cpriv;
>  	struct net_device *dev = priv->dev;
>  
> -	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) {
> -		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
> +	mutex_lock(&priv->vlan_mutex);
> +
> +	/* Flush any child interfaces */
> +	list_for_each_entry(cpriv, &priv->child_intfs, list)
> +		__ipoib_ib_dev_flush(cpriv, restart_qp);
> +
> +	mutex_unlock(&priv->vlan_mutex);
> +
> +	/*
> +	 * If the device is not initiallized since it needs a pkey -
> +	 * try to reopen it
> +	 */
> +	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
> +
> +		if (restart_qp
> +			&& test_bit(IPOIB_PKEY_NEEDED, &priv->flags)
> +		    && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
> +			/* this iface needs pkey, try to bring it up */
> +			ipoib_open(priv->dev);
> +		}
> +		else
> +			ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
>  		return;
>  	}

Clean up the above please.

> @@ -642,6 +634,12 @@ void ipoib_ib_dev_flush(struct work_stru
>  
>  	ipoib_ib_dev_down(dev, 0);
>  
> +	if (restart_qp) {
> +		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) )
> +			ipoib_ib_dev_stop(dev, 0);
> +		ipoib_ib_dev_open(dev);
> +	}
> +
>  	/*
>  	 * The device could have been brought down between the start and when
>  	 * we get here, don't bring it back up if it's not configured up

I find these if (restart_qp) branches somewhat confusing.
Why is this flag tested in 2 places?

> @@ -650,14 +648,25 @@ void ipoib_ib_dev_flush(struct work_stru
>  		ipoib_ib_dev_up(dev);
>  		ipoib_mcast_restart_task(&priv->restart_task);
>  	}
> +}
>  
> -	mutex_lock(&priv->vlan_mutex);
> +void ipoib_ib_dev_flush(struct work_struct *work)
> +{
> +	struct ipoib_dev_priv *priv =
> +		container_of(work, struct ipoib_dev_priv, flush_task);
> +	/* we only restart the QP in case of pkey change event */

Kill the comment please.

> +	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
> +	__ipoib_ib_dev_flush(priv, 0);
> +}
>  
> -	/* Flush any child interfaces too */
> -	list_for_each_entry(cpriv, &priv->child_intfs, list)
> -		ipoib_ib_dev_flush(&cpriv->flush_task);
> +void ipoib_pkey_event(struct work_struct *work)
> +{
> +	struct ipoib_dev_priv *priv =
> +		container_of(work, struct ipoib_dev_priv, pkey_task);
>  
> -	mutex_unlock(&priv->vlan_mutex);
> +	/* restart the QP in case of pkey change event */
> +	ipoib_dbg(priv, "Flushing %s and restarting it's QP\n", priv->dev->name);

Kill the comment please.

> +	__ipoib_ib_dev_flush(priv, 1);
>  }
>  
>  void ipoib_ib_dev_cleanup(struct net_device *dev)
> @@ -672,54 +681,3 @@ void ipoib_ib_dev_cleanup(struct net_dev
>  	ipoib_transport_dev_cleanup(dev);
>  }
>  
> -/*
> - * Delayed P_Key Assigment Interim Support
> - *
> - * The following is initial implementation of delayed P_Key assigment
> - * mechanism. It is using the same approach implemented for the multicast
> - * group join. The single goal of this implementation is to quickly address
> - * Bug #2507. This implementation will probably be removed when the P_Key
> - * change async notification is available.
> - */
> -
> -void ipoib_pkey_poll(struct work_struct *work)
> -{
> -	struct ipoib_dev_priv *priv =
> -		container_of(work, struct ipoib_dev_priv, pkey_task.work);
> -	struct net_device *dev = priv->dev;
> -
> -	ipoib_pkey_dev_check_presence(dev);
> -
> -	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
> -		ipoib_open(dev);
> -	else {
> -		mutex_lock(&pkey_mutex);
> -		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
> -			queue_delayed_work(ipoib_workqueue,
> -					   &priv->pkey_task,
> -					   HZ);
> -		mutex_unlock(&pkey_mutex);
> -	}
> -}
> -
> -int ipoib_pkey_dev_delay_open(struct net_device *dev)
> -{
> -	struct ipoib_dev_priv *priv = netdev_priv(dev);
> -
> -	/* Look for the interface pkey value in the IB Port P_Key table and */
> -	/* set the interface pkey assigment flag                            */
> -	ipoib_pkey_dev_check_presence(dev);
> -
> -	/* P_Key value not assigned yet - start polling */
> -	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
> -		mutex_lock(&pkey_mutex);
> -		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
> -		queue_delayed_work(ipoib_workqueue,
> -				   &priv->pkey_task,
> -				   HZ);
> -		mutex_unlock(&pkey_mutex);
> -		return 1;
> -	}
> -
> -	return 0;
> -}
>
> Index: b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> ===================================================================
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-05-06 09:26:08.000000000 +0300
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-05-06 09:26:18.000000000 +0300
> @@ -100,14 +100,11 @@ int ipoib_open(struct net_device *dev)
>  
>  	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
>  
> -	if (ipoib_pkey_dev_delay_open(dev))
> -		return 0;
> -
>  	if (ipoib_ib_dev_open(dev))
> -		return -EINVAL;
> +		return test_bit(IPOIB_PKEY_NEEDED, &priv->flags) ? 0 : -EINVAL;
>  
>  	if (ipoib_ib_dev_up(dev)) {
> -		ipoib_ib_dev_stop(dev);
> +		ipoib_ib_dev_stop(dev, 1);
>  		return -EINVAL;
>  	}
>  
> @@ -152,7 +149,7 @@ static int ipoib_stop(struct net_device 
>  	flush_workqueue(ipoib_workqueue);
>  
>  	ipoib_ib_dev_down(dev, 1);
> -	ipoib_ib_dev_stop(dev);
> +	ipoib_ib_dev_stop(dev, 1);
>  
>  	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
>  		struct ipoib_dev_priv *cpriv;
> @@ -990,7 +987,7 @@ static void ipoib_setup(struct net_devic
>  	INIT_LIST_HEAD(&priv->dead_ahs);
>  	INIT_LIST_HEAD(&priv->multicast_list);
>  
> -	INIT_DELAYED_WORK(&priv->pkey_task,    ipoib_pkey_poll);
> +	INIT_WORK(&priv->pkey_task, ipoib_pkey_event);
>  	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
>  	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
>  	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
> Index: b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> ===================================================================
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2007-05-06 09:26:08.000000000 +0300
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2007-05-06 09:26:18.000000000 +0300
> @@ -232,9 +232,10 @@ static int ipoib_mcast_join_finish(struc
>  		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
>  					 &mcast->mcmember.mgid);
>  		if (ret < 0) {
> -			ipoib_warn(priv, "couldn't attach QP to multicast group "
> -				   IPOIB_GID_FMT "\n",
> -				   IPOIB_GID_ARG(mcast->mcmember.mgid));
> +			if (ret != -ENXIO) /* No pkey found */
> +				ipoib_warn(priv, "couldn't attach QP to multicast group "
> +					   IPOIB_GID_FMT "\n",
> +					   IPOIB_GID_ARG(mcast->mcmember.mgid));
>  
>  			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
>  			return ret;
> @@ -312,7 +313,7 @@ ipoib_mcast_sendonly_join_complete(int s
>  		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
>  
>  	if (status) {
> -		if (mcast->logcount++ < 20)
> +		if (mcast->logcount++ < 20 && status != -ENXIO)
>  			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
>  					IPOIB_GID_FMT ", status %d\n",
>  					IPOIB_GID_ARG(mcast->mcmember.mgid), status);
> @@ -420,7 +421,7 @@ static int ipoib_mcast_join_complete(int
>  					", status %d\n",
>  					IPOIB_GID_ARG(mcast->mcmember.mgid),
>  					status);
> -		} else {
> +		} else if (status != -ENXIO) {
>  			ipoib_warn(priv, "multicast join failed for "
>  				   IPOIB_GID_FMT ", status %d\n",
>  				   IPOIB_GID_ARG(mcast->mcmember.mgid),
>
> Index: b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> ===================================================================
> --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-05-06 09:26:08.000000000 +0300
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-05-06 09:26:18.000000000 +0300
> @@ -33,8 +33,6 @@
>   * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
>   */
>  
> -#include <rdma/ib_cache.h>
> -
>  #include "ipoib.h"
>  
>  int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
> @@ -49,12 +47,12 @@ int ipoib_mcast_attach(struct net_device
>  	if (!qp_attr)
>  		goto out;
>  
> -	if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
> -		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> +	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
> +		clear_bit(IPOIB_PKEY_NEEDED, &priv->flags);
>  		ret = -ENXIO;
>  		goto out;
>  	}
> -	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> +	set_bit(IPOIB_PKEY_NEEDED, &priv->flags);
>  
>  	/* set correct QKey for QP */
>  	qp_attr->qkey = priv->qkey;
> @@ -103,12 +101,12 @@ int ipoib_init_qp(struct net_device *dev
>  	 * The port has to be assigned to the respective IB partition in
>  	 * advance.
>  	 */
> -	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
> +	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
>  	if (ret) {
> -		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> +		set_bit(IPOIB_PKEY_NEEDED, &priv->flags);
>  		return ret;
>  	}
> -	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> +	clear_bit(IPOIB_PKEY_NEEDED, &priv->flags);
>  
>  	qp_attr.qp_state = IB_QPS_INIT;
>  	qp_attr.qkey = 0;
> @@ -238,7 +236,7 @@ void ipoib_transport_dev_cleanup(struct 
>  			ipoib_warn(priv, "ib_qp_destroy failed\n");
>  
>  		priv->qp = NULL;
> -		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
> +		clear_bit(IPOIB_PKEY_NEEDED, &priv->flags);
>  	}
>  
>  	if (ib_destroy_cq(priv->cq))
> @@ -260,7 +258,6 @@ void ipoib_event(struct ib_event_handler
>  		container_of(handler, struct ipoib_dev_priv, event_handler);
>  
>  	if ((record->event == IB_EVENT_PORT_ERR    ||
> -	     record->event == IB_EVENT_PKEY_CHANGE ||
>  	     record->event == IB_EVENT_PORT_ACTIVE ||
>  	     record->event == IB_EVENT_LID_CHANGE  ||
>  	     record->event == IB_EVENT_SM_CHANGE   ||
> @@ -268,5 +265,9 @@ void ipoib_event(struct ib_event_handler
>  	    record->element.port_num == priv->port) {
>  		ipoib_dbg(priv, "Port state change event\n");
>  		queue_work(ipoib_workqueue, &priv->flush_task);
> +	} else if (record->event == IB_EVENT_PKEY_CHANGE &&
> +		record->element.port_num == priv->port) {
> +		ipoib_dbg(priv, "pkey change event on port:%d\n", priv->port);
> +		queue_work(ipoib_workqueue, &priv->pkey_task);
>  	}
>  }

-- 
MST



More information about the general mailing list