[ofa-general] Re: [PATCH 2/2] IB/IPoIB: Separate IB events to groups and handle each according to level of severity

Hal Rosenstock hrosenstock at xsigo.com
Thu May 22 08:07:06 PDT 2008


Moni,

On Thu, 2008-05-22 at 16:58 +0300, Moni Shoua wrote:
> Hal, Roland 
> Thanks for the comments. The patch below tries to address the issues that were 
> raised in its previous form. Please note that I'm only asking for opinion for now.
> If the idea is acceptable then I will recreate more elegant patch with the required
> fixes if any and with respect to previous comments (such as replacing 0,1 and 2 with
> textual names).
> 
> The idea in few words is to flush only paths but keeping address handles in ipoib_neigh.
> This will trigger a new path lookup when an ARP probe arrives and eventually an addess 
> handle renewal. In the meantime, the old address handle is kept and can be used. In most
> cases this address handle is a valid address handle and when it is not than the situatio
> is not worse than before.

This part seems OK to me.
 
> My tests show that this patch completes the improvement that was archived with patch #1 
> to zero packet loss (tested with ping flood) when SM change event occurs.

Looks to me like SM change is still "level 0". I may have missed it but
I don't see how this addresses the general architectural concerns
previously raised. This patch may work in your test environment but I
don't think that covers all the cases.

-- Hal

> thanks
> 
>  MoniS
> 
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> index ca126fc..8ef6573 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -276,10 +276,11 @@ struct ipoib_dev_priv {
>  
>  	struct delayed_work pkey_poll_task;
>  	struct delayed_work mcast_task;
> -	struct work_struct flush_task;
> +	struct work_struct flush_task0;
> +	struct work_struct flush_task1;
> +	struct work_struct flush_task2;
>  	struct work_struct restart_task;
>  	struct delayed_work ah_reap_task;
> -	struct work_struct pkey_event_task;
>  
>  	struct ib_device *ca;
>  	u8		  port;
> @@ -423,11 +424,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
>  		struct ipoib_ah *address, u32 qpn);
>  void ipoib_reap_ah(struct work_struct *work);
>  
> +void ipoib_flush_paths_only(struct net_device *dev);
>  void ipoib_flush_paths(struct net_device *dev);
>  struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
>  
>  int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
> -void ipoib_ib_dev_flush(struct work_struct *work);
> +void ipoib_ib_dev_flush0(struct work_struct *work);
> +void ipoib_ib_dev_flush1(struct work_struct *work);
> +void ipoib_ib_dev_flush2(struct work_struct *work);
>  void ipoib_pkey_event(struct work_struct *work);
>  void ipoib_ib_dev_cleanup(struct net_device *dev);
>  
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index f429bce..5a6bbe8 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -898,7 +898,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
>  	return 0;
>  }
>  
> -static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
> +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)
>  {
>  	struct ipoib_dev_priv *cpriv;
>  	struct net_device *dev = priv->dev;
> @@ -911,7 +911,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
>  	 * the parent is down.
>  	 */
>  	list_for_each_entry(cpriv, &priv->child_intfs, list)
> -		__ipoib_ib_dev_flush(cpriv, pkey_event);
> +		__ipoib_ib_dev_flush(cpriv, level);
>  
>  	mutex_unlock(&priv->vlan_mutex);
>  
> @@ -925,7 +925,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
>  		return;
>  	}
>  
> -	if (pkey_event) {
> +	if (level == 2) {
>  		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
>  			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
>  			ipoib_ib_dev_down(dev, 0);
> @@ -943,11 +943,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
>  		priv->pkey_index = new_index;
>  	}
>  
> -	ipoib_dbg(priv, "flushing\n");
> -
> -	ipoib_ib_dev_down(dev, 0);
> +	ipoib_flush_paths_only(dev);
> +	ipoib_mcast_dev_flush(dev);
> +	
> +	if (level >= 1)
> +		ipoib_ib_dev_down(dev, 0);
>  
> -	if (pkey_event) {
> +	if (level >= 2) {
>  		ipoib_ib_dev_stop(dev, 0);
>  		ipoib_ib_dev_open(dev);
>  	}
> @@ -957,29 +959,36 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
>  	 * we get here, don't bring it back up if it's not configured up
>  	 */
>  	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
> -		ipoib_ib_dev_up(dev);
> +		if (level >= 1)
> +			ipoib_ib_dev_up(dev);
>  		ipoib_mcast_restart_task(&priv->restart_task);
>  	}
>  }
>  
> -void ipoib_ib_dev_flush(struct work_struct *work)
> +void ipoib_ib_dev_flush0(struct work_struct *work)
>  {
>  	struct ipoib_dev_priv *priv =
> -		container_of(work, struct ipoib_dev_priv, flush_task);
> +		container_of(work, struct ipoib_dev_priv, flush_task0);
>  
> -	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
>  	__ipoib_ib_dev_flush(priv, 0);
>  }
>  
> -void ipoib_pkey_event(struct work_struct *work)
> +void ipoib_ib_dev_flush1(struct work_struct *work)
>  {
>  	struct ipoib_dev_priv *priv =
> -		container_of(work, struct ipoib_dev_priv, pkey_event_task);
> +		container_of(work, struct ipoib_dev_priv, flush_task1);
>  
> -	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
>  	__ipoib_ib_dev_flush(priv, 1);
>  }
>  
> +void ipoib_ib_dev_flush2(struct work_struct *work)
> +{
> +	struct ipoib_dev_priv *priv =
> +		container_of(work, struct ipoib_dev_priv, flush_task2);
> +
> +	__ipoib_ib_dev_flush(priv, 2);
> +}
> +
>  void ipoib_ib_dev_cleanup(struct net_device *dev)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 2442090..c41798d 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -259,6 +259,21 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path)
>  	return 0;
>  }
>  
> +static void path_free_only(struct net_device *dev, struct ipoib_path *path)
> +{
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
> +	struct ipoib_neigh *neigh, *tn;
> +	struct sk_buff *skb;
> +	unsigned long flags;
> +
> +	while ((skb = __skb_dequeue(&path->queue)))
> +		dev_kfree_skb_irq(skb);
> +
> +	if (path->ah)
> +		ipoib_put_ah(path->ah);
> +
> +	kfree(path);
> +}
>  static void path_free(struct net_device *dev, struct ipoib_path *path)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -350,6 +365,34 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
>  
>  #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
>  
> +void ipoib_flush_paths_only(struct net_device *dev)
> +{
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
> +	struct ipoib_path *path, *tp;
> +	LIST_HEAD(remove_list);
> +
> +	spin_lock_irq(&priv->tx_lock);
> +	spin_lock(&priv->lock);
> +
> +	list_splice_init(&priv->path_list, &remove_list);
> +
> +	list_for_each_entry(path, &remove_list, list)
> +		rb_erase(&path->rb_node, &priv->path_tree);
> +
> +	list_for_each_entry_safe(path, tp, &remove_list, list) {
> +		if (path->query)
> +			ib_sa_cancel_query(path->query_id, path->query);
> +		spin_unlock(&priv->lock);
> +		spin_unlock_irq(&priv->tx_lock);
> +		wait_for_completion(&path->done);
> +		path_free_only(dev, path);
> +		spin_lock_irq(&priv->tx_lock);
> +		spin_lock(&priv->lock);
> +	}
> +	spin_unlock(&priv->lock);
> +	spin_unlock_irq(&priv->tx_lock);
> +}
> +
>  void ipoib_flush_paths(struct net_device *dev)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -421,6 +464,8 @@ static void path_rec_completion(int status,
>  			__skb_queue_tail(&skqueue, skb);
>  
>  		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
> +			if (neigh->ah)
> +				ipoib_put_ah(neigh->ah);
>  			kref_get(&path->ah->ref);
>  			neigh->ah = path->ah;
>  			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
> @@ -989,9 +1034,10 @@ static void ipoib_setup(struct net_device *dev)
>  	INIT_LIST_HEAD(&priv->multicast_list);
>  
>  	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
> -	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
>  	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
> -	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
> +	INIT_WORK(&priv->flush_task0,   ipoib_ib_dev_flush0);
> +	INIT_WORK(&priv->flush_task1,   ipoib_ib_dev_flush1);
> +	INIT_WORK(&priv->flush_task2,   ipoib_ib_dev_flush2);
>  	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
>  	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
>  }
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> index 8766d29..80c0409 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
> @@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,
>  	if (record->element.port_num != priv->port)
>  		return;
>  
> -	if (record->event == IB_EVENT_PORT_ERR    ||
> -	    record->event == IB_EVENT_PORT_ACTIVE ||
> -	    record->event == IB_EVENT_LID_CHANGE  ||
> -	    record->event == IB_EVENT_SM_CHANGE   ||
> -	    record->event == IB_EVENT_CLIENT_REREGISTER) {
> -		ipoib_dbg(priv, "Port state change event\n");
> -		queue_work(ipoib_workqueue, &priv->flush_task);
> +	ipoib_dbg(priv, "Event %d on device %s port %d\n",record->event,
> +			record->device->name, record->element.port_num);
> +	if ( record->event == IB_EVENT_SM_CHANGE   ||
> +	     record->event == IB_EVENT_CLIENT_REREGISTER) {
> +		queue_work(ipoib_workqueue, &priv->flush_task0);
> +	} else if (record->event == IB_EVENT_PORT_ERR ||
> +		   record->event == IB_EVENT_PORT_ACTIVE ||
> +		   record->event == IB_EVENT_LID_CHANGE) {
> +		queue_work(ipoib_workqueue, &priv->flush_task1);
>  	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
> -		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
> -		queue_work(ipoib_workqueue, &priv->pkey_event_task);
> +		queue_work(ipoib_workqueue, &priv->flush_task2);
>  	}
>  }




More information about the general mailing list