[ofa-general] Re: [PATCH] ipoib: null tx/rx_ring skb pointers on free

Al Chu chu11 at llnl.gov
Wed Nov 5 17:46:03 PST 2008


Hey Arthur,

On Wed, 2008-11-05 at 17:23 -0800, akepner at sgi.com wrote:
> Way back in:
> 
> http:// lists.openfabrics.org/pipermail/general/2008-May/050196.html
> 
> I described an IPoIB-related panic we were seeing on large 
> clusters. The signature was a backtrace like this:
> 
> 	skb_over_panic
> 	:ib_ipoib:ipoib_ib_handle_rx_wc
> 	:ib_ipoib:ipoib_poll
> 	net_rx_action
> 	.....
> 
> The bug is difficult to reproduce, but we finally got a crashdump, 
> and the problem appears to be that stale skb pointers on the tx_ring 
> were left pointing to skbs that had been since reused, so that the 
> skb's data region was now unexpectedly short, etc. 
> 
> Recently LLNL reported something similar:
> 
> http:// lists.openfabrics.org/pipermail/general/2008-October/054824.html
> 
> A patch similar to the following seems to fix thing up. 
> 
> Ira, Al, if this looks OK, can you please sign off on it?

Looks good to me.

Al

> Signed-off-by: Arthur Kepner <akepner at sgi.com>
> 
> --- 
> 
>  ipoib_cm.c |    5 +++++
>  ipoib_ib.c |    4 ++++
>  2 files changed, 9 insertions(+)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index 7b14c2c..8f8650b 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
>  			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
>  					      rx_ring[i].mapping);
>  			dev_kfree_skb_any(rx_ring[i].skb);
> +			rx_ring[i].skb = NULL;
>  		}
>  
>  	vfree(rx_ring);
> @@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
>  	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
>  		++dev->stats.tx_errors;
>  		dev_kfree_skb_any(skb);
> +		tx_req->skb = NULL;
>  		return;
>  	}
>  
> @@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
>  		++dev->stats.tx_errors;
>  		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
>  		dev_kfree_skb_any(skb);
> +		tx_req->skb = NULL;
>  	} else {
>  		dev->trans_start = jiffies;
>  		++tx->tx_head;
> @@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>  	dev->stats.tx_bytes += tx_req->skb->len;
>  
>  	dev_kfree_skb_any(tx_req->skb);
> +	tx_req->skb = NULL;
>  
>  	netif_tx_lock(dev);
>  
> @@ -1179,6 +1183,7 @@ timeout:
>  		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
>  				    DMA_TO_DEVICE);
>  		dev_kfree_skb_any(tx_req->skb);
> +		tx_req->skb = NULL;
>  		++p->tx_tail;
>  		netif_tx_lock_bh(p->dev);
>  		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 28eb6f0..f7e3497 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>  	dev->stats.tx_bytes += tx_req->skb->len;
>  
>  	dev_kfree_skb_any(tx_req->skb);
> +	tx_req->skb = NULL;
>  
>  	++priv->tx_tail;
>  	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> @@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
>  	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
>  		++dev->stats.tx_errors;
>  		dev_kfree_skb_any(skb);
> +		tx_req->skb = NULL;
>  		return;
>  	}
>  
> @@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
>  		--priv->tx_outstanding;
>  		ipoib_dma_unmap_tx(priv->ca, tx_req);
>  		dev_kfree_skb_any(skb);
> +		tx_req->skb = NULL;
>  		if (netif_queue_stopped(dev))
>  			netif_wake_queue(dev);
>  	} else {
> @@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
>  							(ipoib_sendq_size - 1)];
>  				ipoib_dma_unmap_tx(priv->ca, tx_req);
>  				dev_kfree_skb_any(tx_req->skb);
> +				tx_req->skb = NULL;
>  				++priv->tx_tail;
>  				--priv->tx_outstanding;
>  			}
> 
-- 
Albert Chu
chu11 at llnl.gov
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory




More information about the general mailing list