[ofa-general] Re: [PATCH] ipoib: null tx/rx_ring skb pointers on free
Ira Weiny
weiny2 at llnl.gov
Wed Nov 5 17:53:52 PST 2008
On Wed, 5 Nov 2008 17:23:07 -0800
akepner at sgi.com wrote:
>
> Way back in:
>
> http:// lists.openfabrics.org/pipermail/general/2008-May/050196.html
>
> I described an IPoIB-related panic we were seeing on large
> clusters. The signature was a backtrace like this:
>
> skb_over_panic
> :ib_ipoib:ipoib_ib_handle_rx_wc
> :ib_ipoib:ipoib_poll
> net_rx_action
> .....
>
> The bug is difficult to reproduce, but we finally got a crashdump,
> and the problem appears to be that stale skb pointers on the tx_ring
> were left pointing to skbs that had been since reused, so that the
> skb's data region was now unexpectedly short, etc.
>
> Recently LLNL reported something similar:
>
> http:// lists.openfabrics.org/pipermail/general/2008-October/054824.html
>
> A patch similar to the following seems to fix thing up.
>
> Ira, Al, if this looks OK, can you please sign off on it?
Yep, looks good.
Ira
>
> Signed-off-by: Arthur Kepner <akepner at sgi.com>
>
> ---
>
> ipoib_cm.c | 5 +++++
> ipoib_ib.c | 4 ++++
> 2 files changed, 9 insertions(+)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index 7b14c2c..8f8650b 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
> ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
> rx_ring[i].mapping);
> dev_kfree_skb_any(rx_ring[i].skb);
> + rx_ring[i].skb = NULL;
> }
>
> vfree(rx_ring);
> @@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> ++dev->stats.tx_errors;
> dev_kfree_skb_any(skb);
> + tx_req->skb = NULL;
> return;
> }
>
> @@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> ++dev->stats.tx_errors;
> ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
> dev_kfree_skb_any(skb);
> + tx_req->skb = NULL;
> } else {
> dev->trans_start = jiffies;
> ++tx->tx_head;
> @@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> dev->stats.tx_bytes += tx_req->skb->len;
>
> dev_kfree_skb_any(tx_req->skb);
> + tx_req->skb = NULL;
>
> netif_tx_lock(dev);
>
> @@ -1179,6 +1183,7 @@ timeout:
> ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
> DMA_TO_DEVICE);
> dev_kfree_skb_any(tx_req->skb);
> + tx_req->skb = NULL;
> ++p->tx_tail;
> netif_tx_lock_bh(p->dev);
> if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 28eb6f0..f7e3497 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> dev->stats.tx_bytes += tx_req->skb->len;
>
> dev_kfree_skb_any(tx_req->skb);
> + tx_req->skb = NULL;
>
> ++priv->tx_tail;
> if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> @@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
> if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
> ++dev->stats.tx_errors;
> dev_kfree_skb_any(skb);
> + tx_req->skb = NULL;
> return;
> }
>
> @@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
> --priv->tx_outstanding;
> ipoib_dma_unmap_tx(priv->ca, tx_req);
> dev_kfree_skb_any(skb);
> + tx_req->skb = NULL;
> if (netif_queue_stopped(dev))
> netif_wake_queue(dev);
> } else {
> @@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
> (ipoib_sendq_size - 1)];
> ipoib_dma_unmap_tx(priv->ca, tx_req);
> dev_kfree_skb_any(tx_req->skb);
> + tx_req->skb = NULL;
> ++priv->tx_tail;
> --priv->tx_outstanding;
> }
>
More information about the general
mailing list