[ewg] Continue of "defer skb_orphan() until irqs enabled"

Moni Shoua monis at Voltaire.COM
Wed Sep 24 09:01:02 PDT 2008


Hi,
On Sep 9 Arthur started a thread with a patch that fixes an issue with a call to skb_orphan() when irqs are disabled.
Please look at http://lists.openfabrics.org/pipermail/general/2008-September/054053.html

Although there is an agreement that the fix isn't yet complete I think that it is important enough to include it in
OFED **until** we have a full fix for it in upstream kernel because it is still a big improvement.

I want to suggest a small addition to Arthur's original patch that allows to disable call to skb_orphan() totally.
I hope it's OK with you Arthur.


Please comment.

---------------------------------------------------------------------------------------------

If a socket's sk_write_space() method expects to run with interrupts 
enabled, syslog can get very noisy with messages like:

Badness in local_bh_enable at kernel/softirq.c:140

 Call Trace:
  [<a000000100014ce0>] show_stack+0x40/0xa0
  [<a000000100014d70>] dump_stack+0x30/0x60
  [<a0000001000a1730>] local_bh_enable+0x90/0x140
  [<a000000100557d30>] _spin_unlock_bh+0x30/0x60
  [<a0000002097787f0>] svc_sock_enqueue+0x750/0x780 [sunrpc]
  [<a00000020977a420>] svc_write_space+0xc0/0x1c0 [sunrpc]
  [<a000000100451ad0>] sock_wfree+0xd0/0x140
  [<a0000002087959e0>] ipoib_send+0x1120/0x14a0 [ib_ipoib]
  [<a00000020878dfa0>] ipoib_start_xmit+0x380/0x1140 [ib_ipoib]
  [<a000000100461ab0>] dev_hard_start_xmit+0x4b0/0x680
  [<a00000010048d2f0>] __qdisc_run+0x2d0/0x680


A simple fix is to defer calling skb_orphan() until interrupts have 
been reenabled.
Also, since there is still a race that could end up with a call to skb_orphan() for an
already freed skb, a module parameter allows to disable it totally.


Signed-off-by: Arthur Kepner <akepner at sgi.com>
Signed-off-by: Moni Shoua <monis at voltaire.com>
---

Index: ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofa_kernel-1.4.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2008-09-24 18:01:28.000000000 -0400
+++ ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib.h	2008-09-24 18:12:47.000000000 -0400
@@ -442,7 +442,7 @@
 int ipoib_add_pkey_attr(struct net_device *dev);
 int ipoib_add_umcast_attr(struct net_device *dev);
 
-void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+int ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
 
Index: ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofa_kernel-1.4.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2008-09-24 18:01:28.000000000 -0400
+++ ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2008-09-24 18:12:47.000000000 -0400
@@ -525,13 +525,14 @@
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
 
-void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+int ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_tx_buf *tx_req;
 	int hlen;
 	void *phead;
+	int ret = 1; /* assume the worst */
 
 	if (skb_is_gso(skb)) {
 		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
@@ -541,7 +542,7 @@
 			++dev->stats.tx_dropped;
 			++dev->stats.tx_errors;
 			dev_kfree_skb_any(skb);
-			return;
+			return 1;
 		}
 	} else {
 		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
@@ -550,7 +551,7 @@
 			++dev->stats.tx_dropped;
 			++dev->stats.tx_errors;
 			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
-			return;
+			return 1;
 		}
 		phead = NULL;
 		hlen  = 0;
@@ -571,7 +572,7 @@
 	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
-		return;
+		return 1;
 	}
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -593,6 +594,7 @@
 		--priv->tx_outstanding;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
+		ret = 1;
 		if (netif_queue_stopped(dev))
 			netif_wake_queue(dev);
 	} else {
@@ -600,13 +602,14 @@
 
 		address->last_send = priv->tx_head;
 		++priv->tx_head;
-		skb_orphan(skb);
-
+		ret = 0;
 	}
 
 	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
 		while (poll_tx(priv))
 			; /* nothing */
+
+	return ret;
 }
 
 static void __ipoib_reap_ah(struct net_device *dev)
Index: ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- ofa_kernel-1.4.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2008-09-24 18:01:28.000000000 -0400
+++ ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib_main.c	2008-09-24 18:12:47.000000000 -0400
@@ -69,6 +69,10 @@
 MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
 		"(default = 64)");
 
+static int early_skb_dtor = 1;
+module_param(early_skb_dtor, bool, 0444);
+MODULE_PARM_DESC(early_skb_dtor,  "Enable early call to skb sestructor after send)");
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
 
@@ -630,7 +634,7 @@
 				goto err_drop;
 			}
 		} else
-			ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			(void) ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
 	} else {
 		neigh->ah  = NULL;
 
@@ -711,7 +715,7 @@
 		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
 			  be16_to_cpu(path->pathrec.dlid));
 
-		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
+		(void) ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
 	} else if ((path->query || !path_rec_start(dev, path)) &&
 		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 		/* put pseudoheader back on for next time */
@@ -730,6 +734,7 @@
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_neigh *neigh;
 	unsigned long flags;
+	int orphan = 0;
 
 	if (unlikely(!spin_trylock_irqsave(&priv->tx_lock, flags)))
 		return NETDEV_TX_LOCKED;
@@ -769,7 +774,9 @@
 				goto out;
 			}
 		} else if (neigh->ah) {
-			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			int ret;
+			ret = ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			orphan = !ret;
 			goto out;
 		}
 
@@ -814,6 +821,8 @@
 
 out:
 	spin_unlock_irqrestore(&priv->tx_lock, flags);
+	if (early_skb_dtor && orphan)
+		skb_orphan(skb);
 
 	return NETDEV_TX_OK;
 }
Index: ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- ofa_kernel-1.4.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-09-18 08:16:26.000000000 -0400
+++ ofa_kernel-1.4/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-09-24 18:12:47.000000000 -0400
@@ -723,7 +723,7 @@
 			}
 		}
 
-		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+		(void) ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
 	}
 
 unlock:



More information about the ewg mailing list