[ofa-general] [PATCH] IPoIB: keep ib_wc[] on stack
akepner at sgi.com
Tue May 13 18:21:46 PDT 2008
We're getting panics like this one on big clusters:
skb_over_panic: text:ffffffff8821f32e len:160 put:100 head:ffff810372b0f000 data:ffff810372b0f01c tail:ffff810372b0f0bc end:ffff810372b0f080 dev:ib0
----------- [cut here ] --------- [please bite here ] ---------
Kernel BUG at net/core/skbuff.c:94
invalid opcode: 0000 [1] SMP
last sysfs file: /class/infiniband/mlx4_0/node_type
CPU 0
Modules linked in: worm sg sd_mod crc32c libcrc32c rdma_ucm rdma_cm iw_cm ib_addr ib_uverbs ib_umad iw_cxgb3 cxgb3 firmware_class mlx4_ib ib_mthca iscsi_tcp libiscsi scsi_transport_iscsi ib_ipoib ib_cm ib_sa ib_mad ib_core ipv6 loop numatools xpmem shpchp pci_hotplug i2c_i801 i2c_core mlx4_core libata scsi_mod nfs lockd nfs_acl af_packet sunrpc e1000
Pid: 0, comm: swapper Tainted: G U 2.6.16.46-0.12-smp #1
RIP: 0010:[<ffffffff8027a830>] <ffffffff8027a830>{skb_over_panic+77}
RSP: 0018:ffffffff80417e28 EFLAGS: 00010292
RAX: 0000000000000098 RBX: ffff81041b4bee08 RCX: 0000000000000292
RDX: ffffffff80347868 RSI: 0000000000000292 RDI: ffffffff80347860
RBP: ffff8103725817c0 R08: ffffffff80347868 R09: ffff81041d94e3c0
R10: 0000000000000000 R11: 0000000000000000 R12: ffff81041b4be500
R13: 0000000000000060 R14: 0000000000000900 R15: ffffc20000078908
FS: 0000000000000000(0000) GS:ffffffff803be000(0000) knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00002b44089dc000 CR3: 000000041f35d000 CR4: 00000000000006e0
Process swapper (pid: 0, threadinfo ffffffff803d8000, task ffffffff80341340)
Stack: ffff810372b0f0bc ffff810372b0f080 ffff81041b4be000 ffff81041b4be500
0000000000000060 ffffffff8821f336 ffffffff80417ec8 ffff81041b4be000
0000000417227014 0000000000000292
Call Trace: <IRQ> <ffffffff8821f336>{:ib_ipoib:ipoib_ib_handle_rx_wc+909}
<ffffffff882205a2>{:ib_ipoib:ipoib_poll+159} <ffffffff802811a5>{net_rx_action+165}
<ffffffff8013775d>{__do_softirq+85} <ffffffff8010c11e>{call_softirq+30}
<ffffffff8010d07c>{do_softirq+44} <ffffffff8010d435>{do_IRQ+64}
<ffffffff80109e3a>{mwait_idle+0} <ffffffff8010b25a>{ret_from_intr+0} <EOI>
<ffffffff80109e3a>{mwait_idle+0} <ffffffff80109e70>{mwait_idle+54}
<ffffffff80109e17>{cpu_idle+151} <ffffffff803da7ec>{start_kernel+601}
<ffffffff803da28a>{_sinittext+650}
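If I'm reading the skb_over_panic() output right, the numbers hang
together: data is at head + 0x1c and end is at head + 0x80, so the skb
has a 100-byte data area; len:160 minus put:100 means 60 bytes were
already there, so tail sat at head + 0x58 with only 0x28 (40) bytes of
room left, and the skb_put() of 100 bytes pushed tail to head + 0xbc,
0x3c (60) bytes past end. That's what you'd expect if the rx path used
a length that really belonged to some other completion.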
I started looking into what might cause this and found that IPoIB
always does something like this:
int ipoib_poll(struct net_device *dev, int *budget)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	....
		n = ib_poll_cq(priv->rcq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;
			....
			ipoib_ib_handle_rx_wc(dev, wc);
What happens if we call ib_poll_cq() and then, before the rx
completions have been processed by ipoib_ib_handle_rx_wc(), ipoib_poll()
gets called again on a different CPU? The second poll would overwrite
the shared priv->ibwc array out from under the first, corrupting
completions that are still being processed and leading to a panic like
the one above.
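A hypothetical interleaving (CPU numbers purely for illustration) would
look something like this, given that the rx handler ends up doing
skb_put(skb, wc->byte_len) on whatever is in the array by then:

	CPU0:	ipoib_poll()
		  n = ib_poll_cq(priv->rcq, t, priv->ibwc);  /* fills ibwc[] */
		  ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); /* starts on wc i */
	CPU1:	ipoib_poll()
		  n = ib_poll_cq(priv->rcq, t, priv->ibwc);  /* overwrites ibwc[] */
	CPU0:	  skb_put(skb, wc->byte_len);		     /* byte_len now belongs to a
							        different (bigger) completion
							        -> skb_over_panic */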
How about keeping the array of struct ib_wc on the stack?
This has been tested only on a small system, not yet on one large
enough to verify that it prevents the panic. But this "obviously"
needs to be fixed, no?
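For what it's worth, the stack cost of doing it this way looks
negligible -- with IPOIB_NUM_WC at 4 and struct ib_wc only a few dozen
bytes, the on-stack array is a couple of hundred bytes. If anyone wants
that spelled out, a compile-time guard along these lines (hypothetical,
not part of the patch; the 512 is an arbitrary ceiling) could sit next
to the declaration:

	BUILD_BUG_ON(IPOIB_NUM_WC * sizeof(struct ib_wc) > 512);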
Signed-off-by: Arthur Kepner <akepner at sgi.com>
---
ipoib.h | 3 ---
ipoib_ib.c | 31 +++++++++++++++++--------------
2 files changed, 17 insertions(+), 17 deletions(-)
diff -rup a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
--- a/drivers/infiniband/ulp/ipoib/ipoib.h 2008-05-12 16:39:22.024109931 -0700
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h 2008-05-13 16:21:52.433988977 -0700
@@ -326,7 +326,6 @@ struct ipoib_cm_dev_priv {
struct sk_buff_head skb_queue;
struct list_head start_list;
struct list_head reap_list;
- struct ib_wc ibwc[IPOIB_NUM_WC];
struct ib_sge rx_sge[IPOIB_CM_RX_SG];
struct ib_recv_wr rx_wr;
int nonsrq_conn_qp;
@@ -406,8 +405,6 @@ struct ipoib_dev_priv {
struct ib_send_wr tx_wr;
unsigned tx_outstanding;
- struct ib_wc ibwc[IPOIB_NUM_WC];
- struct ib_wc send_wc[MAX_SEND_CQE];
unsigned int tx_poll;
struct list_head dead_ahs;
diff -rup a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-05-12 16:39:22.020109690 -0700
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-05-13 17:19:28.809819954 -0700
@@ -366,12 +366,13 @@ static void ipoib_ib_handle_tx_wc(struct
void poll_tx(struct ipoib_dev_priv *priv)
{
+ struct ib_wc send_wc[MAX_SEND_CQE];
int n, i;
while (1) {
- n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
+ n = ib_poll_cq(priv->scq, MAX_SEND_CQE, send_wc);
for (i = 0; i < n; ++i)
- ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+ ipoib_ib_handle_tx_wc(priv->dev, send_wc + i);
if (n < MAX_SEND_CQE)
break;
@@ -380,6 +381,7 @@ void poll_tx(struct ipoib_dev_priv *priv
int ipoib_poll(struct net_device *dev, int *budget)
{
+ struct ib_wc ibwc[IPOIB_NUM_WC];
struct ipoib_dev_priv *priv = netdev_priv(dev);
int max = min(*budget, dev->quota);
int done;
@@ -393,10 +395,10 @@ poll_more:
while (max) {
t = min(IPOIB_NUM_WC, max);
- n = ib_poll_cq(priv->rcq, t, priv->ibwc);
+ n = ib_poll_cq(priv->rcq, t, ibwc);
for (i = 0; i < n; i++) {
- struct ib_wc *wc = priv->ibwc + i;
+ struct ib_wc *wc = ibwc + i;
if (wc->wr_id & IPOIB_OP_RECV) {
++done;
@@ -783,29 +785,30 @@ static int recvs_pending(struct net_devi
void ipoib_drain_cq(struct net_device *dev)
{
+ struct ib_wc ibwc[IPOIB_NUM_WC];
struct ipoib_dev_priv *priv = netdev_priv(dev);
int i, n;
do {
- n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, priv->ibwc);
+ n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, ibwc);
for (i = 0; i < n; ++i) {
/*
* Convert any successful completions to flush
* errors to avoid passing packets up the
* stack after bringing the device down.
*/
- if (priv->ibwc[i].status == IB_WC_SUCCESS)
- priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
+ if (ibwc[i].status == IB_WC_SUCCESS)
+ ibwc[i].status = IB_WC_WR_FLUSH_ERR;
- if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
- if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
- ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+ if (ibwc[i].wr_id & IPOIB_OP_RECV) {
+ if (ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(dev, ibwc + i);
else
- ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+ ipoib_ib_handle_rx_wc(dev, ibwc + i);
} else {
- if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
- ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
+ if (ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_tx_wc(dev, ibwc + i);
else
- ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+ ipoib_ib_handle_tx_wc(dev, ibwc + i);
}
}
} while (n == IPOIB_NUM_WC);