[ofa-general] [PATCH] IPoIB: keep ib_wc[] on stack
akepner at sgi.com
Tue May 13 18:21:46 PDT 2008
We're getting panics like this one on big clusters:
skb_over_panic: text:ffffffff8821f32e len:160 put:100 head:ffff810372b0f000 data:ffff810372b0f01c tail:ffff810372b0f0bc end:ffff810372b0f080 dev:ib0
----------- [cut here ] --------- [please bite here ] ---------
Kernel BUG at net/core/skbuff.c:94
invalid opcode: 0000 [1] SMP
last sysfs file: /class/infiniband/mlx4_0/node_type
CPU 0
Modules linked in: worm sg sd_mod crc32c libcrc32c rdma_ucm rdma_cm iw_cm ib_addr ib_uverbs ib_umad iw_cxgb3 cxgb3 firmware_class mlx4_ib ib_mthca iscsi_tcp libiscsi scsi_transport_iscsi ib_ipoib ib_cm ib_sa ib_mad ib_core ipv6 loop numatools xpmem shpchp pci_hotplug i2c_i801 i2c_core mlx4_core libata scsi_mod nfs lockd nfs_acl af_packet sunrpc e1000
Pid: 0, comm: swapper Tainted: G U 2.6.16.46-0.12-smp #1
RIP: 0010:[<ffffffff8027a830>] <ffffffff8027a830>{skb_over_panic+77}
RSP: 0018:ffffffff80417e28 EFLAGS: 00010292
RAX: 0000000000000098 RBX: ffff81041b4bee08 RCX: 0000000000000292
RDX: ffffffff80347868 RSI: 0000000000000292 RDI: ffffffff80347860
RBP: ffff8103725817c0 R08: ffffffff80347868 R09: ffff81041d94e3c0
R10: 0000000000000000 R11: 0000000000000000 R12: ffff81041b4be500
R13: 0000000000000060 R14: 0000000000000900 R15: ffffc20000078908
FS: 0000000000000000(0000) GS:ffffffff803be000(0000) knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00002b44089dc000 CR3: 000000041f35d000 CR4: 00000000000006e0
Process swapper (pid: 0, threadinfo ffffffff803d8000, task ffffffff80341340)
Stack: ffff810372b0f0bc ffff810372b0f080 ffff81041b4be000 ffff81041b4be500
0000000000000060 ffffffff8821f336 ffffffff80417ec8 ffff81041b4be000
0000000417227014 0000000000000292
Call Trace: <IRQ> <ffffffff8821f336>{:ib_ipoib:ipoib_ib_handle_rx_wc+909}
<ffffffff882205a2>{:ib_ipoib:ipoib_poll+159} <ffffffff802811a5>{net_rx_action+165}
<ffffffff8013775d>{__do_softirq+85} <ffffffff8010c11e>{call_softirq+30}
<ffffffff8010d07c>{do_softirq+44} <ffffffff8010d435>{do_IRQ+64}
<ffffffff80109e3a>{mwait_idle+0} <ffffffff8010b25a>{ret_from_intr+0} <EOI>
<ffffffff80109e3a>{mwait_idle+0} <ffffffff80109e70>{mwait_idle+54}
<ffffffff80109e17>{cpu_idle+151} <ffffffff803da7ec>{start_kernel+601}
<ffffffff803da28a>{_sinittext+650}
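If I'm reading the skb_over_panic() output right, the numbers hang
together: data is at head + 0x1c and end is at head + 0x80, so the skb
has a 100-byte data area; len:160 minus put:100 means 60 bytes were
already there, so tail sat at head + 0x58 with only 0x28 (40) bytes of
room left, and the skb_put() of 100 bytes pushed tail to head + 0xbc,
0x3c (60) bytes past end. That's what you'd expect if the rx path used
a length that really belonged to some other completion.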
I started looking into what might cause this and found that IPoIB
always does something like this:
int ipoib_poll(struct net_device *dev, int *budget)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	....
		n = ib_poll_cq(priv->rcq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;
			....
			ipoib_ib_handle_rx_wc(dev, wc);
What happens if we call ib_poll_cq() and then, before the rx
completions have been processed by ipoib_ib_handle_rx_wc(), ipoib_poll()
gets called again on a different CPU? The second poll would overwrite
the shared priv->ibwc array out from under the first, corrupting
completions that are still being processed and leading to a panic like
the one above.
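A hypothetical interleaving (CPU numbers purely for illustration) would
look something like this, given that the rx handler ends up doing
skb_put(skb, wc->byte_len) on whatever is in the array by then:

	CPU0:	ipoib_poll()
		  n = ib_poll_cq(priv->rcq, t, priv->ibwc);  /* fills ibwc[] */
		  ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); /* starts on wc i */
	CPU1:	ipoib_poll()
		  n = ib_poll_cq(priv->rcq, t, priv->ibwc);  /* overwrites ibwc[] */
	CPU0:	  skb_put(skb, wc->byte_len);		     /* byte_len now belongs to a
							        different (bigger) completion
							        -> skb_over_panic */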
How about keeping the array of struct ib_wc on the stack?
This has been tested only on a small system, not yet on one large
enough to verify that it prevents the panic. But this "obviously"
needs to be fixed, no?
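For what it's worth, the stack cost of doing it this way looks
negligible -- with IPOIB_NUM_WC at 4 and struct ib_wc only a few dozen
bytes, the on-stack array is a couple of hundred bytes. If anyone wants
that spelled out, a compile-time guard along these lines (hypothetical,
not part of the patch; the 512 is an arbitrary ceiling) could sit next
to the declaration:

	BUILD_BUG_ON(IPOIB_NUM_WC * sizeof(struct ib_wc) > 512);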
Signed-off-by: Arthur Kepner <akepner at sgi.com>
---
ipoib.h | 3 ---
ipoib_ib.c | 31 +++++++++++++++++--------------
2 files changed, 17 insertions(+), 17 deletions(-)
diff -rup a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
--- a/drivers/infiniband/ulp/ipoib/ipoib.h 2008-05-12 16:39:22.024109931 -0700
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h 2008-05-13 16:21:52.433988977 -0700
@@ -326,7 +326,6 @@ struct ipoib_cm_dev_priv {
struct sk_buff_head skb_queue;
struct list_head start_list;
struct list_head reap_list;
- struct ib_wc ibwc[IPOIB_NUM_WC];
struct ib_sge rx_sge[IPOIB_CM_RX_SG];
struct ib_recv_wr rx_wr;
int nonsrq_conn_qp;
@@ -406,8 +405,6 @@ struct ipoib_dev_priv {
struct ib_send_wr tx_wr;
unsigned tx_outstanding;
- struct ib_wc ibwc[IPOIB_NUM_WC];
- struct ib_wc send_wc[MAX_SEND_CQE];
unsigned int tx_poll;
struct list_head dead_ahs;
diff -rup a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-05-12 16:39:22.020109690 -0700
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-05-13 17:19:28.809819954 -0700
@@ -366,12 +366,13 @@ static void ipoib_ib_handle_tx_wc(struct
void poll_tx(struct ipoib_dev_priv *priv)
{
+ struct ib_wc send_wc[MAX_SEND_CQE];
int n, i;
while (1) {
- n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
+ n = ib_poll_cq(priv->scq, MAX_SEND_CQE, send_wc);
for (i = 0; i < n; ++i)
- ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+ ipoib_ib_handle_tx_wc(priv->dev, send_wc + i);
if (n < MAX_SEND_CQE)
break;
@@ -380,6 +381,7 @@ void poll_tx(struct ipoib_dev_priv *priv
int ipoib_poll(struct net_device *dev, int *budget)
{
+ struct ib_wc ibwc[IPOIB_NUM_WC];
struct ipoib_dev_priv *priv = netdev_priv(dev);
int max = min(*budget, dev->quota);
int done;
@@ -393,10 +395,10 @@ poll_more:
while (max) {
t = min(IPOIB_NUM_WC, max);
- n = ib_poll_cq(priv->rcq, t, priv->ibwc);
+ n = ib_poll_cq(priv->rcq, t, ibwc);
for (i = 0; i < n; i++) {
- struct ib_wc *wc = priv->ibwc + i;
+ struct ib_wc *wc = ibwc + i;
if (wc->wr_id & IPOIB_OP_RECV) {
++done;
@@ -783,29 +785,30 @@ static int recvs_pending(struct net_devi
void ipoib_drain_cq(struct net_device *dev)
{
+ struct ib_wc ibwc[IPOIB_NUM_WC];
struct ipoib_dev_priv *priv = netdev_priv(dev);
int i, n;
do {
- n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, priv->ibwc);
+ n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, ibwc);
for (i = 0; i < n; ++i) {
/*
* Convert any successful completions to flush
* errors to avoid passing packets up the
* stack after bringing the device down.
*/
- if (priv->ibwc[i].status == IB_WC_SUCCESS)
- priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
+ if (ibwc[i].status == IB_WC_SUCCESS)
+ ibwc[i].status = IB_WC_WR_FLUSH_ERR;
- if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
- if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
- ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+ if (ibwc[i].wr_id & IPOIB_OP_RECV) {
+ if (ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(dev, ibwc + i);
else
- ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+ ipoib_ib_handle_rx_wc(dev, ibwc + i);
} else {
- if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
- ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
+ if (ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_tx_wc(dev, ibwc + i);
else
- ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+ ipoib_ib_handle_tx_wc(dev, ibwc + i);
}
}
} while (n == IPOIB_NUM_WC);