[ofa-general] Kernel panic in IPoIB (RHEL5.1)

Jack Morgenstein jackm at dev.mellanox.co.il
Thu Jan 22 06:30:57 PST 2009


We saw the following kernel panic when testing ipoib stability intensively
by simultaneously (i.e., in separate processes, with random wait intervals) doing:
- ifconfig up/down
- opensm up/down
- ipoib ping
- arp delete
- driver up/down

Does anyone have ideas as to what might have happened?

The actual crash is at:
/*
 * Detach the first buffer from the queue headed by @list and return it,
 * or return NULL when the queue is empty.  No locking is performed here.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *next, *prev, *result;

        /* The list head itself is treated as a node of the circular list. */
        prev = (struct sk_buff *) list;
        next = prev->next;
        result = NULL;
        /* head->next == head means there is nothing queued. */
        if (next != prev) {
                result       = next;
                next         = next->next;
                list->qlen--;
                /*
                 * NOTE(review): the oops in the console dump appears to
                 * match this store: "48 89 50 08" decodes to
                 * mov %rdx,0x8(%rax), and RAX == 0x1 gives the faulting
                 * write address CR2 == 0x9.  That would mean the dequeued
                 * buffer's ->next held 0x1, i.e. the queue was already
                 * corrupted or freed -- to be confirmed.
                 */
====> here ==>  next->prev   = prev;
                prev->next   = next;
                /* Clear the removed buffer's own links. */
                result->next = result->prev = NULL;
        }
        return result;
}

This is called by ipoib_neigh_free:
void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
{
	struct sk_buff *skb;
	*to_ipoib_neigh(neigh->neighbour) = NULL;
===>	while ((skb = __skb_dequeue(&neigh->queue))) {

Which is called by path_free:
static void path_free(struct net_device *dev, struct ipoib_path *path)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_neigh *neigh, *tn;
        struct sk_buff *skb;
        unsigned long flags;

        while ((skb = __skb_dequeue(&path->queue)))
                dev_kfree_skb_irq(skb);

        spin_lock_irqsave(&priv->lock, flags);

        list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
                /*
                 * It's safe to call ipoib_put_ah() inside priv->lock
                 * here, because we know that path->ah will always
                 * hold one more reference, so ipoib_put_ah() will
                 * never do more than decrement the ref count.
                 */
                if (neigh->ah)
                        ipoib_put_ah(neigh->ah);

===> HERE       ipoib_neigh_free(dev, neigh);
        }


CONSOLE DUMP
==============
ib0: ib_sa_path_rec_get failed: -11
Unable to handle kernel NULL pointer dereference at 0000000000000009
RIP:  [<ffffffff883f1cb1>] :ib_ipoib:ipoib_neigh_free+0x2f/0x6e
PGD 0
Oops: 0002 [1] SMP
last sysfs file: /devices/pci0000:00/0000:00:01.0/irq
CPU 2
Modules linked in: netconsole nfs fscache nfsd exportfs lockd nfs_acl autofs4 hidp 
rfcomm l2cap bluetooth sunrpc rdma_ucm(U) ib_sdp(U) rdma_cm(U) iw_cm(U) ib_addr(U)
ib_ipoib(U) ipoib_helper(U) ib_cm(U) ib_sa(U) ipv6 ib_uverbs(U) ib_umad(U) mlx4_ib(U)
ib_mthca(U) ib_mad(U) ib_core(U) dm_mirror dm_multipath dm_mod video sbs
backlight i2c_ec i2c_core button battery asus_acpi acpi_memhotplug ac parport_pc
lp parport sg ide_cd pcspkr cdrom mlx4_core(U) bnx2 k8_edac k8temp hwmon serio_raw
edac_mc shpchp sata_svw libata megaraid_sas sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci_hcd
Pid: 2195, comm: ipoib Not tainted 2.6.18-53.el5 #1
RIP: 0010:[<ffffffff883f1cb1>]  [<ffffffff883f1cb1>] :ib_ipoib:ipoib_neigh_free+0x2f/0x6e
RSP: 0018:ffff8102273c7d30  EFLAGS: 00010012
RAX: 0000000000000001 RBX: ffff81012b95f0c0 RCX: ffff81010acbdc20
RDX: ffff81012b95f0e0 RSI: ffff81012b95f0c0 RDI: ffffffff8840e7a0
RBP: ffff810122b54500 R08: ffff8102273c6000 R09: ffff810227eacd30
R10: ffff810227eacd18 R11: ffffffff883c00cf R12: ffff81010acbdbc0
R13: 0000000000000246 R14: ffff810122b54500 R15: ffff810122b54000
FS:  00002aaaab22bb00(0000) GS:ffff8101041593c0(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000009 CR3: 0000000000201000 CR4: 00000000000006e0
Process ipoib (pid: 2195, threadinfo ffff8102273c6000, task ffff810126b067a0)
Stack:  ffff810122b54500 ffff81010acbdbf0 ffff81010acbdbf0 ffffffff883f201d  ffffffff800884ac
ffff81010acbdbc0 ffff810122b54000 ffff8102273c7cc0  0000000000000246 ffff810122b54500
ffff810122b54280 ffffffff883f3559
Call Trace:  [<ffffffff883f201d>] :ib_ipoib:path_free+0xc7/0x116
 [<ffffffff800884ac>] default_wake_function+0x0/0xe
 [<ffffffff883f3559>] :ib_ipoib:ipoib_flush_paths+0x117/0x186
 [<ffffffff883f489c>] :ib_ipoib:ipoib_ib_dev_flush_normal+0x0/0x11
 [<ffffffff883f44e3>] :ib_ipoib:ipoib_ib_dev_down+0xac/0xb3
 [<ffffffff883f487e>] :ib_ipoib:__ipoib_ib_dev_flush+0x1a9/0x1b6
 [<ffffffff8004b2ab>] run_workqueue+0x94/0xe5

Code: 48 89 50 08 48 89 43 20 48 c7 47 08 00 00 00 00 48 c7 07 00
 RIP  [<ffffffff883f1cb1>] :ib_ipoib:ipoib_neigh_free+0x2f/0x6e
 RSP <ffff8102273c7d30>
 CR2: 0000000000000009
 <0>Kernel panic - not syncing: Fatal exception

- Jack




More information about the general mailing list