[ofa-general] ***SPAM*** System crashed while booting Linux (ia64) with three Mellanox HCAs (15b3:6274)

Phillip Wilson phillipwils at gmail.com
Wed Mar 25 19:39:57 PDT 2009


System crashes with three Mellanox mezzanine cards (VID=15b3,
DID=0x6274) installed when booting Linux (ia64).  I am using Linux
2.6.24, but this issue also occurs with Linux kernel 2.6.29-rc8.

When I added several printk() calls to the driver function
ib_mad_post_receive_mads(), the system was able to boot, which implies
there is a timing issue in the driver.  The system boots if printk()
debug messages are added to the ib_mad_post_receive_mads() function
in "../drivers/infiniband/core/mad.c".  I have copied the
ib_mad_post_receive_mads() function, with my printk() calls, into this email.
Only debug messages "ib_mad_post_receive_mads(0)" through
"ib_mad_post_receive_mads(6)" are seen while booting to Linux.

A partial listing from ib_mad_post_receive_mad.S is posted below the "C" code.
The exact instruction that caused the system crash was located at

ib_mad_post_*+0x0080           st4              [r2]=r3                      MII
                               nop.i            0x0
                               nop.i            0x0

It tries to store r3=0x1600 to [r2] @ 0xE0000007E01C7CCC.


<<System Crash Log>>
ib_mthca: Mellanox InfiniBand HCA driver v0.08 (February 14, 2006)
ib_mthca: Initializing 0000:05:00.0
GSI 26 (level, low) -> CPU 14 (0x1600) vector 74
ACPI: PCI Interrupt 0000:05:00.0[A] -> GSI 26 (level, low) -> IRQ 74
ib_mthca: Initializing 0000:06:00.0
GSI 30 (level, low) -> CPU 15 (0x1700) vector 78
ACPI: PCI Interrupt 0000:06:00.0[A] -> GSI 30 (level, low) -> IRQ 78
[0] Entered SalCheckHandler
[0] SalCheckHandler: Saved Entry State
[0] SalCheckHandler: MCA Monarch: 0
[0] SalCheckHandler: Get Sm4

I am using OFED 1.2.5 in user space.

HCA firmware on all 3 cards is version 1.2.0

#> lspci -d 15b3:
0000:05:00.0 InfiniBand: Mellanox Technologies MT25204 [InfiniHost III
Lx HCA] (rev 20)
0000:06:00.0 InfiniBand: Mellanox Technologies MT25204 [InfiniHost III
Lx HCA] (rev 20)
0000:07:00.0 InfiniBand: Mellanox Technologies MT25204 [InfiniHost III
Lx HCA] (rev 20)

#> uname -a
Linux (none) 2.6.24.02.02.08 #3 SMP Wed Mar 25 16:40:52 PST 2009 ia64 unknown


/*
 * Allocate receive MADs and post receive WRs for them
 */
static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
                                    struct ib_mad_private *mad)
{
        unsigned long flags;
        int post, ret;
        struct ib_mad_private *mad_priv;
        struct ib_sge sg_list;
        struct ib_recv_wr recv_wr, *bad_recv_wr;
        struct ib_mad_queue *recv_queue = &qp_info->recv_queue;

        printk("ib_mad_post_receive_mads(0)\n");
        /* Initialize common scatter list fields */
        sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
        sg_list.lkey = (*qp_info->port_priv->mr).lkey;

        /* Initialize common receive WR fields */
        recv_wr.next = NULL;
        recv_wr.sg_list = &sg_list;
        recv_wr.num_sge = 1;

        printk("ib_mad_post_receive_mads(1)\n");
        do {
                /* Allocate and map receive buffer */
                if (mad) {
                        mad_priv = mad;
                        mad = NULL;
                } else {
                        printk("ib_mad_post_receive_mads(2)\n");
                        mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
                        if (!mad_priv) {
                                printk(KERN_ERR PFX "No memory for
receive buffer\n");
                                ret = -ENOMEM;
                                break;
                        }
                }
                printk("ib_mad_post_receive_mads(3)\n");
                sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
                                                 &mad_priv->grh,
                                                 sizeof *mad_priv -
                                                   sizeof mad_priv->header,
                                                 DMA_FROM_DEVICE);
                mad_priv->header.mapping = sg_list.addr;
                recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
                mad_priv->header.mad_list.mad_queue = recv_queue;

                /* Post receive WR */
                printk("ib_mad_post_receive_mads(4)\n");
                spin_lock_irqsave(&recv_queue->lock, flags);
                post = (++recv_queue->count < recv_queue->max_active);
                list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list
);
                printk("ib_mad_post_receive_mads(5)\n");
                spin_unlock_irqrestore(&recv_queue->lock, flags);
                printk("ib_mad_post_receive_mads(6)\n");
                ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
                if (ret) {
                        printk("ib_mad_post_receive_mads(7)\n");
                        spin_lock_irqsave(&recv_queue->lock, flags);
                        printk("ib_mad_post_receive_mads(8)\n");
                        list_del(&mad_priv->header.mad_list.list);
                        recv_queue->count--;
                        printk("ib_mad_post_receive_mads(9)\n");
                        spin_unlock_irqrestore(&recv_queue->lock, flags);
                        printk("ib_mad_post_receive_mads(10)\n");
                        ib_dma_unmap_single(qp_info->port_priv->device,
                                            mad_priv->header.mapping,
                                            sizeof *mad_priv -
                                              sizeof mad_priv->header,
                                            DMA_FROM_DEVICE);
                        printk("ib_mad_post_receive_mads(11)\n");
                        kmem_cache_free(ib_mad_cache, mad_priv);
                        printk("ib_mad_post_receive_mads(12)\n");
                        printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
                        break;
                }
        } while (post);

        return ret;
}


"ib_mad_post_receive_mad.S"
ib_mad_post_*+0x0000           alloc            r50=ar64,0,20,4,0            MII
                               adds             r12=-64,r12
                               mov              r18=0x128
ib_mad_post_*+0x0010           mov              r10=1;;                      MMI
                               ld8              r17=[r32]
                               adds             r15=24,r12
ib_mad_post_*+0x0020           adds             r45=32,r12                   MII
                               adds             r8=16,r12;;
                               adds             r2=16,r45
ib_mad_post_*+0x0030           adds             r9=56,r12                    MMI
                               mov              r51=r1
                               adds             r35=56,r32
ib_mad_post_*+0x0040           adds             r14=48,r17                   MMI
                               st4              [r15]=r18
                               mov              r49=b0;;
ib_mad_post_*+0x0050           ld8              r16=[r14];;                  MMI
                               adds             r11=24,r16
                               nop.i            0x0;;
ib_mad_post_*+0x0060           ld4              r3=[r11]                     MMI
                               st8              [r2]=r8
                               adds             r2=28,r12
ib_mad_post_*+0x0070           st4              [r9]=r10                     MMI
                               st8              [r45]=r0
                               nop.i            0x0;;
ib_mad_post_*+0x0080           st4              [r2]=r3                      MII
                               nop.i            0x0
                               nop.i            0x0
ib_mad_post_*+0x0090           cmp.eq           p8,p9=0,r33                  MFB
                               nop.f            0x0
                               nop.b            0x0;;



More information about the general mailing list