[ofa-general] ***SPAM*** System crashed while booting Linux (ia64) with three Mellanox HCAs (15b3:6274)
Phillip Wilson
phillipwils at gmail.com
Wed Mar 25 19:39:57 PDT 2009
The system crashes when booting Linux (ia64) with three Mellanox
mezzanine cards (VID=0x15b3, DID=0x6274) installed. I am using Linux
2.6.24, but this issue also occurs with Linux kernel 2.6.29-rc8.
When I added several printk() debug messages to the driver function
ib_mad_post_receive_mads() in "../drivers/infiniband/core/mad.c", the
system was able to boot, which implies there is a timing issue in the
driver. I have copied the ib_mad_post_receive_mads() function, with my
printk() calls, into this email.
Only debug messages "ib_mad_post_receive_mads(0)" through
"ib_mad_post_receive_mads(6)" are seen while booting to Linux.
A partial listing from ib_mad_post_receive_mad.S is posted below the "C" code.
The exact instruction that caused the system crash was located at
ib_mad_post_*+0x0080 st4 [r2]=r3 MII
nop.i 0x0
nop.i 0x0
It tries to store r3=0x1600 to [r2] @ 0xE0000007E01C7CCC.
<<System Crash Log>>
ib_mthca: Mellanox InfiniBand HCA driver v0.08 (February 14, 2006)
ib_mthca: Initializing 0000:05:00.0
GSI 26 (level, low) -> CPU 14 (0x1600) vector 74
ACPI: PCI Interrupt 0000:05:00.0[A] -> GSI 26 (level, low) -> IRQ 74
ib_mthca: Initializing 0000:06:00.0
GSI 30 (level, low) -> CPU 15 (0x1700) vector 78
ACPI: PCI Interrupt 0000:06:00.0[A] -> GSI 30 (level, low) -> IRQ 78
[0] Entered SalCheckHandler
[0] SalCheckHandler: Saved Entry State
[0] SalCheckHandler: MCA Monarch: 0
[0] SalCheckHandler: Get Sm4
I am using OFED 1.2.5 in user space.
HCA firmware on all 3 cards is version 1.2.0
#> lspci -d 15b3:
0000:05:00.0 InfiniBand: Mellanox Technologies MT25204 [InfiniHost III
Lx HCA] (rev 20)
0000:06:00.0 InfiniBand: Mellanox Technologies MT25204 [InfiniHost III
Lx HCA] (rev 20)
0000:07:00.0 InfiniBand: Mellanox Technologies MT25204 [InfiniHost III
Lx HCA] (rev 20)
#> uname -a
Linux (none) 2.6.24.02.02.08 #3 SMP Wed Mar 25 16:40:52 PST 2009 ia64 unknown
/*
* Allocate receive MADs and post receive WRs for them
*/
static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_private *mad)
{
unsigned long flags;
int post, ret;
struct ib_mad_private *mad_priv;
struct ib_sge sg_list;
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
printk("ib_mad_post_receive_mads(0)\n");
/* Initialize common scatter list fields */
sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
sg_list.lkey = (*qp_info->port_priv->mr).lkey;
/* Initialize common receive WR fields */
recv_wr.next = NULL;
recv_wr.sg_list = &sg_list;
recv_wr.num_sge = 1;
printk("ib_mad_post_receive_mads(1)\n");
do {
/* Allocate and map receive buffer */
if (mad) {
mad_priv = mad;
mad = NULL;
} else {
printk("ib_mad_post_receive_mads(2)\n");
mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
if (!mad_priv) {
printk(KERN_ERR PFX "No memory for
receive buffer\n");
ret = -ENOMEM;
break;
}
}
printk("ib_mad_post_receive_mads(3)\n");
sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
&mad_priv->grh,
sizeof *mad_priv -
sizeof mad_priv->header,
DMA_FROM_DEVICE);
mad_priv->header.mapping = sg_list.addr;
recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
mad_priv->header.mad_list.mad_queue = recv_queue;
/* Post receive WR */
printk("ib_mad_post_receive_mads(4)\n");
spin_lock_irqsave(&recv_queue->lock, flags);
post = (++recv_queue->count < recv_queue->max_active);
list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list
);
printk("ib_mad_post_receive_mads(5)\n");
spin_unlock_irqrestore(&recv_queue->lock, flags);
printk("ib_mad_post_receive_mads(6)\n");
ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
if (ret) {
printk("ib_mad_post_receive_mads(7)\n");
spin_lock_irqsave(&recv_queue->lock, flags);
printk("ib_mad_post_receive_mads(8)\n");
list_del(&mad_priv->header.mad_list.list);
recv_queue->count--;
printk("ib_mad_post_receive_mads(9)\n");
spin_unlock_irqrestore(&recv_queue->lock, flags);
printk("ib_mad_post_receive_mads(10)\n");
ib_dma_unmap_single(qp_info->port_priv->device,
mad_priv->header.mapping,
sizeof *mad_priv -
sizeof mad_priv->header,
DMA_FROM_DEVICE);
printk("ib_mad_post_receive_mads(11)\n");
kmem_cache_free(ib_mad_cache, mad_priv);
printk("ib_mad_post_receive_mads(12)\n");
printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
break;
}
} while (post);
return ret;
}
"ib_mad_post_receive_mad.S"
ib_mad_post_*+0x0000 alloc r50=ar64,0,20,4,0 MII
adds r12=-64,r12
mov r18=0x128
ib_mad_post_*+0x0010 mov r10=1;; MMI
ld8 r17=[r32]
adds r15=24,r12
ib_mad_post_*+0x0020 adds r45=32,r12 MII
adds r8=16,r12;;
adds r2=16,r45
ib_mad_post_*+0x0030 adds r9=56,r12 MMI
mov r51=r1
adds r35=56,r32
ib_mad_post_*+0x0040 adds r14=48,r17 MMI
st4 [r15]=r18
mov r49=b0;;
ib_mad_post_*+0x0050 ld8 r16=[r14];; MMI
adds r11=24,r16
nop.i 0x0;;
ib_mad_post_*+0x0060 ld4 r3=[r11] MMI
st8 [r2]=r8
adds r2=28,r12
ib_mad_post_*+0x0070 st4 [r9]=r10 MMI
st8 [r45]=r0
nop.i 0x0;;
ib_mad_post_*+0x0080 st4 [r2]=r3 MII
nop.i 0x0
nop.i 0x0
ib_mad_post_*+0x0090 cmp.eq p8,p9=0,r33 MFB
nop.f 0x0
nop.b 0x0;;
More information about the general
mailing list