[openib-general] Re: [PATCH] libmthca: fix wqe post

Viswanath Krishnamurthy viswa.krish at gmail.com
Tue Sep 13 17:22:01 PDT 2005


Roland,

I got the latest sources and built them along with the drivers.

Userland mthca
============
Your test application ran fine without any issue. (rctest)
When I ran the cmpost program which I sent you, I started getting errors
from the mthca library even for a smaller number of connections (earlier it
was working). This looks like an error dump in the mthca library.

.............. [ 0] 00000493
[ 4] 00000000
[ 8] 00000000
[ c] 00000000
[10] 05f40000
[14] 00000000 
[18] 00000042
[1c] fe100000
failed polling CQ: 142: err 1 <=== This is from cmpost program
[ 0] 00000493
[ 4] 00000000
[ 8] 00000000
[ c] 00000000
[10] 05f90000
[14] 00000000
[18] 00000082
[1c] fe100000
failed polling CQ: 142: err 1
[ 0] 00000493

Also, it is now easier to trigger the panic by killing the cmpost server
program. The panic may be happening on an error path.

printing eip:
c029197d
*pde = 35d56001
Oops: 0000 [#1]
SMP
Modules linked in: nfs nfsd exportfs lockd autofs4 sunrpc uhci_hcd ehci_hcd 
hw_random e1000 ext3 jbd sd_mod
CPU: 0
EIP: 0060:[<c029197d>] Not tainted VLI
EFLAGS: 00010002 (2.6.13)
EIP is at mthca_poll_cq+0x158/0x534
eax: 00000000 ebx: f5e90280 ecx: 00000006 edx: 00001250
esi: 0000023a edi: f5e90304 ebp: f7941f0c esp: f7941ea4
ds: 007b es: 007b ss: 0068
Process ib_mad1 (pid: 308, threadinfo=f7940000 task=f7cb7540)
Stack: f7941ed0 c0118c7d f7def41c c0355dc0 f7cb7540 f7dea41c c1a01bc0 
00000000
00000080 00000000 00000000 00000286 f7ce1000 f7941f0c 00000001 f7dea400
f8806000 00000292 00000001 00000000 f5e90280 f7ce1000 f7def400 f7941f0c
Call Trace:
[<c0118c7d>] load_balance_newidle+0x23/0xa2
[<c0276b42>] ib_mad_completion_handler+0x2c/0x8d
[<c012f9c6>] remove_wait_queue+0xf/0x34
[<c012bd0d>] worker_thread+0x1b0/0x23a
[<c02fdb53>] schedule+0x5d3/0xbdf
[<c0276b16>] ib_mad_completion_handler+0x0/0x8d
[<c011942d>] default_wake_function+0x0/0xc
[<c011942d>] default_wake_function+0x0/0xc
[<c012bb5d>] worker_thread+0x0/0x23a
[<c012f700>] kthread+0x8a/0xb2
[<c012f676>] kthread+0x0/0xb2
[<c0101cf9>] kernel_thread_helper+0x5/0xb
Code: 01 00 00 8b 44 24 18 8d bb 84 00 00 00 8b 53 5c 8b 70 18 8b 4f 24 0f 
ce 2b b3 b8 00 00 00 8b 83 bc 00 00 00 d3 ee 01 f2 8d 14 d0 <8b> 02 8b 52 04 
85 ff 89 45 00 89 55 04 74 16 8b 57 10 89 f0 39

-Viswa


On 9/13/05, Roland Dreier <rolandd at cisco.com> wrote:
> 
> Viswanath> Once you generate a kernel patch, I can test out both
> Viswanath> user and kernel mthca since I have the tests ready..
> 
> Excellent. I merged MST's patch, and applied the patch below to the
> kernel. (So you can either update from svn or apply the patches)
> 
> Thanks for testing -- let me know if you still see problems.
> 
> Index: infiniband/hw/mthca/mthca_srq.c
> ===================================================================
> --- infiniband/hw/mthca/mthca_srq.c (revision 3404)
> +++ infiniband/hw/mthca/mthca_srq.c (working copy)
> @@ -189,7 +189,6 @@ int mthca_alloc_srq(struct mthca_dev *de
> 
> srq->max = attr->max_wr;
> srq->max_gs = attr->max_sge;
> - srq->last = NULL;
> srq->counter = 0;
> 
> if (mthca_is_memfree(dev))
> @@ -264,6 +263,7 @@ int mthca_alloc_srq(struct mthca_dev *de
> 
> srq->first_free = 0;
> srq->last_free = srq->max - 1;
> + srq->last = get_wqe(srq, srq->max - 1);
> 
> return 0;
> 
> @@ -446,13 +446,11 @@ int mthca_tavor_post_srq_recv(struct ib_
> ((struct mthca_data_seg *) wqe)->addr = 0;
> }
> 
> - if (likely(prev_wqe)) {
> - ((struct mthca_next_seg *) prev_wqe)->nda_op =
> - cpu_to_be32((ind << srq->wqe_shift) | 1);
> - wmb();
> - ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> - cpu_to_be32(MTHCA_NEXT_DBD);
> - }
> + ((struct mthca_next_seg *) prev_wqe)->nda_op =
> + cpu_to_be32((ind << srq->wqe_shift) | 1);
> + wmb();
> + ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> + cpu_to_be32(MTHCA_NEXT_DBD);
> 
> srq->wrid[ind] = wr->wr_id;
> srq->first_free = next_ind;
> Index: infiniband/hw/mthca/mthca_qp.c
> ===================================================================
> --- infiniband/hw/mthca/mthca_qp.c (revision 3404)
> +++ infiniband/hw/mthca/mthca_qp.c (working copy)
> @@ -227,7 +227,6 @@ static void mthca_wq_init(struct mthca_w
> wq->last_comp = wq->max - 1;
> wq->head = 0;
> wq->tail = 0;
> - wq->last = NULL;
> }
> 
> void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
> @@ -1103,6 +1102,9 @@ static int mthca_alloc_qp_common(struct
> }
> }
> 
> + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
> + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
> +
> return 0;
> }
> 
> @@ -1583,15 +1585,13 @@ int mthca_tavor_post_send(struct ib_qp *
> goto out;
> }
> 
> - if (prev_wqe) {
> - ((struct mthca_next_seg *) prev_wqe)->nda_op =
> - cpu_to_be32(((ind << qp->sq.wqe_shift) +
> - qp->send_wqe_offset) |
> - mthca_opcode[wr->opcode]);
> - wmb();
> - ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> - cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);
> - }
> + ((struct mthca_next_seg *) prev_wqe)->nda_op =
> + cpu_to_be32(((ind << qp->sq.wqe_shift) +
> + qp->send_wqe_offset) |
> + mthca_opcode[wr->opcode]);
> + wmb();
> + ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> + cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);
> 
> if (!size0) {
> size0 = size;
> @@ -1688,13 +1688,11 @@ int mthca_tavor_post_receive(struct ib_q
> 
> qp->wrid[ind] = wr->wr_id;
> 
> - if (likely(prev_wqe)) {
> - ((struct mthca_next_seg *) prev_wqe)->nda_op =
> - cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
> - wmb();
> - ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> - cpu_to_be32(MTHCA_NEXT_DBD | size);
> - }
> + ((struct mthca_next_seg *) prev_wqe)->nda_op =
> + cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
> + wmb();
> + ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> + cpu_to_be32(MTHCA_NEXT_DBD | size);
> 
> if (!size0)
> size0 = size;
> @@ -1905,15 +1903,13 @@ int mthca_arbel_post_send(struct ib_qp *
> goto out;
> }
> 
> - if (likely(prev_wqe)) {
> - ((struct mthca_next_seg *) prev_wqe)->nda_op =
> - cpu_to_be32(((ind << qp->sq.wqe_shift) +
> - qp->send_wqe_offset) |
> - mthca_opcode[wr->opcode]);
> - wmb();
> - ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> - cpu_to_be32(MTHCA_NEXT_DBD | size);
> - }
> + ((struct mthca_next_seg *) prev_wqe)->nda_op =
> + cpu_to_be32(((ind << qp->sq.wqe_shift) +
> + qp->send_wqe_offset) |
> + mthca_opcode[wr->opcode]);
> + wmb();
> + ((struct mthca_next_seg *) prev_wqe)->ee_nds =
> + cpu_to_be32(MTHCA_NEXT_DBD | size);
> 
> if (!size0) {
> size0 = size;
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20050913/ae71bff1/attachment.html>


More information about the general mailing list