Roland,<br>
<br>
I got the latest sources and built them along with the drivers.<br>
<br>
Userland mthca<br>
============<br>
Your test application ran fine without any issue. (rctest)<br>
When I ran the cmpost program which I sent you, I started getting errors from<br>
the mthca library even for a smaller number of connections (earlier it was working). This looks<br>
like an error dump in the mthca library.<br>
<br>
..............  [ 0] 00000493<br>
  [ 4] 00000000<br>
  [ 8] 00000000<br>
  [ c] 00000000<br>
  [10] 05f40000<br>
  [14] 00000000   <br>
  [18] 00000042<br>
  [1c] fe100000<br>
failed polling CQ: 142: err 1  <=== This is from cmpost program<br>
  [ 0] 00000493<br>
  [ 4] 00000000<br>
  [ 8] 00000000<br>
  [ c] 00000000<br>
  [10] 05f90000<br>
  [14] 00000000<br>
  [18] 00000082<br>
  [1c] fe100000<br>
failed polling CQ: 142: err 1<br>
  [ 0] 00000493<br>
<br>
Also, it is now easier to trigger the panic when you kill the cmpost server program. The panic<br>
may be happening on an error path.<br>
<br>
printing eip:<br>
c029197d<br>
*pde = 35d56001<br>
Oops: 0000 [#1]<br>
SMP<br>
Modules linked in: nfs nfsd exportfs lockd autofs4 sunrpc uhci_hcd ehci_hcd hw_random e1000 ext3 jbd sd_mod<br>
CPU:    0<br>
EIP:    0060:[<c029197d>]    Not tainted VLI<br>
EFLAGS: 00010002   (2.6.13)<br>
EIP is at mthca_poll_cq+0x158/0x534<br>
eax: 00000000   ebx: f5e90280   ecx: 00000006   edx: 00001250<br>
esi: 0000023a   edi: f5e90304   ebp: f7941f0c   esp: f7941ea4<br>
ds: 007b   es: 007b   ss: 0068<br>
Process ib_mad1 (pid: 308, threadinfo=f7940000 task=f7cb7540)<br>
Stack: f7941ed0 c0118c7d f7def41c c0355dc0 f7cb7540 f7dea41c c1a01bc0 00000000<br>
       00000080 00000000 00000000 00000286 f7ce1000 f7941f0c 00000001 f7dea400<br>
       f8806000 00000292 00000001 00000000 f5e90280 f7ce1000 f7def400 f7941f0c<br>
Call Trace:<br>
 [<c0118c7d>] load_balance_newidle+0x23/0xa2<br>
 [<c0276b42>] ib_mad_completion_handler+0x2c/0x8d<br>
 [<c012f9c6>] remove_wait_queue+0xf/0x34<br>
 [<c012bd0d>] worker_thread+0x1b0/0x23a<br>
 [<c02fdb53>] schedule+0x5d3/0xbdf<br>
 [<c0276b16>] ib_mad_completion_handler+0x0/0x8d<br>
 [<c011942d>] default_wake_function+0x0/0xc<br>
 [<c011942d>] default_wake_function+0x0/0xc<br>
 [<c012bb5d>] worker_thread+0x0/0x23a<br>
 [<c012f700>] kthread+0x8a/0xb2<br>
 [<c012f676>] kthread+0x0/0xb2<br>
 [<c0101cf9>] kernel_thread_helper+0x5/0xb<br>
Code: 01 00 00 8b 44 24 18 8d bb 84 00 00 00 8b 53 5c 8b 70 18 8b 4f 24
0f ce 2b b3 b8 00 00 00 8b 83 bc 00 00 00 d3 ee 01 f2 8d 14 d0
<8b> 02 8b 52 04 85 ff 89 45 00 89 55 04 74 16 8b 57 10 89 f0 39<br>
<br>
-Viswa<br>
<br><br><div><span class="gmail_quote">On 9/13/05, <b class="gmail_sendername">Roland Dreier</b> <<a href="mailto:rolandd@cisco.com">rolandd@cisco.com</a>> wrote:</span><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
    Viswanath> Once you generate a kernel patch, I can test out both<br>    Viswanath> user and kernel mthca since I have the tests ready..<br><br>Excellent.  I merged MST's patch, and applied the patch below to the
<br>kernel.  (So you can either update from svn or apply the patches)<br><br>Thanks for testing -- let me know if you still see problems.<br><br>Index: infiniband/hw/mthca/mthca_srq.c<br>===================================================================
<br>--- infiniband/hw/mthca/mthca_srq.c     (revision 3404)<br>+++ infiniband/hw/mthca/mthca_srq.c     (working copy)<br>@@ -189,7 +189,6 @@ int mthca_alloc_srq(struct mthca_dev *de<br><br>        srq->max      = attr->max_wr;
<br>        srq->max_gs   = attr->max_sge;<br>-       srq->last     = NULL;<br>        srq->counter  = 0;<br><br>        if (mthca_is_memfree(dev))<br>@@ -264,6 +263,7 @@ int mthca_alloc_srq(struct mthca_dev *de
<br><br>        srq->first_free = 0;<br>        srq->last_free  = srq->max - 1;<br>+       srq->last       = get_wqe(srq, srq->max - 1);<br><br>        return 0;<br><br>@@ -446,13 +446,11 @@ int mthca_tavor_post_srq_recv(struct ib_
<br>                        ((struct
mthca_data_seg *) wqe)->addr = 0;<br>                }<br><br>-               if (likely(prev_wqe)) {<br>-                      
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>-                              
cpu_to_be32((ind << srq->wqe_shift) | 1);<br>-                      
wmb();<br>-                      
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>-                              
cpu_to_be32(MTHCA_NEXT_DBD);<br>-               }<br>+              
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>+                      
cpu_to_be32((ind << srq->wqe_shift) | 1);<br>+               wmb();<br>+              
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>+                      
cpu_to_be32(MTHCA_NEXT_DBD);<br><br>                srq->wrid[ind]  =
wr->wr_id;<br>                srq->first_free = next_ind;<br>Index: infiniband/hw/mthca/mthca_qp.c<br>===================================================================<br>--- infiniband/hw/mthca/mthca_qp.c      (revision 3404)
<br>+++ infiniband/hw/mthca/mthca_qp.c      (working copy)<br>@@ -227,7 +227,6 @@ static void mthca_wq_init(struct mthca_w<br>        wq->last_comp = wq->max - 1;<br>        wq->head      = 0;<br>        wq->tail      = 0;
<br>-       wq->last      = NULL;<br> }<br><br> void mthca_qp_event(struct mthca_dev *dev, u32 qpn,<br>@@ -1103,6 +1102,9 @@ static int mthca_alloc_qp_common(struct<br>                }<br>        }<br><br>+       qp->
sq.last = get_send_wqe(qp, qp->sq.max - 1);<br>+       qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);<br>+<br>        return 0;<br> }<br><br>@@ -1583,15 +1585,13 @@ int mthca_tavor_post_send(struct ib_qp *<br>                        goto
out;<br>                }<br><br>-               if (prev_wqe) {<br>-                      
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>-                              
cpu_to_be32(((ind << qp->sq.wqe_shift) +<br>-                                            qp->send_wqe_offset)
|<br>-                                          
mthca_opcode[wr->opcode]);<br>-                      
wmb();<br>-                      
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>-                              
cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);<br>-               }<br>+              
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>+                      
cpu_to_be32(((ind << qp->sq.wqe_shift) +<br>+                                    qp->send_wqe_offset)
|<br>+                                  
mthca_opcode[wr->opcode]);<br>+               wmb();<br>+              
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>+                      
cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);<br><br>                if (!size0) {<br>                        size0
= size;<br>@@ -1688,13 +1688,11 @@ int mthca_tavor_post_receive(struct ib_q<br><br>                qp->wrid[ind] = wr->wr_id;<br><br>-               if (likely(prev_wqe)) {<br>-                      
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>-                              
cpu_to_be32((ind << qp->rq.wqe_shift) | 1);<br>-                      
wmb();<br>-                      
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>-                              
cpu_to_be32(MTHCA_NEXT_DBD | size);<br>-               }<br>+              
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>+                      
cpu_to_be32((ind << qp->rq.wqe_shift) | 1);<br>+               wmb();<br>+              
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>+                      
cpu_to_be32(MTHCA_NEXT_DBD | size);<br><br>                if (!size0)<br>                        size0
= size;<br>@@ -1905,15 +1903,13 @@ int mthca_arbel_post_send(struct ib_qp *<br>                        goto
out;<br>                }<br><br>-               if (likely(prev_wqe)) {<br>-                      
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>-                              
cpu_to_be32(((ind << qp->sq.wqe_shift) +<br>-                                            qp->send_wqe_offset)
|<br>-                                          
mthca_opcode[wr->opcode]);<br>-                      
wmb();<br>-                      
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>-                              
cpu_to_be32(MTHCA_NEXT_DBD | size);<br>-               }<br>+              
((struct mthca_next_seg *) prev_wqe)->nda_op =<br>+                      
cpu_to_be32(((ind << qp->sq.wqe_shift) +<br>+                                    qp->send_wqe_offset)
|<br>+                                  
mthca_opcode[wr->opcode]);<br>+               wmb();<br>+              
((struct mthca_next_seg *) prev_wqe)->ee_nds =<br>+                      
cpu_to_be32(MTHCA_NEXT_DBD | size);<br><br>                if (!size0) {<br>                        size0
= size;<br></blockquote></div><br>