[ofw] ConnectX functionality is completely broken

Leonid Keller leonid at mellanox.co.il
Thu Jul 31 12:09:42 PDT 2008


Please find attached a fix that may solve the problem.
I say "may" because I haven't had time to test it, and the work week (down here)
has ended.
I'll check the patch next week.
 
About the patch.
 
First of all, special thanks to Anatoly for correctly pointing to
patch 1435.
 
To recall, patch 1435 improved the event notification mechanism for cq,
qp and srq objects.
I found one problem in the patch, which repeats itself for all three
objects and for both drivers: the new event handlers get the old (and wrong)
context values.
The new (and right) context values are not used. As a result, IBAL
callbacks are called with the wrong handle parameter, which ends in a
crash.
 
 
Index: hw/mlx4/kernel/bus/inc/ib_verbs.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs.h (revision 1452)
+++ hw/mlx4/kernel/bus/inc/ib_verbs.h (working copy)
@@ -742,7 +742,6 @@
  void *             cq_context;
  int                cqe;
  atomic_t           usecnt; /* count number of work queues */
- struct ib_cq_ex    x;
 };
 
 struct ib_srq {
@@ -752,7 +751,6 @@
  void        (*event_handler)(ib_event_rec_t *);
  void         *srq_context;
  atomic_t  usecnt;
- struct ib_srq_ex        x;
 };
 
 struct ib_qp {
@@ -766,7 +764,6 @@
  void         *qp_context;
  u32   qp_num;
  enum ib_qp_type  qp_type;
- struct ib_qp_ex         x;
 };
 
 struct ib_mr {
Index: hw/mlx4/kernel/bus/inc/ib_verbs_ex.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs_ex.h (revision 1452)
+++ hw/mlx4/kernel/bus/inc/ib_verbs_ex.h (working copy)
@@ -73,24 +73,6 @@
  int     fw_if_open;
 };
 
-/* extension for ib_cq */
-struct ib_cq_ex 
-{
- void *    ctx;  /* IBAL CQ context */
-};
-
-/* extension for ib_qp */
-struct ib_qp_ex 
-{
- void *    ctx;  /* IBAL QP context */
-};
-
-/* extension for ib_srq */
-struct ib_srq_ex 
-{
- void *    ctx;  /* IBAL SRQ context */
-};
-
 /* extension for ib_event */
 struct ib_event_ex 
 {
Index: hw/mlx4/kernel/hca/cq.c
===================================================================
--- hw/mlx4/kernel/hca/cq.c (revision 1452)
+++ hw/mlx4/kernel/hca/cq.c (working copy)
@@ -89,7 +89,7 @@
  // allocate cq 
  p_ib_cq = ibv_create_cq(p_ibdev, 
   cq_comp_handler, event_handler,
-  p_hca, *p_size, p_uctx, p_umv_buf );
+  (void*)cq_context, *p_size, p_uctx, p_umv_buf );
  if (IS_ERR(p_ib_cq)) {
   err = PTR_ERR(p_ib_cq);
   HCA_PRINT (TRACE_LEVEL_ERROR ,HCA_DBG_CQ, ("ibv_create_cq failed
(%d)\n", err));
@@ -97,9 +97,6 @@
   goto err_create_cq;
  }
 
- // fill the object
- p_ib_cq->x.ctx = (void*)cq_context;
- 
  // return the result
  *p_size = p_ib_cq->cqe;
 
Index: hw/mlx4/kernel/hca/qp.c
===================================================================
--- hw/mlx4/kernel/hca/qp.c (revision 1452)
+++ hw/mlx4/kernel/hca/qp.c (working copy)
@@ -100,8 +100,6 @@
  struct ib_qp_init_attr qp_init_attr;
  struct ib_ucontext *p_uctx = NULL;
  struct ib_pd *p_ib_pd = (struct ib_pd *)h_pd;
- struct ib_device *p_ib_dev = p_ib_pd->device;
- mlnx_hca_t *p_hca = ibdev2hca(p_ib_dev);
  struct ibv_create_qp *p_req = NULL;
  
  HCA_ENTER(HCA_DBG_QP);
@@ -121,7 +119,7 @@
  // prepare the parameters
  RtlZeroMemory(&qp_init_attr, sizeof(qp_init_attr));
  qp_init_attr.event_handler = event_handler;
- qp_init_attr.qp_context = p_hca;
+ qp_init_attr.qp_context = (void*)qp_uctx;
  qp_init_attr.send_cq = (struct ib_cq *)p_create_attr->h_sq_cq;
  qp_init_attr.recv_cq = (struct ib_cq *)p_create_attr->h_rq_cq;
  qp_init_attr.srq = (struct ib_srq *)p_create_attr->h_srq;
@@ -153,9 +151,6 @@
   goto err_create_qp;
  }
 
- // fill the object
- p_ib_qp->x.ctx = (void*)qp_uctx;
-
  // Query QP to obtain requested attributes
  if (p_qp_attr) {
   status = mlnx_query_qp((ib_qp_handle_t)p_ib_qp, p_qp_attr,
p_umv_buf);
Index: hw/mlx4/kernel/hca/srq.c
===================================================================
--- hw/mlx4/kernel/hca/srq.c (revision 1452)
+++ hw/mlx4/kernel/hca/srq.c (working copy)
@@ -55,8 +55,6 @@
  struct ib_srq_init_attr srq_init_attr;
  struct ib_ucontext *p_uctx = NULL;
  struct ib_pd *p_ib_pd = (struct ib_pd *)h_pd;
- struct ib_device *p_ib_dev = p_ib_pd->device;
- mlnx_hca_t *p_hca = ibdev2hca(p_ib_dev);
 
  HCA_ENTER(HCA_DBG_SRQ);
 
@@ -75,7 +73,7 @@
  // prepare the parameters
  RtlZeroMemory(&srq_init_attr, sizeof(srq_init_attr));
  srq_init_attr.event_handler = event_handler;
- srq_init_attr.srq_context = p_hca;
+ srq_init_attr.srq_context = (void*)srq_context;
  srq_init_attr.attr.max_wr = p_srq_attr->max_wr;
  srq_init_attr.attr.max_sge = p_srq_attr->max_sge;
  srq_init_attr.attr.srq_limit = p_srq_attr->srq_limit;
@@ -88,7 +86,6 @@
   status = errno_to_iberr(err);
   goto err_create_srq;
  }
- p_ib_srq->x.ctx = (void*)srq_context;
 
  // return the result
  if (ph_srq) *ph_srq = (ib_srq_handle_t)p_ib_srq;
Index: hw/mthca/kernel/hca_verbs.c
===================================================================
--- hw/mthca/kernel/hca_verbs.c (revision 1452)
+++ hw/mthca/kernel/hca_verbs.c (working copy)
@@ -870,12 +870,10 @@
  int err;
  ib_api_status_t  status;
  struct ib_srq *ib_srq_p;
- struct mthca_srq *srq_p;
  struct ib_srq_init_attr srq_init_attr;
  struct ib_ucontext *p_context = NULL;
  struct ib_pd *ib_pd_p = (struct ib_pd *)h_pd;
  struct ib_device *ib_dev = ib_pd_p->device;
- mlnx_hob_t  *hob_p = HOB_FROM_IBDEV(ib_dev);
 
  HCA_ENTER(HCA_DBG_SRQ);
 
@@ -894,7 +892,7 @@
  // prepare the parameters
  RtlZeroMemory(&srq_init_attr, sizeof(srq_init_attr));
  srq_init_attr.event_handler = event_handler;
- srq_init_attr.srq_context = hob_p;
+ srq_init_attr.srq_context = (void*)srq_context;
  srq_init_attr.attr = *p_srq_attr;
 
  // allocate srq 
@@ -906,12 +904,8 @@
   goto err_create_srq;
  }
 
- // fill the object
- srq_p = (struct mthca_srq *)ib_srq_p;
- srq_p->srq_context = (void*)srq_context;
- 
  // return the result
- if (ph_srq) *ph_srq = (ib_srq_handle_t)srq_p;
+ if (ph_srq) *ph_srq = (ib_srq_handle_t)ib_srq_p;
 
  status = IB_SUCCESS;
  
@@ -1044,7 +1038,6 @@
  struct ib_ucontext *p_context = NULL;
  struct ib_pd *ib_pd_p = (struct ib_pd *)h_pd;
  struct ib_device *ib_dev = ib_pd_p->device;
- mlnx_hob_t  *hob_p = HOB_FROM_IBDEV(ib_dev);
  
  HCA_ENTER(HCA_DBG_QP);
 
@@ -1063,7 +1056,7 @@
  RtlZeroMemory(&qp_init_attr, sizeof(qp_init_attr));
  qp_init_attr.qp_type = p_create_attr->qp_type;
  qp_init_attr.event_handler = event_handler;
- qp_init_attr.qp_context = hob_p;
+ qp_init_attr.qp_context = (void*)qp_context;
  qp_init_attr.recv_cq = (struct ib_cq *)p_create_attr->h_rq_cq;
  qp_init_attr.send_cq = (struct ib_cq *)p_create_attr->h_sq_cq;
  qp_init_attr.srq = (struct ib_srq *)p_create_attr->h_srq;
@@ -1087,7 +1080,6 @@
 
  // fill the object
  qp_p = (struct mthca_qp *)ib_qp_p;
- qp_p->qp_context = (void*)qp_context;
  qp_p->qp_init_attr = qp_init_attr;
 
  // Query QP to obtain requested attributes
@@ -1401,7 +1393,6 @@
  int err;
  ib_api_status_t  status;
  struct ib_cq *ib_cq_p;
- struct mthca_cq *cq_p;
  mlnx_hob_t   *hob_p;
  struct ib_device *ib_dev;
  struct ib_ucontext *p_context;
@@ -1437,7 +1428,7 @@
  // allocate cq 
  ib_cq_p = ibv_create_cq(ib_dev, 
   cq_comp_handler, event_handler,
-  hob_p, *p_size, p_context, p_umv_buf );
+  (void*)cq_context, *p_size, p_context, p_umv_buf );
  if (IS_ERR(ib_cq_p)) {
   err = PTR_ERR(ib_cq_p);
   HCA_PRINT (TRACE_LEVEL_ERROR ,HCA_DBG_CQ, ("ibv_create_cq failed
(%d)\n", err));
@@ -1445,15 +1436,11 @@
   goto err_create_cq;
  }
 
- // fill the object
- cq_p = (struct mthca_cq *)ib_cq_p;
- cq_p->cq_context = (void*)cq_context;
- 
  // return the result
 // *p_size = *p_size; // return the same value
  *p_size = ib_cq_p->cqe;
 
- if (ph_cq) *ph_cq = (ib_cq_handle_t)cq_p;
+ if (ph_cq) *ph_cq = (ib_cq_handle_t)ib_cq_p;
 
  status = IB_SUCCESS;
  
Index: hw/mthca/kernel/mthca_cq.c
===================================================================
--- hw/mthca/kernel/mthca_cq.c (revision 1452)
+++ hw/mthca/kernel/mthca_cq.c (working copy)
@@ -237,7 +237,7 @@
    ++cq->arm_sn;
  }
 
- cq->ibcq.comp_handler(cq->cq_context);
+ cq->ibcq.comp_handler(cq->ibcq.cq_context);
 }
 
 void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
Index: hw/mthca/kernel/mthca_provider.h
===================================================================
--- hw/mthca/kernel/mthca_provider.h (revision 1452)
+++ hw/mthca/kernel/mthca_provider.h (working copy)
@@ -189,7 +189,6 @@
 
 struct mthca_cq {
  struct ib_cq           ibcq;
- void      *cq_context; // leo: for IBAL shim
  spinlock_t             lock;
  atomic_t               refcount;
  int                    cqn;
@@ -234,7 +233,6 @@
 
  wait_queue_head_t wait;
  KMUTEX   mutex;
- void    *srq_context; 
 };
 
 struct mthca_wq {
@@ -254,7 +252,6 @@
 
 struct mthca_qp {
  struct ib_qp           ibqp;
- void      *qp_context; // leo: for IBAL shim
  //TODO: added just because absense of ibv_query_qp
  // thereafter it may be worth to be replaced by struct ib_qp_attr
qp_attr;
  struct ib_qp_init_attr qp_init_attr; // leo: for query_qp

 
 
 


________________________________

	From: ofw-bounces at lists.openfabrics.org
[mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Anatoly
Greenblatt
	Sent: Wednesday, July 30, 2008 2:35 PM
	To: ofw at lists.openfabrics.org
	Subject: [ofw] ConnectX functionality is completely broken
	Importance: High
	
	

	Hi,

	 

	I've compiled svn rev 1450 and tried to install it on ws2k3/ws2k8
x64 with a ConnectX HCA. A BSOD occurs during the installation. The
last revision we tested was 1421. I've recompiled a few revisions and
found that the changes between rev 1434 and 1435 cause the bugcheck.

	 

	Bottom line - ConnectX functionality is broken in revision
1435.

	 

	Regards,

	Anatoly.

	 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080731/c927b12f/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: event_notify.patch
Type: application/octet-stream
Size: 8656 bytes
Desc: event_notify.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080731/c927b12f/attachment.obj>


More information about the ofw mailing list