[ofw] ConnectX functionality is completely broken
Leonid Keller
leonid at mellanox.co.il
Thu Jul 31 12:09:42 PDT 2008
Attached is a fix that may solve the problem.
I say "may" because I haven't had time to test it, and the work week (down here)
has ended.
I'll check the patch next week.
About the patch.
First of all, special thanks to Anatoly for correctly pointing to the 1435
patch.
To recap, the 1435 patch improved the event notification mechanism for cq,
qp and srq objects.
I found one problem in the patch, which repeats itself for all three
objects and for both drivers: new event handlers get the old (and wrong)
context values.
The new (and correct) context values are not used. As a result, IBAL
callbacks are called with the wrong handle parameter, which ends in a
crash.
Index: hw/mlx4/kernel/bus/inc/ib_verbs.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs.h (revision 1452)
+++ hw/mlx4/kernel/bus/inc/ib_verbs.h (working copy)
@@ -742,7 +742,6 @@
void * cq_context;
int cqe;
atomic_t usecnt; /* count number of work queues */
- struct ib_cq_ex x;
};
struct ib_srq {
@@ -752,7 +751,6 @@
void (*event_handler)(ib_event_rec_t *);
void *srq_context;
atomic_t usecnt;
- struct ib_srq_ex x;
};
struct ib_qp {
@@ -766,7 +764,6 @@
void *qp_context;
u32 qp_num;
enum ib_qp_type qp_type;
- struct ib_qp_ex x;
};
struct ib_mr {
Index: hw/mlx4/kernel/bus/inc/ib_verbs_ex.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs_ex.h (revision 1452)
+++ hw/mlx4/kernel/bus/inc/ib_verbs_ex.h (working copy)
@@ -73,24 +73,6 @@
int fw_if_open;
};
-/* extension for ib_cq */
-struct ib_cq_ex
-{
- void * ctx; /* IBAL CQ context */
-};
-
-/* extension for ib_qp */
-struct ib_qp_ex
-{
- void * ctx; /* IBAL QP context */
-};
-
-/* extension for ib_srq */
-struct ib_srq_ex
-{
- void * ctx; /* IBAL SRQ context */
-};
-
/* extension for ib_event */
struct ib_event_ex
{
Index: hw/mlx4/kernel/hca/cq.c
===================================================================
--- hw/mlx4/kernel/hca/cq.c (revision 1452)
+++ hw/mlx4/kernel/hca/cq.c (working copy)
@@ -89,7 +89,7 @@
// allocate cq
p_ib_cq = ibv_create_cq(p_ibdev,
cq_comp_handler, event_handler,
- p_hca, *p_size, p_uctx, p_umv_buf );
+ (void*)cq_context, *p_size, p_uctx, p_umv_buf );
if (IS_ERR(p_ib_cq)) {
err = PTR_ERR(p_ib_cq);
HCA_PRINT (TRACE_LEVEL_ERROR ,HCA_DBG_CQ, ("ibv_create_cq failed
(%d)\n", err));
@@ -97,9 +97,6 @@
goto err_create_cq;
}
- // fill the object
- p_ib_cq->x.ctx = (void*)cq_context;
-
// return the result
*p_size = p_ib_cq->cqe;
Index: hw/mlx4/kernel/hca/qp.c
===================================================================
--- hw/mlx4/kernel/hca/qp.c (revision 1452)
+++ hw/mlx4/kernel/hca/qp.c (working copy)
@@ -100,8 +100,6 @@
struct ib_qp_init_attr qp_init_attr;
struct ib_ucontext *p_uctx = NULL;
struct ib_pd *p_ib_pd = (struct ib_pd *)h_pd;
- struct ib_device *p_ib_dev = p_ib_pd->device;
- mlnx_hca_t *p_hca = ibdev2hca(p_ib_dev);
struct ibv_create_qp *p_req = NULL;
HCA_ENTER(HCA_DBG_QP);
@@ -121,7 +119,7 @@
// prepare the parameters
RtlZeroMemory(&qp_init_attr, sizeof(qp_init_attr));
qp_init_attr.event_handler = event_handler;
- qp_init_attr.qp_context = p_hca;
+ qp_init_attr.qp_context = (void*)qp_uctx;
qp_init_attr.send_cq = (struct ib_cq *)p_create_attr->h_sq_cq;
qp_init_attr.recv_cq = (struct ib_cq *)p_create_attr->h_rq_cq;
qp_init_attr.srq = (struct ib_srq *)p_create_attr->h_srq;
@@ -153,9 +151,6 @@
goto err_create_qp;
}
- // fill the object
- p_ib_qp->x.ctx = (void*)qp_uctx;
-
// Query QP to obtain requested attributes
if (p_qp_attr) {
status = mlnx_query_qp((ib_qp_handle_t)p_ib_qp, p_qp_attr,
p_umv_buf);
Index: hw/mlx4/kernel/hca/srq.c
===================================================================
--- hw/mlx4/kernel/hca/srq.c (revision 1452)
+++ hw/mlx4/kernel/hca/srq.c (working copy)
@@ -55,8 +55,6 @@
struct ib_srq_init_attr srq_init_attr;
struct ib_ucontext *p_uctx = NULL;
struct ib_pd *p_ib_pd = (struct ib_pd *)h_pd;
- struct ib_device *p_ib_dev = p_ib_pd->device;
- mlnx_hca_t *p_hca = ibdev2hca(p_ib_dev);
HCA_ENTER(HCA_DBG_SRQ);
@@ -75,7 +73,7 @@
// prepare the parameters
RtlZeroMemory(&srq_init_attr, sizeof(srq_init_attr));
srq_init_attr.event_handler = event_handler;
- srq_init_attr.srq_context = p_hca;
+ srq_init_attr.srq_context = (void*)srq_context;
srq_init_attr.attr.max_wr = p_srq_attr->max_wr;
srq_init_attr.attr.max_sge = p_srq_attr->max_sge;
srq_init_attr.attr.srq_limit = p_srq_attr->srq_limit;
@@ -88,7 +86,6 @@
status = errno_to_iberr(err);
goto err_create_srq;
}
- p_ib_srq->x.ctx = (void*)srq_context;
// return the result
if (ph_srq) *ph_srq = (ib_srq_handle_t)p_ib_srq;
Index: hw/mthca/kernel/hca_verbs.c
===================================================================
--- hw/mthca/kernel/hca_verbs.c (revision 1452)
+++ hw/mthca/kernel/hca_verbs.c (working copy)
@@ -870,12 +870,10 @@
int err;
ib_api_status_t status;
struct ib_srq *ib_srq_p;
- struct mthca_srq *srq_p;
struct ib_srq_init_attr srq_init_attr;
struct ib_ucontext *p_context = NULL;
struct ib_pd *ib_pd_p = (struct ib_pd *)h_pd;
struct ib_device *ib_dev = ib_pd_p->device;
- mlnx_hob_t *hob_p = HOB_FROM_IBDEV(ib_dev);
HCA_ENTER(HCA_DBG_SRQ);
@@ -894,7 +892,7 @@
// prepare the parameters
RtlZeroMemory(&srq_init_attr, sizeof(srq_init_attr));
srq_init_attr.event_handler = event_handler;
- srq_init_attr.srq_context = hob_p;
+ srq_init_attr.srq_context = (void*)srq_context;
srq_init_attr.attr = *p_srq_attr;
// allocate srq
@@ -906,12 +904,8 @@
goto err_create_srq;
}
- // fill the object
- srq_p = (struct mthca_srq *)ib_srq_p;
- srq_p->srq_context = (void*)srq_context;
-
// return the result
- if (ph_srq) *ph_srq = (ib_srq_handle_t)srq_p;
+ if (ph_srq) *ph_srq = (ib_srq_handle_t)ib_srq_p;
status = IB_SUCCESS;
@@ -1044,7 +1038,6 @@
struct ib_ucontext *p_context = NULL;
struct ib_pd *ib_pd_p = (struct ib_pd *)h_pd;
struct ib_device *ib_dev = ib_pd_p->device;
- mlnx_hob_t *hob_p = HOB_FROM_IBDEV(ib_dev);
HCA_ENTER(HCA_DBG_QP);
@@ -1063,7 +1056,7 @@
RtlZeroMemory(&qp_init_attr, sizeof(qp_init_attr));
qp_init_attr.qp_type = p_create_attr->qp_type;
qp_init_attr.event_handler = event_handler;
- qp_init_attr.qp_context = hob_p;
+ qp_init_attr.qp_context = (void*)qp_context;
qp_init_attr.recv_cq = (struct ib_cq *)p_create_attr->h_rq_cq;
qp_init_attr.send_cq = (struct ib_cq *)p_create_attr->h_sq_cq;
qp_init_attr.srq = (struct ib_srq *)p_create_attr->h_srq;
@@ -1087,7 +1080,6 @@
// fill the object
qp_p = (struct mthca_qp *)ib_qp_p;
- qp_p->qp_context = (void*)qp_context;
qp_p->qp_init_attr = qp_init_attr;
// Query QP to obtain requested attributes
@@ -1401,7 +1393,6 @@
int err;
ib_api_status_t status;
struct ib_cq *ib_cq_p;
- struct mthca_cq *cq_p;
mlnx_hob_t *hob_p;
struct ib_device *ib_dev;
struct ib_ucontext *p_context;
@@ -1437,7 +1428,7 @@
// allocate cq
ib_cq_p = ibv_create_cq(ib_dev,
cq_comp_handler, event_handler,
- hob_p, *p_size, p_context, p_umv_buf );
+ (void*)cq_context, *p_size, p_context, p_umv_buf );
if (IS_ERR(ib_cq_p)) {
err = PTR_ERR(ib_cq_p);
HCA_PRINT (TRACE_LEVEL_ERROR ,HCA_DBG_CQ, ("ibv_create_cq failed
(%d)\n", err));
@@ -1445,15 +1436,11 @@
goto err_create_cq;
}
- // fill the object
- cq_p = (struct mthca_cq *)ib_cq_p;
- cq_p->cq_context = (void*)cq_context;
-
// return the result
// *p_size = *p_size; // return the same value
*p_size = ib_cq_p->cqe;
- if (ph_cq) *ph_cq = (ib_cq_handle_t)cq_p;
+ if (ph_cq) *ph_cq = (ib_cq_handle_t)ib_cq_p;
status = IB_SUCCESS;
Index: hw/mthca/kernel/mthca_cq.c
===================================================================
--- hw/mthca/kernel/mthca_cq.c (revision 1452)
+++ hw/mthca/kernel/mthca_cq.c (working copy)
@@ -237,7 +237,7 @@
++cq->arm_sn;
}
- cq->ibcq.comp_handler(cq->cq_context);
+ cq->ibcq.comp_handler(cq->ibcq.cq_context);
}
void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
Index: hw/mthca/kernel/mthca_provider.h
===================================================================
--- hw/mthca/kernel/mthca_provider.h (revision 1452)
+++ hw/mthca/kernel/mthca_provider.h (working copy)
@@ -189,7 +189,6 @@
struct mthca_cq {
struct ib_cq ibcq;
- void *cq_context; // leo: for IBAL shim
spinlock_t lock;
atomic_t refcount;
int cqn;
@@ -234,7 +233,6 @@
wait_queue_head_t wait;
KMUTEX mutex;
- void *srq_context;
};
struct mthca_wq {
@@ -254,7 +252,6 @@
struct mthca_qp {
struct ib_qp ibqp;
- void *qp_context; // leo: for IBAL shim
//TODO: added just because absense of ibv_query_qp
// thereafter it may be worth to be replaced by struct ib_qp_attr
qp_attr;
struct ib_qp_init_attr qp_init_attr; // leo: for query_qp
________________________________
From: ofw-bounces at lists.openfabrics.org
[mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Anatoly
Greenblatt
Sent: Wednesday, July 30, 2008 2:35 PM
To: ofw at lists.openfabrics.org
Subject: [ofw] ConnectX functionality is completely broken
Importance: High
Hi,
I've compiled svn rev 1450 and tried to install it on ws2k3/ws2k8
x64 with a ConnectX HCA. A BSOD occurs during the installation. The
last revision we tested was 1421. I've recompiled a few revisions and
found that the changes between rev 1434 and 1435 cause the bugcheck.
Bottom line - ConnectX functionality is broken in revision
1435.
Regards,
Anatoly.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080731/c927b12f/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: event_notify.patch
Type: application/octet-stream
Size: 8656 bytes
Desc: event_notify.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080731/c927b12f/attachment.obj>
More information about the ofw
mailing list