[ofa-general] Re: [PATCH RFC/untested v0.5] IPoIB/CM: fix SRQ WR leak
Michael S. Tsirkin
mst at dev.mellanox.co.il
Thu May 17 00:30:28 PDT 2007
> Quoting Roland Dreier <rdreier at cisco.com>:
> Subject: Re: [PATCH RFC/untested v0.5] IPoIB/CM: fix SRQ WR leak
>
> > + *  - Put the QP in the Error State
> > + *  - Wait for the Affiliated Asynchronous Last WQE Reached Event;
> > + *  - either:
> > + *        drain the CQ by invoking the Poll CQ verb and either wait for CQ
> > + *        to be empty or the number of Poll CQ operations has exceeded
> > + *        CQ capacity size;
> > + *  - or
> > + *        post another WR that completes on the same CQ and wait for this
> > + *        WR to return as a WC; (NB: this is the option that we use)
> > + *  and then invoke a Destroy QP or Reset QP.
>
> I guess this last line would look better as
>
> * - invoke a Destroy QP or Reset QP.
Hmm, I would like to quote the spec *literally*. Maybe:
- and then invoke a Destroy QP or Reset QP.
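For reference, the option we use boils down to something like the following
with the kernel verbs API. This is only a sketch, not the actual driver code:
wait_for_last_wqe_event() and wait_for_drain_wc() are hypothetical stand-ins
for the async event handler and the CQ completion handling.

#include <rdma/ib_verbs.h>

void wait_for_last_wqe_event(struct ib_qp *qp);    /* hypothetical helper */
void wait_for_drain_wc(struct ib_cq *cq);          /* hypothetical helper */

/* Sketch: tear down a QP attached to an SRQ per the sequence quoted above. */
static struct ib_qp_attr err_attr = { .qp_state = IB_QPS_ERR };

static struct ib_send_wr drain_wr = {
        .wr_id  = IPOIB_CM_RX_DRAIN_WRID,
        .opcode = IB_WR_SEND
};

static void drain_and_destroy_qp(struct ib_qp *qp)
{
        struct ib_send_wr *bad_wr;

        ib_modify_qp(qp, &err_attr, IB_QP_STATE);  /* QP -> Error */
        wait_for_last_wqe_event(qp);               /* Last WQE Reached event */
        ib_post_send(qp, &drain_wr, &bad_wr);      /* completes in error on the CQ */
        wait_for_drain_wc(qp->send_cq);            /* all earlier WCs are reaped */
        ib_destroy_qp(qp);                         /* now safe to destroy */
}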
> > +static struct ib_qp_attr ipoib_cm_err_attr __read_mostly = {
> > +        .qp_state = IB_QPS_ERR
> > +};
> > +
> > +#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
> > +
> > +static struct ib_send_wr ipoib_cm_rx_drain_wr __read_mostly = {
> > +        .wr_id = IPOIB_CM_RX_DRAIN_WRID,
> > +        .opcode = IB_WR_SEND
> > +};
>
> I don't think these are hot enough to be worth marking as __read_mostly.
> (better to leave them in normal .data, so that the items that really are
> written to end up spaced out more)
OK, thanks for the suggestion.
> > +        qp_attr.qp_state = IB_QPS_INIT;
> > +        qp_attr.port_num = priv->port;
> > +        qp_attr.qkey = 0;
> > +        qp_attr.qp_access_flags = 0;
> > +        ret = ib_modify_qp(priv->cm.rx_drain_qp, &qp_attr,
> > +                           IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | IB_QP_QKEY);
> > +        if (ret) {
> > +                ipoib_warn(priv, "failed to modify drain QP to INIT: %d\n", ret);
> > +                goto err_qp;
> > +        }
> > +
> > +        /* We put the QP in error state directly: this way, hardware
> > +         * will immediately generate WC for each WR we post, without
> > +         * sending anything on the wire. */
> > +        qp_attr.qp_state = IB_QPS_ERR;
> > +        ret = ib_modify_qp(priv->cm.rx_drain_qp, &qp_attr, IB_QP_STATE);
> > +        if (ret) {
> > +                ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
> > +                goto err_qp;
> > +        }
>
> This actually seems like a good motivation for the mthca RESET ->
> ERROR fix. We could avoid the transition to INIT if we fixed mthca
> and mlx4, right?
Yes. That was the motivation.
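(For comparison, once RESET -> ERROR works in the drivers, the INIT step
above disappears and the drain QP setup shrinks to a single transition;
a sketch, assuming the mthca and mlx4 fixes are in place:)

        /* A freshly created QP is in RESET; with the driver fix, it can
         * go straight to the error state in one modify call. */
        qp_attr.qp_state = IB_QPS_ERR;
        ret = ib_modify_qp(priv->cm.rx_drain_qp, &qp_attr, IB_QP_STATE);
        if (ret) {
                ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
                goto err_qp;
        }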
> (By the way, any interest in making an mlx4 patch to
> fix that too?)
Easy (I also fixed the RESET -> RESET transition along the way).
IB/mlx4: fix RESET -> ERROR and RESET -> RESET transitions
Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
---
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 5cd7069..c93daab 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -573,7 +573,7 @@ static int to_mlx4_st(enum ib_qp_type type)
 	}
 }
 
-static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *attr,
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
 				   int attr_mask)
 {
 	u8 dest_rd_atomic;
@@ -603,7 +603,7 @@ static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *att
 	return cpu_to_be32(hw_access_flags);
 }
 
-static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, struct ib_qp_attr *attr,
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
 			    int attr_mask)
 {
 	if (attr_mask & IB_QP_PKEY_INDEX)
@@ -619,7 +619,7 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
 	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
 }
 
-static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 			 struct mlx4_qp_path *path, u8 port)
 {
 	path->grh_mylmc = ah->src_path_bits & 0x7f;
@@ -655,14 +655,14 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
 	return 0;
 }
 
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		      int attr_mask, struct ib_udata *udata)
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+			       const struct ib_qp_attr *attr, int attr_mask,
+			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	struct mlx4_qp_context *context;
 	enum mlx4_qp_optpar optpar = 0;
-	enum ib_qp_state cur_state, new_state;
 	int sqd_event;
 	int err = -EINVAL;
@@ -670,34 +670,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	if (!context)
 		return -ENOMEM;
 
-	mutex_lock(&qp->mutex);
-
-	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
-	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
-		goto out;
-
-	if ((attr_mask & IB_QP_PKEY_INDEX) &&
-	    attr->pkey_index >= dev->dev->caps.pkey_table_len) {
-		goto out;
-	}
-
-	if ((attr_mask & IB_QP_PORT) &&
-	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-	    attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
-		goto out;
-	}
-
 	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
 				     (to_mlx4_st(ibqp->qp_type) << 16));
 	context->flags |= cpu_to_be32(1 << 8); /* DE? */
@@ -920,11 +892,83 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	}
 
 out:
-	mutex_unlock(&qp->mutex);
 	kfree(context);
 	return err;
 }
 
+static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
+static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
+	[IB_QPT_UD]  = (IB_QP_PKEY_INDEX	|
+			IB_QP_PORT		|
+			IB_QP_QKEY),
+	[IB_QPT_UC]  = (IB_QP_PKEY_INDEX	|
+			IB_QP_PORT		|
+			IB_QP_ACCESS_FLAGS),
+	[IB_QPT_RC]  = (IB_QP_PKEY_INDEX	|
+			IB_QP_PORT		|
+			IB_QP_ACCESS_FLAGS),
+	[IB_QPT_SMI] = (IB_QP_PKEY_INDEX	|
+			IB_QP_QKEY),
+	[IB_QPT_GSI] = (IB_QP_PKEY_INDEX	|
+			IB_QP_QKEY),
+};
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+		goto out;
+
+	if ((attr_mask & IB_QP_PKEY_INDEX) &&
+	    attr->pkey_index >= dev->dev->caps.pkey_table_len) {
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
+		goto out;
+	}
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+		err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
+					  mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
+					  IB_QPS_RESET, IB_QPS_INIT);
+		if (err)
+			goto out;
+		cur_state = IB_QPS_INIT;
+	}
+
+	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			    void *wqe)
 {
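The RESET -> ERROR case is the interesting one: there is no direct firmware
command for it, so the code above bounces through INIT internally, with the
dummy mlx4_ib_qp_attr and the per-QP-type mask table supplying the attributes
that the INIT transition requires (their values don't matter, since the QP
never sends anything before reaching the error state). From a consumer's
point of view, something like this now works on a freshly created QP (sketch):

        struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

        /* QP is still in RESET right after creation; this used to fail
         * and now succeeds via an internal RESET -> INIT -> ERR sequence.
         * (RESET -> RESET is likewise accepted as a no-op.) */
        ret = ib_modify_qp(qp, &attr, IB_QP_STATE);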
--
MST