[ofa-general] [GIT PULL] please pull infiniband.git

Tue May 29 16:20:39 PDT 2007

Linus, please pull from

    master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git for-linus

This tree is also available from kernel.org mirrors at:

    git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git for-linus

This will get a few more nicely balanced ("55 insertions(+), 55
deletions(-)") 2.6.22-rc3 fixes, mostly for IPoIB connected mode:

Michael S. Tsirkin (2):
      IB/mthca: Fix handling of send CQE with error for QPs connected to SRQ
      IPoIB/cm: Fix performance regression on Mellanox

Roland Dreier (1):
      IB/mlx4: Fix last allocated object tracking in bitmap allocator

Sean Hefty (1):
      IB/cm: Fix stale connection detection

 drivers/infiniband/core/cm.c            |   25 ++++++-----
 drivers/infiniband/hw/mthca/mthca_qp.c  |    6 +-
 drivers/infiniband/ulp/ipoib/ipoib.h    |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c |   74 +++++++++++++++----------------
 drivers/net/mlx4/alloc.c                |    2 +-
 5 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index e840434..40c004a 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1297,26 +1297,29 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 
-	/* Check for duplicate REQ and stale connections. */
+	/* Check for possible duplicate REQ. */
 	spin_lock_irqsave(&cm.lock, flags);
 	timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
-	if (!timewait_info)
-		timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
-
 	if (timewait_info) {
 		cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
 					   timewait_info->work.remote_id);
-		cm_cleanup_timewait(cm_id_priv->timewait_info);
 		spin_unlock_irqrestore(&cm.lock, flags);
 		if (cur_cm_id_priv) {
 			cm_dup_req_handler(work, cur_cm_id_priv);
 			cm_deref_id(cur_cm_id_priv);
-		} else
-			cm_issue_rej(work->port, work->mad_recv_wc,
-				     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
-				     NULL, 0);
-		listen_cm_id_priv = NULL;
-		goto out;
+		}
+		return NULL;
+	}
+
+	/* Check for stale connections. */
+	timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irqrestore(&cm.lock, flags);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		return NULL;
 	}
 
 	/* Find matching listen request. */
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 0276649..eef415b 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -2284,10 +2284,10 @@ void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
 	struct mthca_next_seg *next;
 
 	/*
-	 * For SRQs, all WQEs generate a CQE, so we're always at the
-	 * end of the doorbell chain.
+	 * For SRQs, all receive WQEs generate a CQE, so we're always
+	 * at the end of the doorbell chain.
 	 */
-	if (qp->ibqp.srq) {
+	if (qp->ibqp.srq && !is_send) {
 		*new_wqe = 0;
 		return;
 	}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 158759e..285c143 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -156,7 +156,7 @@ struct ipoib_cm_data {
  * - and then invoke a Destroy QP or Reset QP.
  *
  * We use the second option and wait for a completion on the
- * rx_drain_qp before destroying QPs attached to our SRQ.
+ * same CQ before destroying QPs attached to our SRQ.
  */
 
 enum ipoib_cm_state {
@@ -199,7 +199,6 @@ struct ipoib_cm_dev_priv {
 	struct ib_srq  	       *srq;
 	struct ipoib_cm_rx_buf *srq_ring;
 	struct ib_cm_id        *id;
-	struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
 	struct list_head        passive_ids;   /* state: LIVE */
 	struct list_head        rx_error_list; /* state: ERROR */
 	struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index f133b56..076a0bb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -69,8 +69,9 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 
 #define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
 
-static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
-	.wr_id = IPOIB_CM_RX_DRAIN_WRID
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
+	.opcode = IB_WR_SEND,
 };
 
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
@@ -163,16 +164,22 @@ partial_error:
 
 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
 {
-	struct ib_recv_wr *bad_wr;
+	struct ib_send_wr *bad_wr;
+	struct ipoib_cm_rx *p;
 
-	/* rx_drain_qp send queue depth is 1, so
+	/* We only reserved 1 extra slot in CQ for drain WRs, so
 	 * make sure we have at most 1 outstanding WR. */
 	if (list_empty(&priv->cm.rx_flush_list) ||
 	    !list_empty(&priv->cm.rx_drain_list))
 		return;
 
-	if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
-		ipoib_warn(priv, "failed to post rx_drain wr\n");
+	/*
+	 * QPs on flush list are error state.  This way, a "flush
+	 * error" WC will be immediately generated for each WR we post.
+	 */
+	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+		ipoib_warn(priv, "failed to post drain wr\n");
 
 	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
 }
@@ -199,10 +206,10 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
 		.event_handler = ipoib_cm_rx_event_handler,
-		.send_cq = priv->cq, /* does not matter, we never send anything */
+		.send_cq = priv->cq, /* For drain WR */
 		.recv_cq = priv->cq,
 		.srq = priv->cm.srq,
-		.cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+		.cap.max_send_wr = 1, /* For drain WR */
 		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
 		.sq_sig_type = IB_SIGNAL_ALL_WR,
 		.qp_type = IB_QPT_RC,
@@ -242,6 +249,27 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
 		return ret;
 	}
+
+	/*
+	 * Current Mellanox HCA firmware won't generate completions
+	 * with error for drain WRs unless the QP has been moved to
+	 * RTS first. This work-around leaves a window where a QP has
+	 * moved to error asynchronously, but this will eventually get
+	 * fixed in firmware, so let's not error out if modify QP
+	 * fails.
+	 */
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+		return 0;
+	}
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+		return 0;
+	}
+
 	return 0;
 }
 
@@ -623,38 +651,11 @@ static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
 int ipoib_cm_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_qp_init_attr qp_init_attr = {
-		.send_cq = priv->cq,   /* does not matter, we never send anything */
-		.recv_cq = priv->cq,
-		.cap.max_send_wr = 1,  /* FIXME: 0 Seems not to work */
-		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
-		.cap.max_recv_wr = 1,
-		.cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
-		.sq_sig_type = IB_SIGNAL_ALL_WR,
-		.qp_type = IB_QPT_UC,
-	};
 	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
 		return 0;
 
-	priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
-	if (IS_ERR(priv->cm.rx_drain_qp)) {
-		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
-		ret = PTR_ERR(priv->cm.rx_drain_qp);
-		return ret;
-	}
-
-	/*
-	 * We put the QP in error state directly.  This way, a "flush
-	 * error" WC will be immediately generated for each WR we post.
-	 */
-	ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
-	if (ret) {
-		ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
-		goto err_qp;
-	}
-
 	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
 	if (IS_ERR(priv->cm.id)) {
 		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
@@ -676,8 +677,6 @@ err_listen:
 	ib_destroy_cm_id(priv->cm.id);
 err_cm:
 	priv->cm.id = NULL;
-err_qp:
-	ib_destroy_qp(priv->cm.rx_drain_qp);
 	return ret;
 }
 
@@ -740,7 +739,6 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 		kfree(p);
 	}
 
-	ib_destroy_qp(priv->cm.rx_drain_qp);
 	cancel_delayed_work(&priv->cm.stale_task);
 }
 
diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index dfbd580..f8d63d3 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -51,8 +51,8 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
 
 	if (obj < bitmap->max) {
 		set_bit(obj, bitmap->table);
+		bitmap->last = (obj + 1) & (bitmap->max - 1);
 		obj |= bitmap->top;
-		bitmap->last = obj + 1;
 	} else
 		obj = -1;