[openib-general] [PATCH] reduce qp locking on cq poll

Wed Jan 26 05:24:16 PST 2005

Locking during the poll cq operation can be reduced, by locking
the cq while qp is being removed from the qp array.
This also avoids an extra atomic operation for reference counting.

TODO implies the same can be done for cqs, but I dont see how, since
we dont take a lock on eq, so this I have left in the TODO.
This works but I dont have the fast hardware to check the speed effect.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: hw/mthca/mthca_cq.c
===================================================================

--- hw/mthca/mthca_cq.c	(revision 1653)
+++ hw/mthca/mthca_cq.c	(working copy)
@@ -426,14 +426,11 @@
 				wake_up(&(*cur_qp)->wait);
 		}
 
-		spin_lock(&dev->qp_table.lock);
+		//We do not have to take the qp table lock here, because
+		//cq was locked while qp was removed from the array
 		*cur_qp = mthca_array_get(&dev->qp_table.qp,
 					  be32_to_cpu(cqe->my_qpn) &
 					  (dev->limits.num_qps - 1));
-		if (*cur_qp)
-			atomic_inc(&(*cur_qp)->refcount);
-		spin_unlock(&dev->qp_table.lock);
-
 		if (!*cur_qp) {
 			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
 				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
@@ -547,8 +544,6 @@
 
 	if (qp) {
 		spin_unlock(&qp->lock);
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
 	}
 
 
@@ -775,7 +770,7 @@
 		int j;
 
 		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
-		       cq->cqn, cq->cons_index, next_cqe_sw(cq));
+		       cq->cqn, cq->cons_index, next_cqe_sw(cq)?1:0);
 		for (j = 0; j < 16; ++j)
 			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
 	}
Index: hw/mthca/mthca_qp.c
===================================================================
--- hw/mthca/mthca_qp.c	(revision 1653)
+++ hw/mthca/mthca_qp.c	(working copy)
@@ -1083,10 +1083,20 @@
 	return 0;
 
  err_out_free:
-	spin_lock_irq(&dev->qp_table.lock);
+	//Lock cq here, so that cq polling code
+	//can do qp lookup without taking a lock
+	spin_lock_irq(&send_cq->lock);
+	if (send_cq != recv_cq)
+		spin_lock(&recv_cq->lock);
+
+	spin_lock(&dev->qp_table.lock);
 	mthca_array_clear(&dev->qp_table.qp, mqpn);
-	spin_unlock_irq(&dev->qp_table.lock);
+	spin_unlock(&dev->qp_table.lock);
 
+	if (send_cq != recv_cq)
+		spin_unlock(&recv_cq->lock);
+	spin_unlock_irq(&send_cq->lock);
+
  err_out:
 	dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
 			  sqp->header_buf, sqp->header_dma);
@@ -1100,12 +1110,27 @@
 	u8 status;
 	int size;
 	int i;
+	struct mthca_cq *send_cq;
+	struct mthca_cq *recv_cq;
 
-	spin_lock_irq(&dev->qp_table.lock);
+	send_cq = to_mcq(qp->ibqp.send_cq);
+	recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+	//Lock cq here, so that cq polling code
+	//can do qp lookup without taking a lock
+	spin_lock_irq(&send_cq->lock);
+	if (send_cq != recv_cq)
+		spin_lock(&recv_cq->lock);
+
+	spin_lock(&dev->qp_table.lock);
 	mthca_array_clear(&dev->qp_table.qp,
 			  qp->qpn & (dev->limits.num_qps - 1));
-	spin_unlock_irq(&dev->qp_table.lock);
+	spin_unlock(&dev->qp_table.lock);
 
+	if (send_cq != recv_cq)
+		spin_unlock(&recv_cq->lock);
+	spin_unlock_irq(&send_cq->lock);
+
 	atomic_dec(&qp->refcount);
 	wait_event(qp->wait, !atomic_read(&qp->refcount));
 
Index: hw/mthca/TODO
===================================================================
--- hw/mthca/TODO	(revision 1653)
+++ hw/mthca/TODO	(working copy)
@@ -12,10 +12,9 @@ Small projects (extensions to existing c
     Miscellaneous verbs: At least query AH, query QP and resize CQ are
         not implemented.
     Reduce locking for CQ poll: The reference counting used to prevent
-        CQs and QPs from being destroyed while events are being
-        dispatched could be eliminated by locking EQs (and, for QPs,
-        the associated CQs) during destroy operations.  This saves an
-        atomic access in the CQ poll fast path.
+        CQs from being destroyed while events are being
+        dispatched could be eliminated by locking EQs
+        during destroy operations.
     Reduce doorbell locking on 32-bit architectures: By using a
         a doorbell page dedicated to each EQ, no lock is required in
         the EQ poll path.