[openib-general] [PATCH/RFC 1/2] IB: Return "maybe_missed_event" hint from ib_req_notify_cq()

Roland Dreier rdreier at cisco.com
Mon Oct 16 15:32:58 PDT 2006


The semantics defined by the InfiniBand specification say that
completion events are only generated when a completion is added to a
completion queue (CQ) after completion notification is requested.  In
other words, this means that the following race is possible:

	while (CQ is not empty)
		ib_poll_cq(CQ);
	// new completion is added after while loop is exited
	ib_req_notify_cq(CQ);
        // no event is generated for the existing completion

To close this race, the IB spec recommends doing another poll of the
CQ after requesting notification.

However, it is not always possible to arrange code this way (for
example, we have found that NAPI for IPoIB cannot poll after
requesting notification).  Also, some hardware (eg Mellanox HCAs)
actually will generate an event for completions added before the call
to ib_req_notify_cq() -- which is allowed by the spec, since there's
no way for any upper-layer consumer to know exactly when a completion
was really added -- so the extra poll of the CQ is just a waste.

Motivated by this, we add a new parameter "maybe_missed_event" to
ib_req_notify_cq() so that it can return a hint about whether a
completion may have been added before the request for notification.
If the hint returned is 0, then the consumer knows that it is safe to
wait for another event; otherwise, the consumer needs to poll the CQ
again to make sure no completions are already there.

Signed-off-by: Roland Dreier <rolandd at cisco.com>
---
 drivers/infiniband/core/mad.c              |    4 ++--
 drivers/infiniband/core/uverbs_cmd.c       |    3 ++-
 drivers/infiniband/hw/amso1100/c2.h        |    3 ++-
 drivers/infiniband/hw/amso1100/c2_cq.c     |   10 +++++++++-
 drivers/infiniband/hw/ehca/ehca_reqs.c     |   10 +++++++++-
 drivers/infiniband/hw/ehca/ipz_pt_fn.h     |    8 ++++++++
 drivers/infiniband/hw/ipath/ipath_cq.c     |    8 +++++++-
 drivers/infiniband/hw/ipath/ipath_verbs.h  |    3 ++-
 drivers/infiniband/hw/mthca/mthca_cq.c     |   22 ++++++++++++++++++++--
 drivers/infiniband/hw/mthca/mthca_dev.h    |    6 ++++--
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |    2 +-
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |    2 +-
 drivers/infiniband/ulp/iser/iser_verbs.c   |    4 ++--
 drivers/infiniband/ulp/srp/ib_srp.c        |    4 ++--
 include/rdma/ib_verbs.h                    |   16 +++++++++++++---
 15 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 493f4c6..5c4e037 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2204,7 +2204,7 @@ static void ib_mad_completion_handler(vo
 	struct ib_wc wc;
 
 	port_priv = (struct ib_mad_port_private *)data;
-	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP, NULL);
 
 	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
 		if (wc.status == IB_WC_SUCCESS) {
@@ -2646,7 +2646,7 @@ static int ib_mad_port_start(struct ib_m
 		}
 	}
 
-	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP, NULL);
 	if (ret) {
 		printk(KERN_ERR PFX "Failed to request completion "
 		       "notification: %d\n", ret);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index b72c7f6..8aaeb18 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -969,7 +969,8 @@ ssize_t ib_uverbs_req_notify_cq(struct i
 		return -EINVAL;
 
 	ib_req_notify_cq(cq, cmd.solicited_only ?
-			 IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+			 IB_CQ_SOLICITED : IB_CQ_NEXT_COMP,
+			 NULL);
 
 	put_cq_read(cq);
 
diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h
index 1b17dcd..f37b5f4 100644
--- a/drivers/infiniband/hw/amso1100/c2.h
+++ b/drivers/infiniband/hw/amso1100/c2.h
@@ -519,7 +519,8 @@ extern void c2_free_cq(struct c2_dev *c2
 extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
 extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
 extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify);
+extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify,
+		     int *maybe_missed_event);
 
 /* CM */
 extern int c2_llp_connect(struct iw_cm_id *cm_id,
diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c
index 9d7bcc5..d491abf 100644
--- a/drivers/infiniband/hw/amso1100/c2_cq.c
+++ b/drivers/infiniband/hw/amso1100/c2_cq.c
@@ -217,10 +217,12 @@ int c2_poll_cq(struct ib_cq *ibcq, int n
 	return npolled;
 }
 
-int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify,
+	      int *maybe_missed_event)
 {
 	struct c2_mq_shared __iomem *shared;
 	struct c2_cq *cq;
+	unsigned long flags;
 
 	cq = to_c2cq(ibcq);
 	shared = cq->mq.peer;
@@ -241,6 +243,12 @@ int c2_arm_cq(struct ib_cq *ibcq, enum i
 	 */
 	readb(&shared->armed);
 
+	if (maybe_missed_event) {
+		spin_lock_irqsave(&cq->lock, flags);
+		*maybe_missed_event = !c2_mq_empty(&cq->mq);
+		spin_unlock_irqrestore(&cq->lock, flags);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index b46bda1..acf08e6 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -634,9 +634,11 @@ poll_cq_exit0:
 	return ret;
 }
 
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify cq_notify)
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify cq_notify,
+		       int *maybe_missed_event)
 {
 	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+	unsigned long spl_flags;
 
 	switch (cq_notify) {
 	case IB_CQ_SOLICITED:
@@ -649,5 +651,11 @@ int ehca_req_notify_cq(struct ib_cq *cq,
 		return -EINVAL;
 	}
 
+	if (maybe_missed_event) {
+		spin_lock_irqsave(&my_cq->spinlock, spl_flags);
+		*maybe_missed_event = ipz_qeit_is_valid(my_cq->ipz_queue);
+		spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
+	}
+
 	return 0;
 }
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
index 2f13509..0bebb34 100644
--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.h
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
@@ -140,6 +140,14 @@ static inline void *ipz_qeit_get_inc_val
 	return cqe;
 }
 
+static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
+{
+	struct ehca_cqe *cqe = ipz_qeit_get(queue);
+
+	/* CQE is valid iff its toggle bit (bit 7 of cqe_flags) matches
+	 * the queue's current toggle state.  Parenthesize explicitly:
+	 * '==' binds tighter than '&', so without parentheses this would
+	 * compute ((cqe_flags >> 7) == toggle_state) & 1, which is wrong
+	 * whenever toggle_state has bits other than bit 0 set. */
+	return (cqe->cqe_flags >> 7) == (queue->toggle_state & 1);
+}
+
 /*
  * returns and resets Queue Entry iterator
  * returns address (kv) of first Queue Entry
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
index 87462e0..c5d300f 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -307,13 +307,15 @@ int ipath_destroy_cq(struct ib_cq *ibcq)
  * ipath_req_notify_cq - change the notification type for a completion queue
  * @ibcq: the completion queue
  * @notify: the type of notification to request
+ * @maybe_missed_event: missed event hint
  *
  * Returns 0 for success.
  *
  * This may be called from interrupt context.  Also called by
  * ib_req_notify_cq() in the generic verbs code.
  */
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify,
+			int *maybe_missed_event)
 {
 	struct ipath_cq *cq = to_icq(ibcq);
 	unsigned long flags;
@@ -325,6 +327,10 @@ int ipath_req_notify_cq(struct ib_cq *ib
 	 */
 	if (cq->notify != IB_CQ_NEXT_COMP)
 		cq->notify = notify;
+
+	if (maybe_missed_event)
+		*maybe_missed_event = cq->queue->head != cq->queue->tail;
+
 	spin_unlock_irqrestore(&cq->lock, flags);
 	return 0;
 }
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 8039f6e..e36411d 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -716,7 +716,8 @@ struct ib_cq *ipath_create_cq(struct ib_
 
 int ipath_destroy_cq(struct ib_cq *ibcq);
 
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify);
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify,
+			int *maybe_missed_event);
 
 int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index e393681..cb49aad 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -716,10 +716,19 @@ repoll:
 	return err == 0 || err == -EAGAIN ? npolled : err;
 }
 
-int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify)
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify,
+		       int *missed_event)
 {
 	__be32 doorbell[2];
 
+	/*
+	 * Mellanox devices trigger an event if an unpolled CQE is
+	 * already in the CQ when the CQ is armed, so there is no
+	 * possibility of missing an event.
+	 */
+	if (missed_event)
+		*missed_event = 0;
+
 	doorbell[0] = cpu_to_be32((notify == IB_CQ_SOLICITED ?
 				   MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
 				   MTHCA_TAVOR_CQ_DB_REQ_NOT)      |
@@ -733,13 +742,22 @@ int mthca_tavor_arm_cq(struct ib_cq *cq,
 	return 0;
 }
 
-int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify,
+		       int *missed_event)
 {
 	struct mthca_cq *cq = to_mcq(ibcq);
 	__be32 doorbell[2];
 	u32 sn;
 	__be32 ci;
 
+	/*
+	 * Mellanox devices trigger an event if an unpolled CQE is
+	 * already in the CQ when the CQ is armed, so there is no
+	 * possibility of missing an event.
+	 */
+	if (missed_event)
+		*missed_event = 0;
+
 	sn = cq->arm_sn & 3;
 	ci = cpu_to_be32(cq->cons_index);
 
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index fe5cecf..0a93a3e 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -493,8 +493,10 @@ void mthca_unmap_eq_icm(struct mthca_dev
 
 int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
 		  struct ib_wc *entry);
-int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
-int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify,
+		       int *missed_event);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify,
+		       int *missed_event);
 int mthca_init_cq(struct mthca_dev *dev, int nent,
 		  struct mthca_ucontext *ctx, u32 pdn,
 		  struct mthca_cq *cq);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 8bf5e9e..6166cb4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -300,7 +300,7 @@ void ipoib_ib_completion(struct ib_cq *c
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int n, i;
 
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP, NULL);
 	do {
 		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 7b717c6..e65ef47 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -181,7 +181,7 @@ int ipoib_transport_dev_init(struct net_
 		goto out_free_pd;
 	}
 
-	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
+	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP, NULL))
 		goto out_free_cq;
 
 	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 18a0000..289c3a2 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -81,7 +81,7 @@ static int iser_create_device_ib_res(str
 	if (IS_ERR(device->cq))
 		goto cq_err;
 
-	if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP))
+	if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP, NULL))
 		goto cq_arm_err;
 
 	tasklet_init(&device->cq_tasklet,
@@ -820,7 +820,7 @@ static void iser_cq_tasklet_fn(unsigned 
 	}
 	/* #warning "it is assumed here that arming CQ only once its empty" *
 	 * " would not cause interrupts to be missed"                       */
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP, NULL);
 }
 
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 4b09147..9aa2f66 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -205,7 +205,7 @@ static int srp_create_target_ib(struct s
 		goto out;
 	}
 
-	ib_req_notify_cq(target->cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(target->cq, IB_CQ_NEXT_COMP, NULL);
 
 	init_attr->event_handler       = srp_qp_event;
 	init_attr->cap.max_send_wr     = SRP_SQ_SIZE;
@@ -858,7 +858,7 @@ static void srp_completion(struct ib_cq 
 	struct srp_target_port *target = target_ptr;
 	struct ib_wc wc;
 
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP, NULL);
 	while (ib_poll_cq(cq, 1, &wc) > 0) {
 		if (wc.status) {
 			printk(KERN_ERR PFX "failed %s status %d\n",
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 8eacc35..ed38655 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -941,7 +941,8 @@ struct ib_device {
 					      struct ib_wc *wc);
 	int                        (*peek_cq)(struct ib_cq *cq, int wc_cnt);
 	int                        (*req_notify_cq)(struct ib_cq *cq,
-						    enum ib_cq_notify cq_notify);
+						    enum ib_cq_notify cq_notify,
+						    int *maybe_missed_event);
 	int                        (*req_ncomp_notif)(struct ib_cq *cq,
 						      int wc_cnt);
 	struct ib_mr *             (*get_dma_mr)(struct ib_pd *pd,
@@ -1369,11 +1370,20 @@ int ib_peek_cq(struct ib_cq *cq, int wc_
  * @cq_notify: If set to %IB_CQ_SOLICITED, completion notification will
  *   occur on the next solicited event. If set to %IB_CQ_NEXT_COMP,
  *   notification will occur on the next completion.
+ * @maybe_missed_event: If non-NULL, will be used to return an integer
+ *   hint.  If this hint is 0, then it is guaranteed that any work
+ *   completions added to the CQ since the last call to ib_poll_cq()
+ *   will trigger a completion notification.  If the returned hint is
+ *   non-zero, then it is possible (but not guaranteed) that a work
+ *   completion has been added to the CQ since the last call to
+ *   ib_poll_cq(), but that no notification event will occur for this
+ *   work completion.
  */
 static inline int ib_req_notify_cq(struct ib_cq *cq,
-				   enum ib_cq_notify cq_notify)
+				   enum ib_cq_notify cq_notify,
+				   int *maybe_missed_event)
 {
-	return cq->device->req_notify_cq(cq, cq_notify);
+	return cq->device->req_notify_cq(cq, cq_notify, maybe_missed_event);
 }
 
 /**
-- 
1.4.1




More information about the general mailing list