[openib-general] [PATCH 1/2] [RFC] Implement resize of CQ

Krishna Kumar krkumar at us.ibm.com
Wed Nov 3 16:27:59 PST 2004


On Wed, 3 Nov 2004, Sean Hefty wrote:

> I didn't follow what you were trying to reference here.  Are you
> referring to the QP or CQ?

QP. When I do a query for the QP, all I really need is the qp ptr and
the qp_attr structure to fill in values. What I couldn't figure out is
why an attr_mask and an ib_qp_init_attr are needed. BTW, I had thought that
ib_qp_init_attr was used only for initialization-type attributes: the
device is passed the init attributes exactly once, and from then onwards
ib_qp_attr should be used. So ib_qp_init_attr seems redundant here. Or I
have understood the code wrong.

> I'm adding in code to handle QP errors and overrun.  If we are unable to
> resize the CQ, we can prevent CQ overrun by limited the number of work
> requests posted to the corresponding QPs, rather than completely

Actually I read it wrong in this case. The code probably needs to check
only for "inaccessible", which is a critical error since the CQE cannot be
posted to the CQ even though the CQ is not full.

If you are not already adding the exact same functionality, please let me
know if the following looks correct. I recreated both patches after Hal's
checkin (Patch1 and Patch2 below).

Also, I saw your other mail. I had looked at the driver, and it does not
write the final size of the new QP back into the init_attr: it uses the
structure to do its work but doesn't update it. I was initially planning
on not using query() and instead relying on this structure getting updated.
The verb interface cannot do it since the qp doesn't contain the size. We
cannot change the driver to update the init structure since potentially
other drivers may not do it, hence the need to do a query to figure out the
correct size.

verb create_qp():
if (!IS_ERR(qp)) {
                qp->device      = pd->device;
                qp->pd          = pd;
                qp->send_cq     = qp_init_attr->send_cq;
                qp->recv_cq     = qp_init_attr->recv_cq;
                qp->srq         = qp_init_attr->srq;
                qp->qp_context  = qp_init_attr->qp_context;

                atomic_inc(&pd->usecnt);
                atomic_inc(&qp_init_attr->send_cq->usecnt);
                atomic_inc(&qp_init_attr->recv_cq->usecnt);
                if (qp_init_attr->srq)
                        atomic_inc(&qp_init_attr->srq->usecnt);
        }

driver create_qp():
case IB_QPT_SMI:
        case IB_QPT_GSI:
        {
                qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
                if (!qp)
                        return ERR_PTR(-ENOMEM);

                qp->sq.max    = init_attr->cap.max_send_wr;
                qp->rq.max    = init_attr->cap.max_recv_wr;
                qp->sq.max_gs = init_attr->cap.max_send_sge;
                qp->rq.max_gs = init_attr->cap.max_recv_sge;

                qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0:1;

                err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
                                      to_mcq(init_attr->send_cq),
                                      to_mcq(init_attr->recv_cq),
                                      init_attr->sq_sig_type,
				      init_attr->rq_sig_type,
                                      qp->ibqp.qp_num, init_attr->port_num,
                                      to_msqp(qp));
                break;
	}


thanks,

- KK

--------------------------------------------------------------------------
                                 PATCH1
--------------------------------------------------------------------------

diff -ruNp 1/mad.c 2/mad.c
--- 1/mad.c	2004-11-03 16:03:25.000000000 -0800
+++ 2/mad.c	2004-11-03 16:03:43.000000000 -0800
@@ -1692,6 +1692,14 @@ static void init_mad_queue(struct ib_mad
 	INIT_LIST_HEAD(&mad_queue->list);
 }

+/*
+ * Allocate one mad QP.
+ *
+ * If the return indicates success, the value returned is the new size
+ * of the queue pair that got created.
+ *
+ * Return > 0 on success and -(ERRNO) on failure. Zero should never happen.
+ */
 static int create_mad_qp(struct ib_mad_port_private *port_priv,
 			 struct ib_mad_qp_info *qp_info,
 			 enum ib_qp_type qp_type)
@@ -1715,15 +1723,23 @@ static int create_mad_qp(struct ib_mad_p
 	qp_init_attr.qp_type = qp_type;
 	qp_init_attr.port_num = port_priv->port_num;
 	qp_info->qp = ib_create_qp(port_priv->pd, &qp_init_attr);
-	if (IS_ERR(qp_info->qp)) {
-		printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
-		       get_spl_qp_index(qp_type));
+	if (!IS_ERR(qp_info->qp)) {
+		struct ib_qp_attr	qp_attr;
+
+		ret = ib_query_qp(qp_info->qp, &qp_attr, 0, &qp_init_attr);
+		if (ret < 0) {
+			/*
+			 * For any error, use the same size we used to
+			 * create the queue.
+			 */
+			ret = qp_init_attr.cap.max_send_wr +
+					qp_init_attr.cap.max_recv_wr;
+		}
+	} else {
 		ret = PTR_ERR(qp_info->qp);
-		goto error;
+		printk(KERN_ERR PFX "Couldn't create ib_mad QP%d err:%d\n",
+		       get_spl_qp_index(qp_type), ret);
 	}
-	return 0;
-
-error:
 	return ret;
 }

@@ -1747,6 +1763,7 @@ static int ib_mad_port_open(struct ib_de
 		.size = (unsigned long) high_memory - PAGE_OFFSET
 	};
 	struct ib_mad_port_private *port_priv;
+	int total_qp_size;
 	unsigned long flags;

 	/* First, check if port already open at MAD layer */
@@ -1797,11 +1814,25 @@ static int ib_mad_port_open(struct ib_de
 	}

 	ret = create_mad_qp(port_priv, &port_priv->qp_info[0], IB_QPT_SMI);
-	if (ret)
+	if (ret <= 0)
 		goto error6;
+	total_qp_size = ret;
+
 	ret = create_mad_qp(port_priv, &port_priv->qp_info[1], IB_QPT_GSI);
-	if (ret)
+	if (ret <= 0)
 		goto error7;
+	total_qp_size += ret;
+
+	/* Resize if the total QP[0,1] size is greater than CQ size. */
+	if (total_qp_size > cq_size) {
+		printk(KERN_DEBUG PFX "ib_mad_port_open: increasing size of "
+		       "CQ from %d to %d\n", cq_size, total_qp_size);
+		if ((ret = ib_resize_cq(port_priv->cq, total_qp_size)) < 0) {
+			printk(KERN_DEBUG PFX "Couldn't increase CQ size - "
+			       "err:%d\n", ret);
+			/* continue, not an error */
+		}
+	}

 	spin_lock_init(&port_priv->reg_lock);
 	INIT_LIST_HEAD(&port_priv->agent_list);

----------------------------------------------------------------------------
                                     PATCH2
----------------------------------------------------------------------------

diff -ruNp 2/mad.c 3/mad.c
--- 2/mad.c	2004-11-03 16:03:43.000000000 -0800
+++ 3/mad.c	2004-11-03 16:17:54.000000000 -0800
@@ -1749,6 +1749,21 @@ static void destroy_mad_qp(struct ib_mad
 }

 /*
+ * Overrun and Inaccessible errors cannot be handled by QP resize operation.
+ */
+static inline int is_catastrophic_error(int err)
+{
+#define	CQ_ACCESS_ERROR		0x11
+
+	switch (err) {
+	default:	/* OK */
+		return 0;
+	case CQ_ACCESS_ERROR:
+		return 1;
+	}
+}
+
+/*
  * Open the port
  * Create the QP, PD, MR, and CQ if needed
  */
@@ -1830,6 +1845,10 @@ static int ib_mad_port_open(struct ib_de
 		if ((ret = ib_resize_cq(port_priv->cq, total_qp_size)) < 0) {
 			printk(KERN_DEBUG PFX "Couldn't increase CQ size - "
 			       "err:%d\n", ret);
+			if (is_catastrophic_error(ret)) {
+				/* Clean up qp_info[0,1] */
+				goto error8;
+			}
 			/* continue, not an error */
 		}
 	}




More information about the general mailing list