[openib-general] [RFC] [PATCH] iWARP Connection parameters negotiation

Krishna Kumar krkumar2 at in.ibm.com
Thu Feb 15 22:29:02 PST 2007


Hi all,

In the Nov 16-17th 2006 OpenFabrics Developer Summit, the
following presentation by Tom talked of negotiating
parameters at the time of establishing a connection :

http://www.openfabrics.org/conference/nov2006sc/OFA-Newstuff-SC06.ppt

See "IRD/ORD Negotiation, Option #1" 

This is an RFC patch that implements the same (after
interracting with Tom, Steve and Arkady). The working
of this functionality is described by Tom below (in one
of his mails to me) :

"Yes, basically what has to happen is that the IRD/ORD
information needs to be exchanged in the private data as
part of connect/accept. The parameters should be pre-pended
followed by the user specified private data. The IWCM strips
the ORD/IRD header off before delivering the private data to
the consumer. BTW, if you look at the IB code you can see
how they do this already for port number, etc...

"The options specified are really whether to be permissive
when the remote peer won't honor your request. My opinion
is that the policy should be permissive, which is to say
that if you ask for 8 outgoing, but the peer responds with
4 incoming, you set up the connection and the local node
adjusts his ORD down to 4 to match the peers IRD response.
Does this make sense?"

Please provide your comments/suggestions/(flames). Patch
applies to 2.6.20.

Thanks,

- KK

diff -ruNp org/drivers/infiniband/core/cma.c new/drivers/infiniband/core/cma.c
--- org/drivers/infiniband/core/cma.c	2007-02-16 11:01:08.000000000 +0530
+++ new/drivers/infiniband/core/cma.c	2007-02-16 11:01:12.000000000 +0530
@@ -2059,19 +2059,81 @@ out:
 	return ret;
 }
 
+/*
+ * set_connp_fields() : Set various fields of iw_param based on connnect or
+ * accept parameters (connp). Also, allocates a private header if
+ * config_privhdr is true.
+ * 
+ * Returns a private header on success and -errno converted-to-pointer on
+ * failure. A null private header indicates that config_privhdr was false.
+ */
+static struct iwcm_priv_hdr *set_connp_fields(struct iw_cm_conn_param *iw_param,
+					      struct rdma_conn_param *connp,
+					      int config_privhdr)
+{
+	struct iwcm_priv_hdr *priv_hdr;
+
+	iw_param->ord = connp->initiator_depth;
+	iw_param->ird = connp->responder_resources;
+
+	if (!config_privhdr) {
+		/*
+		 * We are not configured to send private header (for active),
+		 * or the peer didn't send a private header (for passive); in
+		 * both cases, do not use private header.
+		 */
+		priv_hdr = NULL;	/* Success */
+		iw_param->private_data = connp->private_data;
+		iw_param->private_data_len = connp->private_data_len;
+		goto out;
+	}
+
+	if ((typeof (connp->private_data_len))
+	    (connp->private_data_len + sizeof *priv_hdr) <
+	     connp->private_data_len) {
+		/* Overflow - private_data_len + priv_hdr is too large */
+		/* xxx.KK - Help - there is a better way to check overflow
+		 * than this - some macro that is already inbuilt.
+		 */
+		priv_hdr = ERR_PTR(-EOVERFLOW);
+		goto out;
+	}
+
+	/* Allocate memory for both private data and iwcm_priv_hdr */
+	iw_param->private_data_len = connp->private_data_len + sizeof *priv_hdr;
+	iw_param->private_data = kmalloc(iw_param->private_data_len,
+					 GFP_KERNEL);
+	if (!iw_param->private_data) {
+		priv_hdr = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	/* Prepend iwcm_priv_hdr header before actual private data */
+	priv_hdr = (struct iwcm_priv_hdr *) iw_param->private_data;
+	priv_hdr->ord = cpu_to_be32(iw_param->ord);
+	priv_hdr->ird = cpu_to_be32(iw_param->ird);
+	if (connp->private_data_len)
+		memcpy(priv_hdr->private_data, connp->private_data,
+		       connp->private_data_len);
+
+out:
+	return priv_hdr;
+}
+
+extern int iw_send_private_header;
+
 static int cma_connect_iw(struct rdma_id_private *id_priv,
 			  struct rdma_conn_param *conn_param)
 {
 	struct iw_cm_id *cm_id;
 	struct sockaddr_in* sin;
-	int ret;
 	struct iw_cm_conn_param iw_param;
+	struct iwcm_priv_hdr *priv_hdr = NULL;
+	int ret;
 
 	cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
-	if (IS_ERR(cm_id)) {
-		ret = PTR_ERR(cm_id);
-		goto out;
-	}
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
 
 	id_priv->cm_id.iw = cm_id;
 
@@ -2085,17 +2147,28 @@ static int cma_connect_iw(struct rdma_id
 	if (ret)
 		goto out;
 
-	iw_param.ord = conn_param->initiator_depth;
-	iw_param.ird = conn_param->responder_resources;
-	iw_param.private_data = conn_param->private_data;
-	iw_param.private_data_len = conn_param->private_data_len;
+	/* Initialize iw_param fields */
+	priv_hdr = set_connp_fields(&iw_param, conn_param,
+				    iw_send_private_header);
+	if (IS_ERR(priv_hdr)) {
+		ret = PTR_ERR(priv_hdr);
+		goto out;
+	}
+
+	/*
+	 * Save iwcm_priv_hdr till we get a connect response to negotiate.
+	 * priv_hdr can be NULL, indicating that negotiation is disabled.
+	 */
+	cm_id->priv_hdr = priv_hdr;
+
 	if (id_priv->id.qp)
 		iw_param.qpn = id_priv->qp_num;
 	else
 		iw_param.qpn = conn_param->qp_num;
 	ret = iw_cm_connect(cm_id, &iw_param);
 out:
-	if (ret && !IS_ERR(cm_id)) {
+	if (ret) {
+		/* cm_id->priv_hdr is freed up in iwcm_deref_id() */
 		iw_destroy_cm_id(cm_id);
 		id_priv->cm_id.iw = NULL;
 	}
@@ -2185,23 +2258,51 @@ out:
 static int cma_accept_iw(struct rdma_id_private *id_priv,
 		  struct rdma_conn_param *conn_param)
 {
+	struct iw_cm_id *cm_id;
+	struct iwcm_priv_hdr *priv_hdr;
 	struct iw_cm_conn_param iw_param;
 	int ret;
 
+	cm_id = id_priv->cm_id.iw;
+
+	/* Initialize iw_param fields */
+	priv_hdr = set_connp_fields(&iw_param, conn_param,
+				    (cm_id->priv_hdr ?  1 : 0) &
+				     iw_send_private_header);
+	if (IS_ERR(priv_hdr)) {
+		ret = PTR_ERR(priv_hdr);
+		goto out;
+	}
+
 	ret = cma_modify_qp_rtr(&id_priv->id);
 	if (ret)
-		return ret;
+		goto out;
 
-	iw_param.ord = conn_param->initiator_depth;
-	iw_param.ird = conn_param->responder_resources;
-	iw_param.private_data = conn_param->private_data;
-	iw_param.private_data_len = conn_param->private_data_len;
-	if (id_priv->id.qp) {
+	if (id_priv->id.qp)
 		iw_param.qpn = id_priv->qp_num;
-	} else
+	else
 		iw_param.qpn = conn_param->qp_num;
 
-	return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+	ret = iw_cm_accept(cm_id, &iw_param);
+
+out:
+	/*
+	 * Free the just allocated priv_hdr+private data. priv_hdr could
+	 * be NULL if negotiation is not configured, or if the active side
+	 * didn't use private header.
+	 */
+	if (!IS_ERR(priv_hdr))
+		kfree(priv_hdr);
+
+	/*
+	 * Free priv_hdr that was allocated in parse_connection_params().
+	 */
+	if (cm_id->priv_hdr) {
+		kfree(cm_id->priv_hdr);
+		cm_id->priv_hdr = NULL;
+	}
+
+	return ret;
 }
 
 static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
diff -ruNp org/drivers/infiniband/core/iwcm.c new/drivers/infiniband/core/iwcm.c
--- org/drivers/infiniband/core/iwcm.c	2007-02-16 11:01:08.000000000 +0530
+++ new/drivers/infiniband/core/iwcm.c	2007-02-16 11:01:12.000000000 +0530
@@ -54,6 +54,12 @@ MODULE_AUTHOR("Tom Tucker");
 MODULE_DESCRIPTION("iWARP CM");
 MODULE_LICENSE("Dual BSD/GPL");
 
+int iw_send_private_header __read_mostly = 0;
+module_param_named(iw_send_private_header, iw_send_private_header, int, 0644);
+MODULE_PARM_DESC(iw_send_private_header,
+			"Enable private iwcm connection negotiation header");
+EXPORT_SYMBOL(iw_send_private_header);
+
 static struct workqueue_struct *iwcm_wq;
 struct iwcm_work {
 	struct work_struct work;
@@ -158,6 +164,7 @@ static int iwcm_deref_id(struct iwcm_id_
 	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
 	if (atomic_dec_and_test(&cm_id_priv->refcount)) {
 		BUG_ON(!list_empty(&cm_id_priv->work_list));
+		kfree(cm_id_priv->id.priv_hdr);
 		if (waitqueue_active(&cm_id_priv->destroy_comp.wait)) {
 			BUG_ON(cm_id_priv->state != IW_CM_STATE_DESTROYING);
 			BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY,
@@ -470,6 +477,72 @@ int iw_cm_reject(struct iw_cm_id *cm_id,
 }
 EXPORT_SYMBOL(iw_cm_reject);
 
+/* 
+ * iw_negotiate_qp_conn_params() : gets incoming and outgoing ird/ord
+ * parameters and modifies incoming ird/ord if required as per Sequences
+ * #7 and #8 noted in Section 6.6.1.1; and Sequences #9 and 10 noted in
+ * 6.6.1.2 of the hilland draft.
+ *
+ * This routine also modifies the caller's ird/ord so that accept is
+ * provided with this smaller ird/ord.
+ *
+ * This routine is called by :
+ * 	- iw_cm_accept() for accepting an incoming connection, and
+ *	- parse_connection_params() IW_CM_EVENT_CONNECT_REPLY case where
+ * 	  the client gets response to the connect request;
+ * and since both these callers have process context, it is OK to call
+ * ib_modify_qp (which can sleep).
+ *
+ * Returns 0 if negotiation was done successfully; and < 0 if no negotiation
+ * was required or if modify_qp() failed (which is not a catastrophic error).
+ */
+static int iw_negotiate_qp_conn_params(struct ib_qp *qp, int *l_ird, int *l_ord,
+				       int r_ird, int r_ord)
+{
+	int qp_mask = 0;	/* mask of attributes to be modified */
+	int ret = -1;		/* no negotiation required */
+
+	if (*l_ord > r_ird) {
+		/* 
+		 * Local Outgoing is bigger than Peer Incoming, reduce
+		 * my Outgoing.
+		 */
+		pr_debug("%s: Reducing outgoing from %d to %d\n", __FUNCTION__,
+			 *l_ord, r_ird);
+		*l_ord = r_ird;
+		qp_mask = IB_QP_MAX_QP_RD_ATOMIC;
+	}
+
+	if (*l_ird > r_ord) {
+		/*
+		 * Local Incoming is greater than Peer Outgoing, reduce
+		 * my Incoming.
+		 */
+		pr_debug("%s: Reducing incoming from %d to %d\n", __FUNCTION__,
+			 *l_ird, r_ord);
+		*l_ird = r_ord;
+		qp_mask |= IB_QP_MAX_DEST_RD_ATOMIC;
+	}
+
+	if (qp_mask) {
+		struct ib_qp_attr qp_attr;
+
+		qp_attr.max_rd_atomic = *l_ird;
+		qp_attr.max_dest_rd_atomic = *l_ord;
+		ret = ib_modify_qp(qp, &qp_attr, qp_mask);
+		pr_debug("%s: modify qp with qp_mask:%x returns %d\n",
+			 __FUNCTION__, qp_mask, ret);
+		/* xxx.KK : This does NOTHING in amso driver, as
+		 * c2_qp_set_read_limits() is used to set rdma limits, this
+		 * seems to be a limitation of driver as modify_qp is
+		 * supposed to do the same. See mthca_modify_qp which
+		 * modifies the limits. Maybe chelsio driver supports this.
+		 */
+	}
+
+	return ret;
+}
+
 /*
  * CM_ID <-- ESTABLISHED
  *
@@ -483,6 +556,7 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
 	struct iwcm_id_private *cm_id_priv;
 	struct ib_qp *qp;
 	unsigned long flags;
+	int r_ird, r_ord;
 	int ret;
 
 	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
@@ -505,6 +579,44 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
 	cm_id_priv->qp = qp;
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
+	if (!iw_send_private_header || !cm_id->priv_hdr) {
+		/*
+		 * Not configured to send extra header, or peer didn't use
+		 * private header.
+		 */
+		goto accept;
+	}
+
+	/*
+	 * Retrieve client's connection parameters that were earlier saved
+	 * from an incoming client connect request, and negotiate with the
+	 * accept parameters. iw_param contains local connection values
+	 * while cm_id->priv_hdr contains peer connection values.
+	 */
+
+	r_ord = be32_to_cpu(cm_id->priv_hdr->ord);
+	r_ird = be32_to_cpu(cm_id->priv_hdr->ird);
+
+	kfree(cm_id->priv_hdr);
+	cm_id->priv_hdr = NULL;
+
+	ret = iw_negotiate_qp_conn_params(qp, &iw_param->ird,
+					  &iw_param->ord, r_ird, r_ord);
+	if (!ret) {
+		struct iwcm_priv_hdr *priv_hdr;
+
+		/*
+		 * iw_param's ird/ord now contains new values, update the
+		 * prepended header's ird/ord fields to reflect this change
+		 * so as to let the other side know of these updated values.
+		 */
+		priv_hdr = (struct iwcm_priv_hdr *) iw_param->private_data;
+
+		priv_hdr->ord = cpu_to_be32(iw_param->ord);
+		priv_hdr->ird = cpu_to_be32(iw_param->ird);
+	}
+
+accept:
 	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
 	if (ret) {
 		/* An error on accept precludes provider events */
@@ -584,6 +696,126 @@ int iw_cm_connect(struct iw_cm_id *cm_id
 EXPORT_SYMBOL(iw_cm_connect);
 
 /*
+ * parse_connection_params() : parse connection parameters for both
+ * active side requests and passive side responses, and negotiate
+ * suitable values for ird/ord.
+ * 
+ * Returns 0 on success and -errno on failure (via modify_qp). Also
+ * returns the actual iw_event on success by stripping the extra
+ * header that was added by the remote peer.
+ */
+static int parse_connection_params(struct iwcm_id_private *cm_id_priv,
+	struct iw_cm_event *iw_event, struct iw_cm_event *new_iw_event)
+{
+	int ret = 0;
+	int l_ird, l_ord, r_ird, r_ord;
+	struct iw_cm_id *cm_id;
+	struct iwcm_priv_hdr *local_priv_hdr, *remote_priv_hdr;
+
+	cm_id = &cm_id_priv->id;
+	local_priv_hdr = cm_id->priv_hdr;
+	*new_iw_event = *iw_event;
+
+	if (!iw_send_private_header) {
+		/* Not configured to send extra header */
+		BUG_ON(local_priv_hdr);
+		goto out;
+	}
+
+	remote_priv_hdr = iw_event->private_data;
+	if (!iw_has_private_header(remote_priv_hdr, iw_event)) {
+		/*
+		 * Remote side has not sent a iw private header, we use the
+		 * old protocol.
+		 */
+		if (local_priv_hdr) {
+			/*
+			 * This is the active side code path. We had already
+			 * sent a private header when doing the connect, but
+			 * the server does not implement private header, so
+			 * we should fail the connect response otherwise we
+			 * end up having passed wrong private data to the
+			 * application on the server.
+			 */
+			kfree(local_priv_hdr);
+			cm_id->priv_hdr = NULL;
+			ret = -EAGAIN;
+		}
+		goto out;
+	}
+
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CONNECT_REQUEST:
+		/*
+		 * This is Server code - a connect request was received.
+		 * Allocate a iwcm_priv_hdr that can be used later for
+		 * negotiation when accept() is performed.
+		 */
+		BUG_ON(local_priv_hdr);
+
+		/*
+		 * Save only the header (and not the private data passed
+		 * in the connect()). By the time an accept is done, we
+		 * will have the local params which we can then compare
+		 * with the remote params that we are just saving (the
+		 * peer's real private data is passed to the app later in
+		 * the flow when we call id->cm_handler(new_iw_event)).
+		 */
+		cm_id->priv_hdr = kmalloc(sizeof *cm_id->priv_hdr, GFP_KERNEL);
+                if (!cm_id->priv_hdr) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/*
+		 * Save all the remote connection parameters. These will be
+		 * later used for negotiation when an accept is performed.
+		 * Save in Big-Endian format.
+		 */
+		cm_id->priv_hdr->ord = remote_priv_hdr->ord;
+		cm_id->priv_hdr->ird = remote_priv_hdr->ird;
+		break;
+
+	case IW_CM_EVENT_CONNECT_REPLY:
+		/*
+		 * This is Client code - a connect response was received,
+		 * use local connection parameters saved earlier and the
+		 * remote connection parameters to negotiate.
+		 */
+
+		BUG_ON(!local_priv_hdr);
+
+
+		l_ord = be32_to_cpu(local_priv_hdr->ord);
+		l_ird = be32_to_cpu(local_priv_hdr->ird);
+		r_ord = be32_to_cpu(remote_priv_hdr->ord);
+		r_ird = be32_to_cpu(remote_priv_hdr->ird);
+
+		kfree(local_priv_hdr); 
+		cm_id->priv_hdr = NULL;
+
+		BUG_ON(!cm_id_priv->qp);
+		(void) iw_negotiate_qp_conn_params(cm_id_priv->qp, &l_ird,
+						   &l_ord, r_ird, r_ord);
+		break;
+
+	default:
+		/* Should never get here */
+		BUG();
+		break;
+	}
+
+	/*
+	 * Reset the new event values to point to the real private_data/len.
+	 */
+	new_iw_event->private_data_len -= sizeof(struct iwcm_priv_hdr);
+	new_iw_event->private_data = remote_priv_hdr->private_data;
+
+out:
+	return ret;
+}
+
+/*
  * Passive Side: new CM_ID <-- CONN_RECV
  *
  * Handles an inbound connect request. The function creates a new
@@ -604,6 +836,7 @@ static void cm_conn_req_handler(struct i
 	unsigned long flags;
 	struct iw_cm_id *cm_id;
 	struct iwcm_id_private *cm_id_priv;
+	struct iw_cm_event new_iw_event;
 	int ret;
 
 	/*
@@ -644,8 +877,13 @@ static void cm_conn_req_handler(struct i
 		goto out;
 	}
 
-	/* Call the client CM handler */
-	ret = cm_id->cm_handler(cm_id, iw_event);
+	/* Save the incoming connection parameters in cm_id->priv_hdr */
+	ret = parse_connection_params(cm_id_priv, iw_event, &new_iw_event);
+	if (!ret) {
+		/* Call the client CM handler */
+		ret = cm_id->cm_handler(cm_id, &new_iw_event);
+	}
+
 	if (ret) {
 		set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
 		destroy_cm_id(cm_id);
@@ -704,6 +942,7 @@ static int cm_conn_rep_handler(struct iw
 			       struct iw_cm_event *iw_event)
 {
 	unsigned long flags;
+        struct iw_cm_event new_iw_event;
 	int ret;
 
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
@@ -724,7 +963,15 @@ static int cm_conn_rep_handler(struct iw
 		cm_id_priv->state = IW_CM_STATE_IDLE;
 	}
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+	/*
+	 * iw_event contains peer connection parameters, while
+	 * cm_id->priv_hdr has local connection parameters. Use these
+	 * to negotiate mutually agreeable connection parameters.
+	 */
+	ret = parse_connection_params(cm_id_priv, iw_event, &new_iw_event);
+	if (!ret)
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &new_iw_event);
 
 	if (iw_event->private_data_len)
 		kfree(iw_event->private_data);
diff -ruNp org/include/rdma/iw_cm.h new/include/rdma/iw_cm.h
--- org/include/rdma/iw_cm.h	2007-02-16 11:01:08.000000000 +0530
+++ new/include/rdma/iw_cm.h	2007-02-16 11:01:12.000000000 +0530
@@ -86,6 +86,32 @@ typedef int (*iw_cm_handler)(struct iw_c
 typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
 				 struct iw_cm_event *event);
 
+/*
+ * Header prepended before the actual private data passed during
+ * connection establishment in connect/accept calls.
+ */
+struct iwcm_priv_hdr {
+	u32 ord;		/* Outbound RDMA Read Queue Depth */
+	u32 ird;		/* Inbound RDMA Read Queue Depth */
+				/* Other negotiation params come here */
+	char private_data[0];	/* copy the real private data here */
+} __attribute__ ((packed));
+
+/*
+ * Returns true if priv_hdr seemingly points to a real private header
+ * structure.  Conditions to determine that a private header is present
+ * are :
+ * 	- non NULL pointer.
+ *	- event's private data is atleast equal to private header size.
+ * Note : This can return false positives.
+ */
+static inline int iw_has_private_header(struct iwcm_priv_hdr *priv_hdr,
+					struct iw_cm_event *event)
+{
+	return (priv_hdr &&
+		event->private_data_len >= sizeof (struct iwcm_priv_hdr));
+}
+
 struct iw_cm_id {
 	iw_cm_handler		cm_handler;      /* client callback function */
 	void		        *context;	 /* client cb context */
@@ -93,6 +119,7 @@ struct iw_cm_id {
 	struct sockaddr_in      local_addr;
 	struct sockaddr_in	remote_addr;
 	void			*provider_data;	 /* provider private data */
+	struct iwcm_priv_hdr	*priv_hdr;	 /* Extra header added */
 	iw_event_handler        event_handler;   /* cb for provider
 						    events */
 	/* Used by provider to add and remove refs on IW cm_id */




More information about the general mailing list