[ofa-general] [GIT PULL] 2.6.24: please pull rdma-dev.git for-roland branch

Sean Hefty sean.hefty at intel.com
Mon Sep 10 16:04:47 PDT 2007


Roland, please pull from:

	git://git.openfabrics.org/~shefty/rdma-dev.git for-roland

This will pick up QoS and CM scalability changes that I would like to get
into 2.6.24 (and OFED 1.3).  All have been posted to the list before, though
the QoS patches have received more attention.

Sean Hefty (7):
      ib/ipoib: specify Traffic Class with PR queries for QoS support
      ib/sa: add new QoS fields to path record
      rdma/cm: add ability to specify type of service
      rdma/ucm: export setting service type to user space
      ib/srp: add QoS support through service ID
      ib/cm: modify interface to send MRAs in response to duplicate messages
      rdma/cm: queue IB CM MRAs to avoid unnecessary remote retries

 drivers/infiniband/core/cm.c                   |   51 +++++++----------
 drivers/infiniband/core/cma.c                  |   46 ++++++++++++---
 drivers/infiniband/core/sa_query.c             |   10 +--
 drivers/infiniband/core/ucma.c                 |   74 ++++++++++++++++++++++++-
 drivers/infiniband/ulp/ipoib/ipoib.h           |   22 +++++++
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |    8 +-
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   22 -------
 drivers/infiniband/ulp/srp/ib_srp.c            |    2 
 include/rdma/ib_cm.h                           |    7 +-
 include/rdma/ib_sa.h                           |   11 +--
 include/rdma/rdma_cm.h                         |   14 ++++
 include/rdma/rdma_user_cm.h                    |   18 ++++++
 12 files changed, 205 insertions(+), 80 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 4df269f..2e39236 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -2219,6 +2219,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
 {
 	struct cm_id_private *cm_id_priv;
 	struct ib_mad_send_buf *msg;
+	enum ib_cm_state cm_state;
+	enum ib_cm_lap_state lap_state;
+	enum cm_msg_response msg_response;
 	void *data;
 	unsigned long flags;
 	int ret;
@@ -2235,48 +2238,40 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	switch(cm_id_priv->id.state) {
 	case IB_CM_REQ_RCVD:
-		ret = cm_alloc_msg(cm_id_priv, &msg);
-		if (ret)
-			goto error1;
-
-		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
-			      CM_MSG_RESPONSE_REQ, service_timeout,
-			      private_data, private_data_len);
-		ret = ib_post_send_mad(msg, NULL);
-		if (ret)
-			goto error2;
-		cm_id->state = IB_CM_MRA_REQ_SENT;
+		cm_state = IB_CM_MRA_REQ_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REQ;
 		break;
 	case IB_CM_REP_RCVD:
-		ret = cm_alloc_msg(cm_id_priv, &msg);
-		if (ret)
-			goto error1;
-
-		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
-			      CM_MSG_RESPONSE_REP, service_timeout,
-			      private_data, private_data_len);
-		ret = ib_post_send_mad(msg, NULL);
-		if (ret)
-			goto error2;
-		cm_id->state = IB_CM_MRA_REP_SENT;
+		cm_state = IB_CM_MRA_REP_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REP;
 		break;
 	case IB_CM_ESTABLISHED:
+		cm_state = cm_id->state;
+		lap_state = IB_CM_MRA_LAP_SENT;
+		msg_response = CM_MSG_RESPONSE_OTHER;
+		break;
+	default:
+		ret = -EINVAL;
+		goto error1;
+	}
+
+	if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
 		ret = cm_alloc_msg(cm_id_priv, &msg);
 		if (ret)
 			goto error1;
 
 		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
-			      CM_MSG_RESPONSE_OTHER, service_timeout,
+			      msg_response, service_timeout,
 			      private_data, private_data_len);
 		ret = ib_post_send_mad(msg, NULL);
 		if (ret)
 			goto error2;
-		cm_id->lap_state = IB_CM_MRA_LAP_SENT;
-		break;
-	default:
-		ret = -EINVAL;
-		goto error1;
 	}
+
+	cm_id->state = cm_state;
+	cm_id->lap_state = lap_state;
 	cm_id_priv->service_timeout = service_timeout;
 	cm_set_private_data(cm_id_priv, data, private_data_len);
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9ffb998..7253952 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -52,6 +52,7 @@ MODULE_LICENSE("Dual BSD/GPL");
 
 #define CMA_CM_RESPONSE_TIMEOUT 20
 #define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
 
 static void cma_add_one(struct ib_device *device);
 static void cma_remove_one(struct ib_device *device);
@@ -138,6 +139,7 @@ struct rdma_id_private {
 	u32			qkey;
 	u32			qp_num;
 	u8			srq;
+	u8			tos;
 };
 
 struct cma_multicast {
@@ -1089,6 +1091,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 		event.param.ud.private_data_len =
 				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
 	} else {
+		ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
 		conn_id = cma_new_conn_id(&listen_id->id, ib_event);
 		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
 				       ib_event->private_data, offset);
@@ -1474,6 +1477,15 @@ err:
 }
 EXPORT_SYMBOL(rdma_listen);
 
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
 static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
 			      void *context)
 {
@@ -1498,23 +1510,37 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
 static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
 			      struct cma_work *work)
 {
-	struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
+	struct rdma_addr *addr = &id_priv->id.route.addr;
 	struct ib_sa_path_rec path_rec;
+	ib_sa_comp_mask comp_mask;
+	struct sockaddr_in6 *sin6;
 
 	memset(&path_rec, 0, sizeof path_rec);
-	ib_addr_get_sgid(addr, &path_rec.sgid);
-	ib_addr_get_dgid(addr, &path_rec.dgid);
-	path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
+	ib_addr_get_sgid(&addr->dev_addr, &path_rec.sgid);
+	ib_addr_get_dgid(&addr->dev_addr, &path_rec.dgid);
+	path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
 	path_rec.numb_path = 1;
 	path_rec.reversible = 1;
+	path_rec.service_id = cma_get_service_id(id_priv->id.ps, &addr->dst_addr);
+
+	comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+		    IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+		    IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+	if (addr->src_addr.sa_family == AF_INET) {
+		path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+		comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+	} else {
+		sin6 = (struct sockaddr_in6 *) &addr->src_addr;
+		path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+		comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+	}
 
 	id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
-				id_priv->id.port_num, &path_rec,
-				IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
-				IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
-				IB_SA_PATH_REC_REVERSIBLE,
-				timeout_ms, GFP_KERNEL,
-				cma_query_handler, work, &id_priv->query);
+					       id_priv->id.port_num, &path_rec,
+					       comp_mask, timeout_ms,
+					       GFP_KERNEL, cma_query_handler,
+					       work, &id_priv->query);
 
 	return (id_priv->query_id < 0) ? id_priv->query_id : 0;
 }
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index d271bd7..6f56bb5 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -123,14 +123,10 @@ static u32 tid;
 	.field_name          = "sa_path_rec:" #field
 
 static const struct ib_field path_rec_table[] = {
-	{ RESERVED,
+	{ PATH_REC_FIELD(service_id),
 	  .offset_words = 0,
 	  .offset_bits  = 0,
-	  .size_bits    = 32 },
-	{ RESERVED,
-	  .offset_words = 1,
-	  .offset_bits  = 0,
-	  .size_bits    = 32 },
+	  .size_bits    = 64 },
 	{ PATH_REC_FIELD(dgid),
 	  .offset_words = 2,
 	  .offset_bits  = 0,
@@ -179,7 +175,7 @@ static const struct ib_field path_rec_table[] = {
 	  .offset_words = 12,
 	  .offset_bits  = 16,
 	  .size_bits    = 16 },
-	{ RESERVED,
+	{ PATH_REC_FIELD(qos_class),
 	  .offset_words = 13,
 	  .offset_bits  = 0,
 	  .size_bits    = 12 },
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 53b4c94..90d675a 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -792,6 +792,78 @@ out:
 	return ret;
 }
 
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret = 0;
+
+	switch (optname) {
+	case RDMA_OPTION_ID_TOS:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+				 int optname, void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (level) {
+	case RDMA_OPTION_ID:
+		ret = ucma_set_option_id(ctx, optname, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_set_option cmd;
+	struct ucma_context *ctx;
+	void *optval;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	optval = kmalloc(cmd.optlen, GFP_KERNEL);
+	if (!optval) {
+		ret = -ENOMEM;
+		goto out1;
+	}
+
+	if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
+			   cmd.optlen)) {
+		ret = -EFAULT;
+		goto out2;
+	}
+
+	ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+				    cmd.optlen);
+out2:
+	kfree(optval);
+out1:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
 static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
 			   int in_len, int out_len)
 {
@@ -936,7 +1008,7 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
 	[RDMA_USER_CM_CMD_INIT_QP_ATTR]	= ucma_init_qp_attr,
 	[RDMA_USER_CM_CMD_GET_EVENT]	= ucma_get_event,
 	[RDMA_USER_CM_CMD_GET_OPTION]	= NULL,
-	[RDMA_USER_CM_CMD_SET_OPTION]	= NULL,
+	[RDMA_USER_CM_CMD_SET_OPTION]	= ucma_set_option,
 	[RDMA_USER_CM_CMD_NOTIFY]	= ucma_notify,
 	[RDMA_USER_CM_CMD_JOIN_MCAST]	= ucma_join_multicast,
 	[RDMA_USER_CM_CMD_LEAVE_MCAST]	= ucma_leave_multicast,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 285c143..fc16bce 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -113,7 +113,27 @@ struct ipoib_pseudoheader {
 	u8  hwaddr[INFINIBAND_ALEN];
 };
 
-struct ipoib_mcast;
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+	struct ib_sa_mcmember_rec mcmember;
+	struct ib_sa_multicast	 *mc;
+	struct ipoib_ah          *ah;
+
+	struct rb_node    rb_node;
+	struct list_head  list;
+
+	unsigned long created;
+	unsigned long backoff;
+
+	unsigned long flags;
+	unsigned char logcount;
+
+	struct list_head  neigh_list;
+
+	struct sk_buff_head pkt_queue;
+
+	struct net_device *dev;
+};
 
 struct ipoib_rx_buf {
 	struct sk_buff *skb;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 894b1dc..841e068 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -468,9 +468,10 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 	INIT_LIST_HEAD(&path->neigh_list);
 
 	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
-	path->pathrec.sgid      = priv->local_gid;
-	path->pathrec.pkey      = cpu_to_be16(priv->pkey);
-	path->pathrec.numb_path = 1;
+	path->pathrec.sgid          = priv->local_gid;
+	path->pathrec.pkey          = cpu_to_be16(priv->pkey);
+	path->pathrec.numb_path     = 1;
+	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 
 	return path;
 }
@@ -491,6 +492,7 @@ static int path_rec_start(struct net_device *dev,
 				   IB_SA_PATH_REC_DGID		|
 				   IB_SA_PATH_REC_SGID		|
 				   IB_SA_PATH_REC_NUMB_PATH	|
+				   IB_SA_PATH_REC_TRAFFIC_CLASS |
 				   IB_SA_PATH_REC_PKEY,
 				   1000, GFP_ATOMIC,
 				   path_rec_completion,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index aae3670..94a5709 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -57,28 +57,6 @@ MODULE_PARM_DESC(mcast_debug_level,
 
 static DEFINE_MUTEX(mcast_mutex);
 
-/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
-struct ipoib_mcast {
-	struct ib_sa_mcmember_rec mcmember;
-	struct ib_sa_multicast	 *mc;
-	struct ipoib_ah          *ah;
-
-	struct rb_node    rb_node;
-	struct list_head  list;
-
-	unsigned long created;
-	unsigned long backoff;
-
-	unsigned long flags;
-	unsigned char logcount;
-
-	struct list_head  neigh_list;
-
-	struct sk_buff_head pkt_queue;
-
-	struct net_device *dev;
-};
-
 struct ipoib_mcast_iter {
 	struct net_device *dev;
 	union ib_gid       mgid;
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f6a0514..9ccc638 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -285,6 +285,7 @@ static int srp_lookup_path(struct srp_target_port *target)
 						   target->srp_host->dev->dev,
 						   target->srp_host->port,
 						   &target->path,
+						   IB_SA_PATH_REC_SERVICE_ID	|
 						   IB_SA_PATH_REC_DGID		|
 						   IB_SA_PATH_REC_SGID		|
 						   IB_SA_PATH_REC_NUMB_PATH	|
@@ -1692,6 +1693,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
 				goto out;
 			}
 			target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16));
+			target->path.service_id = target->service_id;
 			kfree(p);
 			break;
 
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 12243e8..a627c86 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -477,12 +477,15 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
 		   const void *private_data,
 		   u8 private_data_len);
 
+#define IB_CM_MRA_FLAG_DELAY 0x80  /* Send MRA only after a duplicate msg */
+
 /**
  * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
  *   message.
  * @cm_id: Connection identifier associated with the connection message.
- * @service_timeout: The maximum time required for the sender to reply to
- *   to the connection message.
+ * @service_timeout: The lower 5-bits specify the maximum time required for
+ *   the sender to reply to to the connection message.  The upper 3-bits
+ *   specify additional control flags.
  * @private_data: Optional user-defined private data sent with the
  *   message receipt acknowledgement.
  * @private_data_len: Size of the private data buffer, in bytes.
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 5e26b2f..942692b 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -109,8 +109,8 @@ enum ib_sa_selector {
  * Reserved rows are indicated with comments to help maintainability.
  */
 
-/* reserved:								 0 */
-/* reserved:								 1 */
+#define IB_SA_PATH_REC_SERVICE_ID		       (IB_SA_COMP_MASK( 0) |\
+							IB_SA_COMP_MASK( 1))
 #define IB_SA_PATH_REC_DGID				IB_SA_COMP_MASK( 2)
 #define IB_SA_PATH_REC_SGID				IB_SA_COMP_MASK( 3)
 #define IB_SA_PATH_REC_DLID				IB_SA_COMP_MASK( 4)
@@ -123,7 +123,7 @@ enum ib_sa_selector {
 #define IB_SA_PATH_REC_REVERSIBLE			IB_SA_COMP_MASK(11)
 #define IB_SA_PATH_REC_NUMB_PATH			IB_SA_COMP_MASK(12)
 #define IB_SA_PATH_REC_PKEY				IB_SA_COMP_MASK(13)
-/* reserved:								14 */
+#define IB_SA_PATH_REC_QOS_CLASS			IB_SA_COMP_MASK(14)
 #define IB_SA_PATH_REC_SL				IB_SA_COMP_MASK(15)
 #define IB_SA_PATH_REC_MTU_SELECTOR			IB_SA_COMP_MASK(16)
 #define IB_SA_PATH_REC_MTU				IB_SA_COMP_MASK(17)
@@ -134,8 +134,7 @@ enum ib_sa_selector {
 #define IB_SA_PATH_REC_PREFERENCE			IB_SA_COMP_MASK(22)
 
 struct ib_sa_path_rec {
-	/* reserved */
-	/* reserved */
+	__be64       service_id;
 	union ib_gid dgid;
 	union ib_gid sgid;
 	__be16       dlid;
@@ -148,7 +147,7 @@ struct ib_sa_path_rec {
 	int          reversible;
 	u8           numb_path;
 	__be16       pkey;
-	/* reserved */
+	__be16       qos_class;
 	u8           sl;
 	u8           mtu_selector;
 	u8           mtu;
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 2d6a770..010f876 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -314,4 +314,18 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
  */
 void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
 
+/**
+ * rdma_set_service_type - Set the type of service associated with a
+ *   connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @tos: Type of service.
+ *
+ * The type of service is interpretted as a differentiated service
+ * field (RFC 2474).  The service type should be specified before
+ * performing route resolution, as existing communication on the
+ * connection identifier may be unaffected.  The type of service
+ * requested may not be supported by the network to all destinations.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos);
+
 #endif /* RDMA_CM_H */
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index f632b0c..9749c1b 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -212,4 +212,22 @@ struct rdma_ucm_event_resp {
 	} param;
 };
 
+/* Option levels */
+enum {
+	RDMA_OPTION_ID		= 0
+};
+
+/* Option details */
+enum {
+	RDMA_OPTION_ID_TOS	= 0
+};
+
+struct rdma_ucm_set_option {
+	__u64 optval;
+	__u32 id;
+	__u32 level;
+	__u32 optname;
+	__u32 optlen;
+};
+
 #endif /* RDMA_USER_CM_H */




More information about the general mailing list