[ofa-general] [GIT PULL] 2.6.24: please pull rdma-dev.git for-roland branch
Sean Hefty
sean.hefty at intel.com
Mon Sep 10 16:04:47 PDT 2007
Roland, please pull from:
git://git.openfabrics.org/~shefty/rdma-dev.git for-roland
This will pick up QoS and CM scalability changes that I would like to get
into 2.6.24 (and OFED 1.3). All have been posted to the list before, though
the QoS patches have received more attention.
Sean Hefty (7):
ib/ipoib: specify Traffic Class with PR queries for QoS support
ib/sa: add new QoS fields to path record
rdma/cm: add ability to specify type of service
rdma/ucm: export setting service type to user space
ib/srp: add QoS support through service ID
ib/cm: modify interface to send MRAs in response to duplicate messages
rdma/cm: queue IB CM MRAs to avoid unnecessary remote retries
drivers/infiniband/core/cm.c | 51 +++++++----------
drivers/infiniband/core/cma.c | 46 ++++++++++++---
drivers/infiniband/core/sa_query.c | 10 +--
drivers/infiniband/core/ucma.c | 74 ++++++++++++++++++++++++-
drivers/infiniband/ulp/ipoib/ipoib.h | 22 +++++++
drivers/infiniband/ulp/ipoib/ipoib_main.c | 8 +-
drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 22 -------
drivers/infiniband/ulp/srp/ib_srp.c | 2
include/rdma/ib_cm.h | 7 +-
include/rdma/ib_sa.h | 11 +--
include/rdma/rdma_cm.h | 14 ++++
include/rdma/rdma_user_cm.h | 18 ++++++
12 files changed, 205 insertions(+), 80 deletions(-)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 4df269f..2e39236 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -2219,6 +2219,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
{
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
+ enum ib_cm_state cm_state;
+ enum ib_cm_lap_state lap_state;
+ enum cm_msg_response msg_response;
void *data;
unsigned long flags;
int ret;
@@ -2235,48 +2238,40 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch(cm_id_priv->id.state) {
case IB_CM_REQ_RCVD:
- ret = cm_alloc_msg(cm_id_priv, &msg);
- if (ret)
- goto error1;
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REQ, service_timeout,
- private_data, private_data_len);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error2;
- cm_id->state = IB_CM_MRA_REQ_SENT;
+ cm_state = IB_CM_MRA_REQ_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
- ret = cm_alloc_msg(cm_id_priv, &msg);
- if (ret)
- goto error1;
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REP, service_timeout,
- private_data, private_data_len);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error2;
- cm_id->state = IB_CM_MRA_REP_SENT;
+ cm_state = IB_CM_MRA_REP_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
+ cm_state = cm_id->state;
+ lap_state = IB_CM_MRA_LAP_SENT;
+ msg_response = CM_MSG_RESPONSE_OTHER;
+ break;
+ default:
+ ret = -EINVAL;
+ goto error1;
+ }
+
+ if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
ret = cm_alloc_msg(cm_id_priv, &msg);
if (ret)
goto error1;
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_OTHER, service_timeout,
+ msg_response, service_timeout,
private_data, private_data_len);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto error2;
- cm_id->lap_state = IB_CM_MRA_LAP_SENT;
- break;
- default:
- ret = -EINVAL;
- goto error1;
}
+
+ cm_id->state = cm_state;
+ cm_id->lap_state = lap_state;
cm_id_priv->service_timeout = service_timeout;
cm_set_private_data(cm_id_priv, data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9ffb998..7253952 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -52,6 +52,7 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
static void cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device);
@@ -138,6 +139,7 @@ struct rdma_id_private {
u32 qkey;
u32 qp_num;
u8 srq;
+ u8 tos;
};
struct cma_multicast {
@@ -1089,6 +1091,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
event.param.ud.private_data_len =
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
} else {
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
conn_id = cma_new_conn_id(&listen_id->id, ib_event);
cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
ib_event->private_data, offset);
@@ -1474,6 +1477,15 @@ err:
}
EXPORT_SYMBOL(rdma_listen);
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
void *context)
{
@@ -1498,23 +1510,37 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
struct cma_work *work)
{
- struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
+ struct rdma_addr *addr = &id_priv->id.route.addr;
struct ib_sa_path_rec path_rec;
+ ib_sa_comp_mask comp_mask;
+ struct sockaddr_in6 *sin6;
memset(&path_rec, 0, sizeof path_rec);
- ib_addr_get_sgid(addr, &path_rec.sgid);
- ib_addr_get_dgid(addr, &path_rec.dgid);
- path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
+ ib_addr_get_sgid(&addr->dev_addr, &path_rec.sgid);
+ ib_addr_get_dgid(&addr->dev_addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
path_rec.numb_path = 1;
path_rec.reversible = 1;
+ path_rec.service_id = cma_get_service_id(id_priv->id.ps, &addr->dst_addr);
+
+ comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+ if (addr->src_addr.sa_family == AF_INET) {
+ path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+ comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+ } else {
+ sin6 = (struct sockaddr_in6 *) &addr->src_addr;
+ path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ }
id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
- id_priv->id.port_num, &path_rec,
- IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
- IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
- IB_SA_PATH_REC_REVERSIBLE,
- timeout_ms, GFP_KERNEL,
- cma_query_handler, work, &id_priv->query);
+ id_priv->id.port_num, &path_rec,
+ comp_mask, timeout_ms,
+ GFP_KERNEL, cma_query_handler,
+ work, &id_priv->query);
return (id_priv->query_id < 0) ? id_priv->query_id : 0;
}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index d271bd7..6f56bb5 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -123,14 +123,10 @@ static u32 tid;
.field_name = "sa_path_rec:" #field
static const struct ib_field path_rec_table[] = {
- { RESERVED,
+ { PATH_REC_FIELD(service_id),
.offset_words = 0,
.offset_bits = 0,
- .size_bits = 32 },
- { RESERVED,
- .offset_words = 1,
- .offset_bits = 0,
- .size_bits = 32 },
+ .size_bits = 64 },
{ PATH_REC_FIELD(dgid),
.offset_words = 2,
.offset_bits = 0,
@@ -179,7 +175,7 @@ static const struct ib_field path_rec_table[] = {
.offset_words = 12,
.offset_bits = 16,
.size_bits = 16 },
- { RESERVED,
+ { PATH_REC_FIELD(qos_class),
.offset_words = 13,
.offset_bits = 0,
.size_bits = 12 },
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 53b4c94..90d675a 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -792,6 +792,78 @@ out:
return ret;
}
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret = 0;
+
+ switch (optname) {
+ case RDMA_OPTION_ID_TOS:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+ int optname, void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (level) {
+ case RDMA_OPTION_ID:
+ ret = ucma_set_option_id(ctx, optname, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_set_option cmd;
+ struct ucma_context *ctx;
+ void *optval;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ optval = kmalloc(cmd.optlen, GFP_KERNEL);
+ if (!optval) {
+ ret = -ENOMEM;
+ goto out1;
+ }
+
+ if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
+ cmd.optlen)) {
+ ret = -EFAULT;
+ goto out2;
+ }
+
+ ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+ cmd.optlen);
+out2:
+ kfree(optval);
+out1:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
int in_len, int out_len)
{
@@ -936,7 +1008,7 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
[RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
[RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
[RDMA_USER_CM_CMD_GET_OPTION] = NULL,
- [RDMA_USER_CM_CMD_SET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
[RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
[RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast,
[RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 285c143..fc16bce 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -113,7 +113,27 @@ struct ipoib_pseudoheader {
u8 hwaddr[INFINIBAND_ALEN];
};
-struct ipoib_mcast;
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+ struct ib_sa_mcmember_rec mcmember;
+ struct ib_sa_multicast *mc;
+ struct ipoib_ah *ah;
+
+ struct rb_node rb_node;
+ struct list_head list;
+
+ unsigned long created;
+ unsigned long backoff;
+
+ unsigned long flags;
+ unsigned char logcount;
+
+ struct list_head neigh_list;
+
+ struct sk_buff_head pkt_queue;
+
+ struct net_device *dev;
+};
struct ipoib_rx_buf {
struct sk_buff *skb;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 894b1dc..841e068 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -468,9 +468,10 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
INIT_LIST_HEAD(&path->neigh_list);
memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
- path->pathrec.sgid = priv->local_gid;
- path->pathrec.pkey = cpu_to_be16(priv->pkey);
- path->pathrec.numb_path = 1;
+ path->pathrec.sgid = priv->local_gid;
+ path->pathrec.pkey = cpu_to_be16(priv->pkey);
+ path->pathrec.numb_path = 1;
+ path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
return path;
}
@@ -491,6 +492,7 @@ static int path_rec_start(struct net_device *dev,
IB_SA_PATH_REC_DGID |
IB_SA_PATH_REC_SGID |
IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_TRAFFIC_CLASS |
IB_SA_PATH_REC_PKEY,
1000, GFP_ATOMIC,
path_rec_completion,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index aae3670..94a5709 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -57,28 +57,6 @@ MODULE_PARM_DESC(mcast_debug_level,
static DEFINE_MUTEX(mcast_mutex);
-/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
-struct ipoib_mcast {
- struct ib_sa_mcmember_rec mcmember;
- struct ib_sa_multicast *mc;
- struct ipoib_ah *ah;
-
- struct rb_node rb_node;
- struct list_head list;
-
- unsigned long created;
- unsigned long backoff;
-
- unsigned long flags;
- unsigned char logcount;
-
- struct list_head neigh_list;
-
- struct sk_buff_head pkt_queue;
-
- struct net_device *dev;
-};
-
struct ipoib_mcast_iter {
struct net_device *dev;
union ib_gid mgid;
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f6a0514..9ccc638 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -285,6 +285,7 @@ static int srp_lookup_path(struct srp_target_port *target)
target->srp_host->dev->dev,
target->srp_host->port,
&target->path,
+ IB_SA_PATH_REC_SERVICE_ID |
IB_SA_PATH_REC_DGID |
IB_SA_PATH_REC_SGID |
IB_SA_PATH_REC_NUMB_PATH |
@@ -1692,6 +1693,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
goto out;
}
target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16));
+ target->path.service_id = target->service_id;
kfree(p);
break;
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 12243e8..a627c86 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -477,12 +477,15 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
const void *private_data,
u8 private_data_len);
+#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */
+
/**
* ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
* message.
* @cm_id: Connection identifier associated with the connection message.
- * @service_timeout: The maximum time required for the sender to reply to
- * to the connection message.
+ * @service_timeout: The lower 5-bits specify the maximum time required for
+ * the sender to reply to to the connection message. The upper 3-bits
+ * specify additional control flags.
* @private_data: Optional user-defined private data sent with the
* message receipt acknowledgement.
* @private_data_len: Size of the private data buffer, in bytes.
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 5e26b2f..942692b 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -109,8 +109,8 @@ enum ib_sa_selector {
* Reserved rows are indicated with comments to help maintainability.
*/
-/* reserved: 0 */
-/* reserved: 1 */
+#define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\
+ IB_SA_COMP_MASK( 1))
#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2)
#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3)
#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4)
@@ -123,7 +123,7 @@ enum ib_sa_selector {
#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11)
#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12)
#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13)
-/* reserved: 14 */
+#define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14)
#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15)
#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16)
#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17)
@@ -134,8 +134,7 @@ enum ib_sa_selector {
#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22)
struct ib_sa_path_rec {
- /* reserved */
- /* reserved */
+ __be64 service_id;
union ib_gid dgid;
union ib_gid sgid;
__be16 dlid;
@@ -148,7 +147,7 @@ struct ib_sa_path_rec {
int reversible;
u8 numb_path;
__be16 pkey;
- /* reserved */
+ __be16 qos_class;
u8 sl;
u8 mtu_selector;
u8 mtu;
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 2d6a770..010f876 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -314,4 +314,18 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
*/
void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+/**
+ * rdma_set_service_type - Set the type of service associated with a
+ * connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @tos: Type of service.
+ *
+ * The type of service is interpretted as a differentiated service
+ * field (RFC 2474). The service type should be specified before
+ * performing route resolution, as existing communication on the
+ * connection identifier may be unaffected. The type of service
+ * requested may not be supported by the network to all destinations.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos);
+
#endif /* RDMA_CM_H */
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index f632b0c..9749c1b 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -212,4 +212,22 @@ struct rdma_ucm_event_resp {
} param;
};
+/* Option levels */
+enum {
+ RDMA_OPTION_ID = 0
+};
+
+/* Option details */
+enum {
+ RDMA_OPTION_ID_TOS = 0
+};
+
+struct rdma_ucm_set_option {
+ __u64 optval;
+ __u32 id;
+ __u32 level;
+ __u32 optname;
+ __u32 optlen;
+};
+
#endif /* RDMA_USER_CM_H */
More information about the general
mailing list