[ofa-general] [GIT PULL] 2.6.22: please pull rdma-dev.git
Sean Hefty
sean.hefty at intel.com
Thu Apr 5 14:33:35 PDT 2007
Roland, please review and pull patches from
git.openfabrics.org/~shefty/rdma-dev.git for-roland
This will pull in some patches that I would like queued for 2.6.22.
Sean Hefty (6):
rdma_ucm: simplify ucma_get_event code
ib_ucm: simplify ib_ucm_event code
ib_sa: set src_path_bits correctly in ib_init_ah_from_path
IB/cm: limit cm message timeout
IB/mad: Fix GRH handling for sent/received MADs
IB/ipoib: use ib_init_ah_from_path to initialize ah_attr
Patch details are listed below for easier review / feedback.
- Sean
commit 6042f5b86a92af4392c85949049f237396447d69
Author: Sean Hefty <sean.hefty at intel.com>
Date: Thu Apr 5 11:50:11 2007 -0700
IB/ipoib: use ib_init_ah_from_path to initialize ah_attr
To support destinations that are not on the local IB subnet,
IPoIB should include the GRH information when constructing an
address handle. Using the existing ib_init_ah_from_path call
will do this for us.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 0741c6d..5a9ff7f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -395,14 +395,10 @@ static void path_rec_completion(int status,
skb_queue_head_init(&skqueue);
if (!status) {
- struct ib_ah_attr av = {
- .dlid = be16_to_cpu(pathrec->dlid),
- .sl = pathrec->sl,
- .port_num = priv->port,
- .static_rate = pathrec->rate
- };
-
- ah = ipoib_create_ah(dev, priv->pd, &av);
+ struct ib_ah_attr av;
+
+ if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
+ ah = ipoib_create_ah(dev, priv->pd, &av);
}
spin_lock_irqsave(&priv->lock, flags);
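A note for review: the point of switching to ib_init_ah_from_path() is that it
also fills in the GRH part of the ah_attr when the path leaves the local
subnet. Roughly, the off-subnet case reduces to something like the sketch
below (this is illustrative, not a copy of the helper; the hop_limit test and
the GID-table lookup for sgid_index are assumptions on my part):
        /* sketch only: what ib_init_ah_from_path() is expected to add for an
         * off-subnet destination, i.e. a path record with hop_limit > 1 */
        if (rec->hop_limit > 1) {
                ah_attr->ah_flags          = IB_AH_GRH;
                ah_attr->grh.dgid          = rec->dgid;
                ah_attr->grh.sgid_index    = sgid_index; /* from the local GID table */
                ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
                ah_attr->grh.hop_limit     = rec->hop_limit;
                ah_attr->grh.traffic_class = rec->traffic_class;
        }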
commit 86cbcbb332b85501df98a7dccd8e2d40d1c2ffa0
Author: Sean Hefty <sean.hefty at intel.com>
Date: Thu Apr 5 11:49:21 2007 -0700
IB/mad: Fix GRH handling for sent/received MADs
We need to set the SGID index when sending routed MADs, and pass the
received GRH information up to userspace when a MAD is received.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index c069ebe..7774cf5 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -231,12 +231,17 @@ static void recv_handler(struct ib_mad_agent *agent,
packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits;
packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
if (packet->mad.hdr.grh_present) {
- /* XXX parse GRH */
- packet->mad.hdr.gid_index = 0;
- packet->mad.hdr.hop_limit = 0;
- packet->mad.hdr.traffic_class = 0;
- memset(packet->mad.hdr.gid, 0, 16);
- packet->mad.hdr.flow_label = 0;
+ struct ib_ah_attr ah_attr;
+
+ ib_init_ah_from_wc(agent->device, agent->port_num,
+ mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+
+ packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+ packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+ packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+ memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+ packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
}
if (queue_packet(file, agent, packet))
@@ -473,6 +478,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
if (packet->mad.hdr.grh_present) {
ah_attr.ah_flags = IB_AH_GRH;
memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+ ah_attr.grh.sgid_index = packet->mad.hdr.gid_index;
ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label);
ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit;
ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class;
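For reviewers: with this change a userspace MAD consumer finally sees real GRH
data instead of zeros. A rough example of what that looks like on the read
side (illustrative only; it assumes the ib_user_mad layout exported to
userspace in <rdma/ib_user_mad.h>, and dump_grh() is just a made-up helper
name):
        #include <stdio.h>
        #include <arpa/inet.h>
        #include <rdma/ib_user_mad.h>

        /* illustrative helper, not part of the patch */
        static void dump_grh(const struct ib_user_mad *umad)
        {
                if (!umad->hdr.grh_present)
                        return;
                /* gid_index selects our local SGID; gid is the GID to reply to */
                printf("sgid_index %u hop_limit %u tclass %u flow_label 0x%x\n",
                       umad->hdr.gid_index, umad->hdr.hop_limit,
                       umad->hdr.traffic_class,
                       ntohl(umad->hdr.flow_label));
        }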
commit 3bed3bb2d0bb02ca8a590111c57fc1843624d2a4
Author: Sean Hefty <sean.hefty at intel.com>
Date: Thu Apr 5 10:51:16 2007 -0700
IB/cm: limit cm message timeout
Limit the timeout that the ib_cm waits for a response to a message, to
avoid excessively large (on the order of hours) timeout values. This
prevents tracking requests, and consuming resources for them, over
extended periods of time, and allows quicker retries.
This also works around a bug in an Engenio SRP target that sends an
excessively large value (> 1 hour) as its service timeout.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 842cd0b..706fdbf 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -54,6 +54,17 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
+#define PFX "ib_cm: "
+
+/*
+ * Limit CM message timeouts to something reasonable:
+ * 32 seconds per message, with up to 15 retries
+ */
+static int max_timeout = 23;
+module_param(max_timeout, int, 0644);
+MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout "
+ "(default=23, or ~32 seconds)");
+
static void cm_add_one(struct ib_device *device);
static void cm_remove_one(struct ib_device *device);
@@ -888,11 +899,23 @@ static void cm_format_req(struct cm_req_msg *req_msg,
cm_req_set_init_depth(req_msg, param->initiator_depth);
cm_req_set_remote_resp_timeout(req_msg,
param->remote_cm_response_timeout);
+ if (param->remote_cm_response_timeout > (u8) max_timeout) {
+ printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > "
+ "%d, decreasing\n", param->remote_cm_response_timeout,
+ max_timeout);
+ cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout);
+ }
cm_req_set_qp_type(req_msg, param->qp_type);
cm_req_set_flow_ctrl(req_msg, param->flow_control);
cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
cm_req_set_local_resp_timeout(req_msg,
param->local_cm_response_timeout);
+ if (param->local_cm_response_timeout > (u8) max_timeout) {
+ printk(KERN_WARNING PFX "req local_cm_response_timeout %d > "
+ "%d, decreasing\n", param->local_cm_response_timeout,
+ max_timeout);
+ cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout);
+ }
cm_req_set_retry_count(req_msg, param->retry_count);
req_msg->pkey = param->primary_path->pkey;
cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
@@ -1002,6 +1025,11 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
param->primary_path->packet_life_time) * 2 +
cm_convert_to_ms(
param->remote_cm_response_timeout);
+ if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+ printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n",
+ cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout));
+ cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+ }
cm_id_priv->max_cm_retries = param->max_cm_retries;
cm_id_priv->initiator_depth = param->initiator_depth;
cm_id_priv->responder_resources = param->responder_resources;
@@ -1401,6 +1429,13 @@ static int cm_req_handler(struct cm_work *work)
cm_id_priv->tid = req_msg->hdr.tid;
cm_id_priv->timeout_ms = cm_convert_to_ms(
cm_req_get_local_resp_timeout(req_msg));
+ if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) {
+ printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, "
+ "decreasing used timeout_ms\n",
+ cm_req_get_local_resp_timeout(req_msg), max_timeout);
+ cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+ }
+
cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
@@ -2304,6 +2339,12 @@ static int cm_mra_handler(struct cm_work *work)
cm_mra_get_service_timeout(mra_msg);
timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
cm_convert_to_ms(cm_id_priv->av.packet_life_time);
+ if (timeout > cm_convert_to_ms(max_timeout)) {
+ printk(KERN_WARNING PFX "calculated mra timeout %d > %d, "
+ "decreasing used timeout_ms\n", timeout,
+ cm_convert_to_ms(max_timeout));
+ timeout = cm_convert_to_ms(max_timeout);
+ }
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->id.state) {
@@ -2707,6 +2748,12 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
cm_id->service_id = param->service_id;
cm_id->service_mask = __constant_cpu_to_be64(~0ULL);
cm_id_priv->timeout_ms = param->timeout_ms;
+ if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+ printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, "
+ "decreasing used timeout_ms\n", param->timeout_ms,
+ cm_convert_to_ms(max_timeout));
+ cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+ }
cm_id_priv->max_cm_retries = param->max_cm_retries;
ret = cm_alloc_msg(cm_id_priv, &msg);
if (ret)
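For reviewers, the arithmetic behind the 23: the CM timeout fields are 5-bit
exponents, and the real timeout is roughly 4.096us * 2^t. Approximating
4.096us as 4us gives about (1 << (t - 8)) ms, which appears to be what
cm_convert_to_ms() computes. A sketch (not code from the patch):
        /* illustrative helper: approximate IBA time exponent -> milliseconds */
        static inline int approx_iba_time_to_ms(int t)
        {
                return 1 << (t > 8 ? t - 8 : 0);
        }

        /* t = 23  ->  1 << 15 = 32768 ms, i.e. the "~32 seconds" in the
         * parameter description; an uncapped t = 31 would be 1 << 23 ms,
         * well over two hours per message before any retries. */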
commit e847d67ea97caabb6aaa5b9e8a1c47bba9bc3824
Author: Sean Hefty <sean.hefty at intel.com>
Date: Thu Apr 5 10:51:10 2007 -0700
ib_sa: set src_path_bits correctly in ib_init_ah_from_path
The src_path_bits value must have the base LID masked off, keeping only
the low-order path bits covered by the port's LMC.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 68db633..9a7eaad 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -57,6 +57,7 @@ MODULE_LICENSE("Dual BSD/GPL");
struct ib_sa_sm_ah {
struct ib_ah *ah;
struct kref ref;
+ u8 src_path_mask;
};
struct ib_sa_port {
@@ -380,6 +381,7 @@ static void update_sm_ah(struct work_struct *work)
}
kref_init(&new_ah->ref);
+ new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
memset(&ah_attr, 0, sizeof ah_attr);
ah_attr.dlid = port_attr.sm_lid;
@@ -460,6 +462,25 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
}
EXPORT_SYMBOL(ib_sa_cancel_query);
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+ struct ib_sa_device *sa_dev;
+ struct ib_sa_port *port;
+ unsigned long flags;
+ u8 src_path_mask;
+
+ sa_dev = ib_get_client_data(device, &sa_client);
+ if (!sa_dev)
+ return 0x7f;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->ah_lock, flags);
+ src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ return src_path_mask;
+}
+
int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
{
@@ -469,7 +490,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
memset(ah_attr, 0, sizeof *ah_attr);
ah_attr->dlid = be16_to_cpu(rec->dlid);
ah_attr->sl = rec->sl;
- ah_attr->src_path_bits = be16_to_cpu(rec->slid) & 0x7f;
+ ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+ get_src_path_mask(device, port_num);
ah_attr->port_num = port_num;
ah_attr->static_rate = rec->rate;
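A quick worked example of why the fixed 0x7f mask is wrong (numbers made up
for illustration): with LMC = 2 and an SM-assigned base LID of 0x24, the port
owns LIDs 0x24 through 0x27. For a path record whose SLID is 0x26, the source
path bits should be just the low LMC bits:
        /* old:  0x26 & 0x7f            = 0x26  (base LID bits leak in)  */
        /* new:  0x26 & ((1 << 2) - 1)  = 0x02  (only the real path bits) */
        ah_attr->src_path_bits = be16_to_cpu(rec->slid) & ((1 << lmc) - 1);
Everything above the low LMC bits is the base LID and belongs to the port,
not to the path.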
commit 1e6ed3730a3d1db723e4bfccc5f1cfd1b0691aab
Author: Sean Hefty <sean.hefty at intel.com>
Date: Thu Apr 5 10:51:05 2007 -0700
ib_ucm: simplify ib_ucm_event code
Simplify the wait on event code.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index ee51d79..2586a3e 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -407,29 +407,18 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file,
mutex_lock(&file->file_mutex);
while (list_empty(&file->events)) {
+ mutex_unlock(&file->file_mutex);
- if (file->filp->f_flags & O_NONBLOCK) {
- result = -EAGAIN;
- break;
- }
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
- if (signal_pending(current)) {
- result = -ERESTARTSYS;
- break;
- }
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->events)))
+ return -ERESTARTSYS;
- prepare_to_wait(&file->poll_wait, &wait, TASK_INTERRUPTIBLE);
-
- mutex_unlock(&file->file_mutex);
- schedule();
mutex_lock(&file->file_mutex);
-
- finish_wait(&file->poll_wait, &wait);
}
- if (result)
- goto done;
-
uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
if (ib_ucm_new_cm_id(uevent->resp.event)) {
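For review, the key point of this change (and of the identical rdma_ucm
change below) is that wait_event_interruptible() already does the
prepare_to_wait/schedule/finish_wait dance and the signal_pending() check,
and that testing the list without the mutex is safe because the while loop
re-checks it with the mutex held after waking. The resulting wait loop has
this shape (names here are illustrative, not the driver's):
        mutex_lock(&file_mutex);
        while (list_empty(&event_list)) {
                mutex_unlock(&file_mutex);

                if (nonblocking)
                        return -EAGAIN;

                /* sleeps until the condition becomes true or a signal arrives */
                if (wait_event_interruptible(poll_wait, !list_empty(&event_list)))
                        return -ERESTARTSYS;

                /* retake the lock; the loop re-checks the list in case
                 * another reader consumed the event first */
                mutex_lock(&file_mutex);
        }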
commit ed0b96bf383b3352c400e684c1b8fcb4868f68f2
Author: Sean Hefty <sean.hefty at intel.com>
Date: Thu Apr 5 10:49:51 2007 -0700
rdma_ucm: simplify ucma_get_event code
Simplify the wait on event code.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index c859134..53b4c94 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -306,26 +306,18 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
mutex_lock(&file->mut);
while (list_empty(&file->event_list)) {
- if (file->filp->f_flags & O_NONBLOCK) {
- ret = -EAGAIN;
- break;
- }
+ mutex_unlock(&file->mut);
- if (signal_pending(current)) {
- ret = -ERESTARTSYS;
- break;
- }
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
- prepare_to_wait(&file->poll_wait, &wait, TASK_INTERRUPTIBLE);
- mutex_unlock(&file->mut);
- schedule();
mutex_lock(&file->mut);
- finish_wait(&file->poll_wait, &wait);
}
- if (ret)
- goto done;
-
uevent = list_entry(file->event_list.next, struct ucma_event, list);
if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {