[ewg] [PATCH] Proposal for MAD Busy handling

Fri Oct 8 10:39:38 PDT 2010

Sean, Jason,

I backed off on this because the migration to OFED 1.5.2 and other issues were consuming all of my time. I've had this patch for quite a while, but only recently had time to rework and test it for 1.5.2.

The intent of this patch is to try to address the feedback you gave me earlier this year. It does NOT implement the ABI/API changes that would be needed in user space to take advantage of the new features, but it lays the groundwork for doing so. In addition, it provides two new module parameters that allow the administrator to coerce existing code into using the new capabilities.

Initially, I had tried to completely separate BUSY retries from timeout handling, but that seemed difficult due to the way the timeout code is structured. As a result, true timeouts and busy handling still use the same timeout values, but I was still able to address the idea of randomizing the retry timeout if desired.

By default, the behavior of ib_mad with respect to BUSY responses is unchanged. If, however, a send work request is provided that has the new "wait_on_busy" parameter set, ib_mad will ignore BUSY responses to that WR, allowing it to time out and retry as if no response had been received.

In addition, if the send WR has the new "randomized_wait" parameter set, each time the WR times out, the timeout for the next retry is set to (send_wr->timeout_ms + (511 << send_wr->retries) - (random32() & 511)). In other words, on the first retry, the randomization code will add between 0 and 1/2 second to the timeout. On the second, it will add between 1 and 1.5 seconds to the timeout, on the 3rd, between 2 and 2.5 seconds, on the 4th, between 4 and 4.5, et cetera. In addition, a new private field, total_timeout, has been added to the WR and is initialized to (send_wr->timeout * send_wr->max_retries). Retry timeouts are adjusted so that their sum cannot exceed this value.

Finally, I've added two module parameters that coerce all mad work requests to use one or both of these settings:

parm:           treat_busy_as_timeout:When true, treat BUSY responses as if they were timeouts. (int)
parm:           randomized_wait:When true, use a randomized backoff algorithm to control retries for timeouts. (int)

As I mentioned in the past, these changes solve a problem we see in the real world all the time (the SM being pounded by "unintelligent" queries), so I strongly hope this addresses your concerns and that we can get it added to the next release of OFED.


-----------------

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 64e660c..88ae047 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -41,6 +41,11 @@
 #include "smi.h"
 #include "agent.h"
 
+#include "linux/random.h"
+
+#define MAD_MIN_TIMEOUT_MS 511
+#define MAD_RAND_TIMEOUT_MS 511
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("kernel IB MAD API");
 MODULE_AUTHOR("Hal Rosenstock");
@@ -54,6 +59,14 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
 module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
 
+int mad_wait_on_busy = 0;
+module_param_named(treat_busy_as_timeout, mad_wait_on_busy, int, 0444);
+MODULE_PARM_DESC(treat_busy_as_timeout, "When true, treat BUSY responses as if they were timeouts.");
+
+int mad_randomized_wait = 0;
+module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
+MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
+
 static struct kmem_cache *ib_mad_cache;
 
 static struct list_head ib_mad_port_list;
@@ -1116,11 +1129,19 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 		}
 
 		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+
+		mad_send_wr->randomized_wait = mad_randomized_wait || send_buf->randomized_wait;
+		mad_send_wr->total_timeout = msecs_to_jiffies(send_buf->timeout_ms) * send_buf->retries;
+		
 		/* Timeout will be updated after send completes */
 		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+
 		mad_send_wr->max_retries = send_buf->retries;
 		mad_send_wr->retries_left = send_buf->retries;
+		mad_send_wr->wait_on_busy = send_buf->wait_on_busy || mad_wait_on_busy;
+		
 		send_buf->retries = 0;
+		
 		/* Reference for work request to QP + response */
 		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
 		mad_send_wr->status = IB_WC_SUCCESS;
@@ -1828,6 +1849,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 
 	/* Complete corresponding request */
 	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+		u16 busy = __be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.status) &
+						IB_MGMT_MAD_STATUS_BUSY;
+
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
 		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
 		if (!mad_send_wr) {
@@ -1836,6 +1860,18 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 			deref_mad_agent(mad_agent_priv);
 			return;
 		}
+
+		printk(KERN_DEBUG PFX "Completing recv %p: busy = %d, retries_left = %d, wait_on_busy = %d\n",
+			mad_send_wr, busy, mad_send_wr->retries_left, mad_send_wr->wait_on_busy);
+		if (busy && mad_send_wr->retries_left && mad_send_wr->wait_on_busy) {
+			/* Just let the query timeout and have it requeued later */
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			ib_free_recv_mad(mad_recv_wc);
+			deref_mad_agent(mad_agent_priv);
+			printk(KERN_INFO PFX "SA/SM responded MAD_STATUS_BUSY. Allowing request to time out.\n");
+			return;
+		}
+
 		ib_mark_mad_done(mad_send_wr);
 		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
@@ -2445,14 +2481,33 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	int ret;
 
-	if (!mad_send_wr->retries_left)
+	if (!mad_send_wr->retries_left || (mad_send_wr->total_timeout == 0))
 		return -ETIMEDOUT;
 
 	mad_send_wr->retries_left--;
 	mad_send_wr->send_buf.retries++;
 
-	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+	if (mad_send_wr->randomized_wait) {
+		mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms +
+												(MAD_MIN_TIMEOUT_MS<<mad_send_wr->send_buf.retries) -
+												(random32()&MAD_RAND_TIMEOUT_MS));
+		if (mad_send_wr->timeout > mad_send_wr->total_timeout) {
+			mad_send_wr->timeout = mad_send_wr->total_timeout;
+			mad_send_wr->total_timeout = 0;
+		} else {
+			mad_send_wr->total_timeout -= mad_send_wr->timeout;
+		}
+	} else {
+		mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+	}
 
+	printk(KERN_DEBUG PFX "Retrying send %p: retries: %u, retries_left: %u, timeout: %lu, total_timeout: %lu\n",
+		mad_send_wr,
+		mad_send_wr->send_buf.retries,
+		mad_send_wr->retries_left,
+		mad_send_wr->timeout,
+		mad_send_wr->total_timeout);
+		
 	if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
 		ret = ib_retry_rmpp(mad_send_wr);
 		switch (ret) {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 8b4df0a..7b9ea2a 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -133,8 +133,11 @@ struct ib_mad_send_wr_private {
 	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
 	__be64 tid;
 	unsigned long timeout;
+	unsigned long total_timeout;
 	int max_retries;
 	int retries_left;
+	int wait_on_busy;
+	int randomized_wait;
 	int retry;
 	int refcount;
 	enum ib_wc_status status;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index d3b9401..3da55c3 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -77,6 +77,15 @@
 
 #define IB_MGMT_MAX_METHODS			128
 
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS						0x0000
+#define IB_MGMT_MAD_STATUS_BUSY							0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD				0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERERSION				0x0004	
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD			0x0008	
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB	0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE			0x001c
+
 /* RMPP information */
 #define IB_MGMT_RMPP_VERSION			1
 
@@ -246,6 +255,8 @@ struct ib_mad_send_buf {
 	int			seg_count;
 	int			seg_size;
 	int			timeout_ms;
+	int			wait_on_busy;
+	int			randomized_wait;
 	int			retries;
 };


-------------- next part --------------
A non-text attachment was scrubbed...
Name: core_600_busy-v2.patch
Type: application/octet-stream
Size: 5916 bytes
Desc: core_600_busy-v2.patch
URL: <http://lists.openfabrics.org/pipermail/ewg/attachments/20101008/341f8f13/attachment.obj>