[ofa-general] [PATCH] sdp: make interrupt moderation adaptive

Amir Vadai amirv at mellanox.co.il
Mon Jun 22 01:18:48 PDT 2009


Make HW RC interrupt moderation adaptive.
For latency-sensitive traffic, apply no moderation.
In bandwidth (BW) mode, apply maximal moderation.
Use heuristics to detect the current mode.

Signed-off-by: Amir Vadai <amirv at mellanox.co.il>
---
 drivers/infiniband/ulp/sdp/sdp.h      |   42 ++++++++
 drivers/infiniband/ulp/sdp/sdp_cma.c  |    3 +
 drivers/infiniband/ulp/sdp/sdp_main.c |  187 +++++++++++++++++++++++++++------
 drivers/infiniband/ulp/sdp/sdp_rx.c   |   40 ++------
 drivers/infiniband/ulp/sdp/sdp_tx.c   |    7 +-
 5 files changed, 213 insertions(+), 66 deletions(-)
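For reference, below is a minimal user-space sketch of the rate-to-moderation-time
mapping that the balanced-traffic branch of sdp_auto_moderation() applies in this
patch. It is illustrative only and not part of the patch; rate_to_moder_time() is
a made-up name, and the constants simply mirror the default module parameter
values added in sdp_main.c.

#include <stdio.h>

/* Defaults taken from the module parameters added in sdp_main.c */
#define PKT_RATE_LOW    80000   /* sdp_rx_rate_low  (packets/sec) */
#define PKT_RATE_HIGH  100000   /* sdp_rx_rate_high (packets/sec) */
#define RX_USECS_LOW        0   /* sdp_rx_coal_time_low  (usec)   */
#define RX_USECS_HIGH     128   /* sdp_rx_coal_time_high (usec)   */

/* Map an observed packet rate to a CQ moderation time by linear
 * interpolation between the low and high settings, as done in the
 * balanced-traffic branch of sdp_auto_moderation(). */
static int rate_to_moder_time(unsigned long rate)
{
	if (rate < PKT_RATE_LOW)
		return RX_USECS_LOW;
	if (rate > PKT_RATE_HIGH)
		return RX_USECS_HIGH;
	return (rate - PKT_RATE_LOW) * (RX_USECS_HIGH - RX_USECS_LOW) /
	       (PKT_RATE_HIGH - PKT_RATE_LOW) + RX_USECS_LOW;
}

int main(void)
{
	unsigned long rates[] = { 50000, 85000, 90000, 120000 };
	int i;

	for (i = 0; i < 4; i++)
		printf("rate %lu pkt/s -> moder_time %d usec\n",
		       rates[i], rate_to_moder_time(rates[i]));
	return 0;
}

E.g. a balanced 90000 pkt/s flow lands halfway between the two rate thresholds
and gets 64 usec of moderation, while flows whose tx/rx packet counts are
heavily imbalanced skip the interpolation entirely and get the maximal setting.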

diff --git a/drivers/infiniband/ulp/sdp/sdp.h b/drivers/infiniband/ulp/sdp/sdp.h
index 5d3e7d7..cc38f5b 100644
--- a/drivers/infiniband/ulp/sdp/sdp.h
+++ b/drivers/infiniband/ulp/sdp/sdp.h
@@ -37,6 +37,16 @@
 	spin_unlock_irqrestore(&ssk->rx_ring.lock, f); \
 } while (0)
 
+#define SDP_MODPARAM_SINT(var, def_val, msg) \
+	static int var = def_val; \
+	module_param_named(var, var, int, 0644); \
+	MODULE_PARM_DESC(var, msg " [" #def_val "]");
+
+#define SDP_MODPARAM_INT(var, def_val, msg) \
+	int var = def_val; \
+	module_param_named(var, var, int, 0644); \
+	MODULE_PARM_DESC(var, msg " [" #def_val "]");
+
 #ifdef SDP_PROFILING
 struct sk_buff;
 struct sdpprf_log {
@@ -238,6 +248,9 @@ static inline void sdpstats_hist(u32 *h, u32 val, u32 maxidx, int is_log)
 /* how long (in jiffies) to block sender till tx completion*/
 #define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)
 
+#define SDP_AUTO_CONF	0xffff
+#define AUTO_MOD_DELAY (HZ / 4)
+
 #define BZCOPY_STATE(skb) (*(struct bzcopy_state **)(skb->cb))
 #ifndef MIN
 #define MIN(a, b) (a < b ? a : b)
@@ -372,6 +385,27 @@ struct sdp_chrecvbuf {
 	sdp_do_posts(ssk); \
 })
 
+struct sdp_moderation {
+	unsigned long last_moder_packets;
+	unsigned long last_moder_tx_packets;
+	unsigned long last_moder_bytes;
+	unsigned long last_moder_jiffies;
+	int last_moder_time;
+	u16 rx_usecs;
+	u16 rx_frames;
+	u16 tx_usecs;
+	u32 pkt_rate_low;
+	u16 rx_usecs_low;
+	u32 pkt_rate_high;
+	u16 rx_usecs_high;
+	u16 sample_interval;
+	u16 adaptive_rx_coal;
+	u32 msg_enable;
+
+	int moder_cnt;
+	int moder_time;
+};
+
 struct sdp_sock {
 	/* sk has to be the first member of inet_sock */
 	struct inet_sock isk;
@@ -451,6 +485,12 @@ struct sdp_sock {
 	int recv_frags; 	/* max skb frags in recv packets */
 	int send_frags; 	/* max skb frags in send packets */
 
+	unsigned long tx_packets;
+	unsigned long rx_packets;
+	unsigned long tx_bytes;
+	unsigned long rx_bytes;
+	struct sdp_moderation auto_mod;
+
 	/* BZCOPY data */
 	int   zcopy_thresh;
 };
@@ -568,6 +608,8 @@ void sdp_start_keepalive_timer(struct sock *sk);
 int sdp_init_sock(struct sock *sk);
 int __init sdp_proc_init(void);
 void sdp_proc_unregister(void);
+/* sdp_main.c */
+void sdp_set_default_moderation(struct sdp_sock *ssk);
 
 /* sdp_tx.c */
 int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
diff --git a/drivers/infiniband/ulp/sdp/sdp_cma.c b/drivers/infiniband/ulp/sdp/sdp_cma.c
index 83e39dc..55ade25 100644
--- a/drivers/infiniband/ulp/sdp/sdp_cma.c
+++ b/drivers/infiniband/ulp/sdp/sdp_cma.c
@@ -203,6 +203,7 @@ static int sdp_response_handler(struct sock *sk, struct rdma_cm_id *id,
 	sdp_dbg(sk, "%s\n", __func__);
 
 	sdp_exch_state(sk, TCPF_SYN_SENT, TCP_ESTABLISHED);
+	sdp_set_default_moderation(sdp_sk(sk));
 
 	if (sock_flag(sk, SOCK_KEEPOPEN))
 		sdp_start_keepalive_timer(sk);
@@ -249,6 +250,8 @@ static int sdp_connected_handler(struct sock *sk, struct rdma_cm_event *event)
 
 	sdp_exch_state(sk, TCPF_SYN_RECV, TCP_ESTABLISHED);
 
+	sdp_set_default_moderation(sdp_sk(sk));
+
 	if (sock_flag(sk, SOCK_KEEPOPEN))
 		sdp_start_keepalive_timer(sk);
 
diff --git a/drivers/infiniband/ulp/sdp/sdp_main.c b/drivers/infiniband/ulp/sdp/sdp_main.c
index 64511fe..b34743d 100644
--- a/drivers/infiniband/ulp/sdp/sdp_main.c
+++ b/drivers/infiniband/ulp/sdp/sdp_main.c
@@ -79,41 +79,33 @@ MODULE_DESCRIPTION("InfiniBand SDP module");
 MODULE_LICENSE("Dual BSD/GPL");
 
 #ifdef CONFIG_INFINIBAND_SDP_DEBUG
-int sdp_debug_level;
-
-module_param_named(debug_level, sdp_debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0.");
+SDP_MODPARAM_INT(sdp_debug_level, 0, "Enable debug tracing if > 0.");
 #endif
 #ifdef CONFIG_INFINIBAND_SDP_DEBUG
-int sdp_data_debug_level;
-
-module_param_named(data_debug_level, sdp_data_debug_level, int, 0644);
-MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0.");
+SDP_MODPARAM_INT(sdp_data_debug_level, 0, "Enable data path debug tracing if > 0.");
 #endif
 
-static int recv_poll_hit;
-
-module_param_named(recv_poll_hit, recv_poll_hit, int, 0644);
-MODULE_PARM_DESC(recv_poll_hit, "How many times recv poll helped.");
-
-static int recv_poll_miss;
-
-module_param_named(recv_poll_miss, recv_poll_miss, int, 0644);
-MODULE_PARM_DESC(recv_poll_miss, "How many times recv poll missed.");
-
-static int recv_poll = 1000;
-
-module_param_named(recv_poll, recv_poll, int, 0644);
-MODULE_PARM_DESC(recv_poll, "How many times to poll recv.");
-
-static unsigned int sdp_keepalive_time = SDP_KEEPALIVE_TIME;
-
-module_param_named(sdp_keepalive_time, sdp_keepalive_time, uint, 0644);
-MODULE_PARM_DESC(sdp_keepalive_time, "Default idle time in seconds before keepalive probe sent.");
-
-static int sdp_zcopy_thresh = 65536;
-module_param_named(sdp_zcopy_thresh, sdp_zcopy_thresh, int, 0644);
-MODULE_PARM_DESC(sdp_zcopy_thresh, "Zero copy send threshold; 0=0ff.");
+SDP_MODPARAM_SINT(recv_poll_hit, -1, "How many times recv poll helped.");
+SDP_MODPARAM_SINT(recv_poll_miss, -1, "How many times recv poll missed.");
+SDP_MODPARAM_SINT(recv_poll, 1000, "How many times to poll recv.");
+SDP_MODPARAM_SINT(sdp_keepalive_time, SDP_KEEPALIVE_TIME,
+	"Default idle time in seconds before keepalive probe sent.");
+SDP_MODPARAM_SINT(sdp_zcopy_thresh, 65536, "Zero copy send threshold; 0=off.");
+
+#define SDP_RX_COAL_TIME_HIGH 128
+SDP_MODPARAM_SINT(sdp_rx_coal_target, 0x50000,
+	"Target number of bytes to coalesce with interrupt moderation (bytes).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time, 0x10, "rx coal time (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_low, 80000, "rx_rate low (packets/sec).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time_low, 0, "low moderation time val (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_high, 100000, "rx_rate high (packets/sec).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time_high, 128, "high moderation time val (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_thresh, (200000 / SDP_RX_COAL_TIME_HIGH),
+	"rx rate thresh ().");
+SDP_MODPARAM_SINT(sdp_sample_interval, (HZ / 4), "sample interval (jiffies).");
+
+SDP_MODPARAM_INT(hw_int_mod_count, -1, "forced hw int moderation val. -1 for auto (packets).");
+SDP_MODPARAM_INT(hw_int_mod_usec, -1, "forced hw int moderation val. -1 for auto (usec).");
 
 struct workqueue_struct *sdp_wq;
 struct workqueue_struct *rx_comp_wq;
@@ -298,6 +290,135 @@ void sdp_start_keepalive_timer(struct sock *sk)
 	sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(sdp_sk(sk)));
 }
 
+void sdp_set_default_moderation(struct sdp_sock *ssk)
+{
+	struct sdp_moderation *mod = &ssk->auto_mod;
+	int rx_buf_size;
+
+	if (hw_int_mod_count > -1 || hw_int_mod_usec > -1) {
+		int err;
+
+		mod->adaptive_rx_coal = 0;
+
+		if (hw_int_mod_count > 0 && hw_int_mod_usec > 0) {
+			err = ib_modify_cq(ssk->rx_ring.cq, hw_int_mod_count, hw_int_mod_usec);
+			if (err)
+				sdp_warn(&ssk->isk.sk, "Failed modifying moderation for cq\n");
+			else
+				sdp_dbg(&ssk->isk.sk, "Using fixed interrupt moderation\n");
+		}
+		return;
+	}
+
+	mod->adaptive_rx_coal = 1;
+	sdp_dbg(&ssk->isk.sk, "Using adaptive interrupt moderation\n");
+
+	/* If we haven't received a specific coalescing setting
+	 * (module param), we set the moderation parameters as follows:
+	 * - moder_cnt is set to the number of MTU-sized packets needed to
+	 *   satisfy our coalescing target.
+	 * - moder_time is set to a fixed value.
+	 */
+	rx_buf_size = (ssk->recv_frags * PAGE_SIZE) + sizeof(struct sdp_bsdh);
+	mod->moder_cnt = sdp_rx_coal_target / rx_buf_size + 1;
+	mod->moder_time = sdp_rx_coal_time;
+	sdp_dbg(&ssk->isk.sk, "Default coalesing params for buf size:%d - "
+			     "moder_cnt:%d moder_time:%d\n",
+		 rx_buf_size, mod->moder_cnt, mod->moder_time);
+
+	/* Reset auto-moderation params */
+	mod->pkt_rate_low = sdp_rx_rate_low;
+	mod->rx_usecs_low = sdp_rx_coal_time_low;
+	mod->pkt_rate_high = sdp_rx_rate_high;
+	mod->rx_usecs_high = sdp_rx_coal_time_high;
+	mod->sample_interval = sdp_sample_interval;
+
+	mod->last_moder_time = SDP_AUTO_CONF;
+	mod->last_moder_jiffies = 0;
+	mod->last_moder_packets = 0;
+	mod->last_moder_tx_packets = 0;
+	mod->last_moder_bytes = 0;
+}
+
+static void sdp_auto_moderation(struct sdp_sock *ssk)
+{
+	struct sdp_moderation *mod = &ssk->auto_mod;
+
+	unsigned long period = (unsigned long) (jiffies - mod->last_moder_jiffies);
+	unsigned long packets;
+	unsigned long rate;
+	unsigned long avg_pkt_size;
+	unsigned long tx_pkt_diff;
+	unsigned long rx_pkt_diff;
+	int moder_time;
+	int err;
+
+	if (!mod->adaptive_rx_coal)
+		return;
+
+	if (period < mod->sample_interval)
+		return;
+
+	if (!mod->last_moder_jiffies || !period)
+		goto out;
+
+	tx_pkt_diff = ((unsigned long) (ssk->tx_packets -
+					mod->last_moder_tx_packets));
+	rx_pkt_diff = ((unsigned long) (ssk->rx_packets -
+					mod->last_moder_packets));
+	packets = max(tx_pkt_diff, rx_pkt_diff);
+	rate = packets * HZ / period;
+	avg_pkt_size = packets ? ((unsigned long) (ssk->rx_bytes -
+				 mod->last_moder_bytes)) / packets : 0;
+
+	/* Apply auto-moderation only when the packet rate is high enough
+	 * for it to matter */
+	if (rate > sdp_rx_rate_thresh) {
+		/* If tx and rx packet rates are not balanced, assume that
+		 * traffic is mainly BW bound and apply maximum moderation.
+		 * Otherwise, moderate according to packet rate */
+		if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+		    2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+			moder_time = mod->rx_usecs_high;
+		} else {
+			if (rate < mod->pkt_rate_low) {
+				moder_time = mod->rx_usecs_low;
+			} else if (rate > mod->pkt_rate_high)
+				moder_time = mod->rx_usecs_high;
+			else
+				moder_time = (rate - mod->pkt_rate_low) *
+					(mod->rx_usecs_high - mod->rx_usecs_low) /
+					(mod->pkt_rate_high - mod->pkt_rate_low) +
+					mod->rx_usecs_low;
+		}
+	} else {
+		/* When packet rate is low, use default moderation rather than
+		 * 0 to prevent interrupt storms if traffic suddenly increases */
+		moder_time = mod->moder_time;
+	}
+
+	sdp_dbg_data(&ssk->isk.sk, "tx rate:%lu rx_rate:%lu\n",
+			tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
+
+	sdp_dbg_data(&ssk->isk.sk, "Rx moder_time changed from:%d to %d period:%lu "
+			"[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
+			mod->last_moder_time, moder_time, period, packets,
+			avg_pkt_size, rate);
+
+	if (moder_time != mod->last_moder_time) {
+		mod->last_moder_time = moder_time;
+		err = ib_modify_cq(ssk->rx_ring.cq, mod->moder_cnt, moder_time);
+		if (err)
+			sdp_dbg_data(&ssk->isk.sk, "Failed modifying moderation for cq\n");
+	}
+
+out:
+	mod->last_moder_packets = ssk->rx_packets;
+	mod->last_moder_tx_packets = ssk->tx_packets;
+	mod->last_moder_bytes = ssk->rx_bytes;
+	mod->last_moder_jiffies = jiffies;
+}
+
 void sdp_reset_sk(struct sock *sk, int rc)
 {
 	struct sdp_sock *ssk = sdp_sk(sk);
@@ -1780,6 +1901,8 @@ out:
 
 	posts_handler_put(ssk);
 
+	sdp_auto_moderation(ssk);
+
 	rdtscll(end);
 	SDPSTATS_COUNTER_ADD(sendmsg_sum, end - start);
 	release_sock(sk);
@@ -2050,6 +2173,8 @@ out:
 
 	posts_handler_put(ssk);
 
+	sdp_auto_moderation(ssk);
+
 	release_sock(sk);
 	return err;
 
diff --git a/drivers/infiniband/ulp/sdp/sdp_rx.c b/drivers/infiniband/ulp/sdp/sdp_rx.c
index 1ef28ac..2c33af8 100644
--- a/drivers/infiniband/ulp/sdp/sdp_rx.c
+++ b/drivers/infiniband/ulp/sdp/sdp_rx.c
@@ -37,34 +37,15 @@
 #include <rdma/rdma_cm.h>
 #include "sdp.h"
 
-static int rcvbuf_scale = 0x10;
-
-int rcvbuf_initial_size = 32 * 1024;
-module_param_named(rcvbuf_initial_size, rcvbuf_initial_size, int, 0644);
-MODULE_PARM_DESC(rcvbuf_initial_size, "Receive buffer initial size in bytes.");
-
-module_param_named(rcvbuf_scale, rcvbuf_scale, int, 0644);
-MODULE_PARM_DESC(rcvbuf_scale, "Receive buffer size scale factor.");
-
-static int top_mem_usage = 0;
-module_param_named(top_mem_usage, top_mem_usage, int, 0644);
-MODULE_PARM_DESC(top_mem_usage, "Top system wide sdp memory usage for recv (in MB).");
-
-static int hw_int_mod_count = 10;
-module_param_named(hw_int_mod_count, hw_int_mod_count, int, 0644);
-MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. int count");
-
-static int hw_int_mod_msec = 200;
-module_param_named(hw_int_mod_msec, hw_int_mod_msec, int, 0644);
-MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. mseq");
+SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, "Receive buffer initial size in bytes.");
+SDP_MODPARAM_SINT(rcvbuf_scale, 0x10, "Receive buffer size scale factor.");
+SDP_MODPARAM_SINT(top_mem_usage, 0, "Top system wide sdp memory usage for recv (in MB).");
 
 #ifdef CONFIG_PPC
-static int max_large_sockets = 100;
+SDP_MODPARAM_SINT(max_large_sockets, 100, "Max number of large sockets (32k buffers).");
 #else
-static int max_large_sockets = 1000;
+SDP_MODPARAM_SINT(max_large_sockets, 1000, "Max number of large sockets (32k buffers).");
 #endif
-module_param_named(max_large_sockets, max_large_sockets, int, 0644);
-MODULE_PARM_DESC(max_large_sockets, "Max number of large sockets (32k buffers).");
 
 static int curr_large_sockets = 0;
 atomic_t sdp_current_mem_usage;
@@ -580,6 +561,9 @@ static struct sk_buff *sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
 	SDP_DUMP_PACKET(&ssk->isk.sk, "RX", skb, h);
 	skb_reset_transport_header(skb);
 
+	ssk->rx_packets++;
+	ssk->rx_bytes += skb->len;
+
 	mseq = ntohl(h->mseq);
 	atomic_set(&ssk->mseq_ack, mseq);
 	if (mseq != (int)wc->wr_id)
@@ -823,12 +807,6 @@ int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
 		goto err_cq;
 	}
 
-	rc = ib_modify_cq(rx_cq, hw_int_mod_count, hw_int_mod_msec);
-	if (rc) {
-		sdp_warn(&ssk->isk.sk, "Unable to modify RX CQ: %d.\n", rc);
-		goto err_mod;
-	}
-	sdp_warn(&ssk->isk.sk, "Initialized CQ moderation\n");
 	sdp_sk(&ssk->isk.sk)->rx_ring.cq = rx_cq;
 
 	INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
@@ -837,8 +815,6 @@ int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
 
 	goto out;
 
-err_mod:
-	ib_destroy_cq(rx_cq);
 err_cq:
 	kfree(ssk->rx_ring.buffer);
 	ssk->rx_ring.buffer = NULL;
diff --git a/drivers/infiniband/ulp/sdp/sdp_tx.c b/drivers/infiniband/ulp/sdp/sdp_tx.c
index 9cc37a3..6fc4746 100644
--- a/drivers/infiniband/ulp/sdp/sdp_tx.c
+++ b/drivers/infiniband/ulp/sdp/sdp_tx.c
@@ -38,10 +38,8 @@
 #include "sdp.h"
 
 #define sdp_cnt(var) do { (var)++; } while (0)
-static unsigned sdp_keepalive_probes_sent = 0;
 
-module_param_named(sdp_keepalive_probes_sent, sdp_keepalive_probes_sent, uint, 0644);
-MODULE_PARM_DESC(sdp_keepalive_probes_sent, "Total number of keepalive probes sent.");
+SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0, "Total number of keepalive probes sent.");
 
 static int sdp_process_tx_cq(struct sdp_sock *ssk);
 
@@ -83,6 +81,9 @@ void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb, u8 mid)
 	SDPSTATS_COUNTER_MID_INC(post_send, mid);
 	SDPSTATS_HIST(send_size, skb->len);
 
+	ssk->tx_packets++;
+	ssk->tx_bytes += skb->len;
+
 	h->mid = mid;
 	if (unlikely(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG))
 		h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
-- 
1.5.3.7