[ofa-general] [PATCH] sdp: make interrupt moderation adaptive
Amir Vadai
amirv at mellanox.co.il
Mon Jun 22 01:18:48 PDT 2009
Make HW RC interrupt moderation adaptive.
For latency sensitive traffic - no moderation.
In BW mode - maximal moderation.
Use heuristics to detect mode.
Signed-off-by: Amir Vadai <amirv at mellanox.co.il>
---
drivers/infiniband/ulp/sdp/sdp.h | 42 ++++++++
drivers/infiniband/ulp/sdp/sdp_cma.c | 3 +
drivers/infiniband/ulp/sdp/sdp_main.c | 187 +++++++++++++++++++++++++++------
drivers/infiniband/ulp/sdp/sdp_rx.c | 40 ++------
drivers/infiniband/ulp/sdp/sdp_tx.c | 7 +-
5 files changed, 213 insertions(+), 66 deletions(-)
diff --git a/drivers/infiniband/ulp/sdp/sdp.h b/drivers/infiniband/ulp/sdp/sdp.h
index 5d3e7d7..cc38f5b 100644
--- a/drivers/infiniband/ulp/sdp/sdp.h
+++ b/drivers/infiniband/ulp/sdp/sdp.h
@@ -37,6 +37,16 @@
spin_unlock_irqrestore(&ssk->rx_ring.lock, f); \
} while (0)
+/* File-local int module param (mode 0644); desc ends with " [default]". Caller adds the ';'. */
+#define SDP_MODPARAM_SINT(var, def_val, msg) \
+	static int var = def_val; \
+	module_param_named(var, var, int, 0644); \
+	MODULE_PARM_DESC(var, msg " [" __stringify(def_val) "]")
+/* Same as SDP_MODPARAM_SINT, but the variable is not static (visible to other files). */
+#define SDP_MODPARAM_INT(var, def_val, msg) \
+	int var = def_val; \
+	module_param_named(var, var, int, 0644); \
+	MODULE_PARM_DESC(var, msg " [" __stringify(def_val) "]")
#ifdef SDP_PROFILING
struct sk_buff;
struct sdpprf_log {
@@ -238,6 +248,9 @@ static inline void sdpstats_hist(u32 *h, u32 val, u32 maxidx, int is_log)
/* how long (in jiffies) to block sender till tx completion*/
#define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)
+#define SDP_AUTO_CONF 0xffff /* initial last_moder_time, so the first computed moderation time normally differs and gets programmed */
+#define AUTO_MOD_DELAY (HZ / 4) /* NOTE(review): not referenced anywhere in this patch - confirm it is needed */
+
#define BZCOPY_STATE(skb) (*(struct bzcopy_state **)(skb->cb))
#ifndef MIN
#define MIN(a, b) (a < b ? a : b)
@@ -372,6 +385,27 @@ struct sdp_chrecvbuf {
sdp_do_posts(ssk); \
})
+struct sdp_moderation { /* per-socket state for adaptive RX CQ interrupt moderation */
+ unsigned long last_moder_packets; /* ssk->rx_packets at the previous sample */
+ unsigned long last_moder_tx_packets; /* ssk->tx_packets at the previous sample */
+ unsigned long last_moder_bytes; /* ssk->rx_bytes at the previous sample */
+ unsigned long last_moder_jiffies; /* jiffies at the previous sample; 0 = no sample yet */
+ int last_moder_time; /* moderation time last chosen; starts as SDP_AUTO_CONF */
+ u16 rx_usecs; /* NOTE(review): not referenced in this patch */
+ u16 rx_frames; /* NOTE(review): not referenced in this patch */
+ u16 tx_usecs; /* NOTE(review): not referenced in this patch */
+ u32 pkt_rate_low; /* below this rate (packets/sec) rx_usecs_low is used */
+ u16 rx_usecs_low; /* moderation time for rates below pkt_rate_low */
+ u32 pkt_rate_high; /* above this rate (packets/sec) rx_usecs_high is used */
+ u16 rx_usecs_high; /* moderation time for high-rate or unbalanced tx/rx traffic */
+ u16 sample_interval; /* minimum jiffies between two samples */
+ u16 adaptive_rx_coal; /* non-zero when adaptive moderation is enabled */
+ u32 msg_enable; /* NOTE(review): not referenced in this patch */
+ /* Defaults computed in sdp_set_default_moderation(): */
+ int moder_cnt; /* count argument passed to ib_modify_cq() */
+ int moder_time; /* moderation time used while the packet rate is at or below sdp_rx_rate_thresh */
+};
+
struct sdp_sock {
/* sk has to be the first member of inet_sock */
struct inet_sock isk;
@@ -451,6 +485,12 @@ struct sdp_sock {
int recv_frags; /* max skb frags in recv packets */
int send_frags; /* max skb frags in send packets */
+ unsigned long tx_packets;
+ unsigned long rx_packets;
+ unsigned long tx_bytes;
+ unsigned long rx_bytes;
+ struct sdp_moderation auto_mod;
+
/* BZCOPY data */
int zcopy_thresh;
};
@@ -568,6 +608,8 @@ void sdp_start_keepalive_timer(struct sock *sk);
int sdp_init_sock(struct sock *sk);
int __init sdp_proc_init(void);
void sdp_proc_unregister(void);
+/* sdp_main.c */
+void sdp_set_default_moderation(struct sdp_sock *ssk);
/* sdp_tx.c */
int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
diff --git a/drivers/infiniband/ulp/sdp/sdp_cma.c b/drivers/infiniband/ulp/sdp/sdp_cma.c
index 83e39dc..55ade25 100644
--- a/drivers/infiniband/ulp/sdp/sdp_cma.c
+++ b/drivers/infiniband/ulp/sdp/sdp_cma.c
@@ -203,6 +203,7 @@ static int sdp_response_handler(struct sock *sk, struct rdma_cm_id *id,
sdp_dbg(sk, "%s\n", __func__);
sdp_exch_state(sk, TCPF_SYN_SENT, TCP_ESTABLISHED);
+ sdp_set_default_moderation(sdp_sk(sk));
if (sock_flag(sk, SOCK_KEEPOPEN))
sdp_start_keepalive_timer(sk);
@@ -249,6 +250,8 @@ static int sdp_connected_handler(struct sock *sk, struct rdma_cm_event *event)
sdp_exch_state(sk, TCPF_SYN_RECV, TCP_ESTABLISHED);
+ sdp_set_default_moderation(sdp_sk(sk));
+
if (sock_flag(sk, SOCK_KEEPOPEN))
sdp_start_keepalive_timer(sk);
diff --git a/drivers/infiniband/ulp/sdp/sdp_main.c b/drivers/infiniband/ulp/sdp/sdp_main.c
index 64511fe..b34743d 100644
--- a/drivers/infiniband/ulp/sdp/sdp_main.c
+++ b/drivers/infiniband/ulp/sdp/sdp_main.c
@@ -79,41 +79,33 @@ MODULE_DESCRIPTION("InfiniBand SDP module");
MODULE_LICENSE("Dual BSD/GPL");
#ifdef CONFIG_INFINIBAND_SDP_DEBUG
-int sdp_debug_level;
-
-module_param_named(debug_level, sdp_debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0.");
+SDP_MODPARAM_INT(sdp_debug_level, 0, "Enable debug tracing if > 0.");
#endif
#ifdef CONFIG_INFINIBAND_SDP_DEBUG
-int sdp_data_debug_level;
-
-module_param_named(data_debug_level, sdp_data_debug_level, int, 0644);
-MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0.");
+SDP_MODPARAM_INT(sdp_data_debug_level, 0, "Enable data path debug tracing if > 0.");
#endif
-static int recv_poll_hit;
-
-module_param_named(recv_poll_hit, recv_poll_hit, int, 0644);
-MODULE_PARM_DESC(recv_poll_hit, "How many times recv poll helped.");
-
-static int recv_poll_miss;
-
-module_param_named(recv_poll_miss, recv_poll_miss, int, 0644);
-MODULE_PARM_DESC(recv_poll_miss, "How many times recv poll missed.");
-
-static int recv_poll = 1000;
-
-module_param_named(recv_poll, recv_poll, int, 0644);
-MODULE_PARM_DESC(recv_poll, "How many times to poll recv.");
-
-static unsigned int sdp_keepalive_time = SDP_KEEPALIVE_TIME;
-
-module_param_named(sdp_keepalive_time, sdp_keepalive_time, uint, 0644);
-MODULE_PARM_DESC(sdp_keepalive_time, "Default idle time in seconds before keepalive probe sent.");
-
-static int sdp_zcopy_thresh = 65536;
-module_param_named(sdp_zcopy_thresh, sdp_zcopy_thresh, int, 0644);
-MODULE_PARM_DESC(sdp_zcopy_thresh, "Zero copy send threshold; 0=0ff.");
+SDP_MODPARAM_SINT(recv_poll_hit, -1, "How many times recv poll helped.");
+SDP_MODPARAM_SINT(recv_poll_miss, -1, "How many times recv poll missed.");
+SDP_MODPARAM_SINT(recv_poll, 1000, "How many times to poll recv.");
+SDP_MODPARAM_SINT(sdp_keepalive_time, SDP_KEEPALIVE_TIME,
+	"Default idle time in seconds before keepalive probe sent.");
+SDP_MODPARAM_SINT(sdp_zcopy_thresh, 65536, "Zero copy send threshold; 0=off.");
+/* Adaptive RX moderation tunables, read by sdp_set_default_moderation() and sdp_auto_moderation(). */
+#define SDP_RX_COAL_TIME_HIGH 128
+SDP_MODPARAM_SINT(sdp_rx_coal_target, 0x50000,
+	"Target number of bytes to coalesce with interrupt moderation (bytes).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time, 0x10, "rx coal time (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_low, 80000, "rx_rate low (packets/sec).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time_low, 0, "low moderation time val (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_high, 100000, "rx_rate high (packets/sec).");
+SDP_MODPARAM_SINT(sdp_rx_coal_time_high, 128, "high moderation time val (usec).");
+SDP_MODPARAM_SINT(sdp_rx_rate_thresh, (200000 / SDP_RX_COAL_TIME_HIGH),
+	"rx rate thresh (packets/sec).");
+SDP_MODPARAM_SINT(sdp_sample_interval, (HZ / 4), "sample interval (jiffies).");
+/* Fixed moderation override: setting either value > -1 disables adaptive moderation. */
+SDP_MODPARAM_INT(hw_int_mod_count, -1, "forced hw int moderation val. -1 for auto (packets).");
+SDP_MODPARAM_INT(hw_int_mod_usec, -1, "forced hw int moderation val. -1 for auto (usec).");
struct workqueue_struct *sdp_wq;
struct workqueue_struct *rx_comp_wq;
@@ -298,6 +290,135 @@ void sdp_start_keepalive_timer(struct sock *sk)
sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(sdp_sk(sk)));
}
+/*
+ * Choose the RX CQ interrupt moderation policy for a socket. If either
+ * hw_int_mod_count or hw_int_mod_usec is set (> -1), adaptive moderation is
+ * turned off and the fixed values are programmed, but only when both are > 0.
+ * Otherwise the adaptive state in ssk->auto_mod is reset to its defaults.
+ */
+void sdp_set_default_moderation(struct sdp_sock *ssk)
+{
+	struct sdp_moderation *mod = &ssk->auto_mod;
+	int rx_buf_size;
+
+	if (hw_int_mod_count > -1 || hw_int_mod_usec > -1) {
+		mod->adaptive_rx_coal = 0;
+
+		if (hw_int_mod_count > 0 && hw_int_mod_usec > 0) {
+			if (ib_modify_cq(ssk->rx_ring.cq, hw_int_mod_count, hw_int_mod_usec))
+				sdp_warn(&ssk->isk.sk, "Failed modifying moderation for cq\n");
+			else
+				sdp_dbg(&ssk->isk.sk, "Using fixed interrupt moderation\n");
+		}
+		return;
+	}
+
+	mod->adaptive_rx_coal = 1;
+	sdp_dbg(&ssk->isk.sk, "Using adaptive interrupt moderation\n");
+
+	/* No specific coalescing setting was given (module param), so:
+	 * - moder_cnt is the number of receive buffers covering the coalescing target
+	 * - moder_time is the fixed value from sdp_rx_coal_time */
+	rx_buf_size = (ssk->recv_frags * PAGE_SIZE) + sizeof(struct sdp_bsdh);
+	mod->moder_cnt = sdp_rx_coal_target / rx_buf_size + 1;
+	mod->moder_time = sdp_rx_coal_time;
+	sdp_dbg(&ssk->isk.sk, "Default coalesing params for buf size:%d - "
+		"moder_cnt:%d moder_time:%d\n",
+		rx_buf_size, mod->moder_cnt, mod->moder_time);
+
+	/* Reset auto-moderation params */
+	mod->pkt_rate_low = sdp_rx_rate_low;
+	mod->rx_usecs_low = sdp_rx_coal_time_low;
+	mod->pkt_rate_high = sdp_rx_rate_high;
+	mod->rx_usecs_high = sdp_rx_coal_time_high;
+	mod->sample_interval = sdp_sample_interval;
+
+	mod->last_moder_time = SDP_AUTO_CONF;
+	mod->last_moder_jiffies = 0;
+	mod->last_moder_packets = 0;
+	mod->last_moder_tx_packets = 0;
+	mod->last_moder_bytes = 0;
+}
+
+/*
+ * Re-tune the RX CQ interrupt moderation time from the packet rates seen since
+ * the previous sample. Does nothing unless adaptive moderation is enabled and
+ * at least sample_interval jiffies have passed; the first sample after a reset
+ * (last_moder_jiffies == 0) only records a baseline.
+ */
+static void sdp_auto_moderation(struct sdp_sock *ssk)
+{
+	struct sdp_moderation *mod = &ssk->auto_mod;
+	unsigned long period = jiffies - mod->last_moder_jiffies;
+	unsigned long packets, rate, avg_pkt_size;
+	unsigned long tx_pkt_diff, rx_pkt_diff;
+	int moder_time;
+	int err;
+
+	if (!mod->adaptive_rx_coal || period < mod->sample_interval)
+		return;
+
+	if (!mod->last_moder_jiffies || !period)
+		goto out;
+
+	tx_pkt_diff = ssk->tx_packets - mod->last_moder_tx_packets;
+	rx_pkt_diff = ssk->rx_packets - mod->last_moder_packets;
+	packets = max(tx_pkt_diff, rx_pkt_diff);
+	rate = packets * HZ / period;
+	avg_pkt_size = packets ?
+		(ssk->rx_bytes - mod->last_moder_bytes) / packets : 0;
+
+	/* Apply auto-moderation only when packet rate exceeds a rate that
+	 * it matters */
+	if (rate > sdp_rx_rate_thresh) {
+		/* If tx and rx packet rates are not balanced, assume that
+		 * traffic is mainly BW bound and apply maximum moderation.
+		 * Otherwise, moderate according to packet rate */
+		if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+		    2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+			moder_time = mod->rx_usecs_high;
+		} else if (rate < mod->pkt_rate_low) {
+			moder_time = mod->rx_usecs_low;
+		} else if (rate > mod->pkt_rate_high) {
+			moder_time = mod->rx_usecs_high;
+		} else if (mod->pkt_rate_high == mod->pkt_rate_low) {
+			/* Degenerate range: the division below would be by zero */
+			moder_time = mod->rx_usecs_low;
+		} else {
+			/* Scale linearly between the low and high settings */
+			moder_time = (rate - mod->pkt_rate_low) *
+				(mod->rx_usecs_high - mod->rx_usecs_low) /
+				(mod->pkt_rate_high - mod->pkt_rate_low) +
+				mod->rx_usecs_low;
+		}
+	} else {
+		/* When packet rate is low, use default moderation rather than
+		 * 0 to prevent interrupt storms if traffic suddenly increases */
+		moder_time = mod->moder_time;
+	}
+
+	sdp_dbg_data(&ssk->isk.sk, "tx rate:%lu rx_rate:%lu\n",
+		tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
+
+	sdp_dbg_data(&ssk->isk.sk, "Rx moder_time changed from:%d to %d period:%lu "
+		"[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
+		mod->last_moder_time, moder_time, period, packets,
+		avg_pkt_size, rate);
+
+	if (moder_time != mod->last_moder_time) {
+		mod->last_moder_time = moder_time;
+		err = ib_modify_cq(ssk->rx_ring.cq, mod->moder_cnt, moder_time);
+		if (err)
+			sdp_dbg_data(&ssk->isk.sk, "Failed modifying moderation for cq\n");
+	}
+
+out:
+	mod->last_moder_packets = ssk->rx_packets;
+	mod->last_moder_tx_packets = ssk->tx_packets;
+	mod->last_moder_bytes = ssk->rx_bytes;
+	mod->last_moder_jiffies = jiffies;
+}
+
void sdp_reset_sk(struct sock *sk, int rc)
{
struct sdp_sock *ssk = sdp_sk(sk);
@@ -1780,6 +1901,8 @@ out:
posts_handler_put(ssk);
+ sdp_auto_moderation(ssk);
+
rdtscll(end);
SDPSTATS_COUNTER_ADD(sendmsg_sum, end - start);
release_sock(sk);
@@ -2050,6 +2173,8 @@ out:
posts_handler_put(ssk);
+ sdp_auto_moderation(ssk);
+
release_sock(sk);
return err;
diff --git a/drivers/infiniband/ulp/sdp/sdp_rx.c b/drivers/infiniband/ulp/sdp/sdp_rx.c
index 1ef28ac..2c33af8 100644
--- a/drivers/infiniband/ulp/sdp/sdp_rx.c
+++ b/drivers/infiniband/ulp/sdp/sdp_rx.c
@@ -37,34 +37,15 @@
#include <rdma/rdma_cm.h>
#include "sdp.h"
-static int rcvbuf_scale = 0x10;
-
-int rcvbuf_initial_size = 32 * 1024;
-module_param_named(rcvbuf_initial_size, rcvbuf_initial_size, int, 0644);
-MODULE_PARM_DESC(rcvbuf_initial_size, "Receive buffer initial size in bytes.");
-
-module_param_named(rcvbuf_scale, rcvbuf_scale, int, 0644);
-MODULE_PARM_DESC(rcvbuf_scale, "Receive buffer size scale factor.");
-
-static int top_mem_usage = 0;
-module_param_named(top_mem_usage, top_mem_usage, int, 0644);
-MODULE_PARM_DESC(top_mem_usage, "Top system wide sdp memory usage for recv (in MB).");
-
-static int hw_int_mod_count = 10;
-module_param_named(hw_int_mod_count, hw_int_mod_count, int, 0644);
-MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. int count");
-
-static int hw_int_mod_msec = 200;
-module_param_named(hw_int_mod_msec, hw_int_mod_msec, int, 0644);
-MODULE_PARM_DESC(hw_int_mod_count, "HW interrupt moderation. mseq");
+SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, "Receive buffer initial size in bytes.");
+SDP_MODPARAM_SINT(rcvbuf_scale, 0x10, "Receive buffer size scale factor.");
+SDP_MODPARAM_SINT(top_mem_usage, 0, "Top system wide sdp memory usage for recv (in MB).");
#ifdef CONFIG_PPC
-static int max_large_sockets = 100;
+SDP_MODPARAM_SINT(max_large_sockets, 100, "Max number of large sockets (32k buffers).");
#else
-static int max_large_sockets = 1000;
+SDP_MODPARAM_SINT(max_large_sockets, 1000, "Max number of large sockets (32k buffers).");
#endif
-module_param_named(max_large_sockets, max_large_sockets, int, 0644);
-MODULE_PARM_DESC(max_large_sockets, "Max number of large sockets (32k buffers).");
static int curr_large_sockets = 0;
atomic_t sdp_current_mem_usage;
@@ -580,6 +561,9 @@ static struct sk_buff *sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
SDP_DUMP_PACKET(&ssk->isk.sk, "RX", skb, h);
skb_reset_transport_header(skb);
+ ssk->rx_packets++;
+ ssk->rx_bytes += skb->len;
+
mseq = ntohl(h->mseq);
atomic_set(&ssk->mseq_ack, mseq);
if (mseq != (int)wc->wr_id)
@@ -823,12 +807,6 @@ int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
goto err_cq;
}
- rc = ib_modify_cq(rx_cq, hw_int_mod_count, hw_int_mod_msec);
- if (rc) {
- sdp_warn(&ssk->isk.sk, "Unable to modify RX CQ: %d.\n", rc);
- goto err_mod;
- }
- sdp_warn(&ssk->isk.sk, "Initialized CQ moderation\n");
sdp_sk(&ssk->isk.sk)->rx_ring.cq = rx_cq;
INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
@@ -837,8 +815,6 @@ int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
goto out;
-err_mod:
- ib_destroy_cq(rx_cq);
err_cq:
kfree(ssk->rx_ring.buffer);
ssk->rx_ring.buffer = NULL;
diff --git a/drivers/infiniband/ulp/sdp/sdp_tx.c b/drivers/infiniband/ulp/sdp/sdp_tx.c
index 9cc37a3..6fc4746 100644
--- a/drivers/infiniband/ulp/sdp/sdp_tx.c
+++ b/drivers/infiniband/ulp/sdp/sdp_tx.c
@@ -38,10 +38,8 @@
#include "sdp.h"
#define sdp_cnt(var) do { (var)++; } while (0)
-static unsigned sdp_keepalive_probes_sent = 0;
-module_param_named(sdp_keepalive_probes_sent, sdp_keepalive_probes_sent, uint, 0644);
-MODULE_PARM_DESC(sdp_keepalive_probes_sent, "Total number of keepalive probes sent.");
+SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0, "Total number of keepalive probes sent.");
static int sdp_process_tx_cq(struct sdp_sock *ssk);
@@ -83,6 +81,9 @@ void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb, u8 mid)
SDPSTATS_COUNTER_MID_INC(post_send, mid);
SDPSTATS_HIST(send_size, skb->len);
+ ssk->tx_packets++;
+ ssk->tx_bytes += skb->len;
+
h->mid = mid;
if (unlikely(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG))
h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
--
1.5.3.7
More information about the general
mailing list