[ofa-general] [PATCH V1 1/2] sdp: add KEEPALIVE support

Jim Mott jim at mellanox.com
Tue Jul 31 05:07:00 PDT 2007


Hi,
  This is the kernel part of an OFED 1.3 patch to add keepalive support to
SDP.  There are a couple of things to highlight.

1) No specific 'active' bit
  Instead of setting or clearing some bit on every send or receive, this
code just remembers the TX and RX heads every time the keepalive timer
pops.  If they are the same on this pop as they were on the last pop, then
the probe is sent.

2) Counter of all keepalives sent
  The keepalive probe itself is a zero-byte RDMA write (as per the spec).
It does not generate a CQ entry unless there is a problem.  Since, unlike
TCP, there is nothing that 'tcpdump' or a sniffer could see on the wire, it
is hard to test that keepalives are being sent in the absence of
problems.
  In order to create an automated test, there is a /sys counter that
gets incremented every time a keepalive is sent.  An argument could be
made to add a counter to each socket, and add some options to get (and
reset) it.  I am open to doing it that way if people think it is better.

Diff from OFED 1.2


Index: ofa_kernel/drivers/infiniband/ulp/sdp/sdp.h
===================================================================
--- ofa_kernel.orig/drivers/infiniband/ulp/sdp/sdp.h	2007-07-16
19:42:32.000000000 +0300
+++ ofa_kernel/drivers/infiniband/ulp/sdp/sdp.h	2007-07-21
03:05:29.000000000 +0300
@@ -42,6 +42,7 @@ extern int sdp_data_debug_level;
 #define SDP_RESOLVE_TIMEOUT 1000
 #define SDP_ROUTE_TIMEOUT 1000
 #define SDP_RETRY_COUNT 5
+#define SDP_KEEPALIVE_TIME (120 * 60)
 
 #define SDP_TX_SIZE 0x40
 #define SDP_RX_SIZE 0x40
@@ -51,6 +52,7 @@ extern int sdp_data_debug_level;
 #define SDP_NUM_WC 4
 
 #define SDP_OP_RECV 0x800000000LL
+#define SDP_OP_SEND 0x400000000LL
 
 enum sdp_mid {
 	SDP_MID_HELLO = 0x0,
@@ -115,6 +117,12 @@ struct sdp_sock {
 
 	int time_wait;
 
+	unsigned keepalive_time;
+
+	/* tx_head/rx_head when keepalive timer started */
+	unsigned keepalive_tx_head;
+	unsigned keepalive_rx_head;
+
 	/* Data below will be reset on error */
 	/* rdma specific */
 	struct rdma_cm_id *id;
@@ -221,5 +229,7 @@ void sdp_urg(struct sdp_sock *ssk, struc
 void sdp_add_sock(struct sdp_sock *ssk);
 void sdp_remove_sock(struct sdp_sock *ssk);
 void sdp_remove_large_sock(void);
+void sdp_post_keepalive(struct sdp_sock *ssk);
+void sdp_start_keepalive_timer(struct sock *sk);
 
 #endif
Index: ofa_kernel/drivers/infiniband/ulp/sdp/sdp_bcopy.c
===================================================================
--- ofa_kernel.orig/drivers/infiniband/ulp/sdp/sdp_bcopy.c
2007-07-16 19:42:32.000000000 +0300
+++ ofa_kernel/drivers/infiniband/ulp/sdp/sdp_bcopy.c	2007-07-16
23:00:04.000000000 +0300
@@ -60,6 +60,12 @@ static int max_large_sockets = 1000;
 module_param_named(max_large_sockets, max_large_sockets, int, 0644);
 MODULE_PARM_DESC(max_large_sockets, "Max number of large sockets (32k
buffers).");
 
+#define sdp_cnt(var) do { (var)++; } while (0)
+static unsigned sdp_keepalive_probes_sent = 0;
+
+module_param_named(sdp_keepalive_probes_sent,
sdp_keepalive_probes_sent, uint, 0644);
+MODULE_PARM_DESC(sdp_keepalive_probes_sent, "Total number of keepalive
probes sent.");
+
 static int curr_large_sockets = 0;
 atomic_t sdp_current_mem_usage;
 spinlock_t sdp_large_sockets_lock;
@@ -107,6 +113,31 @@ static void sdp_fin(struct sock *sk)
 	}
 }
 
+/*
+ * Post a keepalive probe on ssk's QP: an RDMA write carrying no scatter/gather
+ * entries.  On a posting failure the socket error is set to -ECONNRESET and
+ * waiters on ssk->wq are woken.  sdp_keepalive_probes_sent is incremented only
+ * when the probe was actually handed to ib_post_send() successfully.
+ */
+void sdp_post_keepalive(struct sdp_sock *ssk)
+{
+	struct ib_send_wr wr, *bad_wr;
+	int rc;
+
+	sdp_dbg(&ssk->isk.sk, "%s\n", __func__);
+
+	/*
+	 * Zeroing leaves next/sg_list NULL, num_sge 0 and wr_id 0, i.e. a
+	 * wr_id with neither SDP_OP_SEND nor SDP_OP_RECV set.
+	 */
+	memset(&wr, 0, sizeof(wr));
+	wr.opcode = IB_WR_RDMA_WRITE;
+
+	rc = ib_post_send(ssk->qp, &wr, &bad_wr);
+	if (rc) {
+		sdp_dbg(&ssk->isk.sk, "ib_post_keepalive failed with status %d.\n", rc);
+		sdp_set_error(&ssk->isk.sk, -ECONNRESET);
+		wake_up(&ssk->wq);
+		/* Nothing was posted, so this must not count as a sent probe. */
+		return;
+	}
+
+	sdp_cnt(sdp_keepalive_probes_sent);
+}
+
 void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb, u8 mid)
 {
 	struct sdp_buf *tx_req;
@@ -158,7 +189,7 @@ void sdp_post_send(struct sdp_sock *ssk,
 	}
 
 	ssk->tx_wr.next = NULL;
-	ssk->tx_wr.wr_id = ssk->tx_head;
+	ssk->tx_wr.wr_id = ssk->tx_head | SDP_OP_SEND;
 	ssk->tx_wr.sg_list = ssk->ibsge;
 	ssk->tx_wr.num_sge = frags + 1;
 	ssk->tx_wr.opcode = IB_WR_SEND;
@@ -604,7 +635,7 @@ static void sdp_handle_wc(struct sdp_soc
 				__kfree_skb(skb);
 			}
 		}
-	} else {
+	} else if (likely(wc->wr_id & SDP_OP_SEND)) {
 		skb = sdp_send_completion(ssk, wc->wr_id);
 		if (unlikely(!skb))
 			return;
@@ -620,6 +651,22 @@ static void sdp_handle_wc(struct sdp_soc
 		}
 
 		sk_stream_write_space(&ssk->isk.sk);
+	} else {
+		sdp_cnt(sdp_keepalive_probes_sent);
+
+		if (likely(!wc->status))
+			return;
+
+		sdp_dbg(&ssk->isk.sk, " %s consumes KEEPALIVE status
%d\n",
+		        __func__, wc->status);
+
+		if (wc->status == IB_WC_WR_FLUSH_ERR)
+			return;
+
+		sdp_set_error(&ssk->isk.sk, -ECONNRESET);
+		wake_up(&ssk->wq);
+
+		return;
 	}
 
 	if (likely(!wc->status)) {
Index: ofa_kernel/drivers/infiniband/ulp/sdp/sdp_cma.c
===================================================================
--- ofa_kernel.orig/drivers/infiniband/ulp/sdp/sdp_cma.c
2007-07-16 19:42:32.000000000 +0300
+++ ofa_kernel/drivers/infiniband/ulp/sdp/sdp_cma.c	2007-07-16
23:00:04.000000000 +0300
@@ -270,8 +270,8 @@ static int sdp_response_handler(struct s
 
 	sk->sk_state = TCP_ESTABLISHED;
 
-	/* TODO: If SOCK_KEEPOPEN set, need to reset and start
-	   keepalive timer here */
+	if (sock_flag(sk, SOCK_KEEPOPEN))
+		sdp_start_keepalive_timer(sk);
 
 	if (sock_flag(sk, SOCK_DEAD))
 		return 0;
@@ -311,8 +311,8 @@ int sdp_connected_handler(struct sock *s
 
 	sk->sk_state = TCP_ESTABLISHED;
 
-	/* TODO: If SOCK_KEEPOPEN set, need to reset and start
-	   keepalive timer here */
+	if (sock_flag(sk, SOCK_KEEPOPEN))
+		sdp_start_keepalive_timer(sk);
 
 	if (sock_flag(sk, SOCK_DEAD))
 		return 0;
Index: ofa_kernel/drivers/infiniband/ulp/sdp/sdp_main.c
===================================================================
--- ofa_kernel.orig/drivers/infiniband/ulp/sdp/sdp_main.c
2007-07-16 19:42:38.000000000 +0300
+++ ofa_kernel/drivers/infiniband/ulp/sdp/sdp_main.c	2007-07-21
03:10:14.000000000 +0300
@@ -117,6 +117,11 @@ static int send_poll_thresh = 8192;
 module_param_named(send_poll_thresh, send_poll_thresh, int, 0644);
 MODULE_PARM_DESC(send_poll_thresh, "Send message size thresh hold over
which to start polling.");
 
+static unsigned int sdp_keepalive_time = SDP_KEEPALIVE_TIME;
+
+module_param_named(sdp_keepalive_time, sdp_keepalive_time, uint, 0644);
+MODULE_PARM_DESC(sdp_keepalive_time, "Default idle time in seconds
before keepalive probe sent.");
+
 struct workqueue_struct *sdp_workqueue;
 
 static struct list_head sock_list;
@@ -124,6 +129,11 @@ static spinlock_t sock_list_lock;
 
 DEFINE_RWLOCK(device_removal_lock);
 
+/*
+ * Keepalive interval for ssk: the socket's own keepalive_time when it is
+ * non-zero, otherwise the sdp_keepalive_time module parameter multiplied
+ * by HZ.
+ */
+static inline unsigned int sdp_keepalive_time_when(const struct sdp_sock *ssk)
+{
+	if (ssk->keepalive_time)
+		return ssk->keepalive_time;
+
+	return sdp_keepalive_time * HZ;
+}
+
 inline void sdp_add_sock(struct sdp_sock *ssk)
 {
 	spin_lock_irq(&sock_list_lock);
@@ -221,6 +231,86 @@ static void sdp_destroy_qp(struct sdp_so
 	kfree(ssk->tx_ring);
 }
 
+
+/*
+ * Record the socket's current tx_head/rx_head as the keepalive snapshot and
+ * (re)arm sk_timer to expire len jiffies from now.
+ */
+static void sdp_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+	struct sdp_sock *sdp = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	/* Snapshot both ring heads; the timer handler compares against these. */
+	sdp->keepalive_rx_head = sdp->rx_head;
+	sdp->keepalive_tx_head = sdp->tx_head;
+
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+/* Clear the saved tx/rx head snapshot, then stop the socket's sk_timer. */
+static void sdp_delete_keepalive_timer(struct sock *sk)
+{
+	struct sdp_sock *sdp = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	sdp->keepalive_rx_head = 0;
+	sdp->keepalive_tx_head = 0;
+
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+/*
+ * Keepalive timer callback; data is the struct sock the timer belongs to.
+ *
+ * If the socket is owned by a user context the work is retried HZ / 20
+ * jiffies later.  Otherwise, for a SOCK_KEEPOPEN socket that is neither in
+ * TCP_LISTEN nor TCP_CLOSE, a keepalive probe is posted when tx_head and
+ * rx_head both still equal the saved snapshot, and the timer is re-armed for
+ * the next interval.  Every path ends with bh_unlock_sock() and sock_put().
+ */
+static void sdp_keepalive_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	/* Only process if the socket is not in use */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/*
+		 * Retry shortly but keep the existing head snapshot.  Going
+		 * through sdp_reset_keepalive_timer() here would overwrite
+		 * it, so the idle test on the retry would only cover the
+		 * short retry window instead of the whole keepalive interval.
+		 */
+		sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
+		goto out;
+	}
+
+	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_LISTEN ||
+	    sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/* No movement on either ring since the snapshot: connection is idle. */
+	if (ssk->keepalive_tx_head == ssk->tx_head &&
+	    ssk->keepalive_rx_head == ssk->rx_head)
+		sdp_post_keepalive(ssk);
+
+	/* Takes a fresh snapshot and re-arms for the next full interval. */
+	sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(ssk));
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ * Initialise the socket's sk_timer and point it at sdp_keepalive_timer(),
+ * passing the socket itself as the callback argument.
+ */
+static void sdp_init_timer(struct sock *sk)
+{
+	init_timer(&sk->sk_timer);
+
+	sk->sk_timer.data = (unsigned long)sk;
+	sk->sk_timer.function = sdp_keepalive_timer;
+}
+
+/*
+ * Start or stop the keepalive timer to match a new keepalive setting.
+ *
+ * Does nothing for sockets in the CLOSE or LISTEN state.  A zero val stops
+ * the timer; a non-zero val starts it only if SOCK_KEEPOPEN is not already
+ * set on the socket.  The SOCK_KEEPOPEN flag itself is not modified here.
+ */
+static void sdp_set_keepalive(struct sock *sk, int val)
+{
+	sdp_dbg(sk, "%s %d\n", __func__, val);
+
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+		return;
+
+	if (!val) {
+		sdp_delete_keepalive_timer(sk);
+		return;
+	}
+
+	if (!sock_flag(sk, SOCK_KEEPOPEN))
+		sdp_start_keepalive_timer(sk);
+}
+
+/* Arm the keepalive timer with the interval sdp_keepalive_time_when() yields for sk. */
+void sdp_start_keepalive_timer(struct sock *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(ssk));
+}
+
 void sdp_reset_sk(struct sock *sk, int rc)
 {
 	struct sdp_sock *ssk = sdp_sk(sk);
@@ -365,6 +455,8 @@ static void sdp_close(struct sock *sk, l
 
 	sdp_dbg(sk, "%s\n", __func__);
 
+	sdp_delete_keepalive_timer(sk);
+
 	sk->sk_shutdown = SHUTDOWN_MASK;
 	if (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_SYN_SENT)
{
 		sdp_set_state(sk, TCP_CLOSE);
@@ -818,9 +910,6 @@ static int sdp_setsockopt(struct sock *s
 	int err = 0;
 
 	sdp_dbg(sk, "%s\n", __func__);
-	if (level != SOL_TCP)
-		return -ENOPROTOOPT;
-
 	if (optlen < sizeof(int))
 		return -EINVAL;
 
@@ -829,6 +918,28 @@ static int sdp_setsockopt(struct sock *s
 
 	lock_sock(sk);
 
+	/* SOCK_KEEPALIVE is really a SOL_SOCKET level option but there
+	 * is a problem handling it at that level.  In order to start
+	 * the keepalive timer on an SDP socket, we must call an SDP
+	 * specific routine.  Since sock_setsockopt() can not be modifed
+	 * to understand SDP, the application must pass that option
+	 * through to us.  Since SO_KEEPALIVE and TCP_DEFER_ACCEPT both
+	 * use the same optname, the level must not be SOL_TCP or
SOL_SOCKET
+	 */
+	if (level == PF_INET_SDP && optname == SO_KEEPALIVE) {
+		sdp_set_keepalive(sk, val);
+		if (val)
+			sock_set_flag(sk, SOCK_KEEPOPEN);
+		else
+			sock_reset_flag(sk, SOCK_KEEPOPEN);
+		goto out;
+	}
+
+	if (level != SOL_TCP) {
+		err = -ENOPROTOOPT;
+		goto out;
+	}
+
 	switch (optname) {
 	case TCP_NODELAY:
 		if (val) {
@@ -867,11 +978,23 @@ static int sdp_setsockopt(struct sock *s
 			sdp_push_pending_frames(sk);
 		}
 		break;
+	case TCP_KEEPIDLE:
+		if (val < 1 || val > MAX_TCP_KEEPIDLE)
+			err = -EINVAL;
+		else {
+			ssk->keepalive_time = val * HZ;
+
+			if (sock_flag(sk, SOCK_KEEPOPEN) &&
+			    !((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN)))
+				sdp_reset_keepalive_timer(sk,
ssk->keepalive_time);
+		}
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
 	}
 
+out:
 	release_sock(sk);
 	return err;
 }
@@ -904,6 +1027,9 @@ static int sdp_getsockopt(struct sock *s
 	case TCP_CORK:
 		val = !!(ssk->nonagle&TCP_NAGLE_CORK);
 		break;
+	case TCP_KEEPIDLE:
+		val = ssk->keepalive_time ? ssk->keepalive_time / HZ :
sdp_keepalive_time;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -1687,6 +1813,8 @@ static int sdp_create_socket(struct sock
 
 	sk->sk_destruct = sdp_destruct;
 
+	sdp_init_timer(sk);
+
 	sock->ops = &sdp_proto_ops;
 	sock->state = SS_UNCONNECTED;




More information about the general mailing list