[ofa-general] Re: [PATCH 7/8]: RDS: Implement rds ping

Olaf Kirch okir at lst.de
Thu Apr 24 02:13:08 PDT 2008


From 24000a7c11fedb519aab11807703d91ae49ac421 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <olaf.kirch at oracle.com>
Date: Thu, 24 Apr 2008 00:27:36 -0700
Subject: [PATCH] RDS: Implement rds ping

Several people have asked for a way to test reachability of
remote nodes via RDS. This is it - rds ping.

RDS ping is implemented by sending packets to port 0.
As a matter of simplicity, we do not handle packet payloads at this time -
the ping response is always an empty packet.

Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
---
 net/rds/cong.c   |    2 +-
 net/rds/rds.h    |    5 ++++
 net/rds/recv.c   |    6 +++++
 net/rds/send.c   |   56 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/rds/stats.c  |    2 +
 net/rds/sysctl.c |   10 +++++++++
 6 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/net/rds/cong.c b/net/rds/cong.c
index 2db2362..4ec85ce 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -348,7 +348,7 @@ int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rd
 	if (!rds_cong_test_bit(map, port))
 	       return 0;
 	if (nonblock) {
-		if (rs->rs_cong_monitor) {
+		if (rs && rs->rs_cong_monitor) {
 			unsigned long flags;
 
 			/* It would have been nice to have an atomic set_bit on
diff --git a/net/rds/rds.h b/net/rds/rds.h
index d5a966d..a0fb20c 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -487,6 +487,7 @@ struct rds_statistics {
 	unsigned long	s_recv_delayed_retry;
 	unsigned long	s_recv_ack_required;
 	unsigned long	s_recv_rdma_bytes;
+	unsigned long	s_recv_ping;
 	unsigned long	s_send_queue_empty;
 	unsigned long	s_send_queue_full;
 	unsigned long	s_send_sem_contention;
@@ -497,6 +498,7 @@ struct rds_statistics {
 	unsigned long	s_send_ack_required;
 	unsigned long	s_send_rdma;
 	unsigned long	s_send_rdma_bytes;
+	unsigned long	s_send_pong;
 	unsigned long	s_page_remainder_hit;
 	unsigned long	s_page_remainder_miss;
 	unsigned long	s_cong_update_queued;
@@ -570,6 +572,7 @@ rds_conn_up(struct rds_connection *conn)
 }
 
 /* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
 struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 					       size_t total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -641,6 +644,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 			 is_acked_func is_acked);
 int rds_send_acked_before(struct rds_connection *conn, u64 seq);
 void rds_send_remove_from_sock(struct list_head *messages, int status);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
@@ -672,6 +676,7 @@ extern unsigned long rds_sysctl_reconnect_min_jiffies;
 extern unsigned long rds_sysctl_reconnect_max_jiffies;
 extern unsigned int  rds_sysctl_max_unacked_packets;
 extern unsigned int  rds_sysctl_max_unacked_bytes;
+extern unsigned int  rds_sysctl_ping_enable;
 
 /* threads.c */
 int __init rds_threads_init(void);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9adb24d..da3c879 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -196,6 +196,12 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 	}
 	conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
 
+	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+		rds_stats_inc(s_recv_ping);
+		rds_send_pong(conn, inc->i_hdr.h_sport);
+		goto out;
+	}
+
 	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
 	if (rs == NULL) {
 		rds_stats_inc(s_recv_drop_no_sock);
diff --git a/net/rds/send.c b/net/rds/send.c
index a2a5b2a..26e1e3e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -700,8 +700,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 
 	if (msg->msg_namelen) {
 		/* XXX fail non-unicast destination IPs? */
-		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET ||
-		    usin->sin_port == 0) {
+		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
 			ret = -EINVAL;
 			goto out;
 		}
@@ -820,3 +819,56 @@ out:
 		rds_message_put(rm);
 	return ret;
 }
+
+/* Reply to a ping packet: queue an empty message from port 0 to dport on conn.
+ * Returns 0 once the reply is queued, -ENOMEM if no message could be
+ * allocated, otherwise the non-zero result of rds_cong_wait(). */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	int ret = 0;
+
+	rm = rds_message_alloc(0, GFP_ATOMIC);	/* nents == 0: no payload */
+	if (rm == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rm->m_daddr = conn->c_faddr;
+
+	/* If the connection is down, trigger a connect right away. If
+	 * RDS_RECONNECT_PENDING is already set, a (possibly delayed)
+	 * reconnect has been scheduled and we must not interfere with it.
+	 */
+	if (rds_conn_state(conn) == RDS_CONN_DOWN
+	 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); /* nonblock, no rs */
+	if (ret)
+		goto out;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+	rds_message_addref(rm);	/* presumably the send queue's ref - confirm */
+	rm->m_inc.i_conn = conn;
+
+	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+				    conn->c_next_tx_seq);
+	conn->c_next_tx_seq++;	/* sequence number consumed under c_lock */
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	rds_stats_inc(s_send_pong);
+
+	queue_delayed_work(rds_wq, &conn->c_send_w, 0);	/* run send work now */
+	rds_message_put(rm);	/* drop the local reference */
+	return 0;
+
+out:
+	if (rm)	/* NULL when allocation failed */
+		rds_message_put(rm);
+	return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
index abf7103..0bd91fa 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -53,6 +53,7 @@ static char *rds_stat_names[] = {
 	"recv_delayed_retry",
 	"recv_ack_required",
 	"recv_rdma_bytes",
+	"recv_ping",
 	"send_queue_empty",
 	"send_queue_full",
 	"send_sem_contention",
@@ -63,6 +64,7 @@ static char *rds_stat_names[] = {
 	"send_ack_required",
 	"send_rdma",
 	"send_rdma_bytes",
+	"send_pong",
 	"page_remainder_hit",
 	"page_remainder_miss",
 	"cong_update_queued",
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 5f7ce37..7b18c0a 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -47,6 +47,8 @@ unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
 unsigned int  rds_sysctl_max_unacked_packets = 16;
 unsigned int  rds_sysctl_max_unacked_bytes = (16 << 20);
 
+unsigned int rds_sysctl_ping_enable = 1; /* non-zero: answer packets to port 0 */
+
 /* 
  * These can change over time until they're official.  Until that time we'll
  * give apps a way to figure out what the values are in a given machine.
@@ -107,6 +109,14 @@ static ctl_table rds_sysctl_rds_table[] = {
 		.mode           = 0644,
 		.proc_handler   = &proc_dointvec,
 	},
+	{
+		.ctl_name	= 10,
+		.procname	= "ping_enable",
+		.data		= &rds_sysctl_ping_enable,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
 	/* 100+ are reserved for transport subdirs */
 	{ .ctl_name = 0}
 };
-- 
1.5.4.rc3


-- 
Olaf Kirch  |  --- o --- Nous sommes du soleil we love when we play
okir at lst.de |    / | \   sol.dhoop.naytheet.ah kin.ir.samse.qurax



More information about the general mailing list