[ofa-general] Re: [PATCH 7/8]: RDS: Implement rds ping
Olaf Kirch
okir at lst.de
Thu Apr 24 02:13:08 PDT 2008
From 24000a7c11fedb519aab11807703d91ae49ac421 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <olaf.kirch at oracle.com>
Date: Thu, 24 Apr 2008 00:27:36 -0700
Subject: [PATCH] RDS: Implement rds ping
Several people have asked for a way to test reachability of
remote nodes via RDS. This is it - rds ping.
RDS ping is implemented by sending packets to port 0; the receiver
answers with a packet sent from port 0 back to the sender's source port.
For simplicity, we do not handle packet payloads at this time -
the ping response is always an empty packet.
Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
---
net/rds/cong.c | 2 +-
net/rds/rds.h | 5 ++++
net/rds/recv.c | 6 +++++
net/rds/send.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
net/rds/stats.c | 2 +
net/rds/sysctl.c | 10 +++++++++
6 files changed, 78 insertions(+), 3 deletions(-)
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 2db2362..4ec85ce 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -348,7 +348,7 @@ int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rd
if (!rds_cong_test_bit(map, port))
return 0;
if (nonblock) {
- if (rs->rs_cong_monitor) {
+ if (rs && rs->rs_cong_monitor) {
unsigned long flags;
/* It would have been nice to have an atomic set_bit on
diff --git a/net/rds/rds.h b/net/rds/rds.h
index d5a966d..a0fb20c 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -487,6 +487,7 @@ struct rds_statistics {
unsigned long s_recv_delayed_retry;
unsigned long s_recv_ack_required;
unsigned long s_recv_rdma_bytes;
+ unsigned long s_recv_ping;
unsigned long s_send_queue_empty;
unsigned long s_send_queue_full;
unsigned long s_send_sem_contention;
@@ -497,6 +498,7 @@ struct rds_statistics {
unsigned long s_send_ack_required;
unsigned long s_send_rdma;
unsigned long s_send_rdma_bytes;
+ unsigned long s_send_pong;
unsigned long s_page_remainder_hit;
unsigned long s_page_remainder_miss;
unsigned long s_cong_update_queued;
@@ -570,6 +572,7 @@ rds_conn_up(struct rds_connection *conn)
}
/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
size_t total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -641,6 +644,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
int rds_send_acked_before(struct rds_connection *conn, u64 seq);
void rds_send_remove_from_sock(struct list_head *messages, int status);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
@@ -672,6 +676,7 @@ extern unsigned long rds_sysctl_reconnect_min_jiffies;
extern unsigned long rds_sysctl_reconnect_max_jiffies;
extern unsigned int rds_sysctl_max_unacked_packets;
extern unsigned int rds_sysctl_max_unacked_bytes;
+extern unsigned int rds_sysctl_ping_enable;
/* threads.c */
int __init rds_threads_init(void);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9adb24d..da3c879 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -196,6 +196,12 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
}
conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+ if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ rds_stats_inc(s_recv_ping);
+ rds_send_pong(conn, inc->i_hdr.h_sport);
+ goto out;
+ }
+
rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
if (rs == NULL) {
rds_stats_inc(s_recv_drop_no_sock);
diff --git a/net/rds/send.c b/net/rds/send.c
index a2a5b2a..26e1e3e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -700,8 +700,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
if (msg->msg_namelen) {
/* XXX fail non-unicast destination IPs? */
- if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET ||
- usin->sin_port == 0) {
+ if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
ret = -EINVAL;
goto out;
}
@@ -820,3 +819,56 @@ out:
rds_message_put(rm);
return ret;
}
+
+/*
+ * Reply to a ping: queue an empty message from port 0 to dport on conn.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ int ret = 0;
+
+ rm = rds_message_alloc(0, GFP_ATOMIC);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_daddr = conn->c_faddr;
+
+ /* If the connection is down, kick off a connect right away.
+ * If RDS_RECONNECT_PENDING is already set, a (possibly delayed)
+ * reconnect has been scheduled, so we leave that one alone.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+ if (ret)
+ goto out;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_inc.i_conn = conn;
+
+ rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+ conn->c_next_tx_seq);
+ conn->c_next_tx_seq++;
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ rds_stats_inc(s_send_pong);
+
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ rds_message_put(rm);
+ return 0;
+
+out:
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
index abf7103..0bd91fa 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -53,6 +53,7 @@ static char *rds_stat_names[] = {
"recv_delayed_retry",
"recv_ack_required",
"recv_rdma_bytes",
+ "recv_ping",
"send_queue_empty",
"send_queue_full",
"send_sem_contention",
@@ -63,6 +64,7 @@ static char *rds_stat_names[] = {
"send_ack_required",
"send_rdma",
"send_rdma_bytes",
+ "send_pong",
"page_remainder_hit",
"page_remainder_miss",
"cong_update_queued",
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 5f7ce37..7b18c0a 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -47,6 +47,8 @@ unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
unsigned int rds_sysctl_max_unacked_packets = 16;
unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
+unsigned int rds_sysctl_ping_enable = 1;
+
/*
* These can change over time until they're official. Until that time we'll
* give apps a way to figure out what the values are in a given machine.
@@ -107,6 +109,14 @@ static ctl_table rds_sysctl_rds_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = 10,
+ .procname = "ping_enable",
+ .data = &rds_sysctl_ping_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
/* 100+ are reserved for transport subdirs */
{ .ctl_name = 0}
};
--
1.5.4.rc3
--
Olaf Kirch | --- o --- Nous sommes du soleil we love when we play
okir at lst.de | / | \ sol.dhoop.naytheet.ah kin.ir.samse.qurax
More information about the general
mailing list