[ofa-general] Re: [PATCH 6/8]: RDS: Use IB for loopback

Thu Apr 24 02:12:19 PDT 2008

From 2a91ce118f8d4e7e644ea849f61bd8953faaacc6 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <olaf.kirch at oracle.com>
Date: Thu, 24 Apr 2008 00:27:36 -0700
Subject: [PATCH] RDS: Use IB for loopback

Currently, when an application wants to send to an RDS port
on the local host, RDS will create a connection using the
special loopback transport.

In order to be able to test RDS (and RDS over RDMA) faithfully
on standalone machines, we want loopback traffic to use the IB
transport if possible.

This patch makes the necessary changes. This turns out to be a
little tricky, as we need two rds_connection objects with the same
address pair. The current code doesn't really handle this, so
we have to jump through some hoops.

 -	Loopback connections for IB are represented by two
 	rds_connections: the "active" connection created when we
	initiate the connect, and a "passive" connection created
	when we accept the incoming RC.

 -	The active connection is used to transmit packets, which
	are then received by the passive conn.

 -	The passive conn is never added to the global hash table;
 	instead it is kept in the active conn's c_passive field.

Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
---
 net/rds/connection.c |   42 +++++++++++++++++++++++++++++++++++-------
 net/rds/rds.h        |    3 +++
 net/rds/tcp.c        |    1 +
 net/rds/threads.c    |   10 +++++++++-
 4 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 585123a..5d7788e 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -130,15 +130,26 @@ void rds_conn_reset(struct rds_connection *conn)
  */
 static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 				       struct rds_transport *trans, gfp_t gfp,
-				       int allow_loop_transport)
+				       int is_outgoing)
 {
-	struct rds_connection *conn, *tmp;
+	struct rds_connection *conn, *tmp, *parent = NULL;
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&rds_conn_lock, flags);
 	conn = rds_conn_lookup(head, laddr, faddr, trans);
+	if (conn
+	 && conn->c_loopback
+	 && conn->c_trans != &rds_loop_transport
+	 && !is_outgoing) {
+		/* This is a looped back IB connection, and we're
+		 * called by the code handling the incoming connect.
+		 * We need a second connection object into which we
+		 * can stick the other QP. */
+		parent = conn;
+		conn = parent->c_passive;
+	}
 	spin_unlock_irqrestore(&rds_conn_lock, flags);
 	if (conn)
 		goto out;
@@ -151,7 +162,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 
 	memset(conn, 0, sizeof(*conn));
 
-	/* hash_node below */
+	INIT_HLIST_NODE(&conn->c_hash_node);
 	conn->c_laddr = laddr;
 	conn->c_faddr = faddr;
 	spin_lock_init(&conn->c_lock);
@@ -173,8 +184,16 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	 * can bind to the destination address then we'd rather the messages
 	 * flow through loopback rather than either transport.
 	 */
-	if (allow_loop_transport && rds_trans_get_preferred(faddr))
-		trans = &rds_loop_transport;
+	if (rds_trans_get_preferred(faddr)) {
+		conn->c_loopback = 1;
+		if (is_outgoing && trans->t_prefer_loopback) {
+			/* "outgoing" connection - and the transport
+			 * says it wants the connection handled by the
+			 * loopback transport. This is what TCP does.
+			 */
+			trans = &rds_loop_transport;
+		}
+	}
 
 	conn->c_trans = trans;
 
@@ -198,14 +217,21 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 		 NIPQUAD(laddr), NIPQUAD(faddr));
 
 	spin_lock_irqsave(&rds_conn_lock, flags);
-	tmp = rds_conn_lookup(head, laddr, faddr, trans);
+	if (parent == NULL) {
+		tmp = rds_conn_lookup(head, laddr, faddr, trans);
+		if (tmp == NULL)
+			hlist_add_head(&conn->c_hash_node, head);
+	} else {
+		if ((tmp = parent->c_passive) == NULL)
+			parent->c_passive = conn;
+	}
+
 	if (tmp) {
 		trans->conn_free(conn->c_transport_data);
 		kmem_cache_free(rds_conn_slab, conn);
 		conn = tmp;
 	} else {
 		rds_cong_add_conn(conn);
-		hlist_add_head(&conn->c_hash_node, head);
 		rds_conn_count++;
 	}
 
@@ -415,6 +441,8 @@ void __exit rds_conn_exit(void)
 			/* the conn won't reconnect once it's unhashed */
 			hlist_del_init(&conn->c_hash_node);
 
+			if (conn->c_passive)
+				__rds_conn_destroy(conn->c_passive);
 			__rds_conn_destroy(conn);
 		}
 	}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index dc1ab4c..d5a966d 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -121,6 +121,8 @@ struct rds_connection {
 	struct hlist_node	c_hash_node;
 	__be32			c_laddr;
 	__be32			c_faddr;
+	unsigned int		c_loopback : 1;
+	struct rds_connection *	c_passive;
 	spinlock_t		c_lock;
 
 	struct rds_cong_map	*c_lcong;
@@ -342,6 +344,7 @@ struct rds_transport {
 	struct list_head	t_item;
 	struct module		*t_owner;
 	char			*t_name;
+	unsigned int		t_prefer_loopback : 1;
 	int (*laddr_check)(__be32 addr);
 	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
 	void (*conn_free)(void *data);
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index baf876e..f4e6fce 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -252,6 +252,7 @@ struct rds_transport rds_tcp_transport = {
 	.exit			= rds_tcp_exit,
 	.t_owner		= THIS_MODULE,
 	.t_name			= "tcp",
+	.t_prefer_loopback	= 1,
 };
 
 int __init rds_tcp_init(void)
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 2a5dc0b..b86fbc3 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -178,6 +178,11 @@ void rds_shutdown_worker(struct work_struct *work)
 		up(&conn->c_send_sem);
 
 		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+			/* This can happen - eg when we're in the middle of tearing
+			 * down the connection, and someone unloads the rds module.
+			 * Quite reproducible with loopback connections.
+			 * Mostly harmless.
+			 */
 			rds_conn_error(conn,
 				"%s: failed to transition to state DOWN, "
 				"current state is %d\n",
@@ -187,7 +192,10 @@ void rds_shutdown_worker(struct work_struct *work)
 		}
 	}
 
-	/* then reconnect if it's still live */
+	/* Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer. */
 	cancel_delayed_work(&conn->c_conn_w);
 	if (!hlist_unhashed(&conn->c_hash_node)) {
 		rds_queue_reconnect(conn);
-- 
1.5.4.rc3


-- 
Olaf Kirch  |  --- o --- Nous sommes du soleil we love when we play
okir at lst.de |    / | \   sol.dhoop.naytheet.ah kin.ir.samse.qurax