[ofa-general] [PATCH 2/3] RDS: Fix RDS congestion monitoring

Olaf Kirch olaf.kirch at oracle.com
Wed May 7 03:54:13 PDT 2008


commit 01b72f27721c2dccf42dd5eea2e5d5e573cd8585
Author: Olaf Kirch <olaf.kirch at oracle.com>
Date:   Wed May 7 10:40:36 2008 +0200

    Fix RDS congestion monitoring
    
    The RDS congestion monitoring code tries to help applications
    deal with remote congestion more efficiently. If enabled,
    an application that tried to send to a congested port will
    receive a notification as soon as the port becomes uncongested
    again. For efficiency reasons, the application isn't given a
    complete 8K congestion bitmap, but a 64bit mask that represents
    the ports having changed, with port N being represented by
    (1 << (port % 64))
    
    This code had several bugs in it:
     -	the macro used to translate port numbers to the mask bit
     	shifted integer 1, not 1ULL, resulting in undefined
    	behavior when (port % 64) >= 32
    
     -	rds_ib_cong_recv computes the 64bit mask of all ports
     	that changed from congested to uncongested. It got the
    	bit arithmetics wrong.
    	Also, it used be64_to_cpu to convert the mask, which
    	is wrong, as the congestion map is little endian
    
     -	in the IB send completion handler, we need to check
     	whether there is a pending congestion map update we
    	need to send.
    
     -	in rds_poll, we should grab the rs_lock spinlock
     	when testing whether rs_cong_mask is non-zero
    
    Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 7c20dd4..2fa2d0e 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -165,8 +165,10 @@ static unsigned int rds_poll(struct file *file, struct socket *sock,
 		if (rds_cong_updated_since(&rs->rs_cong_track))
 			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
 	} else {
+		spin_lock(&rs->rs_lock);
 		if (rs->rs_cong_notify)
 			mask |= (POLLIN | POLLRDNORM);
+		spin_unlock(&rs->rs_lock);
 	}
 	if (!list_empty(&rs->rs_recv_queue)
 	 || !list_empty(&rs->rs_notify_queue))
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 4ec85ce..beeb539 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -238,7 +238,7 @@ void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
 	if (waitqueue_active(&rds_poll_waitq))
 		wake_up_all(&rds_poll_waitq);
 
-	if (!list_empty(&rds_cong_monitor)) {
+	if (portmask && !list_empty(&rds_cong_monitor)) {
 		unsigned long flags;
 		struct rds_sock *rs;
 
diff --git a/net/rds/ib_rds.h b/net/rds/ib_rds.h
index cea73fc..e098036 100644
--- a/net/rds/ib_rds.h
+++ b/net/rds/ib_rds.h
@@ -176,7 +176,7 @@ struct rds_info_tcp_socket {
  */
 #define RDS_CONG_MONITOR_SIZE	64
 #define RDS_CONG_MONITOR_BIT(port)  (((unsigned int) port) % RDS_CONG_MONITOR_SIZE)
-#define RDS_CONG_MONITOR_MASK(port) (1 << RDS_CONG_MONITOR_BIT(port))
+#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port))
 
 /*
  * RDMA related types
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 6c9cc9e..8ffbb0c 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -639,8 +639,9 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
 		for (k = 0; k < to_copy; k += 8) {
-			/* Record ports that became uncongested */
-			uncongested |= *src & (*src ^ *dst);
+			/* Record ports that became uncongested, ie
+			 * bits that changed from 0 to 1. */
+			uncongested |= ~(*src) & *dst;
 			*dst++ = *src++;
 		}
 		kunmap_atomic(addr, KM_SOFTIRQ0);
@@ -662,7 +663,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 	}
 
 	/* the congestion map is in little endian order */
-	uncongested = be64_to_cpu(uncongested);
+	uncongested = le64_to_cpu(uncongested);
 
 	rds_cong_map_updated(map, uncongested);
 }
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 724167c..567f62f 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -218,7 +218,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 		rds_ib_ring_free(&ic->i_send_ring, completed);
 
-		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+		 || test_bit(0, &conn->c_map_queued))
 			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 
 		/* We expect errors as the qp is drained during shutdown */

-- 
Olaf Kirch  |  --- o --- Nous sommes du soleil we love when we play
okir at lst.de |    / | \   sol.dhoop.naytheet.ah kin.ir.samse.qurax



More information about the general mailing list