[openib-general] [PATCH/RFC] IB/ipoib: add selective tx signaling

Moni Shoua monis at voltaire.com
Tue Jan 9 02:00:48 PST 2007


This patch implements selective tx signaling for IPoIB.
It lets the user set the ratio between the number of 
sent packets and the number of TX completion signals.
This optimization has the following advantages:
+ increase the packet per second (PPS) rate
+ reduce the number of interrupts related to ipoib tx completions  

Since the IB HCA HW executes work requests posted QP in-order, we can i
assume that a completion of a work request means that all the work 
requests posted before it are also completed and hence their 
associated resources (skbs in this context) can be recycled.

The current driver implementation asks for a completion signaling for every sent packet (a ratio of 1). 
This patch enables the user to set a higher ratio.

Asking for a completion signal for every n (>1) packets saves the following:
1. less interrupts to the host
2. the amortized cost for tx completion handing is lowered
3. the tx_lock  is taken less often


The cost of selective signaling is in the average amount of memory that the IPoIB 
driver consumes since skbs are freed in the TX completion handler (which is now executed less often). 
So, if the current driver holds only few skbs at any given time (and normally not more than one) the new driver holds 
skbs up to n (the ratio between sent packets and the number of tx completions). For reasonable value of n 
can lead to over consumption of few tens of Kbytes but the real issue is elsewhere. Applications that set the 
socket buffer  to a small size (with setsockopt()) may suffer from ENOBUFS failures 
when calling to sendto() or sendmsg(). A good example for this is ping and a signaling ratio 
of 16 packets to 1 completion request. In this case few successful pings are followed by an endless sequence 
of errors (until ping restarts). The solution is to set n with attention to the specific user applications and
to use setsockopt() with care (ping for instance, can be run with -S).

Another issue is related to the ipoib_ib_dev_stop() operation. This function checks that the tail and head
of the tx_ring are equal and if they are not it assumes that there are uncompleted work requests.
With this patch it is normal that the tail and head of the tx_ring would be different sice we are not always
asking for a completion notification. Since I don't see a way to tell if the tail/head gap is normal or 
due to a failure I only reduce the message severity from warn to dbg if the condition for expected gap is true.
However, I still see there a tiny chance that a completion notification would arrive after the timeout in ipoib_ib_dev_stop()
expires and the it tries to free the skbs in the tx_ring(). Solutions to that can be
   1. protect the code with a lock - but I started with trying to avoid locks
   2. reduce the hazard by adding more to the timeout and calling 
	test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); in the TX completion handler to check if ipoib_ib_dev_stop() 
	had started.


I would be happy to get comments for the last issue and for the rest of the patch of course.
thanks 
 > MoniS



 ipoib.h       |    2 ++
 ipoib_ib.c    |   39 ++++++++++++++++++++++++++++-----------
 ipoib_main.c  |   10 +++++++++-
 ipoib_verbs.c |    4 ++--
 4 files changed, 41 insertions(+), 14 deletions(-)

Signed-off-by: Moni Shoua <monis at voltaire.com>
---

Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-01-07 15:39:49.421190295 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib.h	2007-01-07 15:42:33.768824668 +0200
@@ -164,6 +164,7 @@ struct ipoib_dev_priv {
 	struct ipoib_tx_buf *tx_ring;
 	unsigned             tx_head;
 	unsigned             tx_tail;
+	unsigned             tx_completion_mark;
 	struct ib_sge        tx_sge;
 	struct ib_send_wr    tx_wr;
 
@@ -335,6 +336,7 @@ static inline void ipoib_unregister_debu
 
 extern int ipoib_sendq_size;
 extern int ipoib_recvq_size;
+extern int num_unsignal_tx;
 
 extern struct ib_sa_client ipoib_sa_client;
 
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-01-07 15:39:49.443186365 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-01-07 19:29:21.885896644 +0200
@@ -256,29 +256,32 @@ static void ipoib_ib_handle_tx_wc(struct
 		return;
 	}
 
-	tx_req = &priv->tx_ring[wr_id];
+	do {
+		tx_req = &priv->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping,
-			    tx_req->skb->len, DMA_TO_DEVICE);
+		ib_dma_unmap_single(priv->ca, tx_req->mapping,
+				    tx_req->skb->len, DMA_TO_DEVICE);
 
-	++priv->stats.tx_packets;
-	priv->stats.tx_bytes += tx_req->skb->len;
+		++priv->stats.tx_packets;
+		priv->stats.tx_bytes += tx_req->skb->len;
 
-	dev_kfree_skb_any(tx_req->skb);
+		dev_kfree_skb_any(tx_req->skb);
+	} while (wr_id-- > priv->tx_completion_mark);
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
-	++priv->tx_tail;
+	priv->tx_tail += wc->wr_id - priv->tx_completion_mark + 1;
 	if (netif_queue_stopped(dev) &&
 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
 	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
 		netif_wake_queue(dev);
 	spin_unlock_irqrestore(&priv->tx_lock, flags);
+	priv->tx_completion_mark = (wc->wr_id + 1) & (ipoib_sendq_size - 1);
 
 	if (wc->status != IB_WC_SUCCESS &&
 	    wc->status != IB_WC_WR_FLUSH_ERR)
 		ipoib_warn(priv, "failed send event "
 			   "(status=%d, wrid=%d vend_err %x)\n",
-			   wc->status, wr_id, wc->vendor_err);
+			   wc->status, (unsigned int)wc->wr_id, wc->vendor_err);
 }
 
 static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
@@ -309,7 +312,11 @@ static inline int post_send(struct ipoib
 			    u64 addr, int len)
 {
 	struct ib_send_wr *bad_wr;
+	int send_signaled=0;
+	int ret;
 
+	if ((wr_id & num_unsignal_tx) == num_unsignal_tx)
+		send_signaled=1;
 	priv->tx_sge.addr             = addr;
 	priv->tx_sge.length           = len;
 
@@ -317,7 +324,11 @@ static inline int post_send(struct ipoib
 	priv->tx_wr.wr.ud.remote_qpn  = qpn;
 	priv->tx_wr.wr.ud.ah 	      = address;
 
-	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+	if (send_signaled)
+		priv->tx_wr.send_flags |= IB_SEND_SIGNALED;
+	ret = ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+	priv->tx_wr.send_flags &= ~IB_SEND_SIGNALED;
+	return ret;
 }
 
 void ipoib_send(struct net_device *dev, struct sk_buff *skb,
@@ -522,8 +533,14 @@ int ipoib_ib_dev_stop(struct net_device 
 
 	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
 		if (time_after(jiffies, begin + 5 * HZ)) {
-			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
-				   priv->tx_head - priv->tx_tail, recvs_pending(dev));
+			if (recvs_pending(dev) ||
+				(priv->tx_head - priv->tx_tail) > num_unsignal_tx)
+				ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+					   priv->tx_head - priv->tx_tail, recvs_pending(dev));
+			else
+				ipoib_dbg(priv, "%d sends left in q."
+						  "probably sent without completion notification.\n",
+					   priv->tx_head - priv->tx_tail);
 
 			/*
 			 * assume the HW is wedged and just free up
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-01-07 15:39:49.454184399 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-01-07 15:42:33.806817879 +0200
@@ -57,11 +57,15 @@ MODULE_LICENSE("Dual BSD/GPL");
 
 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+int tx_signal_rate __read_mostly = 1;
+int num_unsignal_tx;
 
 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+module_param_named(tx_signal_rate, tx_signal_rate, int, 0444);
+MODULE_PARM_DESC(tx_signal_rate, "Number of tx wr per wc. Must be power of 2");
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
@@ -849,7 +853,7 @@ int ipoib_dev_init(struct net_device *de
 		goto out_rx_ring_cleanup;
 	}
 
-	/* priv->tx_head & tx_tail are already 0 */
+	/* tx_completion_mark, priv->tx_head & tx_tail are already 0 */
 
 	if (ipoib_ib_dev_init(dev, ca, port))
 		goto out_tx_ring_cleanup;
@@ -1179,6 +1183,10 @@ static int __init ipoib_init_module(void
 	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
 	ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
 
+	tx_signal_rate = roundup_pow_of_two(tx_signal_rate);
+	tx_signal_rate = min(tx_signal_rate, ipoib_sendq_size);
+	tx_signal_rate = max(tx_signal_rate, 1);
+	num_unsignal_tx = tx_signal_rate - 1;
 	ret = ipoib_register_debugfs();
 	if (ret)
 		return ret;
Index: infiniband/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- infiniband.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-01-07 15:39:49.481179576 +0200
+++ infiniband/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-01-07 15:42:33.832813235 +0200
@@ -164,7 +164,7 @@ int ipoib_transport_dev_init(struct net_
 			.max_send_sge = 1,
 			.max_recv_sge = 1
 		},
-		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type     = IB_QPT_UD
 	};
 
@@ -208,7 +208,7 @@ int ipoib_transport_dev_init(struct net_
 	priv->tx_wr.opcode 	= IB_WR_SEND;
 	priv->tx_wr.sg_list 	= &priv->tx_sge;
 	priv->tx_wr.num_sge 	= 1;
-	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
+	priv->tx_wr.send_flags 	= 0;
 
 	return 0;
 





More information about the general mailing list