[openib-general] [PATCH for-2.6.21] IPoIB/cm: improve small message bandwidth

Michael S. Tsirkin mst at mellanox.co.il
Tue Feb 20 10:17:55 PST 2007


Avoid the overhead of freeing/reallocating pages and of DMA mapping/unmapping
for pages that the hardware has not written to.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

---

This gives a >10% bandwidth boost for message sizes up to 32K. Please queue for 2.6.21.
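
For the curious, the savings come from sizing the unmap/realloc work to what
the HCA actually wrote, computed from the completion length.  Below is a
minimal user-space sketch of that arithmetic; frags_for() is a made-up helper
name, and the PAGE_SIZE/IPOIB_CM_HEAD_SIZE values are illustrative, not the
driver's real definitions:

#include <stdio.h>

#define PAGE_SIZE          4096u
#define PAGE_ALIGN(x)      (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
/* illustrative head-buffer size; the driver derives the real one in ipoib.h */
#define IPOIB_CM_HEAD_SIZE 4096u

/* Same computation as the new code in ipoib_cm_handle_rx_wc(): only the
 * fragment pages beyond the head buffer that the HCA wrote into need to
 * be unmapped and replaced. */
static unsigned frags_for(unsigned byte_len)
{
	unsigned tail = byte_len > IPOIB_CM_HEAD_SIZE ?
			byte_len - IPOIB_CM_HEAD_SIZE : 0;
	return PAGE_ALIGN(tail) / PAGE_SIZE;
}

int main(void)
{
	printf("%u\n", frags_for(2000));	/* 0: fits in the head buffer */
	printf("%u\n", frags_for(32000));	/* 7: ceil((32000 - 4096) / 4096) */
	return 0;
}

Before this patch, every completion unmapped and reallocated all
IPOIB_CM_RX_SG - 1 fragment pages, even for small messages where frags is 0.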

before:

# ./netperf-2.4.2/src/netperf -f M -H 11.4.3.68 -c -C -- -m 32000
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68) port 0 AF_INET : demo
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    MBytes  /s  % S      % S      us/KB   us/KB

 87380  16384  32000    10.00       716.23   26.22    23.94    1.430   1.306


after:

# ./netperf-2.4.2/src/netperf -f M -H 11.4.3.68 -c -C -- -m 32000
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68) port 0 AF_INET : demo
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    MBytes  /s  % S      % S      us/KB   us/KB

 87380  16384  32000    10.00       888.67   24.13    25.08    1.061   1.102

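The other half of the win is in skb_put_frags() in the diff below: fragment
pages the hardware never touched are handed to the freshly allocated skb
instead of being freed, so the next receive does not go back to the page
allocator.  A simplified stand-alone model of that loop (toy_buf and
put_frags() are made-up names; the real code operates on struct sk_buff and
uses skb_fill_page_desc() to donate the page):

#define PAGE_SIZE 4096u
#define MAX_FRAGS 16

struct toy_buf {
	void *frag[MAX_FRAGS];
	int nr_frags;
};

static void put_frags(struct toy_buf *skb, unsigned hdr_space,
		      unsigned length, struct toy_buf *toskb)
{
	int i, num_frags = skb->nr_frags;
	unsigned size = length < hdr_space ? length : hdr_space;

	length -= size;			/* bytes consumed by the head buffer */
	for (i = 0; i < num_frags; i++) {
		if (length == 0) {
			/* untouched page: donate it to the new buffer */
			toskb->frag[toskb->nr_frags++] = skb->frag[i];
			--skb->nr_frags;
		} else {
			size = length < PAGE_SIZE ? length : PAGE_SIZE;
			length -= size;	/* this page holds received data */
		}
	}
}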

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 8ee6f06..a23c8e3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -68,14 +68,14 @@ struct ipoib_cm_id {
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event);
 
-static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv,
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
 				  u64 mapping[IPOIB_CM_RX_SG])
 {
 	int i;
 
 	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
 
-	for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i)
+	for (i = 0; i < frags; ++i)
 		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
@@ -93,7 +93,8 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
 	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-		ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping);
+		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+				      priv->cm.srq_ring[id].mapping);
 		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
 		priv->cm.srq_ring[id].skb = NULL;
 	}
@@ -101,8 +102,8 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
 	return ret;
 }
 
-static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
-				 u64 mapping[IPOIB_CM_RX_SG])
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
+					     u64 mapping[IPOIB_CM_RX_SG])
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
@@ -110,7 +111,7 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
 
 	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
 	if (unlikely(!skb))
-		return -ENOMEM;
+		return NULL;
 
 	/*
 	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
@@ -122,10 +123,10 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
 				       DMA_FROM_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
 		dev_kfree_skb_any(skb);
-		return -EIO;
+		return NULL;
 	}
 
-	for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) {
+	for (i = 0; i < frags; i++) {
 		struct page *page = alloc_page(GFP_ATOMIC);
 
 		if (!page)
@@ -139,7 +140,7 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
 	}
 
 	priv->cm.srq_ring[id].skb = skb;
-	return 0;
+	return skb;
 
 partial_error:
 
@@ -148,8 +149,8 @@ partial_error:
 	for (; i >= 0; --i)
 		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 
-	dev_kfree_skb_any(skb);
-	return -ENOMEM;
+	dev_kfree_skb_any(skb);
+	return NULL;
 }
 
 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
@@ -312,7 +313,7 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
 }
 /* Adjust length of skb with fragments to match received data */
 static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
-			  unsigned int length)
+			  unsigned int length, struct sk_buff *toskb)
 {
 	int i, num_frags;
 	unsigned int size;
@@ -329,7 +330,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
 
 		if (length == 0) {
 			/* don't need this page */
-			__free_page(frag->page);
+			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
 			--skb_shinfo(skb)->nr_frags;
 		} else {
 			size = min(length, (unsigned) PAGE_SIZE);
@@ -347,10 +348,11 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *newskb;
 	struct ipoib_cm_rx *p;
 	unsigned long flags;
 	u64 mapping[IPOIB_CM_RX_SG];
+	int frags;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n",
 		       wr_id, wc->opcode, wc->status);
@@ -386,7 +388,11 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		}
 	}
 
-	if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id, mapping))) {
+	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
+					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
+
+	newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+	if (unlikely(!newskb)) {
 		/*
 		 * If we can't allocate a new RX buffer, dump
 		 * this packet and reuse the old buffer.
@@ -396,13 +402,13 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		goto repost;
 	}
 
-	ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[wr_id].mapping);
-	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, sizeof mapping);
+	ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
+	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len);
+	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 	skb->mac.raw = skb->data;
@@ -1196,7 +1202,8 @@ int ipoib_cm_dev_init(struct net_device *dev)
 	priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (ipoib_cm_alloc_rx_skb(dev, i, priv->cm.srq_ring[i].mapping)) {
+		if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
+					   priv->cm.srq_ring[i].mapping)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 			ipoib_cm_dev_cleanup(dev);
 			return -ENOMEM;
@@ -1231,7 +1238,8 @@ void ipoib_cm_dev_cleanup(struct net_device *dev)
 		return;
 	for (i = 0; i < ipoib_recvq_size; ++i)
 		if (priv->cm.srq_ring[i].skb) {
-			ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[i].mapping);
+			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+					      priv->cm.srq_ring[i].mapping);
 			dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
 			priv->cm.srq_ring[i].skb = NULL;
 		}

-- 
MST