[ofa-general] socket buffer accounting with UDP/ipoib

Roland Dreier rdreier at cisco.com
Tue Jul 17 10:41:49 PDT 2007


I did a quick hack to enable copybreak for UD packets up to 256 bytes
(see below).  It is still missing copybreak for the CM / RC path, but
I just wanted to see how it affected performance.  The answer, on my
system (fast quad-core Xeon, single-port Mellanox PCIe HCA), is that
it made no measurable difference in small-message latency or
throughput, at least none that I could see with NetPIPE (NPtcp).

I'm not sure whether to pursue this or not.
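For anyone who hasn't run into the term before, "copybreak" in a
receive handler just means: for packets below some threshold, copy the
payload into a freshly allocated small skb and hand that up the stack,
so the big DMA-mapped RX buffer can be reposted as-is instead of being
unmapped and reallocated.  A rough sketch of the idea (generic,
simplified names -- this is not the IPoIB code, which follows below):

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* Hypothetical helper illustrating the copybreak pattern. */
static struct sk_buff *rx_copybreak(struct sk_buff *rx_skb, int len)
{
	struct sk_buff *small_skb;

	/* Small allocation; leave room to align the IP header. */
	small_skb = dev_alloc_skb(len + NET_IP_ALIGN);
	if (!small_skb)
		return NULL;	/* caller drops the packet and reposts */

	skb_reserve(small_skb, NET_IP_ALIGN);
	skb_copy_to_linear_data(small_skb, rx_skb->data, len);
	skb_put(small_skb, len);

	/* rx_skb stays mapped and gets reposted by the caller. */
	return small_skb;
}

In a real driver the copy also has to be bracketed by DMA sync calls,
which is what the patch below does.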


diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 285c143..bf60bbb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -59,6 +59,8 @@ enum {
 	IPOIB_PACKET_SIZE         = 2048,
 	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
 
+	IPOIB_COPYBREAK		  = 256,
+
 	IPOIB_ENCAP_LEN 	  = 4,
 
 	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 1094488..8d6d0d0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -203,22 +203,48 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
 		goto repost;
 
-	/*
-	 * If we can't allocate a new RX buffer, dump
-	 * this packet and reuse the old buffer.
-	 */
-	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
-		++priv->stats.rx_dropped;
-		goto repost;
-	}
-
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	if (wc->byte_len < IPOIB_COPYBREAK + IB_GRH_BYTES) {
+		struct sk_buff *new_skb;
+
+		/*
+		 * Add 12 bytes to 4-byte IPoIB header to get IP
+		 * header at a multiple of 16.
+		 */
+		new_skb = dev_alloc_skb(wc->byte_len - IB_GRH_BYTES + 12);
+		if (unlikely(!new_skb)) {
+			++priv->stats.rx_dropped;
+			goto repost;
+		}
+
+		skb_reserve(new_skb, 12);
+		skb_put(new_skb, wc->byte_len - IB_GRH_BYTES);
 
-	skb_put(skb, wc->byte_len);
-	skb_pull(skb, IB_GRH_BYTES);
+		ib_dma_sync_single_for_cpu(priv->ca, addr, IPOIB_BUF_SIZE,
+					   DMA_FROM_DEVICE);
+		skb_copy_from_linear_data_offset(skb, IB_GRH_BYTES, new_skb->data,
+						 wc->byte_len - IB_GRH_BYTES);
+		ib_dma_sync_single_for_device(priv->ca, addr, IPOIB_BUF_SIZE,
+					      DMA_FROM_DEVICE);
+
+		skb = new_skb;
+	} else {
+		/*
+		 * If we can't allocate a new RX buffer, dump
+		 * this packet and reuse the old buffer.
+		 */
+		if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+			++priv->stats.rx_dropped;
+			goto repost;
+		}
+
+		ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+
+		skb_put(skb, wc->byte_len);
+		skb_pull(skb, IB_GRH_BYTES);
+	}
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
 	skb_reset_mac_header(skb);
