[ofa-general] [PATCH 2 of 17] ipoib: hw csum and s/g support

Eli Cohen eli at mellanox.co.il
Tue Sep 11 08:53:51 PDT 2007


From: Michael S. Tsirkin <mst at mellanox.co.il>

Add a module option, hw_csum: when set, IPoIB will report S/G and
checksum offload support (NETIF_F_SG | NETIF_F_HW_CSUM), and rely on
the hardware end-to-end transport checksum (ICRC) instead of
software protocol checksums.

Since this does not interoperate with older IPoIB modules, the
option is off by default.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

---

When applied on top of the previously posted mlx4 patches, and with
hw_csum enabled, this patch increases single-stream netperf
bandwidth on ConnectX DDR from 1000 to 1250 MBytes/sec.
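
(Single-stream here means one netperf instance; something like
"netperf -H <peer>" with the default TCP_STREAM test should
reproduce the setup, though the exact options used may differ.)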

I know some people find this approach controversial, but from my
perspective it is no worse than e.g. SDP, which pretty much by
design has no software checksums.

Hopefully the option being off by default is enough to
pacify the critics :).
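
For anyone who wants to try this: the parameter is read-only at
runtime (permissions 0444), so it has to be set at module load
time, e.g.

	modprobe ib_ipoib hw_csum=1

As noted above, the peer must also run an IPoIB module that
understands the new header flag.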

Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-11 21:15:17.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-11 21:15:25.000000000 +0300
@@ -86,6 +86,7 @@ enum {
 	IPOIB_MCAST_STARTED       = 8,
 	IPOIB_FLAG_NETIF_STOPPED  = 9,
 	IPOIB_FLAG_ADMIN_CM 	  = 10,
+	IPOIB_FLAG_HW_CSUM	  = 11,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -104,9 +105,11 @@ enum {
 
 /* structs */
 
+#define IPOIB_HEADER_F_HWCSUM 0x1
+
 struct ipoib_header {
 	__be16	proto;
-	u16	reserved;
+	__be16	flags;
 };
 
 struct ipoib_pseudoheader {
@@ -122,9 +125,54 @@ struct ipoib_rx_buf {
 
 struct ipoib_tx_buf {
 	struct sk_buff *skb;
-	u64		mapping;
+	u64		mapping[MAX_SKB_FRAGS + 1];
 };
 
+static inline int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i, frags;
+
+	mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
+	if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
+		return -EIO;
+
+	frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < frags; ++i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		mapping[i + 1] = ib_dma_map_page(ca, frag->page, frag->page_offset,
+						 frag->size, DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[i + 1])))
+			goto partial_error;
+	}
+	return 0;
+
+partial_error:
+	ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+	for (; i > 0; --i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
+		ib_dma_unmap_page(ca, mapping[i], frag->size, DMA_TO_DEVICE);
+	}
+	return -EIO;
+}
+
+static inline void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i, frags;
+
+	ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+	frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < frags; ++i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		ib_dma_unmap_page(ca, mapping[i + 1], frag->size, DMA_TO_DEVICE);
+	}
+}
+
 struct ib_cm_id;
 
 struct ipoib_cm_data {
@@ -269,7 +317,7 @@ struct ipoib_dev_priv {
 	struct ipoib_tx_buf *tx_ring;
 	unsigned             tx_head;
 	unsigned             tx_tail;
-	struct ib_sge        tx_sge;
+	struct ib_sge        tx_sge[MAX_SKB_FRAGS + 1];
 	struct ib_send_wr    tx_wr;
 
 	struct ib_wc ibwc[IPOIB_NUM_WC];
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-09-11 21:14:35.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-09-11 21:15:25.000000000 +0300
@@ -407,6 +407,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
 	unsigned long flags;
 	u64 mapping[IPOIB_CM_RX_SG];
 	int frags;
+	struct ipoib_header *header;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -469,7 +470,10 @@ void ipoib_cm_handle_rx_wc(struct net_de
 
 	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
-	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+	header = (struct ipoib_header *)skb->data;
+	skb->protocol = header->proto;
+	if (header->flags & cpu_to_be16(IPOIB_HEADER_F_HWCSUM))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
 
@@ -491,14 +495,21 @@ repost:
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    struct ipoib_cm_tx *tx,
 			    unsigned int wr_id,
-			    u64 addr, int len)
+			    u64 *mapping, int headlen,
+			    skb_frag_t *frags,
+			    int nr_frags)
 {
 	struct ib_send_wr *bad_wr;
+	int i;
 
-	priv->tx_sge.addr             = addr;
-	priv->tx_sge.length           = len;
-
-	priv->tx_wr.wr_id 	      = wr_id;
+	priv->tx_sge[0].addr   = mapping[0];
+	priv->tx_sge[0].length = headlen;
+	for (i = 0; i < nr_frags; ++i) {
+		priv->tx_sge[i + 1].addr = mapping[i + 1];
+		priv->tx_sge[i + 1].length = frags[i].size;
+	}
+	priv->tx_wr.num_sge    = nr_frags + 1;
+	priv->tx_wr.wr_id      = wr_id;
 
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -507,7 +518,6 @@ void ipoib_cm_send(struct net_device *de
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_tx_buf *tx_req;
-	u64 addr;
 
 	if (unlikely(skb->len > tx->mtu)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
@@ -530,20 +540,19 @@ void ipoib_cm_send(struct net_device *de
 	 */
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
-	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++priv->stats.tx_errors;
 		dev_kfree_skb_any(skb);
 		return;
 	}
 
-	tx_req->mapping = addr;
-
 	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-			        addr, skb->len))) {
+			        tx_req->mapping, skb_headlen(skb),
+				skb_shinfo(skb)->frags,
+				skb_shinfo(skb)->nr_frags))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++priv->stats.tx_errors;
-		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
 	} else {
 		dev->trans_start = jiffies;
@@ -577,7 +586,7 @@ static void ipoib_cm_handle_tx_wc(struct
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+	ipoib_dma_unmap_tx(priv->ca, tx_req);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++priv->stats.tx_packets;
@@ -814,7 +823,7 @@ static struct ib_qp *ipoib_cm_create_tx_
 	attr.recv_cq = priv->cq;
 	attr.srq = priv->cm.srq;
 	attr.cap.max_send_wr = ipoib_sendq_size;
-	attr.cap.max_send_sge = 1;
+	attr.cap.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1;
 	attr.sq_sig_type = IB_SIGNAL_ALL_WR;
 	attr.qp_type = IB_QPT_RC;
 	attr.send_cq = cq;
@@ -981,8 +990,7 @@ static void ipoib_cm_tx_destroy(struct i
 	if (p->tx_ring) {
 		while ((int) p->tx_tail - (int) p->tx_head < 0) {
 			tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-			ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
-					 DMA_TO_DEVICE);
+			ipoib_dma_unmap_tx(priv->ca, tx_req);
 			dev_kfree_skb_any(tx_req->skb);
 			++p->tx_tail;
 		}
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-09-11 21:14:35.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-09-11 21:15:25.000000000 +0300
@@ -170,6 +170,7 @@ static void ipoib_ib_handle_rx_wc(struct
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
 	struct sk_buff *skb;
+	struct ipoib_header *header;
 	u64 addr;
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
@@ -220,7 +221,10 @@ static void ipoib_ib_handle_rx_wc(struct
 	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
 
-	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+	header = (struct ipoib_header *)skb->data;
+	skb->protocol = header->proto;
+	if (header->flags & cpu_to_be16(IPOIB_HEADER_F_HWCSUM))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
 
@@ -257,8 +261,7 @@ static void ipoib_ib_handle_tx_wc(struct
 
 	tx_req = &priv->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping,
-			    tx_req->skb->len, DMA_TO_DEVICE);
+	ipoib_dma_unmap_tx(priv->ca, tx_req);
 
 	++priv->stats.tx_packets;
 	priv->stats.tx_bytes += tx_req->skb->len;
@@ -343,16 +346,23 @@ void ipoib_ib_completion(struct ib_cq *c
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    unsigned int wr_id,
 			    struct ib_ah *address, u32 qpn,
-			    u64 addr, int len)
+			    u64 *mapping, int headlen,
+			    skb_frag_t *frags,
+			    int nr_frags)
 {
 	struct ib_send_wr *bad_wr;
+	int i;
 
-	priv->tx_sge.addr             = addr;
-	priv->tx_sge.length           = len;
-
-	priv->tx_wr.wr_id 	      = wr_id;
-	priv->tx_wr.wr.ud.remote_qpn  = qpn;
-	priv->tx_wr.wr.ud.ah 	      = address;
+	priv->tx_sge[0].addr         = mapping[0];
+	priv->tx_sge[0].length       = headlen;
+	for (i = 0; i < nr_frags; ++i) {
+		priv->tx_sge[i + 1].addr = mapping[i + 1];
+		priv->tx_sge[i + 1].length = frags[i].size;
+	}
+	priv->tx_wr.num_sge          = nr_frags + 1;
+	priv->tx_wr.wr_id 	     = wr_id;
+	priv->tx_wr.wr.ud.remote_qpn = qpn;
+	priv->tx_wr.wr.ud.ah 	     = address;
 
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
@@ -362,7 +372,6 @@ void ipoib_send(struct net_device *dev, 
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_tx_buf *tx_req;
-	u64 addr;
 
 	if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
@@ -385,20 +394,19 @@ void ipoib_send(struct net_device *dev, 
 	 */
 	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
-	addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
-				 DMA_TO_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++priv->stats.tx_errors;
 		dev_kfree_skb_any(skb);
 		return;
 	}
-	tx_req->mapping = addr;
 
 	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-			       address->ah, qpn, addr, skb->len))) {
+			       address->ah, qpn,
+			       tx_req->mapping, skb_headlen(skb),
+			       skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++priv->stats.tx_errors;
-		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
 	} else {
 		dev->trans_start = jiffies;
@@ -604,10 +612,7 @@ int ipoib_ib_dev_stop(struct net_device 
 			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
 				tx_req = &priv->tx_ring[priv->tx_tail &
 							(ipoib_sendq_size - 1)];
-				ib_dma_unmap_single(priv->ca,
-						    tx_req->mapping,
-						    tx_req->skb->len,
-						    DMA_TO_DEVICE);
+				ipoib_dma_unmap_tx(priv->ca, tx_req);
 				dev_kfree_skb_any(tx_req->skb);
 				++priv->tx_tail;
 			}
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-09-11 21:15:24.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-09-11 21:15:25.000000000 +0300
@@ -55,11 +55,14 @@ MODULE_LICENSE("Dual BSD/GPL");
 
 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+static int ipoib_hw_csum __read_mostly = 0;
 
 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+module_param_named(hw_csum, ipoib_hw_csum, int, 0444);
+MODULE_PARM_DESC(hw_csum, "Enable S/G and rely on hardware end-to-end checksum (ICRC) if nonzero");
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
@@ -804,11 +807,16 @@ static int ipoib_hard_header(struct sk_b
 			     void *daddr, void *saddr, unsigned len)
 {
 	struct ipoib_header *header;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 
 	header->proto = htons(type);
-	header->reserved = 0;
+	if (test_bit(IPOIB_FLAG_HW_CSUM, &priv->flags) &&
+	    skb->ip_summed == CHECKSUM_PARTIAL)
+		header->flags = cpu_to_be16(IPOIB_HEADER_F_HWCSUM);
+	else
+		header->flags = 0;
 
 	/*
 	 * If we don't have a neighbour structure, stuff the
@@ -1006,6 +1014,10 @@ static void ipoib_setup(struct net_devic
 	dev->type 		 = ARPHRD_INFINIBAND;
 	dev->tx_queue_len 	 = ipoib_sendq_size * 2;
 	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+	if (ipoib_hw_csum) {
+		dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
+		set_bit(IPOIB_FLAG_HW_CSUM, &priv->flags);
+	}
 
 	/* MTU will be reset when mcast join happens */
 	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-09-11 21:14:35.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-09-11 21:15:25.000000000 +0300
@@ -149,14 +149,14 @@ int ipoib_transport_dev_init(struct net_
 		.cap = {
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
-			.max_send_sge = 1,
+			.max_send_sge = MAX_SKB_FRAGS + 1,
 			.max_recv_sge = 1
 		},
 		.sq_sig_type = IB_SIGNAL_ALL_WR,
 		.qp_type     = IB_QPT_UD
 	};
 
-	int ret, size;
+	int i, ret, size;
 
 	priv->pd = ib_alloc_pd(priv->ca);
 	if (IS_ERR(priv->pd)) {
@@ -197,11 +197,11 @@ int ipoib_transport_dev_init(struct net_
 	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
 	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
-	priv->tx_sge.lkey 	= priv->mr->lkey;
+	for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
+		priv->tx_sge[i].lkey    = priv->mr->lkey;
 
 	priv->tx_wr.opcode 	= IB_WR_SEND;
-	priv->tx_wr.sg_list 	= &priv->tx_sge;
-	priv->tx_wr.num_sge 	= 1;
+	priv->tx_wr.sg_list 	= priv->tx_sge;
 	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
 
 	return 0;