[ofa-general] 4K MTU patch review

Eli Cohen eli at mellanox.co.il
Wed Feb 6 09:45:40 PST 2008


Hi Shirley,

Based on your patch series, I have created the following patch, which
touches less of the existing code and is therefore smaller. Please
review it and let me know whether you think we can make use of it.
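For reference, below is a minimal user-space sketch (not part of the
patch) of the buffer-split logic it introduces: once the UD receive
buffer (IB MTU plus the 40-byte GRH) no longer fits in a single page,
the buffer is posted as two SG entries -- a small head holding the GRH
and the 4-byte IPoIB encap header, plus one full page. So a 4096-byte
IB MTU yields an IPoIB UD MTU of 4096 - 4 = 4092 bytes. PAGE_SIZE is
assumed to be 4096 here; the constants mirror the ones added to
ipoib.h:

#include <stdio.h>

#define PAGE_SIZE			4096	/* assumed for this sketch */
#define IB_GRH_BYTES			40
#define IPOIB_ENCAP_LEN			4
#define IPOIB_UD_HEAD_SIZE		(IB_GRH_BYTES + IPOIB_ENCAP_LEN)
#define IPOIB_UD_BUF_SIZE(ib_mtu)	((ib_mtu) + IB_GRH_BYTES)

static int ipoib_ud_need_sg(int ib_mtu)
{
	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
}

int main(void)
{
	int mtus[] = { 2048, 4096 };
	int i;

	for (i = 0; i < 2; ++i) {
		if (ipoib_ud_need_sg(mtus[i]))
			/* 4K case: small head SGE plus one page SGE */
			printf("IB MTU %d: 2 SGEs, head %d bytes + %d-byte page\n",
			       mtus[i], IPOIB_UD_HEAD_SIZE, PAGE_SIZE);
		else
			/* 2K case: a single SGE covers GRH + payload */
			printf("IB MTU %d: 1 SGE, %d bytes\n",
			       mtus[i], IPOIB_UD_BUF_SIZE(mtus[i]));
	}
	return 0;
}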


Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2008-02-06 18:57:34.847708000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h	2008-02-06 19:22:02.372113000 +0200
@@ -56,11 +56,11 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE         = 2048,
-	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN 	  = 4,
 
+	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+	IPOIB_UD_RX_SG		  = 2, /* for 4K MTU */
+
 	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
 	IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
 	IPOIB_CM_HEAD_SIZE 	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
@@ -141,9 +141,9 @@ struct ipoib_mcast {
 	struct net_device *dev;
 };
 
-struct ipoib_rx_buf {
+struct ipoib_sg_rx_buf {
 	struct sk_buff *skb;
-	u64		mapping;
+	u64		mapping[IPOIB_UD_RX_SG];
 };
 
 struct ipoib_tx_buf {
@@ -337,7 +337,7 @@ struct ipoib_dev_priv {
 
 	struct net_device      *dev;
 	struct ib_recv_wr	rx_wr_draft[UD_POST_RCV_COUNT];
-	struct ib_sge 		sglist_draft[UD_POST_RCV_COUNT];
+	struct ib_sge 		sglist_draft[UD_POST_RCV_COUNT][IPOIB_UD_RX_SG];
 	unsigned int		rx_outst;
 
 	struct napi_struct napi;
@@ -378,7 +378,7 @@ struct ipoib_dev_priv {
 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
 
-	struct ipoib_rx_buf *rx_ring;
+	struct ipoib_sg_rx_buf *rx_ring;
 
 	spinlock_t           tx_lock;
 	struct ipoib_tx_buf *tx_ring;
@@ -412,6 +412,7 @@ struct ipoib_dev_priv {
 	struct ipoib_ethtool_st etool;
 	struct timer_list poll_timer;
 	struct ib_ah *own_ah;
+	int max_ib_mtu;
 };
 
 struct ipoib_ah {
@@ -452,6 +453,28 @@ struct ipoib_neigh {
 	struct list_head    list;
 };
 
+#define IPOIB_UD_MTU(ib_mtu)		((ib_mtu) - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)	((ib_mtu) + IB_GRH_BYTES)
+
+static inline int ipoib_ud_need_sg(int ib_mtu)
+{
+	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
+}
+
+static inline void ipoib_sg_dma_unmap_rx(struct ipoib_dev_priv *priv,
+					 u64 mapping[IPOIB_UD_RX_SG])
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+				    DMA_FROM_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
+				  DMA_FROM_DEVICE);
+	} else
+		ib_dma_unmap_single(priv->ca, mapping[0],
+				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+				    DMA_FROM_DEVICE);
+}
+
 /*
  * We stash a pointer to our private neighbour information after our
  * hardware address in neigh->ha.  The ALIGN() expression here makes
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2008-02-06 18:57:34.007682000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2008-02-06 19:23:08.903503000 +0200
@@ -96,14 +96,35 @@ static void clean_pending_receives(struc
 
 	for (i = 0; i < priv->rx_outst; ++i) {
 		id = priv->rx_wr_draft[i].wr_id & ~IPOIB_OP_RECV;
-		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-                                            IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
 	priv->rx_outst = 0;
 }
 
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+				   struct sk_buff *skb, unsigned int length)
+{
+	unsigned int size;
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+
+		/* put header into skb */
+		size = min(length, (unsigned)IPOIB_UD_HEAD_SIZE);
+		__skb_put(skb, size);
+
+		length -= size;
+
+		frag->size = length;
+		skb->data_len += length;
+		skb->len += length;
+		skb->truesize += length;
+	} else
+		skb_put(skb, length);
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -111,7 +132,9 @@ static int ipoib_ib_post_receive(struct 
 	int ret = 0;
 	int i = priv->rx_outst;
 
-	priv->sglist_draft[i].addr = priv->rx_ring[id].mapping;
+	priv->sglist_draft[i][0].addr = priv->rx_ring[id].mapping[0];
+	priv->sglist_draft[i][1].addr = priv->rx_ring[id].mapping[1];
+
 	priv->rx_wr_draft[i].wr_id = id | IPOIB_OP_RECV;
 	if (++priv->rx_outst == UD_POST_RCV_COUNT) {
 		ret = ib_post_recv(priv->qp, priv->rx_wr_draft, &bad_wr);
@@ -120,8 +143,8 @@ static int ipoib_ib_post_receive(struct 
 			ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
 			while (bad_wr) {
 				id = bad_wr->wr_id & ~IPOIB_OP_RECV;
-				ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-						    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				ipoib_sg_dma_unmap_rx(priv,
+						      priv->rx_ring[id].mapping);
 				dev_kfree_skb_any(priv->rx_ring[id].skb);
 				priv->rx_ring[id].skb = NULL;
 			}
@@ -136,11 +159,17 @@ static int ipoib_alloc_rx_skb(struct net
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
-	u64 addr;
+	int buf_size;
+	u64 *mapping = priv->rx_ring[id].mapping;
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu))
+		buf_size = IPOIB_UD_HEAD_SIZE;
+	else
+		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-	if (!skb)
-		return -ENOMEM;
+	skb = dev_alloc_skb(buf_size + 4);
+	if (unlikely(!skb))
+		return -ENOMEM;
 
 	/*
 	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -149,17 +178,33 @@ static int ipoib_alloc_rx_skb(struct net
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
-				 DMA_FROM_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-		dev_kfree_skb_any(skb);
-		return -EIO;
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+		goto out_free;
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		struct page *page = alloc_page(GFP_ATOMIC);
+
+		if (!page)
+			goto partial_error;
+
+		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+		mapping[1] = ib_dma_map_page(priv->ca, page, 0, PAGE_SIZE,
+					     DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+			goto partial_error;
 	}
 
-	priv->rx_ring[id].skb     = skb;
-	priv->rx_ring[id].mapping = addr;
+	priv->rx_ring[id].skb = skb;
+	return 0;
 
-	return 0;
+partial_error:
+	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+
+out_free:
+	dev_kfree_skb_any(skb);
+	return -ENOMEM;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
@@ -186,7 +231,6 @@ static void ipoib_ib_handle_rx_wc(struct
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
 	struct sk_buff *skb;
-	u64 addr;
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -198,15 +242,13 @@ static void ipoib_ib_handle_rx_wc(struct
 	}
 
 	skb  = priv->rx_ring[wr_id].skb;
-	addr = priv->rx_ring[wr_id].mapping;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		if (wc->status != IB_WC_WR_FLUSH_ERR)
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -231,9 +273,9 @@ static void ipoib_ib_handle_rx_wc(struct
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 
-	skb_put(skb, wc->byte_len);
+	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
@@ -828,15 +870,13 @@ int ipoib_ib_dev_stop(struct net_device 
 			 * all our pending work requests.
 			 */
 			for (i = 0; i < ipoib_recvq_size; ++i) {
-				struct ipoib_rx_buf *rx_req;
+				struct ipoib_sg_rx_buf *rx_req;
 
 				rx_req = &priv->rx_ring[i];
 
 				if (rx_req->skb) {
-					ib_dma_unmap_single(priv->ca,
-							    rx_req->mapping,
-							    IPOIB_BUF_SIZE,
-							    DMA_FROM_DEVICE);
+					ipoib_sg_dma_unmap_rx(priv,
+							      rx_req->mapping);
 					dev_kfree_skb_any(rx_req->skb);
 					rx_req->skb = NULL;
 				}
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2008-02-06 18:57:29.657377000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c	2008-02-06 19:25:56.553407000 +0200
@@ -193,7 +193,7 @@ static int ipoib_change_mtu(struct net_d
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -981,10 +981,6 @@ static void ipoib_setup(struct net_devic
 	dev->tx_queue_len 	 = ipoib_sendq_size * 2;
 	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
-	/* MTU will be reset when mcast join happens */
-	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
-
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
 	netif_carrier_off(dev);
@@ -1130,6 +1126,7 @@ static struct net_device *ipoib_add_port
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1140,6 +1137,18 @@ static struct net_device *ipoib_add_port
 
 	priv->dev->features |= NETIF_F_HIGHDMA;
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-02-06 18:56:03.372135000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-02-06 19:22:02.386120000 +0200
@@ -567,8 +567,8 @@ void ipoib_mcast_join_task(struct work_s
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
+	priv->mcast_mtu =
+		IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2008-02-06 18:57:34.010682000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2008-02-06 19:22:02.373118000 +0200
@@ -151,7 +151,7 @@ int ipoib_transport_dev_init(struct net_
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
 			.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
-			.max_recv_sge = 1
+			.max_recv_sge = IPOIB_UD_RX_SG
 		},
 		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type     = IB_QPT_UD,
@@ -227,16 +227,28 @@ int ipoib_transport_dev_init(struct net_
 	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
 
 	for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
-		priv->sglist_draft[i].length = IPOIB_BUF_SIZE;
-		priv->sglist_draft[i].lkey = priv->mr->lkey;
-
-		priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i];
-		priv->rx_wr_draft[i].num_sge = 1;
+		priv->sglist_draft[i][0].lkey = priv->mr->lkey;
+		priv->sglist_draft[i][1].lkey = priv->mr->lkey;
+		priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i][0];
 		if (i < UD_POST_RCV_COUNT - 1)
 			priv->rx_wr_draft[i].next = &priv->rx_wr_draft[i + 1];
 	}
 	priv->rx_wr_draft[i].next = NULL;
 
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+			priv->sglist_draft[i][0].length = IPOIB_UD_HEAD_SIZE;
+			priv->sglist_draft[i][1].length = PAGE_SIZE;
+			priv->rx_wr_draft[i].num_sge = IPOIB_UD_RX_SG;
+		}
+	} else {
+		for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+			priv->sglist_draft[i][0].length =
+				IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+			priv->rx_wr_draft[i].num_sge = 1;
+		}
+	}
+
 	return 0;
 
 out_free_scq: