[ofa-general] [RFC] IPoIB UD 4K MTU support

Shirley Ma mashirle at us.ibm.com
Tue Jan 22 10:08:41 PST 2008


Hello Roland,

	IPoIB UD currently supports an MTU of at most 2K. Below is a draft patch
that enables IPoIB UD 4K MTU support for any IB device that supports a 4K
MTU, such as the IBM eHCA. The patch keeps each packet within a single page
by setting the IPoIB UD MTU to 4K-48 (40 bytes of GRH, 4 bytes of IPoIB
header, and 4 bytes of padding to align the IP header), which avoids
allocating two contiguous pages when the kernel page size is 4K.
Enabling the IPoIB UD 4K MTU requires the SM to set a 4K MTU on the default
broadcast group, and of course the switch must support a 4K MTU as well.
When the SM sets the default broadcast group MTU to 2K, the IPoIB UD MTU
falls back to 2K.

	I have tested the 2K MTU case; the 4K MTU case is still under test. I am
sending this patch out for review before my testing is finished because I
want comments as early as possible, so that I can integrate them into the
patch and we can hopefully get it into OFED-1.3-rc3, which is due around
Jan. 30.

Thanks
Shirley


diff -urpN ipoib/ipoib.h /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib.h
--- ipoib/ipoib.h	2008-01-21 14:16:19.000000000 -0500
+++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib.h	2008-01-22 15:50:13.000000000 -0500
@@ -56,9 +56,6 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE         = 2048,
-	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN 	  = 4,
 
 	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
@@ -320,6 +317,7 @@ struct ipoib_dev_priv {
 	struct dentry *mcg_dentry;
 	struct dentry *path_dentry;
 #endif
+	unsigned int   max_ib_mtu;
 };
 
 struct ipoib_ah {
@@ -698,4 +696,11 @@ extern int ipoib_debug_level;
 
 #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
 
+/* IPoIB UD MTU for a given IB MTU; a 4K IB MTU is trimmed so the packet fits in one page */
+static inline int ipoib_ud_mtu(unsigned int ib_mtu) 
+{
+	return (ib_mtu < 4096) ? (ib_mtu - IPOIB_ENCAP_LEN) : /* below 4K: only the encap header is subtracted */
+				 (ib_mtu - IB_GRH_BYTES - IPOIB_ENCAP_LEN - 4); /* 4 = padding to align the IP header */
+}
+
 #endif /* _IPOIB_H */
diff -urpN ipoib/ipoib_ib.c /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_ib.c
--- ipoib/ipoib_ib.c	2008-01-10 13:13:12.000000000 -0500
+++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2008-01-22 15:58:16.000000000 -0500
@@ -87,6 +87,15 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static int ipoib_ud_buf_size(unsigned int max_ib_mtu) /* UD receive buffer length for the given max IB MTU */
+{
+	if (max_ib_mtu < 4096)
+		return (max_ib_mtu + IB_GRH_BYTES); /* IB MTU plus room for the GRH */
+	else
+		/* 4K mtu: return 4 bytes less, so the extra 4 bytes added at skb allocation stay within 4096 */
+		return (max_ib_mtu - 4);
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -96,7 +105,7 @@ static int ipoib_ib_post_receive(struct 
 	int ret;
 
 	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
+	list.length   = ipoib_ud_buf_size(priv->max_ib_mtu);
 	list.lkey     = priv->mr->lkey;
 
 	param.next    = NULL;
@@ -108,7 +117,7 @@ static int ipoib_ib_post_receive(struct 
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
 		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
@@ -122,7 +131,7 @@ static int ipoib_alloc_rx_skb(struct net
 	struct sk_buff *skb;
 	u64 addr;
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
+	skb = dev_alloc_skb(ipoib_ud_buf_size(priv->max_ib_mtu) + 4);
 	if (!skb)
 		return -ENOMEM;
 
@@ -133,7 +142,7 @@ static int ipoib_alloc_rx_skb(struct net
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
+	addr = ib_dma_map_single(priv->ca, skb->data, ipoib_ud_buf_size(priv->max_ib_mtu),
 				 DMA_FROM_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		dev_kfree_skb_any(skb);
@@ -190,7 +199,7 @@ static void ipoib_ib_handle_rx_wc(struct
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
 		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -215,7 +224,7 @@ static void ipoib_ib_handle_rx_wc(struct
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ib_dma_unmap_single(priv->ca, addr, ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 
 	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
@@ -632,7 +641,7 @@ int ipoib_ib_dev_stop(struct net_device 
 					continue;
 				ib_dma_unmap_single(priv->ca,
 						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
+						    ipoib_ud_buf_size(priv->max_ib_mtu),
 						    DMA_FROM_DEVICE);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
diff -urpN ipoib/ipoib_main.c /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_main.c
--- ipoib/ipoib_main.c	2008-01-21 14:43:39.000000000 -0500
+++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_main.c	2008-01-22 15:39:44.000000000 -0500
@@ -193,7 +193,7 @@ static int ipoib_change_mtu(struct net_d
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	if (new_mtu > ipoib_ud_mtu(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -978,7 +978,7 @@ static void ipoib_setup(struct net_devic
 	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
 	/* MTU will be reset when mcast join happens */
-	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
+	dev->mtu		 = ipoib_ud_mtu(priv->max_ib_mtu);
 	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
 
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
@@ -1112,6 +1112,7 @@ static struct net_device *ipoib_add_port
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1120,6 +1121,13 @@ static struct net_device *ipoib_add_port
 
 	SET_NETDEV_DEV(priv->dev, hca->dma_device);
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n", 
+		       hca->name, port);
+		goto device_init_failed;
+	}
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
diff -urpN ipoib/ipoib_multicast.c /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
--- ipoib/ipoib_multicast.c	2008-01-10 13:13:12.000000000 -0500
+++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-01-22 15:42:10.000000000 -0500
@@ -567,9 +567,7 @@ void ipoib_mcast_join_task(struct work_s
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
-
+	priv->mcast_mtu = ipoib_ud_mtu(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 




More information about the general mailing list