[ofa-general] [PATCH] IPoIB UD 4K MTU support

Shirley Ma mashirle at us.ibm.com
Thu Jan 24 00:14:51 PST 2008


Hello Eli,

Below is the updated patch against Roland's for-2.6.25 tree. 

This patch allows IPoIB UD MTU up to 4K when HCA is capable.
To simplify this patch, the IPoIB MTU size is limited to 4K - IB_GRH_BYTES
- IPOIB_ENCAP_LEN - 4 bytes (padding to align IP header) so we can limit
skb buffer allocation to one page. The node's IPoIB link MTU is the
minimum of the admin-configured MTU (set through ifconfig) and the IPoIB
default broadcast group MTU. When the Subnet Manager enables the default
broadcast group during start up, this subnet's IPoIB link MTU will be the
value of default broadcast group MTU size. Any node whose IB MTU is smaller
than this value can't join this IPoIB subnet. Any node whose IB MTU is
greater than this value will join this IPoIB subnet and
set this value as its link MTU. If Subnet Manager disables default
broadcast group during start up, the first node brought up in this subnet
will create the default IPoIB broadcast group based on the negotiation
with the Subnet Manager.

Signed-off-by: Shirley Ma <xma at us.ibm.com>
---

 drivers/infiniband/ulp/ipoib/ipoib.h           |   11 ++++++++---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c        |   21 ++++++++++++++-------
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   19 ++++++++++++++-----
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    4 +---
 4 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
b/drivers/infiniband/ulp/ipoib/ipoib.h
index fe250c6..af11e2c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -56,9 +56,6 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE	  = 2048,
-	IPOIB_BUF_SIZE		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN		  = 4,
 
 	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
@@ -319,6 +316,7 @@ struct ipoib_dev_priv {
 	struct dentry *mcg_dentry;
 	struct dentry *path_dentry;
 #endif
+	unsigned int   max_ib_mtu;
 };
 
 struct ipoib_ah {
@@ -424,6 +422,13 @@ int ipoib_mcast_stop_thread(struct net_device *dev,
int flush);
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
 
+/* padding packet to fit one page size for 4K IB mtu */
+static inline int ipoib_ud_mtu(unsigned int ib_mtu) 
+{
+	return (ib_mtu < 4096) ? (ib_mtu - IPOIB_ENCAP_LEN) :
+				 (ib_mtu - IB_GRH_BYTES - IPOIB_ENCAP_LEN - 4);
+}
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 52bc2bd..662ec8e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -87,6 +87,13 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+/* padding packet to fit one page size for 4K IB mtu */
+static int ipoib_ud_buf_size(unsigned int max_ib_mtu)
+{
+	return (max_ib_mtu < 4096) ? (max_ib_mtu + IB_GRH_BYTES) :
+				     (max_ib_mtu - 4);
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -96,7 +103,7 @@ static int ipoib_ib_post_receive(struct net_device
*dev, int id)
 	int ret;
 
 	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
+	list.length   = ipoib_ud_buf_size(priv->max_ib_mtu);
 	list.lkey     = priv->mr->lkey;
 
 	param.next    = NULL;
@@ -108,7 +115,7 @@ static int ipoib_ib_post_receive(struct net_device
*dev, int id)
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
 		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
@@ -122,7 +129,7 @@ static int ipoib_alloc_rx_skb(struct net_device
*dev, int id)
 	struct sk_buff *skb;
 	u64 addr;
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
+	skb = dev_alloc_skb(ipoib_ud_buf_size(priv->max_ib_mtu) + 4);
 	if (!skb)
 		return -ENOMEM;
 
@@ -133,7 +140,7 @@ static int ipoib_alloc_rx_skb(struct net_device
*dev, int id)
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
+	addr = ib_dma_map_single(priv->ca, skb->data,
ipoib_ud_buf_size(priv->max_ib_mtu),
 				 DMA_FROM_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		dev_kfree_skb_any(skb);
@@ -190,7 +197,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device
*dev, struct ib_wc *wc)
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
 		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -215,7 +222,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device
*dev, struct ib_wc *wc)
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ib_dma_unmap_single(priv->ca, addr,
ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 
 	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
@@ -632,7 +639,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int
flush)
 					continue;
 				ib_dma_unmap_single(priv->ca,
 						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
+						    ipoib_ud_buf_size(priv->max_ib_mtu),
 						    DMA_FROM_DEVICE);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index a082466..b7192ca 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -194,7 +194,7 @@ static int ipoib_change_mtu(struct net_device *dev,
int new_mtu)
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	if (new_mtu > ipoib_ud_mtu(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -968,10 +968,6 @@ static void ipoib_setup(struct net_device *dev)
 	dev->tx_queue_len	 = ipoib_sendq_size * 2;
 	dev->features		 = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
-	/* MTU will be reset when mcast join happens */
-	dev->mtu		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu		 = priv->admin_mtu = dev->mtu;
-
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
 	netif_carrier_off(dev);
@@ -1103,6 +1099,7 @@ static struct net_device *ipoib_add_port(const
char *format,
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1111,6 +1108,18 @@ static struct net_device *ipoib_add_port(const
char *format,
 
 	SET_NETDEV_DEV(priv->dev, hca->dma_device);
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n", 
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = ipoib_ud_mtu(priv->max_ib_mtu);
+	priv->mcast_mtu	 = priv->admin_mtu = priv->dev->mtu;
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 2628339..0661e87 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -567,9 +567,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
-
+	priv->mcast_mtu =
ipoib_ud_mtu(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 





More information about the general mailing list