[ofa-general] [RFC] IPoIB UD 4K MTU support

Shirley Ma mashirle at us.ibm.com
Wed Jan 23 13:09:56 PST 2008


Hello Eli,

Here is the patch against Roland's for-2.6.25 git tree. Please let me
know if there are any problems. Thanks for reviewing this patch.

Shirley

Signed-off-by: Shirley Ma <xma at us.ibm.com>

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index fe250c6..af11e2c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -56,9 +56,6 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE	  = 2048,
-	IPOIB_BUF_SIZE		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN		  = 4,
 
 	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
@@ -319,6 +316,7 @@ struct ipoib_dev_priv {
 	struct dentry *mcg_dentry;
 	struct dentry *path_dentry;
 #endif
+	unsigned int   max_ib_mtu;
 };
 
 struct ipoib_ah {
@@ -424,6 +422,13 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
 
+/* padding packet to fit one page size for 4K IB mtu */
+static inline int ipoib_ud_mtu(unsigned int ib_mtu) 
+{
+	return (ib_mtu < 4096) ? (ib_mtu - IPOIB_ENCAP_LEN) :
+				 (ib_mtu - IB_GRH_BYTES - IPOIB_ENCAP_LEN - 4);
+}
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 52bc2bd..d888a47 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -87,6 +87,15 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static int ipoib_ud_buf_size(unsigned int max_ib_mtu)
+{
+	if (max_ib_mtu < 4096)
+		return (max_ib_mtu + IB_GRH_BYTES);
+	else
+		/* padding packet to one page for 4K mtu */
+		return (max_ib_mtu - 4);
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -96,7 +105,7 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id)
 	int ret;
 
 	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
+	list.length   = ipoib_ud_buf_size(priv->max_ib_mtu);
 	list.lkey     = priv->mr->lkey;
 
 	param.next    = NULL;
@@ -108,7 +117,7 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id)
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
 		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
@@ -122,7 +131,7 @@ static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	struct sk_buff *skb;
 	u64 addr;
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
+	skb = dev_alloc_skb(ipoib_ud_buf_size(priv->max_ib_mtu) + 4);
 	if (!skb)
 		return -ENOMEM;
 
@@ -133,7 +142,7 @@ static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
+	addr = ib_dma_map_single(priv->ca, skb->data, ipoib_ud_buf_size(priv->max_ib_mtu),
 				 DMA_FROM_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		dev_kfree_skb_any(skb);
@@ -190,7 +199,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
 		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -215,7 +224,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ib_dma_unmap_single(priv->ca, addr, ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
 
 	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
@@ -632,7 +641,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 					continue;
 				ib_dma_unmap_single(priv->ca,
 						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
+						    ipoib_ud_buf_size(priv->max_ib_mtu),
 						    DMA_FROM_DEVICE);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index a082466..8a994f3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -194,7 +194,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	if (new_mtu > ipoib_ud_mtu(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -969,7 +969,7 @@ static void ipoib_setup(struct net_device *dev)
 	dev->features		 = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
 	/* MTU will be reset when mcast join happens */
-	dev->mtu		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
+	dev->mtu                 = ipoib_ud_mtu(priv->max_ib_mtu);
 	priv->mcast_mtu		 = priv->admin_mtu = dev->mtu;
 
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
@@ -1103,6 +1103,7 @@ static struct net_device *ipoib_add_port(const char *format,
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1111,6 +1112,13 @@ static struct net_device *ipoib_add_port(const char *format,
 
 	SET_NETDEV_DEV(priv->dev, hca->dma_device);
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n", 
+		       hca->name, port);
+		goto device_init_failed;
+	}
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 2628339..0661e87 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -567,9 +567,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
-
+	priv->mcast_mtu = ipoib_ud_mtu(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 





More information about the general mailing list