[ofa-general] [RFC] IPoIB UD 4K MTU support

Eli Cohen eli at dev.mellanox.co.il
Wed Jan 23 07:32:26 PST 2008


Hi Shirley,

Can you send the path to the git tree this patch is based on?

On Tue, 2008-01-22 at 10:08 -0800, Shirley Ma wrote:
> Hello Roland,
> 
> 	IPoIB UD currently supports up to 2K MTU. Below is a draft patch to
> enable IPoIB UD 4K MTU support for any IB device that has a 4K MTU, such
> as IBM eHCA. This patch keeps each packet within one page by setting the
> IPoIB UD MTU to 4K-48 (40 GRH, 4 IPoIB header, 4 padding to align the IP
> header), which avoids allocating two contiguous pages when the kernel
> page size is 4K. Enabling IPoIB UD 4K MTU relies on the SM setting the
> default broadcast group MTU to 4K and, of course, on the switch
> supporting 4K MTU. When the SM's default broadcast group MTU is set to
> 2K, the IPoIB UD MTU will fall back to 2K.
> 
> 	I have tested 2K MTU; 4K MTU is still under testing. I am sending
> this patch out for review before my testing is finished because I want
> comments as early as possible, so that I can integrate them into this
> patch and hopefully we can make it into OFED-1.3-rc3, which is around
> Jan. 30.
> 
> Thanks
> Shirley
> 
> 
> diff -urpN
> ipoib/ipoib.h /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib.h
> --- ipoib/ipoib.h	2008-01-21 14:16:19.000000000 -0500
> +++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib.h
> 2008-01-22 15:50:13.000000000 -0500
> @@ -56,9 +56,6 @@
>  /* constants */
>  
>  enum {
> -	IPOIB_PACKET_SIZE         = 2048,
> -	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
> -
>  	IPOIB_ENCAP_LEN 	  = 4,
>  
>  	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header
> to 16 */
> @@ -320,6 +317,7 @@ struct ipoib_dev_priv {
>  	struct dentry *mcg_dentry;
>  	struct dentry *path_dentry;
>  #endif
> +	unsigned int   max_ib_mtu;
>  };
>  
>  struct ipoib_ah {
> @@ -698,4 +696,11 @@ extern int ipoib_debug_level;
>  
>  #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
>  
> +/* padding packet to fit one page size for 4K IB mtu */
> +static inline int ipoib_ud_mtu(unsigned int ib_mtu) 
> +{
> +	return (ib_mtu < 4096) ? (ib_mtu - IPOIB_ENCAP_LEN) :
> +				 (ib_mtu - IB_GRH_BYTES - IPOIB_ENCAP_LEN - 4);
> +}
> +
>  #endif /* _IPOIB_H */
> diff -urpN
> ipoib/ipoib_ib.c /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> --- ipoib/ipoib_ib.c	2008-01-10 13:13:12.000000000 -0500
> +++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> 2008-01-22 15:58:16.000000000 -0500
> @@ -87,6 +87,15 @@ void ipoib_free_ah(struct kref *kref)
>  	spin_unlock_irqrestore(&priv->lock, flags);
>  }
>  
> +static int ipoib_ud_buf_size(unsigned int max_ib_mtu)
> +{
> +	if (max_ib_mtu < 4096)
> +		return (max_ib_mtu + IB_GRH_BYTES);
> +	else
> +		/* padding packet to one page for 4K mtu */
> +		return (max_ib_mtu - 4);
> +}
> +
>  static int ipoib_ib_post_receive(struct net_device *dev, int id)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -96,7 +105,7 @@ static int ipoib_ib_post_receive(struct 
>  	int ret;
>  
>  	list.addr     = priv->rx_ring[id].mapping;
> -	list.length   = IPOIB_BUF_SIZE;
> +	list.length   = ipoib_ud_buf_size(priv->max_ib_mtu);
>  	list.lkey     = priv->mr->lkey;
>  
>  	param.next    = NULL;
> @@ -108,7 +117,7 @@ static int ipoib_ib_post_receive(struct 
>  	if (unlikely(ret)) {
>  		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
>  		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
> -				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
> +				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
>  		dev_kfree_skb_any(priv->rx_ring[id].skb);
>  		priv->rx_ring[id].skb = NULL;
>  	}
> @@ -122,7 +131,7 @@ static int ipoib_alloc_rx_skb(struct net
>  	struct sk_buff *skb;
>  	u64 addr;
>  
> -	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
> +	skb = dev_alloc_skb(ipoib_ud_buf_size(priv->max_ib_mtu) + 4);
>  	if (!skb)
>  		return -ENOMEM;
>  
> @@ -133,7 +142,7 @@ static int ipoib_alloc_rx_skb(struct net
>  	 */
>  	skb_reserve(skb, 4);
>  
> -	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
> +	addr = ib_dma_map_single(priv->ca, skb->data,
> ipoib_ud_buf_size(priv->max_ib_mtu),
>  				 DMA_FROM_DEVICE);
>  	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
>  		dev_kfree_skb_any(skb);
> @@ -190,7 +199,7 @@ static void ipoib_ib_handle_rx_wc(struct
>  				   "(status=%d, wrid=%d vend_err %x)\n",
>  				   wc->status, wr_id, wc->vendor_err);
>  		ib_dma_unmap_single(priv->ca, addr,
> -				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
> +				    ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
>  		dev_kfree_skb_any(skb);
>  		priv->rx_ring[wr_id].skb = NULL;
>  		return;
> @@ -215,7 +224,7 @@ static void ipoib_ib_handle_rx_wc(struct
>  	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
>  		       wc->byte_len, wc->slid);
>  
> -	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
> +	ib_dma_unmap_single(priv->ca, addr,
> ipoib_ud_buf_size(priv->max_ib_mtu), DMA_FROM_DEVICE);
>  
>  	skb_put(skb, wc->byte_len);
>  	skb_pull(skb, IB_GRH_BYTES);
> @@ -632,7 +641,7 @@ int ipoib_ib_dev_stop(struct net_device 
>  					continue;
>  				ib_dma_unmap_single(priv->ca,
>  						    rx_req->mapping,
> -						    IPOIB_BUF_SIZE,
> +						    ipoib_ud_buf_size(priv->max_ib_mtu),
>  						    DMA_FROM_DEVICE);
>  				dev_kfree_skb_any(rx_req->skb);
>  				rx_req->skb = NULL;
> diff -urpN
> ipoib/ipoib_main.c /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_main.c
> --- ipoib/ipoib_main.c	2008-01-21 14:43:39.000000000 -0500
> +++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_main.c
> 2008-01-22 15:39:44.000000000 -0500
> @@ -193,7 +193,7 @@ static int ipoib_change_mtu(struct net_d
>  		return 0;
>  	}
>  
> -	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
> +	if (new_mtu > ipoib_ud_mtu(priv->max_ib_mtu))
>  		return -EINVAL;
>  
>  	priv->admin_mtu = new_mtu;
> @@ -978,7 +978,7 @@ static void ipoib_setup(struct net_devic
>  	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
>  
>  	/* MTU will be reset when mcast join happens */
> -	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
> +	dev->mtu		 = ipoib_ud_mtu(priv->max_ib_mtu);
>  	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
>  
>  	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
> @@ -1112,6 +1112,7 @@ static struct net_device *ipoib_add_port
>  					 struct ib_device *hca, u8 port)
>  {
>  	struct ipoib_dev_priv *priv;
> +	struct ib_port_attr attr;
>  	int result = -ENOMEM;
>  
>  	priv = ipoib_intf_alloc(format);
> @@ -1120,6 +1121,13 @@ static struct net_device *ipoib_add_port
>  
>  	SET_NETDEV_DEV(priv->dev, hca->dma_device);
>  
> +	if (!ib_query_port(hca, port, &attr))
> +		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
> +	else {
> +		printk(KERN_WARNING "%s: ib_query_port %d failed\n", 
> +		       hca->name, port);
> +		goto device_init_failed;
> +	}
>  	result = ib_query_pkey(hca, port, 0, &priv->pkey);
>  	if (result) {
>  		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
> diff -urpN
> ipoib/ipoib_multicast.c /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> --- ipoib/ipoib_multicast.c	2008-01-10 13:13:12.000000000 -0500
> +++ /home/shirley/ipoib-test/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2008-01-22 15:42:10.000000000 -0500
> @@ -567,9 +567,7 @@ void ipoib_mcast_join_task(struct work_s
>  		return;
>  	}
>  
> -	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
> -		IPOIB_ENCAP_LEN;
> -
> +	priv->mcast_mtu =
> ipoib_ud_mtu(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
>  	if (!ipoib_cm_admin_enabled(dev))
>  		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
>  
> 
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
> 
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general




More information about the general mailing list