[ewg] Re: [ofa-general] Feedback on Developer's Summit

Or Gerlitz ogerlitz at voltaire.com
Tue Nov 6 00:36:46 PST 2007


Roland Dreier wrote:
>> As I said earlier on this thread,  the open issues I see with the stateless
>> offload series are (A) the non interoperable checksum offload patch based on
>> the IB ICRC sent by Michael (and if it is inter-operable, I'd like to be
>> educated how) 

> For (A): as far as I'm concerned, turning off TCP/IP checksums is not
> something we want to do in IPoIB.  Is there anyone arguing in favor of
> it at this point?

Not that someone argues, but they just went and put the below patch in 
OFED 1.3 skipping the review process and maintainer acceptance.  So if 
Dror does not want to stand and refine the approach, eg as Jason 
suggested, maybe we can avoid discussing it at this point - Dror?

Or.


> Add module option hw_csum: when set, IPoIB will report HW CSUM
> and S/G support, and rely on hardware end-to-end transport
> checksum (ICRC) instead of software-level protocol checksums.
> 
> Forwarding such packets outside the IB subnet would increase
> the risk of data corruption, so it is safest not to set
> hw_csum flag on gateways. To reduce the chance of
> this routing triggering data corruption by mistake, on RX
> we set skb checksum field to CHECKSUM_UNNECESSARY - this way
> if such a packet ends up outside the IB network,
> it is detected as malformed and dropped.
> 
> To enable interoperability with IEEE IPoIB, checksum
> for outgoing packets is calculated in software
> unless the remote advertises hw_csum capability
> by setting a bit in hardware address flag.
> 
> Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
> 
> ---
> 
> This patch has to be applied on top of
> [PATCH 2/11] IB/ipoib: support for sending gather skbs.
> 
> Updates since v2: 
> 
> Enable interoperability with IEEE IPoIB.
> Split out S/G support to a separate patch.
> 
> Updates since v1: fixed thinko in setting header flags.
> 
> When applied on top of previously posted mlx4 patches,
> and with hw_csum enabled on both ends, this patch speeds up
> single-stream netperf bandwidth on connectx DDR from 1000
> to 1250 MBytes/sec.
> 
> Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
> ===================================================================
> --- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-24 16:21:10.000000000 +0200
> +++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-24 16:21:29.000000000 +0200
> @@ -86,6 +86,7 @@ enum {
>  	IPOIB_MCAST_STARTED       = 8,
>  	IPOIB_FLAG_NETIF_STOPPED  = 9,
>  	IPOIB_FLAG_ADMIN_CM 	  = 10,
> +	IPOIB_FLAG_HW_CSUM	  = 11,
>  
>  	IPOIB_MAX_BACKOFF_SECONDS = 16,
>  
> @@ -104,9 +105,11 @@ enum {
>  
>  /* structs */
>  
> +#define IPOIB_HEADER_F_HWCSUM 0x1
> +
>  struct ipoib_header {
>  	__be16	proto;
> -	u16	reserved;
> +	__be16	flags;
>  };
>  
>  struct ipoib_pseudoheader {
> @@ -484,6 +487,8 @@ void ipoib_pkey_poll(struct work_struct 
>  int ipoib_pkey_dev_delay_open(struct net_device *dev);
>  void ipoib_drain_cq(struct net_device *dev);
>  
> +#define IPOIB_FLAGS_HWCSUM      0x01
> +
>  #ifdef CONFIG_INFINIBAND_IPOIB_CM
>  
>  #define IPOIB_FLAGS_RC          0x80
> Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> ===================================================================
> --- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-09-24 16:21:10.000000000 +0200
> +++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-09-24 16:21:29.000000000 +0200
> @@ -407,6 +407,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
>  	unsigned long flags;
>  	u64 mapping[IPOIB_CM_RX_SG];
>  	int frags;
> +	struct ipoib_header *header;
>  
>  	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
>  		       wr_id, wc->status);
> @@ -469,7 +470,10 @@ void ipoib_cm_handle_rx_wc(struct net_de
>  
>  	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
>  
> -	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
> +	header = (struct ipoib_header *)skb->data;
> +	skb->protocol = header->proto;
> +	if (header->flags & cpu_to_be16(IPOIB_HEADER_F_HWCSUM))
> +		skb->ip_summed = CHECKSUM_UNNECESSARY;
>  	skb_reset_mac_header(skb);
>  	skb_pull(skb, IPOIB_ENCAP_LEN);
>  
> Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> ===================================================================
> --- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-09-24 16:21:10.000000000 +0200
> +++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-09-24 16:21:29.000000000 +0200
> @@ -170,6 +170,7 @@ static void ipoib_ib_handle_rx_wc(struct
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
>  	struct sk_buff *skb;
> +	struct ipoib_header *header;
>  	u64 addr;
>  
>  	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
> @@ -220,7 +221,10 @@ static void ipoib_ib_handle_rx_wc(struct
>  	skb_put(skb, wc->byte_len);
>  	skb_pull(skb, IB_GRH_BYTES);
>  
> -	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
> +	header = (struct ipoib_header *)skb->data;
> +	skb->protocol = header->proto;
> +	if (header->flags & cpu_to_be16(IPOIB_HEADER_F_HWCSUM))
> +		skb->ip_summed = CHECKSUM_UNNECESSARY;
>  	skb_reset_mac_header(skb);
>  	skb_pull(skb, IPOIB_ENCAP_LEN);
>  
> Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
> ===================================================================
> --- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-09-24 16:18:38.000000000 +0200
> +++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-09-24 16:21:29.000000000 +0200
> @@ -55,11 +55,14 @@ MODULE_LICENSE("Dual BSD/GPL");
>  
>  int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
>  int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
> +static int ipoib_hw_csum __read_mostly = 0;
>  
>  module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
>  MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
>  module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
>  MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
> +module_param_named(hw_csum, ipoib_hw_csum, int, 0444);
> +MODULE_PARM_DESC(hw_csum, "Rely on hardware end-to-end checksum (ICRC) if > 0");
>  
>  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
>  int ipoib_debug_level;
> @@ -804,11 +807,18 @@ static int ipoib_hard_header(struct sk_b
>  			     void *daddr, void *saddr, unsigned len)
>  {
>  	struct ipoib_header *header;
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  
>  	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
>  
>  	header->proto = htons(type);
> -	header->reserved = 0;
> +        if (!test_bit(IPOIB_FLAG_HW_CSUM, &priv->flags) ||
> +	    skb->ip_summed != CHECKSUM_PARTIAL)
> +		header->flags = 0;
> +	else if (daddr && *((char *)daddr) & IPOIB_FLAGS_HWCSUM)
> +		header->flags = cpu_to_be16(IPOIB_HEADER_F_HWCSUM);
> +	else
> +		skb_checksum_help(skb);
>  
>  	/*
>  	 * If we don't have a neighbour structure, stuff the
> @@ -943,6 +953,9 @@ int ipoib_dev_init(struct net_device *de
>  	if (ipoib_ib_dev_init(dev, ca, port))
>  		goto out_tx_ring_cleanup;
>  
> +	if (ipoib_hw_csum)
> +		dev->dev_addr[0] |= IPOIB_FLAGS_HWCSUM;
> +
>  	return 0;
>  
>  out_tx_ring_cleanup:
> @@ -1006,6 +1019,10 @@ static void ipoib_setup(struct net_devic
>  	dev->type 		 = ARPHRD_INFINIBAND;
>  	dev->tx_queue_len 	 = ipoib_sendq_size * 2;
>  	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
> +	if (ipoib_hw_csum) {
> +		dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
> +		set_bit(IPOIB_FLAG_HW_CSUM, &priv->flags);
> +	}
>  
>  	/* MTU will be reset when mcast join happens */
>  	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;




More information about the ewg mailing list