[ofa-general] [PATCH 13 of 17]: add LRO support

Eli Cohen eli at mellanox.co.il
Tue Sep 11 08:54:55 PDT 2007


Add Large Receive Offload support to IPOIB

Reduce the overhead incurred by handling many small packets
by aggregating SKBs that belong to the same TCP stream and passing
them up the stack as a single large SKB. This patch is based on the
work done for MTNIC by Liran Liss <liranl at mellanox.co.il>

Signed-off-by: Eli Cohen <eli at mellanox.co.il>

---

Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/Makefile
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/Makefile	2007-09-11 21:15:29.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/Makefile	2007-09-11 21:15:29.000000000 +0300
@@ -5,7 +5,8 @@ ib_ipoib-y					:= ipoib_main.o \
 						   ipoib_multicast.o \
 						   ipoib_verbs.o \
 						   ipoib_vlan.o \
-						   ipoib_etool.o
+						   ipoib_etool.o \
+						   ipoib_lro.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
 
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-11 21:15:29.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-11 21:15:29.000000000 +0300
@@ -95,6 +95,8 @@ enum {
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
 	IPOIB_MCAST_FLAG_BUSY 	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+	MAX_LRO_SESSIONS	  = 1 << 5, /* must be power of 2 */
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -281,6 +283,30 @@ struct ipoib_cm_dev_priv {
 	struct ib_recv_wr       rx_wr;
 };
 
+struct ipoib_lro {
+	struct hlist_node node;		/* link on lro_free or in an lro_hash bucket */
+	struct hlist_node flush_node;	/* link on lro_flush while the session is active */
+
+	/* Id fields come first: */
+	u32 saddr;		/* copied unconverted from iph->saddr */
+	u32 daddr;		/* copied unconverted from iph->daddr */
+	u32 sport_dport;	/* 32-bit word read at &th->source (both ports) */
+	u32 next_seq;		/* host-order sequence number expected next */
+	u16 tot_len;		/* host-order IP total length accumulated so far */
+	u8 psh;			/* OR of the PSH bits of the merged segments */
+
+	u32 tsval;		/* latest TSval, host order; 0xffffffff if no timestamp */
+	__be32 tsecr;		/* latest TSecr as read from the packet */
+	__be32 ack_seq;		/* ack_seq of the most recent segment */
+	__be16 window;		/* window of the most recent segment */
+	u16 has_vlan;		/* NOTE(review): not referenced by the LRO code in this patch */
+	u16 has_timestamp;	/* first segment carried the aligned timestamp option */
+
+	unsigned long expires;	/* jiffies deadline tested by ipoib_lro_flush() */
+	struct sk_buff *head;	/* first skb of the session; carries the headers */
+	struct sk_buff *tail;	/* skb most recently added to the session */
+};
+
 /*
  * Device private locking: tx_lock protects members used in TX fast
  * path (and we use LLTX so upper layers don't do extra locking).
@@ -357,6 +383,11 @@ struct ipoib_dev_priv {
 	struct dentry *mcg_dentry;
 	struct dentry *path_dentry;
 #endif
+
+	struct hlist_head   *lro_hash;
+	struct hlist_head    lro_free;
+	struct hlist_head    lro_flush;
+	int		     lro_sz; /* must be 2^x */
 };
 
 struct ipoib_ah {
@@ -498,6 +529,11 @@ void ipoib_drain_cq(struct net_device *d
 
 void ipoib_set_ethtool_ops(struct net_device *dev);
 
+int ipoib_lro_init(struct ipoib_dev_priv *priv, int num_lro);
+void ipoib_lro_destroy(struct ipoib_dev_priv *priv);
+int ipoib_lro_rx(struct ipoib_dev_priv *priv, struct sk_buff *skb);
+void ipoib_lro_flush(struct ipoib_dev_priv *priv, int all);
+
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
 #define IPOIB_FLAGS_RC          0x80
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-09-11 21:15:28.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-09-11 21:15:29.000000000 +0300
@@ -239,8 +239,11 @@ static void ipoib_ib_handle_rx_wc(struct
 	skb->pkt_type = PACKET_HOST;
 
 	/* check rx csum */
-	if (test_bit(IPOIB_FLAG_RX_CSUM, &priv->flags) && likely(wc->csum_ok))
+	if (test_bit(IPOIB_FLAG_RX_CSUM, &priv->flags) && likely(wc->csum_ok)) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (!ipoib_lro_rx(priv, skb))
+			goto repost;
+	}
 
 	netif_receive_skb(skb);
 
@@ -332,13 +335,13 @@ int ipoib_poll(struct net_device *dev, i
 	*budget    -= done;
 
 	if (empty) {
+		ipoib_lro_flush(priv, 1);
 		netif_rx_complete(dev);
 		if (unlikely(ib_req_notify_cq(priv->cq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
 		    netif_rx_reschedule(dev, 0))
 			return 1;
-
 		return 0;
 	}
 
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- ofa_1_3_dev_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-09-11 21:15:29.000000000 +0300
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-09-11 21:15:29.000000000 +0300
@@ -1165,7 +1165,7 @@ static struct net_device *ipoib_add_port
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
 		       hca->name, port, result);
-		goto alloc_mem_failed;
+		goto device_init_failed;
 	}
 
 	/*
@@ -1181,7 +1181,7 @@ static struct net_device *ipoib_add_port
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
 		       hca->name, port, result);
-		goto alloc_mem_failed;
+		goto device_init_failed;
 	} else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
@@ -1211,6 +1211,9 @@ static struct net_device *ipoib_add_port
 		priv->dev->features |= NETIF_F_TSO;
 
 
+	if (ipoib_lro_init(priv, MAX_LRO_SESSIONS))
+		goto lro_init_failed;
+
 	result = register_netdev(priv->dev);
 	if (result) {
 		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
@@ -1236,6 +1239,9 @@ sysfs_failed:
 	unregister_netdev(priv->dev);
 
 register_failed:
+	ipoib_lro_destroy(priv);
+
+lro_init_failed:
 	ib_unregister_event_handler(&priv->event_handler);
 	flush_scheduled_work();
 
@@ -1295,6 +1301,7 @@ static void ipoib_remove_one(struct ib_d
 	dev_list = ib_get_client_data(device, &ipoib_client);
 
 	list_for_each_entry_safe(priv, tmp, dev_list, list) {
+		ipoib_lro_destroy(priv);
 		ib_unregister_event_handler(&priv->event_handler);
 		flush_scheduled_work();
 
Index: ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_lro.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_1_3_dev_kernel/drivers/infiniband/ulp/ipoib/ipoib_lro.c	2007-09-11 21:15:29.000000000 +0300
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies LTD. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param_named(lro_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(lro_data_debug_level,
+		 "Enable data path debug tracing for lro code if > 0");
+#endif
+
+#include "ipoib.h"
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+
+/* LRO hash: sum of the second byte (network-order LSB) of the TCP source and
+ * destination ports, masked to the table; size must be a power of two */
+#define LRO_INDEX(th, size) \
+	((*((u8 *)&(th)->source + 1) + *((u8 *)&(th)->dest + 1)) & ((size) - 1))
+
+
+/* Allocate the session hash table and park num_lro zeroed sessions on the free
+ * list. Returns 0, or -ENOMEM (after ipoib_lro_destroy() if a session failed). */
+int ipoib_lro_init(struct ipoib_dev_priv *priv, int num_lro)
+{
+	struct ipoib_lro *session;
+	int n;
+
+	INIT_HLIST_HEAD(&priv->lro_free);
+	INIT_HLIST_HEAD(&priv->lro_flush);
+	priv->lro_hash = kmalloc(num_lro * sizeof(*priv->lro_hash), GFP_KERNEL);
+	if (!priv->lro_hash)
+		return -ENOMEM;
+
+	for (n = 0; n < num_lro; n++) {
+		INIT_HLIST_HEAD(&priv->lro_hash[n]);
+		session = kzalloc(sizeof(*session), GFP_KERNEL);
+		if (!session) {
+			ipoib_lro_destroy(priv);
+			return -ENOMEM;
+		}
+		INIT_HLIST_NODE(&session->node);
+		INIT_HLIST_NODE(&session->flush_node);
+		hlist_add_head(&session->node, &priv->lro_free);
+	}
+	priv->lro_sz = num_lro;
+	return 0;
+}
+
+void ipoib_lro_destroy(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_lro *lro;
+	struct hlist_node *node, *tmp;
+	/* NOTE(review): only lro_free is walked; sessions still on lro_hash/lro_flush are not freed */
+	hlist_for_each_entry_safe(lro, node, tmp, &priv->lro_free, node) {
+		hlist_del(&lro->node);
+		kfree(lro);
+	}
+	kfree(priv->lro_hash);	/* the pointer itself is not reset to NULL */
+}
+
+static inline int skb_valid_for_lro(const struct sk_buff *skb)
+{
+	const struct iphdr *hdr = (const struct iphdr *)(skb->data);
+
+	/* FIXME: mlx4 hw can supply all these tests in the ipoib status
+	   field - need to change the implementation so that this value is
+	   passed up to the caller */
+	/* Returns nonzero when the IP header at skb->data is eligible for LRO:
+	 * - protocol is TCP and there are no IP options (ihl == 5)
+	 * - not a fragment: MF clear AND fragment offset zero (mask 0x3fff),
+	 *   so the last fragment of a datagram (MF clear) is rejected too */
+	return hdr->protocol == IPPROTO_TCP && hdr->ihl == 5 &&
+		!(hdr->frag_off & htons(0x3fff));
+}
+
+/* Find the active session for this address/port tuple; NULL if none. */
+static struct ipoib_lro *lro_find_session(struct ipoib_dev_priv *priv,
+					  const struct iphdr *iph,
+					  const struct tcphdr *th)
+{
+	int index = LRO_INDEX(th, priv->lro_sz);
+	struct hlist_head *bucket = &priv->lro_hash[index];
+	struct hlist_node *cursor;
+	struct ipoib_lro *cand;
+
+	ipoib_dbg_data(priv, "Searching session at index:%d\n", index);
+	hlist_for_each_entry(cand, cursor, bucket, node) {
+		if (cand->saddr != iph->saddr || cand->daddr != iph->daddr)
+			continue;
+		if (cand->sport_dport == *((__be32 *)&th->source))
+			return cand;
+	}
+	return NULL;
+}
+
+static void lro_flush_single(struct ipoib_dev_priv *priv,
+			     struct ipoib_lro *lro)
+{
+	struct sk_buff *skb = lro->head;
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct tcphdr *th = (struct tcphdr *)(iph + 1);
+	struct net_device *dev = priv->dev;
+	u32 *ts;
+	/* Rewrite the head skb's headers from the session state, then deliver it */
+	/* Update IP length and checksum */
+	iph->tot_len = htons(lro->tot_len);
+	iph->check = 0;
+	iph->check = ip_fast_csum(iph, sizeof(*iph) >> 2);
+
+	/* Update latest TCP ack, window, psh, and timestamp */
+	th->ack_seq = lro->ack_seq;
+	th->window = lro->window;
+	th->psh = !!lro->psh;
+	if (lro->has_timestamp) {
+		ts = (u32 *) (th + 1);	/* ts[0]: NOP/NOP/kind/len word checked in ipoib_lro_rx */
+		ts[1] = htonl(lro->tsval);
+		ts[2] = lro->tsecr;
+	}
+
+	ipoib_dbg_data(priv, "Flushing LRO session (%p) - tot_len:%d\n",
+		  lro, lro->tot_len);
+
+	netif_receive_skb(skb);
+	dev->last_rx = jiffies;
+
+	/* TBD Increment stats ?? */
+
+	/* Move session back to the free list (head/tail are not cleared here) */
+	ipoib_dbg_data(priv, "Returning LRO session to free list\n");
+	hlist_del(&lro->node);
+	hlist_del(&lro->flush_node);
+	hlist_add_head(&lro->node, &priv->lro_free);
+}
+
+/* Strip the IP/TCP headers from skb and chain its payload behind lro->head. */
+static void lro_append(struct ipoib_dev_priv *priv, struct ipoib_lro *lro,
+		       struct sk_buff *skb, int tcp_len, int tcp_hlen)
+{
+	struct sk_buff *first = lro->head;
+
+	ipoib_dbg_data(priv, "append %d bytes\n", tcp_len);
+	skb_pull(skb, sizeof(struct iphdr) + tcp_hlen);
+	/* First extra segment starts the frag_list; later ones follow the tail */
+	if (!skb_shinfo(first)->frag_list)
+		skb_shinfo(first)->frag_list = skb;
+	else
+		lro->tail->next = skb;
+	lro->tail = skb;
+	first->len += tcp_len;
+	first->data_len += tcp_len;
+	first->truesize += skb->truesize;
+}
+
+/* Pop a session off the free list; returns NULL when none are available. */
+static struct ipoib_lro *lro_alloc_session(struct ipoib_dev_priv *priv)
+{
+	struct hlist_node *first = priv->lro_free.first;
+	struct ipoib_lro *session;
+
+	if (!first)
+		return NULL;
+	session = hlist_entry(first, struct ipoib_lro, node);
+	hlist_del(&session->node);
+	return session;
+}
+
+/* Try to absorb skb (IP header at skb->data) into an LRO session.
+ * Returns 0 when the skb was consumed (kept by a session or freed);
+ * returns -1 when the caller must pass the skb up the stack itself. */
+int ipoib_lro_rx(struct ipoib_dev_priv *priv, struct sk_buff *skb)
+{
+	struct ipoib_lro *lro;
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	int tcp_hlen;
+	int tcp_data_len;
+	u16 ip_len;
+	u32 *ts;
+	u32 seq;
+	u32 tsval = 0xffffffff;
+	__be32 tsecr = 0;
+
+	if (unlikely(!skb_valid_for_lro(skb)))
+		return -1;
+
+	/* Get pointer to TCP header */
+	iph = (struct iphdr *)(skb->data);
+	th = (struct tcphdr *)(iph + 1);
+
+	/* We only handle aligned timestamp options */
+	tcp_hlen = th->doff << 2;
+	if (tcp_hlen == sizeof *th + TCPOLEN_TSTAMP_ALIGNED) {
+		ts = (u32 *)(th + 1);
+		if (unlikely(*ts != htonl((TCPOPT_NOP << 24) |
+					  (TCPOPT_NOP << 16) |
+					  (TCPOPT_TIMESTAMP << 8) |
+					  TCPOLEN_TIMESTAMP)))
+			return -1;
+
+		tsval = ntohl(ts[1]);
+		tsecr = ts[2];
+		ipoib_dbg_data(priv, "Found ts:0x%x tsecr:0x%x\n", tsval,
+			  ntohl(tsecr));
+	} else if (tcp_hlen != sizeof(*th)) {
+		ipoib_dbg_data(priv, "Cannot LRO - tcp options\n");
+		return -1;
+	}
+
+	/* At this point we know we have a TCP packet that is likely to be
+	 * eligible for LRO. Therefore, see now if we have an outstanding
+	 * session that corresponds to this packet so we could flush it if
+	 * something still prevents LRO */
+	lro = lro_find_session(priv, iph, th);
+	ipoib_dbg_data(priv, "%s LRO session\n", lro ? "Found" : "Unrecognized");
+
+	/* ensure no bits set besides ack or psh */
+	if (th->fin || th->syn || th->rst || th->urg || th->ece ||
+	    th->cwr || !th->ack) {
+		ipoib_dbg_data(priv, "Cannot LRO - tcp flags\n");
+		if (lro)
+			lro_flush_single(priv, lro);
+		return -1;
+	}
+
+	ip_len = ntohs(iph->tot_len);
+	/* Get TCP payload length */
+	tcp_data_len = ip_len - tcp_hlen - sizeof(struct iphdr);
+	seq = ntohl(th->seq);
+	ipoib_dbg_data(priv, "ip_len:%d ip_hlen:%d tcp_hlen:%d tcp_data_len:%d\n",
+		  ip_len, iph->ihl * 4, tcp_hlen, tcp_data_len);
+
+	if (lro) {
+		ipoib_dbg_data(priv, "Extending LRO (%p) session with "
+				"current tot_len:%d\n", lro, lro->tot_len);
+
+		/* Check sequence number */
+		if (unlikely(seq != lro->next_seq)) {
+			ipoib_dbg_data(priv, "Sequence mismatch (got: 0x%08x, "
+				  "expected:0x%08x)\n", seq, lro->next_seq);
+			lro_flush_single(priv, lro);
+			return -1;
+		}
+
+		/* If the cumulative IP length is over 64K, flush and start
+		 * a new session */
+		if (lro->tot_len + tcp_data_len > 0xffff) {
+			ipoib_dbg_data(priv, "LRO 64K exceeded - "
+				  "starting new session\n");
+			lro_flush_single(priv, lro);
+			goto new_session;
+		}
+
+		/* Check timestamps; flush first so delivery order is preserved */
+		if (tcp_hlen != sizeof(*th) &&
+		    unlikely(lro->tsval > tsval || !tsecr)) {
+			ipoib_dbg_data(priv, "LRO  - bad timestamp\n");
+			lro_flush_single(priv, lro);
+			return -1;
+		}
+		/* Update session */
+		lro->psh |= th->psh;
+		lro->next_seq += tcp_data_len;
+		lro->tot_len += tcp_data_len;
+		lro->tsval = tsval;
+		lro->tsecr = tsecr;
+		lro->ack_seq = th->ack_seq;
+		lro->window = th->window;
+
+		if (likely(tcp_data_len))
+			lro_append(priv, lro, skb, tcp_data_len, tcp_hlen);
+		else
+			dev_kfree_skb_any(skb);
+#ifdef IPOIB_LRO_FLUSH_PSH
+		if (th->psh)
+			lro_flush_single(priv, lro);
+#endif
+
+		return 0;
+	}
+
+new_session:
+	ipoib_dbg_data(priv, "LRO session not found - allocating new\n");
+#ifdef IPOIB_LRO_FLUSH_PSH
+	if (th->psh) {
+		ipoib_dbg_data(priv, "Aborting new session due to set psh bit\n");
+		return -1;
+	}
+#endif
+
+	lro = lro_alloc_session(priv);
+	if (likely(lro)) {
+		int index;
+
+		/* Add in the skb */
+		lro->head = skb;
+		lro->tail = skb;
+
+		/* Initialize session */
+		lro->saddr = iph->saddr;
+		lro->daddr = iph->daddr;
+		lro->sport_dport = *((u32 *)&th->source);
+
+		lro->next_seq = seq + tcp_data_len;
+		lro->tot_len = ip_len;
+		lro->psh = th->psh;
+		lro->ack_seq = th->ack_seq;
+		lro->window = th->window;
+
+		/* Handle timestamps */
+		if (tcp_hlen != sizeof(*th)) {
+			lro->tsval = tsval;
+			lro->tsecr = tsecr;
+			lro->has_timestamp = 1;
+		} else {
+			lro->tsval = 0xffffffff;
+			lro->has_timestamp = 0;
+		}
+
+		/* Activate this session */
+		lro->expires = jiffies + HZ / 25;
+		index = LRO_INDEX(th, priv->lro_sz);
+
+		ipoib_dbg_data(priv, "Inserting session (%p) to list at index:%d\n",
+			  lro, index);
+		hlist_add_head(&lro->node, &priv->lro_hash[index]);
+		hlist_add_head(&lro->flush_node, &priv->lro_flush);
+		return 0;
+	} else
+		ipoib_dbg_data(priv, "No more LRO sessions\n");
+
+	return -1;
+}
+
+
+/* Deliver pending LRO sessions: all of them when 'all' is nonzero, otherwise
+ * only those whose 'expires' deadline has already passed. */
+void ipoib_lro_flush(struct ipoib_dev_priv *priv, int all)
+{
+	struct hlist_node *cur, *next;
+	struct ipoib_lro *sess;
+
+	ipoib_dbg_data(priv, "LRO flush called with all:%d at jiffies:%lu\n",
+		  all, jiffies);
+	hlist_for_each_entry_safe(sess, cur, next, &priv->lro_flush,
+				  flush_node) {
+		if (!all && !time_after(jiffies, sess->expires))
+			continue;
+		ipoib_dbg_data(priv, "Flushing session - saddr:0x%x "
+			  "daddr:0x%x sport:%d dport:%d expires:%lu\n",
+			  be32_to_cpu(sess->saddr), be32_to_cpu(sess->daddr),
+			  be16_to_cpu(*((__be16 *)&sess->sport_dport)),
+			  be16_to_cpu(*((__be16 *)&sess->sport_dport + 1)),
+			  sess->expires);
+		lro_flush_single(priv, sess);
+	}
+}




More information about the general mailing list