[ofa-general] [PATCH v4 for-2.6.27] IPOIB: add LRO support.
Vladimir Sokolovsky
vladsk at gmail.com
Sun Jun 29 11:33:48 PDT 2008
From c859eed63331e43993b0c3f0c903e48d1e27def2 Mon Sep 17 00:00:00 2001
From: Vladimir Sokolovsky <vlad at mellanox.co.il>
Date: Mon, 23 Jun 2008 15:38:27 +0300
Subject: [PATCH] IPOIB: add LRO support.
add "ipoib_use_lro" module parameter to enable LRO.
add "ipoib_lro_max_aggr" module parameter to set
the Max number of packets to be aggregated.
LRO statistics accessible through ethtool.
Signed-off-by: Vladimir Sokolovsky <vlad at mellanox.co.il>
Signed-off-by: Eli Cohen <eli at mellanox.co.il>
---
Changes from v3:
Added comment for checksum validation.
drivers/infiniband/ulp/ipoib/Kconfig | 1 +
drivers/infiniband/ulp/ipoib/ipoib.h | 13 +++++
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 52 +++++++++++++++++++++
drivers/infiniband/ulp/ipoib/ipoib_ib.c | 8 +++-
drivers/infiniband/ulp/ipoib/ipoib_main.c | 62 ++++++++++++++++++++++++++
5 files changed, 135 insertions(+), 1 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index 1f76bad..691525c 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,6 +1,7 @@
config INFINIBAND_IPOIB
tristate "IP-over-InfiniBand"
depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+ select INET_LRO
---help---
Support for the IP-over-InfiniBand protocol (IPoIB). This
transports IP packets over InfiniBand so you can use your IB
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 8754b36..985ee87 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -50,6 +50,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_sa.h>
+#include <linux/inet_lro.h>
/* constants */
@@ -94,6 +95,9 @@ enum {
IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */
IPOIB_MCAST_FLAG_ATTACHED = 3,
+ IPOIB_MAX_LRO_DESCRIPTORS = 8,
+ IPOIB_LRO_MAX_AGGR = 64,
+
MAX_SEND_CQE = 16,
IPOIB_CM_COPYBREAK = 256,
};
@@ -248,6 +252,11 @@ struct ipoib_ethtool_st {
u16 max_coalesced_frames;
};
+struct ipoib_lro {
+ struct net_lro_mgr lro_mgr;
+ struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS];
+};
+
/*
* Device private locking: tx_lock protects members used in TX fast
* path (and we use LLTX so upper layers don't do extra locking).
@@ -334,6 +343,8 @@ struct ipoib_dev_priv {
int hca_caps;
struct ipoib_ethtool_st ethtool;
struct timer_list poll_timer;
+
+ struct ipoib_lro lro;
};
struct ipoib_ah {
@@ -679,6 +690,8 @@ extern struct ib_sa_client ipoib_sa_client;
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
extern int ipoib_debug_level;
+extern int ipoib_use_lro;
+extern int ipoib_lro_max_aggr;
#define ipoib_dbg(priv, format, arg...) \
do { \
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 10279b7..86cf3e7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -86,11 +86,63 @@ static int ipoib_set_coalesce(struct net_device *dev,
return 0;
}
+#define IPOIB_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats)
+
+static const char ipoib_gstrings_stats[][ETH_GSTRING_LEN] = {
+ "LRO aggregated", "LRO flushed",
+ "LRO avg aggr", "LRO no_desc"
+};
+
+static void
+ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+ switch (stringset) {
+ case ETH_SS_STATS:
+ memcpy(data, *ipoib_gstrings_stats,
+ sizeof(ipoib_gstrings_stats));
+ data += sizeof(ipoib_gstrings_stats);
+ break;
+ }
+}
+
+static int ipoib_get_sset_count(struct net_device *dev, int sset)
+{
+ switch (sset) {
+ case ETH_SS_STATS:
+ return IPOIB_STATS_LEN;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, uint64_t *data)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int index = 0;
+
+ /* Get LRO statistics */
+ data[index++] = priv->lro.lro_mgr.stats.aggregated;
+ data[index++] = priv->lro.lro_mgr.stats.flushed;
+ if (priv->lro.lro_mgr.stats.flushed)
+ data[index++] = priv->lro.lro_mgr.stats.aggregated /
+ priv->lro.lro_mgr.stats.flushed;
+ else
+ data[index++] = 0;
+ data[index++] = priv->lro.lro_mgr.stats.no_desc;
+
+}
+
static const struct ethtool_ops ipoib_ethtool_ops = {
.get_drvinfo = ipoib_get_drvinfo,
.get_tso = ethtool_op_get_tso,
.get_coalesce = ipoib_get_coalesce,
.set_coalesce = ipoib_set_coalesce,
+ .get_flags = ethtool_op_get_flags,
+ .set_flags = ethtool_op_set_flags,
+ .get_strings = ipoib_get_strings,
+ .get_sset_count = ipoib_get_sset_count,
+ .get_ethtool_stats = ipoib_get_ethtool_stats,
};
void ipoib_set_ethtool_ops(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index eca8518..8bd7db8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -288,7 +288,10 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- netif_receive_skb(skb);
+ if (dev->features & NETIF_F_LRO)
+ lro_receive_skb(&priv->lro.lro_mgr, skb, 0);
+ else
+ netif_receive_skb(skb);
repost:
if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
@@ -448,6 +451,9 @@ poll_more:
goto poll_more;
}
+ if (dev->features & NETIF_F_LRO)
+ lro_flush_all(&priv->lro.lro_mgr);
+
return done;
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index bfe1dbf..1c5f6c4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -60,6 +60,17 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+int ipoib_use_lro __read_mostly;
+module_param_named(ipoib_use_lro, ipoib_use_lro, int, 0444);
+MODULE_PARM_DESC(ipoib_use_lro, " Large Receive Offload, 1: enable, "
+ "0: disable, Default = 0");
+
+int ipoib_lro_max_aggr __read_mostly = IPOIB_LRO_MAX_AGGR;
+module_param_named(ipoib_lro_max_aggr, ipoib_lro_max_aggr, int, 0644);
+MODULE_PARM_DESC(ipoib_lro_max_aggr, " LRO: Max packets to be aggregated. "
+ "Default = 64");
+
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;
@@ -936,6 +947,53 @@ static const struct header_ops ipoib_header_ops = {
.create = ipoib_hard_header,
};
+static int get_skb_hdr(struct sk_buff *skb, void **iphdr,
+ void **tcph, u64 *hdr_flags, void *priv)
+{
+ unsigned int ip_len;
+ struct iphdr *iph;
+
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
+ return -1;
+
+ /* In the future we may add an else cluase that verifies the
+ * checksum and allows devices which do not calculate checksum
+ * to use LRO
+ */
+ if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
+ return -1;
+
+ /* non tcp packet */
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return -1;
+
+ ip_len = ip_hdrlen(skb);
+ skb_set_transport_header(skb, ip_len);
+ *tcph = tcp_hdr(skb);
+
+ /* check if ip header and tcp header are complete */
+ if (iph->tot_len < ip_len + tcp_hdrlen(skb))
+ return -1;
+
+ *hdr_flags = LRO_IPV4 | LRO_TCP;
+ *iphdr = iph;
+
+ return 0;
+}
+
+static void ipoib_lro_setup(struct ipoib_dev_priv *priv)
+{
+ priv->lro.lro_mgr.max_aggr = ipoib_lro_max_aggr;
+ priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS;
+ priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc;
+ priv->lro.lro_mgr.get_skb_header = get_skb_hdr;
+ priv->lro.lro_mgr.features = LRO_F_NAPI;
+ priv->lro.lro_mgr.dev = priv->dev;
+ priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+}
+
static void ipoib_setup(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -975,6 +1033,8 @@ static void ipoib_setup(struct net_device *dev)
priv->dev = dev;
+ ipoib_lro_setup(priv);
+
spin_lock_init(&priv->lock);
spin_lock_init(&priv->tx_lock);
@@ -1152,6 +1212,8 @@ static struct net_device *ipoib_add_port(const char *format,
priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
}
+ if (ipoib_use_lro)
+ priv->dev->features |= NETIF_F_LRO;
/*
* Set the full membership bit, so that we join the right
* broadcast group, etc.
More information about the general
mailing list