[openib-general] [RFC] [PATCH v2] IB/ipoib: Add bonding support to IPoIB

Moni Shoua monis at voltaire.com
Mon Feb 26 09:07:30 PST 2007


Hi,

This post follows a previous one, regarding required changes to IPoIB to enable
it to work with bonding. Please find it here: http://openib.org/pipermail/openib-general/2007-February/032598.html

This patch version adds fixes to the comments from Michael Tsirkin from the last post.

IPoIB uses a two layer neighboring scheme, such that for each struct neighbour
whose device is an ipoib one, there is a struct ipoib_neigh buddy which is
created on demand at the tx flow by an ipoib_neigh_alloc(skb->dst->neighbour)
call.

When using the bonding driver, neighbours are created by the net stack on behalf
of the bonding (master) device. On the tx flow the bonding code gets an skb such
that skb->dev points to the master device, it changes this skb to point on the
slave device and calls the slave hard_start_xmit function.

Combing these two flows, there is a hole if some code at ipoib
(ipoib_neigh_destructor) assumes that for each struct neighbour it gets, n->dev
is an ipoib device so for example netdev_priv(n->dev) would be of type struct
ipoib_dev_priv.

To fix it, this patch adds a dev field to struct ipoib_neigh which is used
instead of the struct neighbour dev one.

In addition, if an IPoIB device is removed before bonding is unloaded it may 
cause bond0 neighbours (neighbours that point to bond0) to exist after the IPoIB
device no longer exist. This is why a neighbour cleanup is required during device 
cleanup. This cleanup scans the arp cache and the ndisc cache to find there 
neighbours of bond0 which refer also to the relevant ibX. Also, when ib_ipoib module is
unloaded, the neighbour destructor must be set to NULL because the neighbour function is in
ib_ipoib.
For this neigh table cleanup, it is required to export the symbol nd_tbl just like the symbol arp_tbl is.

During my tests I found that when running 

	1. modprobe -r ib_mthca (to delete IPoIB interfaces)
	2. ping somewhere on the subnet of bond0

I get this stack dump (which ends with kernel death)
	 [<ffffffff8037ff32>] skb_under_panic+0x5c/0x60
	 [<ffffffff882e00c2>] :ib_ipoib:ipoib_hard_header+0xa6/0xc0
	 [<ffffffff803c3c98>] arp_create+0x120/0x226
	 [<ffffffff803c3dc3>] arp_send+0x25/0x3b
	 [<ffffffff803c466a>] arp_solicit+0x186/0x195
	 [<ffffffff8038c0ac>] neigh_timer_handler+0x2b5/0x309
	 [<ffffffff8038bdf7>] neigh_timer_handler+0x0/0x309
	 [<ffffffff80239599>] run_timer_softirq+0x130/0x19e
	 [<ffffffff80235fcc>] __do_softirq+0x55/0xc3
	 [<ffffffff8020acac>] call_softirq+0x1c/0x28
	 [<ffffffff8020c02b>] do_softirq+0x2c/0x7d
	 [<ffffffff8021864a>] smp_apic_timer_interrupt+0x57/0x6a
	 [<ffffffff80208e19>] mwait_idle+0x0/0x45
	 [<ffffffff8020a756>] apic_timer_interrupt+0x66/0x70
	 <EOI>  [<ffffffff80208e5b>] mwait_idle+0x42/0x45
	 [<ffffffff80208db1>] cpu_idle+0x8b/0xae
	 [<ffffffff80217d60>] start_secondary+0x47f/0x48f

The only way I found to avoid this (for now) is to check skb headroom in
ipoib_hard_header. I guess that this safety check doesn't harm regular IPoIB 
operation and it seems to solve my problem. However, I would be happy to hear what
others think of this last issue.

I would really appreciate comments.

thanks

 -MoniS

------------------------------------------------------------------------------
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 07deee8..31bc6d8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -216,6 +216,7 @@ struct ipoib_neigh {
 	struct sk_buff_head queue;
 
 	struct neighbour   *neighbour;
+	struct net_device *dev;
 
 	struct list_head    list;
 };
@@ -232,7 +233,8 @@ static inline struct ipoib_neigh **to_ip
 				     INFINIBAND_ALEN, sizeof(void *));
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh);
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
+				      struct net_device *dev);
 void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
 
 extern struct workqueue_struct *ipoib_workqueue;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 705eb1d..0e3953e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,8 @@ #include <linux/ip.h>
 #include <linux/in.h>
 
 #include <net/dst.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
 
 #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
 
@@ -70,6 +72,7 @@ module_param_named(debug_level, ipoib_de
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 #endif
 
+static int ipoib_at_exit = 0;
 struct ipoib_path_iter {
 	struct net_device *dev;
 	struct ipoib_path  path;
@@ -490,7 +493,7 @@ static void neigh_add_path(struct sk_buf
 	struct ipoib_path *path;
 	struct ipoib_neigh *neigh;
 
-	neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+	neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
 	if (!neigh) {
 		++priv->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
@@ -735,6 +738,9 @@ static int ipoib_hard_header(struct sk_b
 {
 	struct ipoib_header *header;
 
+	if (skb_headroom(skb) < sizeof *header) {
+		return -1;
+	}
 	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 
 	header->proto = htons(type);
@@ -746,8 +752,11 @@ static int ipoib_hard_header(struct sk_b
 	 * figure out where to send the packet later.
 	 */
 	if ((!skb->dst || !skb->dst->neighbour) && daddr) {
-		struct ipoib_pseudoheader *phdr =
-			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
+		struct ipoib_pseudoheader *phdr = NULL;
+		if (skb_headroom(skb) < sizeof *phdr) {
+			return -1;
+		}
+		phdr = (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
 		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
 	}
 
@@ -769,32 +778,69 @@ static void ipoib_set_mcast_list(struct 
 static void ipoib_neigh_destructor(struct neighbour *n)
 {
 	struct ipoib_neigh *neigh;
-	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+	struct ipoib_dev_priv *priv;
 	unsigned long flags;
 	struct ipoib_ah *ah = NULL;
 
-	ipoib_dbg(priv,
-		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
-		  IPOIB_QPN(n->ha),
-		  IPOIB_GID_RAW_ARG(n->ha + 4));
-
-	spin_lock_irqsave(&priv->lock, flags);
 
 	neigh = *to_ipoib_neigh(n);
 	if (neigh) {
+		priv = netdev_priv(neigh->dev);
+		ipoib_dbg(priv,
+			  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+			  IPOIB_QPN(n->ha),
+			  IPOIB_GID_RAW_ARG(n->ha + 4));
+
+		spin_lock_irqsave(&priv->lock, flags);
 		if (neigh->ah)
 			ah = neigh->ah;
 		list_del(&neigh->list);
 		ipoib_neigh_free(n->dev, neigh);
+		spin_unlock_irqrestore(&priv->lock, flags);
 	}
-
-	spin_unlock_irqrestore(&priv->lock, flags);
-
 	if (ah)
 		ipoib_put_ah(ah);
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
+static void ipoib_neigh_tbl_cleanup_master(struct neigh_table *tbl,
+					   struct net_device* master,
+					   struct net_device* slave)
+{
+	int i;
+	struct ipoib_neigh *neigh;
+
+	write_lock_bh(&tbl->lock);
+	for (i = 0; i <= tbl->hash_mask; i++) {
+		struct neighbour *n, **np;
+
+		np = &tbl->hash_buckets[i];
+		while ((n = *np) != NULL) {
+			write_lock(&n->lock);
+			if (n->dev == master) {
+				neigh = *to_ipoib_neigh(n);
+				if (neigh && (neigh->dev == slave)){
+					if (ipoib_at_exit)
+						n->parms->neigh_destructor = NULL;
+					ipoib_neigh_destructor(n);
+				}
+			}
+			write_unlock(&n->lock);
+			np = &n->next;
+		}
+	}
+	write_unlock_bh(&tbl->lock);
+}
+
+static void ipoib_neigh_cleanup_by_master(struct net_device* master,struct net_device* slave){
+	netif_stop_queue(slave);
+	if (master) {
+		ipoib_neigh_tbl_cleanup_master(&arp_tbl,master, slave);
+		ipoib_neigh_tbl_cleanup_master(&nd_tbl,master, slave);
+	}
+}
+
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
+				      struct net_device *dev)
 {
 	struct ipoib_neigh *neigh;
 
@@ -803,6 +849,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(st
 		return NULL;
 
 	neigh->neighbour = neighbour;
+	neigh->dev = dev;
 	*to_ipoib_neigh(neighbour) = neigh;
 	skb_queue_head_init(&neigh->queue);
 
@@ -874,6 +921,7 @@ void ipoib_dev_cleanup(struct net_device
 
 	/* Delete any child interfaces first */
 	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		ipoib_neigh_cleanup_by_master(cpriv->dev->master, cpriv->dev);
 		unregister_netdev(cpriv->dev);
 		ipoib_dev_cleanup(cpriv->dev);
 		free_netdev(cpriv->dev);
@@ -1159,6 +1207,7 @@ static void ipoib_remove_one(struct ib_d
 		ib_unregister_event_handler(&priv->event_handler);
 		flush_scheduled_work();
 
+		ipoib_neigh_cleanup_by_master(priv->dev->master, priv->dev);
 		unregister_netdev(priv->dev);
 		ipoib_dev_cleanup(priv->dev);
 		free_netdev(priv->dev);
@@ -1217,6 +1266,8 @@ err_fs:
 
 static void __exit ipoib_cleanup_module(void)
 {
+	ipoib_at_exit = 1;
+
 	ib_unregister_client(&ipoib_client);
 	ib_sa_unregister_client(&ipoib_sa_client);
 	ipoib_unregister_debugfs();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b04b72c..a41a949 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -774,7 +774,7 @@ out:
 		if (skb->dst            &&
 		    skb->dst->neighbour &&
 		    !*to_ipoib_neigh(skb->dst->neighbour)) {
-			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
 
 			if (neigh) {
 				kref_get(&mcast->ah->ref);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6a9f616..557be98 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -153,6 +153,7 @@ struct neigh_table nd_tbl = {
 	.gc_thresh2 =	 512,
 	.gc_thresh3 =	1024,
 };
+EXPORT_SYMBOL(nd_tbl);
 
 /* ND options */
 struct ndisc_options {





More information about the general mailing list