[openib-general] [RFC] [PATCH v2] IB/ipoib: Add bonding support to IPoIB

Michael S. Tsirkin mst at mellanox.co.il
Mon Feb 26 22:02:45 PST 2007


> When using the bonding driver, neighbours are created by the net stack on behalf
> of the bonding (master) device. On the tx flow the bonding code gets an skb such
> that skb->dev points to the master device, it changes this skb to point on the
> slave device and calls the slave hard_start_xmit function.
> 
> 
> Combing these two flows, there is a hole if some code at ipoib
> (ipoib_neigh_destructor) assumes that for each struct neighbour it gets, n->dev
> is an ipoib device so for example netdev_priv(n->dev) would be of type struct
> ipoib_dev_priv.
> 
> To fix it, this patch adds a dev field to struct ipoib_neigh which is used
> instead of the struct neighbour dev one.

It seems that in this design, if multiple ipoib interfaces are present, we might
get an skb such that skb->dev will be different from the new dev field in struct
ipoib_neigh.

It seems that the result will be that the packet will be sent on a wrong interface.
Right?

> In addition, if an IPoIB device is removed before bonding is unloaded it may 
> cause bond0 neighbours (neighbours that point to bond0) to exist after the IPoIB
> device no longer exist. This is why a neighbour cleanup is required during device 
> cleanup. This cleanup scans the arp cache and the ndisc cache to find there 
> neighbours of bond0 which refer also to the relevant ibX. Also, when ib_ipoib module is
> unloaded, the neighbour destructor must be set to NULL because the neighbour function is in
> ib_ipoib.
> For this neigh table cleanup, it is required to export the symbol nd_tbl just like the symbol arp_tbl is.

I wonder about this: is it really true that any allocated neighbour is always in
either arp_tbl or nd_tbl? For example, could some code have called neigh_hold
and retained a neighbour that is not in either one of these tables?

> During my tests I found that when running 
> 
> 	1. modprobe -r ib_mthca (to delete IPoIB interfaces)
> 	2. ping somewhere on the subnet of bond0
> 
> I get this stack dump (which ends with kernel death)
> 	 [<ffffffff8037ff32>] skb_under_panic+0x5c/0x60
> 	 [<ffffffff882e00c2>] :ib_ipoib:ipoib_hard_header+0xa6/0xc0
> 	 [<ffffffff803c3c98>] arp_create+0x120/0x226
> 	 [<ffffffff803c3dc3>] arp_send+0x25/0x3b
> 	 [<ffffffff803c466a>] arp_solicit+0x186/0x195
> 	 [<ffffffff8038c0ac>] neigh_timer_handler+0x2b5/0x309
> 	 [<ffffffff8038bdf7>] neigh_timer_handler+0x0/0x309
> 	 [<ffffffff80239599>] run_timer_softirq+0x130/0x19e
> 	 [<ffffffff80235fcc>] __do_softirq+0x55/0xc3
> 	 [<ffffffff8020acac>] call_softirq+0x1c/0x28
> 	 [<ffffffff8020c02b>] do_softirq+0x2c/0x7d
> 	 [<ffffffff8021864a>] smp_apic_timer_interrupt+0x57/0x6a
> 	 [<ffffffff80208e19>] mwait_idle+0x0/0x45
> 	 [<ffffffff8020a756>] apic_timer_interrupt+0x66/0x70
> 	 <EOI>  [<ffffffff80208e5b>] mwait_idle+0x42/0x45
> 	 [<ffffffff80208db1>] cpu_idle+0x8b/0xae
> 	 [<ffffffff80217d60>] start_secondary+0x47f/0x48f
> 
> The only way I found to avoid this (for now) is to check skb headroom in
> ipoib_hard_header. I guess that this safety check doesn't harm regular IPoIB 
> operation and it seems to solve my problem. However, I would be happy to hear what
> others think of this last issue.

As I said, this seems to indicate a problem in the bonding code.
But what will happen after you error out in ipoib_hard_header?
Is the packet dropped? What might break as a result?

> I would really appreciate comments.
> 
> thanks
> 
>  -MoniS

------------------------------------------------------------------------------
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 07deee8..31bc6d8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -216,6 +216,7 @@ struct ipoib_neigh {
 	struct sk_buff_head queue;
 
 	struct neighbour   *neighbour;
+	struct net_device *dev;
 
 	struct list_head    list;
 };
@@ -232,7 +233,8 @@ static inline struct ipoib_neigh **to_ip
 				     INFINIBAND_ALEN, sizeof(void *));
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh);
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
+				      struct net_device *dev);
 void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
 
 extern struct workqueue_struct *ipoib_workqueue;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 705eb1d..0e3953e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,8 @@ #include <linux/ip.h>
 #include <linux/in.h>
 
 #include <net/dst.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
 
 #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
 
@@ -70,6 +72,7 @@ module_param_named(debug_level, ipoib_de
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 #endif
 
+static int ipoib_at_exit = 0;
 struct ipoib_path_iter {
 	struct net_device *dev;
 	struct ipoib_path  path;
@@ -490,7 +493,7 @@ static void neigh_add_path(struct sk_buf
 	struct ipoib_path *path;
 	struct ipoib_neigh *neigh;
 
-	neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+	neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
 	if (!neigh) {
 		++priv->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
@@ -735,6 +738,9 @@ static int ipoib_hard_header(struct sk_b
 {
 	struct ipoib_header *header;
 
+	if (skb_headroom(skb) < sizeof *header) {
+		return -1;
+	}
 	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 
 	header->proto = htons(type);
@@ -746,8 +752,11 @@ static int ipoib_hard_header(struct sk_b
 	 * figure out where to send the packet later.
 	 */
 	if ((!skb->dst || !skb->dst->neighbour) && daddr) {
-		struct ipoib_pseudoheader *phdr =
-			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
+		struct ipoib_pseudoheader *phdr = NULL;
+		if (skb_headroom(skb) < sizeof *phdr) {
+			return -1;
+		}
+		phdr = (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
 		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
 	}
 
@@ -769,32 +778,69 @@ static void ipoib_set_mcast_list(struct 
 static void ipoib_neigh_destructor(struct neighbour *n)
 {
 	struct ipoib_neigh *neigh;
-	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+	struct ipoib_dev_priv *priv;
 	unsigned long flags;
 	struct ipoib_ah *ah = NULL;
 
-	ipoib_dbg(priv,
-		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
-		  IPOIB_QPN(n->ha),
-		  IPOIB_GID_RAW_ARG(n->ha + 4));
-
-	spin_lock_irqsave(&priv->lock, flags);
 
 	neigh = *to_ipoib_neigh(n);
 	if (neigh) {
+		priv = netdev_priv(neigh->dev);
+		ipoib_dbg(priv,
+			  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+			  IPOIB_QPN(n->ha),
+			  IPOIB_GID_RAW_ARG(n->ha + 4));
+
+		spin_lock_irqsave(&priv->lock, flags);
 		if (neigh->ah)
 			ah = neigh->ah;
 		list_del(&neigh->list);
 		ipoib_neigh_free(n->dev, neigh);
+		spin_unlock_irqrestore(&priv->lock, flags);
 	}
-
-	spin_unlock_irqrestore(&priv->lock, flags);
-
 	if (ah)
 		ipoib_put_ah(ah);
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
+static void ipoib_neigh_tbl_cleanup_master(struct neigh_table *tbl,
+					   struct net_device* master,
+					   struct net_device* slave)
+{
+	int i;
+	struct ipoib_neigh *neigh;
+
+	write_lock_bh(&tbl->lock);
+	for (i = 0; i <= tbl->hash_mask; i++) {
+		struct neighbour *n, **np;
+
+		np = &tbl->hash_buckets[i];
+		while ((n = *np) != NULL) {
+			write_lock(&n->lock);
+			if (n->dev == master) {
+				neigh = *to_ipoib_neigh(n);
+				if (neigh && (neigh->dev == slave)){
+					if (ipoib_at_exit)
+						n->parms->neigh_destructor = NULL;
+					ipoib_neigh_destructor(n);
+				}
+			}
+			write_unlock(&n->lock);
+			np = &n->next;
+		}
+	}
+	write_unlock_bh(&tbl->lock);
+}
+
+static void ipoib_neigh_cleanup_by_master(struct net_device* master,struct net_device* slave){
+	netif_stop_queue(slave);
+	if (master) {
+		ipoib_neigh_tbl_cleanup_master(&arp_tbl,master, slave);
+		ipoib_neigh_tbl_cleanup_master(&nd_tbl,master, slave);
+	}
+}
+
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
+				      struct net_device *dev)
 {
 	struct ipoib_neigh *neigh;
 
@@ -803,6 +849,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(st
 		return NULL;
 
 	neigh->neighbour = neighbour;
+	neigh->dev = dev;
 	*to_ipoib_neigh(neighbour) = neigh;
 	skb_queue_head_init(&neigh->queue);
 
@@ -874,6 +921,7 @@ void ipoib_dev_cleanup(struct net_device
 
 	/* Delete any child interfaces first */
 	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		ipoib_neigh_cleanup_by_master(cpriv->dev->master, cpriv->dev);
 		unregister_netdev(cpriv->dev);
 		ipoib_dev_cleanup(cpriv->dev);
 		free_netdev(cpriv->dev);
@@ -1159,6 +1207,7 @@ static void ipoib_remove_one(struct ib_d
 		ib_unregister_event_handler(&priv->event_handler);
 		flush_scheduled_work();
 
+		ipoib_neigh_cleanup_by_master(priv->dev->master, priv->dev);
 		unregister_netdev(priv->dev);
 		ipoib_dev_cleanup(priv->dev);
 		free_netdev(priv->dev);
@@ -1217,6 +1266,8 @@ err_fs:
 
 static void __exit ipoib_cleanup_module(void)
 {
+	ipoib_at_exit = 1;
+
 	ib_unregister_client(&ipoib_client);
 	ib_sa_unregister_client(&ipoib_sa_client);
 	ipoib_unregister_debugfs();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b04b72c..a41a949 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -774,7 +774,7 @@ out:
 		if (skb->dst            &&
 		    skb->dst->neighbour &&
 		    !*to_ipoib_neigh(skb->dst->neighbour)) {
-			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
 
 			if (neigh) {
 				kref_get(&mcast->ah->ref);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6a9f616..557be98 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -153,6 +153,7 @@ struct neigh_table nd_tbl = {
 	.gc_thresh2 =	 512,
 	.gc_thresh3 =	1024,
 };
+EXPORT_SYMBOL(nd_tbl);
 
 /* ND options */
 struct ndisc_options {


-- 
MST




More information about the general mailing list