[openib-general] [RFC] [PATCH v2] IB/ipoib: Add bonding support to IPoIB
Michael S. Tsirkin
mst at mellanox.co.il
Mon Feb 26 22:02:45 PST 2007
> When using the bonding driver, neighbours are created by the net stack on behalf
> of the bonding (master) device. On the tx flow the bonding code gets an skb such
> that skb->dev points to the master device, it changes this skb to point on the
> slave device and calls the slave hard_start_xmit function.
>
>
> Combing these two flows, there is a hole if some code at ipoib
> (ipoib_neigh_destructor) assumes that for each struct neighbour it gets, n->dev
> is an ipoib device so for example netdev_priv(n->dev) would be of type struct
> ipoib_dev_priv.
>
> To fix it, this patch adds a dev field to struct ipoib_neigh which is used
> instead of the struct neighbour dev one.
It seems that in this design, if multiple ipoib interfaces are present, we might
get an skb such that skb->dev will be different from the new dev field in struct
ipoib_neigh.
It seems that the result will be that the packet will be sent on a wrong interface.
Right?
> In addition, if an IPoIB device is removed before bonding is unloaded it may
> cause bond0 neighbours (neighbours that point to bond0) to exist after the IPoIB
> device no longer exist. This is why a neighbour cleanup is required during device
> cleanup. This cleanup scans the arp cache and the ndisc cache to find there
> neighbours of bond0 which refer also to the relevant ibX. Also, when ib_ipoib module is
> unloaded, the neighbour destructor must be set to NULL because the neighbour function is in
> ib_ipoib.
> For this neigh table cleanup, it is required to export the symbol nd_tbl just like the symbol arp_tbl is.
I wonder about this: is it really true that any allocated neighbour is always in
either arp_tbl or nd_tbl? For example, could some code have called neigh_hold
and retained a neighbour that is not in either one of these tables?
> During my tests I found that when running
>
> 1. modprobe -r ib_mthca (to delete IPoIB interfaces)
> 2. ping somewhere on the subnet of bond0
>
> I get this stack dump (which ends with kernel death)
> [<ffffffff8037ff32>] skb_under_panic+0x5c/0x60
> [<ffffffff882e00c2>] :ib_ipoib:ipoib_hard_header+0xa6/0xc0
> [<ffffffff803c3c98>] arp_create+0x120/0x226
> [<ffffffff803c3dc3>] arp_send+0x25/0x3b
> [<ffffffff803c466a>] arp_solicit+0x186/0x195
> [<ffffffff8038c0ac>] neigh_timer_handler+0x2b5/0x309
> [<ffffffff8038bdf7>] neigh_timer_handler+0x0/0x309
> [<ffffffff80239599>] run_timer_softirq+0x130/0x19e
> [<ffffffff80235fcc>] __do_softirq+0x55/0xc3
> [<ffffffff8020acac>] call_softirq+0x1c/0x28
> [<ffffffff8020c02b>] do_softirq+0x2c/0x7d
> [<ffffffff8021864a>] smp_apic_timer_interrupt+0x57/0x6a
> [<ffffffff80208e19>] mwait_idle+0x0/0x45
> [<ffffffff8020a756>] apic_timer_interrupt+0x66/0x70
> <EOI> [<ffffffff80208e5b>] mwait_idle+0x42/0x45
> [<ffffffff80208db1>] cpu_idle+0x8b/0xae
> [<ffffffff80217d60>] start_secondary+0x47f/0x48f
>
> The only way I found to avoid this (for now) is to check skb headroom in
> ipoib_hard_header. I guess that this safety check doesn't harm regular IPoIB
> operation and it seems to solve my problem. However, I would be happy to hear what
> others think of this last issue.
As I said, this seems to indicate a problem in the bonding code.
But what will happen after you error out in ipoib_hard_header?
Is the packet dropped? What might break as a result?
> I would really appreciate comments.
>
> thanks
>
> -MoniS
------------------------------------------------------------------------------
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 07deee8..31bc6d8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -216,6 +216,7 @@ struct ipoib_neigh {
struct sk_buff_head queue;
struct neighbour *neighbour;
+ struct net_device *dev;
struct list_head list;
};
@@ -232,7 +233,8 @@ static inline struct ipoib_neigh **to_ip
INFINIBAND_ALEN, sizeof(void *));
}
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh);
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
+ struct net_device *dev);
void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
extern struct workqueue_struct *ipoib_workqueue;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 705eb1d..0e3953e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,8 @@ #include <linux/ip.h>
#include <linux/in.h>
#include <net/dst.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
@@ -70,6 +72,7 @@ module_param_named(debug_level, ipoib_de
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif
+static int ipoib_at_exit = 0;
struct ipoib_path_iter {
struct net_device *dev;
struct ipoib_path path;
@@ -490,7 +493,7 @@ static void neigh_add_path(struct sk_buf
struct ipoib_path *path;
struct ipoib_neigh *neigh;
- neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+ neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
if (!neigh) {
++priv->stats.tx_dropped;
dev_kfree_skb_any(skb);
@@ -735,6 +738,9 @@ static int ipoib_hard_header(struct sk_b
{
struct ipoib_header *header;
+ if (skb_headroom(skb) < sizeof *header) {
+ return -1;
+ }
header = (struct ipoib_header *) skb_push(skb, sizeof *header);
header->proto = htons(type);
@@ -746,8 +752,11 @@ static int ipoib_hard_header(struct sk_b
* figure out where to send the packet later.
*/
if ((!skb->dst || !skb->dst->neighbour) && daddr) {
- struct ipoib_pseudoheader *phdr =
- (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
+ struct ipoib_pseudoheader *phdr = NULL;
+ if (skb_headroom(skb) < sizeof *phdr) {
+ return -1;
+ }
+ phdr = (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
}
@@ -769,32 +778,69 @@ static void ipoib_set_mcast_list(struct
static void ipoib_neigh_destructor(struct neighbour *n)
{
struct ipoib_neigh *neigh;
- struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+ struct ipoib_dev_priv *priv;
unsigned long flags;
struct ipoib_ah *ah = NULL;
- ipoib_dbg(priv,
- "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
- IPOIB_QPN(n->ha),
- IPOIB_GID_RAW_ARG(n->ha + 4));
-
- spin_lock_irqsave(&priv->lock, flags);
neigh = *to_ipoib_neigh(n);
if (neigh) {
+ priv = netdev_priv(neigh->dev);
+ ipoib_dbg(priv,
+ "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+ IPOIB_QPN(n->ha),
+ IPOIB_GID_RAW_ARG(n->ha + 4));
+
+ spin_lock_irqsave(&priv->lock, flags);
if (neigh->ah)
ah = neigh->ah;
list_del(&neigh->list);
ipoib_neigh_free(n->dev, neigh);
+ spin_unlock_irqrestore(&priv->lock, flags);
}
-
- spin_unlock_irqrestore(&priv->lock, flags);
-
if (ah)
ipoib_put_ah(ah);
}
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
+static void ipoib_neigh_tbl_cleanup_master(struct neigh_table *tbl,
+ struct net_device* master,
+ struct net_device* slave)
+{
+ int i;
+ struct ipoib_neigh *neigh;
+
+ write_lock_bh(&tbl->lock);
+ for (i = 0; i <= tbl->hash_mask; i++) {
+ struct neighbour *n, **np;
+
+ np = &tbl->hash_buckets[i];
+ while ((n = *np) != NULL) {
+ write_lock(&n->lock);
+ if (n->dev == master) {
+ neigh = *to_ipoib_neigh(n);
+ if (neigh && (neigh->dev == slave)){
+ if (ipoib_at_exit)
+ n->parms->neigh_destructor = NULL;
+ ipoib_neigh_destructor(n);
+ }
+ }
+ write_unlock(&n->lock);
+ np = &n->next;
+ }
+ }
+ write_unlock_bh(&tbl->lock);
+}
+
+static void ipoib_neigh_cleanup_by_master(struct net_device* master,struct net_device* slave){
+ netif_stop_queue(slave);
+ if (master) {
+ ipoib_neigh_tbl_cleanup_master(&arp_tbl,master, slave);
+ ipoib_neigh_tbl_cleanup_master(&nd_tbl,master, slave);
+ }
+}
+
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
+ struct net_device *dev)
{
struct ipoib_neigh *neigh;
@@ -803,6 +849,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(st
return NULL;
neigh->neighbour = neighbour;
+ neigh->dev = dev;
*to_ipoib_neigh(neighbour) = neigh;
skb_queue_head_init(&neigh->queue);
@@ -874,6 +921,7 @@ void ipoib_dev_cleanup(struct net_device
/* Delete any child interfaces first */
list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+ ipoib_neigh_cleanup_by_master(cpriv->dev->master, cpriv->dev);
unregister_netdev(cpriv->dev);
ipoib_dev_cleanup(cpriv->dev);
free_netdev(cpriv->dev);
@@ -1159,6 +1207,7 @@ static void ipoib_remove_one(struct ib_d
ib_unregister_event_handler(&priv->event_handler);
flush_scheduled_work();
+ ipoib_neigh_cleanup_by_master(priv->dev->master, priv->dev);
unregister_netdev(priv->dev);
ipoib_dev_cleanup(priv->dev);
free_netdev(priv->dev);
@@ -1217,6 +1266,8 @@ err_fs:
static void __exit ipoib_cleanup_module(void)
{
+ ipoib_at_exit = 1;
+
ib_unregister_client(&ipoib_client);
ib_sa_unregister_client(&ipoib_sa_client);
ipoib_unregister_debugfs();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index b04b72c..a41a949 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -774,7 +774,7 @@ out:
if (skb->dst &&
skb->dst->neighbour &&
!*to_ipoib_neigh(skb->dst->neighbour)) {
- struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour);
+ struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
if (neigh) {
kref_get(&mcast->ah->ref);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6a9f616..557be98 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -153,6 +153,7 @@ struct neigh_table nd_tbl = {
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
+EXPORT_SYMBOL(nd_tbl);
/* ND options */
struct ndisc_options {
--
MST
More information about the general
mailing list