[openib-general] [PATCH] IPoIB neighbour fixes

Roland Dreier roland at topspin.com
Fri Dec 10 18:23:36 PST 2004


I just committed this patch, which fixes both the "path mismatch for
unicast ARP" and "neighbour destructor after rmmod" issues.  I've
tested rmmod'ing ipoib with traffic running, and with this patch it
survives fine.
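
The basic change is to split the old ipoib_path into two structures:
an ipoib_path, which caches one path record per destination GID in a
per-device rb-tree (priv->path_tree), and an ipoib_neigh, which hangs
off each struct neighbour and just holds a reference to the path's
address handle.  Unicast ARPs and ordinary unicast traffic now resolve
through the same cached path, so they can no longer end up on
different paths; and ipoib_flush_paths() detaches every neighbour and
clears its destructor hook when the interface goes down, so nothing is
left for the neighbour core to call into after rmmod.

Roughly, the relationships look like this (just a standalone sketch
with the kernel types stubbed out, not code from the patch -- the
real definitions are in the ipoib.h hunk below):

	struct list_head { struct list_head *next, *prev; };
	struct rb_node   { struct rb_node *left, *right; };
	union ib_gid     { unsigned char raw[16]; };

	struct ipoib_path {                  /* one per destination GID   */
		union ib_gid     dgid;       /* key in priv->path_tree    */
		void            *ah;         /* shared, refcounted AH     */
		struct list_head neigh_list; /* neighbours on this path   */
		struct rb_node   rb_node;
	};

	struct ipoib_neigh {                 /* one per struct neighbour  */
		void            *ah;         /* kref_get() of path->ah    */
		struct list_head list;       /* entry in path->neigh_list */
	};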

That's everything I wanted to get done, so I'm planning on submitting
patches to lkml again on Monday.

 - R.

Index: infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- infiniband/ulp/ipoib/ipoib_main.c	(revision 1320)
+++ infiniband/ulp/ipoib/ipoib_main.c	(working copy)
@@ -149,22 +149,116 @@
 	return 0;
 }
 
+static struct ipoib_path *__path_find(struct net_device *dev,
+				      union ib_gid *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->path_tree.rb_node;
+	struct ipoib_path *path;
+	int ret;
+
+	while (n) {
+		path = rb_entry(n, struct ipoib_path, rb_node);
+
+		ret = memcmp(path->pathrec.dgid.raw, gid->raw,
+			     sizeof (union ib_gid));
+
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return path;
+	}
+
+	return NULL;
+}
+
+static int __path_add(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->path_tree.rb_node;
+	struct rb_node *pn = NULL;
+	struct ipoib_path *tpath;
+	int ret;
+
+	while (*n) {
+		pn = *n;
+		tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&path->rb_node, pn, n);
+	rb_insert_color(&path->rb_node, &priv->path_tree);
+
+	list_add_tail(&path->list, &priv->path_list);
+
+	return 0;
+}
+
+static void __path_free(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh, *tn;
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&path->queue)))
+		dev_kfree_skb_irq(skb);
+
+	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+		if (neigh->ah)
+			ipoib_put_ah(neigh->ah);
+		*to_ipoib_neigh(neigh->neighbour) = NULL;
+		neigh->neighbour->ops->destructor = NULL;
+		kfree(neigh);
+	}
+
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+
+	rb_erase(&path->rb_node, &priv->path_tree);
+	list_del(&path->list);
+	kfree(path);
+}
+
+void ipoib_flush_paths(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list)
+		__path_free(dev, path);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
 static void path_rec_completion(int status,
 				struct ib_sa_path_rec *pathrec,
 				void *path_ptr)
 {
 	struct ipoib_path *path = path_ptr;
 	struct ipoib_dev_priv *priv = netdev_priv(path->dev);
+	struct ipoib_ah *ah = NULL;
+	struct ipoib_neigh *neigh;
+	struct sk_buff_head skqueue;
 	struct sk_buff *skb;
-	struct ipoib_ah *ah;
+	unsigned long flags;
 
 	ipoib_dbg(priv, "status %d, LID 0x%04x for GID " IPOIB_GID_FMT "\n",
 		  status, be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
 
-	if (status != IB_WC_SUCCESS)
-		goto err;
-
-	{
+	if (status == IB_WC_SUCCESS) {
 		struct ib_ah_attr av = {
 			.dlid 	       = be16_to_cpu(pathrec->dlid),
 			.sl 	       = pathrec->sl,
@@ -177,215 +271,216 @@
 		ah = ipoib_create_ah(path->dev, priv->pd, &av);
 	}
 
-	if (!ah)
-		goto err;
+	skb_queue_head_init(&skqueue);
+
+	spin_lock_irqsave(&priv->lock, flags);
 
 	path->ah = ah;
 
-	ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
-		  ah, pathrec->dlid, pathrec->sl);
+	if (ah) {
+		path->pathrec = *pathrec;
 
-	while ((skb = __skb_dequeue(&path->queue))) {
+		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+		while ((skb = __skb_dequeue(&path->queue)))
+			__skb_queue_tail(&skqueue, skb);
+
+		list_for_each_entry(neigh, &path->neigh_list, list) {
+			neigh->ah = path->ah;
+			kref_get(&path->ah->ref);
+
+			while ((skb = __skb_dequeue(&neigh->queue)))
+				__skb_queue_tail(&skqueue, skb);
+		}
+	} else
+		path->query = NULL;
+
+	complete(&path->done);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	while ((skb = __skb_dequeue(&skqueue))) {
 		skb->dev = path->dev;
 		if (dev_queue_xmit(skb))
 			ipoib_warn(priv, "dev_queue_xmit failed "
 				   "to requeue packet\n");
 	}
-
-	return;
-
-err:
-	while ((skb = __skb_dequeue(&path->queue)))
-		dev_kfree_skb(skb);
-	
-	if (path->neighbour)
-		*to_ipoib_path(path->neighbour) = NULL;
-
-	kfree(path);
 }
 
-static void path_rec_start(struct sk_buff *skb, struct net_device *dev)
+static struct ipoib_path *path_rec_create(struct net_device *dev,
+					  union ib_gid *gid)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_path *path = kmalloc(sizeof *path, GFP_ATOMIC);
-	struct ib_sa_path_rec rec = {
-		.numb_path = 1
-	};
-	struct ib_sa_query *query;
+	struct ipoib_path *path;
 
+	path = kmalloc(sizeof *path, GFP_ATOMIC);
 	if (!path)
-		goto err;
+		return NULL;
 
-	path->ah  = NULL;
 	path->dev = dev;
+	path->pathrec.dlid = 0;
+
 	skb_queue_head_init(&path->queue);
-	__skb_queue_tail(&path->queue, skb);
-	path->neighbour = NULL;
 
-	rec.sgid = priv->local_gid;
-	memcpy(rec.dgid.raw, skb->dst->neighbour->ha + 4, 16);
-	rec.pkey = cpu_to_be16(priv->pkey);
+	INIT_LIST_HEAD(&path->neigh_list);
+	path->query = NULL;
+	init_completion(&path->done);
 
-	/*
-	 * XXX there's a race here if path record completion runs
-	 * before we get to finish up.  Add a lock to path struct?
-	 */
-	if (ib_sa_path_rec_get(priv->ca, priv->port, &rec,
-			       IB_SA_PATH_REC_DGID	|
-			       IB_SA_PATH_REC_SGID	|
-			       IB_SA_PATH_REC_NUMB_PATH	|
-			       IB_SA_PATH_REC_PKEY,
-			       1000, GFP_ATOMIC,
-			       path_rec_completion,
-			       path, &query) < 0) {
-		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
-		goto err;
-	}
+	memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid));
+	path->pathrec.sgid      = priv->local_gid;
+	path->pathrec.pkey      = cpu_to_be16(priv->pkey);
+	path->pathrec.numb_path = 1;
 
-	path->neighbour = skb->dst->neighbour;
-	*to_ipoib_path(skb->dst->neighbour) = path;
-	return;
+	__path_add(dev, path);
 
-err:
-	kfree(path);
-	++priv->stats.tx_dropped;
-	dev_kfree_skb_any(skb);
+	return path;
 }
 
-static void path_lookup(struct sk_buff *skb, struct net_device *dev)
+static int path_rec_start(struct net_device *dev,
+			  struct ipoib_path *path)
 {
-	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	/* Look up path record for unicasts */
-	if (skb->dst->neighbour->ha[4] != 0xff) {
-		path_rec_start(skb, dev);
-		return;
+	path->query_id =
+		ib_sa_path_rec_get(priv->ca, priv->port,
+				   &path->pathrec,
+				   IB_SA_PATH_REC_DGID		|
+				   IB_SA_PATH_REC_SGID		|
+				   IB_SA_PATH_REC_NUMB_PATH	|
+				   IB_SA_PATH_REC_PKEY,
+				   1000, GFP_ATOMIC,
+				   path_rec_completion,
+				   path, &path->query);
+	if (path->query_id < 0) {
+		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
+		path->query = NULL;
+		return path->query_id;
 	}
 
-	/* Add in the P_Key */
-	skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
-	skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
-	ipoib_mcast_send(dev,
-			 (union ib_gid *) (skb->dst->neighbour->ha + 4),
-			 skb);
+	return 0;
 }
 
-static void unicast_arp_completion(int status,
-				   struct ib_sa_path_rec *pathrec,
-				   void *skb_ptr)
+static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 {
-	struct sk_buff *skb = skb_ptr;
-	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
-	struct ipoib_ah *ah;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+	struct ipoib_neigh *neigh;
 
-	ipoib_dbg(priv, "status %d, LID 0x%04x for GID " IPOIB_GID_FMT "\n",
-		  status, be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
+	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+	if (!neigh) {
+		++priv->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+		return;
+	}
 
-	if (status)
-		goto err;
+	skb_queue_head_init(&neigh->queue);
+	neigh->neighbour = skb->dst->neighbour;
+	*to_ipoib_neigh(skb->dst->neighbour) = neigh;
 
-	{
-		struct ib_ah_attr av = {
-			.dlid 	       = be16_to_cpu(pathrec->dlid),
-			.sl 	       = pathrec->sl,
-			.src_path_bits = 0,
-			.static_rate   = 0,
-			.ah_flags      = 0,
-			.port_num      = priv->port
-		};
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
 
-		ah = ipoib_create_ah(skb->dev, priv->pd, &av);
+	path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4));
+	if (!path) {
+		path = path_rec_create(dev,
+				       (union ib_gid *) (skb->dst->neighbour->ha + 4));
+		if (!path)
+			goto err;
 	}
 
-	if (!ah)
-		goto err;
+	list_add_tail(&neigh->list, &path->neigh_list);
 
-	*(struct ipoib_ah **) skb->cb = ah;
+	if (path->pathrec.dlid) {
+		neigh->ah = path->ah;
+		kref_get(&path->ah->ref);
 
-	if (dev_queue_xmit(skb))
-		ipoib_warn(priv, "dev_queue_xmit failed "
-			   "to requeue ARP packet\n");
+		ipoib_send(dev, skb, path->ah,
+			   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+	} else {
+		neigh->ah  = NULL;
+		__skb_queue_tail(&neigh->queue, skb);
+		if (!path->query && path_rec_start(dev, path))
+			goto err;
+	}
 
+	spin_unlock(&priv->lock);
 	return;
 
 err:
-	dev_kfree_skb(skb);
+	*to_ipoib_neigh(skb->dst->neighbour) = NULL;
+	list_del(&neigh->list);
+	neigh->neighbour->ops->destructor = NULL;
+	kfree(neigh);
+
+	++priv->stats.tx_dropped;
+	dev_kfree_skb_any(skb);
+
+	spin_unlock(&priv->lock);
 }
 
-static void unicast_arp_finish(struct sk_buff *skb)
+static void path_lookup(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
-	struct ipoib_ah *ah = *(struct ipoib_ah **) skb->cb;
-	unsigned long flags;
 
-	if (ah) {
-		spin_lock_irqsave(&priv->lock, flags);
-		list_add_tail(&ah->list, &priv->dead_ahs);
-		spin_unlock_irqrestore(&priv->lock, flags);
+	/* Look up path record for unicasts */
+	if (skb->dst->neighbour->ha[4] != 0xff) {
+		neigh_add_path(skb, dev);
+		return;
 	}
+
+	/* Add in the P_Key for multicasts */
+	skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
+	skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
+	ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb);
 }
 
-/*
- * For unicast packets with no skb->dst->neighbour (unicast ARPs are
- * the main example), we fire off a path record query for each packet.
- * This is pretty bad for scalability (since this is going to hammer
- * the SM on a big fabric) but it's the best I can think of for now.
- *
- * Also we might have a problem if a path changes, because ARPs will
- * still go through (since we'll get the new path from the SM for
- * these queries) so we'll never update the neighbour.
- */
-static void unicast_arp_start(struct sk_buff *skb, struct net_device *dev,
-			      struct ipoib_pseudoheader *phdr)
+static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+			     struct ipoib_pseudoheader *phdr)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct sk_buff *tmp_skb;
-	struct ib_sa_path_rec rec = {
-		.numb_path = 1
-	};
-	struct ib_sa_query *query;
+	struct ipoib_path *path;
 
-	if (skb->destructor) {
-		tmp_skb = skb;
-		skb = skb_clone(tmp_skb, GFP_ATOMIC);
-		dev_kfree_skb_any(tmp_skb);
-		if (!skb) {
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
+
+	path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4));
+	if (!path) {
+		path = path_rec_create(dev,
+				       (union ib_gid *) (phdr->hwaddr + 4));
+		if (path) {
+			skb_push(skb, sizeof *phdr); /* for requeue */
+			__skb_queue_tail(&path->queue, skb);
+			if (path_rec_start(dev, path))
+				__path_free(dev, path);
+		} else {
 			++priv->stats.tx_dropped;
-			return;
+			dev_kfree_skb_any(skb);
 		}
+
+		spin_unlock(&priv->lock);
+		return;
 	}
 
-	skb->dev        = dev;
-	skb->destructor = unicast_arp_finish;
-	memset(skb->cb, 0, sizeof skb->cb);
+	ipoib_dbg(priv, "Send unicast ARP to %04x\n",
+		  be16_to_cpu(path->pathrec.dlid));
 
-	rec.sgid = priv->local_gid;
-	memcpy(rec.dgid.raw, phdr->hwaddr + 4, 16);
-	rec.pkey = cpu_to_be16(priv->pkey);
+	ipoib_send(dev, skb, path->ah,
+		   be32_to_cpup((__be32 *) phdr->hwaddr));
 
-	/*
-	 * XXX We need to keep a record of the skb and TID somewhere
-	 * so that we can cancel the request if the device goes down
-	 * before it finishes.
-	 */
-	if (ib_sa_path_rec_get(priv->ca, priv->port, &rec,
-			       IB_SA_PATH_REC_DGID	|
-			       IB_SA_PATH_REC_SGID	|
-			       IB_SA_PATH_REC_NUMB_PATH	|
-			       IB_SA_PATH_REC_PKEY,
-			       1000, GFP_ATOMIC,
-			       unicast_arp_completion,
-			       skb, &query) < 0) {
-		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
-		++priv->stats.tx_dropped;
-		dev_kfree_skb_any(skb);
-	}
+	spin_unlock(&priv->lock);
 }
 
 static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_path *path;
+	struct ipoib_neigh *neigh;
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -395,21 +490,21 @@
 	} 
 
 	if (skb->dst && skb->dst->neighbour) {
-		if (unlikely(!*to_ipoib_path(skb->dst->neighbour))) {
+		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
 			path_lookup(skb, dev);
 			goto out;
 		}
 
-		path = *to_ipoib_path(skb->dst->neighbour);
+		neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-		if (likely(path->ah)) {
-			ipoib_send(dev, skb, path->ah,
+		if (likely(neigh->ah)) {
+			ipoib_send(dev, skb, neigh->ah,
 				   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
 			goto out;
 		}
 
-		if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE)
-			__skb_queue_tail(&path->queue, skb);
+		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+			__skb_queue_tail(&neigh->queue, skb);
 		else
 			goto err;
 	} else {
@@ -418,25 +513,14 @@
 		skb_pull(skb, sizeof *phdr);
 
 		if (phdr->hwaddr[4] == 0xff) {
-			/* Add in the P_Key */
+			/* Add in the P_Key for multicasts */
 			phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
 			phdr->hwaddr[9] = priv->pkey & 0xff;
 
 			ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb);
 		} else {
-			/* unicast GID -- ARP reply?? */
+			/* unicast GID -- should be ARP reply */
 
-			/*
-			 * If destructor is unicast_arp_finish, we've
-			 * already been through the path lookup and
-			 * now we can just send the packet.
-			 */
-			if (skb->destructor == unicast_arp_finish) {
-				ipoib_send(dev, skb, *(struct ipoib_ah **) skb->cb,
-					   be32_to_cpup((u32 *) phdr->hwaddr));
-				goto out;
-			}
-
 			if (be16_to_cpup((u16 *) skb->data) != ETH_P_ARP) {
 				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
 					   IPOIB_GID_FMT "\n",
@@ -449,9 +533,8 @@
 				goto out;
 			}
 
-			/* put the pseudoheader back on */			  
-			skb_push(skb, sizeof *phdr);
-			unicast_arp_start(skb, dev, phdr);
+			/* look up the path and send the ARP */
+			unicast_arp_send(skb, dev, phdr);
 		}
 	}
 
@@ -516,19 +599,28 @@
 	schedule_work(&priv->restart_task);
 }
 
-static void ipoib_neigh_destructor(struct neighbour *neigh)
+static void ipoib_neigh_destructor(struct neighbour *n)
 {
-	struct ipoib_path     *path = *to_ipoib_path(neigh);
+	struct ipoib_neigh *neigh = *to_ipoib_neigh(n);
+	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+	unsigned long flags;
 
-	ipoib_dbg(netdev_priv(neigh->dev),
+	ipoib_dbg(priv,
 		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
-		  be32_to_cpup((__be32 *) neigh->ha),
-		  IPOIB_GID_ARG(*((union ib_gid *) (neigh->ha + 4))));
+		  be32_to_cpup((__be32 *) n->ha),
+		  IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4))));
 
-	if (path && path->ah) {
-		ipoib_put_ah(path->ah);
-		kfree(path);
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (neigh) {
+		if (neigh->ah)
+			ipoib_put_ah(neigh->ah);
+		list_del(&neigh->list);
+		*to_ipoib_neigh(n) = NULL;
+		kfree(neigh);
 	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
 static int ipoib_neigh_setup(struct neighbour *neigh)
@@ -669,6 +761,7 @@
 	init_MUTEX(&priv->mcast_mutex);
 	init_MUTEX(&priv->vlan_mutex);
 
+	INIT_LIST_HEAD(&priv->path_list);
 	INIT_LIST_HEAD(&priv->child_intfs);
 	INIT_LIST_HEAD(&priv->dead_ahs);
 	INIT_LIST_HEAD(&priv->multicast_list);
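
(A note on the teardown order used above: everywhere an ipoib_neigh is
freed out from under a live struct neighbour -- __path_free(), the
neigh_add_path() error path, and ipoib_mcast_free() below -- the
lookup pointer is cleared and ops->destructor is reset to NULL before
the kfree(), so the neighbour core can never call back into stale or
unloaded module state.  A standalone sketch of the order, with the
kernel types stubbed out:

	#include <stdlib.h>

	struct neigh_ops { void (*destructor)(void *); };
	struct neighbour { struct neigh_ops *ops; void *ipoib_state; };

	static void detach_and_free(struct neighbour *n)
	{
		void *state = n->ipoib_state;

		n->ipoib_state = NULL;      /* lookups now find nothing   */
		n->ops->destructor = NULL;  /* core won't call into ipoib */
		free(state);                /* only now safe to free      */
	}

All three sites in the patch follow that order.)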
Index: infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- infiniband/ulp/ipoib/ipoib_multicast.c	(revision 1320)
+++ infiniband/ulp/ipoib/ipoib_multicast.c	(working copy)
@@ -60,6 +60,8 @@
 	unsigned long flags;
 	unsigned char logcount;
 
+	struct list_head  neigh_list;
+
 	struct sk_buff_head pkt_queue;
 
 	struct net_device *dev;
@@ -77,11 +79,25 @@
 static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 {
 	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh, *tmp;
+	unsigned long flags;
 
 	ipoib_dbg_mcast(netdev_priv(dev),
 			"deleting multicast group " IPOIB_GID_FMT "\n",
 			IPOIB_GID_ARG(mcast->mcmember.mgid));
 
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
+		ipoib_put_ah(neigh->ah);
+		*to_ipoib_neigh(neigh->neighbour) = NULL;
+		neigh->neighbour->ops->destructor = NULL;
+		kfree(neigh);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
 	if (mcast->ah)
 		ipoib_put_ah(mcast->ah);
 
@@ -114,6 +130,7 @@
 	mcast->logcount = 0;
 
 	INIT_LIST_HEAD(&mcast->list);
+	INIT_LIST_HEAD(&mcast->neigh_list);
 	skb_queue_head_init(&mcast->pkt_queue);
 
 	mcast->ah    = NULL;
@@ -671,24 +688,25 @@
 	}
 
 out:
-	spin_unlock_irqrestore(&priv->lock, flags);
 	if (mcast && mcast->ah) {
 		if (skb->dst            &&
 		    skb->dst->neighbour &&
-		    !*to_ipoib_path(skb->dst->neighbour)) {
-			struct ipoib_path *path = kmalloc(sizeof *path, GFP_ATOMIC);
+		    !*to_ipoib_neigh(skb->dst->neighbour)) {
+			struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
 
-			if (path) {
+			if (neigh) {
 				kref_get(&mcast->ah->ref);
-				path->ah  	= mcast->ah;
-				path->dev 	= dev;
-				path->neighbour = skb->dst->neighbour;
-				*to_ipoib_path(skb->dst->neighbour) = path;
+				neigh->ah  	= mcast->ah;
+				neigh->neighbour = skb->dst->neighbour;
+				*to_ipoib_neigh(skb->dst->neighbour) = neigh;
+				list_add_tail(&neigh->list, &mcast->neigh_list);
 			}
 		}
 
 		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
 	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
 void ipoib_mcast_dev_flush(struct net_device *dev)
Index: infiniband/ulp/ipoib/ipoib.h
===================================================================
--- infiniband/ulp/ipoib/ipoib.h	(revision 1320)
+++ infiniband/ulp/ipoib/ipoib.h	(working copy)
@@ -93,6 +93,12 @@
 	DECLARE_PCI_UNMAP_ADDR(mapping)
 };
 
+/*
+ * Device private locking: tx_lock protects members used in TX fast
+ * path (and we use LLTX so upper layers don't do extra locking).
+ * lock protects everything else.  lock nests inside tx_lock (i.e.
+ * tx_lock must be acquired first if needed).
+ */
 struct ipoib_dev_priv {
 	spinlock_t lock;
 
@@ -103,6 +109,9 @@
 	struct semaphore mcast_mutex;
 	struct semaphore vlan_mutex;
 
+	struct rb_root  path_tree;
+	struct list_head path_list;
+
 	struct ipoib_mcast *broadcast;
 	struct list_head multicast_list;
 	struct rb_root multicast_tree;
@@ -162,16 +171,34 @@
 };
 
 struct ipoib_path {
+	struct net_device    *dev;
+	struct ib_sa_path_rec pathrec;
+	struct ipoib_ah      *ah;
+	struct sk_buff_head   queue;
+
+	struct list_head      neigh_list;
+
+	int                   query_id;
+	struct ib_sa_query   *query;
+	struct completion     done;
+
+	struct rb_node        rb_node;
+	struct list_head      list;
+};
+
+struct ipoib_neigh {
 	struct ipoib_ah    *ah;
 	struct sk_buff_head queue;
 
-	struct net_device  *dev;
 	struct neighbour   *neighbour;
+
+	struct list_head    list;
 };
 
-static inline struct ipoib_path **to_ipoib_path(struct neighbour *neigh)
+static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
 {
-	return (struct ipoib_path **) (neigh->ha + 24);
+	return (struct ipoib_neigh **) (neigh->ha + 24 -
+					(offsetof(struct neighbour, ha) & 4));
 }
 
 extern struct workqueue_struct *ipoib_workqueue;
@@ -194,6 +221,7 @@
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(void *dev_ptr);
 
+void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
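
(The arithmetic in to_ipoib_neigh() deserves a note: IPoIB's 20-byte
hardware address sits at the start of neighbour->ha, and the
ipoib_neigh pointer is stashed in the spare bytes at offset 24.  On
64-bit kernels that slot must be 8-byte aligned, but offsetof(struct
neighbour, ha) is only guaranteed to be 4-byte aligned, so when ha
starts on an odd multiple of 4 the slot gets backed up by 4 bytes.  A
standalone illustration with a stand-in struct, not kernel code:

	#include <stddef.h>
	#include <stdio.h>

	struct fake_neighbour {
		int pad;              /* pretend ha lands at offset 4  */
		unsigned char ha[32]; /* 20 address bytes + spare room */
	};

	int main(void)
	{
		/* same expression as to_ipoib_neigh() */
		size_t slot = offsetof(struct fake_neighbour, ha) + 24 -
			      (offsetof(struct fake_neighbour, ha) & 4);

		printf("slot at offset %zu, 8-byte aligned: %s\n",
		       slot, slot % 8 ? "no" : "yes");
		return 0;
	}

With ha at offset 4 the slot moves back to offset 24; with ha at an
8-byte-aligned offset the subtraction is zero and the slot stays at
ha + 24.)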
Index: infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- infiniband/ulp/ipoib/ipoib_ib.c	(revision 1320)
+++ infiniband/ulp/ipoib/ipoib_ib.c	(working copy)
@@ -445,6 +445,8 @@
 	/* Delete broadcast and local addresses since they will be recreated */
 	ipoib_mcast_dev_down(dev);
 
+	ipoib_flush_paths(dev);
+
 	return 0;
 }
 
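
(One more pattern worth calling out: path_rec_completion() moves the
backlogged skbs onto a local queue while holding priv->lock and only
calls dev_queue_xmit() after dropping the lock, so the requeued
packets re-enter the stack without priv->lock held -- which matters,
since the xmit path takes priv->lock inside tx_lock, per the lock
nesting comment added to ipoib.h.  A userspace sketch of the same
move-then-drain idea, with made-up names:

	#include <pthread.h>
	#include <stdio.h>

	struct item { struct item *next; int id; };

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct item *pending;    /* protected by lock */

	static void drain(void)
	{
		struct item *local;

		pthread_mutex_lock(&lock);
		local = pending;        /* steal the whole backlog */
		pending = NULL;
		pthread_mutex_unlock(&lock);

		while (local) {         /* process it unlocked */
			printf("sending %d\n", local->id);
			local = local->next;
		}
	}

The kernel code uses a struct sk_buff_head for the private queue, but
the locking shape is the same.)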