[ofa-general] [PATCH RESEND] ipoib: fix racing uses of ipoib_neigh in CM with RCU

akepner at sgi.com akepner at sgi.com
Mon Jul 6 10:26:39 PDT 2009


No comments since this was originally posted. 

Too beautiful for words? 

It does seem an awfully complex solution to a rarely encountered 
bug, but I don't see a better solution. If you do, please let me 
know.

-----------------------------------------------------------------

There is a race between the neighbour cleanup code, and some uses 
of the closely related ipoib_neigh structure when using IPoIB-CM. 

For example, an IPoIB-CM connection may time out, invoking 
ipoib_neigh_free(), which cleans up and frees the struct ipoib_neigh, 
while at the same time a neighbour entry times out, invoking 
ipoib_neigh_cleanup(), cleaning up and freeing the ipoib_neigh 
structure a second time.

The root cause is that the struct ipoib_neigh pointer that's stashed 
away in struct neighbour is read (and subsequently used) in 
ipoib_neigh_cleanup() without using locking that's consistent with 
other reads/writes of this pointer. (The pointer must be read before 
it can be known which lock to use, so it's difficult to avoid.)

To fix this, use RCU methods to read/write the struct ipoib_neigh 
pointer that's stashed away in struct neighbour, cleanup (most of) 
the struct ipoib_neigh's internal state, and free the structure in 
an RCU callback.  Updates of the struct ipoib_neigh pointer are 
protected by struct ipoib_dev_priv's lock. The pointer can be read, 
and most of the internal state of the ipoib_neigh can be used, without 
locking, inside an RCU read-side critical section.
 
The internal state of the struct ipoib_neigh which requires mutual 
exclusion (the list_head element) is protected by a new spinlock. 

Signed-off-by: Arthur Kepner <akepner at sgi.com>
---

 ipoib.h           |   38 ++++++++++++++--
 ipoib_cm.c        |   23 +--------
 ipoib_main.c      |  127 +++++++++++++++++++++++++++++-------------------------
 ipoib_multicast.c |   56 ++++++++++++-----------
 4 files changed, 138 insertions(+), 106 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 753a983..c84ab84 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -105,6 +105,7 @@ enum {
 
 	MAX_SEND_CQE		  = 16,
 	IPOIB_CM_COPYBREAK	  = 256,
+	IPOIB_FLAG_CM_CLEAR	  = 1, 
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -382,6 +383,10 @@ struct ipoib_path {
 };
 
 struct ipoib_neigh {
+	spinlock_t lock;
+	int dead;
+	int rcu_flags;	/* flags used during rcu callback */
+	struct rcu_head rcu;
 	struct ipoib_ah    *ah;
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	struct ipoib_cm_tx *cm;
@@ -409,15 +414,42 @@ static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
  * sure that this pointer is stored aligned so that an unaligned
  * load is not needed to dereference it.
  */
-static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
+static inline struct ipoib_neigh* ipoib_neigh_retrieve(struct neighbour *n)
+{
+	struct ipoib_neigh **np;
+
+	np = (void*) n + ALIGN(offsetof(struct neighbour, ha) +
+			       INFINIBAND_ALEN, sizeof(void *));
+
+	return rcu_dereference(*np);
+}
+
+static inline void ipoib_neigh_assign(struct neighbour *n,
+                                      struct ipoib_neigh *in)
+{
+	struct ipoib_neigh **np;
+
+	np = (void*) n + ALIGN(offsetof(struct neighbour, ha) +
+			       INFINIBAND_ALEN, sizeof(void *));
+
+	rcu_assign_pointer(*np, in);
+}
+
+static inline int ipoib_cm_get_rcu_flag(struct ipoib_neigh *neigh, int flag)
+{
+	return !!(neigh->rcu_flags & flag);
+}
+
+static inline void ipoib_cm_set_rcu_flag(struct ipoib_neigh *neigh, int flag)
 {
-	return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) +
-				     INFINIBAND_ALEN, sizeof(void *));
+	neigh->rcu_flags |= flag;
 }
 
 struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
 				      struct net_device *dev);
 void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
+void ipoib_neigh_tidy(struct net_device *dev, struct ipoib_neigh *neigh, 
+		      int cm);
 
 extern struct workqueue_struct *ipoib_workqueue;
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 181b1f3..f3eead1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -806,12 +806,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 		neigh = tx->neigh;
 
 		if (neigh) {
-			neigh->cm = NULL;
-			list_del(&neigh->list);
-			if (neigh->ah)
-				ipoib_put_ah(neigh->ah);
-			ipoib_neigh_free(dev, neigh);
-
+			ipoib_neigh_tidy(dev, neigh, 1);
 			tx->neigh = NULL;
 		}
 
@@ -1226,12 +1221,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 		neigh = tx->neigh;
 
 		if (neigh) {
-			neigh->cm = NULL;
-			list_del(&neigh->list);
-			if (neigh->ah)
-				ipoib_put_ah(neigh->ah);
-			ipoib_neigh_free(dev, neigh);
-
+			ipoib_neigh_tidy(dev, neigh, 1);
 			tx->neigh = NULL;
 		}
 
@@ -1315,13 +1305,8 @@ static void ipoib_cm_tx_start(struct work_struct *work)
 
 		if (ret) {
 			neigh = p->neigh;
-			if (neigh) {
-				neigh->cm = NULL;
-				list_del(&neigh->list);
-				if (neigh->ah)
-					ipoib_put_ah(neigh->ah);
-				ipoib_neigh_free(dev, neigh);
-			}
+			if (neigh)
+				ipoib_neigh_tidy(dev, neigh, 1);
 			list_del(&p->list);
 			kfree(p);
 		}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e319d91..409453a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -273,18 +273,8 @@ static void path_free(struct net_device *dev, struct ipoib_path *path)
 
 	spin_lock_irqsave(&priv->lock, flags);
 
-	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
-		/*
-		 * It's safe to call ipoib_put_ah() inside priv->lock
-		 * here, because we know that path->ah will always
-		 * hold one more reference, so ipoib_put_ah() will
-		 * never do more than decrement the ref count.
-		 */
-		if (neigh->ah)
-			ipoib_put_ah(neigh->ah);
-
+	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list)
 		ipoib_neigh_free(dev, neigh);
-	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -466,10 +456,7 @@ static void path_rec_completion(int status,
 									       path,
 									       neigh));
 				if (!ipoib_cm_get(neigh)) {
-					list_del(&neigh->list);
-					if (neigh->ah)
-						ipoib_put_ah(neigh->ah);
-					ipoib_neigh_free(dev, neigh);
+					ipoib_neigh_tidy(dev, neigh, 0);
 					continue;
 				}
 			}
@@ -561,15 +548,16 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_neigh *neigh;
 	unsigned long flags;
 
+	spin_lock_irqsave(&priv->lock, flags);
+
 	neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour, skb->dev);
 	if (!neigh) {
+		spin_unlock_irqrestore(&priv->lock, flags);
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 		return;
 	}
 
-	spin_lock_irqsave(&priv->lock, flags);
-
 	path = __path_find(dev, skb_dst(skb)->neighbour->ha + 4);
 	if (!path) {
 		path = path_rec_create(dev, skb_dst(skb)->neighbour->ha + 4);
@@ -591,10 +579,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
 			if (!ipoib_cm_get(neigh))
 				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
 			if (!ipoib_cm_get(neigh)) {
-				list_del(&neigh->list);
-				if (neigh->ah)
-					ipoib_put_ah(neigh->ah);
-				ipoib_neigh_free(dev, neigh);
+				ipoib_neigh_tidy(dev, neigh, 0);
 				goto err_drop;
 			}
 			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
@@ -709,30 +694,22 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned long flags;
 
 	if (likely(skb_dst(skb) && skb_dst(skb)->neighbour)) {
-		if (unlikely(!*to_ipoib_neigh(skb_dst(skb)->neighbour))) {
+		struct neighbour *n = skb_dst(skb)->neighbour;
+		rcu_read_lock();
+		neigh = ipoib_neigh_retrieve(n);
+		if (unlikely(!neigh)) {
+			rcu_read_unlock();
 			ipoib_path_lookup(skb, dev);
 			return NETDEV_TX_OK;
 		}
 
-		neigh = *to_ipoib_neigh(skb_dst(skb)->neighbour);
-
-		if (unlikely((memcmp(&neigh->dgid.raw,
-				     skb_dst(skb)->neighbour->ha + 4,
+		if (unlikely((memcmp(&neigh->dgid.raw, n->ha +4, 
 				     sizeof(union ib_gid))) ||
 			     (neigh->dev != dev))) {
 			spin_lock_irqsave(&priv->lock, flags);
-			/*
-			 * It's safe to call ipoib_put_ah() inside
-			 * priv->lock here, because we know that
-			 * path->ah will always hold one more reference,
-			 * so ipoib_put_ah() will never do more than
-			 * decrement the ref count.
-			 */
-			if (neigh->ah)
-				ipoib_put_ah(neigh->ah);
-			list_del(&neigh->list);
-			ipoib_neigh_free(dev, neigh);
+			ipoib_neigh_tidy(dev, neigh, 0);
 			spin_unlock_irqrestore(&priv->lock, flags);
+			rcu_read_unlock();
 			ipoib_path_lookup(skb, dev);
 			return NETDEV_TX_OK;
 		}
@@ -740,10 +717,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (ipoib_cm_get(neigh)) {
 			if (ipoib_cm_up(neigh)) {
 				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+				rcu_read_unlock();
 				return NETDEV_TX_OK;
 			}
 		} else if (neigh->ah) {
-			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb_dst(skb)->neighbour->ha));
+			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha));
+			rcu_read_unlock();
 			return NETDEV_TX_OK;
 		}
 
@@ -755,6 +734,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			++dev->stats.tx_dropped;
 			dev_kfree_skb_any(skb);
 		}
+		rcu_read_unlock();
 	} else {
 		struct ipoib_pseudoheader *phdr =
 			(struct ipoib_pseudoheader *) skb->data;
@@ -843,62 +823,91 @@ static void ipoib_neigh_cleanup(struct neighbour *n)
 	struct ipoib_neigh *neigh;
 	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
 	unsigned long flags;
-	struct ipoib_ah *ah = NULL;
 
-	neigh = *to_ipoib_neigh(n);
+	rcu_read_lock();
+
+	neigh = ipoib_neigh_retrieve(n);
 	if (neigh)
 		priv = netdev_priv(neigh->dev);
-	else
+	else {
+		rcu_read_unlock();
 		return;
+	}
 	ipoib_dbg(priv,
 		  "neigh_cleanup for %06x %pI6\n",
 		  IPOIB_QPN(n->ha),
 		  n->ha + 4);
 
 	spin_lock_irqsave(&priv->lock, flags);
-
-	if (neigh->ah)
-		ah = neigh->ah;
-	list_del(&neigh->list);
-	ipoib_neigh_free(n->dev, neigh);
-
+	ipoib_neigh_tidy(n->dev, neigh, 0);
 	spin_unlock_irqrestore(&priv->lock, flags);
-
-	if (ah)
-		ipoib_put_ah(ah);
+	rcu_read_unlock();
 }
 
-struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
+struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *n,
 				      struct net_device *dev)
 {
 	struct ipoib_neigh *neigh;
 
-	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
 	if (!neigh)
 		return NULL;
 
-	neigh->neighbour = neighbour;
+	spin_lock_init(&neigh->lock);
+	neigh->neighbour = n;
 	neigh->dev = dev;
-	*to_ipoib_neigh(neighbour) = neigh;
 	skb_queue_head_init(&neigh->queue);
-	ipoib_cm_set(neigh, NULL);
-
+	ipoib_neigh_assign(n, neigh);
 	return neigh;
 }
 
-void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
+static void ipoib_neigh_free_rcu(struct rcu_head *head)
 {
+	struct ipoib_neigh *neigh = container_of(head, struct ipoib_neigh, rcu);
+	struct net_device *dev = neigh->dev;
 	struct sk_buff *skb;
-	*to_ipoib_neigh(neigh->neighbour) = NULL;
+
 	while ((skb = __skb_dequeue(&neigh->queue))) {
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}
+
+	if (ipoib_cm_get_rcu_flag(neigh, IPOIB_FLAG_CM_CLEAR))
+		ipoib_cm_set(neigh, NULL);
+
 	if (ipoib_cm_get(neigh))
 		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
+
+	if (neigh->ah)
+		ipoib_put_ah(neigh->ah);
+
 	kfree(neigh);
 }
 
+void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
+{
+	struct neighbour *n = neigh->neighbour;
+	ipoib_neigh_assign(n, NULL);
+
+	call_rcu(&neigh->rcu, ipoib_neigh_free_rcu);
+}
+
+void ipoib_neigh_tidy(struct net_device *dev, struct ipoib_neigh *neigh, int cm)
+{
+	spin_lock(&neigh->lock);
+	if (neigh->dead) {
+		/* lost the race */
+		spin_unlock(&neigh->lock);
+		return;
+	}
+	neigh->dead = 1;
+	list_del(&neigh->list);
+	spin_unlock(&neigh->lock);
+	if (cm)
+		ipoib_cm_set_rcu_flag(neigh, IPOIB_FLAG_CM_CLEAR);
+	ipoib_neigh_free(dev, neigh);
+}
+
 static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
 {
 	parms->neigh_cleanup = ipoib_neigh_cleanup;
@@ -1376,6 +1385,8 @@ static void ipoib_remove_one(struct ib_device *device)
 
 	dev_list = ib_get_client_data(device, &ipoib_client);
 
+	rcu_barrier();
+
 	list_for_each_entry_safe(priv, tmp, dev_list, list) {
 		ib_unregister_event_handler(&priv->event_handler);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index a0e9753..ec96f82 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -76,17 +76,8 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 
 	spin_lock_irq(&priv->lock);
 
-	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
-		/*
-		 * It's safe to call ipoib_put_ah() inside priv->lock
-		 * here, because we know that mcast->ah will always
-		 * hold one more reference, so ipoib_put_ah() will
-		 * never do more than decrement the ref count.
-		 */
-		if (neigh->ah)
-			ipoib_put_ah(neigh->ah);
+	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list)
 		ipoib_neigh_free(dev, neigh);
-	}
 
 	spin_unlock_irq(&priv->lock);
 
@@ -658,9 +649,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)		||
 	    !priv->broadcast					||
 	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
-		++dev->stats.tx_dropped;
-		dev_kfree_skb_any(skb);
-		goto unlock;
+		goto drop;
 	}
 
 	mcast = __ipoib_mcast_find(dev, mgid);
@@ -673,9 +662,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 		if (!mcast) {
 			ipoib_warn(priv, "unable to allocate memory for "
 				   "multicast structure\n");
-			++dev->stats.tx_dropped;
-			dev_kfree_skb_any(skb);
-			goto out;
+			goto drop;
 		}
 
 		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
@@ -705,25 +692,42 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 		mcast = NULL;
 	}
 
-out:
 	if (mcast && mcast->ah) {
-		if (skb_dst(skb)		&&
-		    skb_dst(skb)->neighbour &&
-		    !*to_ipoib_neigh(skb_dst(skb)->neighbour)) {
-			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb_dst(skb)->neighbour,
-									skb->dev);
+		if (skb_dst(skb) && skb_dst(skb)->neighbour) {
+			struct neighbour *n = skb_dst(skb)->neighbour;
+			struct ipoib_neigh *neigh;
+			int drop = 0;
+
+			rcu_read_lock();
+			neigh = ipoib_neigh_retrieve(n);
+
+			if (!neigh) 
+				neigh = ipoib_neigh_alloc(n, skb->dev);
 
 			if (neigh) {
-				kref_get(&mcast->ah->ref);
-				neigh->ah	= mcast->ah;
-				list_add_tail(&neigh->list, &mcast->neigh_list);
+				spin_lock(&neigh->lock);
+				if (!neigh->dead) {
+					kref_get(&mcast->ah->ref);
+					neigh->ah	= mcast->ah;
+					list_add_tail(&neigh->list, &mcast->neigh_list);
+				} else 
+					drop = 1;
+				spin_unlock(&neigh->lock);
 			}
+			rcu_read_unlock();
+			if (drop)
+				goto drop;
 		}
 
 		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
 	}
 
-unlock:
+	spin_unlock_irqrestore(&priv->lock, flags);
+	return;
+
+drop:
+	++dev->stats.tx_dropped;
+	dev_kfree_skb_any(skb);
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 



More information about the general mailing list