[ofa-general] IPoIB kernel Oops -- race condition

Jack Morgenstein jackm at dev.mellanox.co.il
Mon Jun 29 23:55:09 PDT 2009


On Monday 29 June 2009 22:58, Yossi Etigin wrote:
> I think we could put just the leave/free on the WQ. It would be symmetrical to join which
> takes place on the WQ as well. We would probably need to add a "to-free" list to ipoib,
> the opposite of the join list (multicast_list) which is currently there.
> Instead of doing free to mcast, we list_add it to some priv->mcast_leave_list and queue
> the work. I think it can be safe to defer multicast leave/free, as long as it's removed from
> the mcast tree on time.
> 
Is the patch below what you mean?  I think this will work.
(It is compiled, but not yet tested).
I've opened Bugzilla 1666 on this issue.

-Jack
==================================================================================================================
ipoib: fix Oops in multicast leave flow.

To avoid race conditions (which may lead to a kernel Oops) between
multicast join and multicast leave, we transfer leave processing to the
workqueue (rather than do it in place).

This fixes Bugzilla 1666.

This fix was suggested by Yossi Etigin of Voltaire.

Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>

Index: is4_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- is4_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2009-06-30 09:44:48.434443000 +0300
+++ is4_kernel/drivers/infiniband/ulp/ipoib/ipoib.h	2009-06-30 09:44:49.004011000 +0300
@@ -288,10 +288,12 @@ struct ipoib_dev_priv {
 
 	struct ipoib_mcast *broadcast;
 	struct list_head multicast_list;
+	struct list_head mcast_remove_list;
 	struct rb_root multicast_tree;
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
+	struct work_struct mcast_remove_task;
 	struct work_struct carrier_on_task;
 	struct work_struct flush_light;
 	struct work_struct flush_normal;
@@ -463,6 +465,7 @@ int ipoib_dev_init(struct net_device *de
 void ipoib_dev_cleanup(struct net_device *dev);
 
 void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_remove_task(struct work_struct *work);
 void ipoib_mcast_carrier_on_task(struct work_struct *work);
 void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
 
Index: is4_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- is4_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2009-06-30 09:44:48.897904000 +0300
+++ is4_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c	2009-06-30 09:44:49.011018000 +0300
@@ -1096,9 +1096,11 @@ static void ipoib_setup(struct net_devic
 	INIT_LIST_HEAD(&priv->child_intfs);
 	INIT_LIST_HEAD(&priv->dead_ahs);
 	INIT_LIST_HEAD(&priv->multicast_list);
+	INIT_LIST_HEAD(&priv->mcast_remove_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
+	INIT_WORK(&priv->mcast_remove_task, ipoib_mcast_remove_task);
 	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
 	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
 	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
@@ -1468,6 +1470,7 @@ err_fs:
 static void __exit ipoib_cleanup_module(void)
 {
 	ib_unregister_client(&ipoib_client);
+	flush_workqueue(ipoib_workqueue);
 	ib_sa_unregister_client(&ipoib_sa_client);
 	ipoib_unregister_debugfs();
 	destroy_workqueue(ipoib_workqueue);
Index: is4_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- is4_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2009-06-30 09:44:48.955963000 +0300
+++ is4_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2009-06-30 09:45:44.990941000 +0300
@@ -657,6 +657,31 @@ static int ipoib_mcast_leave(struct net_
 	return 0;
 }
 
+void ipoib_mcast_remove_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, mcast_remove_task);
+	struct net_device *dev = priv->dev;
+
+	LIST_HEAD(remove_list);
+	struct ipoib_mcast *mcast, *tmcast;
+	unsigned long flags;
+
+	ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry_safe(mcast, tmcast, &priv->mcast_remove_list, list) {
+		list_del(&mcast->list);
+		list_add_tail(&mcast->list, &remove_list);
+	}
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+}
+
 void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -740,7 +765,6 @@ unlock:
 void ipoib_mcast_dev_flush(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	LIST_HEAD(remove_list);
 	struct ipoib_mcast *mcast, *tmcast;
 	unsigned long flags;
 
@@ -751,21 +775,18 @@ void ipoib_mcast_dev_flush(struct net_de
 	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
 		list_del(&mcast->list);
 		rb_erase(&mcast->rb_node, &priv->multicast_tree);
-		list_add_tail(&mcast->list, &remove_list);
+		list_add_tail(&mcast->list, &priv->mcast_remove_list);
 	}
 
 	if (priv->broadcast) {
 		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
-		list_add_tail(&priv->broadcast->list, &remove_list);
+		list_add_tail(&priv->broadcast->list,  &priv->mcast_remove_list);
 		priv->broadcast = NULL;
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
-		ipoib_mcast_leave(dev, mcast);
-		ipoib_mcast_free(mcast);
-	}
+	queue_work(ipoib_workqueue, &priv->mcast_remove_task);
 }
 
 void ipoib_mcast_restart_task(struct work_struct *work)
@@ -775,7 +796,6 @@ void ipoib_mcast_restart_task(struct wor
 	struct net_device *dev = priv->dev;
 	struct dev_mc_list *mclist;
 	struct ipoib_mcast *mcast, *tmcast;
-	LIST_HEAD(remove_list);
 	unsigned long flags;
 	struct ib_sa_mcmember_rec rec;
 
@@ -831,7 +851,7 @@ void ipoib_mcast_restart_task(struct wor
 
 			if (mcast) {
 				/* Destroy the send only entry */
-				list_move_tail(&mcast->list, &remove_list);
+				list_move_tail(&mcast->list, &priv->mcast_remove_list);
 
 				rb_replace_node(&mcast->rb_node,
 						&nmcast->rb_node,
@@ -856,7 +876,7 @@ void ipoib_mcast_restart_task(struct wor
 			rb_erase(&mcast->rb_node, &priv->multicast_tree);
 
 			/* Move to the remove list */
-			list_move_tail(&mcast->list, &remove_list);
+			list_move_tail(&mcast->list, &priv->mcast_remove_list);
 		}
 	}
 
@@ -864,11 +884,7 @@ void ipoib_mcast_restart_task(struct wor
 	netif_addr_unlock(dev);
 	local_irq_restore(flags);
 
-	/* We have to cancel outside of the spinlock */
-	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
-		ipoib_mcast_leave(mcast->dev, mcast);
-		ipoib_mcast_free(mcast);
-	}
+	queue_work(ipoib_workqueue, &priv->mcast_remove_task);
 
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		ipoib_mcast_start_thread(dev);



More information about the general mailing list