[openib-general] [PATCH 1 of 2] Re: ipoib oops

Michael S. Tsirkin mst at mellanox.co.il
Wed Nov 16 10:06:29 PST 2005


Quoting r. Michael S. Tsirkin <mst at mellanox.co.il>:
> Subject: ipoib oops
> 
> Hi!
> I saw this in /var/log/messages recently.
> Unfortunately I cant say exactly what I did to trigger this problem.

The following is for review only: this triggers another problem in
ipoib which I'm working on now.
Comments?

---

Make sure all users of priv->mcast_list and priv_broadcast are protected
by priv->lock. I had to add another list_head to the mcast structure
to avoid using mcast_list outside the lock in stop_thread.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2005-11-15 15:53:09.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2005-11-15 17:31:35.000000000 +0200
@@ -62,6 +62,7 @@ struct ipoib_mcast {
 
 	struct rb_node    rb_node;
 	struct list_head  list;
+	struct list_head  cancel_list;
 	struct completion done;
 
 	int                 query_id;
@@ -587,10 +588,12 @@ int ipoib_mcast_start_thread(struct net_
 	return 0;
 }
 
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+int ipoib_mcast_stop_thread(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_mcast *mcast;
+	struct ipoib_mcast *mcast, *tmcast;
+	unsigned long flags;
+	LIST_HEAD(cancel_list);
 
 	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
@@ -599,17 +602,20 @@ int ipoib_mcast_stop_thread(struct net_d
 	cancel_delayed_work(&priv->mcast_task);
 	up(&mcast_mutex);
 
-	if (flush)
-		flush_workqueue(ipoib_workqueue);
+	flush_workqueue(ipoib_workqueue);
 
-	if (priv->broadcast && priv->broadcast->query) {
-		ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query);
-		priv->broadcast->query = NULL;
-		ipoib_dbg_mcast(priv, "waiting for bcast\n");
-		wait_for_completion(&priv->broadcast->done);
-	}
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (priv->broadcast && priv->broadcast->query)
+		list_add_tail(&priv->broadcast->cancel_list, &cancel_list);
+
+	list_for_each_entry(mcast, &priv->multicast_list, list)
+		if (mcast->query)
+			list_add_tail(&mcast->cancel_list, &cancel_list);
 
-	list_for_each_entry(mcast, &priv->multicast_list, list) {
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	list_for_each_entry_safe(mcast, tmcast, &cancel_list, cancel_list) {
 		if (mcast->query) {
 			ib_sa_cancel_query(mcast->query_id, mcast->query);
 			mcast->query = NULL;
@@ -617,6 +623,7 @@ int ipoib_mcast_stop_thread(struct net_d
 					IPOIB_GID_ARG(mcast->mcmember.mgid));
 			wait_for_completion(&mcast->done);
 		}
+		list_del_init(&mcast->cancel_list);
 	}
 
 	return 0;
@@ -741,12 +748,14 @@ void ipoib_mcast_dev_flush(struct net_de
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	LIST_HEAD(remove_list);
+	LIST_HEAD(cancel_list);
 	struct ipoib_mcast *mcast, *tmcast, *nmcast;
 	unsigned long flags;
 
 	ipoib_dbg_mcast(priv, "flushing multicast list\n");
 
 	spin_lock_irqsave(&priv->lock, flags);
+
 	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
 		nmcast = ipoib_mcast_alloc(dev, 0);
 		if (nmcast) {
@@ -780,14 +789,24 @@ void ipoib_mcast_dev_flush(struct net_de
 					&priv->multicast_tree);
 
 			list_add_tail(&priv->broadcast->list, &remove_list);
+			priv->broadcast = nmcast;
+		} else {
+			ipoib_warn(priv, "could not reallocate multicast group "
+				   IPOIB_GID_FMT "\n",
+				   IPOIB_GID_ARG(priv->broadcast->mcmember.mgid));
 		}
-
-		priv->broadcast = nmcast;
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		if (mcast->query) {
+			ib_sa_cancel_query(mcast->query_id, mcast->query);
+			mcast->query = NULL;
+			ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid));
+			wait_for_completion(&mcast->done);
+		}
 		ipoib_mcast_leave(dev, mcast);
 		ipoib_mcast_free(mcast);
 	}
@@ -821,8 +840,12 @@ void ipoib_mcast_restart_task(void *dev_
 	unsigned long flags;
 
 	ipoib_dbg_mcast(priv, "restarting multicast task\n");
+	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
-	ipoib_mcast_stop_thread(dev, 0);
+	down(&mcast_mutex);
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	cancel_delayed_work(&priv->mcast_task);
+	up(&mcast_mutex);
 
 	spin_lock_irqsave(&priv->lock, flags);
 
@@ -900,6 +923,13 @@ void ipoib_mcast_restart_task(void *dev_
 
 	/* We have to cancel outside of the spinlock */
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		if (mcast->query) {
+			ib_sa_cancel_query(mcast->query_id, mcast->query);
+			mcast->query = NULL;
+			ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid));
+			wait_for_completion(&mcast->done);
+		}
 		ipoib_mcast_leave(mcast->dev, mcast);
 		ipoib_mcast_free(mcast);
 	}
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2005-11-15 15:53:09.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2005-11-15 17:20:47.000000000 +0200
@@ -452,7 +452,7 @@ int ipoib_ib_dev_down(struct net_device 
 		flush_workqueue(ipoib_workqueue);
 	}
 
-	ipoib_mcast_stop_thread(dev, 1);
+	ipoib_mcast_stop_thread(dev);
 
 	/*
 	 * Flush the multicast groups first so we stop any multicast joins. The
@@ -619,7 +619,7 @@ void ipoib_ib_dev_cleanup(struct net_dev
 
 	ipoib_dbg(priv, "cleaning up ib_dev\n");
 
-	ipoib_mcast_stop_thread(dev, 1);
+	ipoib_mcast_stop_thread(dev);
 
 	/* Delete the broadcast address and the local address */
 	ipoib_mcast_dev_down(dev);
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2005-11-14 17:22:18.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib.h	2005-11-15 17:02:04.000000000 +0200
@@ -264,7 +264,7 @@ void ipoib_mcast_send(struct net_device 
 
 void ipoib_mcast_restart_task(void *dev_ptr);
 int ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
+int ipoib_mcast_stop_thread(struct net_device *dev);
 
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);

-- 
MST



More information about the general mailing list