[openib-general] [PATCH 1 of 2] Re: ipoib oops
Michael S. Tsirkin
mst at mellanox.co.il
Wed Nov 16 10:06:29 PST 2005
Quoting r. Michael S. Tsirkin <mst at mellanox.co.il>:
> Subject: ipoib oops
>
> Hi!
> I saw this in /var/log/messages recently.
> Unfortunately I cant say exactly what I did to trigger this problem.
The following is for review only: this triggers another problem in
ipoib which I'm working on now.
Comments?
---
Make sure all users of priv->mcast_list and priv_broadcast are protected
by priv->lock. I had to add another list_head to the mcast structure
to avoid using mcast_list outside the lock in stop_thread.
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2005-11-15 15:53:09.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2005-11-15 17:31:35.000000000 +0200
@@ -62,6 +62,7 @@ struct ipoib_mcast {
struct rb_node rb_node;
struct list_head list;
+ struct list_head cancel_list;
struct completion done;
int query_id;
@@ -587,10 +588,12 @@ int ipoib_mcast_start_thread(struct net_
return 0;
}
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+int ipoib_mcast_stop_thread(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_mcast *mcast;
+ struct ipoib_mcast *mcast, *tmcast;
+ unsigned long flags;
+ LIST_HEAD(cancel_list);
ipoib_dbg_mcast(priv, "stopping multicast thread\n");
@@ -599,17 +602,20 @@ int ipoib_mcast_stop_thread(struct net_d
cancel_delayed_work(&priv->mcast_task);
up(&mcast_mutex);
- if (flush)
- flush_workqueue(ipoib_workqueue);
+ flush_workqueue(ipoib_workqueue);
- if (priv->broadcast && priv->broadcast->query) {
- ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query);
- priv->broadcast->query = NULL;
- ipoib_dbg_mcast(priv, "waiting for bcast\n");
- wait_for_completion(&priv->broadcast->done);
- }
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (priv->broadcast && priv->broadcast->query)
+ list_add_tail(&priv->broadcast->cancel_list, &cancel_list);
+
+ list_for_each_entry(mcast, &priv->multicast_list, list)
+ if (mcast->query)
+ list_add_tail(&mcast->cancel_list, &cancel_list);
- list_for_each_entry(mcast, &priv->multicast_list, list) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ list_for_each_entry_safe(mcast, tmcast, &cancel_list, cancel_list) {
if (mcast->query) {
ib_sa_cancel_query(mcast->query_id, mcast->query);
mcast->query = NULL;
@@ -617,6 +623,7 @@ int ipoib_mcast_stop_thread(struct net_d
IPOIB_GID_ARG(mcast->mcmember.mgid));
wait_for_completion(&mcast->done);
}
+ list_del_init(&mcast->cancel_list);
}
return 0;
@@ -741,12 +748,14 @@ void ipoib_mcast_dev_flush(struct net_de
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
LIST_HEAD(remove_list);
+ LIST_HEAD(cancel_list);
struct ipoib_mcast *mcast, *tmcast, *nmcast;
unsigned long flags;
ipoib_dbg_mcast(priv, "flushing multicast list\n");
spin_lock_irqsave(&priv->lock, flags);
+
list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
nmcast = ipoib_mcast_alloc(dev, 0);
if (nmcast) {
@@ -780,14 +789,24 @@ void ipoib_mcast_dev_flush(struct net_de
&priv->multicast_tree);
list_add_tail(&priv->broadcast->list, &remove_list);
+ priv->broadcast = nmcast;
+ } else {
+ ipoib_warn(priv, "could not reallocate multicast group "
+ IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(priv->broadcast->mcmember.mgid));
}
-
- priv->broadcast = nmcast;
}
spin_unlock_irqrestore(&priv->lock, flags);
list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ if (mcast->query) {
+ ib_sa_cancel_query(mcast->query_id, mcast->query);
+ mcast->query = NULL;
+ ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+ wait_for_completion(&mcast->done);
+ }
ipoib_mcast_leave(dev, mcast);
ipoib_mcast_free(mcast);
}
@@ -821,8 +840,12 @@ void ipoib_mcast_restart_task(void *dev_
unsigned long flags;
ipoib_dbg_mcast(priv, "restarting multicast task\n");
+ ipoib_dbg_mcast(priv, "stopping multicast thread\n");
- ipoib_mcast_stop_thread(dev, 0);
+ down(&mcast_mutex);
+ clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+ cancel_delayed_work(&priv->mcast_task);
+ up(&mcast_mutex);
spin_lock_irqsave(&priv->lock, flags);
@@ -900,6 +923,13 @@ void ipoib_mcast_restart_task(void *dev_
/* We have to cancel outside of the spinlock */
list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ if (mcast->query) {
+ ib_sa_cancel_query(mcast->query_id, mcast->query);
+ mcast->query = NULL;
+ ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+ wait_for_completion(&mcast->done);
+ }
ipoib_mcast_leave(mcast->dev, mcast);
ipoib_mcast_free(mcast);
}
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2005-11-15 15:53:09.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2005-11-15 17:20:47.000000000 +0200
@@ -452,7 +452,7 @@ int ipoib_ib_dev_down(struct net_device
flush_workqueue(ipoib_workqueue);
}
- ipoib_mcast_stop_thread(dev, 1);
+ ipoib_mcast_stop_thread(dev);
/*
* Flush the multicast groups first so we stop any multicast joins. The
@@ -619,7 +619,7 @@ void ipoib_ib_dev_cleanup(struct net_dev
ipoib_dbg(priv, "cleaning up ib_dev\n");
- ipoib_mcast_stop_thread(dev, 1);
+ ipoib_mcast_stop_thread(dev);
/* Delete the broadcast address and the local address */
ipoib_mcast_dev_down(dev);
Index: linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux-2.6.14.orig/drivers/infiniband/ulp/ipoib/ipoib.h 2005-11-14 17:22:18.000000000 +0200
+++ linux-2.6.14/drivers/infiniband/ulp/ipoib/ipoib.h 2005-11-15 17:02:04.000000000 +0200
@@ -264,7 +264,7 @@ void ipoib_mcast_send(struct net_device
void ipoib_mcast_restart_task(void *dev_ptr);
int ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
+int ipoib_mcast_stop_thread(struct net_device *dev);
void ipoib_mcast_dev_down(struct net_device *dev);
void ipoib_mcast_dev_flush(struct net_device *dev);
--
MST
More information about the general
mailing list