[ofa-general] [RFC] ipoib: multicast sender will leave the group when neighbour is cleaned

Yossi Etigin yosefe at Voltaire.COM
Mon Jul 21 07:38:17 PDT 2008


The problem:
A multicast sender joins the MGID as full member, but does not leave 
(as long as the interface is up). This causes an MGID leakage in the SM.

Proposed solution:
IPOIB driver will leave an MGID when the kernel neighbour for this MGID
is cleaned. Use the ipoib_neigh_cleanup hook. Since it's called from
atomic context, schedule a task that will actually leave the group.
Allocate a bit IPOIB_MCAST_FLAG_UNUSED that will indicate that a group
is marked to be left by this task.

An issue we found with this solution, is that it takes a while until the
neighbour is cleaned, and it requires that the host will send some traffic
(I supposed that unless we put more entries in the routing table, out neighbour
will not be garbage collected).

I'll be happy to hear remarks/suggestions, particularly about the issue above.


The code below probably has issues, it's just to demonstrate the proposed change.

--

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc..522e4a0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -95,6 +95,7 @@ enum {
	IPOIB_MCAST_FLAG_SENDONLY = 1,
	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
	IPOIB_MCAST_FLAG_ATTACHED = 3,
+	IPOIB_MCAST_FLAG_UNUSED   = 4,

	MAX_SEND_CQE		  = 16,
};
@@ -275,7 +276,8 @@ struct ipoib_dev_priv {
	struct rb_root multicast_tree;

	struct delayed_work pkey_poll_task;
-	struct delayed_work mcast_task;
+	struct delayed_work mcast_join_task;
+	struct work_struct mcast_leave_task;
	struct work_struct flush_task;
	struct work_struct restart_task;
	struct delayed_work ah_reap_task;
@@ -440,7 +442,9 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
void ipoib_dev_cleanup(struct net_device *dev);

void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_leave_task(struct work_struct *work);
void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
+void ipoib_mcast_neigh_cleanup(struct net_device *dev, void *mgid);

void ipoib_mcast_restart_task(struct work_struct *work);
int ipoib_mcast_start_thread(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090..e299ea2 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -823,6 +823,8 @@ static void ipoib_neigh_cleanup(struct neighbour *n)
		  IPOIB_QPN(n->ha),
		  IPOIB_GID_RAW_ARG(n->ha + 4));

+	ipoib_mcast_neigh_cleanup(n->dev, n->ha + 4);
+
	spin_lock_irqsave(&priv->lock, flags);

	if (neigh->ah)
@@ -990,8 +992,9 @@ static void ipoib_setup(struct net_device *dev)

	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
-	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_DELAYED_WORK(&priv->mcast_join_task,   ipoib_mcast_join_task);
+	INIT_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task);
+	INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3f663fb..7175479 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -389,7 +389,7 @@ static int ipoib_mcast_join_complete(int status,
		mutex_lock(&mcast_mutex);
		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
-					   &priv->mcast_task, 0);
+					   &priv->mcast_join_task, 0);
		mutex_unlock(&mcast_mutex);

		if (mcast == priv->broadcast)
@@ -422,7 +422,7 @@ static int ipoib_mcast_join_complete(int status,
	mutex_lock(&mcast_mutex);
	spin_lock_irq(&priv->lock);
	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task,
				   mcast->backoff * HZ);
	spin_unlock_irq(&priv->lock);
	mutex_unlock(&mcast_mutex);
@@ -492,7 +492,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
		mutex_lock(&mcast_mutex);
		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
-					   &priv->mcast_task,
+					   &priv->mcast_join_task,
					   mcast->backoff * HZ);
		mutex_unlock(&mcast_mutex);
	}
@@ -501,7 +501,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
void ipoib_mcast_join_task(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, mcast_task.work);
+		container_of(work, struct ipoib_dev_priv, mcast_join_task.work);
	struct net_device *dev = priv->dev;

	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
@@ -530,7 +530,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
			mutex_lock(&mcast_mutex);
			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
				queue_delayed_work(ipoib_workqueue,
-						   &priv->mcast_task, HZ);
+						   &priv->mcast_join_task, HZ);
			mutex_unlock(&mcast_mutex);
			return;
		}
@@ -591,7 +591,7 @@ int ipoib_mcast_start_thread(struct net_device *dev)

	mutex_lock(&mcast_mutex);
	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0);
	mutex_unlock(&mcast_mutex);

	spin_lock_irq(&priv->lock);
@@ -613,7 +613,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)

	mutex_lock(&mcast_mutex);
	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
-	cancel_delayed_work(&priv->mcast_task);
+	cancel_delayed_work(&priv->mcast_join_task);
	mutex_unlock(&mcast_mutex);

	if (flush)
@@ -864,6 +864,46 @@ void ipoib_mcast_restart_task(struct work_struct *work)
		ipoib_mcast_start_thread(dev);
}

+void ipoib_mcast_leave_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, mcast_leave_task);
+	struct net_device *dev = priv->dev;
+	struct ipoib_mcast *mcast;
+	
+	spin_lock_irq(&priv->lock);
+	list_for_each_entry(mcast, &priv->multicast_list, list) {
+		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+		    test_bit(IPOIB_MCAST_FLAG_UNUSED, &mcast->flags)) {
+			break;
+		}
+	}
+	spin_unlock_irq(&priv->lock);
+
+	if (&mcast->list == &priv->multicast_list) {
+		ipoib_dbg(priv, "ipoib_mcast_leave_task: no groups to leave\n");
+		return;
+	}
+
+	list_del(&mcast->list);
+	rb_erase(&mcast->rb_node, &priv->multicast_tree);
+	ipoib_mcast_leave(dev, mcast);
+	ipoib_mcast_free(mcast);
+}
+
+void ipoib_mcast_neigh_cleanup(struct net_device *dev, void *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_mcast *mcast = __ipoib_mcast_find(dev, mgid);
+	
+	if (mcast) {
+		set_bit(IPOIB_MCAST_FLAG_UNUSED, &mcast->flags);
+		queue_work(ipoib_workqueue, &priv->mcast_leave_task);
+	} else
+		ipoib_dbg_mcast(priv, "mcast gid " IPOIB_GID_FMT " not found\n",
+				IPOIB_GID_RAW_ARG(mgid));
+}
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)

-- 
--Yossi




More information about the general mailing list