[ewg] [PATCH] IB/ipoib: Leave stale send-only multicast groups

Richard Croucher richard at informatix-sol.com
Thu Feb 3 01:16:44 PST 2011


The Ethernet community uses IGMP to deal with this scenario.
IGMP periodically polls each publisher and receiver to verify that they
still want to belong to the MC group. This is handled by the kernel, the
apps never see this.
This approach deals with both the case of the leave request being lost, and
the case of the publisher/subscriber crashing and never explicitly sending a
leave request.
InfiniBand similarly needs to deal with both of these cases.

-----Original Message-----
From: ewg-bounces at lists.openfabrics.org
[mailto:ewg-bounces at lists.openfabrics.org] On Behalf Of Moni Shoua
Sent: 02 February 2011 14:38
To: Vlad
Cc: Yossi Etigin; ewg
Subject: [ewg] [PATCH] IB/ipoib: Leave stale send-only multicast groups

This patch was sent to linux-rdma a while ago but has not been accepted yet.
However, no objections have been raised so far.

Note: the patch below does not modify drivers/infiniband/ulp/ipoib directly;
instead, it adds a patch file under kernel_patches/fixes.

--

Index: ofa_kernel-1.5.3/kernel_patches/fixes/zzz_0041_add_mcast_gc.diff
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5.3/kernel_patches/fixes/zzz_0041_add_mcast_gc.diff
2011-02-02 16:29:00.000000000 +0200
@@ -0,0 +1,206 @@
+The kernel never leaves send only multicast groups. In addition, IPoIB
doesn't
+implement real send only join but it sends the SM a send/receive join
request.
+In order to avoid MC group explosion on the switch, a mechanism of garbage
+collection to unused multicast groups is required.
+
+Signed-off-by: Yossi Etigin <yosefe at voltaire.com>
+Signed-off-by: Moni Shoua <monis at voltaire.com>
+--
+
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
b/drivers/infiniband/ulp/ipoib/ipoib.h
+index ab97f92..fb1714f 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib.h
++++ b/drivers/infiniband/ulp/ipoib/ipoib.h
+@@ -92,6 +92,7 @@ enum {
+ 	IPOIB_FLAG_ADMIN_CM	  = 9,
+ 	IPOIB_FLAG_UMCAST	  = 10,
+ 	IPOIB_FLAG_CSUM		  = 11,
++	IPOIB_MCAST_RUN_GC	  = 12,
+ 
+ 	IPOIB_MAX_BACKOFF_SECONDS = 16,
+ 
+@@ -132,6 +133,7 @@ struct ipoib_mcast {
+ 	struct list_head  list;
+ 
+ 	unsigned long created;
++	unsigned long used;
+ 	unsigned long backoff;
+ 
+ 	unsigned long flags;
+@@ -283,7 +285,8 @@ struct ipoib_dev_priv {
+ 	struct rb_root multicast_tree;
+ 
+ 	struct delayed_work pkey_poll_task;
+-	struct delayed_work mcast_task;
++	struct delayed_work mcast_join_task;
++	struct delayed_work mcast_leave_task;
+ 	struct work_struct carrier_on_task;
+ 	struct work_struct flush_light;
+ 	struct work_struct flush_normal;
+@@ -411,6 +414,8 @@ void ipoib_neigh_free(struct net_device *dev, struct
ipoib_neigh *neigh);
+ 
+ extern struct workqueue_struct *ipoib_workqueue;
+ 
++extern int ipoib_mc_sendonly_timeout;
++
+ /* functions */
+ 
+ int ipoib_poll(struct napi_struct *napi, int budget);
+@@ -453,6 +458,7 @@ int ipoib_dev_init(struct net_device *dev, struct
ib_device *ca, int port);
+ void ipoib_dev_cleanup(struct net_device *dev);
+ 
+ void ipoib_mcast_join_task(struct work_struct *work);
++void ipoib_mcast_leave_task(struct work_struct *work);
+ void ipoib_mcast_carrier_on_task(struct work_struct *work);
+ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff
*skb);
+ 
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
+index 7a07a72..563370e 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
+@@ -67,6 +67,11 @@ module_param_named(debug_level, ipoib_debug_level, int,
0644);
+ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+ #endif
+ 
++int ipoib_mc_sendonly_timeout;
++
++module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int,
0644);
++MODULE_PARM_DESC(mc_sendonly_timeout, "Enable debug tracing if > 0");
++
+ struct ipoib_path_iter {
+ 	struct net_device *dev;
+ 	struct ipoib_path  path;
+@@ -1020,7 +1025,8 @@ static void ipoib_setup(struct net_device *dev)
+ 	INIT_LIST_HEAD(&priv->multicast_list);
+ 
+ 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+-	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
++	INIT_DELAYED_WORK(&priv->mcast_join_task,   ipoib_mcast_join_task);
++	INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task);
+ 	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+ 	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+ 	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+index 3871ac6..87928c1 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+@@ -117,6 +117,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct
net_device *dev,
+ 
+ 	mcast->dev = dev;
+ 	mcast->created = jiffies;
++	mcast->used = jiffies;
+ 	mcast->backoff = 1;
+ 
+ 	INIT_LIST_HEAD(&mcast->list);
+@@ -403,7 +404,7 @@ static int ipoib_mcast_join_complete(int status,
+ 		mutex_lock(&mcast_mutex);
+ 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ 			queue_delayed_work(ipoib_workqueue,
+-					   &priv->mcast_task, 0);
++					   &priv->mcast_join_task, 0);
+ 		mutex_unlock(&mcast_mutex);
+ 
+ 		/*
+@@ -436,7 +437,7 @@ static int ipoib_mcast_join_complete(int status,
+ 	mutex_lock(&mcast_mutex);
+ 	spin_lock_irq(&priv->lock);
+ 	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task,
+ 				   mcast->backoff * HZ);
+ 	spin_unlock_irq(&priv->lock);
+ 	mutex_unlock(&mcast_mutex);
+@@ -505,7 +506,7 @@ static void ipoib_mcast_join(struct net_device *dev,
struct ipoib_mcast *mcast,
+ 		mutex_lock(&mcast_mutex);
+ 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ 			queue_delayed_work(ipoib_workqueue,
+-					   &priv->mcast_task,
++					   &priv->mcast_join_task,
+ 					   mcast->backoff * HZ);
+ 		mutex_unlock(&mcast_mutex);
+ 	}
+@@ -514,7 +515,7 @@ static void ipoib_mcast_join(struct net_device *dev,
struct ipoib_mcast *mcast,
+ void ipoib_mcast_join_task(struct work_struct *work)
+ {
+ 	struct ipoib_dev_priv *priv =
+-		container_of(work, struct ipoib_dev_priv, mcast_task.work);
++		container_of(work, struct ipoib_dev_priv,
mcast_join_task.work);
+ 	struct net_device *dev = priv->dev;
+ 
+ 	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+@@ -546,7 +547,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
+ 			mutex_lock(&mcast_mutex);
+ 			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ 				queue_delayed_work(ipoib_workqueue,
+-						   &priv->mcast_task, HZ);
++						   &priv->mcast_join_task,
HZ);
+ 			mutex_unlock(&mcast_mutex);
+ 			return;
+ 		}
+@@ -610,7 +611,9 @@ int ipoib_mcast_start_thread(struct net_device *dev)
+ 
+ 	mutex_lock(&mcast_mutex);
+ 	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task,
0);
++	if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task,
0);
+ 	mutex_unlock(&mcast_mutex);
+ 
+ 	return 0;
+@@ -624,7 +627,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int
flush)
+ 
+ 	mutex_lock(&mcast_mutex);
+ 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+-	cancel_delayed_work(&priv->mcast_task);
++	clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags);
++	cancel_delayed_work(&priv->mcast_join_task);
++	cancel_delayed_work(&priv->mcast_leave_task);
+ 	mutex_unlock(&mcast_mutex);
+ 
+ 	if (flush)
+@@ -727,7 +732,7 @@ out:
+ 				list_add_tail(&neigh->list,
&mcast->neigh_list);
+ 			}
+ 		}
+-
++		mcast->used = jiffies;
+ 		spin_unlock_irqrestore(&priv->lock, flags);
+ 		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+ 		return;
+@@ -888,6 +893,35 @@ void ipoib_mcast_restart_task(struct work_struct
*work)
+ 		ipoib_mcast_start_thread(dev);
+ }
+ 
++void ipoib_mcast_leave_task(struct work_struct *work)
++{
++	struct ipoib_dev_priv *priv =
++		container_of(work, struct ipoib_dev_priv,
mcast_leave_task.work);
++	struct net_device *dev = priv->dev;
++	struct ipoib_mcast *mcast, *tmcast;
++	LIST_HEAD(remove_list);
++
++	if (!test_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
++		return;
++
++	if (ipoib_mc_sendonly_timeout > 0) {
++		list_for_each_entry_safe(mcast, tmcast,
&priv->multicast_list, list) {
++			if (test_bit(IPOIB_MCAST_FLAG_SENDONLY,
&mcast->flags) &&
++			    time_before(mcast->used, jiffies -
ipoib_mc_sendonly_timeout * HZ)) {
++				rb_erase(&mcast->rb_node,
&priv->multicast_tree);
++				list_move_tail(&mcast->list, &remove_list);
++			}
++		}
++
++		list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
{
++			ipoib_mcast_leave(dev, mcast);
++			ipoib_mcast_free(mcast);
++		}
++	}
++
++	queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 60 *
HZ);
++}
++
+ #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+ 
+ struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
_______________________________________________
ewg mailing list
ewg at lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg




More information about the ewg mailing list