[ewg] [PATCH] IB/ipoib: Leave stale send-only multicast groups

Moni Shoua monis at Voltaire.COM
Wed Feb 2 06:37:31 PST 2011


This patch was sent to linux-rdma a while ago but has not been accepted yet.
However, no objections have been raised so far.

Note: the patch below does not modify drivers/infiniband/ulp/ipoib directly;
instead, it adds a patch file under kernel_patches/fixes.

--

Index: ofa_kernel-1.5.3/kernel_patches/fixes/zzz_0041_add_mcast_gc.diff
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5.3/kernel_patches/fixes/zzz_0041_add_mcast_gc.diff	2011-02-02 16:29:00.000000000 +0200
@@ -0,0 +1,206 @@
+The kernel never leaves send-only multicast groups. In addition, IPoIB does not
+implement a real send-only join; it sends the SM a send/receive join request.
+To avoid an explosion of MC groups on the switch, a garbage-collection
+mechanism for unused multicast groups is required.
+
+Signed-off-by: Yossi Etigin <yosefe at voltaire.com>
+Signed-off-by: Moni Shoua <monis at voltaire.com>
+--
+
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
+index ab97f92..fb1714f 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib.h
++++ b/drivers/infiniband/ulp/ipoib/ipoib.h
+@@ -92,6 +92,7 @@ enum {
+ 	IPOIB_FLAG_ADMIN_CM	  = 9,
+ 	IPOIB_FLAG_UMCAST	  = 10,
+ 	IPOIB_FLAG_CSUM		  = 11,
++	IPOIB_MCAST_RUN_GC	  = 12,
+ 
+ 	IPOIB_MAX_BACKOFF_SECONDS = 16,
+ 
+@@ -132,6 +133,7 @@ struct ipoib_mcast {
+ 	struct list_head  list;
+ 
+ 	unsigned long created;
++	unsigned long used;
+ 	unsigned long backoff;
+ 
+ 	unsigned long flags;
+@@ -283,7 +285,8 @@ struct ipoib_dev_priv {
+ 	struct rb_root multicast_tree;
+ 
+ 	struct delayed_work pkey_poll_task;
+-	struct delayed_work mcast_task;
++	struct delayed_work mcast_join_task;
++	struct delayed_work mcast_leave_task;
+ 	struct work_struct carrier_on_task;
+ 	struct work_struct flush_light;
+ 	struct work_struct flush_normal;
+@@ -411,6 +414,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
+ 
+ extern struct workqueue_struct *ipoib_workqueue;
+ 
++extern int ipoib_mc_sendonly_timeout;
++
+ /* functions */
+ 
+ int ipoib_poll(struct napi_struct *napi, int budget);
+@@ -453,6 +458,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+ void ipoib_dev_cleanup(struct net_device *dev);
+ 
+ void ipoib_mcast_join_task(struct work_struct *work);
++void ipoib_mcast_leave_task(struct work_struct *work);
+ void ipoib_mcast_carrier_on_task(struct work_struct *work);
+ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
+ 
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
+index 7a07a72..563370e 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
+@@ -67,6 +67,11 @@ module_param_named(debug_level, ipoib_debug_level, int, 0644);
+ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+ #endif
+ 
++int ipoib_mc_sendonly_timeout; /* seconds; values <= 0 turn send-only group expiry off */
++
++module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int, 0644);
++MODULE_PARM_DESC(mc_sendonly_timeout, "Seconds of inactivity before a send-only multicast group is left (<= 0 disables)");
++
+ struct ipoib_path_iter {
+ 	struct net_device *dev;
+ 	struct ipoib_path  path;
+@@ -1020,7 +1025,8 @@ static void ipoib_setup(struct net_device *dev)
+ 	INIT_LIST_HEAD(&priv->multicast_list);
+ 
+ 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+-	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
++	INIT_DELAYED_WORK(&priv->mcast_join_task,   ipoib_mcast_join_task);
++	INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task);
+ 	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+ 	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+ 	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+index 3871ac6..87928c1 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+@@ -117,6 +117,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
+ 
+ 	mcast->dev = dev;
+ 	mcast->created = jiffies;
++	mcast->used = jiffies;
+ 	mcast->backoff = 1;
+ 
+ 	INIT_LIST_HEAD(&mcast->list);
+@@ -403,7 +404,7 @@ static int ipoib_mcast_join_complete(int status,
+ 		mutex_lock(&mcast_mutex);
+ 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ 			queue_delayed_work(ipoib_workqueue,
+-					   &priv->mcast_task, 0);
++					   &priv->mcast_join_task, 0);
+ 		mutex_unlock(&mcast_mutex);
+ 
+ 		/*
+@@ -436,7 +437,7 @@ static int ipoib_mcast_join_complete(int status,
+ 	mutex_lock(&mcast_mutex);
+ 	spin_lock_irq(&priv->lock);
+ 	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task,
+ 				   mcast->backoff * HZ);
+ 	spin_unlock_irq(&priv->lock);
+ 	mutex_unlock(&mcast_mutex);
+@@ -505,7 +506,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+ 		mutex_lock(&mcast_mutex);
+ 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ 			queue_delayed_work(ipoib_workqueue,
+-					   &priv->mcast_task,
++					   &priv->mcast_join_task,
+ 					   mcast->backoff * HZ);
+ 		mutex_unlock(&mcast_mutex);
+ 	}
+@@ -514,7 +515,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+ void ipoib_mcast_join_task(struct work_struct *work)
+ {
+ 	struct ipoib_dev_priv *priv =
+-		container_of(work, struct ipoib_dev_priv, mcast_task.work);
++		container_of(work, struct ipoib_dev_priv, mcast_join_task.work);
+ 	struct net_device *dev = priv->dev;
+ 
+ 	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+@@ -546,7 +547,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
+ 			mutex_lock(&mcast_mutex);
+ 			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ 				queue_delayed_work(ipoib_workqueue,
+-						   &priv->mcast_task, HZ);
++						   &priv->mcast_join_task, HZ);
+ 			mutex_unlock(&mcast_mutex);
+ 			return;
+ 		}
+@@ -610,7 +611,9 @@ int ipoib_mcast_start_thread(struct net_device *dev)
+ 
+ 	mutex_lock(&mcast_mutex);
+ 	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+-		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0);
++	if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags))
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 0);
+ 	mutex_unlock(&mcast_mutex);
+ 
+ 	return 0;
+@@ -624,7 +627,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+ 
+ 	mutex_lock(&mcast_mutex);
+ 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+-	cancel_delayed_work(&priv->mcast_task);
++	clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags);
++	cancel_delayed_work(&priv->mcast_join_task);
++	cancel_delayed_work(&priv->mcast_leave_task);
+ 	mutex_unlock(&mcast_mutex);
+ 
+ 	if (flush)
+@@ -727,7 +732,7 @@ out:
+ 				list_add_tail(&neigh->list, &mcast->neigh_list);
+ 			}
+ 		}
+-
++		mcast->used = jiffies;
+ 		spin_unlock_irqrestore(&priv->lock, flags);
+ 		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+ 		return;
+@@ -888,6 +893,35 @@ void ipoib_mcast_restart_task(struct work_struct *work)
+ 		ipoib_mcast_start_thread(dev);
+ }
+ 
++void ipoib_mcast_leave_task(struct work_struct *work)
++{
++	struct ipoib_dev_priv *priv =
++		container_of(work, struct ipoib_dev_priv, mcast_leave_task.work);
++	struct ipoib_mcast *mcast, *tmcast;
++	LIST_HEAD(remove_list);
++
++	if (test_bit(IPOIB_MCAST_RUN_GC, &priv->flags) && /* GC not stopped */
++	    ipoib_mc_sendonly_timeout > 0) { /* <= 0: never expire groups */
++		spin_lock_irq(&priv->lock); /* send path uses mcast entries under this lock */
++		list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
++			if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
++			    time_before(mcast->used, jiffies - ipoib_mc_sendonly_timeout * HZ)) {
++				rb_erase(&mcast->rb_node, &priv->multicast_tree);
++				list_move_tail(&mcast->list, &remove_list);
++			}
++		}
++		spin_unlock_irq(&priv->lock);
++		list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
++			ipoib_mcast_leave(priv->dev, mcast); /* unlinked above; done unlocked */
++			ipoib_mcast_free(mcast);
++		}
++	}
++	mutex_lock(&mcast_mutex); /* stop_thread clears the flag and cancels under this */
++	if (test_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) /* don't re-arm after a stop */
++		queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 60 * HZ);
++	mutex_unlock(&mcast_mutex);
++}
++
+ #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+ 
+ struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)



More information about the ewg mailing list