[openib-general] [PATCH 1/2] multicast: notify users on membership errors
Sean Hefty
sean.hefty at intel.com
Thu Jun 8 21:38:07 PDT 2006
Modify the ib_multicast module to detect events that require clients to rejoin
multicast groups. Add tracking of clients that are members of any group, and
provide notification to those clients when such an event occurs.
This patch tracks all active members of a group. When an event occurs that
requires clients to rejoin a multicast group, the active members are moved
into an error state and the clients are notified with a network reset error.
The group is then reset so that subsequent join requests generate new
requests to the SA.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
---
Hal, can you apply these patches and see if they fix the issues that you
are experiencing? They should eliminate any races caused by ipoib leaving
and then quickly re-joining a group as a result of an event.
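
As an aside, here is a rough sketch of what handling the new -ENETRESET
notification could look like on the client side. This is illustrative only and
not part of the patch: it assumes the callback signature and the context
pointer from the ib_multicast.h interface in this series, and my_mcast_ctx /
rejoin_work are made-up client pieces that would re-issue the join from
process context.

#include <linux/workqueue.h>
#include <rdma/ib_multicast.h>

/* Hypothetical per-group client state; not part of this patch. */
struct my_mcast_ctx {
	struct ib_multicast *multicast;	/* handle returned by the join call */
	struct work_struct rejoin_work;	/* client helper that re-issues the join */
};

static int my_mcast_callback(int status, struct ib_multicast *multicast)
{
	struct my_mcast_ctx *ctx = multicast->context;

	if (status == -ENETRESET) {
		/*
		 * Membership was lost due to a port error, LID change,
		 * SM change, or client reregister event.  Schedule a
		 * rejoin from process context; returning non-zero lets
		 * the core free this ib_multicast for us (see
		 * process_group_error() below).
		 */
		ctx->multicast = NULL;
		schedule_work(&ctx->rejoin_work);
		return 1;
	}

	if (status) {
		/* Normal join failure; error handling omitted. */
		return 0;
	}

	/* Join completed; multicast->rec holds the joined MCMemberRecord. */
	return 0;
}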
Index: multicast.c
===================================================================
--- multicast.c (revision 7805)
+++ multicast.c (working copy)
@@ -61,6 +61,7 @@ static struct ib_client mcast_client = {
.remove = mcast_remove_one
};
+static struct ib_event_handler event_handler;
static struct workqueue_struct *mcast_wq;
struct mcast_device;
@@ -86,6 +87,7 @@ enum mcast_state {
MCAST_JOINING,
MCAST_MEMBER,
MCAST_BUSY,
+ MCAST_ERROR
};
struct mcast_member;
@@ -97,6 +99,7 @@ struct mcast_group {
spinlock_t lock;
struct work_struct work;
struct list_head pending_list;
+ struct list_head active_list;
struct mcast_member *last_join;
int members[3];
atomic_t refcount;
@@ -338,6 +341,8 @@ static void join_group(struct mcast_grou
group->rec.join_state |= join_state;
member->multicast.rec = group->rec;
member->multicast.rec.join_state = join_state;
+ list_del(&member->list);
+ list_add(&member->list, &group->active_list);
}
static int fail_join(struct mcast_group *group, struct mcast_member *member,
@@ -349,6 +354,34 @@ static int fail_join(struct mcast_group
return member->multicast.callback(status, &member->multicast);
}
+static void process_group_error(struct mcast_group *group)
+{
+ struct mcast_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct mcast_member, list);
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ adjust_membership(group, member->multicast.rec.join_state, -1);
+ member->state = MCAST_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->multicast.callback(-ENETRESET,
+ &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->rec.join_state = 0;
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
static void mcast_work_handler(void *data)
{
struct mcast_group *group = data;
@@ -359,6 +392,12 @@ static void mcast_work_handler(void *dat
retest:
spin_lock_irq(&group->lock);
+ if (group->state == MCAST_ERROR) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
while (!list_empty(&group->pending_list)) {
member = list_entry(group->pending_list.next,
struct mcast_member, list);
@@ -371,8 +410,8 @@ retest:
multicast->comp_mask);
if (!status)
join_group(group, member, join_state);
-
- list_del_init(&member->list);
+ else
+ list_del_init(&member->list);
spin_unlock_irq(&group->lock);
ret = multicast->callback(status, multicast);
} else {
@@ -467,6 +506,7 @@ static struct mcast_group *acquire_group
group->port = port;
group->rec.mgid = *mgid;
INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
INIT_WORK(&group->work, mcast_work_handler, group);
spin_lock_init(&group->lock);
@@ -551,16 +591,10 @@ void ib_free_multicast(struct ib_multica
group = member->group;
spin_lock_irq(&group->lock);
- switch (member->state) {
- case MCAST_MEMBER:
+ if (member->state == MCAST_MEMBER)
adjust_membership(group, multicast->rec.join_state, -1);
- break;
- case MCAST_JOINING:
- list_del_init(&member->list);
- break;
- default:
- break;
- }
+
+ list_del_init(&member->list);
if (group->state == MCAST_IDLE) {
group->state = MCAST_BUSY;
@@ -578,6 +612,48 @@ void ib_free_multicast(struct ib_multica
}
EXPORT_SYMBOL(ib_free_multicast);
+static void mcast_groups_lost(struct mcast_port *port)
+{
+ struct mcast_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct mcast_group, node);
+ spin_lock(&group->lock);
+ if (group->state == MCAST_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ group->state = MCAST_ERROR;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct mcast_device *dev;
+
+ dev = ib_get_client_data(event->device, &mcast_client);
+ if (!dev)
+ return;
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ mcast_groups_lost(&dev->port[event->element.port_num -
+ dev->start_port]);
+ break;
+ default:
+ break;
+ }
+}
+
static void mcast_add_one(struct ib_device *device)
{
struct mcast_device *dev;
@@ -611,6 +687,9 @@ static void mcast_add_one(struct ib_devi
dev->device = device;
ib_set_client_data(device, &mcast_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&event_handler, device, mcast_event_handler);
+ ib_register_event_handler(&event_handler);
}
static void mcast_remove_one(struct ib_device *device)
@@ -623,6 +702,7 @@ static void mcast_remove_one(struct ib_d
if (!dev)
return;
+ ib_unregister_event_handler(&event_handler);
flush_workqueue(mcast_wq);
for (i = 0; i < dev->end_port - dev->start_port; i++) {