[openib-general] [PATCH] Fix MAD completion handling

Roland Dreier roland at topspin.com
Tue Sep 28 14:50:57 PDT 2004


While looking over Sean's changes, I noticed what look like a few bugs
in the mad thread usage.  It didn't seem like there was any way for
the MAD thread to stop, and I think there are a few race conditions
that could lead to lost wakeups.  This patch tries to fix both of
these problems.

I didn't test this because I didn't feel like messing with the
Makefile to get it to build in my environment.  (It would be good to
switch to a standard kbuild Makefile so things like cross-compiling
and separate object directories work)

Thanks,
  Roland

Index: infiniband/access/ib_mad_priv.h
===================================================================
--- infiniband/access/ib_mad_priv.h	(revision 899)
+++ infiniband/access/ib_mad_priv.h	(working copy)
@@ -131,11 +131,6 @@
 	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
 };
 
-struct ib_mad_thread_private {
-	wait_queue_head_t	wait;
-	atomic_t		completion_event;
-};
-
 struct ib_mad_port_private {
 	struct list_head port_list;
 	struct ib_device *device;
@@ -159,7 +154,7 @@
 	u32 recv_wr_index[IB_MAD_QPS_CORE];
 
 	struct task_struct *mad_thread;
-	struct ib_mad_thread_private mad_thread_private;
+	int thread_wake;
 };
 
 #endif	/* __IB_MAD_PRIV_H__ */
Index: infiniband/access/ib_mad.c
===================================================================
--- infiniband/access/ib_mad.c	(revision 899)
+++ infiniband/access/ib_mad.c	(working copy)
@@ -892,6 +892,8 @@
 	struct ib_wc wc;
 	int err_status = 0;
 	
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
 	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
 		printk(KERN_DEBUG "Completion opcode 0x%x WRID 0x%Lx\n", wc.opcode, wc.wr_id);
 		if (wc.status != IB_WC_SUCCESS) {
@@ -928,11 +930,8 @@
 		}
 	}
 
-	if (err_status) {
+	if (err_status)
 		ib_mad_port_restart(port_priv);
-	} else {
-		ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
-	}
 }
 
 /*
@@ -941,23 +940,22 @@
 static int ib_mad_thread(void *param)
 {
 	struct ib_mad_port_private *port_priv = param;
-	struct ib_mad_thread_private *mad_thread_priv = &port_priv->mad_thread_private;
-	int ret;
 
-	while (1) {
-		while (!signal_pending(current)) {
-			ret = wait_event_interruptible(mad_thread_priv->wait,
-						       atomic_read(&mad_thread_priv->completion_event) > 0);
-			atomic_set(&mad_thread_priv->completion_event, 0);
-			if (ret) {
-				printk(KERN_ERR "ib_mad thread exiting\n");
-				return 0;
-			}
+	__set_current_state(TASK_RUNNING);
 
-			ib_mad_completion_handler(port_priv);
+	do {
+		port_priv->thread_wake = 0;
+		wmb();
 
-		}
-	}
+		ib_mad_completion_handler(port_priv);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!port_priv->thread_wake)
+			schedule();
+		__set_current_state(TASK_RUNNING);
+	} while (!kthread_should_stop());
+
+	return 0;
 }
 
 /*
@@ -965,11 +963,8 @@
  */
 static int ib_mad_thread_init(struct ib_mad_port_private *port_priv)
 {
-	struct ib_mad_thread_private *mad_thread_priv = &port_priv->mad_thread_private;
+	port_priv->thread_wake = 0;
 
-	atomic_set(&mad_thread_priv->completion_event, 0);
-	init_waitqueue_head(&mad_thread_priv->wait);
-
 	port_priv->mad_thread = kthread_create(ib_mad_thread,
 					       port_priv,
 					       "ib_mad(%6s-%-2d)",
@@ -978,27 +973,18 @@
 	if (IS_ERR(port_priv->mad_thread)) {
 		printk(KERN_ERR "Couldn't start ib_mad thread for %s port %d\n",
 		       port_priv->device->name, port_priv->port_num);
-		return 1;
+		return PTR_ERR(port_priv->mad_thread);
 	}	
-	wake_up_process(port_priv->mad_thread);
 	return 0;
 }
 
-/*
- * Stop the IB MAD thread
- */
-static void ib_mad_thread_stop(struct ib_mad_port_private *port_priv)
-{
-	kthread_stop(port_priv->mad_thread);	/* !!! */
-}
-
 static void ib_mad_thread_completion_handler(struct ib_cq *cq)
 {
 	struct ib_mad_port_private *port_priv = cq->cq_context;
-	struct ib_mad_thread_private *mad_thread_priv = &port_priv->mad_thread_private;
 
-	atomic_inc(&mad_thread_priv->completion_event);
-	wake_up_interruptible(&mad_thread_priv->wait);
+	port_priv->thread_wake = 1;
+	wmb();
+	wake_up_process(port_priv->mad_thread);
 }
 
 static int ib_mad_post_receive_mad(struct ib_mad_port_private *port_priv,
@@ -1527,7 +1513,7 @@
 	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
 
 	ib_mad_port_stop(port_priv);
-	ib_mad_thread_stop(port_priv);
+	kthread_stop(port_priv->mad_thread);
 	ib_destroy_qp(port_priv->qp[1]);
 	ib_destroy_qp(port_priv->qp[0]);
 	ib_dereg_mr(port_priv->mr);



More information about the general mailing list