[openib-general] [PATCH] osm_vendor_ibumad: termination crash fix

Sasha Khapyorsky sashak at voltaire.com
Mon Feb 19 13:46:30 PST 2007


When OpenSM is terminated, the umad_receiver thread keeps running even
after the vendor structures are destroyed and freed; this causes random
(but easily reproducible) crashes. The reason is that osm_vendor_delete()
does not take care of thread termination. This patch adds receiver thread
cancellation (using pthread_cancel() and pthread_join()) and makes sure
that all mutexes are unlocked upon termination. It also consolidates the
termination code into a new osm_vendor_close_port() function.

Signed-off-by: Sasha Khapyorsky <sashak at voltaire.com>
---
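Note for reviewers: the shutdown path relies on the standard POSIX
cancellation idiom. Below is a minimal standalone sketch of that idiom
(the unlock_mutex()/cb_mutex names mirror the patch; worker(), main()
and the sleep() calls are illustrative stand-ins only, not part of the
patch): a cleanup handler guarantees the mutex cannot be left locked if
the thread is cancelled at a cancellation point inside the locked
region, and the terminator uses pthread_cancel()/pthread_join() so the
mutex is provably unlocked before it is destroyed.

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t cb_mutex = PTHREAD_MUTEX_INITIALIZER;

static void unlock_mutex(void *arg)
{
	pthread_mutex_unlock(arg);
}

static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&cb_mutex);
		pthread_cleanup_push(unlock_mutex, &cb_mutex);
		/* sleep() is a cancellation point standing in for the
		 * receive/callback work; if the thread is cancelled
		 * here, the cleanup handler still releases cb_mutex. */
		sleep(1);
		pthread_cleanup_pop(1);	/* pop the handler and run it */
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;

	if (pthread_create(&tid, NULL, worker, NULL) != 0)
		return 1;
	sleep(2);
	pthread_cancel(tid);		/* request cancellation */
	pthread_join(tid, NULL);	/* wait until the thread is gone */
	pthread_mutex_destroy(&cb_mutex);	/* now safe to destroy */
	return 0;
}
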
 osm/include/vendor/osm_vendor_ibumad.h |    6 +-
 osm/libvendor/osm_vendor_ibumad.c      |  157 +++++++++++++++-----------------
 osm/libvendor/osm_vendor_ibumad_sa.c   |    3 +-
 3 files changed, 77 insertions(+), 89 deletions(-)

diff --git a/osm/include/vendor/osm_vendor_ibumad.h b/osm/include/vendor/osm_vendor_ibumad.h
index 4cbd59f..f6e3d69 100644
--- a/osm/include/vendor/osm_vendor_ibumad.h
+++ b/osm/include/vendor/osm_vendor_ibumad.h
@@ -39,7 +39,6 @@
 
 #include <iba/ib_types.h>
 #include <complib/cl_qlist.h>
-#include <complib/cl_thread.h>
 #include <opensm/osm_base.h>
 #include <opensm/osm_log.h>
 
@@ -87,7 +86,6 @@ typedef struct _osm_ca_info
 	ib_net64_t		guid;
 	size_t			attr_size;
 	ib_ca_attr_t		*p_attr;
-
 } osm_ca_info_t;
 /*
 * FIELDS
@@ -170,8 +168,8 @@ typedef	struct _osm_vendor
 	vendor_match_tbl_t mtbl;
 	umad_ca_t umad_ca;
 	umad_port_t umad_port;
-	cl_spinlock_t cb_lock;
-	cl_spinlock_t match_tbl_lock;
+	pthread_mutex_t cb_mutex;
+	pthread_mutex_t match_tbl_mutex;
 	int umad_port_id;
 	void *receiver;
 	int issmfd;
diff --git a/osm/libvendor/osm_vendor_ibumad.c b/osm/libvendor/osm_vendor_ibumad.c
index 35f127a..7320738 100644
--- a/osm/libvendor/osm_vendor_ibumad.c
+++ b/osm/libvendor/osm_vendor_ibumad.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
@@ -58,13 +58,11 @@
 
 #include <unistd.h>
 #include <stdlib.h>
-#include <signal.h>
 #include <fcntl.h>
 #include <errno.h>
 
 #include <iba/ib_types.h>
 #include <complib/cl_qlist.h>
-#include <complib/cl_thread.h>
 #include <complib/cl_math.h>
 #include <complib/cl_debug.h>
 #include <opensm/osm_madw.h>
@@ -97,12 +95,13 @@ typedef struct _osm_umad_bind_info
 
 typedef struct _umad_receiver
 {
-	cl_event_t		signal;
-	cl_thread_t		receiver;
-	osm_vendor_t		*p_vend;
-	osm_log_t		*p_log;
+	pthread_t tid;
+	osm_vendor_t	*p_vend;
+	osm_log_t	*p_log;
 } umad_receiver_t;
 
+static void osm_vendor_close_port(osm_vendor_t* const p_vend);
+
 static void
 clear_madw(osm_vendor_t *p_vend)
 {
@@ -110,7 +109,7 @@ clear_madw(osm_vendor_t *p_vend)
 	ib_net64_t old_tid;
 
 	OSM_LOG_ENTER( p_vend->p_log, clear_madw );
-	cl_spinlock_acquire( &p_vend->match_tbl_lock );
+	pthread_mutex_lock(&p_vend->match_tbl_mutex);
 	for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
 		if (m->tid) {
 			old_m = m;
@@ -119,7 +118,7 @@ clear_madw(osm_vendor_t *p_vend)
         		osm_mad_pool_put(
 				((osm_umad_bind_info_t *)((osm_madw_t *)m->v)->h_bind)->p_mad_pool,
 				m->v);
-			cl_spinlock_release( &p_vend->match_tbl_lock );
+			pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 			osm_log(p_vend->p_log, OSM_LOG_ERROR,
 				"clear_madw: ERR 5401: "
 				"evicting entry %p (tid was 0x%"PRIx64")\n",
@@ -127,7 +126,7 @@ clear_madw(osm_vendor_t *p_vend)
 			goto Exit;
 		}
 	}
-	cl_spinlock_release( &p_vend->match_tbl_lock );
+	pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 
 Exit:
 	OSM_LOG_EXIT( p_vend->p_log );
@@ -147,18 +146,18 @@ get_madw(osm_vendor_t *p_vend, ib_net64_t *tid)
 	if (mtid == 0)
 		return 0;
 
-	cl_spinlock_acquire( &p_vend->match_tbl_lock );
+	pthread_mutex_lock(&p_vend->match_tbl_mutex);
 	for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
 		if (m->tid == mtid) {
 			m->tid = 0;
 			*tid = mtid;
 			res = m->v;
-			cl_spinlock_release( &p_vend->match_tbl_lock );
+			pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 			return res;
 		}
 	}
 
-	cl_spinlock_release( &p_vend->match_tbl_lock );
+	pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 	return 0;
 }
 
@@ -171,13 +170,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, ib_net64_t *tid)
 	ib_net64_t old_tid;
 	uint32_t oldest = ~0;
 
-	cl_spinlock_acquire( &p_vend->match_tbl_lock );
+	pthread_mutex_lock(&p_vend->match_tbl_mutex);
 	for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
 		if (m->tid == 0) {
 			m->tid = *tid;
 			m->v = p_madw;
 			m->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version);
-			cl_spinlock_release( &p_vend->match_tbl_lock );
+			pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 			return;
 		}
 		if (oldest > m->version) {
@@ -191,13 +190,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, ib_net64_t *tid)
 	p_req_madw = old_lru->v;
 	p_bind = p_req_madw->h_bind;
 	p_req_madw->status = IB_CANCELED;
-	cl_spinlock_acquire( &p_vend->cb_lock );
+	pthread_mutex_lock(&p_vend->cb_mutex);
 	(*p_bind->send_err_callback)(p_bind->client_context, old_lru->v);
-	cl_spinlock_release( &p_vend->cb_lock );
+	pthread_mutex_unlock(&p_vend->cb_mutex);
 	lru->tid = *tid;
 	lru->v = p_madw;
 	lru->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version);
-	cl_spinlock_release( &p_vend->match_tbl_lock );
+	pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 	osm_log(p_vend->p_log, OSM_LOG_ERROR,
 		"put_madw: ERR 5402: "
 		"evicting entry %p (tid was 0x%"PRIx64")\n", old_lru, old_tid);
@@ -237,7 +236,12 @@ swap_mad_bufs(osm_madw_t *p_madw, void *umad)
 	return old;
 }
 
-void
+static void unlock_mutex(void *arg)
+{
+	pthread_mutex_unlock(arg);
+}
+
+void *
 umad_receiver(void *p_ptr)
 {
 	umad_receiver_t* const p_ur = (umad_receiver_t *)p_ptr;
@@ -356,9 +360,10 @@ umad_receiver(void *p_ptr)
 			} else {
 				p_req_madw->status = IB_TIMEOUT;
 				/* cb frees req_madw */
-				cl_spinlock_acquire( &p_vend->cb_lock );
+				pthread_mutex_lock(&p_vend->cb_mutex);
+				pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex);
 				(*p_bind->send_err_callback)(p_bind->client_context, p_req_madw);
-				cl_spinlock_release( &p_vend->cb_lock );
+				pthread_cleanup_pop(1);
 			}
 
 			osm_mad_pool_put(p_bind->p_mad_pool, p_madw);
@@ -398,47 +403,37 @@ umad_receiver(void *p_ptr)
 #endif
 
 		/* call the CB */
-		cl_spinlock_acquire( &p_vend->cb_lock );
+		pthread_mutex_lock(&p_vend->cb_mutex);
+		pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex);
 		(*p_bind->mad_recv_callback)(p_madw, p_bind->client_context, p_req_madw);
-		cl_spinlock_release( &p_vend->cb_lock );
+		pthread_cleanup_pop(1);
 	}
 
 	OSM_LOG_EXIT( p_vend->p_log );
-	return;
+	return NULL;
 }
 
-static int
-umad_receiver_init(osm_vendor_t *p_vend)
+static int umad_receiver_start(osm_vendor_t *p_vend)
 {
 	umad_receiver_t *p_ur = p_vend->receiver;
-	int r = -1;
-
-	OSM_LOG_ENTER( p_vend->p_log, umad_receiver_init );
 
 	p_ur->p_vend = p_vend;
 	p_ur->p_log = p_vend->p_log;
 
-	cl_event_construct(&p_ur->signal);
-	cl_thread_construct(&p_ur->receiver);
-
-  	if (cl_event_init(&p_ur->signal, FALSE))
-		goto Exit;
-
-	/*
-	 * Initialize the thread after all other dependent objects
-	 * have been initialized.
-	 */
-	if (cl_thread_init( &p_ur->receiver, umad_receiver, p_ur,
-			    "umad receiver" ))
-	    goto Exit;
-
-	r = 0;	/* success */
+	if (pthread_create(&p_ur->tid, NULL, umad_receiver, p_ur) != 0)
+		return -1;
 
-Exit:
-	OSM_LOG_EXIT( p_vend->p_log );
-	return r;
+	return 0;
 }
 
+static void umad_receiver_stop(umad_receiver_t *p_ur)
+{
+	pthread_cancel(p_ur->tid);
+	pthread_join(p_ur->tid, NULL);
+	p_ur->tid = 0;
+	p_ur->p_vend = NULL;
+	p_ur->p_log = NULL;
+}
 /**********************************************************************
  **********************************************************************/
 ib_api_status_t
@@ -454,23 +449,11 @@ osm_vendor_init(
 	p_vend->p_log = p_log;
 	p_vend->timeout = timeout;
 	p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT;
-	cl_spinlock_construct( &p_vend->cb_lock );
-	cl_spinlock_construct( &p_vend->match_tbl_lock );
+	pthread_mutex_init(&p_vend->cb_mutex, NULL);
+	pthread_mutex_init(&p_vend->match_tbl_mutex, NULL);
 	p_vend->umad_port_id = -1;
 	p_vend->issmfd = -1;
 
-	if ((r = cl_spinlock_init( &p_vend->cb_lock ))) {
-		osm_log(p_vend->p_log, OSM_LOG_ERROR,
-			"osm_vendor_init: ERR 5435: Error initializing cb spinlock\n");
-		goto Exit;
-	}
-
-	if ((r = cl_spinlock_init( &p_vend->match_tbl_lock ))) {
-		osm_log(p_vend->p_log, OSM_LOG_ERROR,
-			"osm_vendor_init: ERR 5434: Error initializing match tbl spinlock\n");
-		goto Exit;
-	}
-	
 	/*
 	 * Open our instance of UMAD.
 	 */
@@ -541,29 +524,14 @@ void
 osm_vendor_delete(
   IN osm_vendor_t** const pp_vend )
 {
-	umad_receiver_t *p_ur;
-	int agent_id;
-
-	if ((*pp_vend)->umad_port_id >= 0) {
-		/* unregister UMAD agents */
-		for (agent_id = 0; agent_id < UMAD_CA_MAX_AGENTS; agent_id++)
-			if ( (*pp_vend)->agents[agent_id] )
-				umad_unregister((*pp_vend)->umad_port_id,
-						agent_id );
-		umad_close_port((*pp_vend)->umad_port_id);
-		(*pp_vend)->umad_port_id = -1;
-	}
+	osm_vendor_close_port(*pp_vend);
 
 	clear_madw( *pp_vend );
 	/* make sure all ports are closed */
 	umad_done();
 
-	/* umad receiver thread ? */
-	p_ur = (*pp_vend)->receiver;
-	if (p_ur)
-		cl_event_destroy( &p_ur->signal );
-	cl_spinlock_destroy( &(*pp_vend)->cb_lock );
-	cl_spinlock_destroy( &(*pp_vend)->match_tbl_lock );
+	pthread_mutex_destroy(&(*pp_vend)->cb_mutex);
+	pthread_mutex_destroy(&(*pp_vend)->match_tbl_mutex);
 	free( *pp_vend );
 	*pp_vend = NULL;
 }
@@ -780,7 +748,7 @@ osm_vendor_open_port(
 		p_vend->umad_port_id = umad_port_id = -1;
 		goto Exit;
 	}
-	if (umad_receiver_init(p_vend) != 0) {
+	if (umad_receiver_start(p_vend) != 0) {
 		osm_log( p_vend->p_log, OSM_LOG_ERROR,
 			 "osm_vendor_open_port: ERR 5420: "
 			 "umad_receiver_init failed\n" );
@@ -793,6 +761,27 @@ Exit:
 	return umad_port_id;
 }
 
+static void osm_vendor_close_port(osm_vendor_t* const p_vend)
+{
+	umad_receiver_t *p_ur;
+	int i;
+
+	p_ur = p_vend->receiver;
+	p_vend->receiver = NULL;
+	if (p_ur) {
+		umad_receiver_stop(p_ur);
+		free(p_ur);
+	}
+
+	if (p_vend->umad_port_id >= 0) {
+		for (i = 0; i < UMAD_CA_MAX_AGENTS; i++)
+			if (p_vend->agents[i])
+				umad_unregister(p_vend->umad_port_id, i);
+		umad_close_port(p_vend->umad_port_id);
+		p_vend->umad_port_id = -1;
+	}
+}
+
 static int set_bit(int nr, void *method_mask)
 {
 	int mask, retval;
@@ -985,10 +974,10 @@ osm_vendor_unbind(
 
 	OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unbind );
 
-	cl_spinlock_acquire( &p_vend->cb_lock );
+	pthread_mutex_lock(&p_vend->cb_mutex);
 	p_bind->mad_recv_callback = __osm_vendor_recv_dummy_cb;
 	p_bind->send_err_callback = __osm_vendor_send_err_dummy_cb;
-	cl_spinlock_release( &p_vend->cb_lock );
+	pthread_mutex_unlock(&p_vend->cb_mutex);
 
 	OSM_LOG_EXIT( p_vend->p_log);
 }
@@ -1154,9 +1143,9 @@ Resp:
 			"Send p_madw = %p of size %d failed %d (%m)\n", 
 			p_madw, sent_mad_size, ret);
 		p_madw->status = IB_ERROR;
-		cl_spinlock_acquire( &p_vend->cb_lock );
+		pthread_mutex_lock(&p_vend->cb_mutex);
 		(*p_bind->send_err_callback)(p_bind->client_context, p_madw);	/* cb frees madw */
-		cl_spinlock_release( &p_vend->cb_lock );
+		pthread_mutex_unlock(&p_vend->cb_mutex);
 		goto Exit;
 	}
 
diff --git a/osm/libvendor/osm_vendor_ibumad_sa.c b/osm/libvendor/osm_vendor_ibumad_sa.c
index e3978ef..a110e81 100644
--- a/osm/libvendor/osm_vendor_ibumad_sa.c
+++ b/osm/libvendor/osm_vendor_ibumad_sa.c
@@ -39,9 +39,10 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <vendor/osm_vendor_api.h>
 #include <vendor/osm_vendor_sa_api.h>
-#include <sys/time.h>
+#include <complib/cl_event.h>
 
 #define MAX_PORTS 64
 
-- 
1.5.0.1.40.gb40d