[openib-general] [PATCH] osm_vendor_ibumad: termination crash fix
Sasha Khapyorsky
sashak at voltaire.com
Mon Feb 19 13:46:30 PST 2007
When OpenSM is terminated, the umad_receiver thread keeps running even after
the structures are destroyed and freed, which causes random (but easily
reproducible) crashes. The reason is that osm_vendor_delete() does not take
care of thread termination. This patch adds cancellation of the receiver
thread (using pthread_cancel() and pthread_join()) and ensures that all
mutexes are left unlocked upon termination. There is also a minor
consolidation of the termination code into the new osm_vendor_close_port()
function.
Signed-off-by: Sasha Khapyorsky <sashak at voltaire.com>
---
osm/include/vendor/osm_vendor_ibumad.h | 6 +-
osm/libvendor/osm_vendor_ibumad.c | 157 +++++++++++++++-----------------
osm/libvendor/osm_vendor_ibumad_sa.c | 3 +-
3 files changed, 77 insertions(+), 89 deletions(-)
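For readers unfamiliar with the shutdown pattern described above, here is a
minimal self-contained sketch of pthread_cancel()/pthread_join() combined
with a pthread_cleanup_push() handler around a mutex-protected callback. It
is not code from this patch; names such as worker and cb_mutex are purely
illustrative.

/* Sketch: a cancellable worker thread whose mutex is protected by a
 * cancellation cleanup handler, stopped with pthread_cancel()/pthread_join(),
 * mirroring the approach taken here for the umad receiver thread. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t cb_mutex = PTHREAD_MUTEX_INITIALIZER;

static void unlock_mutex(void *arg)
{
        /* Cleanup handler: runs if the thread is cancelled while the
         * mutex is held, so termination never leaves it locked. */
        pthread_mutex_unlock(arg);
}

static void *worker(void *arg)
{
        for (;;) {
                /* A blocking receive such as umad_recv() would normally sit
                 * here; sleep() stands in for it and is a cancellation
                 * point, so pthread_cancel() can take effect. */
                sleep(1);

                pthread_mutex_lock(&cb_mutex);
                pthread_cleanup_push(unlock_mutex, &cb_mutex);
                /* ... invoke a callback under the mutex ... */
                pthread_cleanup_pop(1); /* 1 => also unlock on normal path */
        }
        return NULL;    /* never reached; thread exits via cancellation */
}

int main(void)
{
        pthread_t tid;

        if (pthread_create(&tid, NULL, worker, NULL))
                return 1;

        sleep(2);

        /* Stop sequence analogous to umad_receiver_stop(): cancel the
         * thread, then join it so it is fully gone before the surrounding
         * structures are freed. */
        pthread_cancel(tid);
        pthread_join(tid, NULL);

        pthread_mutex_destroy(&cb_mutex);
        printf("receiver thread stopped cleanly\n");
        return 0;
}

The cleanup handler guarantees the callback mutex is released even if the
cancel arrives while the callback is running, which is exactly the "keep all
mutexes unlocked upon termination" concern addressed by the patch below.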
diff --git a/osm/include/vendor/osm_vendor_ibumad.h b/osm/include/vendor/osm_vendor_ibumad.h
index 4cbd59f..f6e3d69 100644
--- a/osm/include/vendor/osm_vendor_ibumad.h
+++ b/osm/include/vendor/osm_vendor_ibumad.h
@@ -39,7 +39,6 @@
#include <iba/ib_types.h>
#include <complib/cl_qlist.h>
-#include <complib/cl_thread.h>
#include <opensm/osm_base.h>
#include <opensm/osm_log.h>
@@ -87,7 +86,6 @@ typedef struct _osm_ca_info
ib_net64_t guid;
size_t attr_size;
ib_ca_attr_t *p_attr;
-
} osm_ca_info_t;
/*
* FIELDS
@@ -170,8 +168,8 @@ typedef struct _osm_vendor
vendor_match_tbl_t mtbl;
umad_ca_t umad_ca;
umad_port_t umad_port;
- cl_spinlock_t cb_lock;
- cl_spinlock_t match_tbl_lock;
+ pthread_mutex_t cb_mutex;
+ pthread_mutex_t match_tbl_mutex;
int umad_port_id;
void *receiver;
int issmfd;
diff --git a/osm/libvendor/osm_vendor_ibumad.c b/osm/libvendor/osm_vendor_ibumad.c
index 35f127a..7320738 100644
--- a/osm/libvendor/osm_vendor_ibumad.c
+++ b/osm/libvendor/osm_vendor_ibumad.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
* Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
*
@@ -58,13 +58,11 @@
#include <unistd.h>
#include <stdlib.h>
-#include <signal.h>
#include <fcntl.h>
#include <errno.h>
#include <iba/ib_types.h>
#include <complib/cl_qlist.h>
-#include <complib/cl_thread.h>
#include <complib/cl_math.h>
#include <complib/cl_debug.h>
#include <opensm/osm_madw.h>
@@ -97,12 +95,13 @@ typedef struct _osm_umad_bind_info
typedef struct _umad_receiver
{
- cl_event_t signal;
- cl_thread_t receiver;
- osm_vendor_t *p_vend;
- osm_log_t *p_log;
+ pthread_t tid;
+ osm_vendor_t *p_vend;
+ osm_log_t *p_log;
} umad_receiver_t;
+static void osm_vendor_close_port(osm_vendor_t* const p_vend);
+
static void
clear_madw(osm_vendor_t *p_vend)
{
@@ -110,7 +109,7 @@ clear_madw(osm_vendor_t *p_vend)
ib_net64_t old_tid;
OSM_LOG_ENTER( p_vend->p_log, clear_madw );
- cl_spinlock_acquire( &p_vend->match_tbl_lock );
+ pthread_mutex_lock(&p_vend->match_tbl_mutex);
for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
if (m->tid) {
old_m = m;
@@ -119,7 +118,7 @@ clear_madw(osm_vendor_t *p_vend)
osm_mad_pool_put(
((osm_umad_bind_info_t *)((osm_madw_t *)m->v)->h_bind)->p_mad_pool,
m->v);
- cl_spinlock_release( &p_vend->match_tbl_lock );
+ pthread_mutex_unlock(&p_vend->match_tbl_mutex);
osm_log(p_vend->p_log, OSM_LOG_ERROR,
"clear_madw: ERR 5401: "
"evicting entry %p (tid was 0x%"PRIx64")\n",
@@ -127,7 +126,7 @@ clear_madw(osm_vendor_t *p_vend)
goto Exit;
}
}
- cl_spinlock_release( &p_vend->match_tbl_lock );
+ pthread_mutex_unlock(&p_vend->match_tbl_mutex);
Exit:
OSM_LOG_EXIT( p_vend->p_log );
@@ -147,18 +146,18 @@ get_madw(osm_vendor_t *p_vend, ib_net64_t *tid)
if (mtid == 0)
return 0;
- cl_spinlock_acquire( &p_vend->match_tbl_lock );
+ pthread_mutex_lock(&p_vend->match_tbl_mutex);
for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
if (m->tid == mtid) {
m->tid = 0;
*tid = mtid;
res = m->v;
- cl_spinlock_release( &p_vend->match_tbl_lock );
+ pthread_mutex_unlock(&p_vend->match_tbl_mutex);
return res;
}
}
- cl_spinlock_release( &p_vend->match_tbl_lock );
+ pthread_mutex_unlock(&p_vend->match_tbl_mutex);
return 0;
}
@@ -171,13 +170,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, ib_net64_t *tid)
ib_net64_t old_tid;
uint32_t oldest = ~0;
- cl_spinlock_acquire( &p_vend->match_tbl_lock );
+ pthread_mutex_lock(&p_vend->match_tbl_mutex);
for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
if (m->tid == 0) {
m->tid = *tid;
m->v = p_madw;
m->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version);
- cl_spinlock_release( &p_vend->match_tbl_lock );
+ pthread_mutex_unlock(&p_vend->match_tbl_mutex);
return;
}
if (oldest > m->version) {
@@ -191,13 +190,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, ib_net64_t *tid)
p_req_madw = old_lru->v;
p_bind = p_req_madw->h_bind;
p_req_madw->status = IB_CANCELED;
- cl_spinlock_acquire( &p_vend->cb_lock );
+ pthread_mutex_lock(&p_vend->cb_mutex);
(*p_bind->send_err_callback)(p_bind->client_context, old_lru->v);
- cl_spinlock_release( &p_vend->cb_lock );
+ pthread_mutex_unlock(&p_vend->cb_mutex);
lru->tid = *tid;
lru->v = p_madw;
lru->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version);
- cl_spinlock_release( &p_vend->match_tbl_lock );
+ pthread_mutex_unlock(&p_vend->match_tbl_mutex);
osm_log(p_vend->p_log, OSM_LOG_ERROR,
"put_madw: ERR 5402: "
"evicting entry %p (tid was 0x%"PRIx64")\n", old_lru, old_tid);
@@ -237,7 +236,12 @@ swap_mad_bufs(osm_madw_t *p_madw, void *umad)
return old;
}
-void
+static void unlock_mutex(void *arg)
+{
+ pthread_mutex_unlock(arg);
+}
+
+void *
umad_receiver(void *p_ptr)
{
umad_receiver_t* const p_ur = (umad_receiver_t *)p_ptr;
@@ -356,9 +360,10 @@ umad_receiver(void *p_ptr)
} else {
p_req_madw->status = IB_TIMEOUT;
/* cb frees req_madw */
- cl_spinlock_acquire( &p_vend->cb_lock );
+ pthread_mutex_lock(&p_vend->cb_mutex);
+ pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex);
(*p_bind->send_err_callback)(p_bind->client_context, p_req_madw);
- cl_spinlock_release( &p_vend->cb_lock );
+ pthread_cleanup_pop(1);
}
osm_mad_pool_put(p_bind->p_mad_pool, p_madw);
@@ -398,47 +403,37 @@ umad_receiver(void *p_ptr)
#endif
/* call the CB */
- cl_spinlock_acquire( &p_vend->cb_lock );
+ pthread_mutex_lock(&p_vend->cb_mutex);
+ pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex);
(*p_bind->mad_recv_callback)(p_madw, p_bind->client_context, p_req_madw);
- cl_spinlock_release( &p_vend->cb_lock );
+ pthread_cleanup_pop(1);
}
OSM_LOG_EXIT( p_vend->p_log );
- return;
+ return NULL;
}
-static int
-umad_receiver_init(osm_vendor_t *p_vend)
+static int umad_receiver_start(osm_vendor_t *p_vend)
{
umad_receiver_t *p_ur = p_vend->receiver;
- int r = -1;
-
- OSM_LOG_ENTER( p_vend->p_log, umad_receiver_init );
p_ur->p_vend = p_vend;
p_ur->p_log = p_vend->p_log;
- cl_event_construct(&p_ur->signal);
- cl_thread_construct(&p_ur->receiver);
-
- if (cl_event_init(&p_ur->signal, FALSE))
- goto Exit;
-
- /*
- * Initialize the thread after all other dependent objects
- * have been initialized.
- */
- if (cl_thread_init( &p_ur->receiver, umad_receiver, p_ur,
- "umad receiver" ))
- goto Exit;
-
- r = 0; /* success */
+ if (pthread_create(&p_ur->tid, NULL, umad_receiver, p_ur) < 0)
+ return -1;
-Exit:
- OSM_LOG_EXIT( p_vend->p_log );
- return r;
+ return 0;
}
+static void umad_receiver_stop(umad_receiver_t *p_ur)
+{
+ pthread_cancel(p_ur->tid);
+ pthread_join(p_ur->tid, NULL);
+ p_ur->tid = 0;
+ p_ur->p_vend = NULL;
+ p_ur->p_log = NULL;
+}
/**********************************************************************
**********************************************************************/
ib_api_status_t
@@ -454,23 +449,11 @@ osm_vendor_init(
p_vend->p_log = p_log;
p_vend->timeout = timeout;
p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT;
- cl_spinlock_construct( &p_vend->cb_lock );
- cl_spinlock_construct( &p_vend->match_tbl_lock );
+ pthread_mutex_init(&p_vend->cb_mutex, NULL);
+ pthread_mutex_init(&p_vend->match_tbl_mutex, NULL);
p_vend->umad_port_id = -1;
p_vend->issmfd = -1;
- if ((r = cl_spinlock_init( &p_vend->cb_lock ))) {
- osm_log(p_vend->p_log, OSM_LOG_ERROR,
- "osm_vendor_init: ERR 5435: Error initializing cb spinlock\n");
- goto Exit;
- }
-
- if ((r = cl_spinlock_init( &p_vend->match_tbl_lock ))) {
- osm_log(p_vend->p_log, OSM_LOG_ERROR,
- "osm_vendor_init: ERR 5434: Error initializing match tbl spinlock\n");
- goto Exit;
- }
-
/*
* Open our instance of UMAD.
*/
@@ -541,29 +524,14 @@ void
osm_vendor_delete(
IN osm_vendor_t** const pp_vend )
{
- umad_receiver_t *p_ur;
- int agent_id;
-
- if ((*pp_vend)->umad_port_id >= 0) {
- /* unregister UMAD agents */
- for (agent_id = 0; agent_id < UMAD_CA_MAX_AGENTS; agent_id++)
- if ( (*pp_vend)->agents[agent_id] )
- umad_unregister((*pp_vend)->umad_port_id,
- agent_id );
- umad_close_port((*pp_vend)->umad_port_id);
- (*pp_vend)->umad_port_id = -1;
- }
+ osm_vendor_close_port(*pp_vend);
clear_madw( *pp_vend );
/* make sure all ports are closed */
umad_done();
- /* umad receiver thread ? */
- p_ur = (*pp_vend)->receiver;
- if (p_ur)
- cl_event_destroy( &p_ur->signal );
- cl_spinlock_destroy( &(*pp_vend)->cb_lock );
- cl_spinlock_destroy( &(*pp_vend)->match_tbl_lock );
+ pthread_mutex_destroy(&(*pp_vend)->cb_mutex);
+ pthread_mutex_destroy(&(*pp_vend)->match_tbl_mutex);
free( *pp_vend );
*pp_vend = NULL;
}
@@ -780,7 +748,7 @@ osm_vendor_open_port(
p_vend->umad_port_id = umad_port_id = -1;
goto Exit;
}
- if (umad_receiver_init(p_vend) != 0) {
+ if (umad_receiver_start(p_vend) != 0) {
osm_log( p_vend->p_log, OSM_LOG_ERROR,
"osm_vendor_open_port: ERR 5420: "
"umad_receiver_init failed\n" );
@@ -793,6 +761,27 @@ Exit:
return umad_port_id;
}
+static void osm_vendor_close_port(osm_vendor_t* const p_vend)
+{
+ umad_receiver_t *p_ur;
+ int i;
+
+ p_ur = p_vend->receiver;
+ p_vend->receiver = NULL;
+ if (p_ur) {
+ umad_receiver_stop(p_ur);
+ free(p_ur);
+ }
+
+ if (p_vend->umad_port_id >= 0) {
+ for (i = 0; i < UMAD_CA_MAX_AGENTS; i++)
+ if (p_vend->agents[i])
+ umad_unregister(p_vend->umad_port_id, i);
+ umad_close_port(p_vend->umad_port_id);
+ p_vend->umad_port_id = -1;
+ }
+}
+
static int set_bit(int nr, void *method_mask)
{
int mask, retval;
@@ -985,10 +974,10 @@ osm_vendor_unbind(
OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unbind );
- cl_spinlock_acquire( &p_vend->cb_lock );
+ pthread_mutex_lock(&p_vend->cb_mutex);
p_bind->mad_recv_callback = __osm_vendor_recv_dummy_cb;
p_bind->send_err_callback = __osm_vendor_send_err_dummy_cb;
- cl_spinlock_release( &p_vend->cb_lock );
+ pthread_mutex_unlock(&p_vend->cb_mutex);
OSM_LOG_EXIT( p_vend->p_log);
}
@@ -1154,9 +1143,9 @@ Resp:
"Send p_madw = %p of size %d failed %d (%m)\n",
p_madw, sent_mad_size, ret);
p_madw->status = IB_ERROR;
- cl_spinlock_acquire( &p_vend->cb_lock );
+ pthread_mutex_lock(&p_vend->cb_mutex);
(*p_bind->send_err_callback)(p_bind->client_context, p_madw); /* cb frees madw */
- cl_spinlock_release( &p_vend->cb_lock );
+ pthread_mutex_unlock(&p_vend->cb_mutex);
goto Exit;
}
diff --git a/osm/libvendor/osm_vendor_ibumad_sa.c b/osm/libvendor/osm_vendor_ibumad_sa.c
index e3978ef..a110e81 100644
--- a/osm/libvendor/osm_vendor_ibumad_sa.c
+++ b/osm/libvendor/osm_vendor_ibumad_sa.c
@@ -39,9 +39,10 @@
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <vendor/osm_vendor_api.h>
#include <vendor/osm_vendor_sa_api.h>
-#include <sys/time.h>
+#include <complib/cl_event.h>
#define MAX_PORTS 64
--
1.5.0.1.40.gb40d