[openib-general] [PATCH] Opensm - exiting issues

Yael Kalka yael at mellanox.co.il
Mon Nov 7 05:25:07 PST 2005


Hi Hal,

There was a problem when running opensm with the -o option that caused
opensm to always exit with a segfault, due to object destruction
ordering. There is also the known issue with exiting opensm. We've
done some cleanup of the exit code, and the following patch fixes most
of these problems.
With the current code we saw that opensm sometimes gets "stuck" on
exit and causes the machine to get stuck too, so that a reboot is
needed. The following patch fixes most of these cases.
With the patch applied we did run into rare cases where opensm exits
with an error, but at least it exits without hanging the machine...

Thanks,
Yael

Signed-off-by:  Yael Kalka <yael at mellanox.co.il>

Index: libvendor/osm_vendor_ibumad.c
===================================================================
--- libvendor/osm_vendor_ibumad.c	(revision 3975)
+++ libvendor/osm_vendor_ibumad.c	(working copy)
@@ -542,9 +542,14 @@ osm_vendor_delete(
 	int agent_id;
 
 	/* unregister UMAD agents */
+   /* This sometimes causes errors on exit, that cause kernel errors
+      and result in need to reboot machine. Currently - do not call
+      the umad_unregister. */
+   /*
 	for (agent_id = 0; agent_id < UMAD_CA_MAX_AGENTS; agent_id++)
 		if ( (*pp_vend)->agents[agent_id] )
 			umad_unregister( (*pp_vend)->umad_port_id, agent_id );
+   */
 	clear_madw( *pp_vend );
 	/* make sure all ports are closed */
 	umad_done();
@@ -839,7 +844,7 @@ osm_vendor_bind(
 			"osm_vendor_bind: ERR 5426: "
 			"Unable to register class %u version %u.\n",
 			p_user_bind->mad_class, p_user_bind->class_version);
-		free(p_bind);
+		cl_free(p_bind);
 		p_bind = 0;
 		goto Exit;
 	}
@@ -851,7 +856,7 @@ osm_vendor_bind(
 			"bad agent id %u or duplicate agent for class %u vers %u\n",
 			p_bind->agent_id, p_user_bind->mad_class,
 			p_user_bind->class_version);
-		free(p_bind);
+		cl_free(p_bind);
 		p_bind = 0;
 		goto Exit;
 	}
@@ -868,7 +873,7 @@ osm_vendor_bind(
 				"osm_vendor_bind: ERR 5428: "
 				"Unable to register class 1 version %u.\n",
 				p_user_bind->class_version);
-			free(p_bind);
+			cl_free(p_bind);
 			p_bind = 0;
 			goto Exit;
 		}
@@ -879,7 +884,7 @@ osm_vendor_bind(
 				"osm_vendor_bind: ERR 5429: "
 				"bad agent id %u or duplicate agent for class 1 vers %u\n",
 				p_bind->agent_id1, p_user_bind->class_version);
-			free(p_bind);
+			cl_free(p_bind);
 			p_bind = 0;
 			goto Exit;
 		}
@@ -892,6 +897,19 @@ Exit:
 	return( (osm_bind_handle_t)p_bind );
 }
 
+
+
+/**********************************************************************
+ **********************************************************************/
+void
+__osm_vendor_dummy_callback(
+  IN osm_madw_t *p_madw,
+  IN void *bind_context,
+  IN osm_madw_t *p_req_madw )
+{
+  printf("Ignoring received/sent mads after unbind\n");
+}
+
 /**********************************************************************
  **********************************************************************/
 void
@@ -903,6 +921,11 @@ osm_vendor_unbind(
 
 	OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unbind );
 
+   cl_spinlock_acquire( &p_vend->cb_lock );
+   p_bind->mad_recv_callback = __osm_vendor_dummy_callback;
+   p_bind->send_err_callback = __osm_vendor_dummy_callback;
+   cl_spinlock_release( &p_vend->cb_lock );
+
 	OSM_LOG_EXIT( p_vend->p_log);
 }
 
Index: opensm/osm_opensm.c
===================================================================
--- opensm/osm_opensm.c	(revision 3975)
+++ opensm/osm_opensm.c	(working copy)
@@ -108,14 +108,11 @@ osm_opensm_destroy(
     */
    osm_sm_shutdown( &p_osm->sm );
 
-   /* shut down the dispatcher - so no new messages cross */
-   cl_disp_shutdown( &p_osm->disp );
-
    /* cleanup all messages on VL15 fifo that were not sent yet */
    osm_vl15_shutdown( &p_osm->vl15, &p_osm->mad_pool );
 
-   /* lock the whole thing so we do not get any requests etc */
-   cl_plock_excl_acquire( &p_osm->lock );
+   /* shut down the dispatcher - so no new messages cross */
+   cl_disp_shutdown( &p_osm->disp );
 
    /* do the destruction in reverse order as init */
    updn_destroy( p_osm->p_updn_ucast_routing );
@@ -128,7 +125,6 @@ osm_opensm_destroy(
    osm_subn_destroy( &p_osm->subn );
    cl_disp_destroy( &p_osm->disp );
 
-   cl_plock_release( &p_osm->lock );
    cl_plock_destroy( &p_osm->lock );
 
    cl_mem_display(  );
Index: opensm/osm_vl15intf.c
===================================================================
--- opensm/osm_vl15intf.c	(revision 3975)
+++ opensm/osm_vl15intf.c	(working copy)
@@ -334,8 +334,6 @@ osm_vl15_destroy(
   p_vl->state = OSM_VL15_STATE_INIT;
   cl_spinlock_destroy( &p_vl->lock );
 
-  cl_disp_unregister( p_vl->h_disp );
-
   OSM_LOG_EXIT( p_vl->p_log );
 }
 
@@ -500,6 +498,8 @@ osm_vl15_shutdown(
   /* grap a lock on the object */
   cl_spinlock_acquire( &p_vl->lock );
 
+  cl_disp_unregister( p_vl->h_disp );
+
   /* go over all outstanding MADs and retire their transactions */
 
   /* first we handle the list of response MADs */




More information about the general mailing list