[openib-general] [PATCH] osm: handle local events

Yevgeny Kliteynik kliteyn at mellanox.co.il
Tue Aug 22 08:41:58 PDT 2006


Hi Hal

This patch implements the first item of the OSM todo list.

OpenSM opens a thread that is listening for events on the SM's port.
The events that are being taken care of are IBV_EVENT_DEVICE_FATAL and
IBV_EVENT_PORT_ERR.

In case of IBV_EVENT_DEVICE_FATAL, osm is forced to exit.
In case of IBV_EVENT_PORT_ERR, osm initiates a heavy sweep.

Yevgeny

Signed-off-by:  Yevgeny Kliteynik <kliteyn at mellanox.co.il>

Index: include/opensm/osm_sm_mad_ctrl.h
===================================================================
--- include/opensm/osm_sm_mad_ctrl.h	(revision 8998)
+++ include/opensm/osm_sm_mad_ctrl.h	(working copy)
@@ -109,6 +109,7 @@ typedef struct _osm_sm_mad_ctrl
 	osm_mad_pool_t				*p_mad_pool;
 	osm_vl15_t					*p_vl15;
 	osm_vendor_t				*p_vendor;
+	struct _osm_state_mgr			*p_state_mgr; 
 	osm_bind_handle_t			h_bind;
 	cl_plock_t					*p_lock;
 	cl_dispatcher_t				*p_disp;
@@ -130,6 +131,9 @@ typedef struct _osm_sm_mad_ctrl
 *	p_vendor
 *		Pointer to the vendor specific interfaces object.
 *
+*	p_state_mgr
+*		Pointer to the state manager object.
+*
 *	h_bind
 *		Bind handle returned by the transport layer.
 *
@@ -233,6 +237,7 @@ osm_sm_mad_ctrl_init(
 	IN osm_mad_pool_t* const p_mad_pool,
 	IN osm_vl15_t* const p_vl15,
 	IN osm_vendor_t* const p_vendor,
+	IN struct _osm_state_mgr* const p_state_mgr,
 	IN osm_log_t* const p_log,
 	IN osm_stats_t* const p_stats,
 	IN cl_plock_t* const p_lock,
@@ -251,6 +256,9 @@ osm_sm_mad_ctrl_init(
 *	p_vendor
 *		[in] Pointer to the vendor specific interfaces object.
 *
+*	p_state_mgr
+*		[in] Pointer to the state manager object.
+*
 *	p_log
 *		[in] Pointer to the log object.
 *
Index: include/vendor/osm_vendor_ibumad.h
===================================================================
--- include/vendor/osm_vendor_ibumad.h	(revision 8998)
+++ include/vendor/osm_vendor_ibumad.h	(working copy)
@@ -74,6 +74,8 @@ BEGIN_C_DECLS
 #define OSM_UMAD_MAX_CAS	32
 #define OSM_UMAD_MAX_PORTS_PER_CA	2
 
+#define OSM_VENDOR_SUPPORT_EVENTS
+
 /* OpenIB gen2 doesn't support RMPP yet */
 
 /****s* OpenSM: Vendor UMAD/osm_ca_info_t
@@ -179,6 +181,10 @@ typedef	struct _osm_vendor
 	int umad_port_id;
 	void *receiver;
 	int issmfd;
+	cl_thread_t events_thread;  
+	void * events_callback;
+	void * sm_context;
+	struct ibv_context * ibv_context;
 } osm_vendor_t;
 
 #define OSM_BIND_INVALID_HANDLE 0
Index: include/vendor/osm_vendor_api.h
===================================================================
--- include/vendor/osm_vendor_api.h	(revision 8998)
+++ include/vendor/osm_vendor_api.h	(working copy)
@@ -526,6 +526,110 @@ osm_vendor_set_debug(
 * SEE ALSO
 *********/
 
+#ifdef OSM_VENDOR_SUPPORT_EVENTS
+
+#define OSM_EVENT_FATAL     1
+#define OSM_EVENT_PORT_ERR  2
+
+/****s* OpenSM Vendor API/osm_vend_events_callback_t
+* NAME
+*  osm_vend_events_callback_t
+*
+* DESCRIPTION
+*  Function prototype for the vendor events callback.
+*  The vendor layer calls this function on driver events.
+*
+* SYNOPSIS
+*/
+typedef void 
+(*osm_vend_events_callback_t)(
+   IN int events_mask,
+   IN void * const context );
+/*
+* PARAMETERS
+*  events_mask
+*     [in] The received event(s).
+*
+*  context
+*     [in] Context supplied as the "sm_context" argument in
+*          the osm_vendor_reg_events_cb call
+*
+* RETURN VALUES
+*  None.
+*
+* NOTES
+*
+* SEE ALSO
+*  osm_vendor_reg_events_cb osm_vendor_unreg_events_cb
+*********/
+
+/****f* OpenSM Vendor API/osm_vendor_reg_events_cb
+* NAME
+*   osm_vendor_reg_events_cb
+*
+* DESCRIPTION
+*  Registers the events callback function and starts the events
+*  thread
+*
+* SYNOPSIS
+*/
+int 
+osm_vendor_reg_events_cb(
+   IN osm_vendor_t * const p_vend,
+   IN void * const sm_callback,
+   IN void * const sm_context);
+/*
+* PARAMETERS
+*   p_vend
+*     [in] vendor handle.
+*
+*   sm_callback
+*     [in] Callback function that should be called when
+*          the event is received.
+*
+*  sm_context
+*     [in] Context supplied as the "context" argument in
+*          the subsequent calls to the sm_callback function
+*
+* RETURN VALUE
+*  IB_SUCCESS if OK.
+*
+* NOTES
+*
+* SEE ALSO
+*  osm_vend_events_callback_t osm_vendor_unreg_events_cb
+*********/
+
+/****f* OpenSM Vendor API/osm_vendor_unreg_events_cb
+* NAME
+*   osm_vendor_unreg_events_cb
+*
+* DESCRIPTION
+*  Un-Registers the events callback function and stops the events
+*  thread
+*
+* SYNOPSIS
+*/
+void 
+osm_vendor_unreg_events_cb(
+   IN osm_vendor_t * const p_vend);
+/*
+* PARAMETERS
+*   p_vend
+*     [in] vendor handle.
+*
+*
+* RETURN VALUE
+*  None.
+*
+* NOTES
+*
+* SEE ALSO
+*  osm_vend_events_callback_t osm_vendor_reg_events_cb
+*********/
+
+#endif /* OSM_VENDOR_SUPPORT_EVENTS */
+
 END_C_DECLS
 
 #endif   /* _OSM_VENDOR_API_H_ */
Index: libvendor/osm_vendor_ibumad.c
===================================================================
--- libvendor/osm_vendor_ibumad.c	(revision 8998)
+++ libvendor/osm_vendor_ibumad.c	(working copy)
@@ -72,6 +72,7 @@
 #include <opensm/osm_log.h>
 #include <opensm/osm_mad_pool.h>
 #include <vendor/osm_vendor_api.h>
+#include <infiniband/verbs.h>
 
 /****s* OpenSM: Vendor AL/osm_umad_bind_info_t
  * NAME
@@ -441,6 +442,108 @@ Exit:
 
 /**********************************************************************
  **********************************************************************/
+/*
+ * Body of the events thread started by osm_vendor_reg_events_cb.
+ * Blocks in ibv_get_async_event() on the SM's device and forwards
+ * IBV_EVENT_DEVICE_FATAL (as OSM_EVENT_FATAL) and IBV_EVENT_PORT_ERR on
+ * the SM's port (as OSM_EVENT_PORT_ERR) to the registered callback.
+ * Every other event is logged and ignored.
+ *
+ * The thread terminates when reading an event fails, after a fatal
+ * device event has been reported, or when an event arrives after the
+ * callback has been cleared.
+ *
+ * vend_context
+ *    [in] The osm_vendor_t that owns the thread.
+ */
+static void
+umad_events_thread(
+   IN void * vend_context)
+{
+   int res = 0;
+   osm_vendor_t * p_vend = (osm_vendor_t *) vend_context;
+   /*
+    * Private copies: osm_vendor_unreg_events_cb resets these fields to
+    * NULL (possibly from another thread), so they are read only once.
+    */
+   struct ibv_context * p_ibv_ctx = p_vend->ibv_context;
+   osm_vend_events_callback_t events_cb;
+   struct ibv_async_event event;
+
+   OSM_LOG_ENTER( p_vend->p_log, umad_events_thread );
+
+   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
+           "umad_events_thread: Device %s, async event FD: %d\n", 
+           p_vend->umad_port.ca_name, p_ibv_ctx->async_fd);
+   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
+           "umad_events_thread: Listening for events on device %s, port %d\n", 
+           p_vend->umad_port.ca_name, p_vend->umad_port.portnum);
+
+   while (1) {
+
+      res = ibv_get_async_event(p_ibv_ctx, &event);
+      if (res)
+      {
+         osm_log(p_vend->p_log, OSM_LOG_ERROR,
+                 "umad_events_thread: ERR 5450: "
+                 "Failed getting async event (device %s, port %d)\n", 
+                 p_vend->umad_port.ca_name, p_vend->umad_port.portnum);
+         goto Exit;
+      }
+
+      events_cb = (osm_vend_events_callback_t) p_vend->events_callback;
+      if (!events_cb)
+      {
+         osm_log(p_vend->p_log, OSM_LOG_DEBUG,
+                 "umad_events_thread: Events callback has been unregistered\n");
+         ibv_ack_async_event(&event);
+         goto Exit;
+      }
+
+      /*
+       * A fatal error concerns the whole device, and element.port_num is
+       * only meaningful for port events, so this event is handled before
+       * any port filtering is applied.
+       */
+      if ( event.event_type == IBV_EVENT_DEVICE_FATAL )
+      {
+         osm_log(p_vend->p_log, OSM_LOG_INFO,
+                 "umad_events_thread: Received IBV_EVENT_DEVICE_FATAL\n");
+         events_cb(OSM_EVENT_FATAL, p_vend->sm_context);
+         ibv_ack_async_event(&event);
+         goto Exit;
+      }
+
+      /*
+       * Of the port events, only errors on the SM's own port are reported
+       */
+      if ( event.event_type == IBV_EVENT_PORT_ERR &&
+           event.element.port_num == p_vend->umad_port.portnum )
+      {
+         osm_log(p_vend->p_log, OSM_LOG_VERBOSE,
+                 "umad_events_thread: Received IBV_EVENT_PORT_ERR\n");
+         events_cb(OSM_EVENT_PORT_ERR, p_vend->sm_context);
+      }
+      else 
+      {
+         osm_log(p_vend->p_log, OSM_LOG_DEBUG,
+                 "umad_events_thread: Received event #%d on port %d - Ignoring\n",
+                 event.event_type, event.element.port_num);
+      }
+
+      /* Every event that was read must be acknowledged. */
+      ibv_ack_async_event(&event);
+   }
+
+  Exit:
+   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
+           "umad_events_thread: Terminating thread\n");
+   OSM_LOG_EXIT(p_vend->p_log);
+   return;
+}
+
+/**********************************************************************
+ **********************************************************************/
 ib_api_status_t
 osm_vendor_init(
   IN osm_vendor_t* const p_vend,
@@ -456,6 +542,7 @@ osm_vendor_init(
 	p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT;
 	cl_spinlock_construct( &p_vend->cb_lock );
 	cl_spinlock_construct( &p_vend->match_tbl_lock );
+	cl_thread_construct( &p_vend->events_thread );
 	p_vend->umad_port_id = -1;
 	p_vend->issmfd = -1;
 
@@ -1217,4 +1304,134 @@ osm_vendor_set_debug(
 	umad_debug(level);
 }
 
+/**********************************************************************
+ **********************************************************************/
+/*
+ * Registers sm_callback/sm_context, opens the verbs device whose name
+ * matches the SM's CA and starts the events thread on it.
+ * Returns IB_SUCCESS on success and IB_ERROR otherwise; on failure
+ * nothing stays registered and no device context is left open.
+ */
+int 
+osm_vendor_reg_events_cb(
+   IN osm_vendor_t * const p_vend,
+   IN void * const sm_callback,
+   IN void * const sm_context)
+{
+   ib_api_status_t status = IB_SUCCESS;
+   struct ibv_device ** dev_list;
+   struct ibv_device * device = NULL;
+   struct ibv_context * p_ibv_ctx = NULL;
+   int i;
+
+   OSM_LOG_ENTER( p_vend->p_log, osm_vendor_reg_events_cb );
+
+   p_vend->events_callback = sm_callback;
+   p_vend->sm_context = sm_context;
+
+   dev_list = ibv_get_device_list(NULL);
+   if (!dev_list || !(*dev_list)) {
+      osm_log(p_vend->p_log, OSM_LOG_ERROR,
+              "osm_vendor_reg_events_cb: ERR 5440: "
+              "No IB devices found\n");
+      status = IB_ERROR;
+      goto Exit;
+   }
+
+   if (!p_vend->umad_port.ca_name || !p_vend->umad_port.ca_name[0])
+   {
+      osm_log(p_vend->p_log, OSM_LOG_ERROR,
+              "osm_vendor_reg_events_cb: ERR 5441: "
+              "Vendor initialization is not completed yet\n");
+      status = IB_ERROR;
+      goto Exit;
+   }
+
+   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
+           "osm_vendor_reg_events_cb: Registering on device %s\n",
+           p_vend->umad_port.ca_name);
+
+   /*
+    * Find the device whose name matches the SM's device.
+    * dev_list is a NULL-terminated array of device pointers.
+    */
+   for ( i = 0; dev_list[i] != NULL; i++ )
+   {
+      if (strcmp(p_vend->umad_port.ca_name,
+                 ibv_get_device_name(dev_list[i])) == 0)
+      {
+         device = dev_list[i];
+         break;
+      }
+   }
+   if (!device)
+   {
+      osm_log(p_vend->p_log, OSM_LOG_ERROR,
+              "osm_vendor_reg_events_cb: ERR 5442: "
+              "Device %s hasn't been found in the device list\n"
+              ,p_vend->umad_port.ca_name);
+      status = IB_ERROR;
+      goto Exit;
+   }
+
+   p_ibv_ctx = ibv_open_device(device);
+   if (!p_ibv_ctx) {
+      osm_log(p_vend->p_log, OSM_LOG_ERROR,
+              "osm_vendor_reg_events_cb: ERR 5443: "
+              "Couldn't get context for %s\n", 
+              p_vend->umad_port.ca_name);
+      status = IB_ERROR;
+      goto Exit;
+   }
+   /* The events thread reads the context from p_vend: publish it first */
+   p_vend->ibv_context = p_ibv_ctx;
+
+   /*
+    * Initiate the events thread
+    */
+   if (cl_thread_init(&p_vend->events_thread, 
+                      umad_events_thread,
+                      p_vend,
+                      "ibumad events thread") != CL_SUCCESS) {
+      osm_log(p_vend->p_log, OSM_LOG_ERROR,
+              "osm_vendor_reg_events_cb: ERR 5444: "
+              "Failed initiating event listening thread\n");
+      status = IB_ERROR;
+      goto Exit;
+   }
+
+  Exit:
+   /* An already opened device stays usable after the list is freed */
+   if (dev_list)
+      ibv_free_device_list(dev_list);
+   if (status != IB_SUCCESS)
+   {
+      /* NOTE(review): assumes no events thread is running on this path */
+      if (p_ibv_ctx)
+         ibv_close_device(p_ibv_ctx);
+      p_vend->events_callback = NULL;
+      p_vend->sm_context = NULL;
+      p_vend->ibv_context = NULL;
+   }
+   OSM_LOG_EXIT( p_vend->p_log );
+   return status;
+}
+
+/**********************************************************************
+ **********************************************************************/
+void 
+osm_vendor_unreg_events_cb(
+   IN osm_vendor_t * const p_vend)
+{
+   OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unreg_events_cb );
+   /* The events thread exits once it reads an event and finds no callback */
+   p_vend->events_callback = NULL;
+   p_vend->sm_context = NULL;
+   p_vend->ibv_context = NULL;
+   OSM_LOG_EXIT( p_vend->p_log );
+}
+
+/**********************************************************************
+ **********************************************************************/
+
 #endif   /* OSM_VENDOR_INTF_OPENIB */
Index: libvendor/libosmvendor.map
===================================================================
--- libvendor/libosmvendor.map	(revision 8998)
+++ libvendor/libosmvendor.map	(working copy)
@@ -1,4 +1,4 @@
-OSMVENDOR_2.0 {
+OSMVENDOR_2.1 {
 	global:
 		umad_receiver;
 		osm_vendor_init;
@@ -23,5 +23,7 @@ OSMVENDOR_2.0 {
 		osmv_bind_sa;
 		osmv_query_sa;
 		osm_vendor_get_guid_ca_and_port;
+                osm_vendor_reg_events_cb;
+                osm_vendor_unreg_events_cb;
 	local: *;
 };
Index: opensm/osm_sm.c
===================================================================
--- opensm/osm_sm.c	(revision 8998)
+++ opensm/osm_sm.c	(working copy)
@@ -313,6 +313,7 @@ osm_sm_init(
                                   p_sm->p_mad_pool,
                                   p_sm->p_vl15,
                                   p_sm->p_vendor,
+                                  &p_sm->state_mgr,
                                   p_log, p_stats, p_lock, p_disp );
    if( status != IB_SUCCESS )
       goto Exit;
Index: opensm/osm_sm_mad_ctrl.c
===================================================================
--- opensm/osm_sm_mad_ctrl.c	(revision 8998)
+++ opensm/osm_sm_mad_ctrl.c	(working copy)
@@ -59,6 +59,7 @@
 #include <opensm/osm_msgdef.h>
 #include <opensm/osm_helper.h>
 #include <opensm/osm_opensm.h>
+#include <opensm/osm_state_mgr.h>
 
 /****f* opensm: SM/__osm_sm_mad_ctrl_retire_trans_mad
  * NAME
@@ -953,6 +954,7 @@ osm_sm_mad_ctrl_init(
   IN osm_mad_pool_t* const p_mad_pool,
   IN osm_vl15_t* const p_vl15,
   IN osm_vendor_t* const p_vendor,
+  IN struct _osm_state_mgr* const p_state_mgr, 
   IN osm_log_t* const p_log,
   IN osm_stats_t* const p_stats,
   IN cl_plock_t* const p_lock,
@@ -969,6 +971,7 @@ osm_sm_mad_ctrl_init(
   p_ctrl->p_disp = p_disp;
   p_ctrl->p_mad_pool = p_mad_pool;
   p_ctrl->p_vendor = p_vendor;
+  p_ctrl->p_state_mgr = p_state_mgr;
   p_ctrl->p_stats = p_stats;
   p_ctrl->p_lock = p_lock;
   p_ctrl->p_vl15 = p_vl15;
@@ -995,6 +998,54 @@ osm_sm_mad_ctrl_init(
 
 /**********************************************************************
  **********************************************************************/
+/*
+ * Vendor events callback registered in osm_sm_mad_ctrl_bind.
+ * OSM_EVENT_FATAL raises osm_exit_flag; OSM_EVENT_PORT_ERR forces an
+ * immediate heavy sweep. A fatal event takes precedence when both bits
+ * are set. context is the osm_sm_mad_ctrl_t passed at registration.
+ */
+void 
+__osm_vend_events_callback(
+   IN int events_mask,
+   IN void * const context )
+{
+   osm_sm_mad_ctrl_t * const p_ctrl = (osm_sm_mad_ctrl_t * const) context;
+
+   OSM_LOG_ENTER(p_ctrl->p_log, __osm_vend_events_callback);
+
+   if (events_mask & OSM_EVENT_FATAL)
+   {
+      osm_log(p_ctrl->p_log, OSM_LOG_INFO,
+              "__osm_vend_events_callback: "
+              "Events callback got OSM_EVENT_FATAL\n");
+      osm_log(p_ctrl->p_log, OSM_LOG_SYS,
+              "Fatal HCA error - forcing OpenSM exit\n");
+      osm_exit_flag = 1;
+      OSM_LOG_EXIT(p_ctrl->p_log);
+      return;
+   }
+
+   if (events_mask & OSM_EVENT_PORT_ERR)
+   {
+      osm_log(p_ctrl->p_log, OSM_LOG_INFO,
+              "__osm_vend_events_callback: "
+              "Events callback got OSM_EVENT_PORT_ERR - forcing heavy sweep\n");
+      p_ctrl->p_subn->force_immediate_heavy_sweep = TRUE;
+      osm_state_mgr_process((osm_state_mgr_t * const)p_ctrl->p_state_mgr,
+                            OSM_SIGNAL_SWEEP);
+      OSM_LOG_EXIT(p_ctrl->p_log);
+      return;
+   }
+
+   osm_log(p_ctrl->p_log, OSM_LOG_INFO,
+           "__osm_vend_events_callback: "
+           "Events callback got event mask of %d - No action taken\n",
+           events_mask);
+   OSM_LOG_EXIT(p_ctrl->p_log);
+}
+
+/**********************************************************************
+ **********************************************************************/
 ib_api_status_t
 osm_sm_mad_ctrl_bind(
   IN osm_sm_mad_ctrl_t* const p_ctrl,
@@ -1044,6 +1088,17 @@ osm_sm_mad_ctrl_bind(
     goto Exit;
   }
 
+  if ( osm_vendor_reg_events_cb(p_ctrl->p_vendor, 
+                                __osm_vend_events_callback, 
+                                p_ctrl) )
+  {
+     status = IB_ERROR;
+     osm_log( p_ctrl->p_log, OSM_LOG_ERROR,
+              "osm_sm_mad_ctrl_bind: ERR 3120: "
+              "Vendor failed to register for events\n" );
+     goto Exit;
+  }
+
  Exit:
   OSM_LOG_EXIT( p_ctrl->p_log );
   return( status );
Index: config/osmvsel.m4
===================================================================
--- config/osmvsel.m4	(revision 8998)
+++ config/osmvsel.m4	(working copy)
@@ -63,9 +63,9 @@ if test $with_osmv = "openib"; then
    OSMV_CFLAGS="-DOSM_VENDOR_INTF_OPENIB"
    OSMV_INCLUDES="-I\$(srcdir)/../include -I\$(srcdir)/../../libibcommon/include/infiniband -I\$(srcdir)/../../libibumad/include/infiniband"
    if test "x$with_umad_libs" = "x"; then
-     OSMV_LDADD="-libumad"
+     OSMV_LDADD="-libumad -libverbs"
    else
-     OSMV_LDADD="-L$with_umad_libs -libumad"
+     OSMV_LDADD="-L$with_umad_libs -libumad -libverbs"
    fi
 
    if test "x$with_umad_includes" != "x"; then 
@@ -137,6 +137,8 @@ if test "$disable_libcheck" != "yes"; th
    LDFLAGS="$LDFLAGS $OSMV_LDADD"
    AC_CHECK_LIB(ibumad, umad_init, [],
 	 AC_MSG_ERROR([umad_init() not found. libosmvendor of type openib requires libibumad.]))
+   AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
+	 AC_MSG_ERROR([ibv_get_device_list() not found. libosmvendor of type openib requires libibverbs.]))
    LD_FLAGS=$osmv_save_ldflags
  elif test $with_osmv = "sim" ; then
    LDFLAGS="$LDFLAGS -L$with_sim/lib"





More information about the general mailing list