[openib-general] [PATCH] osm: handle local events

Hal Rosenstock halr at voltaire.com
Tue Aug 22 10:35:58 PDT 2006


Hi Yevgeny,

On Tue, 2006-08-22 at 11:41, Yevgeny Kliteynik wrote: 
> Hi Hal
> 
> This patch implements the first item of the OSM todo list.

Thanks!

Am I correct in assuming this is for both trunk and 1.1?

> OpenSM opens a thread that listens for events on the SM's port.
> The events that are handled are IBV_EVENT_DEVICE_FATAL and
> IBV_EVENT_PORT_ERR.
> 
> In case of IBV_EVENT_DEVICE_FATAL, osm is forced to exit.
> In case of IBV_EVENT_PORT_ERR, osm initiates a heavy sweep.

Some minor comments below. Let me know what you think. You don't have to
resubmit for these.

> Yevgeny
> 
> Signed-off-by:  Yevgeny Kliteynik <kliteyn at mellanox.co.il>
> 
> Index: include/opensm/osm_sm_mad_ctrl.h
> ===================================================================
> -- include/opensm/osm_sm_mad_ctrl.h	(revision 8998)
> +++ include/opensm/osm_sm_mad_ctrl.h	(working copy)
> @@ -109,6 +109,7 @@ typedef struct _osm_sm_mad_ctrl
>  	osm_mad_pool_t				*p_mad_pool;
>  	osm_vl15_t					*p_vl15;
>  	osm_vendor_t				*p_vendor;
> +	struct _osm_state_mgr			*p_state_mgr; 
>  	osm_bind_handle_t			h_bind;
>  	cl_plock_t					*p_lock;
>  	cl_dispatcher_t				*p_disp;
> @@ -130,6 +131,9 @@ typedef struct _osm_sm_mad_ctrl
>  *	p_vendor
>  *		Pointer to the vendor specific interfaces object.
>  *
> +*	p_state_mgr
> +*		Pointer to the state manager object.
> +*
>  *	h_bind
>  *		Bind handle returned by the transport layer.
>  *
> @@ -233,6 +237,7 @@ osm_sm_mad_ctrl_init(
>  	IN osm_mad_pool_t* const p_mad_pool,
>  	IN osm_vl15_t* const p_vl15,
>  	IN osm_vendor_t* const p_vendor,
> +	IN struct _osm_state_mgr* const p_state_mgr,
>  	IN osm_log_t* const p_log,
>  	IN osm_stats_t* const p_stats,
>  	IN cl_plock_t* const p_lock,
> @@ -251,6 +256,9 @@ osm_sm_mad_ctrl_init(
>  *	p_vendor
>  *		[in] Pointer to the vendor specific interfaces object.
>  *
> +*	p_state_mgr
> +*		[in] Pointer to the state manager object.
> +*
>  *	p_log
>  *		[in] Pointer to the log object.
>  *
> Index: include/vendor/osm_vendor_ibumad.h
> ===================================================================
> -- include/vendor/osm_vendor_ibumad.h	(revision 8998)
> +++ include/vendor/osm_vendor_ibumad.h	(working copy)
> @@ -74,6 +74,8 @@ BEGIN_C_DECLS
>  #define OSM_UMAD_MAX_CAS	32
>  #define OSM_UMAD_MAX_PORTS_PER_CA	2
>  
> +#define OSM_VENDOR_SUPPORT_EVENTS
> +

I would prefer this to be an additional flag turned on in the build for
OpenIB, rather than being defined unconditionally in this header.

>  /* OpenIB gen2 doesn't support RMPP yet */
>  
>  /****s* OpenSM: Vendor UMAD/osm_ca_info_t
> @@ -179,6 +181,10 @@ typedef	struct _osm_vendor
>  	int umad_port_id;
>  	void *receiver;
>  	int issmfd;
> +	cl_thread_t events_thread;  
> +	void * events_callback;
> +	void * sm_context;
> +	struct ibv_context * ibv_context;
>  } osm_vendor_t;
>  
>  #define OSM_BIND_INVALID_HANDLE 0
> Index: include/vendor/osm_vendor_api.h
> ===================================================================
> -- include/vendor/osm_vendor_api.h	(revision 8998)
> +++ include/vendor/osm_vendor_api.h	(working copy)
> @@ -526,6 +526,110 @@ osm_vendor_set_debug(
>  * SEE ALSO
>  *********/
>  
> +#ifdef OSM_VENDOR_SUPPORT_EVENTS
> +
> +#define OSM_EVENT_FATAL     1
> +#define OSM_EVENT_PORT_ERR  2
> +
> +/****s* OpenSM Vendor API/osm_vend_events_callback_t
> +* NAME
> +*  osm_vend_events_callback_t
> +*
> +* DESCRIPTION
> +*  Function prototype for the vendor events callback.
> +*  The vendor layer calls this function on driver events.
> +*
> +* SYNOPSIS
> +*/
> +typedef void 
> +(*osm_vend_events_callback_t)(
> +   IN int events_mask,
> +   IN void * const context );
> +/*
> +* PARAMETERS
> +*  events_mask
> +*     [in] The received event(s).
> +*
> +*  context
> +*     [in] Context supplied as the "sm_context" argument in
> +*          the osm_vendor_unreg_events_cb call
> +*
> +* RETURN VALUES
> +*  None.
> +*
> +* NOTES
> +*
> +* SEE ALSO
> +*  osm_vendor_reg_events_cb osm_vendor_unreg_events_cb
> +*********/
> +
> +/****f* OpenSM Vendor API/osm_vendor_reg_events_cb
> +* NAME
> +*   osm_vendor_reg_events_cb
> +*
> +* DESCRIPTION
> +*  Registers the events callback function and start the events
> +*  thread
> +*
> +* SYNOPSIS
> +*/
> +int 
> +osm_vendor_reg_events_cb(
> +   IN osm_vendor_t * const p_vend,
> +   IN void * const sm_callback,
> +   IN void * const sm_context);
> +/*
> +* PARAMETERS
> +*   p_vend
> +*     [in] vendor handle.
> +*
> +*   sm_callback
> +*     [in] Callback function that should be called when
> +*          the event is received.
> +*
> +*  sm_context
> +*     [in] Context supplied as the "context" argument in
> +*          the subsequenct calls to the sm_callback function
> +*
> +* RETURN VALUE
> +*  IB_SUCCESS if OK.
> +*
> +* NOTES
> +*
> +* SEE ALSO
> +*  osm_vend_events_callback_t osm_vendor_unreg_events_cb
> +*********/
> +
> +/****f* OpenSM Vendor API/osm_vendor_unreg_events_cb
> +* NAME
> +*   osm_vendor_unreg_events_cb
> +*
> +* DESCRIPTION
> +*  Un-Registers the events callback function and stops the events
> +*  thread
> +*
> +* SYNOPSIS
> +*/
> +void 
> +osm_vendor_unreg_events_cb(
> +   IN osm_vendor_t * const p_vend);
> +/*
> +* PARAMETERS
> +*   p_vend
> +*     [in] vendor handle.
> +*
> +*
> +* RETURN VALUE
> +*  None.
> +*
> +* NOTES
> +*
> +* SEE ALSO
> +*  osm_vend_events_callback_t osm_vendor_reg_events_cb
> +*********/
> +
> +#endif /* OSM_VENDOR_SUPPORT_EVENTS */
> +
>  END_C_DECLS
>  
>  #endif   /* _OSM_VENDOR_API_H_ */
> Index: libvendor/osm_vendor_ibumad.c
> ===================================================================
> -- libvendor/osm_vendor_ibumad.c	(revision 8998)
> +++ libvendor/osm_vendor_ibumad.c	(working copy)
> @@ -72,6 +72,7 @@
>  #include <opensm/osm_log.h>
>  #include <opensm/osm_mad_pool.h>
>  #include <vendor/osm_vendor_api.h>
> +#include <infiniband/verbs.h>
>  
>  /****s* OpenSM: Vendor AL/osm_umad_bind_info_t
>   * NAME
> @@ -441,6 +442,91 @@ Exit:
>  
>  /**********************************************************************
>   **********************************************************************/
> +static void
> +umad_events_thread(
> +   IN void * vend_context)
> +{
> +   int res = 0;
> +   osm_vendor_t * p_vend = (osm_vendor_t *) vend_context;
> +   struct ibv_async_event event;
> +
> +   OSM_LOG_ENTER( p_vend->p_log, umad_events_thread );
> +
> +   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +           "umad_events_thread: Device %s, async event FD: %d\n", 
> +           p_vend->umad_port.ca_name, p_vend->ibv_context->async_fd);
> +   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +           "umad_events_thread: Listening for events on device %s, port %d\n", 
> +           p_vend->umad_port.ca_name, p_vend->umad_port.portnum);
> +
> +   while (1) {
> +
> +      res = ibv_get_async_event(p_vend->ibv_context, &event);
> +      if (res)
> +      {
> +         osm_log(p_vend->p_log, OSM_LOG_ERROR,
> +                 "umad_events_thread: ERR 5450: "
> +                 "Failed getting async event (device %s, port %d)\n", 
> +                 p_vend->umad_port.ca_name, p_vend->umad_port.portnum);
> +         goto Exit;
> +      }
> +
> +      if (!p_vend->events_callback)
> +      {
> +         osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +                 "umad_events_thread: Events callback has been unregistered\n");
> +         ibv_ack_async_event(&event);
> +         goto Exit;
> +      }
> +      /*
> +       * We're listening to events on the SM's port only
> +       */
> +      if ( event.element.port_num == p_vend->umad_port.portnum )
> +      {
> +         switch (event.event_type)
> +         {
> +            case IBV_EVENT_DEVICE_FATAL: 
> +               osm_log(p_vend->p_log, OSM_LOG_INFO,
> +                       "umad_events_thread: Received IBV_EVENT_DEVICE_FATAL\n");
> +               ((osm_vend_events_callback_t)
> +                     (p_vend->events_callback))(OSM_EVENT_FATAL, p_vend->sm_context);
> +                                                
> +               ibv_ack_async_event(&event);
> +               goto Exit;
> +               break;
> +
> +            case IBV_EVENT_PORT_ERR:
> +               osm_log(p_vend->p_log, OSM_LOG_VERBOSE,
> +                      "umad_events_thread: Received IBV_EVENT_PORT_ERR\n");
> +               ((osm_vend_events_callback_t)
> +                     (p_vend->events_callback))(OSM_EVENT_PORT_ERR, p_vend->sm_context);
> +               break;
> +
> +            default:
> +               osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +                       "umad_events_thread: Received event #%d on port %d - Ignoring\n",
> +                       event.event_type, event.element.port_num);
> +         }
> +      }
> +      else 
> +      {
> +         osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +                 "umad_events_thread: Received event #%d on port %d - Ignoring\n",
> +                 event.event_type, event.element.port_num);
> +      }
> +
> +      ibv_ack_async_event(&event);
> +   }
> +
> +  Exit:
> +   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +           "umad_events_thread: Terminating thread\n");
> +   OSM_LOG_EXIT(p_vend->p_log);
> +   return;
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
>  ib_api_status_t
>  osm_vendor_init(
>    IN osm_vendor_t* const p_vend,
> @@ -456,6 +542,7 @@ osm_vendor_init(
>  	p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT;
>  	cl_spinlock_construct( &p_vend->cb_lock );
>  	cl_spinlock_construct( &p_vend->match_tbl_lock );
> +	cl_thread_construct( &p_vend->events_thread );
>  	p_vend->umad_port_id = -1;
>  	p_vend->issmfd = -1;
>  
> @@ -1217,4 +1304,114 @@ osm_vendor_set_debug(
>  	umad_debug(level);
>  }
>  
> +/**********************************************************************
> + **********************************************************************/
> +int 
> +osm_vendor_reg_events_cb(
> +   IN osm_vendor_t * const p_vend,
> +   IN void * const sm_callback,
> +   IN void * const sm_context)
> +{
> +   ib_api_status_t status = IB_SUCCESS;
> +   struct ibv_device ** dev_list;
> +   struct ibv_device * device;
> +
> +   OSM_LOG_ENTER( p_vend->p_log, osm_vendor_reg_events_cb );
> +
> +   p_vend->events_callback = sm_callback;
> +   p_vend->sm_context = sm_context;
> +
> +   dev_list = ibv_get_device_list(NULL);
> +   if (!dev_list || !(*dev_list)) {
> +      osm_log(p_vend->p_log, OSM_LOG_ERROR,
> +              "osm_vendor_reg_events_cb: ERR 5440: "
> +              "No IB devices found\n");
> +      status = IB_ERROR;
> +      goto Exit;
> +   }
> +
> +   if (!p_vend->umad_port.ca_name || !p_vend->umad_port.ca_name[0])
> +   {
> +      osm_log(p_vend->p_log, OSM_LOG_ERROR,
> +              "osm_vendor_reg_events_cb: ERR 5441: "
> +              "Vendor initialization is not completed yet\n");
> +      status = IB_ERROR;
> +      goto Exit;
> +   }
> +
> +   osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> +           "osm_vendor_reg_events_cb: Registering on device %s\n",
> +           p_vend->umad_port.ca_name);
> +
> +   /*
> +    * find device whos name matches the SM's device
> +    */
> +   for ( device = *dev_list; 
> +         (device != NULL) && 
> +         (strcmp(p_vend->umad_port.ca_name, ibv_get_device_name(device)) != 0);
> +         device += sizeof(struct ibv_device *) )
> +      ;
> +   if (!device)
> +   {
> +      osm_log(p_vend->p_log, OSM_LOG_ERROR,
> +              "osm_vendor_reg_events_cb: ERR 5442: "
> +              "Device %s hasn't been found in the device list\n"
> +              ,p_vend->umad_port.ca_name);
> +      status = IB_ERROR;
> +      goto Exit;
> +   }
> +
> +   p_vend->ibv_context = ibv_open_device(device);
> +   if (!p_vend->ibv_context) {
> +      osm_log(p_vend->p_log, OSM_LOG_ERROR,
> +              "osm_vendor_reg_events_cb: ERR 5443: "
> +              "Couldn't get context for %s\n", 
> +              p_vend->umad_port.ca_name);
> +      status = IB_ERROR;
> +      goto Exit;
> +   }
> +
> +   /*
> +    * Initiate the events thread
> +    */
> +   if (cl_thread_init(&p_vend->events_thread, 
> +                      umad_events_thread,
> +                      p_vend,
> +                      "ibumad events thread") != CL_SUCCESS) {
> +      osm_log(p_vend->p_log, OSM_LOG_ERROR,
> +              "osm_vendor_reg_events_cb: ERR 5444: "
> +              "Failed initiating event listening thread\n");
> +      status = IB_ERROR;
> +      goto Exit;
> +   }
> +
> +  Exit:
> +   if (status != IB_SUCCESS)
> +   {
> +      p_vend->events_callback = NULL;
> +      p_vend->sm_context = NULL;
> +      p_vend->ibv_context = NULL;
> +      p_vend->events_callback = NULL;
> +   }
> +   OSM_LOG_EXIT( p_vend->p_log );
> +   return status;
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
> +void 
> +osm_vendor_unreg_events_cb(
> +   IN osm_vendor_t * const p_vend)
> +{
> +   OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unreg_events_cb );
> +   p_vend->events_callback = NULL;
> +   p_vend->sm_context = NULL;
> +   p_vend->ibv_context = NULL;
> +   p_vend->events_callback = NULL;
> +   OSM_LOG_EXIT( p_vend->p_log );
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
> +
>  #endif   /* OSM_VENDOR_INTF_OPENIB */
> Index: libvendor/libosmvendor.map
> ===================================================================
> -- libvendor/libosmvendor.map	(revision 8998)
> +++ libvendor/libosmvendor.map	(working copy)
> @@ -1,4 +1,4 @@
> -OSMVENDOR_2.0 {
> +OSMVENDOR_2.1 {
>  	global:
>  		umad_receiver;
>  		osm_vendor_init;
> @@ -23,5 +23,7 @@ OSMVENDOR_2.0 {
>  		osmv_bind_sa;
>  		osmv_query_sa;
>  		osm_vendor_get_guid_ca_and_port;
> +                osm_vendor_reg_events_cb;
> +                osm_vendor_unreg_events_cb;
>  	local: *;
>  };
> Index: opensm/osm_sm.c
> ===================================================================
> -- opensm/osm_sm.c	(revision 8998)
> +++ opensm/osm_sm.c	(working copy)
> @@ -313,6 +313,7 @@ osm_sm_init(
>                                    p_sm->p_mad_pool,
>                                    p_sm->p_vl15,
>                                    p_sm->p_vendor,
> +                                  &p_sm->state_mgr,
>                                    p_log, p_stats, p_lock, p_disp );
>     if( status != IB_SUCCESS )
>        goto Exit;
> Index: opensm/osm_sm_mad_ctrl.c
> ===================================================================
> -- opensm/osm_sm_mad_ctrl.c	(revision 8998)
> +++ opensm/osm_sm_mad_ctrl.c	(working copy)
> @@ -59,6 +59,7 @@
>  #include <opensm/osm_msgdef.h>
>  #include <opensm/osm_helper.h>
>  #include <opensm/osm_opensm.h>
> +#include <opensm/osm_state_mgr.h>
>  
>  /****f* opensm: SM/__osm_sm_mad_ctrl_retire_trans_mad
>   * NAME
> @@ -953,6 +954,7 @@ osm_sm_mad_ctrl_init(
>    IN osm_mad_pool_t* const p_mad_pool,
>    IN osm_vl15_t* const p_vl15,
>    IN osm_vendor_t* const p_vendor,
> +  IN struct _osm_state_mgr* const p_state_mgr, 
>    IN osm_log_t* const p_log,
>    IN osm_stats_t* const p_stats,
>    IN cl_plock_t* const p_lock,
> @@ -969,6 +971,7 @@ osm_sm_mad_ctrl_init(
>    p_ctrl->p_disp = p_disp;
>    p_ctrl->p_mad_pool = p_mad_pool;
>    p_ctrl->p_vendor = p_vendor;
> +  p_ctrl->p_state_mgr = p_state_mgr;
>    p_ctrl->p_stats = p_stats;
>    p_ctrl->p_lock = p_lock;
>    p_ctrl->p_vl15 = p_vl15;
> @@ -995,6 +998,47 @@ osm_sm_mad_ctrl_init(
>  
>  /**********************************************************************
>   **********************************************************************/
> +void 
> +__osm_vend_events_callback(
> +   IN int events_mask,
> +   IN void * const context )

Shouldn't this be conditionalized on OSM_VENDOR_SUPPORT_EVENTS?

> +{
> +   osm_sm_mad_ctrl_t * const p_ctrl = (osm_sm_mad_ctrl_t * const) context;
> +
> +   OSM_LOG_ENTER(p_ctrl->p_log, __osm_vend_events_callback);
> +
> +   if (events_mask & OSM_EVENT_FATAL)
> +   {
> +      osm_log(p_ctrl->p_log, OSM_LOG_INFO,
> +              "__osm_vend_events_callback: "
> +              "Events callback got OSM_EVENT_FATAL\n");
> +      osm_log(p_ctrl->p_log, OSM_LOG_SYS,
> +              "Fatal HCA error - forcing OpenSM exit\n");
> +      osm_exit_flag = 1;
> +      OSM_LOG_EXIT(p_ctrl->p_log);
> +      return;
> +   }
> +
> +   if (events_mask & OSM_EVENT_PORT_ERR)
> +   {
> +      osm_log(p_ctrl->p_log, OSM_LOG_INFO,
> +              "__osm_vend_events_callback: "
> +              "Events callback got OSM_EVENT_PORT_ERR - forcing heavy sweep\n");
> +      p_ctrl->p_subn->force_immediate_heavy_sweep = TRUE;
> +      osm_state_mgr_process((osm_state_mgr_t * const)p_ctrl->p_state_mgr,
> +                            OSM_SIGNAL_SWEEP);
> +      OSM_LOG_EXIT(p_ctrl->p_log);
> +      return;
> +   }
> +
> +   osm_log(p_ctrl->p_log, OSM_LOG_INFO,
> +           "__osm_vend_events_callback: "
> +           "Events callback got event mask of %d - No action taken\n");
> +   OSM_LOG_EXIT(p_ctrl->p_log);
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
>  ib_api_status_t
>  osm_sm_mad_ctrl_bind(
>    IN osm_sm_mad_ctrl_t* const p_ctrl,
> @@ -1044,6 +1088,17 @@ osm_sm_mad_ctrl_bind(
>      goto Exit;
>    }
>  
> +  if ( osm_vendor_reg_events_cb(p_ctrl->p_vendor, 
> +                                __osm_vend_events_callback, 
> +                                p_ctrl) )
> +  {
> +     status = IB_ERROR;
> +     osm_log( p_ctrl->p_log, OSM_LOG_ERROR,
> +              "osm_sm_mad_ctrl_bind: ERR 3120: "
> +              "Vendor failed to register for events\n" );
> +     goto Exit;
> +  }
> +

This should be conditionalized on OSM_VENDOR_SUPPORT_EVENTS.

>   Exit:
>    OSM_LOG_EXIT( p_ctrl->p_log );
>    return( status );
> Index: config/osmvsel.m4
> ===================================================================
> -- config/osmvsel.m4	(revision 8998)
> +++ config/osmvsel.m4	(working copy)
> @@ -63,9 +63,9 @@ if test $with_osmv = "openib"; then
>     OSMV_CFLAGS="-DOSM_VENDOR_INTF_OPENIB"
>     OSMV_INCLUDES="-I\$(srcdir)/../include -I\$(srcdir)/../../libibcommon/include/infiniband -I\$(srcdir)/../../libibumad/include/infiniband"
>     if test "x$with_umad_libs" = "x"; then
> -     OSMV_LDADD="-libumad"
> +     OSMV_LDADD="-libumad -libverbs"
>     else
> -     OSMV_LDADD="-L$with_umad_libs -libumad"
> +     OSMV_LDADD="-L$with_umad_libs -libumad -libverbs"
>     fi
>  
>     if test "x$with_umad_includes" != "x"; then 
> @@ -137,6 +137,8 @@ if test "$disable_libcheck" != "yes"; th
>     LDFLAGS="$LDFLAGS $OSMV_LDADD"
>     AC_CHECK_LIB(ibumad, umad_init, [],
>  	 AC_MSG_ERROR([umad_init() not found. libosmvendor of type openib requires libibumad.]))
> +   AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
> +	 AC_MSG_ERROR([umad_init() not found. libosmvendor of type openib requires libibverbs.]))

Cut and paste error: Error message should indicate ibv_get_device_list
rather than umad_init.

>     LD_FLAGS=$osmv_save_ldflags
>   elif test $with_osmv = "sim" ; then
>     LDFLAGS="$LDFLAGS -L$with_sim/lib"

-- Hal





More information about the general mailing list