[openib-general] [PATCH] osm: handle local events
Hal Rosenstock
halr at voltaire.com
Tue Aug 22 14:25:40 PDT 2006
Hi again Yevgeny,
I've been working on integrating this patch and have some more comments
on it:
On Tue, 2006-08-22 at 13:35, Hal Rosenstock wrote:
> Hi Yevgeny,
>
> On Tue, 2006-08-22 at 11:41, Yevgeny Kliteynik wrote:
> > Hi Hal
> >
> > This patch implements first item of the OSM todo list.
>
> Thanks!
>
> Am I correct in assuming this is both for trunk and 1.1 ?
>
> > OpenSM opens a thread that is listening for events on the SM's port.
> > The events that are being taken care of are IBV_EVENT_DEVICE_FATAL and
> > IBV_EVENT_PORT_ERROR.
> >
> > In case of IBV_EVENT_DEVICE_FATAL, osm is forced to exit.
> > in case of IBV_EVENT_PORT_ERROR, osm initiates heavy sweep.
What (and how) were this tested ? How can these events be generated ?
> Some minor comments below. Let me know what you think. You don't have to
> resubmit for these.
>
> > Yevgeny
> >
> > Signed-off-by: Yevgeny Kliteynik <kliteyn at mellanox.co.il>
> >
> > Index: include/opensm/osm_sm_mad_ctrl.h
> > ===================================================================
> > -- include/opensm/osm_sm_mad_ctrl.h (revision 8998)
> > +++ include/opensm/osm_sm_mad_ctrl.h (working copy)
> > @@ -109,6 +109,7 @@ typedef struct _osm_sm_mad_ctrl
> > osm_mad_pool_t *p_mad_pool;
> > osm_vl15_t *p_vl15;
> > osm_vendor_t *p_vendor;
> > + struct _osm_state_mgr *p_state_mgr;
> > osm_bind_handle_t h_bind;
> > cl_plock_t *p_lock;
> > cl_dispatcher_t *p_disp;
> > @@ -130,6 +131,9 @@ typedef struct _osm_sm_mad_ctrl
> > * p_vendor
> > * Pointer to the vendor specific interfaces object.
> > *
> > +* p_state_mgr
> > +* Pointer to the state manager object.
> > +*
> > * h_bind
> > * Bind handle returned by the transport layer.
> > *
> > @@ -233,6 +237,7 @@ osm_sm_mad_ctrl_init(
> > IN osm_mad_pool_t* const p_mad_pool,
> > IN osm_vl15_t* const p_vl15,
> > IN osm_vendor_t* const p_vendor,
> > + IN struct _osm_state_mgr* const p_state_mgr,
> > IN osm_log_t* const p_log,
> > IN osm_stats_t* const p_stats,
> > IN cl_plock_t* const p_lock,
> > @@ -251,6 +256,9 @@ osm_sm_mad_ctrl_init(
> > * p_vendor
> > * [in] Pointer to the vendor specific interfaces object.
> > *
> > +* p_state_mgr
> > +* [in] Pointer to the state manager object.
> > +*
> > * p_log
> > * [in] Pointer to the log object.
> > *
> > Index: include/vendor/osm_vendor_ibumad.h
> > ===================================================================
> > -- include/vendor/osm_vendor_ibumad.h (revision 8998)
> > +++ include/vendor/osm_vendor_ibumad.h (working copy)
> > @@ -74,6 +74,8 @@ BEGIN_C_DECLS
> > #define OSM_UMAD_MAX_CAS 32
> > #define OSM_UMAD_MAX_PORTS_PER_CA 2
> >
> > +#define OSM_VENDOR_SUPPORT_EVENTS
> > +
>
> I prefer this as an additional flag turned on in the build for OpenIB.
I take that back. I think that this define should not be present at all
as this is a change to the vendor layer API. If so, then
libosmvendor.ver needs to be bumped.
Also, what about any of the other vendor layers supported ? They would
need at least some stub to get past the linking.
> > /* OpenIB gen2 doesn't support RMPP yet */
> >
> > /****s* OpenSM: Vendor UMAD/osm_ca_info_t
> > @@ -179,6 +181,10 @@ typedef struct _osm_vendor
> > int umad_port_id;
> > void *receiver;
> > int issmfd;
> > + cl_thread_t events_thread;
> > + void * events_callback;
> > + void * sm_context;
> > + struct ibv_context * ibv_context;
> > } osm_vendor_t;
> >
> > #define OSM_BIND_INVALID_HANDLE 0
> > Index: include/vendor/osm_vendor_api.h
> > ===================================================================
> > -- include/vendor/osm_vendor_api.h (revision 8998)
> > +++ include/vendor/osm_vendor_api.h (working copy)
> > @@ -526,6 +526,110 @@ osm_vendor_set_debug(
> > * SEE ALSO
> > *********/
> >
> > +#ifdef OSM_VENDOR_SUPPORT_EVENTS
> > +
> > +#define OSM_EVENT_FATAL 1
> > +#define OSM_EVENT_PORT_ERR 2
> > +
> > +/****s* OpenSM Vendor API/osm_vend_events_callback_t
> > +* NAME
> > +* osm_vend_events_callback_t
> > +*
> > +* DESCRIPTION
> > +* Function prototype for the vendor events callback.
> > +* The vendor layer calls this function on driver events.
> > +*
> > +* SYNOPSIS
> > +*/
> > +typedef void
> > +(*osm_vend_events_callback_t)(
> > + IN int events_mask,
> > + IN void * const context );
> > +/*
> > +* PARAMETERS
> > +* events_mask
> > +* [in] The received event(s).
> > +*
> > +* context
> > +* [in] Context supplied as the "sm_context" argument in
> > +* the osm_vendor_unreg_events_cb call
> > +*
> > +* RETURN VALUES
> > +* None.
> > +*
> > +* NOTES
> > +*
> > +* SEE ALSO
> > +* osm_vendor_reg_events_cb osm_vendor_unreg_events_cb
> > +*********/
> > +
> > +/****f* OpenSM Vendor API/osm_vendor_reg_events_cb
> > +* NAME
> > +* osm_vendor_reg_events_cb
> > +*
> > +* DESCRIPTION
> > +* Registers the events callback function and start the events
> > +* thread
> > +*
> > +* SYNOPSIS
> > +*/
> > +int
> > +osm_vendor_reg_events_cb(
> > + IN osm_vendor_t * const p_vend,
> > + IN void * const sm_callback,
> > + IN void * const sm_context);
> > +/*
> > +* PARAMETERS
> > +* p_vend
> > +* [in] vendor handle.
> > +*
> > +* sm_callback
> > +* [in] Callback function that should be called when
> > +* the event is received.
> > +*
> > +* sm_context
> > +* [in] Context supplied as the "context" argument in
> > +* the subsequenct calls to the sm_callback function
> > +*
> > +* RETURN VALUE
> > +* IB_SUCCESS if OK.
> > +*
> > +* NOTES
> > +*
> > +* SEE ALSO
> > +* osm_vend_events_callback_t osm_vendor_unreg_events_cb
> > +*********/
> > +
> > +/****f* OpenSM Vendor API/osm_vendor_unreg_events_cb
> > +* NAME
> > +* osm_vendor_unreg_events_cb
> > +*
> > +* DESCRIPTION
> > +* Un-Registers the events callback function and stops the events
> > +* thread
> > +*
> > +* SYNOPSIS
> > +*/
> > +void
> > +osm_vendor_unreg_events_cb(
> > + IN osm_vendor_t * const p_vend);
> > +/*
> > +* PARAMETERS
> > +* p_vend
> > +* [in] vendor handle.
> > +*
> > +*
> > +* RETURN VALUE
> > +* None.
> > +*
> > +* NOTES
> > +*
> > +* SEE ALSO
> > +* osm_vend_events_callback_t osm_vendor_reg_events_cb
> > +*********/
> > +
> > +#endif /* OSM_VENDOR_SUPPORT_EVENTS */
> > +
> > END_C_DECLS
> >
> > #endif /* _OSM_VENDOR_API_H_ */
> > Index: libvendor/osm_vendor_ibumad.c
> > ===================================================================
> > -- libvendor/osm_vendor_ibumad.c (revision 8998)
> > +++ libvendor/osm_vendor_ibumad.c (working copy)
> > @@ -72,6 +72,7 @@
> > #include <opensm/osm_log.h>
> > #include <opensm/osm_mad_pool.h>
> > #include <vendor/osm_vendor_api.h>
> > +#include <infiniband/verbs.h>
> >
> > /****s* OpenSM: Vendor AL/osm_umad_bind_info_t
> > * NAME
> > @@ -441,6 +442,91 @@ Exit:
> >
> > /**********************************************************************
> > **********************************************************************/
> > +static void
> > +umad_events_thread(
> > + IN void * vend_context)
> > +{
> > + int res = 0;
> > + osm_vendor_t * p_vend = (osm_vendor_t *) vend_context;
> > + struct ibv_async_event event;
> > +
> > + OSM_LOG_ENTER( p_vend->p_log, umad_events_thread );
> > +
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "umad_events_thread: Device %s, async event FD: %d\n",
> > + p_vend->umad_port.ca_name, p_vend->ibv_context->async_fd);
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "umad_events_thread: Listening for events on device %s, port %d\n",
> > + p_vend->umad_port.ca_name, p_vend->umad_port.portnum);
> > +
> > + while (1) {
> > +
> > + res = ibv_get_async_event(p_vend->ibv_context, &event);
> > + if (res)
> > + {
> > + osm_log(p_vend->p_log, OSM_LOG_ERROR,
> > + "umad_events_thread: ERR 5450: "
> > + "Failed getting async event (device %s, port %d)\n",
> > + p_vend->umad_port.ca_name, p_vend->umad_port.portnum);
> > + goto Exit;
> > + }
> > +
> > + if (!p_vend->events_callback)
> > + {
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "umad_events_thread: Events callback has been unregistered\n");
> > + ibv_ack_async_event(&event);
> > + goto Exit;
> > + }
> > + /*
> > + * We're listening to events on the SM's port only
> > + */
> > + if ( event.element.port_num == p_vend->umad_port.portnum )
> > + {
> > + switch (event.event_type)
> > + {
> > + case IBV_EVENT_DEVICE_FATAL:
> > + osm_log(p_vend->p_log, OSM_LOG_INFO,
> > + "umad_events_thread: Received IBV_EVENT_DEVICE_FATAL\n");
> > + ((osm_vend_events_callback_t)
> > + (p_vend->events_callback))(OSM_EVENT_FATAL, p_vend->sm_context);
> > +
> > + ibv_ack_async_event(&event);
> > + goto Exit;
> > + break;
> > +
> > + case IBV_EVENT_PORT_ERR:
> > + osm_log(p_vend->p_log, OSM_LOG_VERBOSE,
> > + "umad_events_thread: Received IBV_EVENT_PORT_ERR\n");
> > + ((osm_vend_events_callback_t)
> > + (p_vend->events_callback))(OSM_EVENT_PORT_ERR, p_vend->sm_context);
> > + break;
> > +
> > + default:
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "umad_events_thread: Received event #%d on port %d - Ignoring\n",
> > + event.event_type, event.element.port_num);
> > + }
> > + }
> > + else
> > + {
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "umad_events_thread: Received event #%d on port %d - Ignoring\n",
> > + event.event_type, event.element.port_num);
> > + }
> > +
> > + ibv_ack_async_event(&event);
> > + }
> > +
> > + Exit:
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "umad_events_thread: Terminating thread\n");
> > + OSM_LOG_EXIT(p_vend->p_log);
> > + return;
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > ib_api_status_t
> > osm_vendor_init(
> > IN osm_vendor_t* const p_vend,
> > @@ -456,6 +542,7 @@ osm_vendor_init(
> > p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT;
> > cl_spinlock_construct( &p_vend->cb_lock );
> > cl_spinlock_construct( &p_vend->match_tbl_lock );
> > + cl_thread_construct( &p_vend->events_thread );
> > p_vend->umad_port_id = -1;
> > p_vend->issmfd = -1;
> >
> > @@ -1217,4 +1304,114 @@ osm_vendor_set_debug(
> > umad_debug(level);
> > }
> >
> > +/**********************************************************************
> > + **********************************************************************/
> > +int
> > +osm_vendor_reg_events_cb(
> > + IN osm_vendor_t * const p_vend,
> > + IN void * const sm_callback,
> > + IN void * const sm_context)
> > +{
> > + ib_api_status_t status = IB_SUCCESS;
> > + struct ibv_device ** dev_list;
> > + struct ibv_device * device;
> > +
> > + OSM_LOG_ENTER( p_vend->p_log, osm_vendor_reg_events_cb );
> > +
> > + p_vend->events_callback = sm_callback;
> > + p_vend->sm_context = sm_context;
> > +
> > + dev_list = ibv_get_device_list(NULL);
> > + if (!dev_list || !(*dev_list)) {
> > + osm_log(p_vend->p_log, OSM_LOG_ERROR,
> > + "osm_vendor_reg_events_cb: ERR 5440: "
> > + "No IB devices found\n");
> > + status = IB_ERROR;
> > + goto Exit;
> > + }
> > +
> > + if (!p_vend->umad_port.ca_name || !p_vend->umad_port.ca_name[0])
> > + {
> > + osm_log(p_vend->p_log, OSM_LOG_ERROR,
> > + "osm_vendor_reg_events_cb: ERR 5441: "
> > + "Vendor initialization is not completed yet\n");
> > + status = IB_ERROR;
> > + goto Exit;
> > + }
> > +
> > + osm_log(p_vend->p_log, OSM_LOG_DEBUG,
> > + "osm_vendor_reg_events_cb: Registering on device %s\n",
> > + p_vend->umad_port.ca_name);
> > +
> > + /*
> > + * find device whos name matches the SM's device
> > + */
> > + for ( device = *dev_list;
> > + (device != NULL) &&
> > + (strcmp(p_vend->umad_port.ca_name, ibv_get_device_name(device)) != 0);
> > + device += sizeof(struct ibv_device *) )
> > + ;
> > + if (!device)
> > + {
> > + osm_log(p_vend->p_log, OSM_LOG_ERROR,
> > + "osm_vendor_reg_events_cb: ERR 5442: "
> > + "Device %s hasn't been found in the device list\n"
> > + ,p_vend->umad_port.ca_name);
> > + status = IB_ERROR;
> > + goto Exit;
> > + }
> > +
> > + p_vend->ibv_context = ibv_open_device(device);
> > + if (!p_vend->ibv_context) {
> > + osm_log(p_vend->p_log, OSM_LOG_ERROR,
> > + "osm_vendor_reg_events_cb: ERR 5443: "
> > + "Couldn't get context for %s\n",
> > + p_vend->umad_port.ca_name);
> > + status = IB_ERROR;
> > + goto Exit;
> > + }
> > +
> > + /*
> > + * Initiate the events thread
> > + */
> > + if (cl_thread_init(&p_vend->events_thread,
> > + umad_events_thread,
> > + p_vend,
> > + "ibumad events thread") != CL_SUCCESS) {
> > + osm_log(p_vend->p_log, OSM_LOG_ERROR,
> > + "osm_vendor_reg_events_cb: ERR 5444: "
> > + "Failed initiating event listening thread\n");
> > + status = IB_ERROR;
> > + goto Exit;
> > + }
> > +
> > + Exit:
> > + if (status != IB_SUCCESS)
> > + {
> > + p_vend->events_callback = NULL;
> > + p_vend->sm_context = NULL;
> > + p_vend->ibv_context = NULL;
> > + p_vend->events_callback = NULL;
> > + }
> > + OSM_LOG_EXIT( p_vend->p_log );
> > + return status;
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > +void
> > +osm_vendor_unreg_events_cb(
> > + IN osm_vendor_t * const p_vend)
> > +{
> > + OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unreg_events_cb );
> > + p_vend->events_callback = NULL;
> > + p_vend->sm_context = NULL;
> > + p_vend->ibv_context = NULL;
> > + p_vend->events_callback = NULL;
> > + OSM_LOG_EXIT( p_vend->p_log );
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > +
> > #endif /* OSM_VENDOR_INTF_OPENIB */
> > Index: libvendor/libosmvendor.map
> > ===================================================================
> > -- libvendor/libosmvendor.map (revision 8998)
> > +++ libvendor/libosmvendor.map (working copy)
> > @@ -1,4 +1,4 @@
> > -OSMVENDOR_2.0 {
> > +OSMVENDOR_2.1 {
> > global:
> > umad_receiver;
> > osm_vendor_init;
> > @@ -23,5 +23,7 @@ OSMVENDOR_2.0 {
> > osmv_bind_sa;
> > osmv_query_sa;
> > osm_vendor_get_guid_ca_and_port;
> > + osm_vendor_reg_events_cb;
> > + osm_vendor_unreg_events_cb;
These are not present for all vendor layers.
> > local: *;
> > };
> > Index: opensm/osm_sm.c
> > ===================================================================
> > -- opensm/osm_sm.c (revision 8998)
> > +++ opensm/osm_sm.c (working copy)
> > @@ -313,6 +313,7 @@ osm_sm_init(
> > p_sm->p_mad_pool,
> > p_sm->p_vl15,
> > p_sm->p_vendor,
> > + &p_sm->state_mgr,
> > p_log, p_stats, p_lock, p_disp );
> > if( status != IB_SUCCESS )
> > goto Exit;
> > Index: opensm/osm_sm_mad_ctrl.c
> > ===================================================================
> > -- opensm/osm_sm_mad_ctrl.c (revision 8998)
> > +++ opensm/osm_sm_mad_ctrl.c (working copy)
> > @@ -59,6 +59,7 @@
> > #include <opensm/osm_msgdef.h>
> > #include <opensm/osm_helper.h>
> > #include <opensm/osm_opensm.h>
> > +#include <opensm/osm_state_mgr.h>
> >
> > /****f* opensm: SM/__osm_sm_mad_ctrl_retire_trans_mad
> > * NAME
> > @@ -953,6 +954,7 @@ osm_sm_mad_ctrl_init(
> > IN osm_mad_pool_t* const p_mad_pool,
> > IN osm_vl15_t* const p_vl15,
> > IN osm_vendor_t* const p_vendor,
> > + IN struct _osm_state_mgr* const p_state_mgr,
> > IN osm_log_t* const p_log,
> > IN osm_stats_t* const p_stats,
> > IN cl_plock_t* const p_lock,
> > @@ -969,6 +971,7 @@ osm_sm_mad_ctrl_init(
> > p_ctrl->p_disp = p_disp;
> > p_ctrl->p_mad_pool = p_mad_pool;
> > p_ctrl->p_vendor = p_vendor;
> > + p_ctrl->p_state_mgr = p_state_mgr;
> > p_ctrl->p_stats = p_stats;
> > p_ctrl->p_lock = p_lock;
> > p_ctrl->p_vl15 = p_vl15;
> > @@ -995,6 +998,47 @@ osm_sm_mad_ctrl_init(
> >
> > /**********************************************************************
> > **********************************************************************/
> > +void
> > +__osm_vend_events_callback(
> > + IN int events_mask,
> > + IN void * const context )
>
> Shouldn't this be conditionalized on OSM_VENDOR_SUPPORT_EVENTS ?
> > +{
> > + osm_sm_mad_ctrl_t * const p_ctrl = (osm_sm_mad_ctrl_t * const) context;
> > +
> > + OSM_LOG_ENTER(p_ctrl->p_log, __osm_vend_events_callback);
> > +
> > + if (events_mask & OSM_EVENT_FATAL)
> > + {
> > + osm_log(p_ctrl->p_log, OSM_LOG_INFO,
> > + "__osm_vend_events_callback: "
> > + "Events callback got OSM_EVENT_FATAL\n");
> > + osm_log(p_ctrl->p_log, OSM_LOG_SYS,
> > + "Fatal HCA error - forcing OpenSM exit\n");
> > + osm_exit_flag = 1;
> > + OSM_LOG_EXIT(p_ctrl->p_log);
> > + return;
> > + }
> > +
> > + if (events_mask & OSM_EVENT_PORT_ERR)
> > + {
> > + osm_log(p_ctrl->p_log, OSM_LOG_INFO,
> > + "__osm_vend_events_callback: "
> > + "Events callback got OSM_EVENT_PORT_ERR - forcing heavy sweep\n");
> > + p_ctrl->p_subn->force_immediate_heavy_sweep = TRUE;
> > + osm_state_mgr_process((osm_state_mgr_t * const)p_ctrl->p_state_mgr,
> > + OSM_SIGNAL_SWEEP);
> > + OSM_LOG_EXIT(p_ctrl->p_log);
> > + return;
> > + }
> > +
> > + osm_log(p_ctrl->p_log, OSM_LOG_INFO,
> > + "__osm_vend_events_callback: "
> > + "Events callback got event mask of %d - No action taken\n");
> > + OSM_LOG_EXIT(p_ctrl->p_log);
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > ib_api_status_t
> > osm_sm_mad_ctrl_bind(
> > IN osm_sm_mad_ctrl_t* const p_ctrl,
> > @@ -1044,6 +1088,17 @@ osm_sm_mad_ctrl_bind(
> > goto Exit;
> > }
> >
> > + if ( osm_vendor_reg_events_cb(p_ctrl->p_vendor,
> > + __osm_vend_events_callback,
> > + p_ctrl) )
> > + {
Is an osm_vendor_unbind needed here or is this handled elsewhere ?
> > + status = IB_ERROR;
> > + osm_log( p_ctrl->p_log, OSM_LOG_ERROR,
> > + "osm_sm_mad_ctrl_bind: ERR 3120: "
> > + "Vendor failed to register for events\n" );
> > + goto Exit;
> > + }
> > +
Also, should osm_m_mad_ctrl_unbind unregister the events callback ?
-- Hal
> This should be conditionalized on OSM_VENDOR_SUPPORT_EVENTS.
>
> > Exit:
> > OSM_LOG_EXIT( p_ctrl->p_log );
> > return( status );
> > Index: config/osmvsel.m4
> > ===================================================================
> > -- config/osmvsel.m4 (revision 8998)
> > +++ config/osmvsel.m4 (working copy)
> > @@ -63,9 +63,9 @@ if test $with_osmv = "openib"; then
> > OSMV_CFLAGS="-DOSM_VENDOR_INTF_OPENIB"
> > OSMV_INCLUDES="-I\$(srcdir)/../include -I\$(srcdir)/../../libibcommon/include/infiniband -I\$(srcdir)/../../libibumad/include/infiniband"
> > if test "x$with_umad_libs" = "x"; then
> > - OSMV_LDADD="-libumad"
> > + OSMV_LDADD="-libumad -libverbs"
> > else
> > - OSMV_LDADD="-L$with_umad_libs -libumad"
> > + OSMV_LDADD="-L$with_umad_libs -libumad -libverbs"
> > fi
> >
> > if test "x$with_umad_includes" != "x"; then
> > @@ -137,6 +137,8 @@ if test "$disable_libcheck" != "yes"; th
> > LDFLAGS="$LDFLAGS $OSMV_LDADD"
> > AC_CHECK_LIB(ibumad, umad_init, [],
> > AC_MSG_ERROR([umad_init() not found. libosmvendor of type openib requires libibumad.]))
> > + AC_CHECK_LIB(ibverbs, ibv_get_device_list, [],
> > + AC_MSG_ERROR([umad_init() not found. libosmvendor of type openib requires libibverbs.]))
>
> Cut and paste error: Error message should indicate ibv_get_device_list
> rather than umad_init.
>
> > LD_FLAGS=$osmv_save_ldflags
> > elif test $with_osmv = "sim" ; then
> > LDFLAGS="$LDFLAGS -L$with_sim/lib"
>
> -- Hal
>
>
> _______________________________________________
> openib-general mailing list
> openib-general at openib.org
> http://openib.org/mailman/listinfo/openib-general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>
More information about the general
mailing list