[openib-general] [PATCH] Opensm - handling immediate error in vendor_send new
Yael Kalka
yael at mellanox.co.il
Tue Oct 11 01:28:31 PDT 2005
Hi Hal,
Attached is a new patch with several fixes for this issue.
I decided to remove the check for zero in the atomic_dec after all,
since, as I mentioned before, clearing the counter is not a fix, and we
will still see its value in other messages in the log file.
Thanks,
Yael
Signed-off-by: Yael Kalka <yael at mellanox.co.il>
Index: include/opensm/osm_vl15intf.h
===================================================================
--- include/opensm/osm_vl15intf.h (revision 3704)
+++ include/opensm/osm_vl15intf.h (working copy)
@@ -55,11 +55,13 @@
#include <complib/cl_event.h>
#include <complib/cl_thread.h>
#include <complib/cl_qlist.h>
+#include <complib/cl_passivelock.h>
#include <opensm/osm_stats.h>
#include <opensm/osm_log.h>
#include <opensm/osm_madw.h>
#include <opensm/osm_mad_pool.h>
#include <vendor/osm_vendor.h>
+#include <opensm/osm_subnet.h>
#ifdef __cplusplus
# define BEGIN_C_DECLS extern "C" {
@@ -137,6 +139,9 @@ typedef struct _osm_vl15
osm_vendor_t *p_vend;
osm_log_t *p_log;
osm_stats_t *p_stats;
+ osm_subn_t *p_subn;
+ cl_disp_reg_handle_t h_disp;
+ cl_plock_t *p_lock;
} osm_vl15_t;
/*
@@ -176,6 +181,15 @@ typedef struct _osm_vl15
* p_stats
* Pointer to the OpenSM statistics block.
*
+* p_subn
+* Pointer to the Subnet object for this subnet.
+*
+* h_disp
+* Handle returned from dispatcher registration.
+*
+* p_lock
+* Pointer to the serializing lock.
+*
* SEE ALSO
* VL15 object
*********/
@@ -265,7 +279,10 @@ osm_vl15_init(
IN osm_vendor_t* const p_vend,
IN osm_log_t* const p_log,
IN osm_stats_t* const p_stats,
- IN const int32_t max_wire_smps );
+ IN const int32_t max_wire_smps,
+ IN osm_subn_t* const p_subn,
+ IN cl_dispatcher_t* const p_disp,
+ IN cl_plock_t* const p_lock );
/*
* PARAMETERS
* p_vl15
@@ -283,6 +300,15 @@ osm_vl15_init(
* max_wire_smps
* [in] Maximum number of MADs allowed on the wire at one time.
*
+* p_subn
+* [in] Pointer to the subnet object.
+*
+* p_disp
+* [in] Pointer to the dispatcher object.
+*
+* p_lock
+* [in] Pointer to the OpenSM serializing lock.
+*
* RETURN VALUES
* IB_SUCCESS if the VL15 object was initialized successfully.
*
Index: opensm/osm_opensm.c
===================================================================
--- opensm/osm_opensm.c (revision 3704)
+++ opensm/osm_opensm.c (working copy)
@@ -257,7 +257,8 @@ osm_opensm_init(
status = osm_vl15_init( &p_osm->vl15,
p_osm->p_vendor,
- &p_osm->log, &p_osm->stats, p_opt->max_wire_smps );
+ &p_osm->log, &p_osm->stats, p_opt->max_wire_smps,
+ &p_osm->subn, &p_osm->disp, &p_osm->lock );
if( status != IB_SUCCESS )
goto Exit;
Index: opensm/osm_vl15intf.c
===================================================================
--- opensm/osm_vl15intf.c (revision 3704)
+++ opensm/osm_vl15intf.c (working copy)
@@ -157,6 +157,8 @@ __osm_vl15_poller(
if( status != IB_SUCCESS )
{
+ uint32_t outstanding;
+ cl_status_t cl_status;
osm_log( p_vl->p_log, OSM_LOG_ERROR,
"__osm_vl15_poller: ERR 3E03: "
"MAD send failed (%s).\n",
@@ -166,7 +168,69 @@ __osm_vl15_poller(
The MAD was never successfully sent, so
fix up the pre-incremented count values.
*/
+ /* Decrement qp0_mads_sent and qp0_mads_outstanding_on_wire,
+ which were incremented in the code above. */
mads_sent = cl_atomic_dec( &p_vl->p_stats->qp0_mads_sent );
+ if( p_madw->resp_expected == TRUE )
+ cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding_on_wire );
+
+ /*
+ The following code is similar to the code in
+ __osm_sm_mad_ctrl_retire_trans_mad. We need to decrement the
+ qp0_mads_outstanding counter and, if it reaches 0, call
+ cl_disp_post with OSM_SIGNAL_NO_PENDING_TRANSACTIONS (in order
+ to wake up the state mgr).
+ */
+ cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding );
+
+ osm_log( p_vl->p_log, OSM_LOG_DEBUG,
+ "__osm_vl15_poller: "
+ "%u QP0 MADs outstanding.\n",
+ p_vl->p_stats->qp0_mads_outstanding );
+
+ /*
+ Acquire the lock non-exclusively.
+ Other modules that send MADs grab this lock exclusively.
+ These modules that are in the process of sending MADs
+ will hold the lock until they finish posting all the MADs
+ they plan to send. While the other module is sending MADs
+ the outstanding count may temporarily go to zero.
+ Thus, by grabbing the lock ourselves, we get an accurate
+ view of whether or not the number of outstanding MADs is
+ really zero.
+ */
+ CL_PLOCK_ACQUIRE( p_vl->p_lock );
+ outstanding = p_vl->p_stats->qp0_mads_outstanding;
+ CL_PLOCK_RELEASE( p_vl->p_lock );
+
+ if( outstanding == 0 )
+ {
+ /*
+ The wire is clean.
+ Signal the state manager.
+ */
+ if( osm_log_is_active( p_vl->p_log, OSM_LOG_DEBUG ) )
+ {
+ osm_log( p_vl->p_log, OSM_LOG_DEBUG,
+ "__osm_vl15_poller: "
+ "Posting Dispatcher message %s.\n",
+ osm_get_disp_msg_str( OSM_MSG_NO_SMPS_OUTSTANDING ) );
+ }
+
+ cl_status = cl_disp_post( p_vl->h_disp,
+ OSM_MSG_NO_SMPS_OUTSTANDING,
+ (void *)OSM_SIGNAL_NO_PENDING_TRANSACTIONS,
+ NULL,
+ NULL );
+
+ if( cl_status != CL_SUCCESS )
+ {
+ osm_log( p_vl->p_log, OSM_LOG_ERROR,
+ "__osm_vl15_poller: ERR 3E06: "
+ "Dispatcher post message failed (%s).\n",
+ CL_STATUS_MSG( cl_status ) );
+ }
+ }
}
else
{
@@ -232,6 +296,7 @@ osm_vl15_construct(
cl_qlist_init( &p_vl->rfifo );
cl_qlist_init( &p_vl->ufifo );
cl_thread_construct( &p_vl->poller );
+ p_vl->h_disp = CL_DISP_INVALID_HANDLE;
}
/**********************************************************************
@@ -281,6 +346,8 @@ osm_vl15_destroy(
p_vl->state = OSM_VL15_STATE_INIT;
cl_spinlock_destroy( &p_vl->lock );
+ cl_disp_unregister( p_vl->h_disp );
+
OSM_LOG_EXIT( p_vl->p_log );
}
@@ -292,7 +359,11 @@ osm_vl15_init(
IN osm_vendor_t* const p_vend,
IN osm_log_t* const p_log,
IN osm_stats_t* const p_stats,
- IN const int32_t max_wire_smps )
+ IN const int32_t max_wire_smps,
+ IN osm_subn_t* const p_subn,
+ IN cl_dispatcher_t* const p_disp,
+ IN cl_plock_t* const p_lock
+ )
{
ib_api_status_t status = IB_SUCCESS;
OSM_LOG_ENTER( p_log, osm_vl15_init );
@@ -301,6 +372,8 @@ osm_vl15_init(
p_vl->p_log = p_log;
p_vl->p_stats = p_stats;
p_vl->max_wire_smps = max_wire_smps;
+ p_vl->p_subn = p_subn;
+ p_vl->p_lock = p_lock;
status = cl_event_init( &p_vl->signal, FALSE );
if( status != IB_SUCCESS )
@@ -321,6 +394,21 @@ osm_vl15_init(
if( status != IB_SUCCESS )
goto Exit;
+ p_vl->h_disp = cl_disp_register(
+ p_disp,
+ CL_DISP_MSGID_NONE,
+ NULL,
+ NULL );
+
+ if( p_vl->h_disp == CL_DISP_INVALID_HANDLE )
+ {
+ osm_log( p_log, OSM_LOG_ERROR,
+ "osm_vl15_init: ERR 3E01: "
+ "Dispatcher registration failed.\n" );
+ status = IB_INSUFFICIENT_RESOURCES;
+ goto Exit;
+ }
+
Exit:
OSM_LOG_EXIT( p_log );
return( status );
More information about the general
mailing list