[openib-general] [PATCH] Opensm - handling immediate error in vendor_send

Yael Kalka yael at mellanox.co.il
Sun Oct 9 04:18:23 PDT 2005


Hi Hal,

During our tests on Windows we encountered an issue that originates from
a problem in the lower layer, but which causes a problem in opensm.
If the osm_vendor_send call fails immediately, we need to update
several counters (currently, only qp0_mads_sent is decremented), and
also call the dispatcher if we reached qp0_mads_outstanding == 0 (in
order to signal the state mgr).
What we saw was that these counters weren't decremented, and thus the
state mgr wasn't signalled, and the opensm didn't proceed in
traversing through its stages.
The following patch updates the relevant counters, and calls the
dispatcher, if necessary.

Thanks,
Yael

Signed-off-by:  Yael Kalka <yael at mellanox.co.il>

Index: include/opensm/osm_vl15intf.h
===================================================================
--- include/opensm/osm_vl15intf.h	(revision 3703)
+++ include/opensm/osm_vl15intf.h	(working copy)
@@ -60,6 +60,7 @@
 #include <opensm/osm_madw.h>
 #include <opensm/osm_mad_pool.h>
 #include <vendor/osm_vendor.h>
+#include <opensm/osm_subnet.h>
 
 #ifdef __cplusplus
 #  define BEGIN_C_DECLS extern "C" {
@@ -137,6 +138,8 @@ typedef struct _osm_vl15
 	osm_vendor_t					*p_vend;
 	osm_log_t						*p_log;
 	osm_stats_t						*p_stats;
+	osm_subn_t						*p_subn;
+	cl_disp_reg_handle_t			h_disp;
 
 } osm_vl15_t;
 /*
@@ -176,6 +179,12 @@ typedef struct _osm_vl15
 *	p_stats
 *		Pointer to the OpenSM statistics block.
 *
+*  p_subn
+*     Pointer to the Subnet object for this subnet.
+*
+*  h_disp
+*    Handle returned from dispatcher registration.
+*
 * SEE ALSO
 *	VL15 object
 *********/
@@ -265,7 +274,9 @@ osm_vl15_init(
 	IN osm_vendor_t* const p_vend,
 	IN osm_log_t* const p_log,
 	IN osm_stats_t* const p_stats,
-	IN const int32_t max_wire_smps );
+	IN const int32_t max_wire_smps,
+	IN osm_subn_t* const p_subn,
+	IN cl_dispatcher_t* const p_disp );
 /*
 * PARAMETERS
 *	p_vl15
@@ -283,6 +294,12 @@ osm_vl15_init(
 *	max_wire_smps
 *		[in] Maximum number of MADs allowed on the wire at one time.
 *
+*  p_subn
+*     [in] Pointer to the subnet object.
+*
+*  p_disp
+*     [in] Pointer to the dispatcher object.
+*
 * RETURN VALUES
 *	IB_SUCCESS if the VL15 object was initialized successfully.
 *
Index: opensm/osm_opensm.c
===================================================================
--- opensm/osm_opensm.c	(revision 3703)
+++ opensm/osm_opensm.c	(working copy)
@@ -257,7 +257,7 @@ osm_opensm_init(
 
    status = osm_vl15_init( &p_osm->vl15,
                            p_osm->p_vendor,
-                           &p_osm->log, &p_osm->stats, p_opt->max_wire_smps );
+                           &p_osm->log, &p_osm->stats, p_opt->max_wire_smps, &p_osm->subn, &p_osm->disp );
    if( status != IB_SUCCESS )
       goto Exit;
 
Index: opensm/osm_vl15intf.c
===================================================================
--- opensm/osm_vl15intf.c	(revision 3703)
+++ opensm/osm_vl15intf.c	(working copy)
@@ -157,6 +157,8 @@ __osm_vl15_poller(
 
       if( status != IB_SUCCESS )
       {
+        uint32_t outstanding;
+        cl_status_t cl_status;
         osm_log( p_vl->p_log, OSM_LOG_ERROR,
                  "__osm_vl15_poller: ERR 3E03: "
                  "MAD send failed (%s).\n",
@@ -166,7 +168,64 @@ __osm_vl15_poller(
           The MAD was never successfully sent, so
           fix up the pre-incremented count values.
         */
+        /* Decrement qp0_mads_sent and qp0_mads_outstanding_on_wire
+           that was incremented in the code above. */
         mads_sent = cl_atomic_dec( &p_vl->p_stats->qp0_mads_sent );
+        if( p_madw->resp_expected == TRUE ) 
+          if ( !&p_vl->p_stats->qp0_mads_outstanding_on_wire ) 
+            osm_log( p_vl->p_log, OSM_LOG_ERROR, 
+                     "__osm_vl15_poller: ERR 3E04: " 
+                     "Trying to dec qp0_mads_outstanding_on_wire=0. " 
+                     "Problem with transaction mgr!\n"); 
+          else 
+            cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding_on_wire ); 
+
+        /* The following code is similar to the one in 
+           __osm_sm_mad_ctrl_retire_trans_mad. We need to decrement the 
+           qp0_mads_outstanding counter, and if we reached 0 - need to call
+           the cl_disp_post with OSM_SIGNAL_NO_PENDING_TRANSACTION (in order
+           to wake up the state mgr). */
+        if ( !&p_vl->p_stats->qp0_mads_outstanding )
+          osm_log( p_vl->p_log, OSM_LOG_ERROR,
+                    "__osm_vl15_poller: ERR 3E05: "
+                   "Trying to dec qp0_mads_outstanding=0. "
+                   "Problem with transaction mgr!\n");
+        else
+          outstanding = cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding );
+        
+        osm_log( p_vl->p_log, OSM_LOG_DEBUG,
+                 "__osm_vl15_poller: "
+                 "%u(%u) QP0 MADs outstanding.\n",
+                 p_vl->p_stats->qp0_mads_outstanding,outstanding );
+        
+        if( outstanding == 0 )
+        {
+          /*
+            The wire is clean.
+            Signal the state manager.
+          */
+          if( osm_log_is_active( p_vl->p_log, OSM_LOG_DEBUG ) )
+          {
+            osm_log( p_vl->p_log, OSM_LOG_DEBUG,
+                     "__osm_vl15_poller: "
+                     "Posting Dispatcher message %s.\n",
+                     osm_get_disp_msg_str( OSM_MSG_NO_SMPS_OUTSTANDING ) );
+          }
+          
+          cl_status = cl_disp_post( p_vl->h_disp,
+                                    OSM_MSG_NO_SMPS_OUTSTANDING,
+                                    (void *)OSM_SIGNAL_NO_PENDING_TRANSACTIONS,
+                                    NULL,
+                                    NULL );
+          
+          if( cl_status != CL_SUCCESS )
+          {
+            osm_log( p_vl->p_log, OSM_LOG_ERROR,
+                     "__osm_vl15_poller: ERR 3E06: "
+                     "Dispatcher post message failed (%s).\n",
+                     CL_STATUS_MSG( cl_status ) );
+          }
+        }
       }
       else
       {
@@ -232,6 +291,7 @@ osm_vl15_construct(
   cl_qlist_init( &p_vl->rfifo );
   cl_qlist_init( &p_vl->ufifo );
   cl_thread_construct( &p_vl->poller );
+  p_vl->h_disp = CL_DISP_INVALID_HANDLE;
 }
 
 /**********************************************************************
@@ -281,6 +341,8 @@ osm_vl15_destroy(
   p_vl->state = OSM_VL15_STATE_INIT;
   cl_spinlock_destroy( &p_vl->lock );
 
+  cl_disp_unregister( p_vl->h_disp );
+
   OSM_LOG_EXIT( p_vl->p_log );
 }
 
@@ -292,7 +354,10 @@ osm_vl15_init(
   IN osm_vendor_t* const p_vend,
   IN osm_log_t* const p_log,
   IN osm_stats_t* const p_stats,
-  IN const int32_t max_wire_smps )
+  IN const int32_t max_wire_smps,
+  IN osm_subn_t* const p_subn,
+  IN cl_dispatcher_t* const p_disp
+ )
 {
   ib_api_status_t status = IB_SUCCESS;
   OSM_LOG_ENTER( p_log, osm_vl15_init );
@@ -301,6 +366,7 @@ osm_vl15_init(
   p_vl->p_log = p_log;
   p_vl->p_stats = p_stats;
   p_vl->max_wire_smps = max_wire_smps;
+  p_vl->p_subn = p_subn;
 
   status = cl_event_init( &p_vl->signal, FALSE );
   if( status != IB_SUCCESS )
@@ -321,6 +387,21 @@ osm_vl15_init(
   if( status != IB_SUCCESS )
     goto Exit;
 
+  p_vl->h_disp = cl_disp_register(
+    p_disp,
+    CL_DISP_MSGID_NONE,
+    NULL,
+    NULL );
+
+  if( p_vl->h_disp == CL_DISP_INVALID_HANDLE )
+  {
+    osm_log( p_log, OSM_LOG_ERROR,
+             "osm_vl15_init: ERR 3E01: "
+             "Dispatcher registration failed.\n" );
+    status = IB_INSUFFICIENT_RESOURCES;
+    goto Exit;
+  }
+
  Exit:
   OSM_LOG_EXIT( p_log );
   return( status );




More information about the general mailing list