[openib-general] [PATCH] Opensm - handling immediate error in vendor_send new

Yael Kalka yael at mellanox.co.il
Tue Oct 11 01:28:31 PDT 2005


Hi Hal,

Attached is a new patch with several fixes for this issue.
I decided to remove the check for zero in the atomic_dec after all
since, as I mentioned before, clearing it is not a fix, and we will
still see the value in other entries in the log file.

Thanks,
Yael

Signed-off-by:  Yael Kalka <yael at mellanox.co.il>

Index: include/opensm/osm_vl15intf.h
===================================================================
--- include/opensm/osm_vl15intf.h	(revision 3704)
+++ include/opensm/osm_vl15intf.h	(working copy)
@@ -55,11 +55,13 @@
 #include <complib/cl_event.h>
 #include <complib/cl_thread.h>
 #include <complib/cl_qlist.h>
+#include <complib/cl_passivelock.h>
 #include <opensm/osm_stats.h>
 #include <opensm/osm_log.h>
 #include <opensm/osm_madw.h>
 #include <opensm/osm_mad_pool.h>
 #include <vendor/osm_vendor.h>
+#include <opensm/osm_subnet.h>
 
 #ifdef __cplusplus
 #  define BEGIN_C_DECLS extern "C" {
@@ -137,6 +139,9 @@ typedef struct _osm_vl15
 	osm_vendor_t					*p_vend;
 	osm_log_t						*p_log;
 	osm_stats_t						*p_stats;
+	osm_subn_t						*p_subn;
+	cl_disp_reg_handle_t			h_disp;
+	cl_plock_t						*p_lock;
 
 } osm_vl15_t;
 /*
@@ -176,6 +181,15 @@ typedef struct _osm_vl15
 *	p_stats
 *		Pointer to the OpenSM statistics block.
 *
+*  p_subn
+*     Pointer to the Subnet object for this subnet.
+*
+*  h_disp
+*    Handle returned from dispatcher registration.
+*
+*	p_lock
+*		Pointer to the serializing lock.
+*
 * SEE ALSO
 *	VL15 object
 *********/
@@ -265,7 +279,10 @@ osm_vl15_init(
 	IN osm_vendor_t* const p_vend,
 	IN osm_log_t* const p_log,
 	IN osm_stats_t* const p_stats,
-	IN const int32_t max_wire_smps );
+	IN const int32_t max_wire_smps,
+	IN osm_subn_t* const p_subn,
+	IN cl_dispatcher_t* const p_disp,
+	IN cl_plock_t* const p_lock );
 /*
 * PARAMETERS
 *	p_vl15
@@ -283,6 +300,15 @@ osm_vl15_init(
 *	max_wire_smps
 *		[in] Maximum number of MADs allowed on the wire at one time.
 *
+*  p_subn
+*     [in] Pointer to the subnet object.
+*
+*  p_disp
+*     [in] Pointer to the dispatcher object.
+*
+*	p_lock
+*		[in] Pointer to the OpenSM serializing lock.
+*
 * RETURN VALUES
 *	IB_SUCCESS if the VL15 object was initialized successfully.
 *
Index: opensm/osm_opensm.c
===================================================================
--- opensm/osm_opensm.c	(revision 3704)
+++ opensm/osm_opensm.c	(working copy)
@@ -257,7 +257,8 @@ osm_opensm_init(
 
    status = osm_vl15_init( &p_osm->vl15,
                            p_osm->p_vendor,
-                           &p_osm->log, &p_osm->stats, p_opt->max_wire_smps );
+                           &p_osm->log, &p_osm->stats, p_opt->max_wire_smps, 
+                           &p_osm->subn, &p_osm->disp, &p_osm->lock );
    if( status != IB_SUCCESS )
       goto Exit;
 
Index: opensm/osm_vl15intf.c
===================================================================
--- opensm/osm_vl15intf.c	(revision 3704)
+++ opensm/osm_vl15intf.c	(working copy)
@@ -157,6 +157,8 @@ __osm_vl15_poller(
 
       if( status != IB_SUCCESS )
       {
+        uint32_t outstanding;
+        cl_status_t cl_status;
         osm_log( p_vl->p_log, OSM_LOG_ERROR,
                  "__osm_vl15_poller: ERR 3E03: "
                  "MAD send failed (%s).\n",
@@ -166,7 +168,69 @@ __osm_vl15_poller(
           The MAD was never successfully sent, so
           fix up the pre-incremented count values.
         */
+        /* Decrement qp0_mads_sent and qp0_mads_outstanding_on_wire,
+           which were incremented in the code above. */
         mads_sent = cl_atomic_dec( &p_vl->p_stats->qp0_mads_sent );
+        if( p_madw->resp_expected == TRUE )
+          cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding_on_wire );
+
+        /*
+           The following code is similar to the code in
+           __osm_sm_mad_ctrl_retire_trans_mad. We need to decrement the
+           qp0_mads_outstanding counter and, if it reaches 0, call
+           cl_disp_post with OSM_SIGNAL_NO_PENDING_TRANSACTIONS (in order
+           to wake up the state mgr).
+        */
+        cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding );
+        
+        osm_log( p_vl->p_log, OSM_LOG_DEBUG,
+                 "__osm_vl15_poller: "
+                 "%u QP0 MADs outstanding.\n",
+                 p_vl->p_stats->qp0_mads_outstanding );
+        
+        /*
+          Acquire the lock non-exclusively.
+          Other modules that send MADs grab this lock exclusively.
+          These modules that are in the process of sending MADs
+          will hold the lock until they finish posting all the MADs
+          they plan to send.  While the other module is sending MADs
+          the outstanding count may temporarily go to zero.
+          Thus, by grabbing the lock ourselves, we get an accurate
+          view of whether or not the number of outstanding MADs is
+          really zero.
+        */
+        CL_PLOCK_ACQUIRE( p_vl->p_lock );
+        outstanding = p_vl->p_stats->qp0_mads_outstanding;
+        CL_PLOCK_RELEASE( p_vl->p_lock );
+
+        if( outstanding == 0 )
+        {
+          /*
+            The wire is clean.
+            Signal the state manager.
+          */
+          if( osm_log_is_active( p_vl->p_log, OSM_LOG_DEBUG ) )
+          {
+            osm_log( p_vl->p_log, OSM_LOG_DEBUG,
+                     "__osm_vl15_poller: "
+                     "Posting Dispatcher message %s.\n",
+                     osm_get_disp_msg_str( OSM_MSG_NO_SMPS_OUTSTANDING ) );
+          }
+          
+          cl_status = cl_disp_post( p_vl->h_disp,
+                                    OSM_MSG_NO_SMPS_OUTSTANDING,
+                                    (void *)OSM_SIGNAL_NO_PENDING_TRANSACTIONS,
+                                    NULL,
+                                    NULL );
+          
+          if( cl_status != CL_SUCCESS )
+          {
+            osm_log( p_vl->p_log, OSM_LOG_ERROR,
+                     "__osm_vl15_poller: ERR 3E06: "
+                     "Dispatcher post message failed (%s).\n",
+                     CL_STATUS_MSG( cl_status ) );
+          }
+        }
       }
       else
       {
@@ -232,6 +296,7 @@ osm_vl15_construct(
   cl_qlist_init( &p_vl->rfifo );
   cl_qlist_init( &p_vl->ufifo );
   cl_thread_construct( &p_vl->poller );
+  p_vl->h_disp = CL_DISP_INVALID_HANDLE;
 }
 
 /**********************************************************************
@@ -281,6 +346,8 @@ osm_vl15_destroy(
   p_vl->state = OSM_VL15_STATE_INIT;
   cl_spinlock_destroy( &p_vl->lock );
 
+  cl_disp_unregister( p_vl->h_disp );
+
   OSM_LOG_EXIT( p_vl->p_log );
 }
 
@@ -292,7 +359,11 @@ osm_vl15_init(
   IN osm_vendor_t* const p_vend,
   IN osm_log_t* const p_log,
   IN osm_stats_t* const p_stats,
-  IN const int32_t max_wire_smps )
+  IN const int32_t max_wire_smps,
+  IN osm_subn_t* const p_subn,
+  IN cl_dispatcher_t* const p_disp,
+  IN cl_plock_t* const p_lock
+ )
 {
   ib_api_status_t status = IB_SUCCESS;
   OSM_LOG_ENTER( p_log, osm_vl15_init );
@@ -301,6 +372,8 @@ osm_vl15_init(
   p_vl->p_log = p_log;
   p_vl->p_stats = p_stats;
   p_vl->max_wire_smps = max_wire_smps;
+  p_vl->p_subn = p_subn;
+  p_vl->p_lock = p_lock;
 
   status = cl_event_init( &p_vl->signal, FALSE );
   if( status != IB_SUCCESS )
@@ -321,6 +394,21 @@ osm_vl15_init(
   if( status != IB_SUCCESS )
     goto Exit;
 
+  p_vl->h_disp = cl_disp_register(
+    p_disp,
+    CL_DISP_MSGID_NONE,
+    NULL,
+    NULL );
+
+  if( p_vl->h_disp == CL_DISP_INVALID_HANDLE )
+  {
+    osm_log( p_log, OSM_LOG_ERROR,
+             "osm_vl15_init: ERR 3E01: "
+             "Dispatcher registration failed.\n" );
+    status = IB_INSUFFICIENT_RESOURCES;
+    goto Exit;
+  }
+
  Exit:
   OSM_LOG_EXIT( p_log );
   return( status );




More information about the general mailing list