[openib-general] [PATCH] osm: fix bugs related to not passing OSM_SIGNAL_DONE_PENDING

Eitan Zahavi eitan at mellanox.co.il
Sat Dec 16 14:12:28 PST 2006


Hi Hal

This set of patches fixes cases where OSM_SIGNAL_DONE_PENDING is not
passed back to the state manager, which breaks the state machine later
in the sweep.

Eitan

Signed-off-by:  Eitan Zahavi <eitan at mellanox.co.il>

 osm/opensm/osm_pkey_mgr.c  |  112 ++++++++++++++++++++++++++++++++------------
 osm/opensm/osm_state_mgr.c |   11 +++--
 osm/opensm/osm_ucast_mgr.c |   96 ++++++++++++++++++++++++--------------
 4 files changed, 179 insertions(+), 88 deletions(-)

diff --git a/osm/opensm/osm_pkey_mgr.c b/osm/opensm/osm_pkey_mgr.c
index 48837bc..a33aec7 100644
--- a/osm/opensm/osm_pkey_mgr.c
+++ b/osm/opensm/osm_pkey_mgr.c
@@ -212,8 +212,9 @@ pkey_mgr_update_pkey_entry(
 
 /**********************************************************************
  **********************************************************************/
-static ib_api_status_t
+static boolean_t
 pkey_mgr_enforce_partition(
+  IN osm_log_t *p_log,
   IN const osm_req_t *p_req,
   IN const osm_physp_t *p_physp,
   IN const boolean_t enforce)
@@ -221,12 +222,33 @@ pkey_mgr_enforce_partition(
   osm_madw_context_t context;
   uint8_t payload[IB_SMP_DATA_SIZE];
   ib_port_info_t *p_pi;
+  ib_api_status_t status;
 
   if (!(p_pi = osm_physp_get_port_info_ptr( p_physp )))
-    return IB_ERROR;
+  {
+     osm_log( p_log, OSM_LOG_ERROR,
+              "pkey_mgr_enforce_partition: ERR 0507: "
+              "No port info for "
+              "node 0x%016" PRIx64 " port %u\n",
+              cl_ntoh64(
+                 osm_node_get_node_guid(
+                    osm_physp_get_node_ptr( p_physp ))),
+              osm_physp_get_port_num( p_physp ) );
+     return FALSE;
+  }
 
-  if ((p_pi->vl_enforce & 0xc) == (0xc)*(enforce == TRUE))
-    return IB_SUCCESS;
+  if ((p_pi->vl_enforce & 0xc) == (0xc)*(enforce == TRUE))
+  {
+     osm_log( p_log, OSM_LOG_DEBUG,
+              "pkey_mgr_enforce_partition: "
+              "No need to update PortInfo for "
+              "node 0x%016" PRIx64 " port %u\n",
+              cl_ntoh64(
+                 osm_node_get_node_guid(
+                    osm_physp_get_node_ptr( p_physp ))),
+              osm_physp_get_port_num( p_physp ) );
+    return FALSE;
+  }
 
   memset( payload, 0, IB_SMP_DATA_SIZE );
   memcpy( payload, p_pi, sizeof(ib_port_info_t) );
@@ -248,11 +270,35 @@ pkey_mgr_enforce_partition(
   context.pi_context.light_sweep = FALSE;
   context.pi_context.active_transition = FALSE;
 
-  return osm_req_set( p_req, osm_physp_get_dr_path_ptr( p_physp ),
-                      payload, sizeof(payload),
-                      IB_MAD_ATTR_PORT_INFO,
-                      cl_hton32( osm_physp_get_port_num( p_physp ) ),
-                      CL_DISP_MSGID_NONE, &context );
+  status = osm_req_set( p_req, osm_physp_get_dr_path_ptr( p_physp ),
+        payload, sizeof(payload),
+        IB_MAD_ATTR_PORT_INFO,
+        cl_hton32( osm_physp_get_port_num( p_physp ) ),
+        CL_DISP_MSGID_NONE, &context );
+  if (status != IB_SUCCESS)
+  {
+     osm_log( p_log, OSM_LOG_ERROR,
+              "pkey_mgr_enforce_partition: ERR 0520: "
+              "Failed to set PortInfo for "
+              "node 0x%016" PRIx64 " port %u\n",
+              cl_ntoh64(
+                 osm_node_get_node_guid(
+                    osm_physp_get_node_ptr( p_physp ))),
+              osm_physp_get_port_num( p_physp ) );
+     return FALSE;
+  }
+  else
+  {
+     osm_log( p_log, OSM_LOG_DEBUG,
+              "pkey_mgr_enforce_partition: "
+              "Set PortInfo for "
+              "node 0x%016" PRIx64 " port %u\n",
+              cl_ntoh64(
+                 osm_node_get_node_guid(
+                    osm_physp_get_node_ptr( p_physp ))),
+              osm_physp_get_port_num( p_physp ) );
+   return TRUE;
+  }
 }
 
 /**********************************************************************
@@ -369,15 +415,26 @@ static boolean_t pkey_mgr_update_port(
 
     status = pkey_mgr_update_pkey_entry( p_req, p_physp, new_block, 
block_index );
     if (status == IB_SUCCESS)
-      ret_val = TRUE;
+  {
+   osm_log( p_log, OSM_LOG_DEBUG,
+      "pkey_mgr_update_port: "
+      "Updated "
+      "pkey table block %d for node 0x%016" PRIx64 " port %u\n",
+      block_index,
+      cl_ntoh64( osm_node_get_node_guid( p_node ) ),
+      osm_physp_get_port_num( p_physp ) );
+   ret_val = TRUE;
+  }
     else
-      osm_log( p_log, OSM_LOG_ERROR,
-        "pkey_mgr_update_port: ERR 0506: "
-        "pkey_mgr_update_pkey_entry() failed to update "
-        "pkey table block %d for node 0x%016" PRIx64 " port %u\n",
-        block_index,
-        cl_ntoh64( osm_node_get_node_guid( p_node ) ),
-        osm_physp_get_port_num( p_physp ) );
+  {
+   osm_log( p_log, OSM_LOG_ERROR,
+      "pkey_mgr_update_port: ERR 0506: "
+      "pkey_mgr_update_pkey_entry() failed to update "
+      "pkey table block %d for node 0x%016" PRIx64 " port %u\n",
+      block_index,
+      cl_ntoh64( osm_node_get_node_guid( p_node ) ),
+      osm_physp_get_port_num( p_physp ) );
+  }
   }
 
   return ret_val;
@@ -405,8 +462,9 @@ pkey_mgr_update_peer_port(
   uint16_t peer_max_blocks;
   ib_api_status_t status = IB_SUCCESS;
   boolean_t ret_val = FALSE;
+  boolean_t port_info_set = FALSE;
   ib_pkey_table_t empty_block;
-
+ 
   memset(&empty_block, 0, sizeof(ib_pkey_table_t));
 
   p_physp = osm_port_get_default_phys_ptr( p_port );
@@ -439,18 +497,11 @@ pkey_mgr_update_peer_port(
     enforce = FALSE;
   }
 
-  if (pkey_mgr_enforce_partition( p_req, peer, enforce ) != IB_SUCCESS)
-  {
-    osm_log( p_log, OSM_LOG_ERROR,
-      "pkey_mgr_update_peer_port: ERR 0507: "
-      "pkey_mgr_enforce_partition() failed to update "
-      "node 0x%016" PRIx64 " port %u\n",
-      cl_ntoh64( osm_node_get_node_guid( p_node ) ),
-      osm_physp_get_port_num( peer ) );
-  }
+  if (pkey_mgr_enforce_partition( p_log, p_req, peer, enforce))
+   port_info_set = TRUE;
 
   if (enforce == FALSE)
-    return FALSE;
+  return port_info_set;
 
   p_peer_pkey_tbl->used_blocks = p_pkey_tbl->used_blocks;
   for (block_index = 0; block_index < p_pkey_tbl->used_blocks; 
block_index++)
@@ -487,6 +538,7 @@ pkey_mgr_update_peer_port(
              osm_physp_get_port_num( peer ) );
   }
 
+  if (port_info_set) return TRUE;
   return ret_val;
 }
 
@@ -541,10 +593,10 @@ osm_pkey_mgr_process(
       signal = OSM_SIGNAL_DONE_PENDING;
     p_node = osm_port_get_parent_node( p_port );
     if ( ( osm_node_get_type( p_node ) != IB_NODE_TYPE_SWITCH ) &&
-  pkey_mgr_update_peer_port( &p_osm->log, &p_osm->sm.req,
+   pkey_mgr_update_peer_port( &p_osm->log, &p_osm->sm.req,
         &p_osm->subn, p_port,
         !p_osm->subn.opt.no_partition_enforcement ) )
-      signal = OSM_SIGNAL_DONE_PENDING;       
+      signal = OSM_SIGNAL_DONE_PENDING;
   }
 
  _err:
diff --git a/osm/opensm/osm_state_mgr.c b/osm/opensm/osm_state_mgr.c
index 9eac038..4e61259 100644
--- a/osm/opensm/osm_state_mgr.c
+++ b/osm/opensm/osm_state_mgr.c
@@ -1853,6 +1853,7 @@ osm_state_mgr_process(
 {
    ib_api_status_t status;
    osm_remote_sm_t *p_remote_sm;
+ osm_signal_t tmp_signal;
 
    CL_ASSERT( p_mgr );
 
@@ -2075,11 +2076,10 @@ osm_state_mgr_process(
          case OSM_SIGNAL_CHANGE_DETECTED:
             /*
              * Nothing to do here.  One subnet change typcially
-             * begets another....
+             * begets another.... But needs to wait for all transactions
              */
             signal = OSM_SIGNAL_NONE;
-            break;
-
+    break;
          case OSM_SIGNAL_NO_PENDING_TRANSACTIONS:
             /*
              * A change was detected on the subnet.
@@ -2219,7 +2219,10 @@ osm_state_mgr_process(
             signal = osm_pkey_mgr_process( p_mgr->p_subn->p_osm );
 
             /* the returned signal is always DONE */
-            signal = osm_qos_setup(p_mgr->p_subn->p_osm);
+            tmp_signal = osm_qos_setup(p_mgr->p_subn->p_osm);
+
+    if (tmp_signal == OSM_SIGNAL_DONE_PENDING)
+     signal = OSM_SIGNAL_DONE_PENDING;
 
             /* try to restore SA DB (this should be before lid_mgr
                because we may want to disable clients reregistration
diff --git a/osm/opensm/osm_ucast_mgr.c b/osm/opensm/osm_ucast_mgr.c
index e977253..39973de 100644
--- a/osm/opensm/osm_ucast_mgr.c
+++ b/osm/opensm/osm_ucast_mgr.c
@@ -885,6 +885,9 @@ osm_ucast_mgr_set_fwd_table(
   ib_switch_info_t si;
   uint32_t block_id_ho = 0;
   uint8_t block[IB_SMP_DATA_SIZE];
+  boolean_t set_swinfo_require = FALSE;
+  uint16_t lin_top;
+  uint8_t life_state;
 
   CL_ASSERT( p_mgr );
 
@@ -904,43 +907,59 @@ osm_ucast_mgr_set_fwd_table(
     Set the top of the unicast forwarding table.
   */
   si = *osm_switch_get_si_ptr( p_sw );
-  si.lin_top = cl_hton16( osm_switch_get_max_lid_ho( p_sw ) );
+  lin_top = cl_hton16( osm_switch_get_max_lid_ho( p_sw ) );
+  if (si.lin_top != lin_top)
+  {
+   set_swinfo_require = TRUE;
+      si.lin_top  = lin_top;
+  }
 
   /* check to see if the change state bit is on. If it is - then we
      need to clear it. */
-   if( ib_switch_info_get_state_change( &si ) )
-    si.life_state = ( (p_mgr->p_subn->opt.packet_life_time <<3 )
-                      | ( si.life_state & IB_SWITCH_PSC ) )  & 0xfc;
+  if ( ib_switch_info_get_state_change( &si ) )
+      life_state = ( (p_mgr->p_subn->opt.packet_life_time <<3 )
+                          | ( si.life_state & IB_SWITCH_PSC ) )  & 0xfc;
   else
-    si.life_state = (p_mgr->p_subn->opt.packet_life_time <<3 ) & 0xf8;
+      life_state = (p_mgr->p_subn->opt.packet_life_time <<3 ) & 0xf8;
 
-  if( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
+  if (life_state != si.life_state)
   {
-    osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
-             "osm_ucast_mgr_set_fwd_table: "
-             "Setting switch FT top to LID 0x%X\n",
-             osm_switch_get_max_lid_ho( p_sw ) );
+      set_swinfo_require = TRUE;
+      si.life_state = life_state;
   }
-
-  context.si_context.light_sweep = FALSE;
-  context.si_context.node_guid = osm_node_get_node_guid( p_node );
-  context.si_context.set_method = TRUE;
-
-  status = osm_req_set( p_mgr->p_req,
-                        p_path,
-                        (uint8_t*)&si,
-                        sizeof(si),
-                        IB_MAD_ATTR_SWITCH_INFO,
-                        0,
-                        CL_DISP_MSGID_NONE,
-                        &context );
-
-  if( status != IB_SUCCESS )
+ 
+  if ( set_swinfo_require )
   {
-    osm_log( p_mgr->p_log, OSM_LOG_ERROR,
-             "osm_ucast_mgr_set_fwd_table: ERR 3A06: "
-             "Sending SwitchInfo attribute failed (%s)\n",
-             ib_get_err_str( status ) );
+      if ( osm_log_is_active( p_mgr->p_log, OSM_LOG_DEBUG ) )
+      {
+          osm_log( p_mgr->p_log, OSM_LOG_DEBUG,
+                      "osm_ucast_mgr_set_fwd_table: "
+                      "Setting switch FT top to LID 0x%X\n",
+                      osm_switch_get_max_lid_ho( p_sw ) );
+      }
+     
+      context.si_context.light_sweep = FALSE;
+      context.si_context.node_guid = osm_node_get_node_guid( p_node );
+      context.si_context.set_method = TRUE;
+     
+      status = osm_req_set( p_mgr->p_req,
+                                    p_path,
+                                    (uint8_t*)&si,
+                                    sizeof(si),
+                                    IB_MAD_ATTR_SWITCH_INFO,
+                                    0,
+                                    CL_DISP_MSGID_NONE,
+                                    &context );
+     
+      if( status != IB_SUCCESS )
+      {
+          osm_log( p_mgr->p_log, OSM_LOG_ERROR,
+                      "osm_ucast_mgr_set_fwd_table: ERR 3A06: "
+                      "Sending SwitchInfo attribute failed (%s)\n",
+                      ib_get_err_str( status ) );
+      }
+      else
+          p_mgr->any_change = TRUE;
   }
 
   /*
@@ -1215,13 +1234,14 @@ osm_ucast_mgr_process(
 
   CL_PLOCK_EXCL_ACQUIRE( p_mgr->p_lock );
 
+  p_mgr->any_change = FALSE;
+
   /*
     If there are no switches in the subnet, we are done.
   */
   if (cl_qmap_count( p_sw_guid_tbl ) == 0)
     goto Exit;
 
-  p_mgr->any_change = FALSE;
   cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_clean_switch, NULL);
 
   if (!p_routing_eng->build_lid_matrices ||
@@ -1248,14 +1268,20 @@ osm_ucast_mgr_process(
   if ( osm_log_is_active( p_mgr->p_log, OSM_LOG_ROUTING ) )
     __osm_ucast_mgr_dump_tables( p_mgr );
 
-  if (p_mgr->any_change)
+  if (p_mgr->any_change)
+  {
      signal = OSM_SIGNAL_DONE_PENDING;
+      osm_log(p_mgr->p_log, OSM_LOG_VERBOSE,
+                 "osm_ucast_mgr_process: "
+                 "LFT Tables configured on all switches\n");
+  }
   else
+  {
+      osm_log(p_mgr->p_log, OSM_LOG_VERBOSE,
+                 "osm_ucast_mgr_process: "
+                 "No need to set any LFT Tables on all switches\n");
      signal = OSM_SIGNAL_DONE;
-
-  osm_log(p_mgr->p_log, OSM_LOG_VERBOSE,
-          "osm_ucast_mgr_process: "
-          "LFT Tables configured on all switches\n");
+  }
 
  Exit:
   CL_PLOCK_RELEASE( p_mgr->p_lock );






More information about the general mailing list