[ofa-general] [PATCH] OpenSM handling of "Babbling" Ports

Hal Rosenstock halr at voltaire.com
Thu Jul 5 05:57:31 PDT 2007


A "babbling" port is a port which causes traps to be generated frequently.
The traps may be generated directly by the port itself, or the peer port may
detect the issue, in which case the SMA on switch port 0 generates the traps.
So far this has been observed only for trap 131, but the handling will also
apply to traps 129 and 130, which are other urgent, similar traps.

Note that there appears to be a bug in Mellanox firmware, for at least Anafa-2
and Tavor, which causes the max trap rate not to be adhered to, and relief for
this does not appear to be in sight in the short term.

Policy
When a babbling port is detected, OpenSM will disable the port or its
peer switch port (depending on which trap) which should terminate the
trap storm.

Detection
250 consecutive traps of this type will be used as the (initial)
threshold. This threshold is chosen so that a port is not prematurely
detected as babbling and disabled.

Recovery
The admin would re-enable the port once it is OK again. (This usually involves
rebooting the node that causes the trap to be indicated.)

Signed-off-by: Hal Rosenstock <halr at voltaire.com>

diff --git a/opensm/include/opensm/osm_subnet.h b/opensm/include/opensm/osm_subnet.h
index bedd63f..1150703 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -286,6 +286,7 @@ typedef struct _osm_subn_opt
   boolean_t                honor_guid2lid_file;
   boolean_t                daemon;
   boolean_t                sm_inactive;
+  boolean_t                babbling_port_policy;
   osm_qos_options_t        qos_options;
   osm_qos_options_t        qos_ca_options;
   osm_qos_options_t        qos_sw0_options;
@@ -487,6 +488,9 @@ typedef struct _osm_subn_opt
 *
 *	sm_inactive
 *		OpenSM will start with SM in not active state.
+*
+*	babbling_port_policy
+*		OpenSM will enforce its "babbling" port policy.
 *	
 *	perfmgr
 *		Enable or disable the performance manager
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index 726b665..87b71e5 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -472,6 +472,7 @@ osm_subn_set_default_opt(
   p_opt->honor_guid2lid_file = FALSE;
   p_opt->daemon = FALSE;
   p_opt->sm_inactive = FALSE;
+  p_opt->babbling_port_policy = FALSE;
 #ifdef ENABLE_OSM_PERF_MGR
   p_opt->perfmgr = FALSE;
   p_opt->perfmgr_sweep_time_s = OSM_PERFMGR_DEFAULT_SWEEP_TIME_S;
@@ -1358,6 +1359,10 @@ osm_subn_parse_conf_file(
         "sm_inactive",
         p_key, p_val, &p_opts->sm_inactive);
 
+      __osm_subn_opts_unpack_boolean(
+        "babbling_port_policy",
+        p_key, p_val, &p_opts->babbling_port_policy);
+
 #ifdef ENABLE_OSM_PERF_MGR
       __osm_subn_opts_unpack_boolean(
         "perfmgr",
@@ -1631,9 +1636,12 @@ osm_subn_write_conf_file(
     "# Daemon mode\n"
     "daemon %s\n\n"
     "# SM Inactive\n"
-    "sm_inactive %s\n\n",
+    "sm_inactive %s\n\n"
+    "# Babbling Port Policy\n"
+    "babbling_port_policy %s\n\n",
     p_opts->daemon ? "TRUE" : "FALSE",
-    p_opts->sm_inactive ? "TRUE" : "FALSE"
+    p_opts->sm_inactive ? "TRUE" : "FALSE",
+    p_opts->babbling_port_policy ? "TRUE" : "FALSE"
     );
 
 #ifdef ENABLE_OSM_PERF_MGR
diff --git a/opensm/opensm/osm_trap_rcv.c b/opensm/opensm/osm_trap_rcv.c
index 5900c51..fbb6dac 100644
--- a/opensm/opensm/osm_trap_rcv.c
+++ b/opensm/opensm/osm_trap_rcv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
@@ -548,6 +548,61 @@ __osm_trap_rcv_process_request(
         }
         else
         {
+          /* When babbling port policy option is enabled and
+             Threshold for disabling a "babbling" port is exceeded */
+          if ( p_rcv->p_subn->opt.babbling_port_policy &&
+               num_received >= 250 )
+          {
+            uint8_t               payload[IB_SMP_DATA_SIZE];
+            ib_port_info_t*       p_pi = (ib_port_info_t*)payload;
+            const ib_port_info_t* p_old_pi;
+            osm_madw_context_t    context;
+
+            /* If trap 131, might want to disable peer port if available */
+            /* but peer port has been observed not to respond to SM requests */
+
+            osm_log( p_rcv->p_log, OSM_LOG_ERROR,
+                     "__osm_trap_rcv_process_request: ERR 3810: "
+                     " Disabling physical port lid:0x%02X num:%u\n",
+                     cl_ntoh16(p_ntci->data_details.ntc_129_131.lid),
+                     p_ntci->data_details.ntc_129_131.port_num
+                     );
+
+            p_old_pi = &p_physp->port_info;
+            memcpy( payload, p_old_pi, sizeof(ib_port_info_t) );
+
+            /* Set port to disabled/down */
+            ib_port_info_set_port_state( p_pi, IB_LINK_DOWN );
+            ib_port_info_set_port_phys_state( IB_PORT_PHYS_STATE_DISABLED, p_pi );
+
+            context.pi_context.node_guid = osm_node_get_node_guid( osm_physp_get_node_ptr( p_physp ) );
+            context.pi_context.port_guid = osm_physp_get_port_guid( p_physp );
+            context.pi_context.set_method = TRUE;
+            context.pi_context.update_master_sm_base_lid = FALSE;
+            context.pi_context.light_sweep = FALSE;
+            context.pi_context.active_transition = FALSE;
+
+            status = osm_req_set( &p_rcv->p_subn->p_osm->sm.req,
+                                   osm_physp_get_dr_path_ptr( p_physp ),
+                                   payload,
+                                   sizeof(payload),
+                                   IB_MAD_ATTR_PORT_INFO,
+                                   cl_hton32(osm_physp_get_port_num( p_physp )),
+                                   CL_DISP_MSGID_NONE,
+                                  &context );
+
+            if( status == IB_SUCCESS )
+            {
+               goto Exit;
+            }
+            else
+            {
+               osm_log( p_rcv->p_log, OSM_LOG_ERROR,
+                        "__osm_trap_rcv_process_request: ERR 3811: "
+                        "Request to set PortInfo failed\n" );
+            }
+          }
+
           osm_log( p_rcv->p_log, OSM_LOG_VERBOSE,
                    "__osm_trap_rcv_process_request: "
                    "Marking unhealthy physical port by lid:0x%02X num:%u\n",







More information about the general mailing list