[ofa-general] [PATCH] OpenSM handling of "Babbling" Ports

Hal Rosenstock halr at voltaire.com
Thu Jul 5 05:57:31 PDT 2007

A "babbling" port is a port which causes traps to be generated frequently.
Either "this" port generates the traps directly, or the peer port detects
the issue and the SMA on switch port 0 generates the traps.
So far this has only been observed for trap 131, but it also applies
to traps 129 and 130, which are similar urgent traps.

Note that there appears to be a bug in Mellanox firmware, for both Anafa-2 and
Tavor at a minimum, which causes the max trap rate not to be adhered to,
and a fix for this does not appear to be in sight in the short term.

When a babbling port is detected, OpenSM will disable the port or its
peer switch port (depending on which trap), which should terminate the
trap storm.

250 consecutive traps of this type will be used as the (initial)
threshold. The reason for this is so as to not prematurely detect this
and disable a port.

The admin would re-enable the port once it is OK again. (This usually
involves rebooting the node causing the trap to be indicated.)

Signed-off-by: Hal Rosenstock <halr at voltaire.com>

diff --git a/opensm/include/opensm/osm_subnet.h b/opensm/include/opensm/osm_subnet.h
index bedd63f..1150703 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -286,6 +286,7 @@ typedef struct _osm_subn_opt
   boolean_t                honor_guid2lid_file;
   boolean_t                daemon;
   boolean_t                sm_inactive;
+  boolean_t                babbling_port_policy;
   osm_qos_options_t        qos_options;
   osm_qos_options_t        qos_ca_options;
   osm_qos_options_t        qos_sw0_options;
@@ -487,6 +488,9 @@ typedef struct _osm_subn_opt
 *	sm_inactive
 *		OpenSM will start with SM in not active state.
+*	babbling_port_policy
+*		OpenSM will enforce its "babbling" port policy.
 *	perfmgr
 *		Enable or disable the performance manager
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index 726b665..87b71e5 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -472,6 +472,7 @@ osm_subn_set_default_opt(
   p_opt->honor_guid2lid_file = FALSE;
   p_opt->daemon = FALSE;
   p_opt->sm_inactive = FALSE;
+  p_opt->babbling_port_policy = FALSE;
   p_opt->perfmgr = FALSE;
   p_opt->perfmgr_sweep_time_s = OSM_PERFMGR_DEFAULT_SWEEP_TIME_S;
@@ -1358,6 +1359,10 @@ osm_subn_parse_conf_file(
         p_key, p_val, &p_opts->sm_inactive);
+      __osm_subn_opts_unpack_boolean(
+        "babbling_port_policy",
+        p_key, p_val, &p_opts->babbling_port_policy);
@@ -1631,9 +1636,12 @@ osm_subn_write_conf_file(
     "# Daemon mode\n"
     "daemon %s\n\n"
     "# SM Inactive\n"
-    "sm_inactive %s\n\n",
+    "sm_inactive %s\n\n"
+    "# Babbling Port Policy\n"
+    "babbling_port_policy %s\n\n",
     p_opts->daemon ? "TRUE" : "FALSE",
-    p_opts->sm_inactive ? "TRUE" : "FALSE"
+    p_opts->sm_inactive ? "TRUE" : "FALSE",
+    p_opts->babbling_port_policy ? "TRUE" : "FALSE"
diff --git a/opensm/opensm/osm_trap_rcv.c b/opensm/opensm/osm_trap_rcv.c
index 5900c51..fbb6dac 100644
--- a/opensm/opensm/osm_trap_rcv.c
+++ b/opensm/opensm/osm_trap_rcv.c
@@ -1,5 +1,5 @@
- * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
@@ -548,6 +548,61 @@ __osm_trap_rcv_process_request(
+          /* When babbling port policy option is enabled and
+             Threshold for disabling a "babbling" port is exceeded */
+          if ( p_rcv->p_subn->opt.babbling_port_policy &&
+               num_received >= 250 )
+          {
+            uint8_t               payload[IB_SMP_DATA_SIZE];
+            ib_port_info_t*       p_pi = (ib_port_info_t*)payload;
+            const ib_port_info_t* p_old_pi;
+            osm_madw_context_t    context;
+            /* If trap 131, might want to disable peer port if available */
+            /* but peer port has been observed not to respond to SM requests */
+            osm_log( p_rcv->p_log, OSM_LOG_ERROR,
+                     "__osm_trap_rcv_process_request: ERR 3810: "
+                     " Disabling physical port lid:0x%02X num:%u\n",
+                     cl_ntoh16(p_ntci->data_details.ntc_129_131.lid),
+                     p_ntci->data_details.ntc_129_131.port_num
+                     );
+            p_old_pi = &p_physp->port_info;
+            memcpy( payload, p_old_pi, sizeof(ib_port_info_t) );
+            /* Set port to disabled/down */
+            ib_port_info_set_port_state( p_pi, IB_LINK_DOWN );
+            ib_port_info_set_port_phys_state( IB_PORT_PHYS_STATE_DISABLED, p_pi );
+            context.pi_context.node_guid = osm_node_get_node_guid( osm_physp_get_node_ptr( p_physp ) );
+            context.pi_context.port_guid = osm_physp_get_port_guid( p_physp );
+            context.pi_context.set_method = TRUE;
+            context.pi_context.update_master_sm_base_lid = FALSE;
+            context.pi_context.light_sweep = FALSE;
+            context.pi_context.active_transition = FALSE;
+            status = osm_req_set( &p_rcv->p_subn->p_osm->sm.req,
+                                   osm_physp_get_dr_path_ptr( p_physp ),
+                                   payload,
+                                   sizeof(payload),
+                                   IB_MAD_ATTR_PORT_INFO,
+                                   cl_hton32(osm_physp_get_port_num( p_physp )),
+                                   CL_DISP_MSGID_NONE,
+                                  &context );
+            if( status == IB_SUCCESS )
+            {
+               goto Exit;
+            }
+            else
+            {
+               osm_log( p_rcv->p_log, OSM_LOG_ERROR,
+                        "__osm_trap_rcv_process_request: ERR 3811: "
+                        "Request to set PortInfo failed\n" );
+            }
+          }
           osm_log( p_rcv->p_log, OSM_LOG_VERBOSE,
                    "__osm_trap_rcv_process_request: "
                    "Marking unhealthy physical port by lid:0x%02X num:%u\n",

More information about the general mailing list