[ofa-general] RE: [PATCH] OpenSM handling of "Babbling" Ports
Amit Krig
amitk at mellanox.co.il
Mon Jul 9 06:27:35 PDT 2007
Hi Hal,
In such a case, OpenSM should first check that the OPVL fields of the two
ports (the one that sends the traps and its peer) are identical.
If there is a mismatch in the OPVL field, the link watchdog mechanism
will retrain the logical link at a high rate.
Amit
-----Original Message-----
From: Hal Rosenstock [mailto:halr at voltaire.com]
Sent: Thursday, July 05, 2007 3:58 PM
To: general at lists.openfabrics.org
Cc: Eitan Zahavi; Yevgeny Kliteynik
Subject: [PATCH] OpenSM handling of "Babbling" Ports
A "babbling" port is a port which causes traps to be generated
frequently.
The traps may be generated directly by "this" port, or the peer port may
detect the issue, in which case the SMA on switch port 0 generates the
traps.
So far this has been observed only for trap 131, but it also applies
to traps 129 and 130, which are other urgent and similar traps.
Note that there appears to be a bug in Mellanox firmware for both
Anafa-2 and Tavor at a minimum which causes the max trap rate not to be
adhered to and relief for this does not appear to be in short term
sight.
Policy
When a babbling port is detected, OpenSM will disable the port or its
peer switch port (depending on which trap), which should terminate the
trap storm.
Detection
250 consecutive traps of this type will be used as the (initial)
threshold. The reason for this is so as to not prematurely detect this
and disable a port.
Recovery
Admin would reenable port when OK again. (This usually involves
rebooting the node causing the trap to be indicated.)
Signed-off-by: Hal Rosenstock <halr at voltaire.com>
diff --git a/opensm/include/opensm/osm_subnet.h
b/opensm/include/opensm/osm_subnet.h
index bedd63f..1150703 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -286,6 +286,7 @@ typedef struct _osm_subn_opt
boolean_t honor_guid2lid_file;
boolean_t daemon;
boolean_t sm_inactive;
+ boolean_t babbling_port_policy;
osm_qos_options_t qos_options;
osm_qos_options_t qos_ca_options;
osm_qos_options_t qos_sw0_options;
@@ -487,6 +488,9 @@ typedef struct _osm_subn_opt
*
* sm_inactive
* OpenSM will start with SM in not active state.
+*
+* babbling_port_policy
+* OpenSM will enforce its "babbling" port policy.
*
* perfmgr
* Enable or disable the performance manager
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index 726b665..87b71e5 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -472,6 +472,7 @@ osm_subn_set_default_opt(
p_opt->honor_guid2lid_file = FALSE;
p_opt->daemon = FALSE;
p_opt->sm_inactive = FALSE;
+ p_opt->babbling_port_policy = FALSE;
#ifdef ENABLE_OSM_PERF_MGR
p_opt->perfmgr = FALSE;
p_opt->perfmgr_sweep_time_s = OSM_PERFMGR_DEFAULT_SWEEP_TIME_S; @@
-1358,6 +1359,10 @@ osm_subn_parse_conf_file(
"sm_inactive",
p_key, p_val, &p_opts->sm_inactive);
+ __osm_subn_opts_unpack_boolean(
+ "babbling_port_policy",
+ p_key, p_val, &p_opts->babbling_port_policy);
+
#ifdef ENABLE_OSM_PERF_MGR
__osm_subn_opts_unpack_boolean(
"perfmgr",
@@ -1631,9 +1636,12 @@ osm_subn_write_conf_file(
"# Daemon mode\n"
"daemon %s\n\n"
"# SM Inactive\n"
- "sm_inactive %s\n\n",
+ "sm_inactive %s\n\n"
+ "# Babbling Port Policy\n"
+ "babbling_port_policy %s\n\n",
p_opts->daemon ? "TRUE" : "FALSE",
- p_opts->sm_inactive ? "TRUE" : "FALSE"
+ p_opts->sm_inactive ? "TRUE" : "FALSE",
+ p_opts->babbling_port_policy ? "TRUE" : "FALSE"
);
#ifdef ENABLE_OSM_PERF_MGR
diff --git a/opensm/opensm/osm_trap_rcv.c b/opensm/opensm/osm_trap_rcv.c
index 5900c51..fbb6dac 100644
--- a/opensm/opensm/osm_trap_rcv.c
+++ b/opensm/opensm/osm_trap_rcv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
* Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights
reserved.
* Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
*
@@ -548,6 +548,61 @@ __osm_trap_rcv_process_request(
}
else
{
+ /* When babbling port policy option is enabled and
+ Threshold for disabling a "babbling" port is exceeded */
+ if ( p_rcv->p_subn->opt.babbling_port_policy &&
+ num_received >= 250 )
+ {
+ uint8_t payload[IB_SMP_DATA_SIZE];
+ ib_port_info_t* p_pi = (ib_port_info_t*)payload;
+ const ib_port_info_t* p_old_pi;
+ osm_madw_context_t context;
+
+ /* If trap 131, might want to disable peer port if
available */
+ /* but peer port has been observed not to respond to SM
+ requests */
+
+ osm_log( p_rcv->p_log, OSM_LOG_ERROR,
+ "__osm_trap_rcv_process_request: ERR 3810: "
+ " Disabling physical port lid:0x%02X num:%u\n",
+ cl_ntoh16(p_ntci->data_details.ntc_129_131.lid),
+ p_ntci->data_details.ntc_129_131.port_num
+ );
+
+ p_old_pi = &p_physp->port_info;
+ memcpy( payload, p_old_pi, sizeof(ib_port_info_t) );
+
+ /* Set port to disabled/down */
+ ib_port_info_set_port_state( p_pi, IB_LINK_DOWN );
+ ib_port_info_set_port_phys_state(
+ IB_PORT_PHYS_STATE_DISABLED, p_pi );
+
+ context.pi_context.node_guid = osm_node_get_node_guid(
osm_physp_get_node_ptr( p_physp ) );
+ context.pi_context.port_guid = osm_physp_get_port_guid(
p_physp );
+ context.pi_context.set_method = TRUE;
+ context.pi_context.update_master_sm_base_lid = FALSE;
+ context.pi_context.light_sweep = FALSE;
+ context.pi_context.active_transition = FALSE;
+
+ status = osm_req_set( &p_rcv->p_subn->p_osm->sm.req,
+ osm_physp_get_dr_path_ptr( p_physp
),
+ payload,
+ sizeof(payload),
+ IB_MAD_ATTR_PORT_INFO,
+ cl_hton32(osm_physp_get_port_num(
p_physp )),
+ CL_DISP_MSGID_NONE,
+ &context );
+
+ if( status == IB_SUCCESS )
+ {
+ goto Exit;
+ }
+ else
+ {
+ osm_log( p_rcv->p_log, OSM_LOG_ERROR,
+ "__osm_trap_rcv_process_request: ERR 3811: "
+ "Request to set PortInfo failed\n" );
+ }
+ }
+
osm_log( p_rcv->p_log, OSM_LOG_VERBOSE,
"__osm_trap_rcv_process_request: "
"Marking unhealthy physical port by lid:0x%02X
num:%u\n",
More information about the general
mailing list