[openib-general] [PATCH] osm: PathRecord prefer 1K MTU for MT23108 devices
Hal Rosenstock
halr at voltaire.com
Fri Sep 15 16:06:35 PDT 2006
Hi Sasha,
On Fri, 2006-09-15 at 18:17, Sasha Khapyorsky wrote:
> Hi Eitan,
>
> Some comments about the patch.
>
> On 14:45 Fri 15 Sep , Eitan Zahavi wrote:
> > Hi Hal
> >
> > The following patch solves an issue with OpenSM preferring largest MTU
> > for PathRecord/MultiPathRecord for paths going to or from MT23108 (Tavor)
> > devices instead of using a 1K MTU which is best for this device.
> >
> > Since this is a device specific quirk I have added a configuration option
> > named enable_quirks which is FALSE by default to enable this functionality.
> >
> > To summarize the functionality change:
> > 1. Added enable_quirks option
> > 2. If enable_quirks is FALSE do nothing
>
> I see those quirks are SA specific. Then should this option be called
> 'enable_sa_quirks' instead?
Not sure what the right "granularity" is for this. Would all quirks be
enabled at once or would this end up being a pick and choose ?
> > 3. If a specific MTU is requested (either =2K or >1K) do nothing
> > 4. If either source port or destination port is a Tavor device
> > MTU is limited to 1K (can be further reduced by path traversal)
> >
> > Target is both trunk and OFED 1.1
> >
> > Thanks
> >
> > Eitan
> >
> > Signed-off-by: Eitan Zahavi <eitan at mellanox.co.il>
> >
> > Index: include/opensm/osm_subnet.h
> > ===================================================================
> > --- include/opensm/osm_subnet.h (revision 9493)
> > +++ include/opensm/osm_subnet.h (working copy)
> > @@ -286,6 +286,7 @@ typedef struct _osm_subn_opt
> > osm_qos_options_t qos_sw0_options;
> > osm_qos_options_t qos_swe_options;
> > osm_qos_options_t qos_rtr_options;
> > + boolean_t enable_quirks;
> > } osm_subn_opt_t;
> > /*
> > * FIELDS
> > @@ -469,6 +470,10 @@ typedef struct _osm_subn_opt
> > * qos_rtr_options
> > * QoS options for router ports
> > *
> > +* enable_quirks
> > +* Enable high risk new features and not fully qualified
> > +* hardware specific work arounds
> > +*
> > * SEE ALSO
> > * Subnet object
> > *********/
> > Index: include/opensm/osm_base.h
> > ===================================================================
> > --- include/opensm/osm_base.h (revision 9493)
> > +++ include/opensm/osm_base.h (working copy)
> > @@ -778,6 +778,34 @@ typedef enum _osm_mcast_req_type
> > #define MAX_UPDN_GUID_FILE_LINE_LENGTH 120
> > /**********/
> >
> > +/****s* OpenSM: Base/VendorOUIs
> > +* NAME
> > +* VendorOUIs
> > +*
> > +* DESCRIPTION
> > +* Known device vendor ID and GUID OUIs
> > +*
> > +* SYNOPSIS
> > +*/
> > +#define OSM_VENDOR_ID_INTEL 0x00D0B7
> > +#define OSM_VENDOR_ID_MELLANOX 0x0002C9
> > +#define OSM_VENDOR_ID_REDSWITCH 0x000617
> > +#define OSM_VENDOR_ID_SILVERSTORM 0x00066A
> > +#define OSM_VENDOR_ID_TOPSPIN 0x0005AD
> > +#define OSM_VENDOR_ID_FUJITSU 0x00E000
> > +#define OSM_VENDOR_ID_FUJITSU2 0x000B5D
> > +#define OSM_VENDOR_ID_VOLTAIRE 0x0008F1
> > +#define OSM_VENDOR_ID_YOTTAYOTTA 0x000453
> > +#define OSM_VENDOR_ID_PATHSCALE 0x001175
> > +#define OSM_VENDOR_ID_IBM 0x000255
> > +#define OSM_VENDOR_ID_DIVERGENET 0x00084E
> > +#define OSM_VENDOR_ID_FLEXTRONICS 0x000B8C
> > +#define OSM_VENDOR_ID_AGILENT 0x0030D3
> > +#define OSM_VENDOR_ID_OBSIDIAN 0x001777
> > +#define OSM_VENDOR_ID_BAYMICRO 0x000BC1
> > +#define OSM_VENDOR_ID_LSILOGIC 0x00A0B8
> > +/**********/
> > +
> > END_C_DECLS
> >
> > #endif /* _OSM_BASE_H_ */
> > Index: opensm/osm_sa_multipath_record.c
> > ===================================================================
> > --- opensm/osm_sa_multipath_record.c (revision 9493)
> > +++ opensm/osm_sa_multipath_record.c (working copy)
> > @@ -150,6 +150,75 @@ osm_mpr_rcv_init(
> >
> > /**********************************************************************
> > **********************************************************************/
> > +static inline boolean_t
> > +__osm_sa_multipath_rec_is_tavor_port(
> > + IN const osm_port_t* const p_port)
> > +{
> > + osm_node_t const* p_node;
> > + ib_net32_t vend_id;
> > +
> > + p_node = osm_port_get_parent_node( p_port );
> > + vend_id = ib_node_info_get_vendor_id( &p_node->node_info );
> > +
> > + return( (p_node->node_info.device_id == CL_HTON16(23108)) &&
> > + ((vend_id == CL_HTON32(OSM_VENDOR_ID_MELLANOX)) ||
> > + (vend_id == CL_HTON32(OSM_VENDOR_ID_TOPSPIN)) ||
> > + (vend_id == CL_HTON32(OSM_VENDOR_ID_SILVERSTORM)) ||
> > + (vend_id == CL_HTON32(OSM_VENDOR_ID_VOLTAIRE))));
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > +boolean_t
> > + __osm_sa_multipath_rec_apply_tavor_mtu_limit(
> > + IN const ib_multipath_rec_t* const p_mpr,
> > + IN const osm_port_t* const p_src_port,
> > + IN const osm_port_t* const p_dest_port,
> > + IN const ib_net64_t comp_mask)
> > +{
> > + uint8_t required_mtu;
> > +
> > + /* only if one of the ports is a Tavor device */
> > + if (! __osm_sa_multipath_rec_is_tavor_port(p_src_port) &&
> > + ! __osm_sa_multipath_rec_is_tavor_port(p_dest_port) )
> > + return( FALSE );
> > +
> > + /*
> > + we can apply the patch if either:
> > + 1. No MTU required
> > + 2. Required MTU <
> > + 3. Required MTU = 1K or 512 or 256
> > + 4. Required MTU > 256 or 512
> > + */
> > + required_mtu = ib_multipath_rec_mtu( p_mpr );
> > + if ( ( comp_mask & IB_PR_COMPMASK_MTUSELEC ) &&
> > + ( comp_mask & IB_PR_COMPMASK_MTU ) )
>
> Should here be IB_MPR_COMPMASK_* instead of IB_PR_COMPMASK_*?
Good catch.
> > + {
> > + switch( ib_multipath_rec_mtu_sel( p_mpr ) )
> > + {
> > + case 0: /* must be greater than */
> > + case 2: /* exact match */
> > + if( IB_MTU_LEN_1024 < required_mtu )
> > + return(FALSE);
> > + break;
> > +
> > + case 1: /* must be less than */
> > + case 3: /* largest available */
> > + /* can't be disqualified by this one */
> > + break;
> > +
> > + default:
> > + /* if we're here, there's a bug in ib_path_rec_mtu_sel() */
> > + CL_ASSERT( FALSE );
> > + break;
> > + }
> > + }
> > +
> > + return(TRUE);
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > static ib_api_status_t
> > __osm_mpr_rcv_get_path_parms(
> > IN osm_mpr_rcv_t* const p_rcv,
> > @@ -195,6 +264,23 @@ __osm_mpr_rcv_get_path_parms(
> > mtu = ib_port_info_get_mtu_cap( p_pi );
> > rate = ib_port_info_compute_rate( p_pi );
> >
> > + /*
> > + Mellanox Tavor device performance is better using 1K MTU.
> > + If required MTU and MTU selector are such that 1K is OK
> > + and one of the ends of the path is Tavor we override the
> > + port MTU with 1K.
> > + */
> > + if ( p_rcv->p_subn->opt.enable_quirks &&
> > + __osm_sa_multipath_rec_apply_tavor_mtu_limit(
> > + p_mpr, p_src_port, p_dest_port, comp_mask) )
> > + if (mtu > IB_MTU_LEN_1024)
> > + {
> > + mtu = IB_MTU_LEN_1024;
> > + osm_log( p_rcv->p_log, OSM_LOG_DEBUG,
> > + "__osm_mpr_rcv_get_path_parms: "
> > + "Optimized Path MTU to 1K for Mellanox Tavor device\n");
> > + }
> > +
>
> This part is pure hardcode, isn't it? Could this be at least isolated in
> single call 'osm_*_do_quirks()' or like thin?
Perhaps. This can be worked on the trunk.
> > if ( comp_mask & IB_MPR_COMPMASK_RAWTRAFFIC &&
> > cl_ntoh32( p_mpr->hop_flow_raw ) & ( 1<<31 ) )
> > required_pkey = osm_physp_find_common_pkey( p_physp, p_dest_physp );
> > Index: opensm/osm_subnet.c
> > ===================================================================
> > --- opensm/osm_subnet.c (revision 9493)
> > +++ opensm/osm_subnet.c (working copy)
> > @@ -494,6 +494,7 @@ osm_subn_set_default_opt(
> > p_opt->ucast_dump_file = NULL;
> > p_opt->updn_guid_file = NULL;
> > p_opt->exit_on_fatal = TRUE;
> > + p_opt->enable_quirks = FALSE;
> > subn_set_default_qos_options(&p_opt->qos_options);
> > subn_set_default_qos_options(&p_opt->qos_ca_options);
> > subn_set_default_qos_options(&p_opt->qos_sw0_options);
> > @@ -979,6 +980,10 @@ osm_subn_parse_conf_file(
> > subn_parse_qos_options("qos_rtr",
> > p_key, p_val, &p_opts->qos_rtr_options);
> >
> > + __osm_subn_opts_unpack_boolean(
> > + "enable_quirks",
> > + p_key, p_val, &p_opts->enable_quirks);
> > +
> > }
> > }
> > fclose(opts_file);
> > @@ -1179,11 +1184,15 @@ osm_subn_write_conf_file(
> > "force_log_flush %s\n\n"
> > "# Log file to be used\n"
> > "log_file %s\n\n"
> > + "# Limit the the size of the log file. If overrun log is restarted\n"
> > "log_max_size %lu\n\n"
> > + "# If TRUE will accumulate the log over multiple OpenSM sessions\n"
> > "accum_log_file %s\n\n"
> > "# The directory to hold the file OpenSM dumps\n"
> > "dump_files_dir %s\n\n"
> > - "# If TRUE if OpenSM should disable multicast support\n"
> > + "# If TRUE enables new high risk options and hardware specific quirks\n"
> > + "enable_quirks %s\n\n"
> > + "# If TRUE OpenSM should disable multicast support\n"
> > "no_multicast_option %s\n\n"
> > "# No multicast routing is performed if TRUE\n"
> > "disable_multicast %s\n\n"
> > @@ -1195,6 +1204,7 @@ osm_subn_write_conf_file(
> > p_opts->log_max_size,
> > p_opts->accum_log_file ? "TRUE" : "FALSE",
> > p_opts->dump_files_dir,
> > + p_opts->enable_quirks ? "TRUE" : "FALSE",
> > p_opts->no_multicast_option ? "TRUE" : "FALSE",
> > p_opts->disable_multicast ? "TRUE" : "FALSE",
> > p_opts->exit_on_fatal ? "TRUE" : "FALSE"
> > Index: opensm/osm_helper.c
> > ===================================================================
> > --- opensm/osm_helper.c (revision 9493)
> > +++ opensm/osm_helper.c (working copy)
> > @@ -2289,24 +2289,6 @@ osm_get_node_type_str_fixed_width(
> > return( __osm_node_type_str_fixed_width[node_type] );
> > }
> >
> > -#define OSM_VENDOR_ID_INTEL 0x00D0B7
> > -#define OSM_VENDOR_ID_MELLANOX 0x0002C9
> > -#define OSM_VENDOR_ID_REDSWITCH 0x000617
> > -#define OSM_VENDOR_ID_SILVERSTORM 0x00066A
> > -#define OSM_VENDOR_ID_TOPSPIN 0x0005AD
> > -#define OSM_VENDOR_ID_FUJITSU 0x00E000
> > -#define OSM_VENDOR_ID_FUJITSU2 0x000B5D
> > -#define OSM_VENDOR_ID_VOLTAIRE 0x0008F1
> > -#define OSM_VENDOR_ID_YOTTAYOTTA 0x000453
> > -#define OSM_VENDOR_ID_PATHSCALE 0x001175
> > -#define OSM_VENDOR_ID_IBM 0x000255
> > -#define OSM_VENDOR_ID_DIVERGENET 0x00084E
> > -#define OSM_VENDOR_ID_FLEXTRONICS 0x000B8C
> > -#define OSM_VENDOR_ID_AGILENT 0x0030D3
> > -#define OSM_VENDOR_ID_OBSIDIAN 0x001777
> > -#define OSM_VENDOR_ID_BAYMICRO 0x000BC1
> > -#define OSM_VENDOR_ID_LSILOGIC 0x00A0B8
> > -
> > /**********************************************************************
> > **********************************************************************/
> > const char*
> > Index: opensm/osm_sa_path_record.c
> > ===================================================================
> > --- opensm/osm_sa_path_record.c (revision 9493)
> > +++ opensm/osm_sa_path_record.c (working copy)
> > @@ -57,6 +57,7 @@
> > #include <complib/cl_passivelock.h>
> > #include <complib/cl_debug.h>
> > #include <complib/cl_qlist.h>
> > +#include <opensm/osm_base.h>
> > #include <opensm/osm_sa_path_record.h>
> > #include <opensm/osm_port.h>
> > #include <opensm/osm_node.h>
> > @@ -150,6 +151,75 @@ osm_pr_rcv_init(
> >
> > /**********************************************************************
> > **********************************************************************/
> > +static inline boolean_t
> > +__osm_sa_path_rec_is_tavor_port(
> > + IN const osm_port_t* const p_port)
> > +{
> > + osm_node_t const* p_node;
> > + ib_net32_t vend_id;
> > +
> > + p_node = osm_port_get_parent_node( p_port );
> > + vend_id = ib_node_info_get_vendor_id( &p_node->node_info );
> > +
> > + return( (p_node->node_info.device_id == CL_HTON16(23108)) &&
> > + ((vend_id == CL_HTON32(OSM_VENDOR_ID_MELLANOX)) ||
> > + (vend_id == CL_HTON32(OSM_VENDOR_ID_TOPSPIN)) ||
> > + (vend_id == CL_HTON32(OSM_VENDOR_ID_SILVERSTORM)) ||
> > + (vend_id == CL_HTON32(OSM_VENDOR_ID_VOLTAIRE))));
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > +static boolean_t
> > + __osm_sa_path_rec_apply_tavor_mtu_limit(
> > + IN const ib_path_rec_t* const p_pr,
> > + IN const osm_port_t* const p_src_port,
> > + IN const osm_port_t* const p_dest_port,
> > + IN const ib_net64_t comp_mask)
> > +{
> > + uint8_t required_mtu;
> > +
> > + /* only if one of the ports is a Tavor device */
> > + if (! __osm_sa_path_rec_is_tavor_port(p_src_port) &&
> > + ! __osm_sa_path_rec_is_tavor_port(p_dest_port) )
> > + return( FALSE );
> > +
> > + /*
> > + we can apply the patch if either:
> > + 1. No MTU required
> > + 2. Required MTU <
> > + 3. Required MTU = 1K or 512 or 256
> > + 4. Required MTU > 256 or 512
> > + */
> > + required_mtu = ib_path_rec_mtu( p_pr );
> > + if ( ( comp_mask & IB_PR_COMPMASK_MTUSELEC ) &&
> > + ( comp_mask & IB_PR_COMPMASK_MTU ) )
> > + {
> > + switch( ib_path_rec_mtu_sel( p_pr ) )
> > + {
> > + case 0: /* must be greater than */
> > + case 2: /* exact match */
> > + if( IB_MTU_LEN_1024 < required_mtu )
> > + return(FALSE);
> > + break;
> > +
> > + case 1: /* must be less than */
> > + case 3: /* largest available */
> > + /* can't be disqualified by this one */
> > + break;
> > +
> > + default:
> > + /* if we're here, there's a bug in ib_path_rec_mtu_sel() */
> > + CL_ASSERT( FALSE );
> > + break;
> > + }
> > + }
> > +
> > + return(TRUE);
> > +}
> > +
> > +/**********************************************************************
> > + **********************************************************************/
> > static ib_api_status_t
> > __osm_pr_rcv_get_path_parms(
> > IN osm_pr_rcv_t* const p_rcv,
> > @@ -191,6 +261,23 @@ __osm_pr_rcv_get_path_parms(
> > mtu = ib_port_info_get_mtu_cap( p_pi );
> > rate = ib_port_info_compute_rate( p_pi );
> >
> > + /*
> > + Mellanox Tavor device performance is better using 1K MTU.
> > + If required MTU and MTU selector are such that 1K is OK
> > + and one of the ends of the path is Tavor we override the
> > + port MTU with 1K.
> > + */
> > + if ( p_rcv->p_subn->opt.enable_quirks &&
> > + __osm_sa_path_rec_apply_tavor_mtu_limit(
> > + p_pr, p_src_port, p_dest_port, comp_mask) )
> > + if (mtu > IB_MTU_LEN_1024)
> > + {
> > + mtu = IB_MTU_LEN_1024;
> > + osm_log( p_rcv->p_log, OSM_LOG_DEBUG,
> > + "__osm_pr_rcv_get_path_parms: "
> > + "Optimized Path MTU to 1K for Mellanox Tavor device\n");
> > + }
> > +
>
> The same is here (about hardcodes).
>
> Also I see that tavor specific functions are pretty similar for PR and
> MPR cases. Why not to share this in something like osm_sa_quirks.c?
I think we can work on this on the trunk and see if there is an OFED 1.1
opening.
-- Hal
> Sasha
>
> > /*
> > Walk the subnet object from source to destination,
> > tracking the most restrictive rate and mtu values along the way...
> > @@ -444,10 +531,10 @@ __osm_pr_rcv_get_path_parms(
> > */
> >
> > /* we silently ignore cases where only the MTU selector is defined */
> > + required_mtu = ib_path_rec_mtu( p_pr );
> > if ( ( comp_mask & IB_PR_COMPMASK_MTUSELEC ) &&
> > ( comp_mask & IB_PR_COMPMASK_MTU ) )
> > {
> > - required_mtu = ib_path_rec_mtu( p_pr );
> > switch( ib_path_rec_mtu_sel( p_pr ) )
> > {
> > case 0: /* must be greater than */
> >
> >
> > _______________________________________________
> > openib-general mailing list
> > openib-general at openib.org
> > http://openib.org/mailman/listinfo/openib-general
> >
> > To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
> >
More information about the general
mailing list