[openib-general] [PATCH] osm: PathRecord prefer 1K MTU for MT23108 devices

Sasha Khapyorsky sashak at voltaire.com
Fri Sep 15 15:17:09 PDT 2006


Hi Eitan,

Some comments about the patch.

On 14:45 Fri 15 Sep     , Eitan Zahavi wrote:
> Hi Hal
> 
> The following patch solves an issue with OpenSM preferring largest MTU 
> for PathRecord/MultiPathRecord for paths going to or from MT23108 (Tavor)
> devices instead of using a 1K MTU which is best for this device.
> 
> Since this is a device specific quirk I have added a configuration option
> named enable_quirks which is FALSE by default to enable this functionality.
> 
> To summarize the functionality change:
> 1. Added enable_quirks option 
> 2. If enable_quirks is FALSE do nothing

I see those quirks are SA specific. Then should this option be called
'enable_sa_quirks' instead?

> 3. If a specific MTU is requested (either =2K or >1K) do nothing
> 4. If either source port or destination port is a Tavor device  
> 	MTU is limited to 1K (can be further reduced by path traversal) 
> 
> Target is both trunk and OFED 1.1
> 
> Thanks
> 
> Eitan
> 
> Signed-off-by:  Eitan Zahavi <eitan at mellanox.co.il>
> 
> Index: include/opensm/osm_subnet.h
> ===================================================================
> --- include/opensm/osm_subnet.h	(revision 9493)
> +++ include/opensm/osm_subnet.h	(working copy)
> @@ -286,6 +286,7 @@ typedef struct _osm_subn_opt
>    osm_qos_options_t        qos_sw0_options;
>    osm_qos_options_t        qos_swe_options;
>    osm_qos_options_t        qos_rtr_options;
> +  boolean_t                enable_quirks;
>  } osm_subn_opt_t;
>  /*
>  * FIELDS
> @@ -469,6 +470,10 @@ typedef struct _osm_subn_opt
>  *	qos_rtr_options
>  *		QoS options for router ports
>  *
> +*  enable_quirks
> +*     Enable high risk new features and not fully qualified 
> +*     hardware specific work arounds
> +*
>  * SEE ALSO
>  *	Subnet object
>  *********/
> Index: include/opensm/osm_base.h
> ===================================================================
> --- include/opensm/osm_base.h	(revision 9493)
> +++ include/opensm/osm_base.h	(working copy)
> @@ -778,6 +778,34 @@ typedef enum _osm_mcast_req_type
>  #define MAX_UPDN_GUID_FILE_LINE_LENGTH 120
>  /**********/
>  
> +/****s* OpenSM: Base/VendorOUIs
> +* NAME
> +*	VendorOUIs
> +*
> +* DESCRIPTION
> +*	Known device vendor ID and GUID OUIs
> +*
> +* SYNOPSIS
> +*/
> +#define OSM_VENDOR_ID_INTEL         0x00D0B7
> +#define OSM_VENDOR_ID_MELLANOX      0x0002C9
> +#define OSM_VENDOR_ID_REDSWITCH     0x000617
> +#define OSM_VENDOR_ID_SILVERSTORM   0x00066A
> +#define OSM_VENDOR_ID_TOPSPIN       0x0005AD
> +#define OSM_VENDOR_ID_FUJITSU       0x00E000
> +#define OSM_VENDOR_ID_FUJITSU2      0x000B5D
> +#define OSM_VENDOR_ID_VOLTAIRE      0x0008F1
> +#define OSM_VENDOR_ID_YOTTAYOTTA    0x000453
> +#define OSM_VENDOR_ID_PATHSCALE     0x001175
> +#define OSM_VENDOR_ID_IBM           0x000255
> +#define OSM_VENDOR_ID_DIVERGENET    0x00084E
> +#define OSM_VENDOR_ID_FLEXTRONICS   0x000B8C
> +#define OSM_VENDOR_ID_AGILENT       0x0030D3
> +#define OSM_VENDOR_ID_OBSIDIAN      0x001777
> +#define OSM_VENDOR_ID_BAYMICRO      0x000BC1
> +#define OSM_VENDOR_ID_LSILOGIC      0x00A0B8
> +/**********/
> +
>  END_C_DECLS
>  
>  #endif	/* _OSM_BASE_H_ */
> Index: opensm/osm_sa_multipath_record.c
> ===================================================================
> --- opensm/osm_sa_multipath_record.c	(revision 9493)
> +++ opensm/osm_sa_multipath_record.c	(working copy)
> @@ -150,6 +150,75 @@ osm_mpr_rcv_init(
>  
>  /**********************************************************************
>   **********************************************************************/
> +static inline boolean_t
> +__osm_sa_multipath_rec_is_tavor_port(
> +	IN const osm_port_t*     const p_port)
> +{
> +	osm_node_t const* p_node;
> +	ib_net32_t vend_id;
> +
> +	p_node = osm_port_get_parent_node( p_port );
> +	vend_id = ib_node_info_get_vendor_id( &p_node->node_info );
> +	
> +	return( (p_node->node_info.device_id == CL_HTON16(23108)) &&
> +			  ((vend_id == CL_HTON32(OSM_VENDOR_ID_MELLANOX)) || 
> +				(vend_id == CL_HTON32(OSM_VENDOR_ID_TOPSPIN)) || 
> +				(vend_id == CL_HTON32(OSM_VENDOR_ID_SILVERSTORM)) || 
> +				(vend_id == CL_HTON32(OSM_VENDOR_ID_VOLTAIRE))));
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
> +boolean_t
> + __osm_sa_multipath_rec_apply_tavor_mtu_limit(
> +  IN const ib_multipath_rec_t*  const p_mpr,
> +  IN const osm_port_t*          const p_src_port,
> +  IN const osm_port_t*          const p_dest_port,
> +  IN const ib_net64_t           comp_mask)
> +{
> +	uint8_t   required_mtu;
> +	
> +	/* only if one of the ports is a Tavor device */
> +	if (! __osm_sa_multipath_rec_is_tavor_port(p_src_port) && 
> +		 ! __osm_sa_multipath_rec_is_tavor_port(p_dest_port) )
> +		return( FALSE );
> +	
> +	/*
> +	  we can apply the patch if either:
> +	  1. No MTU required
> +	  2. Required MTU < 
> +	  3. Required MTU = 1K or 512 or 256
> +	  4. Required MTU > 256 or 512
> +	*/
> +	required_mtu = ib_multipath_rec_mtu( p_mpr );
> +	if ( ( comp_mask & IB_PR_COMPMASK_MTUSELEC ) &&
> +		  ( comp_mask & IB_PR_COMPMASK_MTU ) )

Should here be IB_MPR_COMPMASK_* instead of IB_PR_COMPMASK_*?

> +	{
> +		switch( ib_multipath_rec_mtu_sel( p_mpr ) )
> +		{
> +		case 0:    /* must be greater than */
> +		case 2:    /* exact match */
> +			if( IB_MTU_LEN_1024 < required_mtu )
> +				return(FALSE);
> +			break;
> +
> +		case 1:    /* must be less than */
> +		case 3:    /* largest available */
> +			/* can't be disqualified by this one */
> +			break;
> +			
> +		default:
> +			/* if we're here, there's a bug in ib_path_rec_mtu_sel() */
> +			CL_ASSERT( FALSE );
> +			break;
> +		}
> +	}
> +
> +	return(TRUE);
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
>  static ib_api_status_t
>  __osm_mpr_rcv_get_path_parms(
>    IN osm_mpr_rcv_t*		const p_rcv,
> @@ -195,6 +264,23 @@ __osm_mpr_rcv_get_path_parms(
>    mtu = ib_port_info_get_mtu_cap( p_pi );
>    rate = ib_port_info_compute_rate( p_pi );
>  
> +  /* 
> +	  Mellanox Tavor device performance is better using 1K MTU.
> +	  If required MTU and MTU selector are such that 1K is OK 
> +	  and one of the ends of the path is Tavor we override the
> +	  port MTU with 1K.
> +  */
> +  if ( p_rcv->p_subn->opt.enable_quirks &&
> +		 __osm_sa_multipath_rec_apply_tavor_mtu_limit(
> +			 p_mpr, p_src_port, p_dest_port, comp_mask) )
> +	  if (mtu > IB_MTU_LEN_1024) 
> +	  {
> +		  mtu = IB_MTU_LEN_1024;
> +  		  osm_log( p_rcv->p_log, OSM_LOG_DEBUG,
> +					  "__osm_mpr_rcv_get_path_parms: "
> +					  "Optimized Path MTU to 1K for Mellanox Tavor device\n");
> +	  }
> +

This part is pure hardcode, isn't it? Could this be at least isolated in
single call 'osm_*_do_quirks()' or like thin?

>    if ( comp_mask & IB_MPR_COMPMASK_RAWTRAFFIC &&
>         cl_ntoh32( p_mpr->hop_flow_raw ) & ( 1<<31 ) )
>      required_pkey = osm_physp_find_common_pkey( p_physp, p_dest_physp );
> Index: opensm/osm_subnet.c
> ===================================================================
> --- opensm/osm_subnet.c	(revision 9493)
> +++ opensm/osm_subnet.c	(working copy)
> @@ -494,6 +494,7 @@ osm_subn_set_default_opt(
>    p_opt->ucast_dump_file = NULL;
>    p_opt->updn_guid_file = NULL;
>    p_opt->exit_on_fatal = TRUE;
> +  p_opt->enable_quirks = FALSE;
>    subn_set_default_qos_options(&p_opt->qos_options);
>    subn_set_default_qos_options(&p_opt->qos_ca_options);
>    subn_set_default_qos_options(&p_opt->qos_sw0_options);
> @@ -979,6 +980,10 @@ osm_subn_parse_conf_file(
>        subn_parse_qos_options("qos_rtr",
>          p_key, p_val, &p_opts->qos_rtr_options);
>  
> +      __osm_subn_opts_unpack_boolean(
> +        "enable_quirks",
> +        p_key, p_val, &p_opts->enable_quirks);
> +
>      }
>    }
>    fclose(opts_file);
> @@ -1179,11 +1184,15 @@ osm_subn_write_conf_file(
>      "force_log_flush %s\n\n"
>      "# Log file to be used\n"
>      "log_file %s\n\n"
> +	 "# Limit the the size of the log file. If overrun log is restarted\n"
>      "log_max_size %lu\n\n"
> +	 "# If TRUE will accumulate the log over multiple OpenSM sessions\n"
>      "accum_log_file %s\n\n"
>      "# The directory to hold the file OpenSM dumps\n"
>      "dump_files_dir %s\n\n"
> -    "# If TRUE if OpenSM should disable multicast support\n"
> +	 "# If TRUE enables new high risk options and hardware specific quirks\n"
> +	 "enable_quirks %s\n\n"
> +    "# If TRUE OpenSM should disable multicast support\n"  
>      "no_multicast_option %s\n\n"
>      "# No multicast routing is performed if TRUE\n"
>      "disable_multicast %s\n\n"
> @@ -1195,6 +1204,7 @@ osm_subn_write_conf_file(
>      p_opts->log_max_size,
>      p_opts->accum_log_file ? "TRUE" : "FALSE",
>      p_opts->dump_files_dir,
> +    p_opts->enable_quirks ? "TRUE" : "FALSE",
>      p_opts->no_multicast_option ? "TRUE" : "FALSE",
>      p_opts->disable_multicast ? "TRUE" : "FALSE",
>      p_opts->exit_on_fatal ? "TRUE" : "FALSE"
> Index: opensm/osm_helper.c
> ===================================================================
> --- opensm/osm_helper.c	(revision 9493)
> +++ opensm/osm_helper.c	(working copy)
> @@ -2289,24 +2289,6 @@ osm_get_node_type_str_fixed_width(
>    return( __osm_node_type_str_fixed_width[node_type] );
>  }
>  
> -#define OSM_VENDOR_ID_INTEL         0x00D0B7
> -#define OSM_VENDOR_ID_MELLANOX      0x0002C9
> -#define OSM_VENDOR_ID_REDSWITCH     0x000617
> -#define OSM_VENDOR_ID_SILVERSTORM     0x00066A
> -#define OSM_VENDOR_ID_TOPSPIN    0x0005AD
> -#define OSM_VENDOR_ID_FUJITSU    0x00E000
> -#define OSM_VENDOR_ID_FUJITSU2      0x000B5D
> -#define OSM_VENDOR_ID_VOLTAIRE   0x0008F1
> -#define OSM_VENDOR_ID_YOTTAYOTTA    0x000453
> -#define OSM_VENDOR_ID_PATHSCALE     0x001175
> -#define OSM_VENDOR_ID_IBM           0x000255
> -#define OSM_VENDOR_ID_DIVERGENET    0x00084E
> -#define OSM_VENDOR_ID_FLEXTRONICS   0x000B8C
> -#define OSM_VENDOR_ID_AGILENT       0x0030D3
> -#define OSM_VENDOR_ID_OBSIDIAN      0x001777
> -#define OSM_VENDOR_ID_BAYMICRO      0x000BC1
> -#define OSM_VENDOR_ID_LSILOGIC      0x00A0B8
> -
>  /**********************************************************************
>   **********************************************************************/
>  const char*
> Index: opensm/osm_sa_path_record.c
> ===================================================================
> --- opensm/osm_sa_path_record.c	(revision 9493)
> +++ opensm/osm_sa_path_record.c	(working copy)
> @@ -57,6 +57,7 @@
>  #include <complib/cl_passivelock.h>
>  #include <complib/cl_debug.h>
>  #include <complib/cl_qlist.h>
> +#include <opensm/osm_base.h>
>  #include <opensm/osm_sa_path_record.h>
>  #include <opensm/osm_port.h>
>  #include <opensm/osm_node.h>
> @@ -150,6 +151,75 @@ osm_pr_rcv_init(
>  
>  /**********************************************************************
>   **********************************************************************/
> +static inline boolean_t
> +__osm_sa_path_rec_is_tavor_port(
> +	IN const osm_port_t*     const p_port)
> +{
> +	osm_node_t const* p_node;
> +	ib_net32_t vend_id;
> +
> +	p_node = osm_port_get_parent_node( p_port );
> +	vend_id = ib_node_info_get_vendor_id( &p_node->node_info );
> +	
> +	return( (p_node->node_info.device_id == CL_HTON16(23108)) &&
> +			  ((vend_id == CL_HTON32(OSM_VENDOR_ID_MELLANOX)) || 
> +				(vend_id == CL_HTON32(OSM_VENDOR_ID_TOPSPIN)) || 
> +				(vend_id == CL_HTON32(OSM_VENDOR_ID_SILVERSTORM)) || 
> +				(vend_id == CL_HTON32(OSM_VENDOR_ID_VOLTAIRE))));
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
> +static boolean_t
> + __osm_sa_path_rec_apply_tavor_mtu_limit(
> +  IN const ib_path_rec_t*  const p_pr,
> +  IN const osm_port_t*     const p_src_port,
> +  IN const osm_port_t*     const p_dest_port,
> +  IN const ib_net64_t      comp_mask)
> +{
> +	uint8_t   required_mtu;
> +	
> +	/* only if one of the ports is a Tavor device */
> +	if (! __osm_sa_path_rec_is_tavor_port(p_src_port) && 
> +		 ! __osm_sa_path_rec_is_tavor_port(p_dest_port) )
> +		return( FALSE );
> +	
> +	/*
> +	  we can apply the patch if either:
> +	  1. No MTU required
> +	  2. Required MTU < 
> +	  3. Required MTU = 1K or 512 or 256
> +	  4. Required MTU > 256 or 512
> +	*/
> +	required_mtu = ib_path_rec_mtu( p_pr );
> +	if ( ( comp_mask & IB_PR_COMPMASK_MTUSELEC ) &&
> +		  ( comp_mask & IB_PR_COMPMASK_MTU ) )
> +	{
> +		switch( ib_path_rec_mtu_sel( p_pr ) )
> +		{
> +		case 0:    /* must be greater than */
> +		case 2:    /* exact match */
> +			if( IB_MTU_LEN_1024 < required_mtu )
> +				return(FALSE);
> +			break;
> +
> +		case 1:    /* must be less than */
> +		case 3:    /* largest available */
> +			/* can't be disqualified by this one */
> +			break;
> +			
> +		default:
> +			/* if we're here, there's a bug in ib_path_rec_mtu_sel() */
> +			CL_ASSERT( FALSE );
> +			break;
> +		}
> +	}
> +
> +	return(TRUE);
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
>  static ib_api_status_t
>  __osm_pr_rcv_get_path_parms(
>    IN osm_pr_rcv_t*         const p_rcv,
> @@ -191,6 +261,23 @@ __osm_pr_rcv_get_path_parms(
>    mtu = ib_port_info_get_mtu_cap( p_pi );
>    rate = ib_port_info_compute_rate( p_pi );
>  
> +  /* 
> +	  Mellanox Tavor device performance is better using 1K MTU.
> +	  If required MTU and MTU selector are such that 1K is OK 
> +	  and one of the ends of the path is Tavor we override the
> +	  port MTU with 1K.
> +  */
> +  if (  p_rcv->p_subn->opt.enable_quirks &&
> +		  __osm_sa_path_rec_apply_tavor_mtu_limit(
> +			  p_pr, p_src_port, p_dest_port, comp_mask) )
> +	  if (mtu > IB_MTU_LEN_1024) 
> +	  {
> +		  mtu = IB_MTU_LEN_1024;
> +		  osm_log( p_rcv->p_log, OSM_LOG_DEBUG,
> +					  "__osm_pr_rcv_get_path_parms: "
> +					  "Optimized Path MTU to 1K for Mellanox Tavor device\n");
> +	  }
> +

The same is here (about hardcodes).

Also I see that tavor specific functions are pretty similar for PR and
MPR cases. Why not to share this in something like osm_sa_quirks.c?

Sasha

>    /*
>      Walk the subnet object from source to destination,
>      tracking the most restrictive rate and mtu values along the way...
> @@ -444,10 +531,10 @@ __osm_pr_rcv_get_path_parms(
>    */
>  
>    /* we silently ignore cases where only the MTU selector is defined */
> +  required_mtu = ib_path_rec_mtu( p_pr );
>    if ( ( comp_mask & IB_PR_COMPMASK_MTUSELEC ) &&
>         ( comp_mask & IB_PR_COMPMASK_MTU ) )
>    {
> -    required_mtu = ib_path_rec_mtu( p_pr );
>      switch( ib_path_rec_mtu_sel( p_pr ) )
>      {
>      case 0:    /* must be greater than */
> 
> 
> _______________________________________________
> openib-general mailing list
> openib-general at openib.org
> http://openib.org/mailman/listinfo/openib-general
> 
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
> 




More information about the general mailing list