[openib-general] Re: [PATCH] Opensm - duplicated guids issue

Hal Rosenstock halr at voltaire.com
Mon Dec 5 06:14:01 PST 2005


Hi Yael,

On Sun, 2005-12-04 at 08:02, Yael Kalka wrote:
> Hi Hal,
>
> Currently if OpenSM discovers duplicated guids

What is the cause of a duplicated GUID? Is it a misconfiguration of
someone's firmware (rather than some error on the part of OpenSM)? If
so, I'm not sure exiting the SM is the best option. IMO the policy is to
decide which GUID to "honor" (either the original one or the new one).

> or 12x link with lane reversal badly configured

What does "badly configured" mean? Does it mean the link does not come up
at all, or just in some undesired mode? How is "bad lane reversal"
reconfigured?

Can't this occur on a 4x link as well?

>  it only issues an error to the log
> file. This issue, though, is much more problematic, since it will cause
> part of the subnet to be un-initialized.
> The following patch includes a fuller handling of the issue - first,
> issue an error message to the /var/log/messages file as well.

I am incorporating this part of the patch.

> Second - add an option flag to the SM that will define whether or not
> to exit in such a case.

Also, there are other scenarios which mark the subnet initialization as
failed (but don't exit the SM). This seems inconsistent to me. These
cases also do not put errors out on syslog. Should they ?

IMO, in general, exiting out of OpenSM should be avoided at all costs.
The admin can always cause this to occur if desired and operating part
of the subnet is better than none. Are these cases where the admin would
not want to run the SM until the issues were resolved ?

-- Hal

> Thanks,
> Yael
>
> Signed-off-by:  Yael Kalka <yael at mellanox.co.il>
>
> Index: include/opensm/osm_subnet.h
> ===================================================================
> --- include/opensm/osm_subnet.h       (revision 4288)
> +++ include/opensm/osm_subnet.h       (working copy)
> @@ -235,6 +235,7 @@ typedef struct _osm_subn_opt
>    osm_testability_modes_t  testability_mode;
>    boolean_t                updn_activate;
>    char *                   updn_guid_file;
> +  boolean_t                exit_on_fatal;
>  } osm_subn_opt_t;
>  /*
>  * FIELDS
> @@ -372,6 +373,13 @@ typedef struct _osm_subn_opt
>  *  updn_guid_file
>  *     Pointer to name of the UPDN guid file given by User
>  *
> +*  exit_on_fatal
> +*     If TRUE (default) - SM will exit on fatal subnet initialization issues.
> +*     If FALSE - SM will not exit.
> +*     Fatal initialization issues:
> +*     a. SM recognizes 2 different nodes with the same guid, or 12x link with
> +*        lane reversal badly configured.
> +*
>  * SEE ALSO
>  *    Subnet object
>  *********/
> Index: opensm/osm_subnet.c
> ===================================================================
> --- opensm/osm_subnet.c       (revision 4288)
> +++ opensm/osm_subnet.c       (working copy)
> @@ -440,6 +440,7 @@ osm_subn_set_default_opt(
>    p_opt->testability_mode = OSM_TEST_MODE_NONE;
>    p_opt->updn_activate = FALSE;
>    p_opt->updn_guid_file = NULL;
> +  p_opt->exit_on_fatal = TRUE;
>  }
> 
>  /**********************************************************************
> @@ -765,6 +766,10 @@ osm_subn_parse_conf_file(
>        __osm_subn_opts_unpack_charp(
>          "updn_guid_file" ,
>          p_key, p_val, &p_opts->updn_guid_file);
> +
> +      __osm_subn_opts_unpack_boolean(
> +        "exit_on_fatal",
> +        p_key, p_val, &p_opts->exit_on_fatal);
>      }
>    }
>    fclose(opts_file);
> @@ -930,14 +935,17 @@ osm_subn_write_conf_file(
>      "# If TRUE if OpenSM should disable multicast support\n"
>      "no_multicast_option %s\n\n"
>      "# No multicast routing is performed if TRUE\n"
> -    "disable_multicast %s\n\n",
> +    "disable_multicast %s\n\n"
> +    "# If TRUE opensm will exit on fatal initialization issues\n"
> +    "exit_on_fatal %s\n\n",
>      p_opts->log_flags,
>      p_opts->force_log_flush ? "TRUE" : "FALSE",
>      p_opts->log_file,
>      p_opts->accum_log_file ? "TRUE" : "FALSE",
>      p_opts->dump_files_dir,
>      p_opts->no_multicast_option ? "TRUE" : "FALSE",
> -    p_opts->disable_multicast ? "TRUE" : "FALSE"
> +    p_opts->disable_multicast ? "TRUE" : "FALSE",
> +    p_opts->exit_on_fatal ? "TRUE" : "FALSE"
>      );
>   
>    /* optional string attributes ... */
> Index: opensm/osm_node_info_rcv.c
> ===================================================================
> --- opensm/osm_node_info_rcv.c        (revision 4288)
> +++ opensm/osm_node_info_rcv.c        (working copy)
> @@ -198,6 +198,14 @@ __osm_ni_rcv_set_links(
>                       p_ni_context->port_num,
>                       dr_new_path
>                       );
> +
> +            osm_log( p_rcv->p_log, OSM_LOG_SYS,
> +                     "Errors on subnet. SM found duplicated guids or 12x "
> +                     "link with lane reversal badly configured. "
> +                     "Use osm log for more details.\n");
> +
> +            if ( p_rcv->p_subn->opt.exit_on_fatal == TRUE )
> +              exit( 1 );
>            }
> 
>            /*
> Index: opensm/main.c
> ===================================================================
> --- opensm/main.c     (revision 4288)
> +++ opensm/main.c     (working copy)
> @@ -178,6 +178,12 @@ show_usage(void)
>            "          This option will cause deletion of the log file\n"
>            "          (if it previously exists). By default, the log file\n"
>            "          is accumulative.\n\n");
> +  printf( "-y\n"
> +          "--stay_on_fatal\n"
> +          "          This option will cause SM not to exit on fatal initialization\n"
> +          "          issues: If SM discovers duplicated guids or 12x link with\n"
> +          "          lane reversal badly configured.\n"
> +          "          By default, the SM will exit.\n\n");
>    printf( "-v\n"
>            "--verbose\n"
>            "          This option increases the log verbosity level.\n"
> @@ -460,7 +466,7 @@ main(
>    boolean_t             cache_options = FALSE;
>    char                 *ignore_guids_file_name = NULL;
>    uint32_t              val;
> -  const char * const    short_option = "i:f:ed:g:l:s:t:a:uvVhorc";
> +  const char * const    short_option = "i:f:ed:g:l:s:t:a:uvVhorcy";
> 
>    /*
>      In the array below, the 2nd parameter specified the number
> @@ -492,6 +498,7 @@ main(
>        {  "updn",          0, NULL, 'u'},
>        {  "add_guid_file", 1, NULL, 'a'},
>        {  "cache-options", 0, NULL, 'c'},
> +      {  "stay_on_fatal", 0, NULL, 'y'},
>        {  NULL,            0, NULL,  0 }  /* Required at the end of the array */
>      };
> 
> @@ -665,6 +672,11 @@ main(
>        printf(" Creating new log file\n");
>        break;
> 
> +    case 'y':
> +      opt.exit_on_fatal = FALSE;
> +      printf(" Staying on fatal initialization\n");
> +      break;
> +
>      case 'v':
>        log_flags = (log_flags <<1 )|1;
>        printf(" Verbose option -v (log flags = 0x%X)\n", log_flags );
>






More information about the general mailing list