[ofa-general] ***SPAM*** Re: [PATCHv2] opensm/PerfMgr: Better redirection support
Sasha Khapyorsky
sashak at voltaire.com
Wed Apr 15 05:59:25 PDT 2009
Hi Hal,
On 15:21 Thu 12 Mar , Hal Rosenstock wrote:
>
> Handle PKey and QPN redirection information
> GID redirection handling remains
>
> Signed-off-by: Hal Rosenstock <hal.rosenstock at gmail.com>
>
> ---
> Changes since v1:
> Added include of osm_helper.h to osm_perfmgr.c
>
> Notes: osm_console redirection patch is unchanged
> Also, relies on ib_gid_is_notzero patch
>
> diff --git a/opensm/include/opensm/osm_perfmgr.h b/opensm/include/opensm/osm_perfmgr.h
> index 45dec54..651ea80 100644
> --- a/opensm/include/opensm/osm_perfmgr.h
> +++ b/opensm/include/opensm/osm_perfmgr.h
> @@ -92,8 +92,14 @@ typedef enum {
>
> /* Redirection information */
> typedef struct redir {
> + boolean_t redirection;
> + boolean_t invalid;
Why using lid value != 0 is/was bad for redirection invalidation?
> + ib_gid_t redir_gid;
> ib_net16_t redir_lid;
> + ib_net16_t redir_pkey;
> ib_net32_t redir_qp;
> + uint16_t redir_pkey_ix;
Don't need to repeat structure name (redir) in field names - it just
makes lines longer - 'redir->pkey_idx' looks pretty clear.
> + ib_net16_t orig_lid;
> } redir_t;
>
> /* Node to store information about which nodes we are monitoring */
> @@ -134,6 +140,7 @@ typedef struct osm_perfmgr {
> uint32_t max_outstanding_queries;
> cl_qmap_t monitored_map; /* map the nodes we are tracking */
> __monitored_node_t *remove_list;
> + int16_t local_port;
> } osm_perfmgr_t;
> /*
> * FIELDS
> diff --git a/opensm/opensm/osm_perfmgr.c b/opensm/opensm/osm_perfmgr.c
> index 4a6f65c..fc1b7cb 100644
> --- a/opensm/opensm/osm_perfmgr.c
> +++ b/opensm/opensm/osm_perfmgr.c
> @@ -47,7 +47,6 @@
> #endif /* HAVE_CONFIG_H */
>
> #ifdef ENABLE_OSM_PERF_MGR
> -
> #include <stdlib.h>
> #include <stdint.h>
> #include <string.h>
> @@ -65,8 +64,11 @@
> #include <opensm/osm_log.h>
> #include <opensm/osm_node.h>
> #include <opensm/osm_opensm.h>
> +#include <opensm/osm_helper.h>
>
> #define OSM_PERFMGR_INITIAL_TID_VALUE 0xcafe
> +#define MAX_LOCAL_IBPORTS 64
> +#define MAX_LOCAL_PKEYS 256
>
> #if ENABLE_OSM_PERF_MGR_PROFILE
> struct {
> @@ -118,8 +120,6 @@ static inline void diff_time(struct timeval *before,
> }
> #endif
>
> -extern int wait_for_pending_transactions(osm_stats_t * stats);
> -
> /**********************************************************************
> * Internal helper functions.
> **********************************************************************/
> @@ -203,6 +203,7 @@ osm_perfmgr_mad_send_err_callback(void *bind_context, osm_madw_t * p_madw)
> uint8_t port = context->perfmgr_context.port;
> cl_map_item_t *p_node;
> __monitored_node_t *p_mon_node;
> + ib_net16_t orig_lid;
>
> OSM_LOG_ENTER(pm->log);
>
> @@ -233,9 +234,10 @@ osm_perfmgr_mad_send_err_callback(void *bind_context, osm_madw_t * p_madw)
> p_mon_node->guid, p_mon_node->redir_tbl_size);
> goto Exit;
> }
> - /* Clear redirection info */
> - p_mon_node->redir_port[port].redir_lid = 0;
> - p_mon_node->redir_port[port].redir_qp = 0;
> + /* Clear redirection info for this port except orig_lid */
> + orig_lid = p_mon_node->redir_port[port].orig_lid;
> + memset(&p_mon_node->redir_port[port], 0, sizeof(redir_t));
> + p_mon_node->redir_port[port].orig_lid = orig_lid;
Hmm, why should 'orig_lid' be part of redirection structure and not
placed on original node/port (below I see that it is used in
non-redirected paths)?
I think it would be better to use structures like:
struct node {
....
uint16_t lid;
uint16_t pkey_ix;
unsigned num_ports;
struct port {
....
struct redir_info { /* or even same 'struct port' */
...
} *ri;
} ports[0];
}
> cl_plock_release(pm->lock);
> }
>
> @@ -292,6 +294,40 @@ osm_perfmgr_bind(osm_perfmgr_t * const pm, const ib_net64_t port_guid)
> goto Exit;
> }
>
> + /* if redirection enabled, determine local port from port GUID */
> + if (pm->subn->opt.perfmgr_redir) {
> + ib_port_attr_t attr_array[MAX_LOCAL_IBPORTS];
> + uint32_t num_ports = MAX_LOCAL_IBPORTS;
> + int i;
> +
> + for (i = 0; i < num_ports; i++) {
> + attr_array[i].num_pkeys = 0;
> + attr_array[i].p_pkey_table = NULL;
> + }
> +
> + /* call transport layer for a list of local port GUIDs */
> + status = osm_vendor_get_all_port_attr(pm->subn->p_osm->p_vendor,
> + attr_array, &num_ports);
> + if (status != IB_SUCCESS) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C1C: osm_vendor_get_all_port_attr status 0x%x\n",
> + status);
> + goto Exit;
> + }
> + if (num_ports == 0) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C1D: No local ports detected!\n");
> + goto Exit;
> + }
> +
> + for (i = 0; i < num_ports; i++) {
> + if (port_guid == attr_array[i].port_guid) {
> + pm->local_port = attr_array[i].port_num;
> + break;
> + }
> + }
> + }
> +
PerfMgr is always running over discovered fabric so maybe local port
number should be detected later at start of PerfMgr process cycle just
using OpenSM DB. Also see comment below about pkey validation per
redirection request.
> Exit:
> OSM_LOG_EXIT(pm->log);
> return (status);
> @@ -321,25 +357,16 @@ static ib_net32_t get_qp(__monitored_node_t * mon_node, uint8_t port)
>
> if (mon_node && mon_node->redir_tbl_size &&
> port < mon_node->redir_tbl_size &&
> - mon_node->redir_port[port].redir_lid &&
> + mon_node->redir_port[port].redirection &&
> mon_node->redir_port[port].redir_qp)
> qp = mon_node->redir_port[port].redir_qp;
>
> return qp;
> }
>
> -/**********************************************************************
> - * Given a node, a port, and an optional monitored node,
> - * return the appropriate lid to query that port
> - **********************************************************************/
> static ib_net16_t
> -get_lid(osm_node_t * p_node, uint8_t port, __monitored_node_t * mon_node)
> +get_base_lid(osm_node_t * p_node, uint8_t port)
> {
> - if (mon_node && mon_node->redir_tbl_size &&
> - port < mon_node->redir_tbl_size &&
> - mon_node->redir_port[port].redir_lid)
> - return mon_node->redir_port[port].redir_lid;
> -
> switch (p_node->node_info.node_type) {
> case IB_NODE_TYPE_CA:
> case IB_NODE_TYPE_ROUTER:
> @@ -352,12 +379,27 @@ get_lid(osm_node_t * p_node, uint8_t port, __monitored_node_t * mon_node)
> }
>
> /**********************************************************************
> + * Given a node, a port, and an optional monitored node,
> + * return the lid appropriate to query that port
> + **********************************************************************/
> +static ib_net16_t
> +get_lid(osm_node_t * p_node, uint8_t port, __monitored_node_t * mon_node)
> +{
> + if (mon_node && mon_node->redir_tbl_size &&
> + port < mon_node->redir_tbl_size &&
> + mon_node->redir_port[port].redir_lid)
> + return mon_node->redir_port[port].redir_lid;
> +
> + return get_base_lid(p_node, port);
> +}
> +
> +/**********************************************************************
> * Form and send the Port Counters MAD for a single port.
> **********************************************************************/
> static ib_api_status_t
> osm_perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr, ib_net16_t dest_lid,
> - ib_net32_t dest_qp, uint8_t port, uint8_t mad_method,
> - osm_madw_context_t * const p_context)
> + ib_net32_t dest_qp, uint16_t pkey_ix, uint8_t port,
> + uint8_t mad_method, osm_madw_context_t * const p_context)
> {
> ib_api_status_t status = IB_SUCCESS;
> ib_port_counters_t *port_counter = NULL;
> @@ -396,8 +438,7 @@ osm_perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr, ib_net16_t dest_lid,
> p_madw->mad_addr.addr_type.gsi.remote_qp = dest_qp;
> p_madw->mad_addr.addr_type.gsi.remote_qkey =
> cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY);
> - /* FIXME what about other partitions */
> - p_madw->mad_addr.addr_type.gsi.pkey_ix = 0;
> + p_madw->mad_addr.addr_type.gsi.pkey_ix = pkey_ix;
> p_madw->mad_addr.addr_type.gsi.service_level = 0;
> p_madw->mad_addr.addr_type.gsi.global_route = FALSE;
> p_madw->resp_expected = TRUE;
> @@ -432,28 +473,32 @@ static void __collect_guids(cl_map_item_t * const p_map_item, void *context)
> uint64_t node_guid = cl_ntoh64(node->node_info.node_guid);
> osm_perfmgr_t *pm = (osm_perfmgr_t *) context;
> __monitored_node_t *mon_node = NULL;
> - uint32_t size;
> + uint32_t num_ports;
> + int port;
>
> OSM_LOG_ENTER(pm->log);
>
> if (cl_qmap_get(&pm->monitored_map, node_guid)
> == cl_qmap_end(&pm->monitored_map)) {
> /* if not already in our map add it */
> - size = osm_node_get_num_physp(node);
> - mon_node = malloc(sizeof(*mon_node) + sizeof(redir_t) * size);
> + num_ports = osm_node_get_num_physp(node);
> + mon_node = malloc(sizeof(*mon_node) + sizeof(redir_t) * num_ports);
> if (!mon_node) {
> OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C06: "
> "malloc failed: not handling node %s"
> "(GUID 0x%" PRIx64 ")\n", node->print_desc, node_guid);
> goto Exit;
> }
> - memset(mon_node, 0, sizeof(*mon_node) + sizeof(redir_t) * size);
> + memset(mon_node, 0, sizeof(*mon_node) + sizeof(redir_t) * num_ports);
> mon_node->guid = node_guid;
> mon_node->name = strdup(node->print_desc);
> - mon_node->redir_tbl_size = size;
> + mon_node->redir_tbl_size = num_ports;
> /* check for enhanced switch port 0 */
> mon_node->esp0 = (node->sw &&
> ib_switch_info_is_enhanced_port0(&node->sw->switch_info));
> + for (port = (mon_node->esp0) ? 0 : 1; port < num_ports; port++)
> + mon_node->redir_port[port].orig_lid = get_base_lid(node, port);
> +
> cl_qmap_insert(&(pm->monitored_map), node_guid,
> (cl_map_item_t *) mon_node);
> }
> @@ -511,6 +556,10 @@ __osm_perfmgr_query_counters(cl_map_item_t * const p_map_item, void *context)
> if (!osm_node_get_physp_ptr(node, port))
> continue;
>
> + if (mon_node->redir_port[port].redirection &&
> + mon_node->redir_port[port].invalid)
> + continue;
> +
Are two flags really needed? Couldn't this be stripped down?
Also what about letting "chance" for port to refresh redirection info?
> lid = get_lid(node, port, mon_node);
> if (lid == 0) {
> OSM_LOG(pm->log, OSM_LOG_DEBUG, "WARN: node 0x%" PRIx64
> @@ -532,8 +581,10 @@ __osm_perfmgr_query_counters(cl_map_item_t * const p_map_item, void *context)
> PRIx64 " port %d (lid %u) (%s)\n", node_guid, port,
> cl_ntoh16(lid), node->print_desc);
> status =
> - osm_perfmgr_send_pc_mad(pm, lid, remote_qp, port,
> - IB_MAD_METHOD_GET, &mad_context);
> + osm_perfmgr_send_pc_mad(pm, lid, remote_qp,
> + mon_node->redir_port[port].redir_pkey_ix,
> + port, IB_MAD_METHOD_GET,
> + &mad_context);
> if (status != IB_SUCCESS)
> OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C09: "
> "Failed to issue port counter query for node 0x%"
> @@ -550,6 +601,7 @@ Exit:
> * Discovery stuff.
> * Basically this code should not be here, but merged with main OpenSM
> **********************************************************************/
> +extern int wait_for_pending_transactions(osm_stats_t * stats);
> extern void osm_drop_mgr_process(IN osm_sm_t *sm);
>
> static int sweep_hop_1(osm_sm_t * sm)
> @@ -980,6 +1032,10 @@ osm_perfmgr_check_overflow(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
> osm_node_t *p_node = NULL;
> ib_net16_t lid = 0;
>
> + if (mon_node->redir_port[port].redirection &&
> + mon_node->redir_port[port].invalid)
> + goto Exit;
> +
> osm_log(pm->log, OSM_LOG_VERBOSE,
> "PerfMgr: Counter overflow: %s (0x%" PRIx64
> ") port %d; clearing counters\n",
> @@ -1004,8 +1060,10 @@ osm_perfmgr_check_overflow(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
> mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET;
> /* clear port counters */
> status =
> - osm_perfmgr_send_pc_mad(pm, lid, remote_qp, port,
> - IB_MAD_METHOD_SET, &mad_context);
> + osm_perfmgr_send_pc_mad(pm, lid, remote_qp,
> + mon_node->redir_port[port].redir_pkey_ix,
> + port, IB_MAD_METHOD_SET,
> + &mad_context);
> if (status != IB_SUCCESS)
> OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C11: "
> "Failed to send clear counters MAD for %s (0x%"
> @@ -1063,6 +1121,73 @@ osm_perfmgr_log_events(osm_perfmgr_t * pm, __monitored_node_t *mon_node, uint8_t
> time_diff, mon_node->name, mon_node->guid, port);
> }
>
> +static boolean_t validate_redir_pkey(osm_perfmgr_t *pm, ib_net16_t pkey,
> + uint16_t *pkey_ix)
> +{
This function can just return pkey index value (or negative in case of
failure).
> + ib_port_attr_t attr_array[MAX_LOCAL_IBPORTS];
> + uint32_t num_ports = MAX_LOCAL_IBPORTS;
> + ib_api_status_t status;
> + boolean_t sts = FALSE;
> + uint16_t i = 0;
> +
> + OSM_LOG_ENTER(pm->log);
> +
> + for (i = 0; i < num_ports; i++) {
> + attr_array[i].num_pkeys = 0;
> + attr_array[i].p_pkey_table = NULL;
> + }
> +
> + /* If local port couldn't be determined previously */
> + if (pm->local_port == -1)
> + goto not_found;
> +
> + attr_array[pm->local_port].num_pkeys = MAX_LOCAL_PKEYS;
> + attr_array[pm->local_port].p_pkey_table =
> + malloc(MAX_LOCAL_PKEYS * sizeof(ib_net16_t));
> + if (!attr_array[pm->local_port].p_pkey_table) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C20: No memory for port %d pkey table\n",
> + pm->local_port);
> + goto not_found;
> + }
> +
> + /* call the transport layer for a list of local port pkeys */
> + status = osm_vendor_get_all_port_attr(pm->subn->p_osm->p_vendor,
> + attr_array, &num_ports);
This heavy stuff is performed per redirection request, but it actually
uses same data (local port's pkey table). This looks very inefficient.
Instead of doing this you can get local port's pkey table only once at
PerfMgr process cycle start and do all checks against this already
initialized table. Of course only in case when redirection is enabled at
all.
> + if (status != IB_SUCCESS) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C1E: osm_vendor_get_all_port_attr status 0x%x\n",
> + status);
> + goto not_found;
> + }
> + if (num_ports == 0 || pm->local_port > num_ports) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C1F: No local ports detected or local port out of range!\n");
> + goto not_found;
> + }
> + ib_net16_t *pkey_table = attr_array[pm->local_port].p_pkey_table;
> + for (i = 0; i < attr_array[pm->local_port].num_pkeys; i++)
> + if (pkey_table[i] == pkey)
> + break;
> + if (i == attr_array[pm->local_port].num_pkeys) {
> + i = 0;
> + goto not_found;
> + }
> + free(attr_array[pm->local_port].p_pkey_table);
> + sts = TRUE;
> + goto Exit;
> +
> +not_found:
> + if (attr_array[pm->local_port].p_pkey_table)
> + free(attr_array[pm->local_port].p_pkey_table);
> + sts = FALSE;
> +Exit:
> + if (pkey_ix)
> + *pkey_ix = i;
> + OSM_LOG_EXIT(pm->log);
> + return sts;
> +}
> +
> /**********************************************************************
> * The dispatcher uses a thread pool which will call this function when
> * we have a thread available to process our mad received from the wire.
> @@ -1082,6 +1207,8 @@ static void osm_pc_rcv_process(void *context, void *data)
> perfmgr_db_data_cnt_reading_t data_reading;
> cl_map_item_t *p_node;
> __monitored_node_t *p_mon_node;
> + uint16_t pkey_ix;
> + boolean_t invalid = FALSE;
>
> OSM_LOG_ENTER(pm->log);
>
> @@ -1105,7 +1232,8 @@ static void osm_pc_rcv_process(void *context, void *data)
> p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO);
>
> /* Response could also be redirection (IBM eHCA PMA does this) */
> - if (p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) {
> + if (p_mad->status & IB_MAD_STATUS_REDIRECT &&
> + p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) {
> char gid_str[INET6_ADDRSTRLEN];
> ib_class_port_info_t *cpi =
> (ib_class_port_info_t *) &
> @@ -1119,17 +1247,48 @@ static void osm_pc_rcv_process(void *context, void *data)
> sizeof gid_str),
> cl_ntoh32(cpi->redir_qp));
>
> - /* LID or GID redirection ? */
> - /* For GID redirection, need to get PathRecord from SA */
> + /* valid redirection ? */
> if (cpi->redir_lid == 0) {
> - OSM_LOG(pm->log, OSM_LOG_VERBOSE,
> - "GID redirection not currently implemented!\n");
> - goto Exit;
> + if (!ib_gid_is_notzero(&cpi->redir_gid)) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C17: Invalid redirection "
> + "(both redirect LID and GID are zero)\n");
> + invalid = TRUE;
> + }
> + }
> + if (cpi->redir_qp == 0) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C18: Invalid RedirectQP\n");
> + invalid = TRUE;
> + }
> + if (cpi->redir_pkey == 0) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C19: Invalid RedirectP_Key\n");
> + invalid = TRUE;
> + }
> + if (cpi->redir_qkey != IB_QP1_WELL_KNOWN_Q_KEY) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C1A: Invalid RedirectQ_Key\n");
> + invalid = TRUE;
> + }
> +
> + if (!validate_redir_pkey(pm, cpi->redir_pkey, &pkey_ix)) {
> + OSM_LOG(pm->log, OSM_LOG_ERROR,
> + "ERR 4C1B: Index for Pkey 0x%x not found\n",
> + cl_ntoh16(cpi->redir_pkey));
> + invalid = TRUE;
> }
All above are not OpenSM errors, but wrong external data. I think it
should be logged as VERBOSE messages.
>
> if (!pm->subn->opt.perfmgr_redir) {
> - OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: "
> - "redirection requested but disabled\n");
> + OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: "
> + "redirection requested but disabled\n");
> + invalid = TRUE;
> + }
This is not an error.
BTW, why to bother with verifying redirection info when redirection
support is disabled anyway?
> +
> + if (cpi->redir_lid == 0) {
> + /* GID redirection: get PathRecord information */
> + OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C21: "
> + "GID redirection not currently supported\n");
> goto Exit;
> }
>
> @@ -1144,14 +1303,23 @@ static void osm_pc_rcv_process(void *context, void *data)
> p_mon_node->redir_tbl_size);
> goto Exit;
> }
> + p_mon_node->redir_port[port].redirection = TRUE;
> + p_mon_node->redir_port[port].invalid = invalid;
> + memcpy(&p_mon_node->redir_port[port].redir_gid,
> + &cpi->redir_gid, sizeof(ib_gid_t));
> p_mon_node->redir_port[port].redir_lid = cpi->redir_lid;
> p_mon_node->redir_port[port].redir_qp = cpi->redir_qp;
> + p_mon_node->redir_port[port].redir_pkey = cpi->redir_pkey;
> + p_mon_node->redir_port[port].redir_pkey_ix = pkey_ix;
> cl_plock_release(pm->lock);
>
> + if (invalid)
> + goto Exit;
> +
> /* Finally, reissue the query to the redirected location */
> status =
> osm_perfmgr_send_pc_mad(pm, cpi->redir_lid, cpi->redir_qp,
> - port,
> + pkey_ix, port,
> mad_context->perfmgr_context.
> mad_method, mad_context);
> if (status != IB_SUCCESS)
> @@ -1234,6 +1402,7 @@ osm_perfmgr_init(osm_perfmgr_t * const pm, osm_opensm_t *osm,
> pm->sweep_time_s = p_opt->perfmgr_sweep_time_s;
> pm->max_outstanding_queries = p_opt->perfmgr_max_outstanding_queries;
> pm->osm = osm;
> + pm->local_port = -1;
>
> status = cl_timer_init(&pm->sweep_timer, perfmgr_sweep, pm);
> if (status != IB_SUCCESS)
In general I would suggest to not mix redirection case with main flow
(by using better data structures). The Redirection is not something
PerfMgr specific and ideally we could have separate redirection handling
module. I'm not requesting to do this now (in this implementation), but
at least flow separation is very desirable.
Sasha
More information about the general
mailing list