[ofw] RE: [IBBUS] Bugfix for wrong event propagation

Alex Estrin alex.estrin at qlogic.com
Wed Feb 11 14:08:29 PST 2009


Hello,

Looks good. One question though:

> +  // subnet manager restart
> +  //if (AL_OBJ_IS_TYPE(p_obj, AL_OBJ_TYPE_CI_CA)
> +  if (AL_BASE_TYPE( p_obj->type) == AL_OBJ_TYPE_CI_CA) {
> +    pnp_force_event( (struct _al_ci_ca *) p_obj, IB_PNP_LID_CHANGE,
> +     p_event_item->event_rec.port_number );
> +  }

Wouldn't it be better to send user actual event code? - 

   pnp_force_event( (struct _al_ci_ca *) p_obj, p_event_item->event_rec.code,
	     p_event_item->event_rec.port_number );


Thanks,
Alex.


> -----Original Message-----
> From: Alex Naslednikov [mailto:xalex at mellanox.co.il]
> Sent: Wednesday, February 11, 2009 10:03 AM
> To: ofw at lists.openfabrics.org; Alex Estrin; Leonid Keller
> Cc: Tzachi Dar
> Subject: [IBBUS] Bugfix for wrong event propagation
> 
> Bug description and reproduction:
> 1. Connect to machines (A and B) via IB switch
> 2. Run subnet manager (say, opensm) on B
> 3. Kill opensm and clear arp tables
> 4. Rerun opensm - ping will not longer work
> 5. That's because new opensm instance will clear old multicast groups, and side A will be not aware
> about opensm restart and will not request to join new MCAST group
> 
> Explanations:
> There are 2 types of events relevant in our case: PnP and AE.
> The problem had happened due to:
> 1. During opensm restart, port will generate AE event: IB_EVENT_LID_CHANGE or (in other cases)
> IB_EVENT_CLIENT_REREGISTER
> These events will be generated even in the case when  SM was restarted but LID will not actually
> change.
> 
> 2. All PnP events were handled properly; but these events were mapped to IB_AE_FATAL
> This patch fixes it and maps IB_EVENT_* events to appropriate IB_AE_* events and then to IB_PNP_*
> events
> 
> 3. function force_smi_poll() will now update it's subscribers about LID change event iff LID was
> changed.
> So, we still have the problem when opensm was restarted and no one of the port attributes was changed
> This patch generated appropriate IB_PNP event to resolve this issue
> 
> Signed-off by: Alexander Naslednikov (xalex at mellanox.co.il)
> 
> Index: core/al/al_ci_ca_shared.c
> ===================================================================
> --- core/al/al_ci_ca_shared.c (revision 3889)
> +++ core/al/al_ci_ca_shared.c (working copy)
> @@ -299,10 +299,27 @@
>    cq_async_event_cb( &p_event_item->event_rec );
>    break;
> 
> +#ifdef CL_KERNEL
> +
> + case IB_AE_LID_CHANGE:
> + case IB_AE_CLIENT_REREGISTER:
> +  // These AE event will be generated even in the case when
> +  // SM was restaretd but LID will not actually change.
> +  // It's important to propagate these event (via PnP mechanism)
> +  // up to subscribers. Otherwise, there will be no ping after
> +  // subnet manager restart
> +  //if (AL_OBJ_IS_TYPE(p_obj, AL_OBJ_TYPE_CI_CA)
> +  if (AL_BASE_TYPE( p_obj->type) == AL_OBJ_TYPE_CI_CA) {
> +    pnp_force_event( (struct _al_ci_ca *) p_obj, IB_PNP_LID_CHANGE,
> +     p_event_item->event_rec.port_number );
> +  }
> +  break;
> +#endif //CL_KERNEL
> +
>   case IB_AE_PORT_TRAP:
>   case IB_AE_PORT_DOWN:
>   case IB_AE_PORT_ACTIVE:
> - case IB_AE_CLIENT_REREGISTER:
> +
>  #ifdef CL_KERNEL
>    /* The SMI polling routine may report a PnP event. */
>    force_smi_poll();
> Index: core/al/al_pnp.h
> ===================================================================
> --- core/al/al_pnp.h (revision 3889)
> +++ core/al/al_pnp.h (working copy)
> @@ -216,6 +216,13 @@
>   IN    KEVENT      *p_sync_event,
>    OUT   ib_pnp_handle_t* const  ph_pnp );
> 
> +void
> +pnp_force_event(
> + IN   struct _al_ci_ca  * p_ci_ca,
> + IN   ib_pnp_event_t pnp_event,
> + IN   uint8_t port_num);
> +
> +
>  #endif /* CL_KERNEL */
> 
>  static inline ib_pnp_class_t
> Index: core/al/kernel/al_ci_ca.c
> ===================================================================
> --- core/al/kernel/al_ci_ca.c (revision 3889)
> +++ core/al/kernel/al_ci_ca.c (working copy)
> @@ -347,6 +347,7 @@
>   event_rec.code = p_event_record->type;
>   event_rec.context = p_event_record->context;
>   event_rec.vendor_specific = p_event_record->vendor_specific;
> + event_rec.port_number = p_event_record->port_number;
> 
>   ci_ca_async_event( &event_rec );
> 
> Index: core/al/kernel/al_pnp.c
> ===================================================================
> --- core/al/kernel/al_pnp.c (revision 3889)
> +++ core/al/kernel/al_pnp.c (working copy)
> @@ -1740,3 +1740,26 @@
>   AL_EXIT( AL_DBG_PNP );
>   return IB_UNSUPPORTED;
>  }
> +
> +void
> +pnp_force_event(
> + IN   struct _al_ci_ca  * p_ci_ca,
> + IN   ib_pnp_event_t pnp_event,
> + IN  uint8_t port_num)
> +{
> +
> +#define PORT_INDEX_OFFSET 1
> + al_pnp_ca_event_t  event_rec;
> +
> + ASSERT(p_ci_ca);
> +
> + if (!p_ci_ca)
> +  return;
> +
> + event_rec.p_ci_ca = p_ci_ca;
> + event_rec.port_index = port_num - PORT_INDEX_OFFSET;
> + event_rec.pnp_event = pnp_event;
> + __pnp_process_port_forward( &event_rec );
> +}
> +
> +
> Index: hw/mlx4/kernel/bus/inc/ib_verbs.h
> ===================================================================
> --- hw/mlx4/kernel/bus/inc/ib_verbs.h (revision 3889)
> +++ hw/mlx4/kernel/bus/inc/ib_verbs.h (working copy)
> @@ -274,10 +274,11 @@
>   IB_EVENT_RESET_CLIENT     = IB_AE_RESET_CLIENT, // device will be reset upon client request
>   IB_EVENT_RESET_END     = IB_AE_RESET_END,  // device has been reset
>   IB_EVENT_RESET_FAILED    = IB_AE_RESET_FAILED,  // device has been reset
> - IB_EVENT_LID_CHANGE     = IB_AE_UNKNOWN + 1,
> - IB_EVENT_PKEY_CHANGE,
> - IB_EVENT_SM_CHANGE,
> - IB_EVENT_CLIENT_REREGISTER
> + IB_EVENT_LID_CHANGE     = IB_AE_LID_CHANGE,
> + IB_EVENT_CLIENT_REREGISTER   = IB_AE_CLIENT_REREGISTER,
> + IB_EVENT_PKEY_CHANGE    = IB_AE_PKEY_CHANGE,
> + IB_EVENT_SM_CHANGE     = IB_AE_SM_CHANGE
> +
>  };
> 
>  struct ib_event {
> Index: inc/iba/ib_al.h
> ===================================================================
> --- inc/iba/ib_al.h (revision 3889)
> +++ inc/iba/ib_al.h (working copy)
> @@ -473,6 +473,8 @@
>    TO_LONG_PTR(struct _ib_srq*,   h_srq);
> 
>   } handle;
> +
> + uint8_t          port_number;
> 
>  } ib_async_event_rec_t;
>  /*
> Index: inc/iba/ib_types.h
> ===================================================================
> --- inc/iba/ib_types.h (revision 3889)
> +++ inc/iba/ib_types.h (working copy)
> @@ -8797,6 +8797,9 @@
>   IB_AE_RESET_CLIENT,
>   IB_AE_RESET_END,
>   IB_AE_RESET_FAILED,
> + IB_AE_LID_CHANGE,
> + IB_AE_PKEY_CHANGE,
> + IB_AE_SM_CHANGE,
>   IB_AE_UNKNOWN  /* ALWAYS LAST ENUM VALUE */
> 
>  } ib_async_event_t;
> Index: ulp/opensm/user/include/iba/ib_types_extended.h
> ===================================================================
> --- ulp/opensm/user/include/iba/ib_types_extended.h (revision 3889)
> +++ ulp/opensm/user/include/iba/ib_types_extended.h (working copy)
> @@ -234,6 +234,9 @@
>   IB_AE_SRQ_LIMIT_REACHED,
>   IB_AE_SRQ_CATAS_ERROR,
>   IB_AE_SRQ_QP_LAST_WQE_REACHED,
> + IB_AE_LID_CHANGE,
> + IB_AE_PKEY_CHANGE,
> + IB_AE_SM_CHANGE,
>   IB_AE_UNKNOWN  /* ALWAYS LAST ENUM VALUE */
> 
>  } ib_async_event_t;




More information about the ofw mailing list