[ofw] [IBBUS] Bugfix for wrong event propagation

Alex Naslednikov xalex at mellanox.co.il
Wed Feb 11 07:02:38 PST 2009


Bug description and reproduction:
1. Connect to machines (A and B) via IB switch
2. Run subnet manager (say, opensm) on B
3. Kill opensm and clear arp tables
4. Rerun opensm - ping will not longer work
5. That's because new opensm instance will clear old multicast groups,
and side A will be not aware about opensm restart and will not request
to join new MCAST group
 
Explanations:
There are 2 types of events relevant in our case: PnP and AE.
The problem had happened due to:
1. During opensm restart, port will generate AE event:
IB_EVENT_LID_CHANGE or (in other cases)  IB_EVENT_CLIENT_REREGISTER
These events will be generated even in the case when  SM was restarted
but LID will not actually change.
 
2. All PnP events were handled properly; but these events were mapped to
IB_AE_FATAL
This patch fixes it and maps IB_EVENT_* events to appropriate IB_AE_*
events and then to IB_PNP_* events
 
3. function force_smi_poll() will now update it's subscribers about LID
change event iff LID was changed.
So, we still have the problem when opensm was restarted and no one of
the port attributes was changed
This patch generated appropriate IB_PNP event to resolve this issue
 
Signed-off by: Alexander Naslednikov (xalex at mellanox.co.il)
 
Index: core/al/al_ci_ca_shared.c
===================================================================
--- core/al/al_ci_ca_shared.c (revision 3889)
+++ core/al/al_ci_ca_shared.c (working copy)
@@ -299,10 +299,27 @@
   cq_async_event_cb( &p_event_item->event_rec );
   break;
 
+#ifdef CL_KERNEL
+
+ case IB_AE_LID_CHANGE:
+ case IB_AE_CLIENT_REREGISTER:
+  // These AE event will be generated even in the case when
+  // SM was restaretd but LID will not actually change.
+  // It's important to propagate these event (via PnP mechanism)
+  // up to subscribers. Otherwise, there will be no ping after
+  // subnet manager restart
+  //if (AL_OBJ_IS_TYPE(p_obj, AL_OBJ_TYPE_CI_CA)
+  if (AL_BASE_TYPE( p_obj->type) == AL_OBJ_TYPE_CI_CA) {
+    pnp_force_event( (struct _al_ci_ca *) p_obj, IB_PNP_LID_CHANGE,
+     p_event_item->event_rec.port_number );
+  }
+  break;
+#endif //CL_KERNEL
+
  case IB_AE_PORT_TRAP:
  case IB_AE_PORT_DOWN:
  case IB_AE_PORT_ACTIVE:
- case IB_AE_CLIENT_REREGISTER:
+ 
 #ifdef CL_KERNEL
   /* The SMI polling routine may report a PnP event. */
   force_smi_poll();
Index: core/al/al_pnp.h
===================================================================
--- core/al/al_pnp.h (revision 3889)
+++ core/al/al_pnp.h (working copy)
@@ -216,6 +216,13 @@
  IN    KEVENT      *p_sync_event,
   OUT   ib_pnp_handle_t* const  ph_pnp );
 
+void
+pnp_force_event(
+ IN   struct _al_ci_ca  * p_ci_ca,
+ IN   ib_pnp_event_t pnp_event,
+ IN   uint8_t port_num);
+
+
 #endif /* CL_KERNEL */
 
 static inline ib_pnp_class_t
Index: core/al/kernel/al_ci_ca.c
===================================================================
--- core/al/kernel/al_ci_ca.c (revision 3889)
+++ core/al/kernel/al_ci_ca.c (working copy)
@@ -347,6 +347,7 @@
  event_rec.code = p_event_record->type;
  event_rec.context = p_event_record->context;
  event_rec.vendor_specific = p_event_record->vendor_specific;
+ event_rec.port_number = p_event_record->port_number;
 
  ci_ca_async_event( &event_rec );
 
Index: core/al/kernel/al_pnp.c
===================================================================
--- core/al/kernel/al_pnp.c (revision 3889)
+++ core/al/kernel/al_pnp.c (working copy)
@@ -1740,3 +1740,26 @@
  AL_EXIT( AL_DBG_PNP );
  return IB_UNSUPPORTED;
 }
+
+void
+pnp_force_event(
+ IN   struct _al_ci_ca  * p_ci_ca,
+ IN   ib_pnp_event_t pnp_event,
+ IN  uint8_t port_num)
+{
+ 
+#define PORT_INDEX_OFFSET 1
+ al_pnp_ca_event_t  event_rec;
+
+ ASSERT(p_ci_ca);
+ 
+ if (!p_ci_ca)
+  return;
+ 
+ event_rec.p_ci_ca = p_ci_ca;
+ event_rec.port_index = port_num - PORT_INDEX_OFFSET;
+ event_rec.pnp_event = pnp_event;
+ __pnp_process_port_forward( &event_rec );
+}
+
+
Index: hw/mlx4/kernel/bus/inc/ib_verbs.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs.h (revision 3889)
+++ hw/mlx4/kernel/bus/inc/ib_verbs.h (working copy)
@@ -274,10 +274,11 @@
  IB_EVENT_RESET_CLIENT     = IB_AE_RESET_CLIENT, // device will be
reset upon client request
  IB_EVENT_RESET_END     = IB_AE_RESET_END,  // device has been reset 
  IB_EVENT_RESET_FAILED    = IB_AE_RESET_FAILED,  // device has been
reset 
- IB_EVENT_LID_CHANGE     = IB_AE_UNKNOWN + 1,
- IB_EVENT_PKEY_CHANGE,
- IB_EVENT_SM_CHANGE,
- IB_EVENT_CLIENT_REREGISTER
+ IB_EVENT_LID_CHANGE     = IB_AE_LID_CHANGE,
+ IB_EVENT_CLIENT_REREGISTER   = IB_AE_CLIENT_REREGISTER,
+ IB_EVENT_PKEY_CHANGE    = IB_AE_PKEY_CHANGE,
+ IB_EVENT_SM_CHANGE     = IB_AE_SM_CHANGE
+ 
 };
 
 struct ib_event {
Index: inc/iba/ib_al.h
===================================================================
--- inc/iba/ib_al.h (revision 3889)
+++ inc/iba/ib_al.h (working copy)
@@ -473,6 +473,8 @@
   TO_LONG_PTR(struct _ib_srq*,   h_srq);
 
  } handle;
+ 
+ uint8_t          port_number;
 
 } ib_async_event_rec_t;
 /*
Index: inc/iba/ib_types.h
===================================================================
--- inc/iba/ib_types.h (revision 3889)
+++ inc/iba/ib_types.h (working copy)
@@ -8797,6 +8797,9 @@
  IB_AE_RESET_CLIENT,
  IB_AE_RESET_END,
  IB_AE_RESET_FAILED,
+ IB_AE_LID_CHANGE,
+ IB_AE_PKEY_CHANGE,
+ IB_AE_SM_CHANGE,
  IB_AE_UNKNOWN  /* ALWAYS LAST ENUM VALUE */
 
 } ib_async_event_t;
Index: ulp/opensm/user/include/iba/ib_types_extended.h
===================================================================
--- ulp/opensm/user/include/iba/ib_types_extended.h (revision 3889)
+++ ulp/opensm/user/include/iba/ib_types_extended.h (working copy)
@@ -234,6 +234,9 @@
  IB_AE_SRQ_LIMIT_REACHED,
  IB_AE_SRQ_CATAS_ERROR,
  IB_AE_SRQ_QP_LAST_WQE_REACHED,
+ IB_AE_LID_CHANGE,
+ IB_AE_PKEY_CHANGE,
+ IB_AE_SM_CHANGE,
  IB_AE_UNKNOWN  /* ALWAYS LAST ENUM VALUE */
 
 } ib_async_event_t;

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20090211/7c9eb735/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sm_pnp_ev_prop_bug.patch
Type: application/octet-stream
Size: 4829 bytes
Desc: sm_pnp_ev_prop_bug.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20090211/7c9eb735/attachment.obj>


More information about the ofw mailing list