[ofw][patch][ibal] fix ping disappear after restart of opensm
Sean Hefty
sean.hefty at intel.com
Wed May 27 08:40:36 PDT 2009
>-----Original Message-----
>From: ofw-bounces at lists.openfabrics.org [mailto:ofw-
>bounces at lists.openfabrics.org] On Behalf Of Leonid Keller
>Sent: Wednesday, May 27, 2009 4:03 AM
>To: ofw at lists.openfabrics.org
>Subject: [ofw][patch][ibal] fix ping disappear after restart of opensm
>
>Summary: Ill-defined mechanism of event propagation
>
>Bug description and reproduction:
>
>1. Connect two machines (A and B) via an IB switch
>2. Run a subnet manager (say, opensm) on B
>3. Kill opensm and clear the ARP tables
>4. Rerun opensm - ping will no longer work
>5. That's because the new opensm instance clears the old multicast groups, while side
>A is not aware of the opensm restart and does not request to join the new MCAST
>group
>
>
>Explanations:
>
>There are two types of events relevant in our case: PnP events and AEs (asynchronous events).
>
>The problem happened due to:
>
>Index: core/al/al_ci_ca.h
>===================================================================
>--- core/al/al_ci_ca.h (revision 2195)
>+++ core/al/al_ci_ca.h (working copy)
>@@ -65,6 +65,11 @@
> IN const ib_ca_handle_t h_ca );
> #endif
>
>+#define MAX_AE 32
>+typedef struct _al_ae_info {
>+ ib_pnp_event_t pnp_event;
>+ uint8_t port_index;
>+} al_ae_info_t;
>
>
> typedef struct _al_ci_ca
>@@ -106,6 +111,12 @@
> /* "end of PnP handling" event */
> cl_event_t event;
>
>+ /* Array of pending AEs (Asynchronous events) */
>+ al_ae_info_t ae[MAX_AE];
>+ int ci;
>+ int pi;
These names are a little too terse to make clear how they're used or what they
represent. It looks like they're insert/remove index values.
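For example, something along these lines would make the roles obvious (the names
below are only a suggestion):

	/* Pending AEs are kept in a fixed-size ring. */
	al_ae_info_t		ae[MAX_AE];
	int			ae_remove_idx;	/* consumer index (was 'ci') */
	int			ae_insert_idx;	/* producer index (was 'pi') */
	atomic32_t		ae_count;	/* queued entries (was 'cnt') */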
>+ atomic32_t cnt;
>+
> } al_ci_ca_t;
>
>
>Index: core/al/al_ci_ca_shared.c
>===================================================================
>--- core/al/al_ci_ca_shared.c (revision 2195)
>+++ core/al/al_ci_ca_shared.c (working copy)
>@@ -250,6 +250,7 @@
> p_event_item = PARENT_STRUCT( p_item, event_item_t, async_item.pool_item );
> p_event_item->event_rec.code = p_event_rec->code;
> p_event_item->event_rec.context = p_event_rec->context;
>+ p_event_item->event_rec.port_number = p_event_rec->port_number;
>
> /* Queue the item on the asynchronous callback thread for processing. */
> p_event_item->async_item.pfn_callback = ci_ca_process_event_cb;
>@@ -300,10 +301,27 @@
> cq_async_event_cb( &p_event_item->event_rec );
> break;
>
>+#ifdef CL_KERNEL
>+
>+ case IB_AE_LID_CHANGE:
>+ case IB_AE_CLIENT_REREGISTER:
>+ // These AE events will be generated even in the case when the
>+ // SM was restarted but the LID did not actually change.
>+ // It's important to propagate these events (via the PnP mechanism)
>+ // up to subscribers. Otherwise, there will be no ping after a
>+ // subnet manager restart.
>+ //if (AL_OBJ_IS_TYPE(p_obj, AL_OBJ_TYPE_CI_CA)
>+ if (AL_BASE_TYPE( p_obj->type) == AL_OBJ_TYPE_CI_CA) {
>+ pnp_force_event( (struct _al_ci_ca *) p_obj, IB_PNP_LID_CHANGE,
>+ p_event_item->event_rec.port_number );
>+ }
It would be better to just add a new PNP type for client reregister, or at least
map it to SM CHANGE. Updating the clients to respond to a new event shouldn't
be that difficult.
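For example, assuming the existing IB_PNP_SM_CHANGE value is acceptable to report
here, splitting the reregister case out would look roughly like this (sketch only,
untested):

	case IB_AE_CLIENT_REREGISTER:
		/* Report an SM change (or a new reregister PnP event)
		 * rather than faking a LID change. */
		if( AL_BASE_TYPE( p_obj->type ) == AL_OBJ_TYPE_CI_CA )
		{
			pnp_force_event( (struct _al_ci_ca*)p_obj, IB_PNP_SM_CHANGE,
				p_event_item->event_rec.port_number );
		}
		break;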
>Index: core/al/al_pnp.h
>===================================================================
>--- core/al/al_pnp.h (revision 2195)
>+++ core/al/al_pnp.h (working copy)
>@@ -216,6 +216,13 @@
> IN KEVENT *p_sync_event,
> OUT ib_pnp_handle_t* const ph_pnp );
>
>+void
>+pnp_force_event(
>+ IN struct _al_ci_ca * p_ci_ca,
>+ IN ib_pnp_event_t pnp_event,
>+ IN uint8_t port_num);
This function really just queues an event, so I would rename accordingly.
>+
>+
> #endif /* CL_KERNEL */
>
> static inline ib_pnp_class_t
>Index: core/al/kernel/al_ci_ca.c
>===================================================================
>--- core/al/kernel/al_ci_ca.c (revision 2195)
>+++ core/al/kernel/al_ci_ca.c (working copy)
>@@ -354,6 +354,7 @@
> event_rec.code = p_event_record->type;
> event_rec.context = p_event_record->context;
> event_rec.vendor_specific = p_event_record->vendor_specific;
>+ event_rec.port_number = p_event_record->port_number;
>
> ci_ca_async_event( &event_rec );
>
>Index: core/al/kernel/al_pnp.c
>===================================================================
>--- core/al/kernel/al_pnp.c (revision 2195)
>+++ core/al/kernel/al_pnp.c (working copy)
>@@ -605,6 +605,7 @@
> {
> AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_PNP,
> ("p_context is already in context map %I64x \n",p_context->guid));
>+ cl_free( p_context );
Why is this needed? I didn't see any memory allocations added by this patch, so
seeing a free is a little odd.
> p_context = NULL;
> }
>
>@@ -1490,6 +1491,24 @@
> __pnp_process_port_forward( &event_rec );
> }
> }
>+
>+ /* send asynchronous events */
>+ {
>+ int ci = p_ci_ca->ci;
>+ int cnt = p_ci_ca->cnt % MAX_AE;
This doesn't seem correct. If cnt can go above MAX_AE, then this will only
process a portion of the queued events. E.g. if cnt = MAX_AE + 1, this will
process 1 event, rather than MAX_AE, which is how many are actually queued.
I think you want the protection on the queuing side to prevent cnt from going
over MAX_AE.
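For instance, a guard at the top of the queuing routine would bound it (sketch only;
whether silently dropping an event when the ring is full is acceptable is a separate
question):

	/* In pnp_force_event(), before touching the ring: */
	if( p_ci_ca->cnt >= MAX_AE )
		return;		/* ring full - drop (or coalesce) the event */

With that in place, the % MAX_AE below isn't needed and the loop can just drain cnt
entries.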
>+
>+ while ( cnt-- > 0 )
>+ {
>+ event_rec.pnp_event = p_ci_ca->ae[ci].pnp_event;
>+ event_rec.port_index = p_ci_ca->ae[ci].port_index;
>+ cl_atomic_dec( &p_ci_ca->cnt );
>+ __pnp_process_port_forward( &event_rec );
>+ if ( ++ci >= MAX_AE )
>+ ci = 0;
>+ }
>+ p_ci_ca->ci = ci;
>+ }
Are calls to this function serialized?
>+
> }
>
>
>@@ -1749,3 +1768,26 @@
> AL_EXIT( AL_DBG_PNP );
> return IB_UNSUPPORTED;
> }
>+
>+void
>+pnp_force_event(
rename to pnp_queue_event
>+ IN struct _al_ci_ca * p_ci_ca,
>+ IN ib_pnp_event_t pnp_event,
>+ IN uint8_t port_num)
>+{
>+
>+#define PORT_INDEX_OFFSET 1
>+
>+ ASSERT(p_ci_ca);
>+
>+ if (!p_ci_ca)
>+ return;
Either assert or do the if, but not both.
>+
>+ p_ci_ca->ae[p_ci_ca->pi].pnp_event = pnp_event;
>+ p_ci_ca->ae[p_ci_ca->pi].port_index = port_num - PORT_INDEX_OFFSET;
>+ cl_atomic_inc( &p_ci_ca->cnt );
>+ if ( ++p_ci_ca->pi >= MAX_AE )
>+ p_ci_ca->pi = 0;
>+}
Are calls into this function serialized?
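If they aren't, the slot write and the index updates need protection as well (the
same question applies to the drain loop above). Pulling the comments together, a
minimal sketch, assuming a new cl_spinlock_t ae_lock field (hypothetical name) were
added to al_ci_ca_t and initialized at CA creation:

	void
	pnp_queue_event(
		IN	struct _al_ci_ca*	p_ci_ca,
		IN	ib_pnp_event_t		pnp_event,
		IN	uint8_t			port_num )
	{
		CL_ASSERT( p_ci_ca );

		cl_spinlock_acquire( &p_ci_ca->ae_lock );
		if( p_ci_ca->cnt < MAX_AE )
		{
			/* Port numbers are 1-based; the ring stores 0-based indices. */
			p_ci_ca->ae[p_ci_ca->pi].pnp_event = pnp_event;
			p_ci_ca->ae[p_ci_ca->pi].port_index = (uint8_t)(port_num - 1);
			if( ++p_ci_ca->pi >= MAX_AE )
				p_ci_ca->pi = 0;
			cl_atomic_inc( &p_ci_ca->cnt );
		}
		cl_spinlock_release( &p_ci_ca->ae_lock );
	}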
- Sean