[ofw] RE: SM timeout

Fab Tillier ftillier at windows.microsoft.com
Thu Oct 2 11:19:25 PDT 2008


Hi Slava,

>When IB switch is heavy loaded, host will not receive reply from SM. It
>results with IB_TIMEOUT and IPoIB adapter is shown as disconnected.
>Suggested solution for IB_TIMEOUT problem.

Why not eliminate the port info query?  The following fields of the port info records are used:

- base_lid
- link_width_active
- link_speed_active

Why not get these from the port attributes and eliminate the query altogether?  The ib_port_attr_t structure has:

- lid (base_lid)
- active_speed (link_speed_active)

Missing is link_width_active, but it's known to the HCA in the underlying code.

In fact, is there any reason why the ib_port_attr_t structure shouldn't just embed an ib_port_info_t structure rather than duplicating some of the data and hiding the rest?

If you look at what vstat does, it issues a local MAD to the HCA for the port info record to get the information missing from the port attributes.  Fixing the port attributes would simplify everybody's life.

Eliminating the port info query will reduce the load on the SM and should help with the broadcast group query processing.  Also, because ipoib_port_up issues the port info query, and is called at PASSIVE_LEVEL, you can substitute the port info query with a ib_local_mad call (just like vstat) for the port info if you don't feel like updating the port attribute structure.

In any case, I made some specific comments inline, but this patch isn't ready.  Before coding a fix, take a step back and try to identify the root problem (SM scalability sucks), not just the symptom (SM queries time out).  If you can find a way to deal with the root cause you will save yourself dealing with the many different ways the problem manifests itself.

The best way to help the SM scale is to not ask it for information that can be retrieved locally.

-Fab

>Index: ulp/ipoib/kernel/ipoib_port.c
>===================================================================
>--- ulp/ipoib/kernel/ipoib_port.c   (revision 1627)
>+++ ulp/ipoib/kernel/ipoib_port.c (working copy)
>@@ -779,6 +779,11 @@
>
>            KeCancelTimer(&p_port->gc_timer);
>            KeFlushQueuedDpcs();
>+    if(p_port->p_query_workitem)
>+    {
>+                      cl_event_signal(&p_port->query_event);
>+        cl_event_destroy(&p_port->query_event);

How do you prevent the work item callback from accessing the port structure after it's been freed?  What if the work item was just queued but hasn't run yet?

The synchronization here is broken.  A timer/DPC pair might be a better choice.

>+    }
>            __endpt_mgr_destroy( p_port );
>            __recv_mgr_destroy( p_port );
>            __send_mgr_destroy( p_port );
>@@ -5191,8 +5196,71 @@
>            return status;
> }
>
>+static void
>+__port_query(
>+          IN PDEVICE_OBJECT  DeviceObject,
>+    IN PVOID  context)
>+{
>+          ipoib_port_t       *p_port;
>+          ib_pnp_port_rec_t          pnp_rec;
>+          ib_port_attr_t                 attr;
>+          PIO_WORKITEM                       p_work = NULL;
>+    const uint32_t WAIT_US = 30 * 1000000;      // 30 sec
>
>+          IPOIB_ENTER( IPOIB_DBG_INIT );
>+          UNREFERENCED_PARAMETER(DeviceObject);
>+
>+          p_port = (ipoib_port_t*)context;
>+          p_work = p_port->p_query_workitem;
>+
>+          if(p_port->base_lid)
>+          {
>+                      cl_memclr(&attr,sizeof(attr));
>+                      attr.lid = p_port->base_lid;
>+                      cl_memclr(&pnp_rec,sizeof(pnp_rec));
>+                      pnp_rec.p_port_attr = &attr;

Why would you be here if you have a base LID?

>+          }
>+    if(cl_event_wait_on(&p_port->query_event,WAIT_US,FALSE) ==
>CL_TIMEOUT)
>+          {
>+                      if(p_port->base_lid)
>+                                  ipoib_port_up(p_port, &pnp_rec);
>+                      else
>+                                  __port_get_bcast( p_port );
>+
>+                      cl_event_destroy(&p_port->query_event);
>+                      p_port->p_query_workitem = NULL;
>+    }
>+          IoFreeWorkItem(p_work);
>+          IPOIB_EXIT( IPOIB_DBG_INIT );
>+}
>+
> static void
>+port_start_query(
>+          IN         ipoib_port_t       *p_port)
>+{
>+          DEVICE_OBJECT         *p_pdo;
>+          IPOIB_ENTER( IPOIB_DBG_INIT );
>+
>+          CL_ASSERT(!p_port->p_query_workitem);
>+          NdisMGetDeviceProperty( p_port->p_adapter->h_adapter, &p_pdo,
>NULL, NULL, NULL, NULL );
>+          p_port->p_query_workitem = IoAllocateWorkItem(p_pdo);
>+          if(! p_port->p_query_workitem)
>+          {
>+                      IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
>+                                  ("port_start_query failed to allocate
>workitem\n") );
>+                      return;
>+          }
>+
>+          cl_event_construct(&p_port->query_event);
>+          cl_event_init(&p_port->query_event, TRUE);
>+    IoQueueWorkItem(p_port->p_query_workitem,
>+
>(PIO_WORKITEM_ROUTINE)__port_query,
>+                    DelayedWorkQueue,
>+                                                          p_port);
>+
>+          IPOIB_EXIT( IPOIB_DBG_INIT );
>+}
>+static void
> __port_info_cb(
>            IN
>ib_query_rec_t                                      *p_query_rec )
> {
>@@ -5252,10 +5320,11 @@
>                        break;
>
>            case IB_TIMEOUT:
>-                       NdisWriteErrorLogEntry(
>p_port->p_adapter->h_adapter,
>-                                   EVENT_IPOIB_PORT_INFO_TIMEOUT, 0 );
>                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION,
>IPOIB_DBG_INIT,
>                                    ("Port info query timed out.\n") );
>+                      p_port_rec = (ib_portinfo_record_t*)
>+                                  ib_get_query_result(
>p_query_rec->p_result_mad, 0 );
>+                      p_port->base_lid =
>p_port_rec->port_info.base_lid;

You're using data from the port record that wasn't filled in by the SM???  Where did p_port_rec->port_info.base_lid get filled in?

>                        break;
>
>            case IB_REMOTE_ERROR:
>@@ -5292,7 +5361,11 @@
>
>            /* Release the reference taken when issuing the port info
>query. */
>            ipoib_port_deref( p_port, ref_port_info_cb );
>-
>+          if (status == IB_TIMEOUT)
>+          {
>+                      port_start_query(p_port);
>+          }
>+
>            IPOIB_EXIT( IPOIB_DBG_INIT );
> }
>
>@@ -5395,6 +5468,12 @@
>                                    ("Instance destroying -
>Aborting.\n") );
>                        break;
>
>+          case IB_TIMEOUT:
>+                      IPOIB_PRINT(TRACE_LEVEL_INFORMATION,
>IPOIB_DBG_INIT,
>+                                  ("_port_get_bcast - TIMEOUT.\n") );
>+                      p_port->base_lid = 0;
>+                      break;
>+
>            default:
>                        NdisWriteErrorLogEntry(
>p_port->p_adapter->h_adapter,
>                                    EVENT_IPOIB_BCAST_GET, 1,
>p_query_rec->status );
>@@ -5419,7 +5498,8 @@
>
>            /* Release the reference taken when issuing the member
>record query. */
>            ipoib_port_deref( p_port, ref_bcast_get_cb );
>-
>+          if( status == IB_TIMEOUT)
>+                      port_start_query(p_port);

So if the broadcast group query fails, you start over with the port info query?  Doesn't that just put even more load on the SM?  Why not retry the broadcast group query?  Why not just increase the number of retries and/or timeout for all these queries to some really large value?

>            IPOIB_EXIT( IPOIB_DBG_INIT );
> }
>
>@@ -5663,7 +5743,7 @@
>                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>                                    ("Multicast join for broadcast group
>returned %s.\n",
>
>p_port->p_adapter->p_ifc->get_err_str( p_mcast_rec->status )) );
>-                       if( status == IB_REMOTE_ERROR )
>+                      if( status == IB_REMOTE_ERROR || status ==
>IB_TIMEOUT)

You missed a space after IB_TIMEOUT before the closing parethesis...

>                        {
>                                    /*
>                                     * Either:
>Index: ulp/ipoib/kernel/ipoib_port.h
>===================================================================
>--- ulp/ipoib/kernel/ipoib_port.h   (revision 1627)
>+++ ulp/ipoib/kernel/ipoib_port.h (working copy)
>@@ -516,6 +516,9 @@
>            uint16_t                                     pkey_index;
>            KDPC
>gc_dpc;
>            KTIMER
>gc_timer;
>+          cl_event_t
>query_event;
>+          PIO_WORKITEM
>p_query_workitem;
>+          ib_net16_t
>base_lid;
>            ipoib_hdr_t
>hdr[1];  /* Must be last! */
>
> }          ipoib_port_t;
>
>Slava
>
>



More information about the ofw mailing list