[ofw] SM timeout

Slava Strebkov slavas at voltaire.com
Thu Oct 2 04:18:57 PDT 2008


When the IB switch is heavily loaded, the host does not receive a reply from
the SM. This results in IB_TIMEOUT, and the IPoIB adapter is shown as disconnected.

Below is a suggested solution for the IB_TIMEOUT problem.

 

Index: ulp/ipoib/kernel/ipoib_port.c

===================================================================

--- ulp/ipoib/kernel/ipoib_port.c   (revision 1627)

+++ ulp/ipoib/kernel/ipoib_port.c (working copy)

@@ -779,6 +779,11 @@

 

            KeCancelTimer(&p_port->gc_timer);

            KeFlushQueuedDpcs();

+    if(p_port->p_query_workitem)

+    {

+                      cl_event_signal(&p_port->query_event);

+        cl_event_destroy(&p_port->query_event);

+    }

            __endpt_mgr_destroy( p_port );

            __recv_mgr_destroy( p_port );

            __send_mgr_destroy( p_port );

@@ -5191,8 +5196,71 @@

            return status;

 }

 

+static void

+__port_query(  

+          IN PDEVICE_OBJECT  DeviceObject,

+    IN PVOID  context)

+{

+          ipoib_port_t       *p_port;

+          ib_pnp_port_rec_t          pnp_rec;

+          ib_port_attr_t                 attr;

+          PIO_WORKITEM                       p_work = NULL;

+    const uint32_t WAIT_US = 30 * 1000000;      // 30 sec

 

+          IPOIB_ENTER( IPOIB_DBG_INIT );

+          UNREFERENCED_PARAMETER(DeviceObject);

+

+          p_port = (ipoib_port_t*)context;

+          p_work = p_port->p_query_workitem;

+

+          if(p_port->base_lid)

+          {

+                      cl_memclr(&attr,sizeof(attr));

+                      attr.lid = p_port->base_lid;

+                      cl_memclr(&pnp_rec,sizeof(pnp_rec));

+                      pnp_rec.p_port_attr = &attr;

+          }

+    if(cl_event_wait_on(&p_port->query_event,WAIT_US,FALSE) ==
CL_TIMEOUT)

+          {

+                      if(p_port->base_lid)

+                                  ipoib_port_up(p_port, &pnp_rec);

+                      else

+                                  __port_get_bcast( p_port );

+

+                      cl_event_destroy(&p_port->query_event);

+                      p_port->p_query_workitem = NULL; 

+    }

+          IoFreeWorkItem(p_work);

+          IPOIB_EXIT( IPOIB_DBG_INIT );

+}

+

 static void

+port_start_query(

+          IN         ipoib_port_t       *p_port)

+{

+          DEVICE_OBJECT         *p_pdo;

+          IPOIB_ENTER( IPOIB_DBG_INIT );

+

+          CL_ASSERT(!p_port->p_query_workitem);

+          NdisMGetDeviceProperty( p_port->p_adapter->h_adapter, &p_pdo,
NULL, NULL, NULL, NULL );

+          p_port->p_query_workitem = IoAllocateWorkItem(p_pdo);

+          if(! p_port->p_query_workitem)

+          {

+                      IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
IPOIB_DBG_ERROR,

+                                  ("port_start_query failed to allocate
workitem\n") );

+                      return;

+          }

+

+          cl_event_construct(&p_port->query_event);

+          cl_event_init(&p_port->query_event, TRUE);

+    IoQueueWorkItem(p_port->p_query_workitem,

+
(PIO_WORKITEM_ROUTINE)__port_query,

+                    DelayedWorkQueue,

+                                                          p_port);

+

+          IPOIB_EXIT( IPOIB_DBG_INIT );

+}

+static void

 __port_info_cb(

            IN
ib_query_rec_t                                      *p_query_rec )

 {

@@ -5252,10 +5320,11 @@

                        break;

 

            case IB_TIMEOUT:

-                       NdisWriteErrorLogEntry(
p_port->p_adapter->h_adapter,

-                                   EVENT_IPOIB_PORT_INFO_TIMEOUT, 0 );

                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION,
IPOIB_DBG_INIT,

                                    ("Port info query timed out.\n") );

+                      p_port_rec = (ib_portinfo_record_t*)

+                                  ib_get_query_result(
p_query_rec->p_result_mad, 0 );

+                      p_port->base_lid =
p_port_rec->port_info.base_lid;

                        break;

 

            case IB_REMOTE_ERROR:

@@ -5292,7 +5361,11 @@

 

            /* Release the reference taken when issuing the port info
query. */

            ipoib_port_deref( p_port, ref_port_info_cb );

-

+          if (status == IB_TIMEOUT) 

+          {

+                      port_start_query(p_port);

+          }

+          

            IPOIB_EXIT( IPOIB_DBG_INIT );

 }

 

@@ -5395,6 +5468,12 @@

                                    ("Instance destroying -
Aborting.\n") );

                        break;

 

+          case IB_TIMEOUT:

+                      IPOIB_PRINT(TRACE_LEVEL_INFORMATION,
IPOIB_DBG_INIT,

+                                  ("_port_get_bcast - TIMEOUT.\n") );

+                      p_port->base_lid = 0;

+                      break;

+

            default:

                        NdisWriteErrorLogEntry(
p_port->p_adapter->h_adapter,

                                    EVENT_IPOIB_BCAST_GET, 1,
p_query_rec->status );

@@ -5419,7 +5498,8 @@

 

            /* Release the reference taken when issuing the member
record query. */

            ipoib_port_deref( p_port, ref_bcast_get_cb );

-

+          if( status == IB_TIMEOUT)

+                      port_start_query(p_port);

            IPOIB_EXIT( IPOIB_DBG_INIT );

 }

 

@@ -5663,7 +5743,7 @@

                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,

                                    ("Multicast join for broadcast group
returned %s.\n",

 
p_port->p_adapter->p_ifc->get_err_str( p_mcast_rec->status )) );

-                       if( status == IB_REMOTE_ERROR )

+                      if( status == IB_REMOTE_ERROR || status ==
IB_TIMEOUT)

                        {

                                    /*

                                     * Either:

Index: ulp/ipoib/kernel/ipoib_port.h

===================================================================

--- ulp/ipoib/kernel/ipoib_port.h   (revision 1627)

+++ ulp/ipoib/kernel/ipoib_port.h (working copy)

@@ -516,6 +516,9 @@

            uint16_t                                     pkey_index;

            KDPC
gc_dpc;

            KTIMER
gc_timer;

+          cl_event_t
query_event;                 

+          PIO_WORKITEM
p_query_workitem;                    

+          ib_net16_t
base_lid;

            ipoib_hdr_t
hdr[1];  /* Must be last! */

 

 }          ipoib_port_t;

 

Slava 

 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20081002/cbf5eb45/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sm_timeout.diff
Type: application/octet-stream
Size: 4507 bytes
Desc: sm_timeout.diff
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20081002/cbf5eb45/attachment.obj>


More information about the ofw mailing list