[ofw] SM timeout

Tzachi Dar tzachid at mellanox.co.il
Thu Oct 2 07:02:43 PDT 2008


Can you please give more info on how you are trying to solve the
problem?
 
I understand that the big picture is that if there is a timeout, then you
create the work item, wait there for 30 seconds, and then try the query
again.
 
However, I don't see how this fits into the overall driver model. For
example:
1) While you are sleeping, someone might restart opensm. In this case,
port_up / down might be called again.
2) What guarantees that only one instance of ipoib_port_up will be running
at any given time?
3) What guarantees that p_query_workitem will not be used twice?
 
Thanks
Tzachi


________________________________

	From: ofw-bounces at lists.openfabrics.org
[mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Slava Strebkov
	Sent: Thursday, October 02, 2008 2:19 PM
	To: ofw at lists.openfabrics.org
	Subject: [ofw] SM timeout
	
	

	When the IB switch is heavily loaded, the host will not receive a reply
from the SM. This results in IB_TIMEOUT, and the IPoIB adapter is shown as
disconnected.

	Suggested solution for the IB_TIMEOUT problem:

	 

	Index: ulp/ipoib/kernel/ipoib_port.c

	
===================================================================

	--- ulp/ipoib/kernel/ipoib_port.c   (revision 1627)

	+++ ulp/ipoib/kernel/ipoib_port.c (working copy)

	@@ -779,6 +779,11 @@

	 

	            KeCancelTimer(&p_port->gc_timer);

	            KeFlushQueuedDpcs();

	+    if(p_port->p_query_workitem)

	+    {

	+                      cl_event_signal(&p_port->query_event);

	+        cl_event_destroy(&p_port->query_event);

	+    }

	            __endpt_mgr_destroy( p_port );

	            __recv_mgr_destroy( p_port );

	            __send_mgr_destroy( p_port );

	@@ -5191,8 +5196,71 @@

	            return status;

	 }

	 

	+static void

	+__port_query(  

	+          IN PDEVICE_OBJECT  DeviceObject,

	+    IN PVOID  context)

	+{

	+          ipoib_port_t       *p_port;

	+          ib_pnp_port_rec_t          pnp_rec;

	+          ib_port_attr_t                 attr;

	+          PIO_WORKITEM                       p_work = NULL;

	+    const uint32_t WAIT_US = 30 * 1000000;      // 30 sec

	 

	+          IPOIB_ENTER( IPOIB_DBG_INIT );

	+          UNREFERENCED_PARAMETER(DeviceObject);

	+

	+          p_port = (ipoib_port_t*)context;

	+          p_work = p_port->p_query_workitem;

	+

	+          if(p_port->base_lid)

	+          {

	+                      cl_memclr(&attr,sizeof(attr));

	+                      attr.lid = p_port->base_lid;

	+                      cl_memclr(&pnp_rec,sizeof(pnp_rec));

	+                      pnp_rec.p_port_attr = &attr;

	+          }

	+    if(cl_event_wait_on(&p_port->query_event,WAIT_US,FALSE) ==
CL_TIMEOUT)

	+          {

	+                      if(p_port->base_lid)

	+                                  ipoib_port_up(p_port,
&pnp_rec);

	+                      else

	+                                  __port_get_bcast( p_port );

	+

	+                      cl_event_destroy(&p_port->query_event);

	+                      p_port->p_query_workitem = NULL; 

	+    }

	+          IoFreeWorkItem(p_work);

	+          IPOIB_EXIT( IPOIB_DBG_INIT );

	+}

	+

	 static void

	+port_start_query(

	+          IN         ipoib_port_t       *p_port)

	+{

	+          DEVICE_OBJECT         *p_pdo;

	+          IPOIB_ENTER( IPOIB_DBG_INIT );

	+

	+          CL_ASSERT(!p_port->p_query_workitem);

	+          NdisMGetDeviceProperty( p_port->p_adapter->h_adapter,
&p_pdo, NULL, NULL, NULL, NULL );

	+          p_port->p_query_workitem = IoAllocateWorkItem(p_pdo);

	+          if(! p_port->p_query_workitem)

	+          {

	+                      IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
IPOIB_DBG_ERROR,

	+                                  ("port_start_query failed to
allocate workitem\n") );

	+                      return;

	+          }

	+

	+          cl_event_construct(&p_port->query_event);

	+          cl_event_init(&p_port->query_event, TRUE);

	+    IoQueueWorkItem(p_port->p_query_workitem,

	+
(PIO_WORKITEM_ROUTINE)__port_query,

	+                    DelayedWorkQueue,

	+
p_port);

	+

	+          IPOIB_EXIT( IPOIB_DBG_INIT );

	+}

	+static void

	 __port_info_cb(

	            IN
ib_query_rec_t                                      *p_query_rec )

	 {

	@@ -5252,10 +5320,11 @@

	                        break;

	 

	            case IB_TIMEOUT:

	-                       NdisWriteErrorLogEntry(
p_port->p_adapter->h_adapter,

	-
EVENT_IPOIB_PORT_INFO_TIMEOUT, 0 );

	                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION,
IPOIB_DBG_INIT,

	                                    ("Port info query timed
out.\n") );

	+                      p_port_rec = (ib_portinfo_record_t*)

	+                                  ib_get_query_result(
p_query_rec->p_result_mad, 0 );

	+                      p_port->base_lid =
p_port_rec->port_info.base_lid;

	                        break;

	 

	            case IB_REMOTE_ERROR:

	@@ -5292,7 +5361,11 @@

	 

	            /* Release the reference taken when issuing the port
info query. */

	            ipoib_port_deref( p_port, ref_port_info_cb );

	-

	+          if (status == IB_TIMEOUT) 

	+          {

	+                      port_start_query(p_port);

	+          }

	+          

	            IPOIB_EXIT( IPOIB_DBG_INIT );

	 }

	 

	@@ -5395,6 +5468,12 @@

	                                    ("Instance destroying -
Aborting.\n") );

	                        break;

	 

	+          case IB_TIMEOUT:

	+                      IPOIB_PRINT(TRACE_LEVEL_INFORMATION,
IPOIB_DBG_INIT,

	+                                  ("_port_get_bcast -
TIMEOUT.\n") );

	+                      p_port->base_lid = 0;

	+                      break;

	+

	            default:

	                        NdisWriteErrorLogEntry(
p_port->p_adapter->h_adapter,

	                                    EVENT_IPOIB_BCAST_GET, 1,
p_query_rec->status );

	@@ -5419,7 +5498,8 @@

	 

	            /* Release the reference taken when issuing the
member record query. */

	            ipoib_port_deref( p_port, ref_bcast_get_cb );

	-

	+          if( status == IB_TIMEOUT)

	+                      port_start_query(p_port);

	            IPOIB_EXIT( IPOIB_DBG_INIT );

	 }

	 

	@@ -5663,7 +5743,7 @@

	                        IPOIB_PRINT( TRACE_LEVEL_ERROR,
IPOIB_DBG_ERROR,

	                                    ("Multicast join for
broadcast group returned %s.\n",

	
p_port->p_adapter->p_ifc->get_err_str( p_mcast_rec->status )) );

	-                       if( status == IB_REMOTE_ERROR )

	+                      if( status == IB_REMOTE_ERROR || status
== IB_TIMEOUT)

	                        {

	                                    /*

	                                     * Either:

	Index: ulp/ipoib/kernel/ipoib_port.h

	
===================================================================

	--- ulp/ipoib/kernel/ipoib_port.h   (revision 1627)

	+++ ulp/ipoib/kernel/ipoib_port.h (working copy)

	@@ -516,6 +516,9 @@

	            uint16_t
pkey_index;

	            KDPC
gc_dpc;

	            KTIMER
gc_timer;

	+          cl_event_t
query_event;                 

	+          PIO_WORKITEM
p_query_workitem;                    

	+          ib_net16_t
base_lid;

	            ipoib_hdr_t
hdr[1];  /* Must be last! */

	 

	 }          ipoib_port_t;

	 

	Slava 

	 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20081002/654cb959/attachment.html>


More information about the ofw mailing list