[ofw] SM timeout
Slava Strebkov
slavas at voltaire.com
Thu Oct 2 04:18:57 PDT 2008
When the IB switch is heavily loaded, the host does not receive a reply from the SM. This
results in IB_TIMEOUT, and the IPoIB adapter is shown as disconnected.
The patch below is a suggested solution for the IB_TIMEOUT problem.
Index: ulp/ipoib/kernel/ipoib_port.c
===================================================================
--- ulp/ipoib/kernel/ipoib_port.c (revision 1627)
+++ ulp/ipoib/kernel/ipoib_port.c (working copy)
@@ -779,6 +779,11 @@
KeCancelTimer(&p_port->gc_timer);
KeFlushQueuedDpcs();
+ if(p_port->p_query_workitem)
+ {
+ cl_event_signal(&p_port->query_event);
+ cl_event_destroy(&p_port->query_event);
+ }
__endpt_mgr_destroy( p_port );
__recv_mgr_destroy( p_port );
__send_mgr_destroy( p_port );
@@ -5191,8 +5196,71 @@
return status;
}
+static void /* Work-item routine queued by port_start_query: waits on query_event and, if the wait times out, re-issues the query that timed out. */
+__port_query(
+ IN PDEVICE_OBJECT DeviceObject, /* Unused. */
+ IN PVOID context) /* The ipoib_port_t passed to IoQueueWorkItem. */
+{
+ ipoib_port_t *p_port;
+ ib_pnp_port_rec_t pnp_rec;
+ ib_port_attr_t attr;
+ PIO_WORKITEM p_work = NULL;
+ const uint32_t WAIT_US = 30 * 1000000; // 30 sec
+ IPOIB_ENTER( IPOIB_DBG_INIT );
+ UNREFERENCED_PARAMETER(DeviceObject);
+
+ p_port = (ipoib_port_t*)context;
+ p_work = p_port->p_query_workitem; /* Keep a local copy: the field may be cleared below before the work item is freed. */
+
+ if(p_port->base_lid) /* Non-zero base_lid: build a zeroed port record carrying only the LID, for ipoib_port_up. */
+ {
+ cl_memclr(&attr,sizeof(attr));
+ attr.lid = p_port->base_lid;
+ cl_memclr(&pnp_rec,sizeof(pnp_rec));
+ pnp_rec.p_port_attr = &attr;
+ }
+ if(cl_event_wait_on(&p_port->query_event,WAIT_US,FALSE) ==
CL_TIMEOUT)
+ { /* Wait ended with CL_TIMEOUT: retry, then tear down the event and clear the work-item pointer. */
+ if(p_port->base_lid)
+ ipoib_port_up(p_port, &pnp_rec); /* base_lid set: restart via ipoib_port_up with the minimal record. */
+ else
+ __port_get_bcast( p_port ); /* base_lid is zero: retry the broadcast group query. */
+
+ cl_event_destroy(&p_port->query_event); /* NOTE(review): runs after the retry was issued; if the retry callback reaches port_start_query first, this would act on the re-initialized event - confirm ordering. */
+ p_port->p_query_workitem = NULL;
+ }
+ IoFreeWorkItem(p_work); /* Freed on both paths. NOTE(review): on the non-timeout path p_query_workitem is left pointing at the freed item - confirm nothing reads it afterwards. */
+ IPOIB_EXIT( IPOIB_DBG_INIT );
+}
+
static void /* Allocates a work item for the port, initializes query_event and queues __port_query on the delayed work queue. */
+port_start_query(
+ IN ipoib_port_t *p_port) /* Port whose p_query_workitem and query_event fields are set up here. */
+{
+ DEVICE_OBJECT *p_pdo;
+ IPOIB_ENTER( IPOIB_DBG_INIT );
+
+ CL_ASSERT(!p_port->p_query_workitem); /* Expects no query work item to be recorded on the port yet. */
+ NdisMGetDeviceProperty( p_port->p_adapter->h_adapter, &p_pdo,
NULL, NULL, NULL, NULL );
+ p_port->p_query_workitem = IoAllocateWorkItem(p_pdo); /* Uses the device object obtained from the adapter handle above. */
+ if(! p_port->p_query_workitem)
+ { /* Allocation failed: trace the error and return without queuing anything. */
+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
IPOIB_DBG_ERROR,
+ ("port_start_query failed to allocate
workitem\n") );
+ return;
+ }
+
+ cl_event_construct(&p_port->query_event);
+ cl_event_init(&p_port->query_event, TRUE); /* __port_query waits on this event with a timeout. */
+ IoQueueWorkItem(p_port->p_query_workitem,
+
(PIO_WORKITEM_ROUTINE)__port_query,
+ DelayedWorkQueue,
+ p_port); /* p_port becomes the context argument of __port_query. */
+
+ IPOIB_EXIT( IPOIB_DBG_INIT );
+}
+static void
__port_info_cb(
IN
ib_query_rec_t *p_query_rec )
{
@@ -5252,10 +5320,11 @@
break;
case IB_TIMEOUT:
- NdisWriteErrorLogEntry(
p_port->p_adapter->h_adapter,
- EVENT_IPOIB_PORT_INFO_TIMEOUT, 0 );
IPOIB_PRINT( TRACE_LEVEL_INFORMATION,
IPOIB_DBG_INIT,
("Port info query timed out.\n") );
+ p_port_rec = (ib_portinfo_record_t*)
+ ib_get_query_result(
p_query_rec->p_result_mad, 0 );
+ p_port->base_lid =
p_port_rec->port_info.base_lid;
break;
case IB_REMOTE_ERROR:
@@ -5292,7 +5361,11 @@
/* Release the reference taken when issuing the port info
query. */
ipoib_port_deref( p_port, ref_port_info_cb );
-
+ if (status == IB_TIMEOUT)
+ {
+ port_start_query(p_port);
+ }
+
IPOIB_EXIT( IPOIB_DBG_INIT );
}
@@ -5395,6 +5468,12 @@
("Instance destroying -
Aborting.\n") );
break;
+ case IB_TIMEOUT:
+ IPOIB_PRINT(TRACE_LEVEL_INFORMATION,
IPOIB_DBG_INIT,
+ ("_port_get_bcast - TIMEOUT.\n") );
+ p_port->base_lid = 0;
+ break;
+
default:
NdisWriteErrorLogEntry(
p_port->p_adapter->h_adapter,
EVENT_IPOIB_BCAST_GET, 1,
p_query_rec->status );
@@ -5419,7 +5498,8 @@
/* Release the reference taken when issuing the member
record query. */
ipoib_port_deref( p_port, ref_bcast_get_cb );
-
+ if( status == IB_TIMEOUT)
+ port_start_query(p_port);
IPOIB_EXIT( IPOIB_DBG_INIT );
}
@@ -5663,7 +5743,7 @@
IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
("Multicast join for broadcast group
returned %s.\n",
p_port->p_adapter->p_ifc->get_err_str( p_mcast_rec->status )) );
- if( status == IB_REMOTE_ERROR )
+ if( status == IB_REMOTE_ERROR || status ==
IB_TIMEOUT)
{
/*
* Either:
Index: ulp/ipoib/kernel/ipoib_port.h
===================================================================
--- ulp/ipoib/kernel/ipoib_port.h (revision 1627)
+++ ulp/ipoib/kernel/ipoib_port.h (working copy)
@@ -516,6 +516,9 @@
uint16_t pkey_index;
KDPC
gc_dpc;
KTIMER
gc_timer;
+ cl_event_t
query_event;
+ PIO_WORKITEM
p_query_workitem;
+ ib_net16_t
base_lid;
ipoib_hdr_t
hdr[1]; /* Must be last! */
} ipoib_port_t;
Slava
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20081002/cbf5eb45/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sm_timeout.diff
Type: application/octet-stream
Size: 4507 bytes
Desc: sm_timeout.diff
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20081002/cbf5eb45/attachment.obj>
More information about the ofw
mailing list