***SPAM*** Re: [ofw] RE: SM timeout
Hal Rosenstock
hal.rosenstock at gmail.com
Thu Oct 2 11:36:42 PDT 2008
On Thu, Oct 2, 2008 at 2:19 PM, Fab Tillier
<ftillier at windows.microsoft.com> wrote:
> Hi Slava,
>
>>When IB switch is heavy loaded, host will not receive reply from SM. It
>>results with IB_TIMEOUT and IPoIB adapter is shown as disconnected.
>>Suggested solution for IB_TIMEOUT problem.
>
> Why not eliminate the port info query? The following fields of the port info records are used:
>
> - base_lid
> - link_width_active
> - link_speed_active
>
> Why not get these from the port attributes and eliminate the query altogether? The ib_port_attr_t structure has:
>
> - lid (base_lid)
> - active_speed (link_speed_active)
Is this assuming a homogeneous subnet in terms of rate ?
> Missing is link_width_active, but it's known to the HCA in the underlying code.
>
> In fact, is there any reason why the ib_port_attr_t structure shouldn't just embed an ib_port_info_t structure rather than duplicating some of the data and hiding the rest?
>
> If you look at what vstat does, it issues a local MAD to the HCA for the port info record to get the information missing from the port attributes. Fixing the port attributes would simplify everybody's life.
Do you mean PortInfo attribute (SM class) rather than PortInfoRecord
(SA class/query) ?
If this comes from the SMA, what happens when MKey protection is enabled ?
-- Hal
>
> Eliminating the port info query will reduce the load on the SM and should help with the broadcast group query processing. Also, because ipoib_port_up issues the port info query, and is called at PASSIVE_LEVEL, you can substitute the port info query with a ib_local_mad call (just like vstat) for the port info if you don't feel like updating the port attribute structure.
>
> In any case, I made some specific comments inline, but this patch isn't ready. Before coding a fix, take a step back and try to identify the root problem (SM scalability sucks), not just the symptom (SM queries time out). If you can find a way to deal with the root cause you will save yourself dealing with the many different ways the problem manifests itself.
>
> The best way to help the SM scale is to not ask it for information that can be retrieved locally.
>
> -Fab
>
>>Index: ulp/ipoib/kernel/ipoib_port.c
>>===================================================================
>>--- ulp/ipoib/kernel/ipoib_port.c (revision 1627)
>>+++ ulp/ipoib/kernel/ipoib_port.c (working copy)
>>@@ -779,6 +779,11 @@
>>
>> KeCancelTimer(&p_port->gc_timer);
>> KeFlushQueuedDpcs();
>>+ if(p_port->p_query_workitem)
>>+ {
>>+ cl_event_signal(&p_port->query_event);
>>+ cl_event_destroy(&p_port->query_event);
>
> How do you prevent the work item callback from accessing the port structure after it's been freed? What if the work item was just queued but hasn't run yet?
>
> The synchronization here is broken. A timer/DPC pair might be a better choice.
>
>>+ }
>> __endpt_mgr_destroy( p_port );
>> __recv_mgr_destroy( p_port );
>> __send_mgr_destroy( p_port );
>>@@ -5191,8 +5196,71 @@
>> return status;
>> }
>>
>>+static void
>>+__port_query(
>>+ IN PDEVICE_OBJECT DeviceObject,
>>+ IN PVOID context)
>>+{
>>+ ipoib_port_t *p_port;
>>+ ib_pnp_port_rec_t pnp_rec;
>>+ ib_port_attr_t attr;
>>+ PIO_WORKITEM p_work = NULL;
>>+ const uint32_t WAIT_US = 30 * 1000000; // 30 sec
>>
>>+ IPOIB_ENTER( IPOIB_DBG_INIT );
>>+ UNREFERENCED_PARAMETER(DeviceObject);
>>+
>>+ p_port = (ipoib_port_t*)context;
>>+ p_work = p_port->p_query_workitem;
>>+
>>+ if(p_port->base_lid)
>>+ {
>>+ cl_memclr(&attr,sizeof(attr));
>>+ attr.lid = p_port->base_lid;
>>+ cl_memclr(&pnp_rec,sizeof(pnp_rec));
>>+ pnp_rec.p_port_attr = &attr;
>
> Why would you be here if you have a base LID?
>
>>+ }
>>+ if(cl_event_wait_on(&p_port->query_event,WAIT_US,FALSE) ==
>>CL_TIMEOUT)
>>+ {
>>+ if(p_port->base_lid)
>>+ ipoib_port_up(p_port, &pnp_rec);
>>+ else
>>+ __port_get_bcast( p_port );
>>+
>>+ cl_event_destroy(&p_port->query_event);
>>+ p_port->p_query_workitem = NULL;
>>+ }
>>+ IoFreeWorkItem(p_work);
>>+ IPOIB_EXIT( IPOIB_DBG_INIT );
>>+}
>>+
>> static void
>>+port_start_query(
>>+ IN ipoib_port_t *p_port)
>>+{
>>+ DEVICE_OBJECT *p_pdo;
>>+ IPOIB_ENTER( IPOIB_DBG_INIT );
>>+
>>+ CL_ASSERT(!p_port->p_query_workitem);
>>+ NdisMGetDeviceProperty( p_port->p_adapter->h_adapter, &p_pdo,
>>NULL, NULL, NULL, NULL );
>>+ p_port->p_query_workitem = IoAllocateWorkItem(p_pdo);
>>+ if(! p_port->p_query_workitem)
>>+ {
>>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>>IPOIB_DBG_ERROR,
>>+ ("port_start_query failed to allocate
>>workitem\n") );
>>+ return;
>>+ }
>>+
>>+ cl_event_construct(&p_port->query_event);
>>+ cl_event_init(&p_port->query_event, TRUE);
>>+ IoQueueWorkItem(p_port->p_query_workitem,
>>+
>>(PIO_WORKITEM_ROUTINE)__port_query,
>>+ DelayedWorkQueue,
>>+ p_port);
>>+
>>+ IPOIB_EXIT( IPOIB_DBG_INIT );
>>+}
>>+static void
>> __port_info_cb(
>> IN
>>ib_query_rec_t *p_query_rec )
>> {
>>@@ -5252,10 +5320,11 @@
>> break;
>>
>> case IB_TIMEOUT:
>>- NdisWriteErrorLogEntry(
>>p_port->p_adapter->h_adapter,
>>- EVENT_IPOIB_PORT_INFO_TIMEOUT, 0 );
>> IPOIB_PRINT( TRACE_LEVEL_INFORMATION,
>>IPOIB_DBG_INIT,
>> ("Port info query timed out.\n") );
>>+ p_port_rec = (ib_portinfo_record_t*)
>>+ ib_get_query_result(
>>p_query_rec->p_result_mad, 0 );
>>+ p_port->base_lid =
>>p_port_rec->port_info.base_lid;
>
> You're using data from the port record that wasn't filled in by the SM??? Where did p_port_rec->port_info.base_lid get filled in?
>
>> break;
>>
>> case IB_REMOTE_ERROR:
>>@@ -5292,7 +5361,11 @@
>>
>> /* Release the reference taken when issuing the port info
>>query. */
>> ipoib_port_deref( p_port, ref_port_info_cb );
>>-
>>+ if (status == IB_TIMEOUT)
>>+ {
>>+ port_start_query(p_port);
>>+ }
>>+
>> IPOIB_EXIT( IPOIB_DBG_INIT );
>> }
>>
>>@@ -5395,6 +5468,12 @@
>> ("Instance destroying -
>>Aborting.\n") );
>> break;
>>
>>+ case IB_TIMEOUT:
>>+ IPOIB_PRINT(TRACE_LEVEL_INFORMATION,
>>IPOIB_DBG_INIT,
>>+ ("_port_get_bcast - TIMEOUT.\n") );
>>+ p_port->base_lid = 0;
>>+ break;
>>+
>> default:
>> NdisWriteErrorLogEntry(
>>p_port->p_adapter->h_adapter,
>> EVENT_IPOIB_BCAST_GET, 1,
>>p_query_rec->status );
>>@@ -5419,7 +5498,8 @@
>>
>> /* Release the reference taken when issuing the member
>>record query. */
>> ipoib_port_deref( p_port, ref_bcast_get_cb );
>>-
>>+ if( status == IB_TIMEOUT)
>>+ port_start_query(p_port);
>
> So if the broadcast group query fails, you start over with the port info query? Doesn't that just put even more load on the SM? Why not retry the broadcast group query? Why not just increase the number of retries and/or timeout for all these queries to some really large value?
>
>> IPOIB_EXIT( IPOIB_DBG_INIT );
>> }
>>
>>@@ -5663,7 +5743,7 @@
>> IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>> ("Multicast join for broadcast group
>>returned %s.\n",
>>
>>p_port->p_adapter->p_ifc->get_err_str( p_mcast_rec->status )) );
>>- if( status == IB_REMOTE_ERROR )
>>+ if( status == IB_REMOTE_ERROR || status ==
>>IB_TIMEOUT)
>
> You missed a space after IB_TIMEOUT before the closing parethesis...
>
>> {
>> /*
>> * Either:
>>Index: ulp/ipoib/kernel/ipoib_port.h
>>===================================================================
>>--- ulp/ipoib/kernel/ipoib_port.h (revision 1627)
>>+++ ulp/ipoib/kernel/ipoib_port.h (working copy)
>>@@ -516,6 +516,9 @@
>> uint16_t pkey_index;
>> KDPC
>>gc_dpc;
>> KTIMER
>>gc_timer;
>>+ cl_event_t
>>query_event;
>>+ PIO_WORKITEM
>>p_query_workitem;
>>+ ib_net16_t
>>base_lid;
>> ipoib_hdr_t
>>hdr[1]; /* Must be last! */
>>
>> } ipoib_port_t;
>>
>>Slava
>>
>>
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>
More information about the ofw
mailing list