[ofw] [PATCH v2] complib/user: fix timer race conditions

Smith, Stan stan.smith at intel.com
Thu Jul 1 18:18:24 PDT 2010


Smith, Stan wrote:
> Eric Lantz (HPC) wrote:
>> We've had so much trouble with this particular opensm issue...and
>> have a 36-node cluster that repro's the problem readily on which we'd
>> like to test the fix.  We also have a 256-node cluster with
>> Infinihost interfaces on which I'd like to test.  I've added James
>> Ren to this thread as he's the perf tester here on our team who'll
>> do the actual work.
>>
>> Is there an MSI available (private drop would be fine) which
>> includes:
>>      1. opensm timer race fix
>>      2. all the memory leak fixes
>>      3. RDMA-Read fix in NdEndpoint.cpp
>> Such that the driver/sm are the best we currently know how to make?
>
> Funny you should ask, I was just in the process of making a snapshot
> release; Look for it after 5:00PM PDT. I'm not sure about the
> RDMA-Read fix in NdEndpoint.cpp; if it's HEAD of svn then you will
> get it.
> Can someone else speak to this?
>
> stan.

Including #3 (thanks Fab) please find svn.2843 based installers @
http://www.openfabrics.org/downloads/Windows/SnapShots/v2843/

enjoy,

stan.

>
>>
>> Thanks,
>> Eric
>>
>>
>> -----Original Message-----
>> From: ofw-bounces at lists.openfabrics.org
>> [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Tzachi Dar
>> Sent: Tuesday, June 29, 2010 12:49 AM
>> To: Hefty, Sean; ofw at lists.openfabrics.org
>> Subject: Re: [ofw] [PATCH v2] complib/user: fix timer race conditions
>>
>> OK, a very nice trick to use a periodic timer.
>>
>> I also believe that stop can be called from a callback since the
>> criticalsections are re-entrant.
>> Indeed in that case you will not know if the callback is running
>> until one will call cl_timer_destroy.
>>
>> Thanks
>> Tzachi
>>
>>> -----Original Message-----
>>> From: ofw-bounces at lists.openfabrics.org [mailto:ofw-
>>> bounces at lists.openfabrics.org] On Behalf Of Hefty, Sean
>>> Sent: Tuesday, June 29, 2010 12:51 AM
>>> To: ofw at lists.openfabrics.org
>>> Subject: [ofw] [PATCH v2] complib/user: fix timer race conditions
>>>
>>> This serializes the callbacks for a timer and provides
>>> synchronization for multi-threaded timer usage.
>>>
>>> cl_timer_stop is not allowed to be called from a timer callback, and
>>> will block until any callback has completed.
>>>
>>> cl_timer_start simply calls cl_timer_trim
>>>
>>> cl_timer_trim changes the timeout of a queued timer if the new
>>> timeout time is less than the last set timeout.
>>>
>>> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>>> ---
>>> Changes from v1:
>>> Eliminated TimerQueue.  The default one is used.
>>> Replaced the one-shot timer with a periodic timer.  This lets us use
>>> the ChangeTimerQueueTimer (who names these things, really) function
>>> to adjust the next timeout. Moved the calculation of the timeout
>>> time to inside the critical section to improve its accuracy in case
>>> an attempt to acquire the critical section blocks.
>>>
>>>  trunk/core/complib/user/cl_timer.c    |   83
>>>  ++++++++++++++++++------- --------
>>>  trunk/inc/user/complib/cl_timer_osd.h |    5 +- 2 files changed, 47
>>> insertions(+), 41 deletions(-)
>>>
>>> diff --git a/trunk/core/complib/user/cl_timer.c
>>> b/trunk/core/complib/user/cl_timer.c
>>> index 4f2ce0f..5cab378 100644
>>> --- a/trunk/core/complib/user/cl_timer.c
>>> +++ b/trunk/core/complib/user/cl_timer.c
>>> @@ -32,23 +32,31 @@
>>>
>>>
>>>  #include "complib/cl_timer.h"
>>> +
>>>
>>> +#define CL_MAX_TIME 0xFFFFFFFF
>>>
>>> +
>>> +/*
>>> + * If timeout_time is 0, the timer has been stopped. + */  static
>>>  void CALLBACK __timer_callback(
>>>     IN cl_timer_t* const p_timer,
>>>     IN BOOLEAN timer_signalled )
>>>  {
>>> -   /* timer_signalled is always TRUE, and has no value. */
>>> -   CL_ASSERT( timer_signalled );
>>> +   uint64_t timeout;
>>>     UNUSED_PARAM( timer_signalled );
>>>
>>> +   EnterCriticalSection( &p_timer->cb_lock );
>>> +   EnterCriticalSection( &p_timer->lock );
>>> +   timeout = p_timer->timeout_time;
>>>     p_timer->timeout_time = 0;
>>> -   p_timer->thread_id = GetCurrentThreadId();
>>> +   LeaveCriticalSection( &p_timer->lock );
>>>
>>> -   (p_timer->pfn_callback)( (void*)p_timer->context ); -
>>> -   p_timer->thread_id = 0;
>>> +   if( timeout )
>>> +           (p_timer->pfn_callback)( (void*)p_timer->context );
>>> +   LeaveCriticalSection( &p_timer->cb_lock );
>>>  }
>>>
>>>
>>> @@ -56,9 +64,7 @@ void
>>>  cl_timer_construct(
>>>     IN      cl_timer_t* const       p_timer )
>>>  {
>>> -   p_timer->h_timer = NULL;
>>> -   p_timer->timeout_time = 0;
>>> -   p_timer->thread_id = 0;
>>> +   memset(p_timer, 0, sizeof *p_timer);
>>>  }
>>>
>>>
>>> @@ -76,6 +82,11 @@ cl_timer_init(
>>>
>>>     p_timer->pfn_callback = pfn_callback;
>>>     p_timer->context = context;
>>> +   InitializeCriticalSection( &p_timer->lock );
>>> +   InitializeCriticalSection( &p_timer->cb_lock );
>>> +   if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
>>> __timer_callback, +         p_timer, CL_MAX_TIME, CL_MAX_TIME,
>>>     WT_EXECUTEINIOTHREAD ) ) +              return CL_ERROR; return( CL_SUCCESS
>>> );  }
>>>
>>> @@ -86,7 +97,9 @@ cl_timer_destroy(
>>>  {
>>>     CL_ASSERT( p_timer );
>>>
>>> -   cl_timer_stop( p_timer );
>>> +   DeleteTimerQueueTimer( NULL, p_timer->h_timer,
>>> INVALID_HANDLE_VALUE );
>>> +   DeleteCriticalSection( &p_timer->lock );
>>> +   DeleteCriticalSection( &p_timer->cb_lock );
>>>  }
>>>
>>>
>>> @@ -95,19 +108,7 @@ cl_timer_start(
>>>     IN      cl_timer_t* const       p_timer,
>>>     IN      const uint32_t          time_ms )
>>>  {
>>> -   CL_ASSERT( p_timer );
>>> -
>>> -   cl_timer_stop( p_timer );
>>> -
>>> -   p_timer->timeout_time = cl_get_time_stamp() +
>>> (((uint64_t)time_ms) * 1000);
>>> -
>>> -   if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
>>> __timer_callback,
>>> -           p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
>>> -   {
>>> -           return( CL_ERROR );
>>> -   }
>>> -
>>> -   return( CL_SUCCESS );
>>> +   return cl_timer_trim( p_timer, time_ms );
>>>  }
>>>
>>>
>>> @@ -116,35 +117,41 @@ cl_timer_trim(
>>>     IN      cl_timer_t* const       p_timer,
>>>     IN      const uint32_t          time_ms )
>>>  {
>>> -   uint64_t                timeout_time;
>>> +   uint64_t        timeout;
>>> +   cl_status_t     status = CL_SUCCESS;
>>>
>>>     CL_ASSERT( p_timer );
>>>     CL_ASSERT( p_timer->pfn_callback );
>>>
>>> -   /* Calculate the timeout time in the timer object. */
>>> -   timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
>>> 1000); -
>>> -   /* Only pull in the timeout time. */
>>> -   if( p_timer->timeout_time && p_timer->timeout_time < timeout_time
>>> )
>>> -           return( CL_SUCCESS );
>>> -
>>> -   return cl_timer_start( p_timer, time_ms );
>>> +   EnterCriticalSection( &p_timer->lock );
>>> +   timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>>> +   if ( !p_timer->timeout_time || timeout < p_timer->timeout_time )
>>> +   { +             if( ChangeTimerQueueTimer( NULL, p_timer->h_timer, time_ms,
>>> CL_MAX_TIME ) ) +                   p_timer->timeout_time = timeout; +              else
>>> +                   status = CL_ERROR;
>>> +   }
>>> +   LeaveCriticalSection( &p_timer->lock );
>>> +   return status;
>>>  }
>>>
>>>
>>> +/*
>>> + * Acquire cb_lock to ensure that all callbacks have completed. +
>>>  */  void cl_timer_stop(
>>>     IN      cl_timer_t* const       p_timer )
>>>  {
>>>     CL_ASSERT( p_timer );
>>>
>>> -   if( p_timer->h_timer && p_timer->thread_id !=
>>> GetCurrentThreadId() )
>>> -   {
>>> -           /* Make sure we block until the timer is cancelled. */
>>> -           DeleteTimerQueueTimer( NULL, p_timer->h_timer,
>>> INVALID_HANDLE_VALUE );
>>> -           p_timer->h_timer = NULL;
>>> -   }
>>> +   EnterCriticalSection( &p_timer->cb_lock );
>>> +   EnterCriticalSection( &p_timer->lock );
>>>     p_timer->timeout_time = 0;
>>> +   ChangeTimerQueueTimer( NULL, p_timer->h_timer, CL_MAX_TIME,
>>> CL_MAX_TIME ); +    LeaveCriticalSection( &p_timer->lock );
>>> +   LeaveCriticalSection( &p_timer->cb_lock );
>>>  }
>>>
>>>
>>> diff --git a/trunk/inc/user/complib/cl_timer_osd.h
>>> b/trunk/inc/user/complib/cl_timer_osd.h
>>> index 0a023ba..a4a1195 100644
>>> --- a/trunk/inc/user/complib/cl_timer_osd.h
>>> +++ b/trunk/inc/user/complib/cl_timer_osd.h
>>> @@ -38,15 +38,14 @@
>>>
>>>  #include "cl_types.h"
>>>
>>> -
>>> -/* Timer object definition. */
>>>  typedef struct _cl_timer
>>>  {
>>>     HANDLE                                  h_timer;
>>>     cl_pfn_timer_callback_t pfn_callback;
>>>     const void                              *context;
>>>     uint64_t                                timeout_time;
>>> -   DWORD                                   thread_id;
>>> +   CRITICAL_SECTION                lock;
>>> +   CRITICAL_SECTION                cb_lock;
>>>
>>>  } cl_timer_t;
>>>
>>
>> _______________________________________________
>> ofw mailing list
>> ofw at lists.openfabrics.org
>> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>>
>> _______________________________________________
>> ofw mailing list
>> ofw at lists.openfabrics.org
>> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw




More information about the ofw mailing list