[ofw] [PATCH v2] complib/user: fix timer race conditions

Smith, Stan stan.smith at intel.com
Thu Jul 1 16:04:51 PDT 2010


Eric Lantz (HPC) wrote:
> We've had so much trouble with this particular opensm issue...and
> have a 36-node cluster that repro's the problem readily on which we'd
> like to test the fix.  We also have a 256-node cluster with
> Infinihost interfaces on which I'd like to test.  I've added James
> Ren to this thread as he's the perf tester here on our team who'll do
> the actual work.
>
> Is there an MSI available (private drop would be fine) which includes:
>       1. opensm timer race fix
>       2. all the memory leak fixes
>       3. RDMA-Read fix in NdEndpoint.cpp
> Such that the driver/sm are the best we currently know how to make?

Funny you should ask, I was just in the process of making a snapshot release; Look for it after 5:00PM PDT.
I'm not sure about the RDMA-Read fix in NdEndpoint.cpp; if it's HEAD of svn then you will get it.
Can someone else speak to this?

stan.

>
> Thanks,
> Eric
>
>
> -----Original Message-----
> From: ofw-bounces at lists.openfabrics.org
> [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Tzachi Dar
> Sent: Tuesday, June 29, 2010 12:49 AM
> To: Hefty, Sean; ofw at lists.openfabrics.org
> Subject: Re: [ofw] [PATCH v2] complib/user: fix timer race conditions
>
> OK, a very nice trick to use a periodic timer.
>
> I also believe that stop can be called from a callback since the
> criticalsections are re-entrant.
> Indeed in that case you will not know if the callback is running
> until one will call cl_timer_destroy.
>
> Thanks
> Tzachi
>
>> -----Original Message-----
>> From: ofw-bounces at lists.openfabrics.org [mailto:ofw-
>> bounces at lists.openfabrics.org] On Behalf Of Hefty, Sean
>> Sent: Tuesday, June 29, 2010 12:51 AM
>> To: ofw at lists.openfabrics.org
>> Subject: [ofw] [PATCH v2] complib/user: fix timer race conditions
>>
>> This serializes the callbacks for a timer and provides
>> synchronization for multi-threaded timer usage.
>>
>> cl_timer_stop is not allowed to be called from a timer callback, and
>> will block until any callback has completed.
>>
>> cl_timer_start simply calls cl_timer_trim
>>
>> cl_timer_trim changes the timeout of a queued timer if the new
>> timeout time is less than the last set timeout.
>>
>> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>> ---
>> Changes from v1:
>> Eliminated TimerQueue.  The default one is used.
>> Replaced the one-shot timer with a periodic timer.  This lets us use
>> the ChangeTimerQueueTimer (who names these things, really) function
>> to adjust the next timeout. Moved the calculation of the timeout
>> time to inside the critical section to improve its accuracy in case
>> an attempt to acquire the critical section blocks.
>>
>>  trunk/core/complib/user/cl_timer.c    |   83
>>  ++++++++++++++++++------- --------
>>  trunk/inc/user/complib/cl_timer_osd.h |    5 +- 2 files changed, 47
>> insertions(+), 41 deletions(-)
>>
>> diff --git a/trunk/core/complib/user/cl_timer.c
>> b/trunk/core/complib/user/cl_timer.c
>> index 4f2ce0f..5cab378 100644
>> --- a/trunk/core/complib/user/cl_timer.c
>> +++ b/trunk/core/complib/user/cl_timer.c
>> @@ -32,23 +32,31 @@
>>
>>
>>  #include "complib/cl_timer.h"
>> +
>>
>> +#define CL_MAX_TIME 0xFFFFFFFF
>>
>> +
>> +/*
>> + * If timeout_time is 0, the timer has been stopped. + */
>>  static void CALLBACK
>>  __timer_callback(
>>      IN cl_timer_t* const p_timer,
>>      IN BOOLEAN timer_signalled )
>>  {
>> -    /* timer_signalled is always TRUE, and has no value. */
>> -    CL_ASSERT( timer_signalled );
>> +    uint64_t timeout;
>>      UNUSED_PARAM( timer_signalled );
>>
>> +    EnterCriticalSection( &p_timer->cb_lock );
>> +    EnterCriticalSection( &p_timer->lock );
>> +    timeout = p_timer->timeout_time;
>>      p_timer->timeout_time = 0;
>> -    p_timer->thread_id = GetCurrentThreadId();
>> +    LeaveCriticalSection( &p_timer->lock );
>>
>> -    (p_timer->pfn_callback)( (void*)p_timer->context ); -
>> -    p_timer->thread_id = 0;
>> +    if( timeout )
>> +            (p_timer->pfn_callback)( (void*)p_timer->context );
>> +    LeaveCriticalSection( &p_timer->cb_lock );
>>  }
>>
>>
>> @@ -56,9 +64,7 @@ void
>>  cl_timer_construct(
>>      IN      cl_timer_t* const       p_timer )
>>  {
>> -    p_timer->h_timer = NULL;
>> -    p_timer->timeout_time = 0;
>> -    p_timer->thread_id = 0;
>> +    memset(p_timer, 0, sizeof *p_timer);
>>  }
>>
>>
>> @@ -76,6 +82,11 @@ cl_timer_init(
>>
>>      p_timer->pfn_callback = pfn_callback;
>>      p_timer->context = context;
>> +    InitializeCriticalSection( &p_timer->lock );
>> +    InitializeCriticalSection( &p_timer->cb_lock );
>> +    if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
>> __timer_callback, +          p_timer, CL_MAX_TIME, CL_MAX_TIME,
>>      WT_EXECUTEINIOTHREAD ) ) +              return CL_ERROR; return( CL_SUCCESS );
>>  }
>>
>> @@ -86,7 +97,9 @@ cl_timer_destroy(
>>  {
>>      CL_ASSERT( p_timer );
>>
>> -    cl_timer_stop( p_timer );
>> +    DeleteTimerQueueTimer( NULL, p_timer->h_timer,
>> INVALID_HANDLE_VALUE );
>> +    DeleteCriticalSection( &p_timer->lock );
>> +    DeleteCriticalSection( &p_timer->cb_lock );
>>  }
>>
>>
>> @@ -95,19 +108,7 @@ cl_timer_start(
>>      IN      cl_timer_t* const       p_timer,
>>      IN      const uint32_t          time_ms )
>>  {
>> -    CL_ASSERT( p_timer );
>> -
>> -    cl_timer_stop( p_timer );
>> -
>> -    p_timer->timeout_time = cl_get_time_stamp() +
>> (((uint64_t)time_ms) * 1000);
>> -
>> -    if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
>> __timer_callback,
>> -            p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
>> -    {
>> -            return( CL_ERROR );
>> -    }
>> -
>> -    return( CL_SUCCESS );
>> +    return cl_timer_trim( p_timer, time_ms );
>>  }
>>
>>
>> @@ -116,35 +117,41 @@ cl_timer_trim(
>>      IN      cl_timer_t* const       p_timer,
>>      IN      const uint32_t          time_ms )
>>  {
>> -    uint64_t                timeout_time;
>> +    uint64_t        timeout;
>> +    cl_status_t     status = CL_SUCCESS;
>>
>>      CL_ASSERT( p_timer );
>>      CL_ASSERT( p_timer->pfn_callback );
>>
>> -    /* Calculate the timeout time in the timer object. */
>> -    timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>> -
>> -    /* Only pull in the timeout time. */
>> -    if( p_timer->timeout_time && p_timer->timeout_time < timeout_time )
>> -            return( CL_SUCCESS );
>> -
>> -    return cl_timer_start( p_timer, time_ms );
>> +    EnterCriticalSection( &p_timer->lock );
>> +    timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>> +    if ( !p_timer->timeout_time || timeout < p_timer->timeout_time )
>> +    { +             if( ChangeTimerQueueTimer( NULL, p_timer->h_timer, time_ms,
>> CL_MAX_TIME ) ) +                    p_timer->timeout_time = timeout;
>> +            else
>> +                    status = CL_ERROR;
>> +    }
>> +    LeaveCriticalSection( &p_timer->lock );
>> +    return status;
>>  }
>>
>>
>> +/*
>> + * Acquire cb_lock to ensure that all callbacks have completed. + */
>>  void
>>  cl_timer_stop(
>>      IN      cl_timer_t* const       p_timer )
>>  {
>>      CL_ASSERT( p_timer );
>>
>> -    if( p_timer->h_timer && p_timer->thread_id !=
>> GetCurrentThreadId() )
>> -    {
>> -            /* Make sure we block until the timer is cancelled. */
>> -            DeleteTimerQueueTimer( NULL, p_timer->h_timer,
>> INVALID_HANDLE_VALUE );
>> -            p_timer->h_timer = NULL;
>> -    }
>> +    EnterCriticalSection( &p_timer->cb_lock );
>> +    EnterCriticalSection( &p_timer->lock );
>>      p_timer->timeout_time = 0;
>> +    ChangeTimerQueueTimer( NULL, p_timer->h_timer, CL_MAX_TIME,
>> CL_MAX_TIME ); +     LeaveCriticalSection( &p_timer->lock );
>> +    LeaveCriticalSection( &p_timer->cb_lock );
>>  }
>>
>>
>> diff --git a/trunk/inc/user/complib/cl_timer_osd.h
>> b/trunk/inc/user/complib/cl_timer_osd.h
>> index 0a023ba..a4a1195 100644
>> --- a/trunk/inc/user/complib/cl_timer_osd.h
>> +++ b/trunk/inc/user/complib/cl_timer_osd.h
>> @@ -38,15 +38,14 @@
>>
>>  #include "cl_types.h"
>>
>> -
>> -/* Timer object definition. */
>>  typedef struct _cl_timer
>>  {
>>      HANDLE                                  h_timer;
>>      cl_pfn_timer_callback_t pfn_callback;
>>      const void                              *context;
>>      uint64_t                                timeout_time;
>> -    DWORD                                   thread_id;
>> +    CRITICAL_SECTION                lock;
>> +    CRITICAL_SECTION                cb_lock;
>>
>>  } cl_timer_t;
>>
>
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw




More information about the ofw mailing list