[ofw] [PATCH v2] complib/user: fix timer race conditions
Smith, Stan
stan.smith at intel.com
Thu Jul 1 16:04:51 PDT 2010
Eric Lantz (HPC) wrote:
> We've had so much trouble with this particular opensm issue...and
> have a 36-node cluster that repro's the problem readily on which we'd
> like to test the fix. We also have a 256-node cluster with
> Infinihost interfaces on which I'd like to test. I've added James
> Ren to this thread as he's the perf tester here on our team who'll do
> the actual work.
>
> Is there an MSI available (private drop would be fine) which includes:
> 1. opensm timer race fix
> 2. all the memory leak fixes
> 3. RDMA-Read fix in NdEndpoint.cpp
> Such that the driver/sm are the best we currently know how to make?
Funny you should ask, I was just in the process of making a snapshot release; Look for it after 5:00PM PDT.
I'm not sure about the RDMA-Read fix in NdEndpoint.cpp; if it's HEAD of svn then you will get it.
Can someone else speak to this?
stan.
>
> Thanks,
> Eric
>
>
> -----Original Message-----
> From: ofw-bounces at lists.openfabrics.org
> [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Tzachi Dar
> Sent: Tuesday, June 29, 2010 12:49 AM
> To: Hefty, Sean; ofw at lists.openfabrics.org
> Subject: Re: [ofw] [PATCH v2] complib/user: fix timer race conditions
>
> OK, a very nice trick to use a periodic timer.
>
> I also believe that stop can be called from a callback since the
> criticalsections are re-entrant.
> Indeed in that case you will not know if the callback is running
> until one will call cl_timer_destroy.
>
> Thanks
> Tzachi
>
>> -----Original Message-----
>> From: ofw-bounces at lists.openfabrics.org [mailto:ofw-
>> bounces at lists.openfabrics.org] On Behalf Of Hefty, Sean
>> Sent: Tuesday, June 29, 2010 12:51 AM
>> To: ofw at lists.openfabrics.org
>> Subject: [ofw] [PATCH v2] complib/user: fix timer race conditions
>>
>> This serializes the callbacks for a timer and provides
>> synchronization for multi-threaded timer usage.
>>
>> cl_timer_stop is not allowed to be called from a timer callback, and
>> will block until any callback has completed.
>>
>> cl_timer_start simply calls cl_timer_trim
>>
>> cl_timer_trim changes the timeout of a queued timer if the new
>> timeout time is less than the last set timeout.
>>
>> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>> ---
>> Changes from v1:
>> Eliminated TimerQueue. The default one is used.
>> Replaced the one-shot timer with a periodic timer. This lets us use
>> the ChangeTimerQueueTimer (who names these things, really) function
>> to adjust the next timeout. Moved the calculation of the timeout
>> time to inside the critical section to improve its accuracy in case
>> an attempt to acquire the critical section blocks.
>>
>> trunk/core/complib/user/cl_timer.c | 83
>> ++++++++++++++++++------- --------
>> trunk/inc/user/complib/cl_timer_osd.h | 5 +- 2 files changed, 47
>> insertions(+), 41 deletions(-)
>>
>> diff --git a/trunk/core/complib/user/cl_timer.c
>> b/trunk/core/complib/user/cl_timer.c
>> index 4f2ce0f..5cab378 100644
>> --- a/trunk/core/complib/user/cl_timer.c
>> +++ b/trunk/core/complib/user/cl_timer.c
>> @@ -32,23 +32,31 @@
>>
>>
>> #include "complib/cl_timer.h"
>> +
>>
>> +#define CL_MAX_TIME 0xFFFFFFFF
>>
>> +
>> +/*
>> + * If timeout_time is 0, the timer has been stopped. + */
>> static void CALLBACK
>> __timer_callback(
>> IN cl_timer_t* const p_timer,
>> IN BOOLEAN timer_signalled )
>> {
>> - /* timer_signalled is always TRUE, and has no value. */
>> - CL_ASSERT( timer_signalled );
>> + uint64_t timeout;
>> UNUSED_PARAM( timer_signalled );
>>
>> + EnterCriticalSection( &p_timer->cb_lock );
>> + EnterCriticalSection( &p_timer->lock );
>> + timeout = p_timer->timeout_time;
>> p_timer->timeout_time = 0;
>> - p_timer->thread_id = GetCurrentThreadId();
>> + LeaveCriticalSection( &p_timer->lock );
>>
>> - (p_timer->pfn_callback)( (void*)p_timer->context ); -
>> - p_timer->thread_id = 0;
>> + if( timeout )
>> + (p_timer->pfn_callback)( (void*)p_timer->context );
>> + LeaveCriticalSection( &p_timer->cb_lock );
>> }
>>
>>
>> @@ -56,9 +64,7 @@ void
>> cl_timer_construct(
>> IN cl_timer_t* const p_timer )
>> {
>> - p_timer->h_timer = NULL;
>> - p_timer->timeout_time = 0;
>> - p_timer->thread_id = 0;
>> + memset(p_timer, 0, sizeof *p_timer);
>> }
>>
>>
>> @@ -76,6 +82,11 @@ cl_timer_init(
>>
>> p_timer->pfn_callback = pfn_callback;
>> p_timer->context = context;
>> + InitializeCriticalSection( &p_timer->lock );
>> + InitializeCriticalSection( &p_timer->cb_lock );
>> + if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
>> __timer_callback, + p_timer, CL_MAX_TIME, CL_MAX_TIME,
>> WT_EXECUTEINIOTHREAD ) ) + return CL_ERROR; return( CL_SUCCESS );
>> }
>>
>> @@ -86,7 +97,9 @@ cl_timer_destroy(
>> {
>> CL_ASSERT( p_timer );
>>
>> - cl_timer_stop( p_timer );
>> + DeleteTimerQueueTimer( NULL, p_timer->h_timer,
>> INVALID_HANDLE_VALUE );
>> + DeleteCriticalSection( &p_timer->lock );
>> + DeleteCriticalSection( &p_timer->cb_lock );
>> }
>>
>>
>> @@ -95,19 +108,7 @@ cl_timer_start(
>> IN cl_timer_t* const p_timer,
>> IN const uint32_t time_ms )
>> {
>> - CL_ASSERT( p_timer );
>> -
>> - cl_timer_stop( p_timer );
>> -
>> - p_timer->timeout_time = cl_get_time_stamp() +
>> (((uint64_t)time_ms) * 1000);
>> -
>> - if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
>> __timer_callback,
>> - p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
>> - {
>> - return( CL_ERROR );
>> - }
>> -
>> - return( CL_SUCCESS );
>> + return cl_timer_trim( p_timer, time_ms );
>> }
>>
>>
>> @@ -116,35 +117,41 @@ cl_timer_trim(
>> IN cl_timer_t* const p_timer,
>> IN const uint32_t time_ms )
>> {
>> - uint64_t timeout_time;
>> + uint64_t timeout;
>> + cl_status_t status = CL_SUCCESS;
>>
>> CL_ASSERT( p_timer );
>> CL_ASSERT( p_timer->pfn_callback );
>>
>> - /* Calculate the timeout time in the timer object. */
>> - timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>> -
>> - /* Only pull in the timeout time. */
>> - if( p_timer->timeout_time && p_timer->timeout_time < timeout_time )
>> - return( CL_SUCCESS );
>> -
>> - return cl_timer_start( p_timer, time_ms );
>> + EnterCriticalSection( &p_timer->lock );
>> + timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>> + if ( !p_timer->timeout_time || timeout < p_timer->timeout_time )
>> + { + if( ChangeTimerQueueTimer( NULL, p_timer->h_timer, time_ms,
>> CL_MAX_TIME ) ) + p_timer->timeout_time = timeout;
>> + else
>> + status = CL_ERROR;
>> + }
>> + LeaveCriticalSection( &p_timer->lock );
>> + return status;
>> }
>>
>>
>> +/*
>> + * Acquire cb_lock to ensure that all callbacks have completed. + */
>> void
>> cl_timer_stop(
>> IN cl_timer_t* const p_timer )
>> {
>> CL_ASSERT( p_timer );
>>
>> - if( p_timer->h_timer && p_timer->thread_id !=
>> GetCurrentThreadId() )
>> - {
>> - /* Make sure we block until the timer is cancelled. */
>> - DeleteTimerQueueTimer( NULL, p_timer->h_timer,
>> INVALID_HANDLE_VALUE );
>> - p_timer->h_timer = NULL;
>> - }
>> + EnterCriticalSection( &p_timer->cb_lock );
>> + EnterCriticalSection( &p_timer->lock );
>> p_timer->timeout_time = 0;
>> + ChangeTimerQueueTimer( NULL, p_timer->h_timer, CL_MAX_TIME,
>> CL_MAX_TIME ); + LeaveCriticalSection( &p_timer->lock );
>> + LeaveCriticalSection( &p_timer->cb_lock );
>> }
>>
>>
>> diff --git a/trunk/inc/user/complib/cl_timer_osd.h
>> b/trunk/inc/user/complib/cl_timer_osd.h
>> index 0a023ba..a4a1195 100644
>> --- a/trunk/inc/user/complib/cl_timer_osd.h
>> +++ b/trunk/inc/user/complib/cl_timer_osd.h
>> @@ -38,15 +38,14 @@
>>
>> #include "cl_types.h"
>>
>> -
>> -/* Timer object definition. */
>> typedef struct _cl_timer
>> {
>> HANDLE h_timer;
>> cl_pfn_timer_callback_t pfn_callback;
>> const void *context;
>> uint64_t timeout_time;
>> - DWORD thread_id;
>> + CRITICAL_SECTION lock;
>> + CRITICAL_SECTION cb_lock;
>>
>> } cl_timer_t;
>>
>
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
More information about the ofw
mailing list