[ofw] [PATCH] complib/user: fix timer race conditions
Tzachi Dar
tzachid at mellanox.co.il
Fri Jun 25 06:35:09 PDT 2010
A few notes:
The following lines are very problematic:
EnterCriticalSection( &p_timer->cb_lock );
(p_timer->pfn_callback)( (void*)p_timer->context );
LeaveCriticalSection( &p_timer->cb_lock );
Nothing specific with timer, but if you have a code that takes a lock, and call someone else code than this is asking for trouble. The reason is that if he has done something like this (take a lock and call you back) than there is a deadlock. This can be avoided by documentation, but I would rather avoid this issues...
looking at the opensm scenario, this looks reasonable...
One thing that I'm missing here is that start might not do anything but still return success. I would like to have two more return code, one for the case of timer not running because the time is too big, and one because another callback is running. This is to allow the user to know if he should wait for a callback or not.
Another issue here is that start() might end waiting for the callback to return. Again this is a very problematic behavior (for a general timer) since two of this can cerate a deadlock. This is especially true for using process io threads that one has no control over. That is documentation should say, either don't call start when there is already something running, or don't call timer.start() from a system thread. This is because two system threads might create a deadlock.
Again, in the case of opensm this is currently avoided.
One way to avoid this is replace the lines:
if( p_timer->h_timer )
DeleteTimerQueueTimer( p_timer->h_timer_queue, p_timer->h_timer, NULL );
With something that says if timer is running, allocate more memory to store the request, start a new timer if needed and when the callback comes just don't call and free the memory.
Also on start one takes the critical section and calls
if( p_timer->h_timer )
DeleteTimerQueueTimer( p_timer->h_timer_queue, p_timer->h_timer, NULL );
If the callback has already started running but haven't yet called __clear_timer_handle() there is a deadlock here. As one is holding the spinlock waiting for the other to finish running , while the other is trying to capture the spinlock.
So, the general rule should be don't call start on this timer from io threads...
And as I wrote many times here this can all be avoided by creating a new thread (not perfect by itself I agree).
Thanks
Tzachi
> -----Original Message-----
> From: Hefty, Sean [mailto:sean.hefty at intel.com]
> Sent: Friday, June 25, 2010 3:56 AM
> To: Hefty, Sean; Smith, Stan; Fab Tillier; Tzachi Dar;
> ofw at lists.openfabrics.org
> Cc: Uri Habusha; Yevgeny Kliteynik
> Subject: [PATCH] complib/user: fix timer race conditions
>
> This serializes the callbacks for a timer and provides synchronization
> for multi-threaded timer usage.
>
> cl_timer_stop is not allowed to be called from a timer callback.
>
> cl_timer_start simply calls cl_timer_trim
>
> cl_timer_trim cancels any queued timer and inserts a new one if the
> timeout time is less than the last last set timeout.
>
> This eliminates the need to track the callback thread ID and any races
> associated with trying to do so. The synchronization provided is
> straightforward.
>
> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
> ---
> trunk/core/complib/user/cl_timer.c | 95 ++++++++++++++++++++-----
> --------
> trunk/inc/user/complib/cl_timer_osd.h | 4 +
> 2 files changed, 60 insertions(+), 39 deletions(-)
>
> diff --git a/trunk/core/complib/user/cl_timer.c
> b/trunk/core/complib/user/cl_timer.c
> index 4f2ce0f..37c5448 100644
> --- a/trunk/core/complib/user/cl_timer.c
> +++ b/trunk/core/complib/user/cl_timer.c
> @@ -34,21 +34,33 @@
> #include "complib/cl_timer.h"
>
>
> +static HANDLE
> +__clear_timer_handle(
> + IN cl_timer_t* const p_timer )
> +{
> + HANDLE timer;
> +
> + EnterCriticalSection( &p_timer->lock );
> + timer = p_timer->h_timer;
> + p_timer->h_timer = NULL;
> + LeaveCriticalSection( &p_timer->lock );
> +
> + return timer;
> +}
> +
> static void CALLBACK
> __timer_callback(
> IN cl_timer_t* const p_timer,
> IN BOOLEAN timer_signalled )
> {
> - /* timer_signalled is always TRUE, and has no value. */
> - CL_ASSERT( timer_signalled );
> UNUSED_PARAM( timer_signalled );
>
> - p_timer->timeout_time = 0;
> - p_timer->thread_id = GetCurrentThreadId();
> + if( !__clear_timer_handle( p_timer ))
> + return;
>
> + EnterCriticalSection( &p_timer->cb_lock );
> (p_timer->pfn_callback)( (void*)p_timer->context );
> -
> - p_timer->thread_id = 0;
> + LeaveCriticalSection( &p_timer->cb_lock );
> }
>
>
> @@ -56,9 +68,7 @@ void
> cl_timer_construct(
> IN cl_timer_t* const p_timer )
> {
> - p_timer->h_timer = NULL;
> - p_timer->timeout_time = 0;
> - p_timer->thread_id = 0;
> + memset(p_timer, 0, sizeof *p_timer);
> }
>
>
> @@ -76,6 +86,12 @@ cl_timer_init(
>
> p_timer->pfn_callback = pfn_callback;
> p_timer->context = context;
> + p_timer->h_timer_queue = CreateTimerQueue();
> + if( !p_timer->h_timer_queue )
> + return CL_ERROR;
> +
> + InitializeCriticalSection( &p_timer->lock );
> + InitializeCriticalSection( &p_timer->cb_lock );
> return( CL_SUCCESS );
> }
>
> @@ -86,7 +102,12 @@ cl_timer_destroy(
> {
> CL_ASSERT( p_timer );
>
> - cl_timer_stop( p_timer );
> + if( p_timer->h_timer_queue )
> + {
> + DeleteTimerQueueEx( p_timer->h_timer_queue,
> INVALID_HANDLE_VALUE );
> + DeleteCriticalSection( &p_timer->lock );
> + DeleteCriticalSection( &p_timer->cb_lock );
> + }
> }
>
>
> @@ -95,19 +116,7 @@ cl_timer_start(
> IN cl_timer_t* const p_timer,
> IN const uint32_t time_ms )
> {
> - CL_ASSERT( p_timer );
> -
> - cl_timer_stop( p_timer );
> -
> - p_timer->timeout_time = cl_get_time_stamp() +
> (((uint64_t)time_ms) * 1000);
> -
> - if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
> __timer_callback,
> - p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> - {
> - return( CL_ERROR );
> - }
> -
> - return( CL_SUCCESS );
> + return cl_timer_trim( p_timer, time_ms );
> }
>
>
> @@ -116,19 +125,30 @@ cl_timer_trim(
> IN cl_timer_t* const p_timer,
> IN const uint32_t time_ms )
> {
> - uint64_t timeout_time;
> + uint64_t timeout;
> + cl_status_t status = CL_SUCCESS;
>
> CL_ASSERT( p_timer );
> CL_ASSERT( p_timer->pfn_callback );
>
> - /* Calculate the timeout time in the timer object. */
> - timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
> 1000);
> -
> - /* Only pull in the timeout time. */
> - if( p_timer->timeout_time && p_timer->timeout_time < timeout_time
> )
> - return( CL_SUCCESS );
> + timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>
> - return cl_timer_start( p_timer, time_ms );
> + EnterCriticalSection( &p_timer->lock );
> + if ( !p_timer->h_timer || timeout < p_timer->timeout_time )
> + {
> + if( p_timer->h_timer )
> + DeleteTimerQueueTimer( p_timer->h_timer_queue,
> p_timer->h_timer,
> +NULL );
> +
> + p_timer->timeout_time = timeout;
> + if( !CreateTimerQueueTimer( &p_timer->h_timer, p_timer-
> >h_timer_queue,
> + __timer_callback, p_timer, time_ms, 0,
> WT_EXECUTEINIOTHREAD ) )
> + {
> + p_timer->h_timer = NULL;
> + status = CL_ERROR;
> + }
> + }
> + LeaveCriticalSection( &p_timer->lock );
> + return status;
> }
>
>
> @@ -136,15 +156,14 @@ void
> cl_timer_stop(
> IN cl_timer_t* const p_timer )
> {
> + HANDLE timer;
> +
> CL_ASSERT( p_timer );
>
> - if( p_timer->h_timer && p_timer->thread_id !=
> GetCurrentThreadId() )
> - {
> - /* Make sure we block until the timer is cancelled. */
> - DeleteTimerQueueTimer( NULL, p_timer->h_timer,
> INVALID_HANDLE_VALUE );
> - p_timer->h_timer = NULL;
> - }
> - p_timer->timeout_time = 0;
> + timer = __clear_timer_handle( p_timer );
> +
> + if( timer )
> + DeleteTimerQueueTimer( p_timer->h_timer_queue, timer,
> +INVALID_HANDLE_VALUE );
> }
>
>
> diff --git a/trunk/inc/user/complib/cl_timer_osd.h
> b/trunk/inc/user/complib/cl_timer_osd.h
> index 0a023ba..370a175 100644
> --- a/trunk/inc/user/complib/cl_timer_osd.h
> +++ b/trunk/inc/user/complib/cl_timer_osd.h
> @@ -42,11 +42,13 @@
> /* Timer object definition. */
> typedef struct _cl_timer
> {
> + HANDLE h_timer_queue;
> HANDLE h_timer;
> cl_pfn_timer_callback_t pfn_callback;
> const void *context;
> uint64_t timeout_time;
> - DWORD thread_id;
> + CRITICAL_SECTION lock;
> + CRITICAL_SECTION cb_lock;
>
> } cl_timer_t;
>
More information about the ofw
mailing list