[ofw] [PATCH] complib/user: fix timer race conditions

Tzachi Dar tzachid at mellanox.co.il
Fri Jun 25 06:35:09 PDT 2010


A few notes:

The following lines are very problematic:

	EnterCriticalSection( &p_timer->cb_lock );
	(p_timer->pfn_callback)( (void*)p_timer->context );
	LeaveCriticalSection( &p_timer->cb_lock );

Nothing specific with timer, but if you have a code that takes a lock, and call someone else code than this is asking for trouble. The reason is that if he has done something like this (take a lock and call you back) than there is a deadlock. This can be avoided by documentation, but I would rather avoid this issues...

looking at the opensm scenario, this looks reasonable...

One thing that I'm missing here is that start might not do anything but still return success. I would like to have two more return code, one for the case of timer not running because the time is too big, and one because another callback is running. This is to allow the user to know if he should wait for a callback or not.

Another issue here is that start() might end waiting for the callback to return. Again this is a very problematic behavior (for a general timer) since two of this can cerate a deadlock. This is especially true for using process io threads that one has no control over. That is documentation should say, either don't call start when there is already something running, or don't call timer.start() from a system thread. This is because two system threads might create a deadlock.

Again, in the case of opensm this is currently avoided.


One way to avoid this is replace the lines:

		if( p_timer->h_timer )
			DeleteTimerQueueTimer( p_timer->h_timer_queue, p_timer->h_timer, NULL );

With something that says if timer is running, allocate more memory to store the request, start a new timer if needed and when the callback comes just don't call and free the memory.


Also on start one takes the critical section and calls 

		if( p_timer->h_timer )
			DeleteTimerQueueTimer( p_timer->h_timer_queue, p_timer->h_timer, NULL );
If the callback has already started running but haven't yet called __clear_timer_handle() there is a deadlock here. As one is holding the spinlock waiting for the other to finish running , while the other is trying to capture the spinlock.

So, the general rule should be don't call start on this timer from io threads...

And as I wrote many times here this can all be avoided by creating a new thread (not perfect by itself I agree).

Thanks
Tzachi
> -----Original Message-----
> From: Hefty, Sean [mailto:sean.hefty at intel.com]
> Sent: Friday, June 25, 2010 3:56 AM
> To: Hefty, Sean; Smith, Stan; Fab Tillier; Tzachi Dar;
> ofw at lists.openfabrics.org
> Cc: Uri Habusha; Yevgeny Kliteynik
> Subject: [PATCH] complib/user: fix timer race conditions
> 
> This serializes the callbacks for a timer and provides synchronization
> for multi-threaded timer usage.
> 
> cl_timer_stop is not allowed to be called from a timer callback.
> 
> cl_timer_start simply calls cl_timer_trim
> 
> cl_timer_trim cancels any queued timer and inserts a new one if the
> timeout time is less than the last last set timeout.
> 
> This eliminates the need to track the callback thread ID and any races
> associated with trying to do so.  The synchronization provided is
> straightforward.
> 
> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
> ---
>  trunk/core/complib/user/cl_timer.c    |   95 ++++++++++++++++++++-----
> --------
>  trunk/inc/user/complib/cl_timer_osd.h |    4 +
>  2 files changed, 60 insertions(+), 39 deletions(-)
> 
> diff --git a/trunk/core/complib/user/cl_timer.c
> b/trunk/core/complib/user/cl_timer.c
> index 4f2ce0f..37c5448 100644
> --- a/trunk/core/complib/user/cl_timer.c
> +++ b/trunk/core/complib/user/cl_timer.c
> @@ -34,21 +34,33 @@
>  #include "complib/cl_timer.h"
> 
> 
> +static HANDLE
> +__clear_timer_handle(
> +	IN	cl_timer_t* const p_timer )
> +{
> +	HANDLE timer;
> +
> +	EnterCriticalSection( &p_timer->lock );
> +	timer = p_timer->h_timer;
> +	p_timer->h_timer = NULL;
> +	LeaveCriticalSection( &p_timer->lock );
> +
> +	return timer;
> +}
> +
>  static void CALLBACK
>  __timer_callback(
>  	IN cl_timer_t* const p_timer,
>  	IN BOOLEAN timer_signalled )
>  {
> -	/* timer_signalled is always TRUE, and has no value. */
> -	CL_ASSERT( timer_signalled );
>  	UNUSED_PARAM( timer_signalled );
> 
> -	p_timer->timeout_time = 0;
> -	p_timer->thread_id = GetCurrentThreadId();
> +	if( !__clear_timer_handle( p_timer ))
> +		return;
> 
> +	EnterCriticalSection( &p_timer->cb_lock );
>  	(p_timer->pfn_callback)( (void*)p_timer->context );
> -
> -	p_timer->thread_id = 0;
> +	LeaveCriticalSection( &p_timer->cb_lock );
>  }
> 
> 
> @@ -56,9 +68,7 @@ void
>  cl_timer_construct(
>  	IN	cl_timer_t* const	p_timer )
>  {
> -	p_timer->h_timer = NULL;
> -	p_timer->timeout_time = 0;
> -	p_timer->thread_id = 0;
> +	memset(p_timer, 0, sizeof *p_timer);
>  }
> 
> 
> @@ -76,6 +86,12 @@ cl_timer_init(
> 
>  	p_timer->pfn_callback = pfn_callback;
>  	p_timer->context = context;
> +	p_timer->h_timer_queue = CreateTimerQueue();
> +	if( !p_timer->h_timer_queue )
> +		return CL_ERROR;
> +
> +	InitializeCriticalSection( &p_timer->lock );
> +	InitializeCriticalSection( &p_timer->cb_lock );
>  	return( CL_SUCCESS );
>  }
> 
> @@ -86,7 +102,12 @@ cl_timer_destroy(
>  {
>  	CL_ASSERT( p_timer );
> 
> -	cl_timer_stop( p_timer );
> +	if( p_timer->h_timer_queue )
> +	{
> +		DeleteTimerQueueEx( p_timer->h_timer_queue,
> INVALID_HANDLE_VALUE );
> +		DeleteCriticalSection( &p_timer->lock );
> +		DeleteCriticalSection( &p_timer->cb_lock );
> +	}
>  }
> 
> 
> @@ -95,19 +116,7 @@ cl_timer_start(
>  	IN	cl_timer_t* const	p_timer,
>  	IN	const uint32_t		time_ms )
>  {
> -	CL_ASSERT( p_timer );
> -
> -	cl_timer_stop( p_timer );
> -
> -	p_timer->timeout_time = cl_get_time_stamp() +
> (((uint64_t)time_ms) * 1000);
> -
> -	if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
> __timer_callback,
> -		p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> -	{
> -		return( CL_ERROR );
> -	}
> -
> -	return( CL_SUCCESS );
> +	return cl_timer_trim( p_timer, time_ms );
>  }
> 
> 
> @@ -116,19 +125,30 @@ cl_timer_trim(
>  	IN	cl_timer_t* const	p_timer,
>  	IN	const uint32_t		time_ms )
>  {
> -	uint64_t		timeout_time;
> +	uint64_t	timeout;
> +	cl_status_t	status = CL_SUCCESS;
> 
>  	CL_ASSERT( p_timer );
>  	CL_ASSERT( p_timer->pfn_callback );
> 
> -	/* Calculate the timeout time in the timer object. */
> -	timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
> 1000);
> -
> -	/* Only pull in the timeout time. */
> -	if( p_timer->timeout_time && p_timer->timeout_time < timeout_time
> )
> -		return( CL_SUCCESS );
> +	timeout = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
> 
> -	return cl_timer_start( p_timer, time_ms );
> +	EnterCriticalSection( &p_timer->lock );
> +	if ( !p_timer->h_timer || timeout < p_timer->timeout_time )
> +	{
> +		if( p_timer->h_timer )
> +			DeleteTimerQueueTimer( p_timer->h_timer_queue,
> p_timer->h_timer,
> +NULL );
> +
> +		p_timer->timeout_time = timeout;
> +		if( !CreateTimerQueueTimer( &p_timer->h_timer, p_timer-
> >h_timer_queue,
> +			__timer_callback, p_timer, time_ms, 0,
> WT_EXECUTEINIOTHREAD ) )
> +		{
> +			p_timer->h_timer = NULL;
> +			status = CL_ERROR;
> +		}
> +	}
> +	LeaveCriticalSection( &p_timer->lock );
> +	return status;
>  }
> 
> 
> @@ -136,15 +156,14 @@ void
>  cl_timer_stop(
>  	IN	cl_timer_t* const	p_timer )
>  {
> +	HANDLE timer;
> +
>  	CL_ASSERT( p_timer );
> 
> -	if( p_timer->h_timer && p_timer->thread_id !=
> GetCurrentThreadId() )
> -	{
> -		/* Make sure we block until the timer is cancelled. */
> -		DeleteTimerQueueTimer( NULL, p_timer->h_timer,
> INVALID_HANDLE_VALUE );
> -		p_timer->h_timer = NULL;
> -	}
> -	p_timer->timeout_time = 0;
> +	timer = __clear_timer_handle( p_timer );
> +
> +	if( timer )
> +		DeleteTimerQueueTimer( p_timer->h_timer_queue, timer,
> +INVALID_HANDLE_VALUE );
>  }
> 
> 
> diff --git a/trunk/inc/user/complib/cl_timer_osd.h
> b/trunk/inc/user/complib/cl_timer_osd.h
> index 0a023ba..370a175 100644
> --- a/trunk/inc/user/complib/cl_timer_osd.h
> +++ b/trunk/inc/user/complib/cl_timer_osd.h
> @@ -42,11 +42,13 @@
>  /* Timer object definition. */
>  typedef struct _cl_timer
>  {
> +	HANDLE					h_timer_queue;
>  	HANDLE					h_timer;
>  	cl_pfn_timer_callback_t	pfn_callback;
>  	const void				*context;
>  	uint64_t				timeout_time;
> -	DWORD					thread_id;
> +	CRITICAL_SECTION		lock;
> +	CRITICAL_SECTION		cb_lock;
> 
>  } cl_timer_t;
> 




More information about the ofw mailing list