[ofw] patch: Fix a race in the cl_timer code that caused deadlocks in opensm
Hefty, Sean
sean.hefty at intel.com
Thu Jun 24 12:22:28 PDT 2010
There has to be a better/simpler way to handle this than using complex state tracking with locking acquired/released in loops. Can't we just do something simple like:
struct cl_timer_osd
{
HANDLE timer_queue;
cl_pfn_timer_cb_t pfn_callback;
CRITICAL_SECTION lock;
}
timer_callback(...)
{
EnterCriticalSection(p_timer->lock);
p_timer->pfn_callback(..);
LeaveCriticalSection(p_timer->lock)
}
cl_timer_start(..)
{
HANDLE timer;
/* optional:
DeleteTimerQueueTimerEx(p_timer->timer_queue, NULL); */
return CreateTimerQueueTimer(&timer, p_timer->timer_queue, timer_callback, ...);
}
cl_timer_stop(..)
{
DeleteTimerQueueTimerEx(p_timer->timer_queue, INVALID_HANDLE_VALUE);
}
cl_timer_trim(..)
{
cl_timer_start(..)
}
This seems MUCH simpler and serializes callbacks. Do we really need all the extra timer state checking goop just to avoid a few additional callbacks? I have a hard time believing that adding more complex locking than what's already there won't introduce other, more difficult to find race conditions.
> Index: core/complib/user/cl_timer.c
> ===================================================================
> --- core/complib/user/cl_timer.c (revision 2833)
> +++ core/complib/user/cl_timer.c (working copy)
> @@ -39,16 +39,54 @@ __timer_callback(
> IN cl_timer_t* const p_timer,
> IN BOOLEAN timer_signalled )
> {
> + uint64_t time_now;
> + uint32_t time_ms;
> +
> /* timer_signalled is always TRUE, and has no value. */
> CL_ASSERT( timer_signalled );
> UNUSED_PARAM( timer_signalled );
>
> - p_timer->timeout_time = 0;
> + cl_spinlock_acquire( &p_timer->spinlock );
> p_timer->thread_id = GetCurrentThreadId();
> + CL_ASSERT( p_timer->thread_id != 0 );
>
> - (p_timer->pfn_callback)( (void*)p_timer->context );
> + do
> + {
> + p_timer->timeout_time = 0;
> + cl_spinlock_release( &p_timer->spinlock );
> +
> + (p_timer->pfn_callback)( (void*)p_timer->context );
> +
> + cl_spinlock_acquire( &p_timer->spinlock );
> + if( p_timer->timeout_time == 0 )
> + {
> + /* Timer was not reset, we're done here. */
> + break;
> + }
> +
> + /* User set the timer again while the callback was running. */
> + time_now = cl_get_time_stamp();
> + if( time_now >= p_timer->timeout_time )
> + {
> + /* Time expired already - invoke the callback. */
> + continue;
> + }
> +
> + time_ms = (uint32_t)((p_timer->timeout_time - time_now) / 1000ULL);
> +
> + if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
> __timer_callback,
> + p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> + {
> + /* Error: just invoke the callback now... */
> + continue;
> + }
> + /* Timer set, break out of the loop and let the timer expire. */
> + break;
> +
> + } while( p_timer->timeout_time != 0 );
>
> p_timer->thread_id = 0;
> + cl_spinlock_release( &p_timer->spinlock );
> }
>
>
> @@ -57,10 +95,12 @@ cl_timer_construct(
> IN cl_timer_t* const p_timer )
> {
> p_timer->h_timer = NULL;
> + p_timer->pfn_callback = NULL;
> p_timer->timeout_time = 0;
> p_timer->thread_id = 0;
> -}
>
> + cl_spinlock_construct( &p_timer->spinlock );
> +}
>
>
> cl_status_t
> @@ -76,7 +116,7 @@ cl_timer_init(
>
> p_timer->pfn_callback = pfn_callback;
> p_timer->context = context;
> - return( CL_SUCCESS );
> + return cl_spinlock_init(&p_timer->spinlock);
> }
>
>
> @@ -87,48 +127,99 @@ cl_timer_destroy(
> CL_ASSERT( p_timer );
>
> cl_timer_stop( p_timer );
> + cl_spinlock_destroy( &p_timer->spinlock );
> }
>
>
> -cl_status_t
> -cl_timer_start(
> +static void
> +__cl_timer_stop_locked(
> + IN cl_timer_t* const p_timer
> + )
> +{
> + /*
> + * We must loop here, because we release the lock to delete the timer,
> and
> + * a different thread might start it before we wakeup and get the lock.
> + */
> + while( p_timer->h_timer )
> + {
> + /* Timer already queued. */
> + HANDLE h_timer = p_timer->h_timer;
> + p_timer->h_timer = NULL;
> + /* Make sure we block until the timer is cancelled. */
> + cl_spinlock_release( &p_timer->spinlock );
> + DeleteTimerQueueTimer( NULL, h_timer, INVALID_HANDLE_VALUE );
> + cl_spinlock_acquire( &p_timer->spinlock );
> + }
> +}
> +
> +
> +static cl_status_t
> +__cl_timer_start_locked(
> IN cl_timer_t* const p_timer,
> - IN const uint32_t time_ms )
> + IN const uint32_t time_ms,
> + IN const uint64_t timeout_time )
> {
> - CL_ASSERT( p_timer );
> + p_timer->timeout_time = timeout_time;
>
> - cl_timer_stop( p_timer );
> + if( p_timer->thread_id != 0 )
> + {
> + /* Timer callback executing. */
> + return( CL_SUCCESS );
> + }
>
> - p_timer->timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
> 1000);
> + __cl_timer_stop_locked( p_timer );
>
> if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, __timer_callback,
> p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> {
> return( CL_ERROR );
> }
> -
> return( CL_SUCCESS );
> }
>
>
> cl_status_t
> +cl_timer_start(
> + IN cl_timer_t* const p_timer,
> + IN const uint32_t time_ms )
> +{
> + cl_status_t status = CL_SUCCESS;
> + CL_ASSERT( p_timer );
> +
> + cl_spinlock_acquire(&p_timer->spinlock);
> + status = __cl_timer_start_locked(
> + p_timer,
> + time_ms,
> + cl_get_time_stamp() + (((uint64_t)time_ms) * 1000)
> + );
> + cl_spinlock_release(&p_timer->spinlock);
> + return status;
> +}
> +
> +
> +cl_status_t
> cl_timer_trim(
> IN cl_timer_t* const p_timer,
> IN const uint32_t time_ms )
> {
> uint64_t timeout_time;
> + cl_status_t status = CL_SUCCESS;
>
> CL_ASSERT( p_timer );
> CL_ASSERT( p_timer->pfn_callback );
>
> + cl_spinlock_acquire(&p_timer->spinlock);
> +
> /* Calculate the timeout time in the timer object. */
> timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
>
> /* Only pull in the timeout time. */
> - if( p_timer->timeout_time && p_timer->timeout_time < timeout_time )
> - return( CL_SUCCESS );
> -
> - return cl_timer_start( p_timer, time_ms );
> + if( p_timer->timeout_time > timeout_time )
> + {
> + status = __cl_timer_start_locked( p_timer, time_ms, timeout_time );
> + }
> + cl_spinlock_release(&p_timer->spinlock);
> + return status;
> }
>
>
> @@ -137,14 +228,15 @@ cl_timer_stop(
> IN cl_timer_t* const p_timer )
> {
> CL_ASSERT( p_timer );
> + cl_spinlock_acquire(&p_timer->spinlock);
>
> - if( p_timer->h_timer && p_timer->thread_id != GetCurrentThreadId() )
> + if( p_timer->thread_id != GetCurrentThreadId() )
> {
> - /* Make sure we block until the timer is cancelled. */
> - DeleteTimerQueueTimer( NULL, p_timer->h_timer, INVALID_HANDLE_VALUE
> );
> - p_timer->h_timer = NULL;
> + __cl_timer_stop_locked( p_timer );
> }
> p_timer->timeout_time = 0;
> +
> + cl_spinlock_release(&p_timer->spinlock);
> }
>
>
> Index: inc/user/complib/cl_timer_osd.h
> ===================================================================
> --- inc/user/complib/cl_timer_osd.h (revision 2833)
> +++ inc/user/complib/cl_timer_osd.h (working copy)
> @@ -37,6 +37,7 @@
>
>
> #include "cl_types.h"
> +#include "cl_spinlock.h"
>
>
> /* Timer object definition. */
> @@ -47,8 +48,9 @@ typedef struct _cl_timer
> const void *context;
> uint64_t timeout_time;
> DWORD thread_id;
> + cl_spinlock_t spinlock;
>
> } cl_timer_t;
>
>
> -#endif // _CL_TIMER_OSD_H_
> \ No newline at end of file
> +#endif // _CL_TIMER_OSD_H_
More information about the ofw
mailing list