[ofw] patch: Fix a race in the cl_timer code that caused deadlocks in opensm

Hefty, Sean sean.hefty at intel.com
Thu Jun 24 12:22:28 PDT 2010


There has to be a better/simpler way to handle this than using complex state tracking with locking acquired/released in loops.  Can't we just do something simple like:

struct cl_timer_osd
{
	HANDLE		timer_queue;
	cl_pfn_timer_cb_t	pfn_callback;
	CRITICAL_SECTION	lock;
}

timer_callback(...)
{
	EnterCriticalSection(p_timer->lock);
	p_timer->pfn_callback(..);
	LeaveCriticalSection(p_timer->lock)
}

cl_timer_start(..)
{
	HANDLE timer;
	/* optional:
	DeleteTimerQueueTimerEx(p_timer->timer_queue, NULL); */
	return CreateTimerQueueTimer(&timer, p_timer->timer_queue, timer_callback, ...);
}

cl_timer_stop(..)
{
	DeleteTimerQueueTimerEx(p_timer->timer_queue, INVALID_HANDLE_VALUE);
}

cl_timer_trim(..)
{
	cl_timer_start(..)
}

This seems MUCH simpler and serializes callbacks.  Do we really need all the extra timer state checking goop just to avoid a few additional callbacks?  I have a hard time believing that adding more complex locking than what's already there won't introduce other, more difficult to find race conditions.


> Index: core/complib/user/cl_timer.c
> ===================================================================
> --- core/complib/user/cl_timer.c    (revision 2833)
> +++ core/complib/user/cl_timer.c    (working copy)
> @@ -39,16 +39,54 @@ __timer_callback(
>     IN cl_timer_t* const p_timer,
>     IN BOOLEAN timer_signalled )
>  {
> +   uint64_t time_now;
> +   uint32_t time_ms;
> +
>     /* timer_signalled is always TRUE, and has no value. */
>     CL_ASSERT( timer_signalled );
>     UNUSED_PARAM( timer_signalled );
> 
> -   p_timer->timeout_time = 0;
> +   cl_spinlock_acquire( &p_timer->spinlock );
>     p_timer->thread_id = GetCurrentThreadId();
> +   CL_ASSERT( p_timer->thread_id != 0 );
> 
> -   (p_timer->pfn_callback)( (void*)p_timer->context );
> +   do
> +   {
> +       p_timer->timeout_time = 0;
> +       cl_spinlock_release( &p_timer->spinlock );
> +
> +       (p_timer->pfn_callback)( (void*)p_timer->context );
> +
> +       cl_spinlock_acquire( &p_timer->spinlock );
> +       if( p_timer->timeout_time == 0 )
> +       {
> +           /* Timer was not reset, we're done here. */
> +           break;
> +       }
> +
> +       /* User set the timer again while the callback was running. */
> +       time_now = cl_get_time_stamp();
> +       if( time_now >= p_timer->timeout_time )
> +       {
> +           /* Time expired already - invoke the callback. */
> +           continue;
> +       }
> +
> +       time_ms = (uint32_t)((p_timer->timeout_time - time_now) / 1000ULL);
> +
> +       if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
> __timer_callback,
> +           p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> +       {
> +           /* Error: just invoke the callback now... */
> +           continue;
> +       }
> +       /* Timer set, break out of the loop and let the timer expire. */
> +       break;
> +
> +   } while( p_timer->timeout_time != 0 );
> 
>     p_timer->thread_id = 0;
> +   cl_spinlock_release( &p_timer->spinlock );
>  }
> 
> 
> @@ -57,10 +95,12 @@ cl_timer_construct(
>     IN  cl_timer_t* const   p_timer )
>  {
>     p_timer->h_timer = NULL;
> +   p_timer->pfn_callback = NULL;
>     p_timer->timeout_time = 0;
>     p_timer->thread_id = 0;
> -}
> 
> +   cl_spinlock_construct( &p_timer->spinlock );
> +}
> 
> 
>  cl_status_t
> @@ -76,7 +116,7 @@ cl_timer_init(
> 
>     p_timer->pfn_callback = pfn_callback;
>     p_timer->context = context;
> -   return( CL_SUCCESS );
> +   return cl_spinlock_init(&p_timer->spinlock);
>  }
> 
> 
> @@ -87,48 +127,99 @@ cl_timer_destroy(
>     CL_ASSERT( p_timer );
> 
>     cl_timer_stop( p_timer );
> +   cl_spinlock_destroy( &p_timer->spinlock );
>  }
> 
> 
> -cl_status_t
> -cl_timer_start(
> +static void
> +__cl_timer_stop_locked(
> +   IN  cl_timer_t* const   p_timer
> +   )
> +{
> +   /*
> +    * We must loop here, because we release the lock to delete the timer,
> and
> +    * a different thread might start it before we wakeup and get the lock.
> +    */
> +   while( p_timer->h_timer )
> +   {
> +       /* Timer already queued. */
> +       HANDLE h_timer = p_timer->h_timer;
> +       p_timer->h_timer = NULL;
> +       /* Make sure we block until the timer is cancelled. */
> +       cl_spinlock_release( &p_timer->spinlock );
> +       DeleteTimerQueueTimer( NULL, h_timer, INVALID_HANDLE_VALUE );
> +       cl_spinlock_acquire( &p_timer->spinlock );
> +   }
> +}
> +
> +
> +static cl_status_t
> +__cl_timer_start_locked(
>     IN  cl_timer_t* const   p_timer,
> -   IN  const uint32_t      time_ms )
> +   IN  const uint32_t      time_ms,
> +   IN  const uint64_t      timeout_time )
>  {
> -   CL_ASSERT( p_timer );
> +   p_timer->timeout_time = timeout_time;
> 
> -   cl_timer_stop( p_timer );
> +   if( p_timer->thread_id != 0 )
> +   {
> +       /* Timer callback executing. */
> +       return( CL_SUCCESS );
> +   }
> 
> -   p_timer->timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
> 1000);
> +   __cl_timer_stop_locked( p_timer );
> 
>     if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL, __timer_callback,
>         p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
>     {
>         return( CL_ERROR );
>     }
> -
>     return( CL_SUCCESS );
>  }
> 
> 
>  cl_status_t
> +cl_timer_start(
> +   IN  cl_timer_t* const   p_timer,
> +   IN  const uint32_t      time_ms )
> +{
> +   cl_status_t status = CL_SUCCESS;
> +   CL_ASSERT( p_timer );
> +
> +   cl_spinlock_acquire(&p_timer->spinlock);
> +   status = __cl_timer_start_locked(
> +       p_timer,
> +       time_ms,
> +       cl_get_time_stamp() + (((uint64_t)time_ms) * 1000)
> +       );
> +   cl_spinlock_release(&p_timer->spinlock);
> +   return status;
> +}
> +
> +
> +cl_status_t
>  cl_timer_trim(
>     IN  cl_timer_t* const   p_timer,
>     IN  const uint32_t      time_ms )
>  {
>     uint64_t        timeout_time;
> +   cl_status_t     status = CL_SUCCESS;
> 
>     CL_ASSERT( p_timer );
>     CL_ASSERT( p_timer->pfn_callback );
> 
> +   cl_spinlock_acquire(&p_timer->spinlock);
> +
>     /* Calculate the timeout time in the timer object. */
>     timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);
> 
>     /* Only pull in the timeout time. */
> -   if( p_timer->timeout_time && p_timer->timeout_time < timeout_time )
> -       return( CL_SUCCESS );
> -
> -   return cl_timer_start( p_timer, time_ms );
> +   if( p_timer->timeout_time > timeout_time )
> +   {
> +       status = __cl_timer_start_locked( p_timer, time_ms, timeout_time );
> +   }
> +   cl_spinlock_release(&p_timer->spinlock);
> +   return status;
>  }
> 
> 
> @@ -137,14 +228,15 @@ cl_timer_stop(
>     IN  cl_timer_t* const   p_timer )
>  {
>     CL_ASSERT( p_timer );
> +   cl_spinlock_acquire(&p_timer->spinlock);
> 
> -   if( p_timer->h_timer && p_timer->thread_id != GetCurrentThreadId() )
> +   if( p_timer->thread_id != GetCurrentThreadId() )
>     {
> -       /* Make sure we block until the timer is cancelled. */
> -       DeleteTimerQueueTimer( NULL, p_timer->h_timer, INVALID_HANDLE_VALUE
> );
> -       p_timer->h_timer = NULL;
> +       __cl_timer_stop_locked( p_timer );
>     }
>     p_timer->timeout_time = 0;
> +
> +   cl_spinlock_release(&p_timer->spinlock);
>  }
> 
> 
> Index: inc/user/complib/cl_timer_osd.h
> ===================================================================
> --- inc/user/complib/cl_timer_osd.h (revision 2833)
> +++ inc/user/complib/cl_timer_osd.h (working copy)
> @@ -37,6 +37,7 @@
> 
> 
>  #include "cl_types.h"
> +#include "cl_spinlock.h"
> 
> 
>  /* Timer object definition. */
> @@ -47,8 +48,9 @@ typedef struct _cl_timer
>     const void              *context;
>     uint64_t                timeout_time;
>     DWORD                   thread_id;
> +   cl_spinlock_t           spinlock;
> 
>  } cl_timer_t;
> 
> 
> -#endif // _CL_TIMER_OSD_H_
> \ No newline at end of file
> +#endif // _CL_TIMER_OSD_H_



More information about the ofw mailing list