[ofw] patch: Fix a race in the cl_timer code that caused deadlocks in opensm

Fab Tillier ftillier at microsoft.com
Thu Jun 24 14:04:33 PDT 2010


Hefty, Sean wrote on Thu, 24 Jun 2010 at 12:22:28
> 
> There has to be a better/simpler way to handle this than using complex
> state tracking with locking acquired/released in loops.  Can't we just
> do something simple like:

It depends on whether clients expect to have thread-safe timers or not.  That is, do you allow multiple threads to act on a timer simultaneously?  I'm inclined to say not, but Tzachi previously mentioned that there were multiple threads in OpenSM that were starting the timer.  If the only reason that happened is due to multiple callbacks, then we can probably get away without serialization, in which case my code changes also become quite a bit simpler.

> struct cl_timer_osd
> {
> 	HANDLE		timer_queue;
> 	cl_pfn_timer_cb_t	pfn_callback;
> 	CRITICAL_SECTION	lock;
> }
> 
> timer_callback(...)
> {
> 	EnterCriticalSection(p_timer->lock);
> 	p_timer->pfn_callback(..);
> 	LeaveCriticalSection(p_timer->lock);

Didn't we try holding a lock over the callback and ended up with things not working?  This goes back to using the threadpool but not knowing exactly how it spins up threads etc.



> }
> 
> cl_timer_start(..)
> {
> 	HANDLE timer;
> 	/* optional:
> 	DeleteTimerQueueTimerEx(p_timer->timer_queue, NULL); */
> 	return CreateTimerQueueTimer(&timer, p_timer->timer_queue,
> timer_callback, ...);
> }
> 
> cl_timer_stop(..)
> {
> 	DeleteTimerQueueTimerEx(p_timer->timer_queue,
> INVALID_HANDLE_VALUE);
> }
> 
> cl_timer_trim(..)
> {
> 	cl_timer_start(..)
> }
> 
> This seems MUCH simpler and serializes callbacks.  Do we really need
> all the extra timer state checking goop just to avoid a few additional
> callbacks?  I have a hard time believing that adding more complex
> locking than what's already there won't introduce other, more difficult
> to find race conditions.

Depends on whether there ends up being issues with multiple thread pool threads blocked on that critsec.  I thought there was an issue reported with this...

-Fab
> 
> 
> > Index: core/complib/user/cl_timer.c
> > ===================================================================
> > --- core/complib/user/cl_timer.c    (revision 2833)
> > +++ core/complib/user/cl_timer.c    (working copy)
> > @@ -39,16 +39,54 @@ __timer_callback(
> >     IN cl_timer_t* const p_timer,
> >     IN BOOLEAN timer_signalled )
> >  {
> > +   uint64_t time_now;
> > +   uint32_t time_ms;
> > +
> >     /* timer_signalled is always TRUE, and has no value. */
> >     CL_ASSERT( timer_signalled );
> >     UNUSED_PARAM( timer_signalled );
> >
> > -   p_timer->timeout_time = 0;
> > +   cl_spinlock_acquire( &p_timer->spinlock );
> >     p_timer->thread_id = GetCurrentThreadId();
> > +   CL_ASSERT( p_timer->thread_id != 0 );
> >
> > -   (p_timer->pfn_callback)( (void*)p_timer->context );
> > +   do
> > +   {
> > +       p_timer->timeout_time = 0;
> > +       cl_spinlock_release( &p_timer->spinlock );
> > +
> > +       (p_timer->pfn_callback)( (void*)p_timer->context );
> > +
> > +       cl_spinlock_acquire( &p_timer->spinlock );
> > +       if( p_timer->timeout_time == 0 )
> > +       {
> > +           /* Timer was not reset, we're done here. */
> > +           break;
> > +       }
> > +
> > +       /* User set the timer again while the callback was running.
> */
> > +       time_now = cl_get_time_stamp();
> > +       if( time_now >= p_timer->timeout_time )
> > +       {
> > +           /* Time expired already - invoke the callback. */
> > +           continue;
> > +       }
> > +
> > +       time_ms = (uint32_t)((p_timer->timeout_time - time_now) /
> 1000ULL);
> > +
> > +       if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
> > __timer_callback,
> > +           p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> > +       {
> > +           /* Error: just invoke the callback now... */
> > +           continue;
> > +       }
> > +       /* Timer set, break out of the loop and let the timer expire.
> */
> > +       break;
> > +
> > +   } while( p_timer->timeout_time != 0 );
> >
> >     p_timer->thread_id = 0;
> > +   cl_spinlock_release( &p_timer->spinlock );
> >  }
> >
> >
> > @@ -57,10 +95,12 @@ cl_timer_construct(
> >     IN  cl_timer_t* const   p_timer )
> >  {
> >     p_timer->h_timer = NULL;
> > +   p_timer->pfn_callback = NULL;
> >     p_timer->timeout_time = 0;
> >     p_timer->thread_id = 0;
> > -}
> >
> > +   cl_spinlock_construct( &p_timer->spinlock );
> > +}
> >
> >
> >  cl_status_t
> > @@ -76,7 +116,7 @@ cl_timer_init(
> >
> >     p_timer->pfn_callback = pfn_callback;
> >     p_timer->context = context;
> > -   return( CL_SUCCESS );
> > +   return cl_spinlock_init(&p_timer->spinlock);
> >  }
> >
> >
> > @@ -87,48 +127,99 @@ cl_timer_destroy(
> >     CL_ASSERT( p_timer );
> >
> >     cl_timer_stop( p_timer );
> > +   cl_spinlock_destroy( &p_timer->spinlock );
> >  }
> >
> >
> > -cl_status_t
> > -cl_timer_start(
> > +static void
> > +__cl_timer_stop_locked(
> > +   IN  cl_timer_t* const   p_timer
> > +   )
> > +{
> > +   /*
> > +    * We must loop here, because we release the lock to delete the
> timer,
> > and
> > +    * a different thread might start it before we wakeup and get the
> lock.
> > +    */
> > +   while( p_timer->h_timer )
> > +   {
> > +       /* Timer already queued. */
> > +       HANDLE h_timer = p_timer->h_timer;
> > +       p_timer->h_timer = NULL;
> > +       /* Make sure we block until the timer is cancelled. */
> > +       cl_spinlock_release( &p_timer->spinlock );
> > +       DeleteTimerQueueTimer( NULL, h_timer, INVALID_HANDLE_VALUE );
> > +       cl_spinlock_acquire( &p_timer->spinlock );
> > +   }
> > +}
> > +
> > +
> > +static cl_status_t
> > +__cl_timer_start_locked(
> >     IN  cl_timer_t* const   p_timer,
> > -   IN  const uint32_t      time_ms )
> > +   IN  const uint32_t      time_ms,
> > +   IN  const uint64_t      timeout_time )
> >  {
> > -   CL_ASSERT( p_timer );
> > +   p_timer->timeout_time = timeout_time;
> >
> > -   cl_timer_stop( p_timer );
> > +   if( p_timer->thread_id != 0 )
> > +   {
> > +       /* Timer callback executing. */
> > +       return( CL_SUCCESS );
> > +   }
> >
> > -   p_timer->timeout_time = cl_get_time_stamp() +
> (((uint64_t)time_ms) *
> > 1000);
> > +   __cl_timer_stop_locked( p_timer );
> >
> >     if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
> __timer_callback,
> >         p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )
> >     {
> >         return( CL_ERROR );
> >     }
> > -
> >     return( CL_SUCCESS );
> >  }
> >
> >
> >  cl_status_t
> > +cl_timer_start(
> > +   IN  cl_timer_t* const   p_timer,
> > +   IN  const uint32_t      time_ms )
> > +{
> > +   cl_status_t status = CL_SUCCESS;
> > +   CL_ASSERT( p_timer );
> > +
> > +   cl_spinlock_acquire(&p_timer->spinlock);
> > +   status = __cl_timer_start_locked(
> > +       p_timer,
> > +       time_ms,
> > +       cl_get_time_stamp() + (((uint64_t)time_ms) * 1000)
> > +       );
> > +   cl_spinlock_release(&p_timer->spinlock);
> > +   return status;
> > +}
> > +
> > +
> > +cl_status_t
> >  cl_timer_trim(
> >     IN  cl_timer_t* const   p_timer,
> >     IN  const uint32_t      time_ms )
> >  {
> >     uint64_t        timeout_time;
> > +   cl_status_t     status = CL_SUCCESS;
> >
> >     CL_ASSERT( p_timer );
> >     CL_ASSERT( p_timer->pfn_callback );
> >
> > +   cl_spinlock_acquire(&p_timer->spinlock);
> > +
> >     /* Calculate the timeout time in the timer object. */
> >     timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
> 1000);
> >
> >     /* Only pull in the timeout time. */
> > -   if( p_timer->timeout_time && p_timer->timeout_time < timeout_time
> )
> > -       return( CL_SUCCESS );
> > -
> > -   return cl_timer_start( p_timer, time_ms );
> > +   if( p_timer->timeout_time > timeout_time )
> > +   {
> > +       status = __cl_timer_start_locked( p_timer, time_ms,
> timeout_time );
> > +   }
> > +   cl_spinlock_release(&p_timer->spinlock);
> > +   return status;
> >  }
> >
> >
> > @@ -137,14 +228,15 @@ cl_timer_stop(
> >     IN  cl_timer_t* const   p_timer )
> >  {
> >     CL_ASSERT( p_timer );
> > +   cl_spinlock_acquire(&p_timer->spinlock);
> >
> > -   if( p_timer->h_timer && p_timer->thread_id !=
> GetCurrentThreadId() )
> > +   if( p_timer->thread_id != GetCurrentThreadId() )
> >     {
> > -       /* Make sure we block until the timer is cancelled. */
> > -       DeleteTimerQueueTimer( NULL, p_timer->h_timer,
> INVALID_HANDLE_VALUE
> > );
> > -       p_timer->h_timer = NULL;
> > +       __cl_timer_stop_locked( p_timer );
> >     }
> >     p_timer->timeout_time = 0;
> > +
> > +   cl_spinlock_release(&p_timer->spinlock);
> >  }
> >
> >
> > Index: inc/user/complib/cl_timer_osd.h
> > ===================================================================
> > --- inc/user/complib/cl_timer_osd.h (revision 2833)
> > +++ inc/user/complib/cl_timer_osd.h (working copy)
> > @@ -37,6 +37,7 @@
> >
> >
> >  #include "cl_types.h"
> > +#include "cl_spinlock.h"
> >
> >
> >  /* Timer object definition. */
> > @@ -47,8 +48,9 @@ typedef struct _cl_timer
> >     const void              *context;
> >     uint64_t                timeout_time;
> >     DWORD                   thread_id;
> > +   cl_spinlock_t           spinlock;
> >
> >  } cl_timer_t;
> >
> >
> > -#endif // _CL_TIMER_OSD_H_
> > \ No newline at end of file
> > +#endif // _CL_TIMER_OSD_H_



More information about the ofw mailing list