<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML xmlns="http://www.w3.org/TR/REC-html40" xmlns:v =
"urn:schemas-microsoft-com:vml" xmlns:o =
"urn:schemas-microsoft-com:office:office" xmlns:w =
"urn:schemas-microsoft-com:office:word" xmlns:x =
"urn:schemas-microsoft-com:office:excel" xmlns:p =
"urn:schemas-microsoft-com:office:powerpoint" xmlns:a =
"urn:schemas-microsoft-com:office:access" xmlns:dt =
"uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:s =
"uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882" xmlns:rs =
"urn:schemas-microsoft-com:rowset" xmlns:z = "#RowsetSchema" xmlns:b =
"urn:schemas-microsoft-com:office:publisher" xmlns:ss =
"urn:schemas-microsoft-com:office:spreadsheet" xmlns:c =
"urn:schemas-microsoft-com:office:component:spreadsheet" xmlns:odc =
"urn:schemas-microsoft-com:office:odc" xmlns:oa =
"urn:schemas-microsoft-com:office:activation" xmlns:html =
"http://www.w3.org/TR/REC-html40" xmlns:q =
"http://schemas.xmlsoap.org/soap/envelope/" xmlns:rtc =
"http://microsoft.com/officenet/conferencing" XMLNS:D = "DAV:" XMLNS:Repl =
"http://schemas.microsoft.com/repl/" xmlns:mt =
"http://schemas.microsoft.com/sharepoint/soap/meetings/" xmlns:x2 =
"http://schemas.microsoft.com/office/excel/2003/xml" xmlns:ppda =
"http://www.passport.com/NameSpace.xsd" xmlns:ois =
"http://schemas.microsoft.com/sharepoint/soap/ois/" xmlns:dir =
"http://schemas.microsoft.com/sharepoint/soap/directory/" xmlns:ds =
"http://www.w3.org/2000/09/xmldsig#" xmlns:dsp =
"http://schemas.microsoft.com/sharepoint/dsp" xmlns:udc =
"http://schemas.microsoft.com/data/udc" xmlns:xsd =
"http://www.w3.org/2001/XMLSchema" xmlns:sub =
"http://schemas.microsoft.com/sharepoint/soap/2002/1/alerts/" xmlns:ec =
"http://www.w3.org/2001/04/xmlenc#" xmlns:sp =
"http://schemas.microsoft.com/sharepoint/" xmlns:sps =
"http://schemas.microsoft.com/sharepoint/soap/" xmlns:xsi =
"http://www.w3.org/2001/XMLSchema-instance" xmlns:udcs =
"http://schemas.microsoft.com/data/udc/soap" xmlns:udcxf =
"http://schemas.microsoft.com/data/udc/xmlfile" xmlns:udcp2p =
"http://schemas.microsoft.com/data/udc/parttopart" xmlns:wf =
"http://schemas.microsoft.com/sharepoint/soap/workflow/" xmlns:dsss =
"http://schemas.microsoft.com/office/2006/digsig-setup" xmlns:dssi =
"http://schemas.microsoft.com/office/2006/digsig" xmlns:mdssi =
"http://schemas.openxmlformats.org/package/2006/digital-signature" xmlns:mver =
"http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:m =
"http://schemas.microsoft.com/office/2004/12/omml" xmlns:mrels =
"http://schemas.openxmlformats.org/package/2006/relationships" xmlns:spwp =
"http://microsoft.com/sharepoint/webpartpages" xmlns:ex12t =
"http://schemas.microsoft.com/exchange/services/2006/types" xmlns:ex12m =
"http://schemas.microsoft.com/exchange/services/2006/messages" xmlns:pptsl =
"http://schemas.microsoft.com/sharepoint/soap/SlideLibrary/" xmlns:spsl =
"http://microsoft.com/webservices/SharePointPortalServer/PublishedLinksService"
XMLNS:Z = "urn:schemas-microsoft-com:" xmlns:st = ""><HEAD>
<META content="text/html; charset=us-ascii" http-equiv=Content-Type>
<META name=GENERATOR content="MSHTML 8.00.6001.18904"><!--[if !mso]>
<STYLE>v\:* {
BEHAVIOR: url(#default#VML)
}
o\:* {
BEHAVIOR: url(#default#VML)
}
w\:* {
BEHAVIOR: url(#default#VML)
}
.shape {
BEHAVIOR: url(#default#VML)
}
</STYLE>
<![endif]-->
<STYLE>@font-face {
font-family: Cambria Math;
}
@font-face {
font-family: Calibri;
}
@font-face {
font-family: Tahoma;
}
@page WordSection1 {size: 8.5in 11.0in; margin: 1.0in 1.0in 1.0in 1.0in; }
P.MsoNormal {
MARGIN: 0in 0in 0pt; FONT-FAMILY: "Calibri","sans-serif"; FONT-SIZE: 11pt
}
LI.MsoNormal {
MARGIN: 0in 0in 0pt; FONT-FAMILY: "Calibri","sans-serif"; FONT-SIZE: 11pt
}
DIV.MsoNormal {
MARGIN: 0in 0in 0pt; FONT-FAMILY: "Calibri","sans-serif"; FONT-SIZE: 11pt
}
A:link {
COLOR: blue; TEXT-DECORATION: underline; mso-style-priority: 99
}
SPAN.MsoHyperlink {
COLOR: blue; TEXT-DECORATION: underline; mso-style-priority: 99
}
A:visited {
COLOR: purple; TEXT-DECORATION: underline; mso-style-priority: 99
}
SPAN.MsoHyperlinkFollowed {
COLOR: purple; TEXT-DECORATION: underline; mso-style-priority: 99
}
SPAN.EmailStyle17 {
FONT-FAMILY: "Calibri","sans-serif"; COLOR: windowtext; mso-style-type: personal
}
SPAN.EmailStyle18 {
FONT-FAMILY: "Calibri","sans-serif"; COLOR: #1f497d; mso-style-type: personal-reply
}
.MsoChpDefault {
FONT-SIZE: 10pt; mso-style-type: export-only
}
DIV.WordSection1 {
page: WordSection1
}
</STYLE>
<!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]--></HEAD>
<BODY lang=EN-US link=blue vLink=purple>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010>Separated patch files so cut-n-paste into a single
patch file did not introduce any problems.</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010>From svn trunk\</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT face=Arial><FONT color=#0000ff><FONT size=2><SPAN
class=296382618-23062010>Apply your timer patch as</SPAN> <SPAN
class=296382618-23062010>patch -p1 &lt;
timer.patch</SPAN></FONT></FONT></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010>Apply:</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010> patch -p1 <
cl_timer_osd.h.patch user-mode serialize</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010> patch -p1 <
cl_timer.c.patch
user-mode serialize</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010> patch -p2 <
kernel_timer.patch sean's kernel
patches</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010></SPAN></FONT> </DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010>&nbsp;&nbsp;&nbsp; if using opensm 3.3.6 (vendor-umad) patch
-p1 &lt; umad.cpp.patch</SPAN></FONT></DIV>
<DIV dir=ltr align=left><FONT color=#0000ff size=2 face=Arial><SPAN
class=296382618-23062010></SPAN></FONT> </DIV><FONT color=#0000ff size=2
face=Arial></FONT><BR>
<DIV dir=ltr lang=en-us class=OutlookMessageHeader align=left>
<HR tabIndex=-1>
<FONT size=2 face=Tahoma><B>From:</B> Tzachi Dar [mailto:tzachid@mellanox.co.il]
<BR><B>Sent:</B> Wednesday, June 23, 2010 8:10 AM<BR><B>To:</B> Smith, Stan;
ofw@lists.openfabrics.org<BR><B>Cc:</B> Yevgeny Kliteynik; Uri Habusha; Fab
Tillier<BR><B>Subject:</B> RE: [ofw] patch: Fix a race in the cl_timer code that
caused deadlocks in opensm<BR></FONT><BR></DIV>
<DIV></DIV>
<DIV class=WordSection1>
<P class=MsoNormal><SPAN style="COLOR: #1f497d">Can you please send one patch,
the way that you see it? (user + kernel)?<o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN style="COLOR: #1f497d"><o:p> </o:p></SPAN></P>
<P class=MsoNormal><SPAN style="COLOR: #1f497d">Thanks<o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN style="COLOR: #1f497d">Tzachi<o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN style="COLOR: #1f497d"><o:p> </o:p></SPAN></P>
<DIV
style="BORDER-BOTTOM: medium none; BORDER-LEFT: blue 1.5pt solid; PADDING-BOTTOM: 0in; PADDING-LEFT: 4pt; PADDING-RIGHT: 0in; BORDER-TOP: medium none; BORDER-RIGHT: medium none; PADDING-TOP: 0in">
<DIV>
<DIV
style="BORDER-BOTTOM: medium none; BORDER-LEFT: medium none; PADDING-BOTTOM: 0in; PADDING-LEFT: 0in; PADDING-RIGHT: 0in; BORDER-TOP: #b5c4df 1pt solid; BORDER-RIGHT: medium none; PADDING-TOP: 3pt">
<P class=MsoNormal><B><SPAN
style="FONT-FAMILY: 'Tahoma','sans-serif'; FONT-SIZE: 10pt">From:</SPAN></B><SPAN
style="FONT-FAMILY: 'Tahoma','sans-serif'; FONT-SIZE: 10pt"> Smith, Stan
[mailto:stan.smith@intel.com] <BR><B>Sent:</B> Wednesday, June 23, 2010 5:16
PM<BR><B>To:</B> Tzachi Dar; ofw@lists.openfabrics.org<BR><B>Cc:</B> Yevgeny
Kliteynik; Uri Habusha; Fab Tillier<BR><B>Subject:</B> RE: [ofw] patch: Fix a
race in the cl_timer code that caused deadlocks in
opensm<o:p></o:p></SPAN></P></DIV></DIV>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">Hello,</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">
In testing on our 52 node HPC Edition cluster using opensm 3.3.6
(vendor-umad) I found your patch by itself insufficient to resolve the lost
MAD problem which occurs when rebooting many compute nodes (DHCP assigned IPoIB
IPv4 address) simultaneously.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">Your
patch by itself did work at 39 nodes, although failed consistently at 52
nodes.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">Only
after applying your patch plus Sean's kernel cl_timer patch and my own
cl_timer serialization patch can 52 nodes reboot simultaneously and reach full
IPoIB operational status (no lost MADs, all ports active and all compute nodes
get DHCP address assigned correctly).</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">Will be
testing opensm 3.3.6 (vendor-ibal) later today.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"> <o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">BTW, the
system installs and uninstalls without noticeable delays. Will do some
performance comparisons later today.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"> <o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">I
suspect the system 'might' get by without the user-mode serialization patch
although it's there as the Linux cl_timer callbacks are serialized.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">Will
perform experiments in the near future.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"> <o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">stan.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"> <o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">PS:
Sorry about the diffs as my patch is integrated with your original patch.
Important part is the lock acquire before invoking callback() then lock
release.</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"> <o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt">---
a/inc/user/complib/cl_timer_osd.h Wed Jun 23 06:48:25 2010<BR>+++
b/inc/user/complib/cl_timer_osd.h Wed Jun 23 06:48:46 2010<BR>@@ -47,6
+47,8 @@<BR> const
void *context;<BR> uint64_t timeout_time;<BR> DWORD thread_id;<BR>+ cl_spinlock_t spinlock;<BR>+ </SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: red; FONT-SIZE: 10pt">cl_spinlock_t cb_serialize;<BR></SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt"> <BR> }
cl_timer_t;<BR> <BR>--- b/core/complib/user/cl_timer.c Wed Jun 23
06:50:14 2010<BR>+++ b/core/complib/user/cl_timer.c Tue Jun 22 15:02:02
2010<BR>@@ -33,30 +33,34 @@<BR> <BR> #include
"complib/cl_timer.h"<BR> <BR>-<BR> static void
CALLBACK<BR> __timer_callback( <BR> IN cl_timer_t* const
p_timer,<BR> IN BOOLEAN timer_signalled
)<BR> {<BR> /* timer_signalled is always TRUE, and has no value.
*/<BR>+ <BR>+ DWORD thread_id =
GetCurrentThreadId();<BR> CL_ASSERT( timer_signalled
);<BR>+<BR> UNUSED_PARAM( timer_signalled
);<BR> <BR>+ CL_ASSERT(thread_id!=0);<BR>+<BR> p_timer->timeout_time
= 0;<BR>- p_timer->thread_id =
GetCurrentThreadId();<BR>+ p_timer->thread_id =
thread_id;<BR> <BR>+ </SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: red; FONT-SIZE: 10pt">cl_spinlock_acquire(&p_timer->cb_serialize);</SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt"><BR> (p_timer->pfn_callback)(
(void*)p_timer->context );<BR>-<BR>- p_timer->thread_id =
0;<BR>+ </SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: red; FONT-SIZE: 10pt">cl_spinlock_release(&p_timer->cb_serialize);</SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt"><BR> }<BR> <BR>-<BR> void<BR> cl_timer_construct(<BR> IN cl_timer_t*
const p_timer )<BR> {<BR> p_timer->h_timer =
NULL;<BR>+ </SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: red; FONT-SIZE: 10pt">p_timer->pfn_callback
= NULL;<BR></SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt"> p_timer->timeout_time
= 0;<BR> p_timer->thread_id = 0;<BR> }<BR>@@ -75,8 +79,9
@@<BR> cl_timer_construct( p_timer
);<BR> <BR> p_timer->pfn_callback =
pfn_callback;<BR>+ </SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: red; FONT-SIZE: 10pt">cl_spinlock_init(&p_timer->cb_serialize);</SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt"><BR> p_timer->context
= context;<BR>- return( CL_SUCCESS );<BR>+ return
cl_spinlock_init(&p_timer->spinlock);<BR> }<BR> <BR> <BR>@@
-87,6 +92,12 @@<BR> CL_ASSERT( p_timer
);<BR> <BR> cl_timer_stop( p_timer
);<BR>+ </SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: red; FONT-SIZE: 10pt">if (
p_timer->pfn_callback )<BR>+ {<BR>+ /* construct does not set
these, _init does */<BR>+ cl_spinlock_destroy(
&p_timer->spinlock );<BR>+ cl_spinlock_destroy(
&p_timer->cb_serialize );<BR>+ }<BR></SPAN><SPAN
style="FONT-FAMILY: 'Arial','sans-serif'; COLOR: blue; FONT-SIZE: 10pt"> }<BR> <BR> <BR>@@
-97,6 +108,8 @@<BR> {<BR> CL_ASSERT( p_timer
);<BR> <BR>+ cl_spinlock_acquire(&p_timer->spinlock);<BR>+ <BR> cl_timer_stop(
p_timer );<BR> <BR> p_timer->timeout_time =
cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);<BR>@@ -104,9 +117,12
@@<BR> if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
__timer_callback,<BR> p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD
)
)<BR> {<BR>+ <BR>+ cl_spinlock_release(&p_timer->spinlock);<BR> return(
CL_ERROR
);<BR> }<BR> <BR>+ cl_spinlock_release(&p_timer->spinlock);<BR> return(
CL_SUCCESS );<BR> }<BR> <BR>@@ -117,26 +133,34
@@<BR> IN const uint32_t time_ms
)<BR> {<BR> uint64_t timeout_time;<BR>+ cl_status_t status;<BR> <BR> CL_ASSERT(
p_timer );<BR> CL_ASSERT( p_timer->pfn_callback
);<BR> <BR>+ cl_spinlock_acquire(&p_timer->spinlock);<BR>+<BR> /*
Calculate the timeout time in the timer object. */<BR> timeout_time =
cl_get_time_stamp() + (((uint64_t)time_ms) * 1000);<BR> <BR> /*
Only pull in the timeout time. */<BR>- if( p_timer->timeout_time
&& p_timer->timeout_time < timeout_time )<BR>+ if(
p_timer->timeout_time && p_timer->timeout_time < timeout_time )
{<BR>+ cl_spinlock_release(&p_timer->spinlock);<BR> return(
CL_SUCCESS );<BR>+ }<BR> <BR>- return cl_timer_start( p_timer,
time_ms );<BR>+ status = cl_timer_start( p_timer, time_ms
);<BR>+ <BR>+ cl_spinlock_release(&p_timer->spinlock);<BR>+ return
status;<BR> }<BR> <BR>-<BR> void<BR> cl_timer_stop(<BR> IN cl_timer_t*
const p_timer )<BR> {<BR> CL_ASSERT( p_timer
);<BR>+ cl_spinlock_acquire(&p_timer->spinlock);<BR> <BR> if(
p_timer->h_timer && p_timer->thread_id != GetCurrentThreadId()
)<BR> {<BR>@@ -145,6 +169,8
@@<BR> p_timer->h_timer =
NULL;<BR> }<BR> p_timer->timeout_time =
0;<BR>+ <BR>+ cl_spinlock_release(&p_timer->spinlock);<BR> }<BR> <BR> </SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p> </o:p></SPAN></P>
<DIV style="TEXT-ALIGN: center" class=MsoNormal align=center><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt">
<HR align=center SIZE=2 width="100%">
</SPAN></DIV>
<P style="MARGIN-BOTTOM: 12pt" class=MsoNormal><B><SPAN
style="FONT-FAMILY: 'Tahoma','sans-serif'; FONT-SIZE: 10pt">From:</SPAN></B><SPAN
style="FONT-FAMILY: 'Tahoma','sans-serif'; FONT-SIZE: 10pt">
ofw-bounces@lists.openfabrics.org [mailto:ofw-bounces@lists.openfabrics.org]
<B>On Behalf Of </B>Tzachi Dar<BR><B>Sent:</B> Tuesday, June 22, 2010 12:51
AM<BR><B>To:</B> ofw@lists.openfabrics.org<BR><B>Cc:</B> Yevgeny
Kliteynik<BR><B>Subject:</B> [ofw] patch: Fix a race in the cl_timer code that
caused deadlocks in opensm</SPAN><SPAN
style="FONT-FAMILY: 'Times New Roman','serif'; FONT-SIZE: 12pt"><o:p></o:p></SPAN></P>
<P class=MsoNormal>While debugging a deadlock in the opensm executable, we have
found that there is a problem in the cl_timer code:<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>The problem starts from the fact that there can be two call
backs that are running simultaneously.<o:p></o:p></P>
<P class=MsoNormal>On each call back that runs there is the following code:
<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>static void CALLBACK<o:p></o:p></P>
<P class=MsoNormal>__timer_callback( <o:p></o:p></P>
<P
class=MsoNormal>
IN cl_timer_t* const p_timer,<o:p></o:p></P>
<P
class=MsoNormal>
IN BOOLEAN timer_signalled )<o:p></o:p></P>
<P class=MsoNormal>{<o:p></o:p></P>
<P
class=MsoNormal>
/* timer_signalled is always TRUE, and has no value. */<o:p></o:p></P>
<P
class=MsoNormal>
CL_ASSERT( timer_signalled );<o:p></o:p></P>
<P
class=MsoNormal>
UNUSED_PARAM( timer_signalled );<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P
class=MsoNormal>
p_timer->timeout_time = 0;<o:p></o:p></P>
<P
class=MsoNormal>
<SPAN style="COLOR: red">p_timer->thread_id =
GetCurrentThreadId();<o:p></o:p></SPAN></P>
<P class=MsoNormal><o:p> </o:p></P>
<P
class=MsoNormal>
(p_timer->pfn_callback)( (void*)p_timer->context );<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P
class=MsoNormal>
<SPAN style="COLOR: red">p_timer->thread_id = 0;<o:p></o:p></SPAN></P>
<P class=MsoNormal>}<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>This should promise that if a callback is running, <SPAN
style="COLOR: red">p_timer->thread_id </SPAN>will have the
thread_id of the running thread.<o:p></o:p></P>
<P class=MsoNormal>This field is later used when cl_timer_stop() is called
in order to prevent deadlocks. (Please note that cl_timer_stop() is also called
from cl_timer_start(). Please also note that the deadlock happens if the caller
of cl_timer_stop() is actually executing in the context of the callback
itself.)<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>The problem happens when two callbacks are running, and
then the <SPAN style="COLOR: red">p_timer->thread_id</SPAN> doesn't hold the
correct value.<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>The fix itself is to sync the start and stop calls so that now
only one callback will be running (actually, there is one exception to this:
if a new timer is started from within the callback, the new callback might be
running at the same time, but: (1) stop has already been called, and (2) the <SPAN
style="COLOR: red">p_timer->thread_id</SPAN> is no longer touched after the
callback).<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>Thanks<o:p></o:p></P>
<P class=MsoNormal>Tzachi<o:p></o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal><o:p> </o:p></P>
<P class=MsoNormal>Index:
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/core/complib/user/cl_timer.c<o:p></o:p></P>
<P
class=MsoNormal>===================================================================<o:p></o:p></P>
<P class=MsoNormal>---
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/core/complib/user/cl_timer.c
(revision 5977)<o:p></o:p></P>
<P class=MsoNormal>+++
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/core/complib/user/cl_timer.c
(revision 5978)<o:p></o:p></P>
<P class=MsoNormal>@@ -33,25 +33,27 @@<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal> #include "complib/cl_timer.h"<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>-<o:p></o:p></P>
<P class=MsoNormal> static void CALLBACK<o:p></o:p></P>
<P class=MsoNormal> __timer_callback( <o:p></o:p></P>
<P
class=MsoNormal>
IN cl_timer_t* const p_timer,<o:p></o:p></P>
<P
class=MsoNormal>
IN BOOLEAN timer_signalled )<o:p></o:p></P>
<P class=MsoNormal> {<o:p></o:p></P>
<P
class=MsoNormal>
/* timer_signalled is always TRUE, and has no value. */<o:p></o:p></P>
<P
class=MsoNormal>+
<o:p></o:p></P>
<P
class=MsoNormal>+
DWORD thread_id = GetCurrentThreadId();<o:p></o:p></P>
<P
class=MsoNormal>
CL_ASSERT( timer_signalled );<o:p></o:p></P>
<P class=MsoNormal>+<o:p></o:p></P>
<P
class=MsoNormal>
UNUSED_PARAM( timer_signalled );<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>+
CL_ASSERT(thread_id!=0);<o:p></o:p></P>
<P class=MsoNormal>+<o:p></o:p></P>
<P
class=MsoNormal>
p_timer->timeout_time = 0;<o:p></o:p></P>
<P
class=MsoNormal>-
p_timer->thread_id = GetCurrentThreadId();<o:p></o:p></P>
<P
class=MsoNormal>+
p_timer->thread_id = thread_id;<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>
(p_timer->pfn_callback)( (void*)p_timer->context );<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>-
p_timer->thread_id = 0;<o:p></o:p></P>
<P class=MsoNormal> }<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>-<o:p></o:p></P>
<P class=MsoNormal> void<o:p></o:p></P>
<P class=MsoNormal> cl_timer_construct(<o:p></o:p></P>
<P
class=MsoNormal>
IN cl_timer_t*
const p_timer
)<o:p></o:p></P>
<P class=MsoNormal>@@ -76,7 +78,8 @@<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>
p_timer->pfn_callback = pfn_callback;<o:p></o:p></P>
<P
class=MsoNormal>
p_timer->context = context;<o:p></o:p></P>
<P
class=MsoNormal>-
return( CL_SUCCESS );<o:p></o:p></P>
<P
class=MsoNormal>+
return cl_spinlock_init(&p_timer->spinlock);<o:p></o:p></P>
<P
class=MsoNormal>+
<o:p></o:p></P>
<P class=MsoNormal> }<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>@@ -97,6 +100,8 @@<o:p></o:p></P>
<P class=MsoNormal> {<o:p></o:p></P>
<P
class=MsoNormal>
CL_ASSERT( p_timer );<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_acquire(&p_timer->spinlock);<o:p></o:p></P>
<P
class=MsoNormal>+
<o:p></o:p></P>
<P
class=MsoNormal>
cl_timer_stop( p_timer );<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>
p_timer->timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
1000);<o:p></o:p></P>
<P class=MsoNormal>@@ -104,9 +109,12 @@<o:p></o:p></P>
<P
class=MsoNormal>
if( !CreateTimerQueueTimer( &p_timer->h_timer, NULL,
__timer_callback,<o:p></o:p></P>
<P
class=MsoNormal>
p_timer, time_ms, 0, WT_EXECUTEINIOTHREAD ) )<o:p></o:p></P>
<P
class=MsoNormal>
{<o:p></o:p></P>
<P
class=MsoNormal>+
<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_release(&p_timer->spinlock);<o:p></o:p></P>
<P
class=MsoNormal>
return( CL_ERROR );<o:p></o:p></P>
<P
class=MsoNormal>
}<o:p></o:p></P>
<P
class=MsoNormal>
<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_release(&p_timer->spinlock);<o:p></o:p></P>
<P
class=MsoNormal>
return( CL_SUCCESS );<o:p></o:p></P>
<P class=MsoNormal> }<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>@@ -117,26 +125,34 @@<o:p></o:p></P>
<P
class=MsoNormal>
IN const
uint32_t
time_ms )<o:p></o:p></P>
<P class=MsoNormal> {<o:p></o:p></P>
<P
class=MsoNormal>
uint64_t
timeout_time;<o:p></o:p></P>
<P
class=MsoNormal>+
cl_status_t
status;<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>
CL_ASSERT( p_timer );<o:p></o:p></P>
<P
class=MsoNormal>
CL_ASSERT( p_timer->pfn_callback );<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_acquire(&p_timer->spinlock);<o:p></o:p></P>
<P class=MsoNormal>+<o:p></o:p></P>
<P
class=MsoNormal>
/* Calculate the timeout time in the timer object. */<o:p></o:p></P>
<P
class=MsoNormal>
timeout_time = cl_get_time_stamp() + (((uint64_t)time_ms) *
1000);<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>
/* Only pull in the timeout time. */<o:p></o:p></P>
<P
class=MsoNormal>-
if( p_timer->timeout_time && p_timer->timeout_time <
timeout_time )<o:p></o:p></P>
<P
class=MsoNormal>+
if( p_timer->timeout_time && p_timer->timeout_time <
timeout_time ) {<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_release(&p_timer->spinlock);<o:p></o:p></P>
<P
class=MsoNormal>
return( CL_SUCCESS );<o:p></o:p></P>
<P
class=MsoNormal>+
}<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>-
return cl_timer_start( p_timer, time_ms );<o:p></o:p></P>
<P
class=MsoNormal>+
status = cl_timer_start( p_timer, time_ms );<o:p></o:p></P>
<P
class=MsoNormal>+
<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_release(&p_timer->spinlock);<o:p></o:p></P>
<P
class=MsoNormal>+
return status;<o:p></o:p></P>
<P class=MsoNormal> }<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>-<o:p></o:p></P>
<P class=MsoNormal> void<o:p></o:p></P>
<P class=MsoNormal> cl_timer_stop(<o:p></o:p></P>
<P
class=MsoNormal>
IN cl_timer_t*
const p_timer
)<o:p></o:p></P>
<P class=MsoNormal> {<o:p></o:p></P>
<P
class=MsoNormal>
CL_ASSERT( p_timer );<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_acquire(&p_timer->spinlock);<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P
class=MsoNormal>
if( p_timer->h_timer && p_timer->thread_id != GetCurrentThreadId()
)<o:p></o:p></P>
<P
class=MsoNormal>
{<o:p></o:p></P>
<P class=MsoNormal>@@ -145,6 +161,8 @@<o:p></o:p></P>
<P
class=MsoNormal>
p_timer->h_timer = NULL;<o:p></o:p></P>
<P
class=MsoNormal>
}<o:p></o:p></P>
<P
class=MsoNormal>
p_timer->timeout_time = 0;<o:p></o:p></P>
<P
class=MsoNormal>+
<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_release(&p_timer->spinlock);<o:p></o:p></P>
<P class=MsoNormal> }<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>Index:
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/complib/cl_timer.h<o:p></o:p></P>
<P
class=MsoNormal>===================================================================<o:p></o:p></P>
<P class=MsoNormal>---
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/complib/cl_timer.h
(revision 5977)<o:p></o:p></P>
<P class=MsoNormal>+++
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/complib/cl_timer.h
(revision 5978)<o:p></o:p></P>
<P class=MsoNormal>@@ -45,8 +45,8 @@<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>&nbsp;#include &lt;complib/cl_types.h&gt;<o:p></o:p></P>
<P class=MsoNormal>+#include &lt;complib/cl_spinlock.h&gt;<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>-<o:p></o:p></P>
<P class=MsoNormal> /****h* Component Library/Timer<o:p></o:p></P>
<P class=MsoNormal> * NAME<o:p></o:p></P>
<P
class=MsoNormal> *
Timer<o:p></o:p></P>
<P class=MsoNormal>Index:
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/user/complib/cl_timer_osd.h<o:p></o:p></P>
<P
class=MsoNormal>===================================================================<o:p></o:p></P>
<P class=MsoNormal>---
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/user/complib/cl_timer_osd.h
(revision 5977)<o:p></o:p></P>
<P class=MsoNormal>+++
B:/users/tzachid/MLNX_WinOF-2_1_1/MLNX_WinOF-2_1_1/inc/user/complib/cl_timer_osd.h
(revision 5978)<o:p></o:p></P>
<P class=MsoNormal>@@ -47,8 +47,9 @@<o:p></o:p></P>
<P
class=MsoNormal>
const
void
*context;<o:p></o:p></P>
<P
class=MsoNormal>
uint64_t
timeout_time;<o:p></o:p></P>
<P
class=MsoNormal>
DWORD
thread_id;<o:p></o:p></P>
<P
class=MsoNormal>+
cl_spinlock_t
spinlock;<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal> } cl_timer_t;<o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal> <o:p></o:p></P>
<P class=MsoNormal>-#endif // _CL_TIMER_OSD_H_<o:p></o:p></P>
<P class=MsoNormal>\ No newline at end of file<o:p></o:p></P>
<P
class=MsoNormal>+#endif
// _CL_TIMER_OSD_H_<o:p></o:p></P></DIV></DIV></BODY></HTML>