<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META http-equiv=Content-Type content="text/html; charset=us-ascii">
<META content="MSHTML 6.00.2900.3243" name=GENERATOR></HEAD>
<BODY>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>Hi
Stan,</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=990235109-21052009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>Thanks
a lot on the debug info.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>I
believe I have understood the problem.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>The
sweep algorithm passes several times through the point where I added the
ref_cnt increment (not just once, as I had thought).</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=990235109-21052009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>Below
(and attached) is a new patch that solves the original problem in a somewhat
simpler way.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>It
takes advantage of the fact that all the sweep processing is managed by one
ioc_sweep_results_t structure.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>The
patch increments ref_cnt only once, just after allocating a structure, and
decrements it only once, just before releasing the
structure.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>It
should work unless this structure gets leaked in some
scenario.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=990235109-21052009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=990235109-21052009>I'd
be very thankful if you could check it. (I still don't have an SRP
environment.)</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=990235109-21052009>Please note that this is a patch against
the original (committed to SVN) patch, and not against the one I sent you in the
previous e-mail.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=990235109-21052009>TIA</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2>Index:
core/al/kernel/al_ioc_pnp.c<BR>===================================================================<BR>---
core/al/kernel/al_ioc_pnp.c (revision 2191)<BR>+++
core/al/kernel/al_ioc_pnp.c (working copy)<BR>@@ -1720,6 +1720,11
@@<BR> p_results->p_svc =
p_svc;<BR> cl_fmap_init( &p_results->iou_map, __iou_cmp
);<BR> <BR>+ /* Reference the service till the end of sweep
processing */<BR>+ ref_al_obj( &p_results->p_svc->obj
);<BR>+ cl_dbg_out ("~%d:[IBBUS] %s() : p_results %p, p_svc %p,
ref_cnt %d", <BR>+ KeGetCurrentProcessorNumber(), __FUNCTION__,
p_results, p_svc,
p_results->p_svc->obj.ref_cnt);<BR>+<BR> /* Build the map
of nodes by port GUID. */<BR> __process_nodes( p_svc,
&port_map );<BR> <BR>@@ -1749,6 +1754,10
@@<BR> break;<BR> default:<BR> CL_ASSERT(
p_results );<BR>+ /* Release the reference taken for the sweep.
*/<BR>+ deref_al_obj( &p_results->p_svc->obj
);<BR>+ cl_dbg_out ("~%d:[IBBUS] %s() : p_results %p, p_svc %p,
ref_cnt %d", <BR>+ KeGetCurrentProcessorNumber(), __FUNCTION__,
p_results, p_svc,
p_results->p_svc->obj.ref_cnt);<BR> cl_free( p_results
);<BR> /* Fall through */<BR> case
IB_INSUFFICIENT_MEMORY:<BR>@@ -2034,8 +2043,6 @@<BR> if(
!cl_atomic_dec( &p_results->p_svc->query_cnt )
&&<BR> status == IB_SUCCESS
)<BR> {<BR>- /* Reference the
service till the end of processing in the thread
*/<BR>- ref_al_obj( &p_results->p_svc->obj
);<BR> cl_async_proc_queue(
gp_async_pnp_mgr,<BR> &p_results->async_item
);<BR> }<BR>@@ -2231,11 +2238,8 @@<BR> * If
this is the last MAD, finish processing the IOU queries<BR> * in the
PnP thread.<BR> */<BR>- if( !cl_atomic_dec(
&p_results->p_svc->query_cnt ) ) {<BR>- /* Reference the
service till the end of processing in the thread */<BR>- ref_al_obj(
&p_results->p_svc->obj );<BR>+ if( !cl_atomic_dec(
&p_results->p_svc->query_cnt )
)<BR> cl_async_proc_queue( gp_async_pnp_mgr,
&p_results->async_item );<BR>- }<BR> <BR> AL_EXIT(
AL_DBG_PNP );<BR> }<BR>@@ -2356,8 +2360,10
@@<BR> err:<BR> if( !cl_atomic_dec(
&gp_ioc_pnp->query_cnt )
)<BR> cl_async_proc_queue( gp_async_pnp_mgr,
&gp_ioc_pnp->async_item );<BR>- /* Release the reference taken
for the query. */<BR>+ /* Release the reference taken for the sweep.
*/<BR> deref_al_obj( &p_results->p_svc->obj
);<BR>+ cl_dbg_out ("~%d:[IBBUS] %s() : p_results %p, p_svc %p,
ref_cnt %d", <BR>+ KeGetCurrentProcessorNumber(), __FUNCTION__,
p_results, p_svc,
p_results->p_svc->obj.ref_cnt);<BR> cl_free( p_results
);<BR> }<BR> <BR></FONT></DIV><BR>
<BLOCKQUOTE dir=ltr
style="PADDING-LEFT: 5px; MARGIN-LEFT: 5px; BORDER-LEFT: #0000ff 2px solid; MARGIN-RIGHT: 0px">
<DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left>
<HR tabIndex=-1>
<FONT face=Tahoma size=2><B>From:</B> Smith, Stan
[mailto:stan.smith@intel.com] <BR><B>Sent:</B> Wednesday, May 20, 2009 10:15
PM<BR><B>To:</B> Leonid Keller; Fab Tillier<BR><B>Cc:</B>
ofw@lists.openfabrics.org<BR><B>Subject:</B> RE: [ofw] crash on IBBUS
disabling while mad traffic<BR></FONT><BR></DIV>
<DIV></DIV>
<DIV dir=ltr align=left><SPAN class=838510819-20052009><FONT face=Arial
color=#0000ff size=2>Hello Leo,</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=838510819-20052009><FONT face=Arial
color=#0000ff size=2> Here's the requested debug info, one HCA
enable/disable cycle with No IOUs available and one cycle with 2 IOUs
available.</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=838510819-20052009><FONT face=Arial
color=#0000ff size=2></FONT></SPAN> </DIV>
<DIV dir=ltr align=left><SPAN class=838510819-20052009><FONT face=Arial
color=#0000ff size=2>stan.</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=838510819-20052009><FONT face=Arial
color=#0000ff size=2></FONT></SPAN> </DIV><SPAN class=838510819-20052009>
<DIV dir=ltr align=left><BR><FONT face=Arial color=#0000ff size=2>No IOUs
available - No HCAs enabled<BR>Enable HCA0<BR>Before
polling.<BR>~0:[IBBUS] __ioc_pnp_send_cb() : p_results 82136008, p_svc
825F47E8, ref_cnt 3<BR>~1:[IBBUS]
__process_sweep() <SPAN class=838510819-20052009> </SPAN>:
p_results 82136008, p_svc 825F47E8, ref_cnt 2<BR>~0:[IBBUS]
__ioc_pnp_send_cb() : p_results 82145840, p_svc 825F47E8, ref_cnt
3<BR>~1:[IBBUS] __process_sweep() <SPAN
class=838510819-20052009> </SPAN>: p_results 82145840, p_svc 825F47E8, ref_cnt
2<BR>~0:[IBBUS] __ioc_pnp_send_cb() : p_results 82145840, p_svc
825F47E8, ref_cnt 3<BR>~1:[IBBUS]
__process_sweep() <SPAN class=838510819-20052009> </SPAN>:
p_results 82145840, p_svc 825F47E8, ref_cnt 2</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>Disable
HCA0<BR>[AL]:al_cleanup(): Goodbye Cruel World =(<BR>Signaled to stop
polling.<BR>Polling thread terminated.</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>&lt;Enabled SRP
Linux target[2 IOUs]&gt;<BR>&lt;restart opensm on the SRP target
node&gt;</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>&lt;reboot test
system&gt;</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>2 IOUs available
- No HCAs enabled<BR>Enable HCA0<BR>Before polling.<BR>~0:[IBBUS]
__ioc_pnp_send_cb() : p_results 820F3280, p_svc 820360B8, ref_cnt
3<BR>~0:[IBBUS] __ioc_pnp_send_cb() : p_results 820F3280, p_svc
820360B8, ref_cnt 4<BR>~0:[IBBUS] __ioc_pnp_send_cb() : p_results
820F3280, p_svc 820360B8, ref_cnt 5<BR>~1:[IBBUS]
__process_sweep() <SPAN class=838510819-20052009> </SPAN>:
p_results 820F3280, p_svc 820360B8, ref_cnt 4<BR>Cancelled Found
New Hardware Wizard: SRP driver load<SPAN class=838510819-20052009> - same
hang results if SRP driver pre-installed.</SPAN><BR>~0:[IBBUS]
__ioc_pnp_send_cb() : p_results 820F3280, p_svc 820360B8, ref_cnt
6<BR>~0:[IBBUS] __ioc_pnp_send_cb() : p_results 820F3280, p_svc
820360B8, ref_cnt 7<BR>~1:[IBBUS] __process_sweep() <SPAN
class=838510819-20052009> </SPAN>: p_results 820F3280, p_svc
820360B8, ref_cnt 6</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>Disable
HCA0<BR>[AL]sync_destroy_obj() !ERROR!: Error waiting for references to be
released - delaying.<BR>[AL]print_al_obj() !ERROR!: AL object
0000000082096008(AL_OBJ_TYPE_AL_MGR), parent: 0000000000000000 ref_cnt:
3</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>*** Assertion
failed: cl_status == CL_SUCCESS<BR>*** Source File:
f:\openib-windows-svn\latest\gen1\trunk\core\al\al_common.c, line
539</FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV dir=ltr align=left><FONT face=Arial color=#0000ff size=2>Break
repeatedly, break Once, Ignore, terminate Process, or terminate Thread
(boipt)? i<BR>i<BR>[AL]sync_destroy_obj() !ERROR!: Forcing object
destruction.<BR>[AL]print_al_obj() !ERROR!: AL object
0000000082096008(AL_OBJ_TYPE_AL_MGR), parent:
0000000000000000 ref_cnt: 3<BR>[AL]print_al_obj() !ERROR!: AL object
00000000822103c0(AL_OBJ_TYPE_IOC_PNP_MGR), parent: 0000000082096008 ref_cnt:
1<BR>[AL]print_al_obj() !ERROR!: AL object
00000000820360b8(AL_OBJ_TYPE_IOC_PNP_SVC), parent: 00000000822103c0 ref_cnt:
4<BR>[AL]print_al_obj() !ERROR!: AL object
00000000822103c0(AL_OBJ_TYPE_IOC_PNP_MGR), parent: 0000000082096008 ref_cnt:
1<BR>[AL]print_al_obj() !ERROR!: AL object
00000000820360b8(AL_OBJ_TYPE_IOC_PNP_SVC), parent: 00000000822103c0 ref_cnt:
4<BR>[AL]:al_cleanup(): Goodbye Cruel World =(<BR>Signaled to stop
polling.<BR>Polling thread terminated.<BR></FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=838510819-20052009><FONT face=Arial
color=#0000ff size=2></FONT></SPAN> </DIV><BR>
<DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left>
<HR tabIndex=-1>
<FONT face=Tahoma size=2><B>From:</B> Leonid Keller
[mailto:leonid@mellanox.co.il] <BR><B>Sent:</B> Tuesday, May 19, 2009 6:42
AM<BR><B>To:</B> Smith, Stan; Fab Tillier<BR><B>Cc:</B>
ofw@lists.openfabrics.org<BR><B>Subject:</B> RE: [ofw] crash on IBBUS
disabling while mad traffic<BR></FONT><BR></DIV>
<DIV></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=762055812-19052009>Hi
Stan, </SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009>Thank you for the info.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009>Unfortunately, I don't have now a setup with IOU
devices and can't investigate it.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009>Maybe you will have a possibility to do a check for
me.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=762055812-19052009>To
remind: my patch was very simple (only 3 lines): </SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009> i increment ref_cnt of the
sweeping thread before running it and decrement it at the end of the sweep
handling.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=762055812-19052009>Your
data show, that this ref_cnt is not zero.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=762055812-19052009>So
either it can be incremented twice in a row or the thread can exit
without decrementing ref_cnt.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=762055812-19052009>I
don't see how it can happen.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN class=762055812-19052009>I'd
like you to apply the below patch, make two runs - without and with IOUs
- and send me the debug output of both.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=762055812-19052009>TIA</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2>Index:
core/al/kernel/al_ioc_pnp.c<BR>===================================================================<BR>---
core/al/kernel/al_ioc_pnp.c (revision 2162)<BR>+++
core/al/kernel/al_ioc_pnp.c (working copy)<BR>@@ -2036,6 +2036,8
@@<BR> {<BR> /* Reference
the service till the end of processing in the thread
*/<BR> ref_al_obj(
&p_results->p_svc->obj );<BR>+ cl_dbg_out
("~%d:[IBBUS] %s() : p_results %p, p_svc %p, ref_cnt %d",
<BR>+ KeGetCurrentProcessorNumber(),
__FUNCTION__, p_results, p_svc,
p_results->p_svc->obj.ref_cnt);<BR> cl_async_proc_queue(
gp_async_pnp_mgr,<BR> &p_results->async_item
);<BR> }<BR>@@ -2234,6 +2236,8 @@<BR> if(
!cl_atomic_dec( &p_results->p_svc->query_cnt ) )
{<BR> /* Reference the service till the end of processing in
the thread */<BR> ref_al_obj(
&p_results->p_svc->obj );<BR>+ cl_dbg_out ("~%d:[IBBUS]
%s() : p_results %p, p_svc %p, ref_cnt %d",
<BR>+ KeGetCurrentProcessorNumber(), __FUNCTION__, p_results,
p_svc,
p_results->p_svc->obj.ref_cnt);<BR> cl_async_proc_queue(
gp_async_pnp_mgr, &p_results->async_item
);<BR> }<BR> <BR>@@ -2358,6 +2362,8
@@<BR> cl_async_proc_queue( gp_async_pnp_mgr,
&gp_ioc_pnp->async_item );<BR> /* Release the
reference taken for the query. */<BR> deref_al_obj(
&p_results->p_svc->obj );<BR>+ cl_dbg_out ("~%d:[IBBUS]
%s() : p_results %p, p_svc %p, ref_cnt %d",
<BR>+ KeGetCurrentProcessorNumber(), __FUNCTION__, p_results,
p_svc, p_results->p_svc->obj.ref_cnt);<BR> cl_free(
p_results );<BR> }<BR> <BR></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV><BR>
<BLOCKQUOTE dir=ltr
style="PADDING-LEFT: 5px; MARGIN-LEFT: 5px; BORDER-LEFT: #0000ff 2px solid; MARGIN-RIGHT: 0px">
<DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left>
<HR tabIndex=-1>
<FONT face=Tahoma size=2><B>From:</B> Smith, Stan
[mailto:stan.smith@intel.com] <BR><B>Sent:</B> Monday, May 18, 2009 11:33
PM<BR><B>To:</B> Leonid Keller; Fab Tillier<BR><B>Cc:</B>
ofw@lists.openfabrics.org<BR><B>Subject:</B> RE: [ofw] crash on IBBUS
disabling while mad traffic<BR></FONT><BR></DIV>
<DIV></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>Leo,</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2> This patch, which I believe was committed as
svn.4275, works fine if there are no IOUnits in the fabric. Once there is an
IOU present, in my case a Linux SRP target, this patch hangs HCA disable for a
debug version of ibbus. </FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2></FONT></SPAN> </DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>[AL]bus_release_resources(): Releasing BusFilter
bfi-0<BR>[AL]:al_cleanup(): Destroying \ device.<BR>[AL]:al_cleanup():
Destroying AL Mgr.<BR>[AL]sync_destroy_obj() !ERROR!: Error waiting for
references to be released - delaying.<BR>[AL]print_al_obj() !ERROR!: AL
object 0000000082156200(AL_OBJ_TYPE_AL_MGR), parent: 0000000000000000
ref_cnt: 3</FONT></SPAN></DIV>
<DIV> </DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>*** Assertion failed: cl_status ==
CL_SUCCESS<BR>*** Source File:
f:\openib-windows-svn\latest\gen1\trunk\core\al\al_common.c, line
554</FONT></SPAN></DIV>
<DIV> </DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>Break repeatedly, break Once, Ignore, terminate
Process, or terminate Thread (boipt)? i<BR>i<BR>[AL]sync_destroy_obj()
!ERROR!: Forcing object destruction.<BR>[AL]print_al_obj() !ERROR!: AL
object 0000000082156200(AL_OBJ_TYPE_AL_MGR), parent: 0000000000000000
ref_cnt: 3<BR>[AL]print_al_obj() !ERROR!: AL object
0000000082175270(AL_OBJ_TYPE_IOC_PNP_MGR), parent: 0000000082156200 ref_cnt:
1<BR>[AL]print_al_obj() !ERROR!: AL object
00000000ff8ca2c0(AL_OBJ_TYPE_IOC_PNP_SVC), parent: 0000000082175270 ref_cnt:
2<BR>[AL]print_al_obj() !ERROR!: AL object
0000000082175270(AL_OBJ_TYPE_IOC_PNP_MGR), parent: 0000000082156200 ref_cnt:
1<BR>[AL]print_al_obj() !ERROR!: AL object
00000000ff8ca2c0(AL_OBJ_TYPE_IOC_PNP_SVC), parent: 0000000082175270 ref_cnt:
2<BR>[AL]:al_cleanup(): Destroying async obj mgr.<BR>[AL]:al_cleanup():
Destroying async pnp mgr.<BR>[AL]:al_cleanup(): Destroying async proc
mgr.<BR>[AL]:al_cleanup(): Goodbye Cruel World
=(<BR>[AL]bus_release_resources() ]<BR>Signaled to stop polling.<BR>Polling
thread terminated.<BR></FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>It seems there is a path in IBAL which is not releasing
the reference on the IOC PnP service when an IOU is present in the
fabric.</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>Perhaps you could suggest a fix?</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>If commit svn.4275 is removed the call to al_cleanup()
returns successfully with no errors.</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2></FONT></SPAN> </DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>thanks,</FONT></SPAN></DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2></FONT></SPAN> </DIV>
<DIV dir=ltr align=left><SPAN class=681092220-18052009><FONT face=Arial
color=#0000ff size=2>Stan.</FONT></SPAN></DIV><FONT face=Arial color=#0000ff
size=2></FONT><BR>
<DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left>
<HR tabIndex=-1>
<FONT face=Tahoma size=2><B>From:</B> Leonid Keller
[mailto:leonid@mellanox.co.il] <BR><B>Sent:</B> Monday, April 27, 2009 5:38
AM<BR><B>To:</B> Leonid Keller; Fab Tillier; Smith, Stan<BR><B>Cc:</B>
ofw@lists.openfabrics.org<BR><B>Subject:</B> RE: [ofw] crash on IBBUS
disabling while mad traffic<BR></FONT><BR></DIV>
<DIV></DIV>
<DIV><SPAN class=818171912-27042009><FONT face=Arial color=#0000ff
size=2>Here is a possible explanation and a fix. Please,
review.</FONT></SPAN></DIV>
<DIV><SPAN class=818171912-27042009><FONT face=Arial color=#0000ff
size=2></FONT></SPAN> </DIV>
<DIV><SPAN class=818171912-27042009><FONT face=Arial color=#0000ff
size=2>__ioc_query_sa takes references on IOC PnP service before sending the
node and path_record requests.</FONT></SPAN></DIV>
<DIV><SPAN class=818171912-27042009><FONT face=Arial size=2><FONT
color=#0000ff>But these </FONT><FONT color=#0000ff>references get released
at the end of __node_rec_cb and __path_rec_cb, while __process_sweep
routine, which performs the IOU sweeping, is just scheduled to run in an
async thread.</FONT></FONT></SPAN></DIV>
<DIV><SPAN class=818171912-27042009><FONT face=Arial color=#0000ff size=2>If
the test happens to unload the driver after __node_rec_cb and __path_rec_cb
and before __process_sweep started to run, IOC PnP service gets released and
__process_sweep crashes.</FONT></SPAN></DIV>
<DIV><SPAN class=818171912-27042009><FONT face=Arial color=#0000ff
size=2></FONT></SPAN> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=818171912-27042009>The patch takes a reference on IOC PnP service
before scheduling a thread for __process_sweep and releases the reference at
the end of __process_sweep.</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=818171912-27042009>(Pay attention, that __process_sweep schedules a
thread for itself twice while moving through its FSM:
</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2><SPAN
class=818171912-27042009>SWEEP_IOU_INFO --> SWEEP_IOC_PROFILE -->
SWEEP_SVC_ENTRIES --> SWEEP_COMPLETE)</SPAN></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV><FONT face=Arial color=#0000ff size=2>Index:
al/kernel/al_ioc_pnp.c<BR>===================================================================<BR>---
al/kernel/al_ioc_pnp.c (revision 3609)<BR>+++
al/kernel/al_ioc_pnp.c (working copy)<BR>@@ -2231,8 +2231,11
@@<BR> * If this is the last MAD, finish processing the IOU
queries<BR> * in the PnP thread.<BR>
*/<BR>- if( !cl_atomic_dec( &p_results->p_svc->query_cnt )
)<BR>+ if( !cl_atomic_dec( &p_results->p_svc->query_cnt ) )
{<BR>+ /* Reference the service till the end of processing in the
thread */<BR>+ ref_al_obj( &p_results->p_svc->obj
);<BR> cl_async_proc_queue( gp_async_pnp_mgr,
&p_results->async_item
);<BR>+ }<BR> <BR> AL_EXIT( AL_DBG_PNP
);<BR> }<BR>@@ -2354,6 +2357,8 @@<BR> if(
!cl_atomic_dec( &gp_ioc_pnp->query_cnt )
)<BR> cl_async_proc_queue( gp_async_pnp_mgr,
&gp_ioc_pnp->async_item );<BR> cl_free( p_results
);<BR>+ /* Release the reference taken for the query.
*/<BR>+ deref_al_obj( &p_results->p_svc->obj
);<BR> }<BR> <BR> AL_EXIT( AL_DBG_PNP
);<BR></FONT></DIV>
<DIV><FONT face=Arial color=#0000ff size=2></FONT> </DIV>
<DIV><BR></DIV>
<BLOCKQUOTE dir=ltr
style="PADDING-LEFT: 5px; MARGIN-LEFT: 5px; BORDER-LEFT: #0000ff 2px solid; MARGIN-RIGHT: 0px">
<DIV class=OutlookMessageHeader lang=en-us dir=ltr align=left>
<HR tabIndex=-1>
<FONT face=Tahoma size=2><B>From:</B> Leonid Keller <BR><B>Sent:</B>
Sunday, April 26, 2009 1:05 AM<BR><B>To:</B> 'Fab Tillier'; 'Smith,
Stan'<BR><B>Cc:</B> ofw@lists.openfabrics.org<BR><B>Subject:</B> [ofw]
crash on IBBUS disabling while mad traffic<BR></FONT><BR></DIV>
<DIV></DIV>
<DIV><FONT face=Arial size=2><SPAN class=654294220-25042009>I've got a
crash while running WHQL Disable Enable test while opensm was running on
another node.</SPAN></FONT></DIV>
  <DIV><FONT face=Arial size=2><SPAN class=654294220-25042009>I was
  running a December version of the driver, but I'm not sure this will
  work with the current one. (I'll try.)</SPAN></FONT></DIV>
<DIV><FONT face=Arial size=2><SPAN
class=654294220-25042009></SPAN></FONT> </DIV>
<DIV><FONT face=Arial size=2><SPAN class=654294220-25042009>The test,
which makes disable/enable to all devices, passes without
opensm.</SPAN></FONT></DIV>
<DIV><FONT face=Arial size=2><SPAN class=654294220-25042009>With opensm
IBBUS sends SA requests to opensm.</SPAN></FONT></DIV>
<DIV><FONT face=Arial size=2><SPAN class=654294220-25042009>In this case
</SPAN></FONT><FONT face=Arial><FONT size=2>__process_sweep<SPAN
class=654294220-25042009>() fails, because per-port IOC PnP agent seems to
be already released.</SPAN></FONT></FONT></DIV>
<DIV><FONT face=Arial><FONT size=2><SPAN class=654294220-25042009>The
latter is strange, because __ioc_query_sa takes reference on PnP agent
before sending request.</SPAN></FONT></FONT></DIV>
<DIV><FONT face=Arial><FONT size=2><SPAN
class=654294220-25042009> __ioc_query_sa<BR> __node_rec_cb<BR> __process_query<BR> __process_sweep<BR></SPAN></FONT></FONT></DIV>
<DIV><FONT face=Arial><FONT size=2><SPAN class=654294220-25042009>Any
ideas ?</SPAN></FONT></FONT></DIV>
<DIV><FONT face=Arial><FONT size=2><SPAN
class=654294220-25042009></SPAN></FONT></FONT> </DIV>
<DIV><FONT face=Arial size=2></FONT> </DIV>
<DIV><FONT face=Arial size=2>3: kd> !analyze -v<BR>ERROR: FindPlugIns
8007007b<BR>*******************************************************************************<BR>*
*<BR>*
Bugcheck
Analysis
*<BR>*
*<BR>*******************************************************************************</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>DRIVER_PAGE_FAULT_IN_FREED_SPECIAL_POOL
(d5)<BR>Memory was referenced after it was freed.<BR>This cannot be
protected by try-except.<BR>When possible, the guilty driver's name
(Unicode string) is printed on<BR>the bugcheck screen and saved in
KiBugCheckDriver.<BR>Arguments:<BR>Arg1: fffff98005b72f84, memory
referenced<BR>Arg2: 0000000000000000, value 0 = read operation, 1 = write
operation<BR>Arg3: fffffa600400b1d0, if non-zero, the address which
referenced memory.<BR>Arg4: 0000000000000000, (reserved)</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>Debugging
Details:<BR>------------------</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>Matched: ibbus!proxy_ioctl+0x41
(fffffa60`04031d8d) <BR>Matched: ibbus!proxy_ioctl+0xa5
(fffffa60`04031df1) </FONT></DIV>
<DIV><FONT face=Arial size=2></FONT> </DIV>
<DIV><FONT face=Arial size=2>READ_ADDRESS: fffff98005b72f84 Special
pool</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>FAULTING_IP: <BR>ibbus!__process_sweep+44
[s:\builds\3609\branches\mlnx_winof_2-0\core\al\kernel\al_ioc_pnp.c @
2315]<BR>fffffa60`0400b1d0 83b8d400000003
cmp dword ptr [rax+0D4h],3</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>MM_INTERNAL_CODE: 0</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>IMAGE_NAME: ibbus.sys</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>DEBUG_FLR_IMAGE_TIMESTAMP:
49401b3e</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>MODULE_NAME: ibbus</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>FAULTING_MODULE: fffffa6004002000
ibbus</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>DEFAULT_BUCKET_ID:
VISTA_DRIVER_FAULT</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>BUGCHECK_STR: 0xD5</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>PROCESS_NAME: System</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>CURRENT_IRQL: f</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>TRAP_FRAME: fffffa6003d50b00 -- (.trap
0xfffffa6003d50b00)<BR>NOTE: The trap frame does not contain all
registers.<BR>Some register values may be zeroed or
incorrect.<BR>rax=fffff98005b72eb0 rbx=0000000000000000
rcx=fffffa6004057780<BR>rdx=fffffa6004005e97 rsi=fffffa600199ccc0
rdi=fffff80001cc0304<BR>rip=fffffa600400b1d0 rsp=fffffa6003d50c90
rbp=0000000000000080<BR> r8=0000000000000005
r9=fffffa6004005e97 r10=0000000000000001<BR>r11=fffffa6003d50c50
r12=0000000000000000 r13=0000000000000000<BR>r14=0000000000000000
r15=0000000000000000<BR>iopl=0
nv up ei pl zr na po
nc<BR>ibbus!__process_sweep+0x44:<BR>fffffa60`0400b1d0
83b8d400000003 cmp dword ptr [rax+0D4h],3
ds:fffff980`05b72f84=????????<BR>Resetting default scope</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>LAST_CONTROL_TRANSFER: from
fffff80001969c42 to fffff800018b0b30</FONT></DIV>
<DIV> </DIV>
<DIV><FONT face=Arial size=2>STACK_TEXT: <BR>fffffa60`03d502f8
fffff800`01969c42 : fffffa80`0e0eb290 fffff800`0194893d fffff800`01a55140
00000000`00001000 : nt!RtlpBreakWithStatusInstruction<BR>fffffa60`03d50300
fffff800`0196adb7 : fffff800`00000004 fffff800`01a55140 ffffffff`fffff000
00000000`00000050 : nt!KiBugCheckDebugBreak+0x12<BR>fffffa60`03d50360
fffff800`018b6754 : fffffa80`0dd77480 fffff800`01cc2bb9 00000000`00000000
fffff800`0194c13f : nt!KeBugCheck2+0xaa7<BR>fffffa60`03d509d0
fffff800`018c5671 : 00000000`00000050 fffff980`05b72f84 00000000`00000000
fffffa60`03d50b00 : nt!KeBugCheckEx+0x104<BR>fffffa60`03d50a10
fffff800`018b51d9 : 00000000`00000000 fffff980`0427cf78 fffffa80`0e0ecf00
fffff980`1c27ef40 : nt!MmAccessFault+0x1371<BR>fffffa60`03d50b00
fffffa60`0400b1d0 : fffff980`1c27ef40 fffff980`04318e00 fffffa60`04005eba
fffff980`04318e78 : nt!KiPageFault+0x119<BR>fffffa60`03d50c90
fffffa60`04005e9d : fffff980`04318e98 fffff980`043bccb0 fffff980`1b88afd0
fffff980`04318e78 : ibbus!__process_sweep+0x44
[s:\builds\3609\branches\mlnx_winof_2-0\core\al\kernel\al_ioc_pnp.c @
2315]<BR>fffffa60`03d50cc0 fffffa60`040070d9 : fffff980`04318d60
fffff980`0434afd0 00000000`00000000 fffffa60`0400743c :
ibbus!__cl_async_proc_worker+0x61
[s:\builds\3609\branches\mlnx_winof_2-0\core\complib\cl_async_proc.c @
153]<BR>fffffa60`03d50cf0 fffffa60`04007464 : fffff980`0434afd0
00000000`00000080 fffff980`0434afd0 8b8b8b8b`8b8b8b8b :
ibbus!__cl_thread_pool_routine+0x41
[s:\builds\3609\branches\mlnx_winof_2-0\core\complib\cl_threadpool.c @
66]<BR>fffffa60`03d50d20 fffff800`01adafd3 : 8b8b8b8b`8b8b8b8b
8b8b8b8b`8b8b8b8b 8b8b8b8b`8b8b8b8b 8b8b8b8b`8b8b8b01 :
ibbus!__thread_callback+0x28
[s:\builds\3609\branches\mlnx_winof_2-0\core\complib\kernel\cl_thread.c @
49]<BR>fffffa60`03d50d50 fffff800`018f0816 : fffffa60`01999180
fffffa80`0e0eb290 fffffa60`019a2d40 00000000`00000001 :
nt!PspSystemThreadStartup+0x57<BR>fffffa60`03d50d80 00000000`00000000 :
00000000`00000000 00000000`00000000 00000000`00000000 00000000`00000000 :
nt!KiStartSystemThread+0x16</FONT></DIV>
<DIV> </DIV><FONT face=Arial size=2>
<DIV><BR>STACK_COMMAND: kb</DIV>
<DIV> </DIV>
<DIV>FOLLOWUP_IP: <BR>ibbus!__process_sweep+44
[s:\builds\3609\branches\mlnx_winof_2-0\core\al\kernel\al_ioc_pnp.c @
2315]<BR>fffffa60`0400b1d0 83b8d400000003
cmp dword ptr [rax+0D4h],3</DIV>
<DIV> </DIV>
<DIV>FAULTING_SOURCE_CODE: <BR> 2311: <BR> 2312:
p_results = PARENT_STRUCT( p_async_item, ioc_sweep_results_t,
async_item );<BR> 2313: CL_ASSERT(
!p_results->p_svc->query_cnt );<BR> 2314: <BR>> 2315:
if( p_results->p_svc->obj.state == CL_DESTROYING )<BR>
2316: {<BR> 2317: __put_iou_map( gp_ioc_pnp,
&p_results->iou_map );<BR> 2318: goto
err;<BR> 2319: }<BR> 2320: </DIV>
<DIV> </DIV>
<DIV><BR>SYMBOL_STACK_INDEX: 6</DIV>
<DIV> </DIV>
<DIV>SYMBOL_NAME: ibbus!__process_sweep+44</DIV>
<DIV> </DIV>
<DIV>FOLLOWUP_NAME: MachineOwner</DIV>
<DIV> </DIV>
<DIV>FAILURE_BUCKET_ID: X64_0xD5_VRF_ibbus!__process_sweep+44</DIV>
<DIV> </DIV>
<DIV>BUCKET_ID: X64_0xD5_VRF_ibbus!__process_sweep+44</DIV>
<DIV> </DIV>
<DIV>Followup: MachineOwner<BR>---------</DIV>
<DIV> </DIV>
<DIV></FONT> </DIV></BLOCKQUOTE></BLOCKQUOTE></BLOCKQUOTE></BODY></HTML>