This patch uses the INLINE DATA facility of Mellanox HCAs to improve the latency of the ND provider.
For example, on our Nehalem machine (Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz, 2660 MHz, 4 cores, 8 logical processors) it improved latency from 2.23 us to 1.12 us.

Here are the ideas of the patch:
 - by default, the ND provider creates QPs with an inline data size of 400 bytes
   (this can enlarge the user's QP, since inline data is copied into the send work queue entries);
 - this default can be changed by defining the environment variable IBNDPROV_MAX_INLINE_SIZE;
 - an ND application can specify the required INLINE DATA size when creating a QP; this value takes precedence over the default (a usage sketch follows this list).
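
A minimal usage sketch, assuming the NDv1 SPI (ndspi.h) interfaces; the helper name, queue depths, and the 160-byte request are hypothetical examples, not part of the patch:

    // Request a specific inline-data size at QP creation time. With this
    // patch, pMaxInlineData is __in_opt __out_opt: on input it carries the
    // requested size, on output the size actually granted by the HCA
    // (read back internally through the new QueryQp call).
    HRESULT CreateEndpointWithInline(
        INDConnector* pConnector,
        INDCompletionQueue* pInboundCq,
        INDCompletionQueue* pOutboundCq,
        INDEndpoint** ppEndpoint )
    {
        SIZE_T MaxInlineData = 160;     // in: requested inline size, in bytes
        HRESULT hr = pConnector->CreateEndpoint(
            pInboundCq, pOutboundCq,
            64, 64,                     // nInboundEntries, nOutboundEntries
            1, 1,                       // nInboundSge, nOutboundSge
            0, 0,                       // InboundReadLimit, OutboundReadLimit
            &MaxInlineData,             // out: size actually granted
            ppEndpoint );
        // Passing NULL instead keeps the provider default (g_nd_max_inline_size:
        // 400 bytes, or the IBNDPROV_MAX_INLINE_SIZE override).
        return hr;
    }

Sends whose payload does not exceed the granted size are then posted by the provider with IB_SEND_OPT_INLINE automatically; no new ND_OP_FLAG is needed.
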
Index: ulp/nd/user/NdEndpoint.cpp
===================================================================
--- ulp/nd/user/NdEndpoint.cpp	(revision 2310)
+++ ulp/nd/user/NdEndpoint.cpp	(working copy)
@@ -41,6 +41,8 @@
 #pragma warning( pop )
 #include "nddebug.h"
 
+extern uint32_t g_nd_max_inline_size;
+
 #if defined(EVENT_TRACING)
 #ifdef offsetof
 #undef offsetof
@@ -96,7 +98,7 @@
     __in SIZE_T nOutboundSge,
     __in SIZE_T InboundReadLimit,
     __in SIZE_T OutboundReadLimit,
-    __out_opt SIZE_T* pMaxInlineData
+    __in_opt __out_opt SIZE_T* pMaxInlineData
     )
 {
     ND_ENTER( ND_DBG_NDI );
@@ -117,10 +119,17 @@
         m_pParent->m_Ifc.user_verbs.nd_get_qp_state != NULL ||
         m_pParent->m_Ifc.user_verbs.pre_destroy_qp != NULL ||
         m_pParent->m_Ifc.user_verbs.post_destroy_qp != NULL ||
+        m_pParent->m_Ifc.user_verbs.post_query_qp != NULL ||
         m_pParent->m_Ifc.user_verbs.post_send != NULL ||
         m_pParent->m_Ifc.user_verbs.post_recv != NULL /*||
         m_pParent->m_Ifc.user_verbs.bind_mw != NULL*/ );
 
+    UINT32 InlineSize;
+    if ( pMaxInlineData )
+        InlineSize = (UINT32)*pMaxInlineData;
+    else
+        InlineSize = g_nd_max_inline_size;
+
     HRESULT hr = CreateQp(
         pInboundCq,
         pOutboundCq,
@@ -129,13 +138,25 @@
         nInboundSge,
         nOutboundSge,
         InboundReadLimit,
-        OutboundReadLimit );
+        OutboundReadLimit,
+        InlineSize );
 
     if( FAILED( hr ) )
         return hr;
 
+    ib_qp_attr_t qp_attr;
+    hr = QueryQp(&qp_attr);
+    if( FAILED( hr ) ) {
+        DestroyQp();
+        return hr;
+    }
+    else
+        InlineSize = (UINT32)qp_attr.sq_max_inline;
+
+
     m_Ird = (UINT8)InboundReadLimit;
     m_Ord = (UINT8)OutboundReadLimit;
+    m_MaxInlineSize = InlineSize;
 
     // Move the QP to the INIT state so users can post receives.
     hr = ModifyQp( IB_QPS_INIT );
@@ -143,10 +164,7 @@
         DestroyQp();
 
     if( SUCCEEDED( hr ) && pMaxInlineData != NULL )
-    {
-        // Worst case.
-        *pMaxInlineData = nOutboundSge * 12;
-    }
+        *pMaxInlineData = InlineSize;
 
     return hr;
 }
@@ -286,7 +304,11 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = WR_SEND;
-    wr.send_opt = 0;
+    if ( pResult->BytesTransferred <= m_MaxInlineSize )
+        wr.send_opt = IB_SEND_OPT_INLINE;
+    else
+        wr.send_opt = 0;
+
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -374,11 +396,15 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = WR_SEND;
+    if ( pResult->BytesTransferred <= m_MaxInlineSize )
+        wr.send_opt = IB_SEND_OPT_INLINE;
+    else
+        wr.send_opt = 0;
     // We simulate invalidate operations (since we simulate MW use). We
     // put the RKey in the immediate data, the recipient will do the
     // lookup of the MW based on that (as they would with a regular
     // invalidate request).
-    wr.send_opt = IB_SEND_OPT_IMMEDIATE;
+    wr.send_opt |= IB_SEND_OPT_IMMEDIATE;
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -665,7 +691,10 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = Type;
-    wr.send_opt = 0;
+    if ( pResult->BytesTransferred <= m_MaxInlineSize )
+        wr.send_opt = IB_SEND_OPT_INLINE;
+    else
+        wr.send_opt = 0;
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,11 +766,14 @@
     __in SIZE_T nInboundSge,
     __in SIZE_T nOutboundSge,
     __in SIZE_T InboundReadLimit,
-    __in SIZE_T OutboundReadLimit
+    __in SIZE_T OutboundReadLimit,
+    __in SIZE_T MaxInlineData
     )
 {
     ND_ENTER( ND_DBG_NDI );
 
+    if( MaxInlineData > UINT_MAX )
+        return ND_INVALID_PARAMETER_3;
     if( nInboundEntries > UINT_MAX )
         return ND_INVALID_PARAMETER_4;
     if( nOutboundEntries > UINT_MAX )
@@ -764,6 +796,7 @@
     qp_ioctl.in.qp_create.rq_depth = (uint32_t)nInboundEntries;
     qp_ioctl.in.qp_create.sq_sge = (uint32_t)nOutboundSge;
     qp_ioctl.in.qp_create.rq_sge = (uint32_t)nInboundSge;
+    qp_ioctl.in.qp_create.sq_max_inline = (uint32_t)MaxInlineData;
     qp_ioctl.in.qp_create.h_srq = NULL;
     qp_ioctl.in.qp_create.sq_signaled = FALSE;
 
@@ -941,4 +974,67 @@
     return S_OK;
 }
 
+HRESULT CEndpoint::QueryQp(
+    __out ib_qp_attr_t *qp_attr
+    )
+{
+    ib_api_status_t status;
+
+    ND_ENTER( ND_DBG_NDI );
+
+    ual_query_qp_ioctl_t qp_ioctl;
+    cl_memclr( &qp_ioctl, sizeof(qp_ioctl) );
+    qp_ioctl.in.h_qp = m_hQp;
+
+    /* Call the uvp pre call if the vendor library provided a valid ca handle */
+    if( m_pParent->m_Ifc.user_verbs.pre_query_qp )
+    {
+        /* Pre call to the UVP library */
+        status = m_pParent->m_Ifc.user_verbs.pre_query_qp( m_uQp, &qp_ioctl.in.umv_buf );
+        if( status != IB_SUCCESS )
+            goto done;
+    }
+
+    DWORD bytes_ret;
+    BOOL fSuccess = DeviceIoControl(
+        m_pParent->m_hSync,
+        UAL_QUERY_QP,
+        &qp_ioctl.in,
+        sizeof(qp_ioctl.in),
+        &qp_ioctl.out,
+        sizeof(qp_ioctl.out),
+        &bytes_ret,
+        NULL
+        );
+
+    if( fSuccess != TRUE || bytes_ret != sizeof(qp_ioctl.out) )
+        status = IB_ERROR;
+    else
+        status = qp_ioctl.out.status;
+
+    /* Call vendor's post_query_qp */
+    CL_ASSERT( m_pParent->m_Ifc.user_verbs.post_query_qp );
+    if( m_pParent->m_Ifc.user_verbs.post_query_qp )
+    {
+        m_pParent->m_Ifc.user_verbs.post_query_qp( m_uQp, status,
+            &qp_ioctl.out.attr, &qp_ioctl.out.umv_buf );
+    }
+
+done:
+    ND_PRINT( TRACE_LEVEL_INFORMATION, ND_DBG_NDI,
        ("Queried QP %#I64x, QPn %#x, pd %#I64x, context %p, status %#x \n",
+        m_hQp, m_Qpn, m_pParent->m_hPd, this, status ) );
+
+    switch( status )
+    {
+    case IB_SUCCESS:
+        *qp_attr = qp_ioctl.out.attr;
+        return S_OK;
+
+    default:
+        return ND_UNSUCCESSFUL;
+    }
+
+}
+
 } // namespace
Index: ulp/nd/user/NdEndpoint.h
===================================================================
--- ulp/nd/user/NdEndpoint.h	(revision 2310)
+++ ulp/nd/user/NdEndpoint.h	(working copy)
@@ -67,7 +67,7 @@
         __in SIZE_T nOutboundSge,
         __in SIZE_T InboundReadLimit,
         __in SIZE_T OutboundReadLimit,
-        __out_opt SIZE_T* pMaxInlineData
+        __in_opt __out_opt SIZE_T* pMaxInlineData
         );
 
 public:
@@ -176,11 +176,16 @@
         __in SIZE_T nInboundSge,
         __in SIZE_T nOutboundSge,
         __in SIZE_T InboundReadLimit,
-        __in SIZE_T OutboundReadLimit
+        __in SIZE_T OutboundReadLimit,
+        __in SIZE_T MaxInlineData
         );
 
     void DestroyQp();
 
+    HRESULT QueryQp(
+        __out ib_qp_attr_t *qp_attr
+        );
+
     HRESULT ModifyQp(
         __in ib_qp_state_t NewState
         );
@@ -197,6 +202,7 @@
 
     UINT8 m_Ird;
     UINT8 m_Ord;
+    UINT32 m_MaxInlineSize;
 };
 
 } // namespace
Index: ulp/nd/user/NdProv.cpp
===================================================================
--- ulp/nd/user/NdProv.cpp	(revision 2310)
+++ ulp/nd/user/NdProv.cpp	(working copy)
@@ -57,6 +57,7 @@
 uint32_t g_nd_dbg_level = TRACE_LEVEL_ERROR;
 /* WPP doesn't want here literals! */
 uint32_t g_nd_dbg_flags = 0x80000001; /* ND_DBG_ERROR | ND_DBG_NDI; */
+uint32_t g_nd_max_inline_size = 400;
 
 HANDLE ghHeap;
 
@@ -462,6 +463,8 @@
     switch( dwReason )
     {
     case DLL_PROCESS_ATTACH:
+        TCHAR env_var[16];
+        DWORD i;
 
 
 #if defined(EVENT_TRACING)
@@ -471,9 +474,6 @@
         WPP_INIT_TRACING(L"ibndprov.dll");
 #endif
 #elif DBG
-        TCHAR env_var[16];
-        DWORD i;
-
         i = GetEnvironmentVariable( "IBNDPROV_DBG_LEVEL", env_var, sizeof(env_var) );
         if( i && i <= 16 )
         {
@@ -494,6 +494,12 @@
             GetCurrentProcessId(), g_nd_dbg_level ,g_nd_dbg_flags) );
 #endif
 
+        i = GetEnvironmentVariable( "IBNDPROV_MAX_INLINE_SIZE", env_var, sizeof(env_var) );
+        if( i && i <= 16 )
+        {
+            g_nd_max_inline_size = _tcstoul( env_var, NULL, 16 );
+        }
+
         ghHeap = HeapCreate( 0, 0, 0 );
         if( ghHeap == NULL )
         {
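
A usage note on the NdProv.cpp hunk above: the environment variable is read with _tcstoul( env_var, NULL, 16 ), i.e. with radix 16, so its value is interpreted as a hexadecimal string. For example, setting IBNDPROV_MAX_INLINE_SIZE=190 yields 0x190 = 400 bytes, the same as the built-in default.
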
________________________________________
From: Tzachi Dar
Sent: Monday, July 20, 2009 6:13 PM
To: Leonid Keller
Subject: FW: Improving latency of ms-mpi

________________________________________
From: Tzachi Dar
Sent: Tuesday, July 14, 2009 6:39 PM
To: Fab Tillier
Cc: Ishai Rabinovitz; Gilad Shainer; Leonid Keller; Eric Lantz (HPC GOFAST)
Subject: Improving latency of ms-mpi

Hi Fab,

I have run some tests on my Nehalem computer (Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz, 2660 MHz, 4 cores, 8 logical processors).

The latency that I get for RDMA operations using ib_write_lat is 1.26us.

On the same machine, ndrpingpong.exe gave 2.23us, which is almost 1us bigger )-:

After some debugging of the system, it seems that the problem comes from not using inline send. Using inline send reduced the ndrpingpong latency to 1.12us, which, as far as I know, is around 200ns bigger than the Clovertown numbers.

[By the way, it seems that the nd pingpong test is better than ours due to a lazy poll of the CQ, but this is a different issue.]

Since nothing in the ND provider is using inline send, I guess that we can improve all the MS-MPI results by about 1us by using inline send.

I have created a very simple patch that demonstrates what has to be done in order to allow inline sends to work. Please note that the patch is oversimplified and will always use inline RDMA operations, even when it is not allowed to.

The questions that I see are:
1) How do we pass the max inline parameter when creating the QP? Do we want to add a new parameter, do we want to use the "sge * 12" number, or do we want a big default?
2) How do we decide when to send using inline? Do we want to add a new ND_OP_FLAG, or do we want to always send inline when we can (that is, when the message size < max_inline)?
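
[For reference, the fuller patch at the top of this message settles both questions: (1) pMaxInlineData becomes an __in_opt __out_opt parameter backed by a configurable default, and (2) no new flag is added; the per-request decision reduces to a size check when building each work request, excerpted here from that patch's send path:

    if ( pResult->BytesTransferred <= m_MaxInlineSize )
        wr.send_opt = IB_SEND_OPT_INLINE;   // payload is copied into the WQE
    else
        wr.send_opt = 0;                    // normal gather from registered buffers
]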

Here is the patch created:
Index: NdEndpoint.cpp
===================================================================
--- NdEndpoint.cpp	(revision 4569)
+++ NdEndpoint.cpp	(working copy)
@@ -635,7 +635,7 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = Type;
-    wr.send_opt = 0;
+    wr.send_opt = IB_SEND_OPT_INLINE; //????
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,6 +737,8 @@
     qp_ioctl.in.qp_create.h_srq = NULL;
     qp_ioctl.in.qp_create.sq_signaled = FALSE;
 
+    qp_ioctl.in.qp_create.sq_max_inline = 400;
+
     /* Pre call to the UVP library */
     CL_ASSERT( m_pParent->m_Ifc.user_verbs.pre_create_qp );
     qp_ioctl.in.qp_create.h_sq_cq = pOutboundCq->m_uCq;

Thanks
Tzachi

Here are the more detailed results:

C:\Users\herod>ib_write_lat -a 11.4.12.128
------------------------------------------------------------------
                    RDMA_Write Latency Test
Connection type : RC
max inline size 28
 local address: LID 0x200, QPN 0x4b000200, PSN 0xe7520000, RKey 0x140004 VAddr 0x00000001630040
 remote address: LID 0x100, QPN 0x4a001e00, PSN 0xf6040000, RKey 0x130040 VAddr 0x00000001530040
Mtu : 2048
------------------------------------------------------------------
 #bytes #iterations    t_min[usec]    t_max[usec]  t_typical[usec]
      2        1000           0.98           4.89             1.26
      4        1000           1.12           1.26             1.26
      8        1000           0.98           1.40             1.26
     16        1000           0.84          31.71             1.26

C:\Users\herod>ib_read_lat -a 11.4.12.128
------------------------------------------------------------------
                    RDMA_Read Latency Test
max inline size 28
 local address: LID 0x200, QPN 0x4a000600, PSN 0xf1560000, RKey 0x13000c VAddr 0x00000001550040
 remote address: LID 0x100, QPN 0x4a000400, PSN 0x42390000, RKey 0x13000c VAddr 0x00000001390040
Mtu : 2048
------------------------------------------------------------------
 #bytes #iterations    t_min[usec]    t_max[usec]  t_typical[usec]
      2        1000           1.40           8.94             2.23
      4        1000           1.40           2.79             2.23
      8        1000           1.96           2.51             2.23
     16        1000           1.68           2.79             2.23
     32        1000           1.40           9.22             2.23
     64        1000           1.40           3.07             2.23
    128        1000           2.23           2.79             2.23
    256        1000           1.96           2.79             2.51
    512        1000           1.96           3.07             2.79
   1024        1000           2.51           3.63             3.07
   2048        1000           3.07          32.69             3.91
   4096        1000           3.91           5.31             4.75
   8192        1000           5.87           6.70             6.15
  16384        1000           5.31          26.82             8.66
  32768        1000          12.29          15.37            14.25
  65536        1000          22.63          42.46            24.86
 131072        1000          43.30          88.56            46.10
 262144        1000          84.09         152.53            88.84
 524288        1000         169.02         236.90           174.32
1048576        1000         342.22         382.45           345.02
2097152        1000         653.99         706.51           686.96
4194304        1000        1364.70        1408.00          1370.57
8388608        1000        2736.10        2765.71          2738.62
------------------------------------------------------------------

C:\Users\herod>ib_Send_lat -a 11.4.12.128
------------------------------------------------------------------
                    Send Latency Test
Inline data is used up to 400 bytes message
Connection type : RC
test
 local address: LID 0x200, QPN 0x4a000800, PSN 0x7a600000, RKey 0x130010 VAddr 0x00000001510040
 remote address: LID 0x100, QPN 0x4a000600, PSN 0x6c6e0000, RKey 0x130010 VAddr 0x00000001570040
Mtu : 2048
------------------------------------------------------------------
 #bytes #iterations    t_min[usec]    t_max[usec]  t_typical[usec]
      2        1000           1.26          20.95             1.40
      4        1000           1.26           5.17             1.40
      8        1000           1.12           5.03             1.40

C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
          1      100000         3.07       106.77        325414
          2      100000         3.07       106.77        650829
          4      100000         3.07       106.77       1301659
          8      100000         3.07       109.31       2603319
         16      100000         3.07       106.77       5206638
         32      100000         3.07       109.31      10413276

C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndrpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
          1      100000         2.23       101.39        447627
          2      100000         2.23       101.39        895255
          4      100000         2.23       101.39       1790510
          8      100000         2.23       101.39       3581020
         16      100000         2.23       101.39       7162041
         32      100000         2.23       108.37      14324082
         64      100000         2.23       104.88      28648164
        128      100000         2.23       111.91      57296329
        256      100000         2.51        99.44     101829753
        512      100000         2.51       111.85     203659506
       1024      100000         3.07       106.77     333224861
       2048      100000         3.91       105.86     523651240
       4096      100000         4.75       101.98     862497367
       8192      100000         6.15       100.41    1332899446
      16384      100000         8.66       102.84    1891916859
      32768       64000        13.97       101.37    2345933562
      65536       32000        24.86       102.12    2635884647

And after moving to inline send ...

Using 8 processors. Sender Frequency is 3579545
          1      100000         1.12       111.91        895255
          2      100000         1.12       118.89       1790510
          4      100000         1.12       118.89       3581020
          8      100000         1.12       111.91       7162041
         16      100000         1.12       118.89      14324082
         32      100000         1.12       125.87      28648164
         64      100000         1.40       106.30      45845272
        128      100000         1.40       111.89      91690544
        256      100000         2.23       101.39     114592658