[ofw][patch][ND provider] Improving latency of ms-mpi
Fab Tillier
ftillier at microsoft.com
Thu Aug 6 13:07:24 PDT 2009
Hi Leo,
Sorry for the delay in getting back to you. I think using inline data makes sense. A couple of things, though; it's hard to comment inline because of the HTML format, so here they are at the top:
Max inline data is not an in/out parameter to the INDConnector::CreateEndpoint method; today it is output-only. I don't know if it makes sense to have it be an input parameter. Aren't the proper tuning points dependent on the HCA rather than the app?
Assuming that the tuning points are specific to the HCA model, does it make sense to always allocate 400 bytes? Is inlining 400 bytes always faster than DMAing the data for all HCAs (InfiniHost 3 LX, EX, ConnectX, etc.)? It seems to me that having the inline data size controlled by the HCA driver rather than the ND provider would make more sense, and would allow the HCA driver to optimize for the sweet spot.
This all assumes that the sweet spot is largely independent of the host system (CPU, memory bus, PCI bus) and is mainly a function of the HCA's ability to process requests.
When you query the QP to get the actual inline data size, you store the value temporarily in a local variable, then store it in the member variable and also return it. Why not just store it in the member variable and eliminate the local variable?
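Something along these lines (an untested sketch, using the names from your patch) is what I have in mind:

ib_qp_attr_t qp_attr;
hr = QueryQp( &qp_attr );
if( FAILED( hr ) )
{
    DestroyQp();
    return hr;
}
// Store the HCA-reported limit straight into the member; no local temporary needed.
m_MaxInlineSize = (UINT32)qp_attr.sq_max_inline;
...
// ...and at the end, return it from the member.
if( SUCCEEDED( hr ) && pMaxInlineData != NULL )
    *pMaxInlineData = m_MaxInlineSize;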
NdEndpoint.h @ line 176 seems to have whitespace issues.
Lastly, the latency improvement you quote is for RDMA writes (the ndrpingpong test). You also provide ib_send_lat results compared with ndpingpong, but those aren't comparable to begin with because of test methodology (e.g. ib_send_lat reports the median latency as its typical value, which helps ignore outliers, while ndpingpong calculates an average). Do you have before/after results for ndpingpong?
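Just to illustrate the difference (a generic snippet, not either tool's actual code), a single outlier barely moves the median but skews the average:

#include <algorithm>
#include <numeric>
#include <vector>

std::vector<double> usec = { 1.26, 1.26, 1.26, 1.26, 31.71 };  // one outlier
std::sort( usec.begin(), usec.end() );
double median  = usec[usec.size() / 2];                                           // 1.26 us
double average = std::accumulate( usec.begin(), usec.end(), 0.0 ) / usec.size();  // 7.35 us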
Thanks,
-Fab
From: ofw-bounces at lists.openfabrics.org [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Leonid Keller
Sent: Monday, July 27, 2009 1:27 AM
To: Fab Tillier; Tzachi Dar
Cc: ofw at lists.openfabrics.org
Subject: [ofw][patch][ND provider] Improving latency of ms-mpi
This patch adds use of the INLINE DATA facility of Mellanox HCAs to improve the latency of the ND provider.
E.g., on our Nehalem computer (Intel(R) Core(TM) i7 CPU 920 @ 2.67 GHz, 2660 MHz, 4 cores, 8 logical processors) it improved latency from 2.23 us to 1.12 us.
The ideas of the patch are:
- by default, the ND provider creates the QP with an inline data size of 400 bytes (this can enlarge the user's QP size);
- this default can be changed via the environment variable IBNDPROV_MAX_INLINE_SIZE;
- an ND application can specify the required INLINE DATA size when creating the QP; this value takes precedence over the default one (see the example below).
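For illustration, an ND application could request a specific inline size roughly like this (a sketch only; the variable names and the 160-byte value are made up, error handling omitted):

SIZE_T MaxInlineData = 160;        // requested inline threshold, in bytes
INDEndpoint* pEndpoint = NULL;
HRESULT hr = pConnector->CreateEndpoint(
    pInboundCq, pOutboundCq,
    nInboundEntries, nOutboundEntries,
    nInboundSge, nOutboundSge,
    InboundReadLimit, OutboundReadLimit,
    &MaxInlineData,                // in: requested size; out: size actually granted by the HCA
    &pEndpoint );

Passing NULL for pMaxInlineData keeps the default (400 bytes, or the IBNDPROV_MAX_INLINE_SIZE override).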
Index: ulp/nd/user/NdEndpoint.cpp
===================================================================
--- ulp/nd/user/NdEndpoint.cpp (revision 2310)
+++ ulp/nd/user/NdEndpoint.cpp (working copy)
@@ -41,6 +41,8 @@
#pragma warning( pop )
#include "nddebug.h"
+extern uint32_t g_nd_max_inline_size;
+
#if defined(EVENT_TRACING)
#ifdef offsetof
#undef offsetof
@@ -96,7 +98,7 @@
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
__in SIZE_T OutboundReadLimit,
- __out_opt SIZE_T* pMaxInlineData
+ __in_opt __out_opt SIZE_T* pMaxInlineData
)
{
ND_ENTER( ND_DBG_NDI );
@@ -117,10 +119,17 @@
m_pParent->m_Ifc.user_verbs.nd_get_qp_state != NULL ||
m_pParent->m_Ifc.user_verbs.pre_destroy_qp != NULL ||
m_pParent->m_Ifc.user_verbs.post_destroy_qp != NULL ||
+ m_pParent->m_Ifc.user_verbs.post_query_qp != NULL ||
m_pParent->m_Ifc.user_verbs.post_send != NULL ||
m_pParent->m_Ifc.user_verbs.post_recv != NULL /*||
m_pParent->m_Ifc.user_verbs.bind_mw != NULL*/ );
+ UINT32 InlineSize;
+ if ( pMaxInlineData )
+ InlineSize = (UINT32)*pMaxInlineData;
+ else
+ InlineSize = g_nd_max_inline_size;
+
HRESULT hr = CreateQp(
pInboundCq,
pOutboundCq,
@@ -129,13 +138,25 @@
nInboundSge,
nOutboundSge,
InboundReadLimit,
- OutboundReadLimit );
+ OutboundReadLimit,
+ InlineSize );
if( FAILED( hr ) )
return hr;
+ ib_qp_attr_t qp_attr;
+ hr = QueryQp(&qp_attr);
+ if( FAILED( hr ) ) {
+ DestroyQp();
+ return hr;
+ }
+ else
+ InlineSize = (UINT32)qp_attr.sq_max_inline;
+
+
m_Ird = (UINT8)InboundReadLimit;
m_Ord = (UINT8)OutboundReadLimit;
+ m_MaxInlineSize = InlineSize;
// Move the QP to the INIT state so users can post receives.
hr = ModifyQp( IB_QPS_INIT );
@@ -143,10 +164,7 @@
DestroyQp();
if( SUCCEEDED( hr ) && pMaxInlineData != NULL )
- {
- // Worst case.
- *pMaxInlineData = nOutboundSge * 12;
- }
+ *pMaxInlineData = InlineSize;
return hr;
}
@@ -286,7 +304,11 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = WR_SEND;
- wr.send_opt = 0;
+ if ( pResult->BytesTransferred <= m_MaxInlineSize )
+ wr.send_opt = IB_SEND_OPT_INLINE;
+ else
+ wr.send_opt = 0;
+
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -374,11 +396,15 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = WR_SEND;
+ if ( pResult->BytesTransferred <= m_MaxInlineSize )
+ wr.send_opt = IB_SEND_OPT_INLINE;
+ else
+ wr.send_opt = 0;
// We simulate invalidate operations (since we simulate MW use). We
// put the RKey in the immediate data, the recipient will do the
// lookup of the MW based on that (as they would with a regular
// invalidate request).
- wr.send_opt = IB_SEND_OPT_IMMEDIATE;
+ wr.send_opt |= IB_SEND_OPT_IMMEDIATE;
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -665,7 +691,10 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = Type;
- wr.send_opt = 0;
+ if ( pResult->BytesTransferred <= m_MaxInlineSize )
+ wr.send_opt = IB_SEND_OPT_INLINE;
+ else
+ wr.send_opt = 0;
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,11 +766,14 @@
__in SIZE_T nInboundSge,
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
- __in SIZE_T OutboundReadLimit
+ __in SIZE_T OutboundReadLimit,
+ __in SIZE_T MaxInlineData
)
{
ND_ENTER( ND_DBG_NDI );
+ if( MaxInlineData > UINT_MAX )
+ return ND_INVALID_PARAMETER_3;
if( nInboundEntries > UINT_MAX )
return ND_INVALID_PARAMETER_4;
if( nOutboundEntries > UINT_MAX )
@@ -764,6 +796,7 @@
qp_ioctl.in.qp_create.rq_depth = (uint32_t)nInboundEntries;
qp_ioctl.in.qp_create.sq_sge = (uint32_t)nOutboundSge;
qp_ioctl.in.qp_create.rq_sge = (uint32_t)nInboundSge;
+ qp_ioctl.in.qp_create.sq_max_inline = (uint32_t)MaxInlineData;
qp_ioctl.in.qp_create.h_srq = NULL;
qp_ioctl.in.qp_create.sq_signaled = FALSE;
@@ -941,4 +974,67 @@
return S_OK;
}
+HRESULT CEndpoint::QueryQp(
+ __out ib_qp_attr_t *qp_attr
+ )
+{
+ ib_api_status_t status;
+
+ ND_ENTER( ND_DBG_NDI );
+
+ ual_query_qp_ioctl_t qp_ioctl;
+ cl_memclr( &qp_ioctl, sizeof(qp_ioctl) );
+ qp_ioctl.in.h_qp = m_hQp;
+
+ /* Call the uvp pre call if the vendor library provided a valid ca handle */
+ if( m_pParent->m_Ifc.user_verbs.pre_query_qp )
+ {
+ /* Pre call to the UVP library */
+ status = m_pParent->m_Ifc.user_verbs.pre_query_qp( m_uQp, &qp_ioctl.in.umv_buf );
+ if( status != IB_SUCCESS )
+ goto done;
+ }
+
+ DWORD bytes_ret;
+ BOOL fSuccess = DeviceIoControl(
+ m_pParent->m_hSync,
+ UAL_QUERY_QP,
+ &qp_ioctl.in,
+ sizeof(qp_ioctl.in),
+ &qp_ioctl.out,
+ sizeof(qp_ioctl.out),
+ &bytes_ret,
+ NULL
+ );
+
+ if( fSuccess != TRUE || bytes_ret != sizeof(qp_ioctl.out) )
+ status = IB_ERROR;
+ else
+ status = qp_ioctl.out.status;
+
+ /* Call vendor's post_query_qp */
+ CL_ASSERT( m_pParent->m_Ifc.user_verbs.post_query_qp );
+ if( m_pParent->m_Ifc.user_verbs.post_query_qp )
+ {
+ m_pParent->m_Ifc.user_verbs.post_query_qp( m_uQp, status,
+ &qp_ioctl.out.attr, &qp_ioctl.out.umv_buf );
+ }
+
+done:
+ ND_PRINT( TRACE_LEVEL_INFORMATION, ND_DBG_NDI,
+ ("Queried QP %#I64x, QPn %#x, pd %#I64x, context %p, status %#x \n",
+ m_hQp, m_Qpn, m_pParent->m_hPd, this, status ) );
+
+ switch( status )
+ {
+ case IB_SUCCESS:
+ *qp_attr = qp_ioctl.out.attr;
+ return S_OK;
+
+ default:
+ return ND_UNSUCCESSFUL;
+ }
+
+}
+
} // namespace
Index: ulp/nd/user/NdEndpoint.h
===================================================================
--- ulp/nd/user/NdEndpoint.h (revision 2310)
+++ ulp/nd/user/NdEndpoint.h (working copy)
@@ -67,7 +67,7 @@
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
__in SIZE_T OutboundReadLimit,
- __out_opt SIZE_T* pMaxInlineData
+ __in_opt __out_opt SIZE_T* pMaxInlineData
);
public:
@@ -176,11 +176,16 @@
__in SIZE_T nInboundSge,
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
- __in SIZE_T OutboundReadLimit
+ __in SIZE_T OutboundReadLimit,
+ __in SIZE_T MaxInlineData
);
void DestroyQp();
+ HRESULT QueryQp(
+ __out ib_qp_attr_t *qp_attr
+ );
+
HRESULT ModifyQp(
__in ib_qp_state_t NewState
);
@@ -197,6 +202,7 @@
UINT8 m_Ird;
UINT8 m_Ord;
+ UINT32 m_MaxInlineSize;
};
} // namespace
Index: ulp/nd/user/NdProv.cpp
===================================================================
--- ulp/nd/user/NdProv.cpp (revision 2310)
+++ ulp/nd/user/NdProv.cpp (working copy)
@@ -57,6 +57,7 @@
uint32_t g_nd_dbg_level = TRACE_LEVEL_ERROR;
/* WPP doesn't want here literals! */
uint32_t g_nd_dbg_flags = 0x80000001; /* ND_DBG_ERROR | ND_DBG_NDI; */
+uint32_t g_nd_max_inline_size = 400;
HANDLE ghHeap;
@@ -462,6 +463,8 @@
switch( dwReason )
{
case DLL_PROCESS_ATTACH:
+ TCHAR env_var[16];
+ DWORD i;
#if defined(EVENT_TRACING)
@@ -471,9 +474,6 @@
WPP_INIT_TRACING(L"ibndprov.dll");
#endif
#elif DBG
- TCHAR env_var[16];
- DWORD i;
-
i = GetEnvironmentVariable( "IBNDPROV_DBG_LEVEL", env_var, sizeof(env_var) );
if( i && i <= 16 )
{
@@ -494,6 +494,12 @@
GetCurrentProcessId(), g_nd_dbg_level ,g_nd_dbg_flags) );
#endif
+ i = GetEnvironmentVariable( "IBNDPROV_MAX_INLINE_SIZE", env_var, sizeof(env_var) );
+ if( i && i <= 16 )
+ {
+ g_nd_max_inline_size = _tcstoul( env_var, NULL, 16 );
+ }
+
ghHeap = HeapCreate( 0, 0, 0 );
if( ghHeap == NULL )
{
________________________________
From: Tzachi Dar
Sent: Monday, July 20, 2009 6:13 PM
To: Leonid Keller
Subject: FW: Improving latency of ms-mpi
________________________________
From: Tzachi Dar
Sent: Tuesday, July 14, 2009 6:39 PM
To: Fab Tillier
Cc: Ishai Rabinovitz; Gilad Shainer; Leonid Keller; Eric Lantz (HPC GOFAST)
Subject: Improving latency of ms-mpi
Hi Fab,
I have run some tests on my Nehalem computer (Intel(R) Core(TM) i7 CPU 920 @ 2.67 GHz, 2660 MHz, 4 cores, 8 logical processors).
It seems that the latency I get for RDMA operations using ib_write_lat is 1.26 us.
On the same machine, ndrpingpong.exe gave 2.23 us, which is almost 1 us higher )-:
After some debugging of the system, it seems that the problem comes from not using inline send. Using inline send reduced the ndrpingpong latency to 1.12 us, which as far as I know is around 200 ns higher than the Clovertown numbers.
[By the way, it seems that the ND pingpong test does better than ours due to a lazy CQ poll, but that is a different issue.]
Since nothing in the ND provider currently uses inline send, I guess we can improve all the MS-MPI results by about 1 us by using it.
I have created a very simple patch that demonstrates what has to be done to make inline sends work. Please note that the patch is oversimplified and will always use inline RDMA operations, even when it is not allowed to.
The questions that I see are:
1) How do we pass the max inline parameter when creating the QP? Do we want to add a new parameter, use the nOutboundSge * 12 number, or pick a big default?
2) How do we decide when to send inline? Do we want to add a new ND_OP_FLAG, or do we always send inline when we can (that is, when the message is smaller than max_inline)? (See the sketch below.)
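For (2), the simplest rule would be to inline whenever the payload fits the limit reported by the HCA, e.g. (a sketch; it assumes the endpoint caches the queried limit in a member such as m_MaxInlineSize):

// Use inline only when the payload fits within the HCA-reported inline limit.
if( pResult->BytesTransferred <= m_MaxInlineSize )
    wr.send_opt = IB_SEND_OPT_INLINE;
else
    wr.send_opt = 0;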
Here is the patch created:
Index: NdEndpoint.cpp
===================================================================
--- NdEndpoint.cpp (revision 4569)
+++ NdEndpoint.cpp (working copy)
@@ -635,7 +635,7 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = Type;
- wr.send_opt = 0;
+ wr.send_opt = IB_SEND_OPT_INLINE; //????
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,6 +737,8 @@
qp_ioctl.in.qp_create.h_srq = NULL;
qp_ioctl.in.qp_create.sq_signaled = FALSE;
+ qp_ioctl.in.qp_create.sq_max_inline = 400;
+
/* Pre call to the UVP library */
CL_ASSERT( m_pParent->m_Ifc.user_verbs.pre_create_qp );
qp_ioctl.in.qp_create.h_sq_cq = pOutboundCq->m_uCq;
Thanks
Tzachi
Here are the more detailed results:
C:\Users\herod>ib_write_lat -a 11.4.12.128
------------------------------------------------------------------
RDMA_Write Latency Test
Connection type : RC
max inline size 28
local address: LID 0x200, QPN 0x4b000200, PSN 0xe7520000, RKey 0x140004 VAddr 0x00000001630040
remote address: LID 0x100, QPN 0x4a001e00, PSN 0xf6040000, RKey 0x130040 VAddr 0x00000001530040
Mtu : 2048
------------------------------------------------------------------
#bytes #iterations t_min[usec] t_max[usec] t_typical[usec]
2 1000 0.98 4.89 1.26
4 1000 1.12 1.26 1.26
8 1000 0.98 1.40 1.26
16 1000 0.84 31.71 1.26
C:\Users\herod>ib_read_lat -a 11.4.12.128
------------------------------------------------------------------
RDMA_Read Latency Test
max inline size 28
local address: LID 0x200, QPN 0x4a000600, PSN 0xf1560000, RKey 0x13000c VAddr 0x00000001550040
remote address: LID 0x100, QPN 0x4a000400, PSN 0x42390000, RKey 0x13000c VAddr 0x00000001390040
Mtu : 2048
------------------------------------------------------------------
#bytes #iterations t_min[usec] t_max[usec] t_typical[usec]
2 1000 1.40 8.94 2.23
4 1000 1.40 2.79 2.23
8 1000 1.96 2.51 2.23
16 1000 1.68 2.79 2.23
32 1000 1.40 9.22 2.23
64 1000 1.40 3.07 2.23
128 1000 2.23 2.79 2.23
256 1000 1.96 2.79 2.51
512 1000 1.96 3.07 2.79
1024 1000 2.51 3.63 3.07
2048 1000 3.07 32.69 3.91
4096 1000 3.91 5.31 4.75
8192 1000 5.87 6.70 6.15
16384 1000 5.31 26.82 8.66
32768 1000 12.29 15.37 14.25
65536 1000 22.63 42.46 24.86
131072 1000 43.30 88.56 46.10
262144 1000 84.09 152.53 88.84
524288 1000 169.02 236.90 174.32
1048576 1000 342.22 382.45 345.02
2097152 1000 653.99 706.51 686.96
4194304 1000 1364.70 1408.00 1370.57
8388608 1000 2736.10 2765.71 2738.62
------------------------------------------------------------------
C:\Users\herod>ib_Send_lat -a 11.4.12.128
------------------------------------------------------------------
Send Latency Test
Inline data is used up to 400 bytes message
Connection type : RC
test
local address: LID 0x200, QPN 0x4a000800, PSN 0x7a600000, RKey 0x130010 VAddr 0x00000001510040
remote address: LID 0x100, QPN 0x4a000600, PSN 0x6c6e0000, RKey 0x130010 VAddr 0x00000001570040
Mtu : 2048
------------------------------------------------------------------
#bytes #iterations t_min[usec] t_max[usec] t_typical[usec]
2 1000 1.26 20.95 1.40
4 1000 1.26 5.17 1.40
8 1000 1.12 5.03 1.40
C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
1 100000 3.07 106.77 325414
2 100000 3.07 106.77 650829
4 100000 3.07 106.77 1301659
8 100000 3.07 109.31 2603319
16 100000 3.07 106.77 5206638
32 100000 3.07 109.31 10413276
C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndrpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
1 100000 2.23 101.39 447627
2 100000 2.23 101.39 895255
4 100000 2.23 101.39 1790510
8 100000 2.23 101.39 3581020
16 100000 2.23 101.39 7162041
32 100000 2.23 108.37 14324082
64 100000 2.23 104.88 28648164
128 100000 2.23 111.91 57296329
256 100000 2.51 99.44 101829753
512 100000 2.51 111.85 203659506
1024 100000 3.07 106.77 333224861
2048 100000 3.91 105.86 523651240
4096 100000 4.75 101.98 862497367
8192 100000 6.15 100.41 1332899446
16384 100000 8.66 102.84 1891916859
32768 64000 13.97 101.37 2345933562
65536 32000 24.86 102.12 2635884647
And after moving to inline send ...
Using 8 processors. Sender Frequency is 3579545
1 100000 1.12 111.91 895255
2 100000 1.12 118.89 1790510
4 100000 1.12 118.89 3581020
8 100000 1.12 111.91 7162041
16 100000 1.12 118.89 14324082
32 100000 1.12 125.87 28648164
64 100000 1.40 106.30 45845272
128 100000 1.40 111.89 91690544
256 100000 2.23 101.39 114592658