[ofw][patch][ND provider] Improving latency of ms-mpi
Leonid Keller
leonid at mellanox.co.il
Mon Jul 27 01:27:04 PDT 2009
This patch adds usage of the INLINE DATA facility of Mellanox HCAs to
improve the latency of the ND provider.
For example, on our Nehalem computer (Intel(R) Core(TM) i7 CPU 920 @
2.67 GHz, 2660 MHz, 4 cores, 8 logical processors)
it improved latency from 2.23 us to 1.12 us.
Here are the main ideas of the patch:
- by default, the ND provider creates QPs with an inline data size of 400
bytes (this can enlarge the user's QP size);
- this default can be changed by defining the environment variable
IBNDPROV_MAX_INLINE_SIZE;
- an ND application can specify the required INLINE DATA size when creating
a QP; that value takes precedence over the default (a usage sketch follows
below).
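For illustration, here is a minimal sketch (not part of the patch) of how an
ND application could use the now in/out pMaxInlineData argument, assuming the
NDv1 INDConnector::CreateEndpoint signature; pConnector, pInboundCq,
pOutboundCq, the queue depths and the 160-byte request are all hypothetical:

    // Request a specific inline size and read back what the QP grants.
    SIZE_T MaxInlineData = 160;          // requested inline size, in bytes
    INDEndpoint* pEp = NULL;
    HRESULT hr = pConnector->CreateEndpoint(
        pInboundCq, pOutboundCq,         // completion queues
        256, 256,                        // inbound / outbound queue depth
        1, 1,                            // inbound / outbound SGE count
        0, 0,                            // inbound / outbound read limits
        &MaxInlineData,                  // in: requested size; out: size granted
        &pEp );
    // On success, MaxInlineData holds the sq_max_inline value queried from the
    // created QP; sends no larger than that are posted with IB_SEND_OPT_INLINE.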
Index: ulp/nd/user/NdEndpoint.cpp
===================================================================
--- ulp/nd/user/NdEndpoint.cpp (revision 2310)
+++ ulp/nd/user/NdEndpoint.cpp (working copy)
@@ -41,6 +41,8 @@
#pragma warning( pop )
#include "nddebug.h"
+extern uint32_t g_nd_max_inline_size;
+
#if defined(EVENT_TRACING)
#ifdef offsetof
#undef offsetof
@@ -96,7 +98,7 @@
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
__in SIZE_T OutboundReadLimit,
- __out_opt SIZE_T* pMaxInlineData
+ __in_opt __out_opt SIZE_T* pMaxInlineData
)
{
ND_ENTER( ND_DBG_NDI );
@@ -117,10 +119,17 @@
m_pParent->m_Ifc.user_verbs.nd_get_qp_state != NULL ||
m_pParent->m_Ifc.user_verbs.pre_destroy_qp != NULL ||
m_pParent->m_Ifc.user_verbs.post_destroy_qp != NULL ||
+ m_pParent->m_Ifc.user_verbs.post_query_qp != NULL ||
m_pParent->m_Ifc.user_verbs.post_send != NULL ||
m_pParent->m_Ifc.user_verbs.post_recv != NULL /*||
m_pParent->m_Ifc.user_verbs.bind_mw != NULL*/ );
+ UINT32 InlineSize;
+ if ( pMaxInlineData )
+ InlineSize = (UINT32)*pMaxInlineData;
+ else
+ InlineSize = g_nd_max_inline_size;
+
HRESULT hr = CreateQp(
pInboundCq,
pOutboundCq,
@@ -129,13 +138,25 @@
nInboundSge,
nOutboundSge,
InboundReadLimit,
- OutboundReadLimit );
+ OutboundReadLimit,
+ InlineSize );
if( FAILED( hr ) )
return hr;
+ ib_qp_attr_t qp_attr;
+ hr = QueryQp(&qp_attr);
+ if( FAILED( hr ) ) {
+ DestroyQp();
+ return hr;
+ }
+ else
+ InlineSize = (UINT32)qp_attr.sq_max_inline;
+
+
m_Ird = (UINT8)InboundReadLimit;
m_Ord = (UINT8)OutboundReadLimit;
+ m_MaxInlineSize = InlineSize;
// Move the QP to the INIT state so users can post receives.
hr = ModifyQp( IB_QPS_INIT );
@@ -143,10 +164,7 @@
DestroyQp();
if( SUCCEEDED( hr ) && pMaxInlineData != NULL )
- {
- // Worst case.
- *pMaxInlineData = nOutboundSge * 12;
- }
+ *pMaxInlineData = InlineSize;
return hr;
}
@@ -286,7 +304,11 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = WR_SEND;
- wr.send_opt = 0;
+ if ( pResult->BytesTransferred <= m_MaxInlineSize )
+ wr.send_opt = IB_SEND_OPT_INLINE;
+ else
+ wr.send_opt = 0;
+
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -374,11 +396,15 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = WR_SEND;
+ if ( pResult->BytesTransferred <= m_MaxInlineSize )
+ wr.send_opt = IB_SEND_OPT_INLINE;
+ else
+ wr.send_opt = 0;
// We simulate invalidate operations (since we simulate MW use).  We
// put the RKey in the immediate data, the recipient will do the
// lookup of the MW based on that (as they would with a regular
// invalidate request).
- wr.send_opt = IB_SEND_OPT_IMMEDIATE;
+ wr.send_opt |= IB_SEND_OPT_IMMEDIATE;
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -665,7 +691,10 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = Type;
- wr.send_opt = 0;
+ if ( pResult->BytesTransferred <= m_MaxInlineSize )
+ wr.send_opt = IB_SEND_OPT_INLINE;
+ else
+ wr.send_opt = 0;
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,11 +766,14 @@
__in SIZE_T nInboundSge,
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
- __in SIZE_T OutboundReadLimit
+ __in SIZE_T OutboundReadLimit,
+ __in SIZE_T MaxInlineData
)
{
ND_ENTER( ND_DBG_NDI );
+ if( MaxInlineData > UINT_MAX )
+ return ND_INVALID_PARAMETER_3;
if( nInboundEntries > UINT_MAX )
return ND_INVALID_PARAMETER_4;
if( nOutboundEntries > UINT_MAX )
@@ -764,6 +796,7 @@
qp_ioctl.in.qp_create.rq_depth = (uint32_t)nInboundEntries;
qp_ioctl.in.qp_create.sq_sge = (uint32_t)nOutboundSge;
qp_ioctl.in.qp_create.rq_sge = (uint32_t)nInboundSge;
+ qp_ioctl.in.qp_create.sq_max_inline = (uint32_t)MaxInlineData;
qp_ioctl.in.qp_create.h_srq = NULL;
qp_ioctl.in.qp_create.sq_signaled = FALSE;
@@ -941,4 +974,67 @@
return S_OK;
}
+HRESULT CEndpoint::QueryQp(
+ __out ib_qp_attr_t *qp_attr
+ )
+{
+ ib_api_status_t status;
+
+ ND_ENTER( ND_DBG_NDI );
+
+ ual_query_qp_ioctl_t qp_ioctl;
+ cl_memclr( &qp_ioctl, sizeof(qp_ioctl) );
+ qp_ioctl.in.h_qp = m_hQp;
+
+ /* Call the uvp pre call if the vendor library provided a valid ca handle */
+ if( m_pParent->m_Ifc.user_verbs.pre_query_qp )
+ {
+ /* Pre call to the UVP library */
+ status = m_pParent->m_Ifc.user_verbs.pre_query_qp( m_uQp, &qp_ioctl.in.umv_buf );
+ if( status != IB_SUCCESS )
+ goto done;
+ }
+
+ DWORD bytes_ret;
+ BOOL fSuccess = DeviceIoControl(
+ m_pParent->m_hSync,
+ UAL_QUERY_QP,
+ &qp_ioctl.in,
+ sizeof(qp_ioctl.in),
+ &qp_ioctl.out,
+ sizeof(qp_ioctl.out),
+ &bytes_ret,
+ NULL
+ );
+
+ if( fSuccess != TRUE || bytes_ret != sizeof(qp_ioctl.out) )
+ status = IB_ERROR;
+ else
+ status = qp_ioctl.out.status;
+
+ /* Call vendor's post_query_qp */
+ CL_ASSERT( m_pParent->m_Ifc.user_verbs.post_query_qp );
+ if( m_pParent->m_Ifc.user_verbs.post_query_qp )
+ {
+ m_pParent->m_Ifc.user_verbs.post_query_qp( m_uQp, status,
+ &qp_ioctl.out.attr, &qp_ioctl.out.umv_buf );
+ }
+
+done:
+ ND_PRINT( TRACE_LEVEL_INFORMATION, ND_DBG_NDI,
+ ("Queried QP %#I64x, QPn %#x, pd %#I64x, context %p, status %#x \n",
+ m_hQp, m_Qpn, m_pParent->m_hPd, this, status ) );
+
+ switch( status )
+ {
+ case IB_SUCCESS:
+ *qp_attr = qp_ioctl.out.attr;
+ return S_OK;
+
+ default:
+ return ND_UNSUCCESSFUL;
+ }
+
+}
+
} // namespace
Index: ulp/nd/user/NdEndpoint.h
===================================================================
--- ulp/nd/user/NdEndpoint.h (revision 2310)
+++ ulp/nd/user/NdEndpoint.h (working copy)
@@ -67,7 +67,7 @@
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
__in SIZE_T OutboundReadLimit,
- __out_opt SIZE_T* pMaxInlineData
+ __in_opt __out_opt SIZE_T* pMaxInlineData
);
public:
@@ -176,11 +176,16 @@
__in SIZE_T nInboundSge,
__in SIZE_T nOutboundSge,
__in SIZE_T InboundReadLimit,
- __in SIZE_T OutboundReadLimit
+ __in SIZE_T OutboundReadLimit,
+ __in SIZE_T MaxInlineData
);
void DestroyQp();
+ HRESULT QueryQp(
+ __out ib_qp_attr_t *qp_attr
+ );
+
HRESULT ModifyQp(
__in ib_qp_state_t NewState
);
@@ -197,6 +202,7 @@
UINT8 m_Ird;
UINT8 m_Ord;
+ UINT32 m_MaxInlineSize;
};
} // namespace
Index: ulp/nd/user/NdProv.cpp
===================================================================
--- ulp/nd/user/NdProv.cpp (revision 2310)
+++ ulp/nd/user/NdProv.cpp (working copy)
@@ -57,6 +57,7 @@
uint32_t g_nd_dbg_level = TRACE_LEVEL_ERROR;
/* WPP doesn't want here literals! */
uint32_t g_nd_dbg_flags = 0x80000001; /* ND_DBG_ERROR | ND_DBG_NDI; */
+uint32_t g_nd_max_inline_size = 400;
HANDLE ghHeap;
@@ -462,6 +463,8 @@
switch( dwReason )
{
case DLL_PROCESS_ATTACH:
+ TCHAR env_var[16];
+ DWORD i;
#if defined(EVENT_TRACING)
@@ -471,9 +474,6 @@
WPP_INIT_TRACING(L"ibndprov.dll");
#endif
#elif DBG
- TCHAR env_var[16];
- DWORD i;
-
i = GetEnvironmentVariable( "IBNDPROV_DBG_LEVEL", env_var, sizeof(env_var) );
if( i && i <= 16 )
{
@@ -494,6 +494,12 @@
GetCurrentProcessId(), g_nd_dbg_level ,g_nd_dbg_flags) );
#endif
+ i = GetEnvironmentVariable( "IBNDPROV_MAX_INLINE_SIZE", env_var, sizeof(env_var) );
+ if( i && i <= 16 )
+ {
+ g_nd_max_inline_size = _tcstoul( env_var, NULL, 16 );
+ }
+
ghHeap = HeapCreate( 0, 0, 0 );
if( ghHeap == NULL )
{
________________________________
From: Tzachi Dar
Sent: Monday, July 20, 2009 6:13 PM
To: Leonid Keller
Subject: FW: Improving latency of ms-mpi
________________________________
From: Tzachi Dar
Sent: Tuesday, July 14, 2009 6:39 PM
To: Fab Tillier
Cc: Ishai Rabinovitz; Gilad Shainer; Leonid Keller; Eric Lantz (HPC
GOFAST)
Subject: Improving latency of ms-mpi
Hi Fab,
I have run some tests on my Nehalem computer (Intel(R) Core(TM) i7 CPU 920 @
2.67 GHz, 2660 MHz, 4 cores, 8 logical processors).
The latency I got for RDMA write operations using ib_write_lat is 1.26 us.
On the same machine, ndrpingpong.exe gave 2.23 us, which is almost 1 us
higher )-:
After some debugging of the system, it seems that the problem comes from
not using inline send. Using inline send reduced the ndrpingpong latency to
1.12 us, which as far as I know is around 200 ns higher than the Clovertown
numbers.
[By the way, it seems that the nd pingpong test is better than ours due
to lazy CQ polling, but this is a different issue.]
Since nothing in the ND provider uses inline send, I guess we can improve
all the MS-MPI results by about 1 us by using inline send.
I have created a very simple patch that demonstrates what has to be done
to allow inline send to work. Please note that the patch is over-simplified
and will always use inline RDMA operations, even when it is not allowed to.
The questions that I see are:
1) How do we pass the max inline parameter when creating the QP? Do we want
to add a new parameter, use the nOutboundSge * 12 number, or use a large
default?
2) How do we decide when to send inline? Do we want to add a new ND_OP_FLAG,
or always send inline when we can (that is, when the message is smaller than
max_inline)? A sketch of the size-based approach follows below.
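One possible answer to question 2, and the one taken in the patch at the top
of this mail, is a simple size check on the send path. A minimal sketch,
assuming m_MaxInlineSize holds the sq_max_inline value that the provider reads
back from the created QP:

    // Post inline only when the payload fits what the QP actually supports.
    if( pResult->BytesTransferred <= m_MaxInlineSize )
        wr.send_opt = IB_SEND_OPT_INLINE;
    else
        wr.send_opt = 0;
    // The remaining flags (e.g. IB_SEND_OPT_SIGNALED, IB_SEND_OPT_IMMEDIATE)
    // are OR-ed in afterwards, so the inline bit is preserved.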
Here is the demo patch I created:
Index: NdEndpoint.cpp
===================================================================
--- NdEndpoint.cpp (revision 4569)
+++ NdEndpoint.cpp (working copy)
@@ -635,7 +635,7 @@
wr.p_next = NULL;
wr.wr_id = (ULONG_PTR)pResult;
wr.wr_type = Type;
- wr.send_opt = 0;
+ wr.send_opt = IB_SEND_OPT_INLINE; //????
if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
wr.send_opt |= IB_SEND_OPT_SIGNALED;
if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,6 +737,8 @@
qp_ioctl.in.qp_create.h_srq = NULL;
qp_ioctl.in.qp_create.sq_signaled = FALSE;
+ qp_ioctl.in.qp_create.sq_max_inline = 400;
+
/* Pre call to the UVP library */
CL_ASSERT( m_pParent->m_Ifc.user_verbs.pre_create_qp );
qp_ioctl.in.qp_create.h_sq_cq = pOutboundCq->m_uCq;
Thanks
Tzachi
Here are the more detailed results:
C:\Users\herod>ib_write_lat -a 11.4.12.128
------------------------------------------------------------------
RDMA_Write Latency Test
Connection type : RC
max inline size 28
local address: LID 0x200, QPN 0x4b000200, PSN 0xe7520000, RKey 0x140004 VAddr 0x00000001630040
remote address: LID 0x100, QPN 0x4a001e00, PSN 0xf6040000, RKey 0x130040 VAddr 0x00000001530040
Mtu : 2048
------------------------------------------------------------------
#bytes #iterations t_min[usec] t_max[usec] t_typical[usec]
2 1000 0.98 4.89 1.26
4 1000 1.12 1.26 1.26
8 1000 0.98 1.40 1.26
16 1000 0.84 31.71 1.26
C:\Users\herod>ib_read_lat -a 11.4.12.128
------------------------------------------------------------------
RDMA_Read Latency Test
max inline size 28
local address: LID 0x200, QPN 0x4a000600, PSN 0xf1560000, RKey 0x13000c VAddr 0x00000001550040
remote address: LID 0x100, QPN 0x4a000400, PSN 0x42390000, RKey 0x13000c VAddr 0x00000001390040
Mtu : 2048
------------------------------------------------------------------
#bytes #iterations t_min[usec] t_max[usec] t_typical[usec]
2 1000 1.40 8.94 2.23
4 1000 1.40 2.79 2.23
8 1000 1.96 2.51 2.23
16 1000 1.68 2.79 2.23
32 1000 1.40 9.22 2.23
64 1000 1.40 3.07 2.23
128 1000 2.23 2.79 2.23
256 1000 1.96 2.79 2.51
512 1000 1.96 3.07 2.79
1024 1000 2.51 3.63 3.07
2048 1000 3.07 32.69 3.91
4096 1000 3.91 5.31 4.75
8192 1000 5.87 6.70 6.15
16384 1000 5.31 26.82 8.66
32768 1000 12.29 15.37 14.25
65536 1000 22.63 42.46 24.86
131072 1000 43.30 88.56 46.10
262144 1000 84.09 152.53 88.84
524288 1000 169.02 236.90 174.32
1048576 1000 342.22 382.45 345.02
2097152 1000 653.99 706.51 686.96
4194304 1000 1364.70 1408.00 1370.57
8388608 1000 2736.10 2765.71 2738.62
------------------------------------------------------------------
C:\Users\herod>ib_Send_lat -a 11.4.12.128
------------------------------------------------------------------
Send Latency Test
Inline data is used up to 400 bytes message
Connection type : RC
test
local address: LID 0x200, QPN 0x4a000800, PSN 0x7a600000, RKey 0x130010 VAddr 0x00000001510040
remote address: LID 0x100, QPN 0x4a000600, PSN 0x6c6e0000, RKey 0x130010 VAddr 0x00000001570040
Mtu : 2048
------------------------------------------------------------------
#bytes #iterations t_min[usec] t_max[usec] t_typical[usec]
2 1000 1.26 20.95 1.40
4 1000 1.26 5.17 1.40
8 1000 1.12 5.03 1.40
C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
1 100000 3.07 106.77 325414
2 100000 3.07 106.77 650829
4 100000 3.07 106.77 1301659
8 100000 3.07 109.31 2603319
16 100000 3.07 106.77 5206638
32 100000 3.07 109.31 10413276
C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndrpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
1 100000 2.23 101.39 447627
2 100000 2.23 101.39 895255
4 100000 2.23 101.39 1790510
8 100000 2.23 101.39 3581020
16 100000 2.23 101.39 7162041
32 100000 2.23 108.37 14324082
64 100000 2.23 104.88 28648164
128 100000 2.23 111.91 57296329
256 100000 2.51 99.44 101829753
512 100000 2.51 111.85 203659506
1024 100000 3.07 106.77 333224861
2048 100000 3.91 105.86 523651240
4096 100000 4.75 101.98 862497367
8192 100000 6.15 100.41 1332899446
16384 100000 8.66 102.84 1891916859
32768 64000 13.97 101.37 2345933562
65536 32000 24.86 102.12 2635884647
And after moving to inline send ...
Using 8 processors. Sender Frequency is 3579545
1 100000 1.12 111.91 895255
2 100000 1.12 118.89 1790510
4 100000 1.12 118.89 3581020
8 100000 1.12 111.91 7162041
16 100000 1.12 118.89 14324082
32 100000 1.12 125.87 28648164
64 100000 1.40 106.30 45845272
128 100000 1.40 111.89 91690544
256 100000 2.23 101.39 114592658