[ofw][patch][ND provider] Improving latency of ms-mpi

Leonid Keller leonid at mellanox.co.il
Mon Jul 27 01:27:04 PDT 2009


This patch adds use of the INLINE DATA facility of Mellanox HCAs to
improve the latency of the ND provider.
For example, on our Nehalem machine (Intel(R) Core(TM) i7 CPU 920 @
2.67 GHz, 2660 MHz, 4 cores, 8 logical processors) it improved latency
from 2.23 us to 1.12 us.
 
The main ideas of the patch are:
    - by default, the ND provider creates the QP with an inline data size
of 400 bytes (note that this can enlarge the user's QP size);
    - this default can be changed by setting the environment variable
IBNDPROV_MAX_INLINE_SIZE;
    - an ND application can request the INLINE DATA size it needs when
creating the QP; that value takes precedence over the default (see the
sketch below).
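
For illustration, here is a rough sketch of the application side, assuming
the NetworkDirect v1 INDConnector::CreateEndpoint call; the variable names
are placeholders and the exact parameter list may differ in your headers:

// Sketch, not part of the patch.  pMaxInlineData is now in/out: the caller
// may request a specific INLINE DATA size, and on return the parameter
// holds the inline size the created QP actually supports.
SIZE_T MaxInlineData = 256;     // requested inline threshold, in bytes
INDEndpoint* pEndpoint = NULL;

HRESULT hr = pConnector->CreateEndpoint(
    pInboundCq, pOutboundCq,
    nInboundEntries, nOutboundEntries,
    nInboundSge, nOutboundSge,
    InboundReadLimit, OutboundReadLimit,
    &MaxInlineData,             // in: requested size; out: size granted
    &pEndpoint );
if( SUCCEEDED( hr ) )
{
    // Sends of at most MaxInlineData bytes are posted by the provider
    // with IB_SEND_OPT_INLINE.
}

Passing NULL for pMaxInlineData keeps the provider default (400 bytes, or
the IBNDPROV_MAX_INLINE_SIZE override).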
 
 
Index: ulp/nd/user/NdEndpoint.cpp
===================================================================
--- ulp/nd/user/NdEndpoint.cpp (revision 2310)
+++ ulp/nd/user/NdEndpoint.cpp (working copy)
@@ -41,6 +41,8 @@
 #pragma warning( pop )
 #include "nddebug.h"
 
+extern uint32_t   g_nd_max_inline_size;
+
 #if defined(EVENT_TRACING)
 #ifdef offsetof
 #undef offsetof
@@ -96,7 +98,7 @@
     __in SIZE_T nOutboundSge,
     __in SIZE_T InboundReadLimit,
     __in SIZE_T OutboundReadLimit,
-    __out_opt SIZE_T* pMaxInlineData
+    __in_opt __out_opt SIZE_T* pMaxInlineData
     )
 {
     ND_ENTER( ND_DBG_NDI );
@@ -117,10 +119,17 @@
         m_pParent->m_Ifc.user_verbs.nd_get_qp_state != NULL ||
         m_pParent->m_Ifc.user_verbs.pre_destroy_qp != NULL ||
         m_pParent->m_Ifc.user_verbs.post_destroy_qp != NULL ||
+        m_pParent->m_Ifc.user_verbs.post_query_qp != NULL ||
         m_pParent->m_Ifc.user_verbs.post_send != NULL ||
         m_pParent->m_Ifc.user_verbs.post_recv != NULL /*||
         m_pParent->m_Ifc.user_verbs.bind_mw != NULL*/ );
 
+    UINT32 InlineSize;
+    if( pMaxInlineData )
+        InlineSize = (UINT32)*pMaxInlineData;
+    else
+        InlineSize = g_nd_max_inline_size;
+
     HRESULT hr = CreateQp(
         pInboundCq,
         pOutboundCq,
@@ -129,13 +138,25 @@
         nInboundSge,
         nOutboundSge,
         InboundReadLimit,
-        OutboundReadLimit );
+        OutboundReadLimit,
+        InlineSize );
 
     if( FAILED( hr ) )
         return hr;
 
+    ib_qp_attr_t qp_attr;
+    hr = QueryQp( &qp_attr );
+    if( FAILED( hr ) )
+    {
+        DestroyQp();
+        return hr;
+    }
+    else
+        InlineSize = (UINT32)qp_attr.sq_max_inline;
+
     m_Ird = (UINT8)InboundReadLimit;
     m_Ord = (UINT8)OutboundReadLimit;
+    m_MaxInlineSize = InlineSize;
 
     // Move the QP to the INIT state so users can post receives.
     hr = ModifyQp( IB_QPS_INIT );
@@ -143,10 +164,7 @@
         DestroyQp();
 
     if( SUCCEEDED( hr ) && pMaxInlineData != NULL )
-    {
-        // Worst case.
-        *pMaxInlineData = nOutboundSge * 12;
-    }
+        *pMaxInlineData = InlineSize;
 
     return hr;
 }
@@ -286,7 +304,11 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = WR_SEND;
-    wr.send_opt = 0;
+    if( pResult->BytesTransferred <= m_MaxInlineSize )
+        wr.send_opt = IB_SEND_OPT_INLINE;
+    else
+        wr.send_opt = 0;
+
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -374,11 +396,15 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = WR_SEND;
+    if( pResult->BytesTransferred <= m_MaxInlineSize )
+        wr.send_opt = IB_SEND_OPT_INLINE;
+    else
+        wr.send_opt = 0;
     // We simulate invalidate operations (since we simulate MW use).  We
     // put the RKey in the immediate data, the recipient will do the
     // lookup of the MW based on that (as they would with a regular
     // invalidate request).
-    wr.send_opt = IB_SEND_OPT_IMMEDIATE;
+    wr.send_opt |= IB_SEND_OPT_IMMEDIATE;
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -665,7 +691,10 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = Type;
-    wr.send_opt = 0;
+    if( pResult->BytesTransferred <= m_MaxInlineSize )
+        wr.send_opt = IB_SEND_OPT_INLINE;
+    else
+        wr.send_opt = 0;
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,11 +766,14 @@
     __in SIZE_T nInboundSge,
     __in SIZE_T nOutboundSge,
     __in SIZE_T InboundReadLimit,
-    __in SIZE_T OutboundReadLimit
+    __in SIZE_T OutboundReadLimit,
+    __in SIZE_T MaxInlineData
     )
 {
     ND_ENTER( ND_DBG_NDI );
 
+    if( MaxInlineData > UINT_MAX )
+        return ND_INVALID_PARAMETER_3;
     if( nInboundEntries > UINT_MAX )
         return ND_INVALID_PARAMETER_4;
     if( nOutboundEntries > UINT_MAX )
@@ -764,6 +796,7 @@
     qp_ioctl.in.qp_create.rq_depth = (uint32_t)nInboundEntries;
     qp_ioctl.in.qp_create.sq_sge = (uint32_t)nOutboundSge;
     qp_ioctl.in.qp_create.rq_sge = (uint32_t)nInboundSge;
+    qp_ioctl.in.qp_create.sq_max_inline = (uint32_t)MaxInlineData;
     qp_ioctl.in.qp_create.h_srq = NULL;
     qp_ioctl.in.qp_create.sq_signaled = FALSE;
 
@@ -941,4 +974,67 @@
     return S_OK;
 }
 
+HRESULT CEndpoint::QueryQp(
+    __out ib_qp_attr_t *qp_attr
+    )
+{
+    ib_api_status_t status;
+
+    ND_ENTER( ND_DBG_NDI );
+
+    ual_query_qp_ioctl_t qp_ioctl;
+    cl_memclr( &qp_ioctl, sizeof(qp_ioctl) );
+    qp_ioctl.in.h_qp = m_hQp;
+
+    /* Call the uvp pre call if the vendor library provided a valid ca handle */
+    if( m_pParent->m_Ifc.user_verbs.pre_query_qp )
+    {
+        /* Pre call to the UVP library */
+        status = m_pParent->m_Ifc.user_verbs.pre_query_qp( m_uQp, &qp_ioctl.in.umv_buf );
+        if( status != IB_SUCCESS )
+            goto done;
+    }
+
+    DWORD bytes_ret;
+    BOOL fSuccess = DeviceIoControl(
+        m_pParent->m_hSync,
+        UAL_QUERY_QP,
+        &qp_ioctl.in,
+        sizeof(qp_ioctl.in),
+        &qp_ioctl.out,
+        sizeof(qp_ioctl.out),
+        &bytes_ret,
+        NULL
+        );
+
+    if( fSuccess != TRUE || bytes_ret != sizeof(qp_ioctl.out) )
+        status = IB_ERROR;
+    else
+        status = qp_ioctl.out.status;
+
+    /* Call vendor's post_query_qp */
+    CL_ASSERT( m_pParent->m_Ifc.user_verbs.post_query_qp );
+    if( m_pParent->m_Ifc.user_verbs.post_query_qp )
+    {
+        m_pParent->m_Ifc.user_verbs.post_query_qp( m_uQp, status,
+            &qp_ioctl.out.attr, &qp_ioctl.out.umv_buf );
+    }
+
+done:
+    ND_PRINT( TRACE_LEVEL_INFORMATION, ND_DBG_NDI,
+        ("Queried QP %#I64x, QPn %#x, pd %#I64x, context %p, status %#x \n",
+        m_hQp, m_Qpn, m_pParent->m_hPd, this, status ) );
+
+    switch( status )
+    {
+    case IB_SUCCESS:
+        *qp_attr = qp_ioctl.out.attr;
+        return S_OK;
+
+    default:
+        return ND_UNSUCCESSFUL;
+    }
+
+}
+
 } // namespace
Index: ulp/nd/user/NdEndpoint.h
===================================================================
--- ulp/nd/user/NdEndpoint.h (revision 2310)
+++ ulp/nd/user/NdEndpoint.h (working copy)
@@ -67,7 +67,7 @@
         __in SIZE_T nOutboundSge,
         __in SIZE_T InboundReadLimit,
         __in SIZE_T OutboundReadLimit,
-        __out_opt SIZE_T* pMaxInlineData
+        __in_opt __out_opt SIZE_T* pMaxInlineData
         );
 
 public:
@@ -176,11 +176,16 @@
         __in SIZE_T nInboundSge,
         __in SIZE_T nOutboundSge,
         __in SIZE_T InboundReadLimit,
-        __in SIZE_T OutboundReadLimit
+        __in SIZE_T OutboundReadLimit,
+        __in SIZE_T MaxInlineData
         );
 
     void DestroyQp();
 
+    HRESULT QueryQp(
+        __out ib_qp_attr_t *qp_attr
+        );
+
     HRESULT ModifyQp(
         __in ib_qp_state_t NewState
         );
@@ -197,6 +202,7 @@
 
     UINT8 m_Ird;
     UINT8 m_Ord;
+    UINT32 m_MaxInlineSize;
 };
 
 } // namespace
Index: ulp/nd/user/NdProv.cpp
===================================================================
--- ulp/nd/user/NdProv.cpp (revision 2310)
+++ ulp/nd/user/NdProv.cpp (working copy)
@@ -57,6 +57,7 @@
 uint32_t g_nd_dbg_level = TRACE_LEVEL_ERROR;
 /* WPP doesn't want here literals! */
 uint32_t g_nd_dbg_flags = 0x80000001; /* ND_DBG_ERROR | ND_DBG_NDI; */
+uint32_t g_nd_max_inline_size = 400;
 
 HANDLE ghHeap;
 
@@ -462,6 +463,8 @@
     switch( dwReason )
     {
     case DLL_PROCESS_ATTACH:
+        TCHAR    env_var[16];
+        DWORD    i;
 
 
 #if defined(EVENT_TRACING)
@@ -471,9 +474,6 @@
         WPP_INIT_TRACING(L"ibndprov.dll");
 #endif
 #elif DBG 
-        TCHAR    env_var[16];
-        DWORD    i;
-
         i = GetEnvironmentVariable( "IBNDPROV_DBG_LEVEL", env_var, sizeof(env_var) );
         if( i && i <= 16 )
         {
@@ -494,6 +494,12 @@
             GetCurrentProcessId(), g_nd_dbg_level ,g_nd_dbg_flags) );
 #endif
 
+        i = GetEnvironmentVariable( "IBNDPROV_MAX_INLINE_SIZE", env_var, sizeof(env_var) );
+        if( i && i <= 16 )
+        {
+            g_nd_max_inline_size = _tcstoul( env_var, NULL, 16 );
+        }
+
         ghHeap = HeapCreate( 0, 0, 0 );
         if( ghHeap == NULL )
         {
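
For completeness, a minimal usage sketch (not part of the patch) of
overriding the 400-byte default before ibndprov.dll is loaded; note that
the hunk above parses the value with _tcstoul( env_var, NULL, 16 ), i.e.
as a hexadecimal string:

#include <windows.h>
#include <tchar.h>

int _tmain()
{
    // Must be set before the provider reads it at DLL_PROCESS_ATTACH;
    // "100" is parsed as hex, i.e. 0x100 = 256 bytes of inline data.
    SetEnvironmentVariable( _T("IBNDPROV_MAX_INLINE_SIZE"), _T("100") );

    // ... initialize ND / launch the MPI job that should pick up the
    //     new default ...
    return 0;
}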

 
 

________________________________

From: Tzachi Dar 
Sent: Monday, July 20, 2009 6:13 PM
To: Leonid Keller
Subject: FW: Improving latency of ms-mpi


 

________________________________

From: Tzachi Dar 
Sent: Tuesday, July 14, 2009 6:39 PM
To: Fab Tillier
Cc: Ishai Rabinovitz; Gilad Shainer; Leonid Keller; Eric Lantz (HPC
GOFAST)
Subject: Improving latency of ms-mpi


Hi Fab,
 
I have run some tests on my Nehalem computer (Intel(R) Core(TM) i7 CPU
920 @ 2.67 GHz, 2660 MHz, 4 cores, 8 logical processors).
 
It seems that the latency I got for RDMA operations using ib_write_lat
is 1.26 us.
 
On the same machine, ndrpingpong.exe gave 2.23 us, which is almost 1 us
higher )-: 
 
After some debugging of the system, it seems that the problem comes from
not using inline send. Using inline send reduced the ndrpingpong latency
to 1.12 us, which, as far as I know, is around 200 ns higher than the
Clovertown numbers.
 
[By the way, it seems that the nd pingpong test is better than ours due
to lazy CQ polling, but this is a different issue.]
 
Since nothing in the ND provider currently uses inline send, I guess we
can improve all the MS-MPI results by about 1 us by using inline send.
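
For clarity, this is roughly what the send post looks like when inline is
used (ib_send_wr_t fields as in the patches in this thread; nBytes and
MaxInlineSize are placeholder names for the message length and the inline
size reported at QP creation):

ib_send_wr_t wr;
cl_memclr( &wr, sizeof(wr) );           // zero the work request
wr.wr_type = WR_SEND;
if( nBytes <= MaxInlineSize )
    wr.send_opt = IB_SEND_OPT_INLINE;   // payload is copied into the WQE,
                                        // no DMA read of the registered buffer
else
    wr.send_opt = 0;                    // normal gather via the SGE list
wr.send_opt |= IB_SEND_OPT_SIGNALED;    // request a completion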
 
I have created a very simple patch that demonstrates what has to be done
to make inline send work. Please note that the patch is oversimplified
and will always use inline RDMA operations, even when it is not allowed
to.
 
The questions that I see are:
1) How do we pass the max-inline parameter when creating the QP? Do we
want to add a new parameter, do we want to use the SGE * 12 number, or
do we want a big default?
 
2) How do we decide when to send inline? Do we want to add a new
ND_OP_FLAG, or do we want to always send inline when we can (that is,
when the message is smaller than max_inline)?
 
Here is the patch created:
Index: NdEndpoint.cpp
===================================================================
--- NdEndpoint.cpp (revision 4569)
+++ NdEndpoint.cpp (working copy)
@@ -635,7 +635,7 @@
     wr.p_next = NULL;
     wr.wr_id = (ULONG_PTR)pResult;
     wr.wr_type = Type;
-    wr.send_opt = 0;
+    wr.send_opt = IB_SEND_OPT_INLINE; //????
     if( !(Flags & ND_OP_FLAG_SILENT_SUCCESS) )
         wr.send_opt |= IB_SEND_OPT_SIGNALED;
     if( Flags & ND_OP_FLAG_READ_FENCE )
@@ -737,6 +737,8 @@
     qp_ioctl.in.qp_create.h_srq = NULL;
     qp_ioctl.in.qp_create.sq_signaled = FALSE;
 
+    qp_ioctl.in.qp_create.sq_max_inline = 400;
+
     /* Pre call to the UVP library */
     CL_ASSERT( m_pParent->m_Ifc.user_verbs.pre_create_qp );
     qp_ioctl.in.qp_create.h_sq_cq = pOutboundCq->m_uCq;

 
Thanks
Tzachi
 
Here are the more detailed results:
 
C:\Users\herod>ib_write_lat -a 11.4.12.128
------------------------------------------------------------------
                    RDMA_Write Latency Test
Connection type : RC
max inline size 28
  local address:  LID 0x200, QPN 0x4b000200, PSN 0xe7520000, RKey 0x140004 VAddr 0x00000001630040
  remote address: LID 0x100, QPN 0x4a001e00, PSN 0xf6040000, RKey 0x130040 VAddr 0x00000001530040
Mtu : 2048
------------------------------------------------------------------
 #bytes #iterations    t_min[usec]    t_max[usec]  t_typical[usec]
      2        1000           0.98           4.89             1.26
      4        1000           1.12           1.26             1.26
      8        1000           0.98           1.40             1.26
     16        1000           0.84          31.71             1.26
 
C:\Users\herod>ib_read_lat -a 11.4.12.128
------------------------------------------------------------------
                    RDMA_Read Latency Test
max inline size 28
  local address:  LID 0x200, QPN 0x4a000600, PSN 0xf1560000, RKey 0x13000c VAddr 0x00000001550040
  remote address: LID 0x100, QPN 0x4a000400, PSN 0x42390000, RKey 0x13000c VAddr 0x00000001390040
Mtu : 2048
------------------------------------------------------------------
 #bytes #iterations    t_min[usec]    t_max[usec]  t_typical[usec]
      2        1000           1.40           8.94             2.23
      4        1000           1.40           2.79             2.23
      8        1000           1.96           2.51             2.23
     16        1000           1.68           2.79             2.23
     32        1000           1.40           9.22             2.23
     64        1000           1.40           3.07             2.23
    128        1000           2.23           2.79             2.23
    256        1000           1.96           2.79             2.51
    512        1000           1.96           3.07             2.79
   1024        1000           2.51           3.63             3.07
   2048        1000           3.07          32.69             3.91
   4096        1000           3.91           5.31             4.75
   8192        1000           5.87           6.70             6.15
  16384        1000           5.31          26.82             8.66
  32768        1000          12.29          15.37            14.25
  65536        1000          22.63          42.46            24.86
 131072        1000          43.30          88.56            46.10
 262144        1000          84.09         152.53            88.84
 524288        1000         169.02         236.90           174.32
1048576        1000         342.22         382.45           345.02
2097152        1000         653.99         706.51           686.96
4194304        1000        1364.70        1408.00          1370.57
8388608        1000        2736.10        2765.71          2738.62
------------------------------------------------------------------
 
 
C:\Users\herod>ib_Send_lat -a 11.4.12.128
------------------------------------------------------------------
                    Send Latency Test
Inline data is used up to 400 bytes message
Connection type : RC
test
  local address:  LID 0x200, QPN 0x4a000800, PSN 0x7a600000, RKey 0x130010 VAddr 0x00000001510040
  remote address: LID 0x100, QPN 0x4a000600, PSN 0x6c6e0000, RKey 0x130010 VAddr 0x00000001570040
Mtu : 2048
------------------------------------------------------------------
 #bytes #iterations    t_min[usec]    t_max[usec]  t_typical[usec]
      2        1000           1.26          20.95             1.40
      4        1000           1.26           5.17             1.40
      8        1000           1.12           5.03             1.40
 
 
C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
         1    100000      3.07  106.77      325414
         2    100000      3.07  106.77      650829
         4    100000      3.07  106.77     1301659
         8    100000      3.07  109.31     2603319
        16    100000      3.07  106.77     5206638
        32    100000      3.07  109.31    10413276
 
 
C:\Users\herod>q:\projinf4\trunk\bin\user\objfre_wlh_amd64\amd64\ndrpingpong.exe c 11.4.12.128 5000 p1
Using 8 processors. Sender Frequency is 3579545
         1    100000      2.23  101.39      447627
         2    100000      2.23  101.39      895255
         4    100000      2.23  101.39     1790510
         8    100000      2.23  101.39     3581020
        16    100000      2.23  101.39     7162041
        32    100000      2.23  108.37    14324082
        64    100000      2.23  104.88    28648164
       128    100000      2.23  111.91    57296329
       256    100000      2.51   99.44   101829753
       512    100000      2.51  111.85   203659506
      1024    100000      3.07  106.77   333224861
      2048    100000      3.91  105.86   523651240
      4096    100000      4.75  101.98   862497367
      8192    100000      6.15  100.41  1332899446
     16384    100000      8.66  102.84  1891916859
     32768     64000     13.97  101.37  2345933562
     65536     32000     24.86  102.12  2635884647
 
 
And after moving to inline send ...
Using 8 processors. Sender Frequency is 3579545
         1    100000      1.12  111.91      895255
         2    100000      1.12  118.89     1790510
         4    100000      1.12  118.89     3581020
         8    100000      1.12  111.91     7162041
        16    100000      1.12  118.89    14324082
        32    100000      1.12  125.87    28648164
        64    100000      1.40  106.30    45845272
       128    100000      1.40  111.89    91690544
       256    100000      2.23  101.39   114592658
 
 