[ofw] [Patch 56/62] Reference implementation of NDv2
Fab Tillier
ftillier at microsoft.com
Wed Feb 20 21:08:33 PST 2013
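IBAL: Add exponential backoff to SA queries when the specified timeout is negative. The magnitude of a negative timeout is split into two 16-bit fields: the upper 16 bits are the maximum delay, so that the exponential backoff is capped (a value of 0 selects SHRT_MAX), and the lower 16 bits are the starting delay, which is doubled on each retry until the cap is reached. A random jitter of up to half the timeout, drawn once per MAD service when it is registered, is added to the timeout so that concurrent queries (as you might see with an MPI all-to-all) don't flood the SA.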
Signed-off-by: Fab Tillier <ftillier at microsoft.com>
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\al_mad.c .\core\al\al_mad.c
--- \dev\openib\Mellanox\11011\core\al\al_mad.c Thu Sep 20 17:51:04 2012
+++ .\core\al\al_mad.c Tue Oct 09 08:49:38 2012
@@ -32,6 +32,7 @@
#include <iba/ib_al.h>
#include <complib/cl_byteswap.h>
#include <complib/cl_timer.h>
+#include <limits.h>
#include "al.h"
#include "al_debug.h"
@@ -178,9 +179,10 @@ __cleanup_mad_send(
IN ib_mad_send_handle_t h_send,
IN uint16_t ctx);
-static __inline void
+static __inline int
__set_retry_time(
- IN ib_mad_send_handle_t h_send );
+ IN ib_mad_send_handle_t h_send,
+ IN ULONG send_jitter );
static void
__mad_svc_send_done(
@@ -1071,6 +1073,7 @@ reg_mad_svc(
ib_mad_svc_handle_t h_mad_svc;
al_qp_alias_t *p_qp_alias;
ib_qp_attr_t qp_attr;
+ static ULONG seed = 0;
AL_ENTER( AL_DBG_MAD_SVC );
CL_ASSERT( h_qp );
@@ -1108,6 +1111,14 @@ reg_mad_svc(
cl_qlist_init( &h_mad_svc->send_list );
cl_qlist_init( &h_mad_svc->recv_list );
+ if( seed == 0 )
+ {
+ seed = (ULONG)(ULONG_PTR)p_mad_svc;
+ }
+#ifdef CL_KERNEL
+ h_mad_svc->send_jitter = RtlRandomEx( &seed );
+#endif
+
p_qp_alias = PARENT_STRUCT( h_qp, al_qp_alias_t, qp );
h_mad_svc->svc_type = p_mad_svc->svc_type;
h_mad_svc->obj.context = p_mad_svc->mad_svc_context;
@@ -1967,9 +1976,8 @@ __mad_svc_send_done(
("waiting for response for TID:0x%I64x\n",
__get_send_tid( h_send )) );
- __set_retry_time( h_send );
cl_timer_trim( &h_mad_svc->send_timer,
- h_send->p_send_mad->timeout_ms );
+ __set_retry_time( h_send, h_mad_svc->send_jitter ) );
}
cl_spinlock_release( &h_mad_svc->obj.lock );
}
@@ -2962,14 +2972,51 @@ __process_rmpp_nack(
-static __inline void
+/*
+ * Computes the retry deadline for a send and returns the interval, in
+ * milliseconds, to pass to cl_timer_trim.
+ *
+ * h_send      - send whose p_send_mad->timeout_ms and delay are consumed.
+ *               retry_time is written, delay is reset to 0, and timeout_ms
+ *               is rewritten with the next backoff step when it encodes
+ *               exponential backoff.
+ * send_jitter - random value used to derive a jitter in the range
+ *               [0, timeout / 2) milliseconds.
+ *
+ * A timeout_ms that is negative when viewed as a signed int requests
+ * exponential backoff: the upper 16 bits of its magnitude hold the maximum
+ * timeout (0 selects SHRT_MAX) and the lower 16 bits hold the current
+ * timeout.  The result is clamped to INT_MAX so it always fits the return
+ * type.
+ */
+static __inline int
__set_retry_time(
- IN ib_mad_send_handle_t h_send )
+ IN ib_mad_send_handle_t h_send,
+ IN ULONG send_jitter )
{
- h_send->retry_time =
- (uint64_t)(h_send->p_send_mad->timeout_ms + h_send->delay) * 1000Ui64 +
- cl_get_time_stamp();
+ ULONG timeout = h_send->p_send_mad->timeout_ms;
+ uint64_t wait_ms;
+
+ //
+ // Negative values (as a signed int) indicate recursive doubling.
+ //
+ if( timeout > (ULONG)INT_MAX )
+ {
+ ULONG max;
+
+ //
+ // Negate in unsigned arithmetic: negating INT_MIN as a signed int
+ // is undefined behavior.
+ //
+ timeout = 0UL - timeout;
+ max = timeout >> 16;
+ timeout &= 0xFFFFUL;
+
+ if( max == 0 )
+ {
+ max = SHRT_MAX;
+ }
+
+ if( (timeout * 2) <= max )
+ {
+ //
+ // Double the timeout for the next iteration.
+ //
+ h_send->p_send_mad->timeout_ms = 0UL - ((max << 16) | (timeout * 2));
+ }
+ else
+ {
+ //
+ // Doubling would exceed the cap; use the cap from now on.
+ //
+ h_send->p_send_mad->timeout_ms = 0UL - ((max << 16) | max);
+ }
+ }
+
+ //
+ // Add some jitter, random number between 0 and 1/2 timeout.
+ // The jitter is in milliseconds, like the timeout itself.  A timeout
+ // of zero gets no jitter, which also avoids a modulo by zero.
+ //
+ wait_ms = timeout;
+ if( timeout != 0 )
+ {
+ wait_ms += (send_jitter % timeout) / 2;
+ }
+ wait_ms += (uint64_t)h_send->delay;
+
+ //
+ // Accumulate in 64 bits and clamp so that a timeout near INT_MAX plus
+ // jitter and delay cannot overflow the signed return value.
+ //
+ if( wait_ms > (uint64_t)INT_MAX )
+ {
+ wait_ms = (uint64_t)INT_MAX;
+ }
+
+ //
+ // retry_time is kept in the units of cl_get_time_stamp, hence the
+ // conversion from milliseconds by a factor of 1000.
+ //
+ h_send->retry_time = wait_ms * 1000Ui64 + cl_get_time_stamp();
h_send->delay = 0;
+ return (int)wait_ms;
}
@@ -3076,9 +3123,8 @@ __check_send_queue(
else
{
/* The send was delivered. Continue waiting. */
- __set_retry_time( h_send );
cl_timer_trim( &h_mad_svc->send_timer,
- ((uint32_t)(h_send->retry_time - cur_time) / 1000) );
+ __set_retry_time( h_send, h_mad_svc->send_jitter ) );
}
}
else
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\al_mad.h .\core\al\al_mad.h
--- \dev\openib\Mellanox\11011\core\al\al_mad.h Thu Sep 20 17:51:04 2012
+++ .\core\al\al_mad.h Thu Oct 04 14:19:36 2012
@@ -148,6 +148,7 @@ typedef struct _al_mad_svc
cl_qlist_t send_list;
cl_timer_t send_timer;
+ ULONG send_jitter;
cl_qlist_t recv_list;
cl_timer_t recv_timer;
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c .\core\al\kernel\al_cm_cep.c
--- \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c Thu Sep 20 17:51:02 2012
+++ .\core\al\kernel\al_cm_cep.c Tue Oct 16 14:53:19 2012
@@ -36,6 +36,7 @@
#include <complib/cl_spinlock.h>
#include <iba/ib_al_ifc.h>
#include <iba/ib_cm_ifc.h>
+#include <limits.h>
#include "al_common.h"
#include "al_cm_cep.h"
#include "al_cm_conn.h"
@@ -3612,7 +3613,7 @@ __calc_mad_timeout(
* trap exceedingly large values to prevent wrapping.
*/
if( pkt_life > 39 )
- return ~0UL;
+ return INT_MAX;
if( pkt_life > 14 )
return 67 << (pkt_life - 14);
else if( pkt_life > 8 )
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h .\inc\kernel\complib\cl_types_osd.h
--- \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h Thu Sep 20 17:51:06 2012
+++ .\inc\kernel\complib\cl_types_osd.h Wed Sep 19 14:54:03 2012
@@ -49,7 +49,10 @@ extern "C"
#define CL_NTDDK
#endif /* NDIS_WDM */
#elif !defined( _MINIPORT_ )
+#ifndef _NTDDK_
+#include <ntifs.h>
#include <ntddk.h>
+#endif
#define CL_NTDDK
#endif /* defined( NDIS_MINIPORT_DRIVER ) */
#pragma warning( pop )
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ndv2.56.patch
Type: application/octet-stream
Size: 5400 bytes
Desc: ndv2.56.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20130221/abf274ad/attachment.obj>
More information about the ofw
mailing list