[ofw] [Patch 56/62] Reference implementation of NDv2

Fab Tillier ftillier at microsoft.com
Wed Feb 20 21:08:33 PST 2013


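IBAL: Add exponential backoff to SA queries when the specified timeout is negative.  The magnitude of the negative timeout is split into two 16-bit halves: the upper half is the maximum delay, which caps the exponential backoff, and the lower half is the starting delay.  If the upper half is zero, the cap defaults to SHRT_MAX.  The delay is doubled after each attempt until it reaches the cap.  A random (per host) jitter of up to half the timeout is added to the timeout so that concurrent queries (as you might see with an MPI all-to-all) don't flood the SA.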

Signed-off-by: Fab Tillier <ftillier at microsoft.com>

diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\al_mad.c .\core\al\al_mad.c
--- \dev\openib\Mellanox\11011\core\al\al_mad.c	Thu Sep 20 17:51:04 2012
+++ .\core\al\al_mad.c	Tue Oct 09 08:49:38 2012
@@ -32,6 +32,7 @@
 #include <iba/ib_al.h>
 #include <complib/cl_byteswap.h>
 #include <complib/cl_timer.h>
+#include <limits.h>
 
 #include "al.h"
 #include "al_debug.h"
@@ -178,9 +179,10 @@ __cleanup_mad_send(
 	IN				ib_mad_send_handle_t		h_send,
 	IN				uint16_t					ctx);
 
-static __inline void
+static __inline int
 __set_retry_time(
-	IN				ib_mad_send_handle_t		h_send );
+	IN				ib_mad_send_handle_t		h_send,
+    IN              ULONG                       send_jitter );
 
 static void
 __mad_svc_send_done(
@@ -1071,6 +1073,7 @@ reg_mad_svc(
 	ib_mad_svc_handle_t	h_mad_svc;
 	al_qp_alias_t		*p_qp_alias;
 	ib_qp_attr_t		qp_attr;
+    static ULONG        seed = 0;
 
 	AL_ENTER( AL_DBG_MAD_SVC );
 	CL_ASSERT( h_qp );
@@ -1108,6 +1111,14 @@ reg_mad_svc(
 	cl_qlist_init( &h_mad_svc->send_list );
 	cl_qlist_init( &h_mad_svc->recv_list );
 
+    if( seed == 0 )
+    {
+        seed = (ULONG)(ULONG_PTR)p_mad_svc;
+    }
+#ifdef CL_KERNEL
+    h_mad_svc->send_jitter = RtlRandomEx( &seed );
+#endif
+
 	p_qp_alias = PARENT_STRUCT( h_qp, al_qp_alias_t, qp );
 	h_mad_svc->svc_type = p_mad_svc->svc_type;
 	h_mad_svc->obj.context = p_mad_svc->mad_svc_context;
@@ -1967,9 +1976,8 @@ __mad_svc_send_done(
 				("waiting for response for TID:0x%I64x\n",
 				__get_send_tid( h_send )) );
 
-			__set_retry_time( h_send );
 			cl_timer_trim( &h_mad_svc->send_timer,
-				h_send->p_send_mad->timeout_ms );
+				__set_retry_time( h_send, h_mad_svc->send_jitter ) );
 		}
 		cl_spinlock_release( &h_mad_svc->obj.lock );
 	}
@@ -2962,14 +2972,51 @@ __process_rmpp_nack(
 
 
 
-static __inline void
+static __inline int
 __set_retry_time(
-	IN				ib_mad_send_handle_t		h_send )
+	IN				ib_mad_send_handle_t		h_send,
+    IN              ULONG                       send_jitter )
 {
-	h_send->retry_time =
-		(uint64_t)(h_send->p_send_mad->timeout_ms + h_send->delay) * 1000Ui64 +
-		cl_get_time_stamp();
+    int timeout = (int)h_send->p_send_mad->timeout_ms;
+
+    //
+    // Negative values indicate recursive doubling.
+    //
+    if( timeout < 0 )
+    {
+        int max;
+        timeout = -timeout;
+        max = timeout >> 16;
+        timeout &= 0xFFFFUL;
+
+        if( max == 0 )
+        {
+            max = SHRT_MAX;
+        }
+
+        if( (timeout * 2) <= max )
+        {
+            //
+            // Double the timeout for the next iteration.
+            //
+            h_send->p_send_mad->timeout_ms = (ULONG)-((max << 16) | (timeout * 2));
+        }
+        else
+        {
+            h_send->p_send_mad->timeout_ms = (ULONG)-((max << 16) | max);
+        }
+    }
+
+    //
+    // Add some jitter, random number between 0 and 1/2 timeout.
+    // Note that this is in microseconds and not milliseconds.
+    //
+    timeout += (send_jitter % timeout) / 2;
+    timeout += h_send->delay;
+
+	h_send->retry_time = (uint64_t)(timeout) * 1000Ui64 + cl_get_time_stamp();
 	h_send->delay = 0;
+    return timeout;
 }
 
 
@@ -3076,9 +3123,8 @@ __check_send_queue(
 				else
 				{
 					/* The send was delivered.  Continue waiting. */
-					__set_retry_time( h_send );
 					cl_timer_trim( &h_mad_svc->send_timer,
-						((uint32_t)(h_send->retry_time - cur_time) / 1000) );
+						__set_retry_time( h_send, h_mad_svc->send_jitter ) );
 				}
 			}
 			else
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\al_mad.h .\core\al\al_mad.h
--- \dev\openib\Mellanox\11011\core\al\al_mad.h	Thu Sep 20 17:51:04 2012
+++ .\core\al\al_mad.h	Thu Oct 04 14:19:36 2012
@@ -148,6 +148,7 @@ typedef struct _al_mad_svc
 
 	cl_qlist_t					send_list;
 	cl_timer_t					send_timer;
+    ULONG                       send_jitter;
 
 	cl_qlist_t					recv_list;
 	cl_timer_t					recv_timer;
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c .\core\al\kernel\al_cm_cep.c
--- \dev\openib\Mellanox\11011\core\al\kernel\al_cm_cep.c	Thu Sep 20 17:51:02 2012
+++ .\core\al\kernel\al_cm_cep.c	Tue Oct 16 14:53:19 2012
@@ -36,6 +36,7 @@
 #include <complib/cl_spinlock.h>
 #include <iba/ib_al_ifc.h>
 #include <iba/ib_cm_ifc.h>
+#include <limits.h>
 #include "al_common.h"
 #include "al_cm_cep.h"
 #include "al_cm_conn.h"
@@ -3612,7 +3613,7 @@ __calc_mad_timeout(
 	 * trap exceedingly large values to prevent wrapping.
 	 */
 	if( pkt_life > 39 )
-		return ~0UL;
+		return INT_MAX;
 	if( pkt_life > 14 )
 		return 67 << (pkt_life - 14);
 	else if( pkt_life > 8 )
diff -dwup3 -X excl.txt -I ^ \*$ -I ^ \* \$ -r \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h .\inc\kernel\complib\cl_types_osd.h
--- \dev\openib\Mellanox\11011\inc\kernel\complib\cl_types_osd.h	Thu Sep 20 17:51:06 2012
+++ .\inc\kernel\complib\cl_types_osd.h	Wed Sep 19 14:54:03 2012
@@ -49,7 +49,10 @@ extern "C"
 #define CL_NTDDK
 #endif /* NDIS_WDM */
 #elif !defined( _MINIPORT_ )
+#ifndef _NTDDK_
+#include <ntifs.h>
 #include <ntddk.h>
+#endif
 #define CL_NTDDK
 #endif	/* defined( NDIS_MINIPORT_DRIVER ) */
 #pragma warning( pop )
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ndv2.56.patch
Type: application/octet-stream
Size: 5400 bytes
Desc: ndv2.56.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20130221/abf274ad/attachment.obj>


More information about the ofw mailing list