[ofw] [Patch 21/62] Reference implementation of NDv2

Fab Tillier ftillier at microsoft.com
Wed Feb 20 17:47:28 PST 2013


Add RDMA CM support to CEP manager.

This patch adds extra logic in the CEP manager to enable comparing private data according to the IP Addressing Annex to the IB specification (aka RDMA CM).  The existing IBAL ND provider, as well as WinVerbs, are updated to take advantage of the new functionality.  Rather than doing a memory comparison of the private data, the offset is overloaded to represent a bit mask of fields to compare.  This overload only occurs if the service ID is a valid RDMA CM service ID, otherwise the behavior is unchanged.

Signed-off-by: Fab Tillier <ftillier at microsoft.com>

diff -dwup3 -x *svn* -x *makefile.inc -x *sources -r c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\core\al\kernel\al_cm_cep.c .\core\al\kernel\al_cm_cep.c
--- c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\core\al\kernel\al_cm_cep.c	Fri Aug 03 16:53:03 2012
+++ .\core\al\kernel\al_cm_cep.c	Mon Jul 30 22:23:51 2012
@@ -2700,6 +2700,59 @@ __lookup_by_id(
 }
 
 
+/*
+ * Compare two RDMA CM REQ private data headers for CEP listen matching.
+ *
+ * maj_min_ver and ipv are always compared.  src_port, src_ip_addr and
+ * dst_ip_addr are compared only when the matching IB_REQ_CM_RDMA_CMP_*
+ * bit is set in mask.  Fields are checked in that order and the first
+ * difference decides the result.
+ * Returns zero when every selected field is equal, otherwise the non-zero
+ * field difference (p_cmp1 minus p_cmp2) or the cl_memcmp result.
+ */
+static intn_t
+__cm_rdma_req_cmp(
+	__in UINT64 mask,
+	__in const ib_cm_rdma_req_t* p_cmp1,
+	__in const ib_cm_rdma_req_t* p_cmp2 )
+{
+	intn_t delta;
+
+	/* Unconditional fields: protocol version, then IP version. */
+	if( p_cmp1->maj_min_ver != p_cmp2->maj_min_ver )
+		return (intn_t)p_cmp1->maj_min_ver - (intn_t)p_cmp2->maj_min_ver;
+
+	if( p_cmp1->ipv != p_cmp2->ipv )
+		return (intn_t)p_cmp1->ipv - (intn_t)p_cmp2->ipv;
+
+	/* Mask-selected fields. */
+	if( (mask & IB_REQ_CM_RDMA_CMP_SRC_PORT) != 0 &&
+		p_cmp1->src_port != p_cmp2->src_port )
+	{
+		return (intn_t)p_cmp1->src_port - (intn_t)p_cmp2->src_port;
+	}
+
+	if( (mask & IB_REQ_CM_RDMA_CMP_SRC_IP) != 0 )
+	{
+		delta = cl_memcmp( p_cmp1->src_ip_addr, p_cmp2->src_ip_addr,
+			sizeof(p_cmp2->src_ip_addr) );
+		if( delta != 0 )
+			return delta;
+	}
+
+	if( (mask & IB_REQ_CM_RDMA_CMP_DST_IP) != 0 )
+	{
+		delta = cl_memcmp( p_cmp1->dst_ip_addr, p_cmp2->dst_ip_addr,
+			sizeof(p_cmp2->dst_ip_addr) );
+		if( delta != 0 )
+			return delta;
+	}
+
+	/* TODO: Richer compare options to allow specifying pdata compare */
+	return 0;
+}
+
+
 /*
  * Lookup a CEP by Service ID and private data.
  */
@@ -2746,10 +2799,36 @@ port_cmp:
 pdata_cmp:
 		if( p_cep->p_cmp_buf && p_pdata )
 		{
-			int len = min(p_cep->cmp_len, IB_REQ_PDATA_SIZE - p_cep->cmp_offset);
+			if( ib_cm_is_rdma_cm_sid(sid) )
+			{
+				ib_cm_rdma_req_t *p_rdma_req = (ib_cm_rdma_req_t *)p_pdata;
+				CL_ASSERT(p_cep->cmp_len >= FIELD_OFFSET(ib_cm_rdma_req_t, pdata));
+
+				/* reject connection request with incorrect version parameters */
+				if( ib_cm_is_rdma_cm_req_valid( p_rdma_req ) == FALSE )
+				{
+					AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR, 
+						("RDMA CM connection req is invalid: maj_min_ver %d, ipv %#x \n", 
+						p_rdma_req->maj_min_ver, p_rdma_req->ipv ) );
+					return NULL;
+				}
 			
+				cmp = __cm_rdma_req_cmp(
+					p_cep->cmp_offset,
+					p_rdma_req,
+					(ib_cm_rdma_req_t*)p_cep->p_cmp_buf
+					);
+			}
+			else
+			{
+                /*
+                 * TODO: this check seems to be for catching a malformed listen, and should
+                 * be trapped when the listen is created.  Checking after the fact is dumb.
+                 */
+				int len = min(p_cep->cmp_len, IB_REQ_PDATA_SIZE - p_cep->cmp_offset);
 			cmp = cl_memcmp( &p_pdata[p_cep->cmp_offset],
 				p_cep->p_cmp_buf, len );
+			}
 
 			if( !cmp )
 				goto match;
@@ -3423,26 +3502,6 @@ __cep_queue_mad(
 		return IB_INVALID_STATE;
 	}
 
-	// TODO: Remove - manage above core kernel CM code
-	/* NDI connection request case */
-	if ( p_cep->state == CEP_STATE_LISTEN &&
-		(p_cep->sid & IB_REQ_CM_RDMA_SID_PREFIX_MASK) == IB_REQ_CM_RDMA_SID_PREFIX )
-	{ /* Try to complete pending IRP, if any */
-		mad_cm_req_t* p_req = (mad_cm_req_t*)ib_get_mad_buf( p_mad );
-		ib_cm_rdma_req_t *p_rdma_req = (ib_cm_rdma_req_t *)p_req->pdata;
-
-		/* reject connection request with incorrect version parameters */
-		if ( ((p_rdma_req->maj_min_ver >> 4) != IB_REQ_CM_RDMA_MAJOR_VERSION) ||
-			 ((p_rdma_req->maj_min_ver & 0x0f) > IB_REQ_CM_RDMA_MINOR_VERSION) ||
-			 (p_rdma_req->ipv != 0x40 && p_rdma_req->ipv != 0x60) )
-		{
-			AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR, 
-				("RDMA CM connection req is rejected: maj_min_ver %d, ipv %#x \n", 
-				p_rdma_req->maj_min_ver, p_rdma_req->ipv ) );
-			return IB_UNSUPPORTED;
-		}
-	}
-
 	/* Queue this MAD for processing. */
 	if( p_cep->p_mad_head )
 	{
@@ -4258,6 +4317,7 @@ al_cep_listen(
 	boolean_t			left = TRUE;
 	intn_t				cmp;
 	KLOCK_QUEUE_HANDLE	hdl;
+	ib_cm_rdma_req_t*	p_rdma_req = (ib_cm_rdma_req_t*)p_listen_info->p_cmp_buf;
 
 	AL_PRINT( TRACE_LEVEL_VERBOSE, AL_DBG_CM, ("[ CID = %d\n", cid) );
 
@@ -4291,6 +4351,24 @@ al_cep_listen(
 		goto done;
 	}
 
+	if( ib_cm_is_rdma_cm_sid(p_listen_info->svc_id) && p_rdma_req != NULL )
+	{
+		if( p_listen_info->cmp_len < FIELD_OFFSET(ib_cm_rdma_req_t, pdata) )
+		{
+			status = IB_INVALID_SETTING;
+			goto done;
+		}
+
+		if( ib_cm_is_rdma_cm_req_valid(p_rdma_req) == FALSE )
+		{
+			AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR, 
+				("RDMA CM listen is invalid: maj_min_ver %d, ipv %#x \n", 
+				p_rdma_req->maj_min_ver, p_rdma_req->ipv ) );
+			status = IB_INVALID_SETTING;
+			goto done;
+		}
+	}
+
 	/* Insert the CEP into the listen map. */
 	p_item = cl_rbmap_root( &gp_cep_mgr->listen_map );
 	p_insert_at = p_item;
@@ -4336,17 +4414,26 @@ pdata_cmp:
 		if( p_listen_info->p_cmp_buf )
 		{
 			/* Compare length must match. */
-			//if( p_listen_info->cmp_len != p_listen->cmp_len )
-			//	break;
-
-			/* Compare offset must match. */
-			//if( p_listen_info->cmp_offset != p_listen->cmp_offset )
-			//	break;
+			if( p_listen_info->cmp_len != p_listen->cmp_len )
+				break;
 
-			int len = min(p_listen_info->cmp_len, p_listen->cmp_len);
+			/* Compare offset (or mask for RDMA CM) must match. */
+			if( p_listen_info->cmp_offset != p_listen->cmp_offset )
+				break;
 			
-			cmp = cl_memcmp( p_listen_info->p_cmp_buf,
-				p_listen->p_cmp_buf, len );
+			if( ib_cm_is_rdma_cm_sid(p_listen_info->svc_id) )
+			{
+				/* RDMA CM SID: cmp_offset is a field mask, not a byte offset. */
+				cmp = __cm_rdma_req_cmp( p_listen->cmp_offset, p_rdma_req,
+					(ib_cm_rdma_req_t*)p_listen->p_cmp_buf );
+			}
+			else
+			{
+				/* p_cmp_buf already points at the compare data; passing its
+				 * address would compare the pointer's own bytes instead. */
+				cmp = cl_memcmp( p_listen_info->p_cmp_buf,
+					p_listen->p_cmp_buf, p_listen->cmp_len );
+			}
 
 			if( cmp < 0 )
 				p_item = cl_rbmap_left( p_item ), left = TRUE;
diff -dwup3 -x *svn* -x *makefile.inc -x *sources -r c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\core\winverbs\kernel\wv_ep.c .\core\winverbs\kernel\wv_ep.c
--- c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\core\winverbs\kernel\wv_ep.c	Thu Mar 29 00:15:18 2012
+++ .\core\winverbs\kernel\wv_ep.c	Thu Jul 26 15:31:14 2012
@@ -354,18 +354,20 @@ static void WvFormatCmaHeader(IB_CMA_HEA
 {
 	pHeader->CmaVersion = IB_CMA_VERSION;
 	if (pLocalAddress->SockAddr.Sa.SaFamily == WV_AF_INET) {
-		pHeader->IpVersion = 4 << 4;
+		pHeader->IpVersion = IB_REQ_CM_RDMA_IPV4;
 		RtlZeroMemory(pHeader->SrcAddress.Ip4.Pad, sizeof(pHeader->SrcAddress.Ip4.Pad));
 		pHeader->SrcAddress.Ip4.Address = pLocalAddress->SockAddr.In.SinAddr;
 		RtlZeroMemory(pHeader->DstAddress.Ip4.Pad, sizeof(pHeader->DstAddress.Ip4.Pad));
 		pHeader->DstAddress.Ip4.Address = pPeerAddress->SockAddr.In.SinAddr;
 		pHeader->Port = pLocalAddress->SockAddr.In.SinPort;
 	} else {
-		pHeader->IpVersion = 6 << 4;
+		pHeader->IpVersion = IB_REQ_CM_RDMA_IPV6;
 		RtlCopyMemory(pHeader->SrcAddress.Ip6Address,
-					  pLocalAddress->SockAddr.In6.Sin6Addr, 16);
+					  pLocalAddress->SockAddr.In6.Sin6Addr,
+					  sizeof(pHeader->SrcAddress.Ip6Address));
 		RtlCopyMemory(pHeader->DstAddress.Ip6Address,
-					  pPeerAddress->SockAddr.In6.Sin6Addr, 16);
+					  pPeerAddress->SockAddr.In6.Sin6Addr,
+					  sizeof(pHeader->DstAddress.Ip6Address));
 		pHeader->Port = pLocalAddress->SockAddr.In6.Sin6Port;
 	}
 }
@@ -1197,8 +1199,10 @@ void WvEpListen(WV_PROVIDER *pProvider, 
 	WV_IO_EP_LISTEN		*pattr;
 	NTSTATUS			status;
 	void				*buf;
-	UINT8				offset, len;
+	UINT8				len;
+	UINT8				mask;
 	UINT64				sid;
+	IB_CMA_HEADER		hdr;
 
 	status = WdfRequestRetrieveInputBuffer(Request, sizeof(WV_IO_EP_LISTEN),
 										   &pattr, NULL);
@@ -1214,18 +1218,13 @@ void WvEpListen(WV_PROVIDER *pProvider, 
 
 	if (WvAnyAddress(&ep->Attributes.LocalAddress)) {
 		buf = NULL;
-		offset = 0;
 		len = 0;
+		mask = 0;
 	} else {
-		if (ep->Attributes.LocalAddress.SockAddr.Sa.SaFamily == WV_AF_INET) {
-			buf = &ep->Attributes.LocalAddress.SockAddr.In.SinAddr;
-			len = sizeof ep->Attributes.LocalAddress.SockAddr.In.SinAddr;
-			offset = FIELD_OFFSET(IB_CMA_HEADER, DstAddress.Ip4.Address);
-		} else {
-			buf = ep->Attributes.LocalAddress.SockAddr.In6.Sin6Addr;
-			len = sizeof ep->Attributes.LocalAddress.SockAddr.In6.Sin6Addr;
-			offset = FIELD_OFFSET(IB_CMA_HEADER, DstAddress.Ip6Address);
-		}
+		WvFormatCmaHeader(&hdr, &ep->Attributes.LocalAddress, &ep->Attributes.LocalAddress);
+		buf = &hdr;
+		len = sizeof(hdr);
+		mask = IB_REQ_CM_RDMA_CMP_DST_IP;
 	}
 
 	WdfObjectAcquireLock(ep->Queue);
@@ -1242,7 +1241,7 @@ void WvEpListen(WV_PROVIDER *pProvider, 
 	ep->Attributes.Param.Backlog = pattr->Backlog;
 	ep->State = WvEpListening;
 	sid = WvGetServiceId(ep->EpType, &ep->Attributes.LocalAddress);
-	status = IbCmInterface.CM.listen(ep->pIbCmId, sid, buf, len, offset);
+	status = IbCmInterface.CM.listen(ep->pIbCmId, sid, buf, len, mask);
 
 release:
 	WdfObjectReleaseLock(ep->Queue);
diff -dwup3 -x *svn* -x *makefile.inc -x *sources -r c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\inc\iba\ib_types.h .\inc\iba\ib_types.h
--- c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\inc\iba\ib_types.h	Fri Aug 03 12:14:07 2012
+++ .\inc\iba\ib_types.h	Thu Jul 26 15:31:14 2012
@@ -12876,19 +12876,40 @@ typedef struct _ib_time_stamp {
 *	ib_cc_mad_t
 *********/
 
-#define IB_REQ_CM_RDMA_SID_PREFIX			CL_NTOH64( 0x0000000001000000I64 )
-#define IB_REQ_CM_RDMA_SID_PREFIX_MASK		CL_NTOH64( 0xFFFFFFFFFF000000I64 )
+#define IB_REQ_CM_RDMA_SID_PREFIX			0x0000000100000000ULL
+#define IB_REQ_CM_RDMA_SID_PREFIX_MASK      0x000000FFFFFFFFFFULL
 #define IB_REQ_CM_RDMA_PDATA_SIZE			56
 #define IB_REQ_CM_RDMA_MAJOR_VERSION		0
 #define IB_REQ_CM_RDMA_MINOR_VERSION		0
+#define IB_REQ_CM_RDMA_VERSION              ((IB_REQ_CM_RDMA_MAJOR_VERSION << 4) |\
+                                            IB_REQ_CM_RDMA_MINOR_VERSION)
+#define IB_REQ_CM_RDMA_IPV4                 0x40
+#define IB_REQ_CM_RDMA_IPV6                 0x60
 
+/*
+ * Bit masks to define what fields should be compared.  Major, Minor and IP
+ * version fields are always compared.
+ */
+#define IB_REQ_CM_RDMA_CMP_SRC_PORT         0x01
+#define IB_REQ_CM_RDMA_CMP_SRC_IP           0x02
+#define IB_REQ_CM_RDMA_CMP_DST_IP           0x04
 
-/****s* Access Layer/ib_cm_rep_t
+static inline boolean_t ib_cm_is_rdma_cm_sid(uint64_t sid)
+{ /* TRUE when the masked sid equals the RDMA CM service ID prefix. */
+    return (IB_REQ_CM_RDMA_SID_PREFIX == (sid & IB_REQ_CM_RDMA_SID_PREFIX_MASK));
+}
+
+static inline net64_t ib_cm_rdma_cm_sid(uint8_t protocol, net16_t port)
+{ /* OR the RDMA CM prefix with protocol shifted to bit 40 and port to bit 48. */
+    return ((UINT64)port << 48) | ((UINT64)protocol << 40) | IB_REQ_CM_RDMA_SID_PREFIX;
+}
+
+/****s* Access Layer/ib_cm_rdma_req_t
 * NAME
 *	ib_cm_rdma_req_t
 *
 * DESCRIPTION
-*	Connection reply information used when establishing a connection.
+*	IP Addressing CM REQ Message Private Data Format.
 *
 * SYNOPSIS
 */
@@ -12923,6 +12944,13 @@ typedef struct _ib_cm_rdma_req
 *		Contains Consumer Private Data.
 *
 *****/
+
+static inline boolean_t ib_cm_is_rdma_cm_req_valid(ib_cm_rdma_req_t* p_rdma_req)
+{
+	/* Valid when the version matches exactly and ipv names IPv4 or IPv6. */
+	if( p_rdma_req->maj_min_ver != IB_REQ_CM_RDMA_VERSION ) return FALSE;
+	return p_rdma_req->ipv == IB_REQ_CM_RDMA_IPV4 || p_rdma_req->ipv == IB_REQ_CM_RDMA_IPV6;
+}
 
 AL_INLINE net64_t AL_API
 ib_cm_rdma_sid(
diff -dwup3 -x *svn* -x *makefile.inc -x *sources -r c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\ulp\nd\user\NdListen.cpp .\ulp\nd\user\NdListen.cpp
--- c:\dev\openib\ofw\gen1\branches\mlx4_30\trunk\ulp\nd\user\NdListen.cpp	Thu May 31 11:22:11 2012
+++ .\ulp\nd\user\NdListen.cpp	Wed May 23 18:26:49 2012
@@ -35,7 +35,6 @@
 #pragma warning( push, 3 )
 #include "winternl.h"
 #pragma warning( pop )
-#include <complib/cl_byteswap.h>
 #include <limits.h>
 #include "nddebug.h"
 
@@ -185,30 +184,36 @@ HRESULT GetPdataForActive(
         ual_cep_listen_ioctl_t listen;
         listen.cid = 0;
 
-        listen.cep_listen.svc_id = ib_cm_rdma_sid( (uint8_t) Protocol, Port );
+        listen.cep_listen.svc_id = ib_cm_rdma_sid(static_cast<UINT8>(Protocol), Port);
 
         listen.cep_listen.port_guid = m_pParent->m_PortGuid;
 
+        ib_cm_rdma_req_t* pdata = reinterpret_cast<ib_cm_rdma_req_t*>(listen.compare);
+        pdata->maj_min_ver = IB_REQ_CM_RDMA_VERSION;
+        pdata->src_port = 0;
+        ZeroMemory( &pdata->src_ip_addr, sizeof(pdata->src_ip_addr) );
+
         switch( m_pParent->m_Addr.v4.sin_family )
         {
         case AF_INET:
-            ZeroMemory( listen.compare, ATS_IPV4_OFFSET );
-            CopyMemory( &listen.compare[ATS_IPV4_OFFSET],
-                (uint8_t*)&m_pParent->m_Addr.v4.sin_addr,
-                sizeof(m_pParent->m_Addr.v4.sin_addr) );
+            pdata->ipv = IB_REQ_CM_RDMA_IPV4;
+            pdata->dst_ip_addr[0] = pdata->dst_ip_addr[1] = pdata->dst_ip_addr[2] = 0;
+            pdata->dst_ip_addr[3] = m_pParent->m_Addr.v4.sin_addr.s_addr;
             ND_PRINT( TRACE_LEVEL_INFORMATION, ND_DBG_NDI,
                 ("Listen for: IP %#x, port %#hx\n", 
-                cl_hton32(m_pParent->m_Addr.v4.sin_addr.S_un.S_addr), cl_hton16(m_pParent->m_Addr.v4.sin_port) ) );
+                _byteswap_ulong(m_pParent->m_Addr.v4.sin_addr.S_un.S_addr),
+                _byteswap_ushort(m_pParent->m_Addr.v4.sin_port) ) );
             break;
         case AF_INET6:
-            CopyMemory( listen.compare,
-                (uint8_t*)&m_pParent->m_Addr.v6.sin6_addr,
+            pdata->ipv = IB_REQ_CM_RDMA_IPV6;
+            CopyMemory( &pdata->dst_ip_addr,
+                &m_pParent->m_Addr.v6.sin6_addr,
                 sizeof(m_pParent->m_Addr.v6.sin6_addr) );
             break;
         }
         listen.cep_listen.p_cmp_buf = listen.compare;
-        listen.cep_listen.cmp_len = 16;
-        listen.cep_listen.cmp_offset = FIELD_OFFSET( ib_cm_rdma_req_t, dst_ip_addr );
+        listen.cep_listen.cmp_len = FIELD_OFFSET(ib_cm_rdma_req_t, pdata);
+        listen.cep_listen.cmp_offset = IB_REQ_CM_RDMA_CMP_DST_IP;
 
         IO_STATUS_BLOCK IoStatus;
         IoStatus.Status = g_NtDeviceIoControlFile(
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ndv2.21.patch
Type: application/octet-stream
Size: 14224 bytes
Desc: ndv2.21.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20130221/f5a76179/attachment.obj>


More information about the ofw mailing list