[ofw] [PATCH 3/4] ib/cm: poll for CM REQ events

Sean Hefty sean.hefty at intel.com
Mon Jan 11 22:37:51 PST 2010


Replace the callback mechanism for reporting connection
requests with one that requires the user to poll for the
events.  This allows queuing REQs in the CM until the user
is ready to process the events.

A callback is still provided, but it now only notifies the
user that REQ events are ready to be retrieved.

This change improves the connection rate for winverbs when
the user retrieves only a small number of requests at a time.

Signed-off-by: Sean Hefty <sean.hefty at intel.com>
---
This change should be added to WinOF 2.2 once committed to the trunk.

 trunk/core/al/kernel/al_cm.c       |   53 ++++++++------
 trunk/core/al/kernel/al_cm_cep.c   |   55 ++++++---------
 trunk/core/winverbs/kernel/wv_ep.c |  134 +++++++++++++++++++++---------------
 trunk/inc/kernel/iba/ib_cm_ifc.h   |   30 +++++++-
 4 files changed, 158 insertions(+), 114 deletions(-)

diff --git a/trunk/core/al/kernel/al_cm.c b/trunk/core/al/kernel/al_cm.c
index 177bb9e..8ba28f1 100644
--- a/trunk/core/al/kernel/al_cm.c
+++ b/trunk/core/al/kernel/al_cm.c
@@ -99,36 +99,44 @@ cm_cep_handler(const ib_al_handle_t h_al, const net32_t cid)
 static void
 cm_listen_handler(const ib_al_handle_t h_al, const net32_t cid)
 {
+	iba_cm_id		*id;
+	iba_cm_event	event;
+
+	id = (iba_cm_id *) kal_cep_get_context(h_al, cid, NULL, NULL);
+	memset(&event, 0, sizeof event);
+	event.type = iba_cm_req_received;
+	id->callback(id, &event);
+}
+
+static NTSTATUS
+cm_get_request(iba_cm_id *p_listen_id, iba_cm_id **pp_id, iba_cm_event *p_event)
+{
 	void				*context;
 	net32_t				new_cid;
 	ib_mad_element_t	*mad;
-	iba_cm_id			*id, *listen_id;
-	iba_cm_event		event;
+	ib_api_status_t		ib_status;
 	NTSTATUS			status;
 
-	while (al_cep_poll(h_al, cid, &context, &new_cid, &mad) == IB_SUCCESS) {
-
-		listen_id = (iba_cm_id *) context;
+	ib_status = al_cep_poll(gh_al, p_listen_id->cid, &context, &new_cid, &mad);
+	if (ib_status != IB_SUCCESS) {
+		return STATUS_NO_MORE_ENTRIES;
+	}
 
-		id = cm_alloc_id(listen_id->callback, listen_id);
-		if (id == NULL) {
-			kal_cep_destroy(h_al, new_cid, STATUS_NO_MORE_ENTRIES);
-			ib_put_mad(mad);
-			continue;
-		}
+	*pp_id = cm_alloc_id(p_listen_id->callback, p_listen_id);
+	if (*pp_id == NULL) {
+		kal_cep_destroy(gh_al, new_cid, STATUS_NO_MORE_ENTRIES);
+		status = STATUS_NO_MEMORY;
+		goto out;
+	}
 
-		kal_cep_config(h_al, new_cid, cm_cep_handler, id, cm_destroy_handler);
-		id->cid = new_cid;
+	kal_cep_config(gh_al, new_cid, cm_cep_handler, *pp_id, cm_destroy_handler);
+	(*pp_id)->cid = new_cid;
+	kal_cep_format_event(gh_al, new_cid, mad, p_event);
+	status = STATUS_SUCCESS;
 
-		kal_cep_format_event(h_al, id->cid, mad, &event);
-		status = id->callback(id, &event);
-		if (!NT_SUCCESS(status)) {
-			kal_cep_config(h_al, new_cid, NULL, NULL, NULL);
-			kal_cep_destroy(h_al, id->cid, status);
-			cm_free_id(id);
-		}
-		ib_put_mad(mad);
-	}
+out:
+	ib_put_mad(mad);
+	return status;
 }
 
 static NTSTATUS
@@ -367,6 +375,7 @@ void cm_get_interface(iba_cm_interface *p_ifc)
 	p_ifc->create_id = cm_create_id;
 	p_ifc->destroy_id = cm_destroy_id;
 	p_ifc->listen = cm_listen;
+	p_ifc->get_request = cm_get_request;
 	p_ifc->send_req = cm_send_req;
 	p_ifc->send_rep = cm_send_rep;
 	p_ifc->send_rtu = cm_send_rtu;
diff --git a/trunk/core/al/kernel/al_cm_cep.c b/trunk/core/al/kernel/al_cm_cep.c
index 86c5412..9a1d138 100644
--- a/trunk/core/al/kernel/al_cm_cep.c
+++ b/trunk/core/al/kernel/al_cm_cep.c
@@ -5978,38 +5978,32 @@ __format_event_req(kcep_t *p_cep, mad_cm_req_t *p_mad, iba_cm_req_event *p_req)
 	p_req->remote_ca_guid = p_cep->remote_ca_guid;
 	p_req->pkey_index = p_cep->av[0].pkey_index;
 	p_req->port_num = p_cep->av[0].attr.port_num;
-	p_req->req.service_id = p_mad->sid;
-
-	p_req->req.qpn = conn_req_get_lcl_qpn(p_mad);
-	p_req->req.qp_type = conn_req_get_qp_type(p_mad);
-	p_req->req.starting_psn = conn_req_get_starting_psn(p_mad);
-
-	p_req->req.p_pdata = p_mad->pdata;
-	p_req->req.pdata_len = IB_REQ_PDATA_SIZE;
-
-	p_req->req.max_cm_retries = conn_req_get_max_cm_retries(p_mad);
-	p_req->req.resp_res = conn_req_get_init_depth(p_mad);
-	p_req->req.init_depth = conn_req_get_resp_res(p_mad);
-	p_req->req.remote_resp_timeout = conn_req_get_resp_timeout(p_mad);
-	p_req->req.flow_ctrl = (uint8_t) conn_req_get_flow_ctrl(p_mad);
-	p_req->req.local_resp_timeout = conn_req_get_lcl_resp_timeout(p_mad);
-	p_req->req.rnr_retry_cnt = conn_req_get_rnr_retry_cnt(p_mad);
-	p_req->req.retry_cnt = conn_req_get_retry_cnt(p_mad);
-	p_req->req.srq = 0; // TODO: fix mad_cm_req_t
-
-	// We can re-use the MAD buffer if we're careful to read out the data
-	// that we need before it's overwritten.
-	p_req->req.p_primary_path = (ib_path_rec_t *) p_mad;
-	__format_path(p_req->req.p_primary_path, &p_mad->primary_path,
+	p_req->service_id = p_mad->sid;
+
+	p_req->qpn = conn_req_get_lcl_qpn(p_mad);
+	p_req->qp_type = conn_req_get_qp_type(p_mad);
+	p_req->starting_psn = conn_req_get_starting_psn(p_mad);
+
+	cl_memcpy(p_req->pdata, p_mad->pdata, IB_REQ_PDATA_SIZE);
+
+	p_req->max_cm_retries = conn_req_get_max_cm_retries(p_mad);
+	p_req->resp_res = conn_req_get_init_depth(p_mad);
+	p_req->init_depth = conn_req_get_resp_res(p_mad);
+	p_req->remote_resp_timeout = conn_req_get_resp_timeout(p_mad);
+	p_req->flow_ctrl = (uint8_t) conn_req_get_flow_ctrl(p_mad);
+	p_req->local_resp_timeout = conn_req_get_lcl_resp_timeout(p_mad);
+	p_req->rnr_retry_cnt = conn_req_get_rnr_retry_cnt(p_mad);
+	p_req->retry_cnt = conn_req_get_retry_cnt(p_mad);
+	p_req->srq = 0; // TODO: fix mad_cm_req_t
+
+	__format_path(&p_req->primary_path, &p_mad->primary_path,
 				  p_mad->pkey, conn_req_get_mtu(p_mad));
 
 	if (p_mad->alternate_path.remote_lid != 0) {
-		p_req->req.p_alt_path = p_req->req.p_primary_path + 1;
-		__format_path(p_req->req.p_alt_path, &p_mad->alternate_path,
-					  p_req->req.p_primary_path->pkey,
-					  p_req->req.p_primary_path->mtu);
+		__format_path(&p_req->alt_path, &p_mad->alternate_path,
+					  p_req->primary_path.pkey, p_req->primary_path.mtu);
 	} else {
-		p_req->req.p_alt_path = NULL;
+		cl_memclr(&p_req->alt_path, sizeof p_req->alt_path);
 	}
 }
 
@@ -6647,7 +6641,6 @@ kal_cep_get_context(
 	AL_PRINT( TRACE_LEVEL_VERBOSE, AL_DBG_CM, ("[ CID = %d\n", cid) );
 
 	CL_ASSERT( h_al );
-	CL_ASSERT( pfn_addref );
 
 	KeAcquireInStackQueuedSpinLock( &gp_cep_mgr->lock, &hdl );
 	p_cep = __lookup_cep( h_al, cid );
@@ -6658,7 +6651,7 @@ kal_cep_get_context(
 		goto out;
 	}
 
-	if( p_cep->pfn_cb != pfn_cb )
+	if( pfn_cb && p_cep->pfn_cb != pfn_cb )
 	{
 		AL_PRINT( TRACE_LEVEL_ERROR, AL_DBG_ERROR,
 			("CEP callback mismatch for cid %d, h_al %p\n", cid, h_al ));
@@ -6666,7 +6659,7 @@ kal_cep_get_context(
 	}
 
 	context = p_cep->context;
-	if( context != NULL )
+	if( pfn_addref && context != NULL )
 	{
 		pfn_addref( context );
 	}
diff --git a/trunk/core/winverbs/kernel/wv_ep.c b/trunk/core/winverbs/kernel/wv_ep.c
index 3d5c6ce..3ad91ac 100644
--- a/trunk/core/winverbs/kernel/wv_ep.c
+++ b/trunk/core/winverbs/kernel/wv_ep.c
@@ -1110,60 +1110,75 @@ complete:
 	WdfRequestComplete(Request, status);
 }
 
-static NTSTATUS WvEpIbListenHandler(iba_cm_id *pId, iba_cm_event *pEvent)
+static void WvEpGetIbRequest(WV_ENDPOINT *pListen)
 {
-	WV_ENDPOINT		*listen, *ep;
+	WV_ENDPOINT		*ep;
 	WDFREQUEST		request;
 	NTSTATUS		status;
 	IB_CMA_HEADER	*hdr;
+	iba_cm_id		*id;
+	iba_cm_event	event;
 
-	listen = ((iba_cm_id *) pId->context)->context;
+	WdfObjectAcquireLock(pListen->Queue);
+	while (1) {
+		status = WdfIoQueueRetrieveNextRequest(pListen->Queue, &request);
+		if (!NT_SUCCESS(status)) {
+			break;
+		}
 
-	WdfObjectAcquireLock(listen->Queue);
-	status = WdfIoQueueRetrieveNextRequest(listen->Queue, &request);
-	if (!NT_SUCCESS(status)) {
-		goto release;
+		status = IbCmInterface.CM.get_request(pListen->pIbCmId, &id, &event);
+		if (!NT_SUCCESS(status)) {
+			WdfRequestRequeue(request);
+			break;
+		}
+
+		ASSERT(!IsListEmpty(&pListen->Entry));
+		ep = CONTAINING_RECORD(RemoveHeadList(&pListen->Entry), WV_ENDPOINT, Entry);
+		ep->pIbCmId = id;
+		id->callback = WvEpIbCmHandler;
+		id->context = ep;
+
+		hdr = (IB_CMA_HEADER *) event.data.req.pdata;
+		if ((hdr->IpVersion >> 4) == 4) {
+			ep->Attributes.LocalAddress.SockAddr.In.SinFamily = WV_AF_INET;
+			ep->Attributes.LocalAddress.SockAddr.In.SinAddr = hdr->DstAddress.Ip4.Address;
+			ep->Attributes.PeerAddress.SockAddr.In.SinFamily = WV_AF_INET;
+			ep->Attributes.PeerAddress.SockAddr.In.SinAddr = hdr->SrcAddress.Ip4.Address;
+		} else {
+			ep->Attributes.LocalAddress.SockAddr.In6.Sin6Family = WV_AF_INET6; 
+			RtlCopyMemory(ep->Attributes.LocalAddress.SockAddr.In6.Sin6Addr,
+						  hdr->DstAddress.Ip6Address, 16);
+			ep->Attributes.PeerAddress.SockAddr.In6.Sin6Family = WV_AF_INET6;
+			RtlCopyMemory(ep->Attributes.PeerAddress.SockAddr.In6.Sin6Addr,
+						  hdr->SrcAddress.Ip6Address, 16);
+		}
+		ep->Attributes.Device.DeviceGuid = event.data.req.local_ca_guid;
+		ep->Attributes.Device.Pkey = event.data.req.primary_path.pkey;
+		ep->Attributes.Device.PortNumber = event.data.req.port_num;
+		ep->Attributes.Param.Connect.ResponderResources = event.data.req.resp_res;
+		ep->Attributes.Param.Connect.InitiatorDepth = event.data.req.init_depth;
+		ep->Attributes.Param.Connect.RetryCount = event.data.req.retry_cnt;
+		ep->Attributes.Param.Connect.RnrRetryCount = event.data.req.rnr_retry_cnt;
+		ep->Attributes.Param.Connect.DataLength = sizeof(ep->Attributes.Param.Connect.Data);
+		RtlCopyMemory(ep->Attributes.Param.Connect.Data, hdr + 1,
+					  sizeof(ep->Attributes.Param.Connect.Data));
+		ep->Route = event.data.req.primary_path;
+
+		ep->State = WvEpPassiveConnect;
+		WvEpPut(ep);
+
+		WdfRequestComplete(request, STATUS_SUCCESS);
 	}
+	WdfObjectReleaseLock(pListen->Queue);
+}
 
-	ASSERT(!IsListEmpty(&listen->Entry));
-	ep = CONTAINING_RECORD(RemoveHeadList(&listen->Entry), WV_ENDPOINT, Entry);
-	ep->pIbCmId = pId;
-	pId->callback = WvEpIbCmHandler;
-	pId->context = ep;
+static NTSTATUS WvEpIbListenHandler(iba_cm_id *pId, iba_cm_event *pEvent)
+{
+	WV_ENDPOINT		*listen;
 
-	hdr = pEvent->data.req.req.p_pdata;
-	if ((hdr->IpVersion >> 4) == 4) {
-		ep->Attributes.LocalAddress.SockAddr.In.SinFamily = WV_AF_INET;
-		ep->Attributes.LocalAddress.SockAddr.In.SinAddr = hdr->DstAddress.Ip4.Address;
-		ep->Attributes.PeerAddress.SockAddr.In.SinFamily = WV_AF_INET;
-		ep->Attributes.PeerAddress.SockAddr.In.SinAddr = hdr->SrcAddress.Ip4.Address;
-	} else {
-		ep->Attributes.LocalAddress.SockAddr.In6.Sin6Family = WV_AF_INET6; 
-		RtlCopyMemory(ep->Attributes.LocalAddress.SockAddr.In6.Sin6Addr,
-					  hdr->DstAddress.Ip6Address, 16);
-		ep->Attributes.PeerAddress.SockAddr.In6.Sin6Family = WV_AF_INET6;
-		RtlCopyMemory(ep->Attributes.PeerAddress.SockAddr.In6.Sin6Addr,
-					  hdr->SrcAddress.Ip6Address, 16);
-	}
-	ep->Attributes.Device.DeviceGuid = pEvent->data.req.local_ca_guid;
-	ep->Attributes.Device.Pkey = pEvent->data.req.req.p_primary_path->pkey;
-	ep->Attributes.Device.PortNumber = pEvent->data.req.port_num;
-	ep->Attributes.Param.Connect.ResponderResources = pEvent->data.req.req.resp_res;
-	ep->Attributes.Param.Connect.InitiatorDepth = pEvent->data.req.req.init_depth;
-	ep->Attributes.Param.Connect.RetryCount = pEvent->data.req.req.retry_cnt;
-	ep->Attributes.Param.Connect.RnrRetryCount = pEvent->data.req.req.rnr_retry_cnt;
-	ep->Attributes.Param.Connect.DataLength = sizeof(ep->Attributes.Param.Connect.Data);
-	RtlCopyMemory(ep->Attributes.Param.Connect.Data, hdr + 1,
-				  sizeof(ep->Attributes.Param.Connect.Data));
-	ep->Route = *pEvent->data.req.req.p_primary_path;
-
-	ep->State = WvEpPassiveConnect;
-	WvEpPut(ep);
-
-	WdfRequestComplete(request, STATUS_SUCCESS);
-release:
-	WdfObjectReleaseLock(listen->Queue);
-	return status;
+	listen = pId->context;
+	WvEpGetIbRequest(listen);
+	return STATUS_SUCCESS;
 }
 
 void WvEpListen(WV_PROVIDER *pProvider, WDFREQUEST Request)
@@ -1235,24 +1250,24 @@ void WvEpGetRequest(WV_PROVIDER *pProvider, WDFREQUEST Request)
 	status = WdfRequestRetrieveInputBuffer(Request, sizeof(WV_IO_EP_GET_REQUEST),
 										   &req, NULL);
 	if (!NT_SUCCESS(status)) {
-		goto complete;
+		goto err1;
 	}
 
 	listen = WvEpAcquire(pProvider, req->Id);
 	if (listen == NULL) {
 		status = STATUS_NOT_FOUND;
-		goto complete;
+		goto err1;
 	}
 
 	if (listen->State != WvEpListening) {
 		status = STATUS_NOT_SUPPORTED;
-		goto release1;
+		goto err2;
 	}
 
 	ep = WvEpAcquire(pProvider, req->EpId);
 	if (ep == NULL) {
 		status = STATUS_NOT_FOUND;
-		goto release1;
+		goto err2;
 	}
 
 	WdfObjectAcquireLock(ep->Queue);
@@ -1262,9 +1277,8 @@ void WvEpGetRequest(WV_PROVIDER *pProvider, WDFREQUEST Request)
 		status = STATUS_CONNECTION_IN_USE;
 	}
 	WdfObjectReleaseLock(ep->Queue);
-
 	if (!NT_SUCCESS(status)) {
-		goto release2;
+		goto err3;
 	}
 
 	WdfObjectAcquireLock(listen->Queue);
@@ -1274,15 +1288,21 @@ void WvEpGetRequest(WV_PROVIDER *pProvider, WDFREQUEST Request)
 		WvEpGet(ep);
 	}
 	WdfObjectReleaseLock(listen->Queue);
+	if (!NT_SUCCESS(status)) {
+		goto err3;
+	}
 
-release2:
 	WvEpRelease(ep);
-release1:
+	WvEpGetIbRequest(listen);
 	WvEpRelease(listen);
-complete:
-	if (!NT_SUCCESS(status)) {
-		WdfRequestComplete(Request, status);
-	}
+	return;
+
+err3:
+	WvEpRelease(ep);
+err2:
+	WvEpRelease(listen);
+err1:
+	WdfRequestComplete(Request, status);
 }
 
 void WvEpLookup(WV_PROVIDER *pProvider, WDFREQUEST Request)
diff --git a/trunk/inc/kernel/iba/ib_cm_ifc.h b/trunk/inc/kernel/iba/ib_cm_ifc.h
index 0949482..e895406 100644
--- a/trunk/inc/kernel/iba/ib_cm_ifc.h
+++ b/trunk/inc/kernel/iba/ib_cm_ifc.h
@@ -73,7 +73,27 @@ typedef struct _iba_cm_req
 
 typedef struct _iba_cm_req_event
 {
-	iba_cm_req					req;
+	ib_net64_t					service_id;
+
+	ib_path_rec_t				primary_path;
+	ib_path_rec_t				alt_path;
+
+	net32_t						qpn;
+	ib_qp_type_t				qp_type;
+	net32_t						starting_psn;
+
+	uint8_t						pdata[IB_REQ_PDATA_SIZE];
+
+	uint8_t						max_cm_retries;
+	uint8_t						resp_res;
+	uint8_t						init_depth;
+	uint8_t						remote_resp_timeout;
+	uint8_t						flow_ctrl;
+	uint8_t						local_resp_timeout;
+	uint8_t						rnr_retry_cnt;
+	uint8_t						retry_cnt;
+	uint8_t						srq;
+
 	net64_t						local_ca_guid;
 	net64_t						remote_ca_guid;
 	uint16_t					pkey_index;
@@ -239,6 +259,8 @@ typedef struct _iba_cm_interface
 
 	NTSTATUS		(*listen)(iba_cm_id *p_id, net64_t service_id, void *p_compare_buf,
 							  uint8_t compare_len, uint8_t compare_offset);
+	NTSTATUS		(*get_request)(iba_cm_id *p_listen_id, iba_cm_id **pp_id,
+								   iba_cm_event *p_event);
 
 	NTSTATUS		(*send_req)(iba_cm_id *p_id, iba_cm_req *p_req);
 	NTSTATUS		(*send_rep)(iba_cm_id *p_id, iba_cm_rep *p_rep);
@@ -282,9 +304,9 @@ static inline UINT8 IbaCmVersionMinor(USHORT Version)
 	return (UINT8) Version;
 }
 
-// {EACC1466-BB2D-4478-B5BE-40EDF7EE08AB}
-DEFINE_GUID(GUID_INFINIBAND_INTERFACE_CM, 0xeacc1466, 0xbb2d, 0x4478,
-			0xb5, 0xbe, 0x40, 0xed, 0xf7, 0xee, 0x8, 0xab);
+// {6A11D060-8957-49e6-BE2A-01EDF1BD22B3}
+DEFINE_GUID(GUID_INFINIBAND_INTERFACE_CM, 0x6a11d060, 0x8957, 0x49e6,
+			0xbe, 0x2a, 0x1, 0xed, 0xf1, 0xbd, 0x22, 0xb3);
 
 typedef struct _INFINIBAND_INTERFACE_CM
 {





More information about the ofw mailing list