[ofw] [PATCH 4/4] librdmacm: fix connection scalability limit

Sean Hefty sean.hefty at intel.com
Tue Mar 10 16:53:47 PDT 2009


Use the COMP_CHANNEL abstraction as a common framework for event
reporting and to provide better scalability.

Signed-off-by: Sean Hefty <sean.hefty at intel.com>
---
With these changes, I can consistently establish 3000 connections between
two systems.  Work is still needed to identify performance bottlenecks,
but at least the connections form now.

diff -up -r -X trunk\docs\dontdiff.txt -I '\$Id:' trunk\ulp\librdmacm/examples/cmatose/cmatose.c
branches\winverbs\ulp\librdmacm/examples/cmatose/cmatose.c
--- trunk\ulp\librdmacm/examples/cmatose/cmatose.c	2009-01-15 15:47:32.196611000 -0800
+++ branches\winverbs\ulp\librdmacm/examples/cmatose/cmatose.c	2009-03-10 12:38:47.383414700 -0700
@@ -433,7 +433,7 @@ static int connect_events(void)
 			cma_handler(event->id, event);
 			rdma_ack_cm_event(event);
 		} else {
-			printf("cmatose: failure in rdma_get_cm_event in connect events\n");
+			printf("cmatose:rdma_get_cm_event connect events error 0x%x\n", err);
 			ret = err;
 		}
 	}
@@ -452,7 +452,7 @@ static int disconnect_events(void)
 			cma_handler(event->id, event);
 			rdma_ack_cm_event(event);
 		} else {
-			printf("cmatose: failure in rdma_get_cm_event in disconnect events\n");
+			printf("cmatose: rdma_get_cm_event disconnect events error 0x%x\n", err);
 			ret = err;
 		}
 	}
@@ -537,7 +537,7 @@ static int run_server(void)
 		goto out;
 	}
 
-	ret = rdma_listen(listen_id, 0);
+	ret = rdma_listen(listen_id, connections);
 	if (ret) {
 		printf("cmatose: failure trying to listen: 0x%x\n", ret);
 		goto out;
diff -up -r -X trunk\docs\dontdiff.txt -I '\$Id:' trunk\ulp\librdmacm/include/rdma/rdma_cma.h
branches\winverbs\ulp\librdmacm/include/rdma/rdma_cma.h
--- trunk\ulp\librdmacm/include/rdma/rdma_cma.h	2009-01-12 14:41:27.573634000 -0800
+++ branches\winverbs\ulp\librdmacm/include/rdma/rdma_cma.h	2009-03-10 01:57:47.109375000 -0700
@@ -121,7 +121,7 @@ struct rdma_route
 
 struct rdma_event_channel
 {
-	uint32_t		timeout;
+	COMP_CHANNEL		channel;
 };
 
 struct rdma_cm_id
@@ -133,13 +133,12 @@ struct rdma_cm_id
 	struct rdma_route			route;
 	enum rdma_port_space		ps;
 	uint8_t						port_num;
+	COMP_ENTRY					comp_entry;
 
 	union {
 		IWVConnectEndpoint		*connect;
 		IWVDatagramEndpoint		*datagram;
 	}	ep;
-	OVERLAPPED					overlap;
-	uint32_t					events_completed;
 };
 
 struct rdma_conn_param
diff -up -r -X trunk\docs\dontdiff.txt -I '\$Id:' trunk\ulp\librdmacm/src/cma.cpp branches\winverbs\ulp\librdmacm/src/cma.cpp
--- trunk\ulp\librdmacm/src/cma.cpp	2009-03-02 13:19:41.110716900 -0800
+++ branches\winverbs\ulp\librdmacm/src/cma.cpp	2009-03-10 15:51:13.085375000 -0700
@@ -34,12 +34,12 @@
 
 #include <rdma/rdma_cma.h>
 #include <infiniband/verbs.h>
+#include <comp_channel.h>
 #include <iba/ibat.h>
 #include "cma.h"
+#include "..\..\..\etc\user\comp_channel.cpp"
 
-IWVProvider *prov;
-__declspec(dllexport)
-IWVProvider *ibv_get_winverbs(void);
+static struct ibv_windata windata;
 
 enum cma_state
 {
@@ -64,22 +64,12 @@ struct cma_id_private
 {
 	struct rdma_cm_id			id;
 	enum cma_state				state;
-	int							channel_index;
 	struct cma_device			*cma_dev;
 	int							backlog;
 	int							index;
 	struct rdma_cm_id			**req_list;
 };
 
-struct cma_event_channel
-{
-	struct rdma_event_channel	channel;
-	CRITICAL_SECTION			lock;
-	struct cma_id_private		*id[MAXIMUM_WAIT_OBJECTS];
-	HANDLE						event[MAXIMUM_WAIT_OBJECTS];
-	int							count;
-};
-
 struct cma_device
 {
 	struct ibv_context	*verbs;
@@ -107,9 +97,9 @@ static void ucma_cleanup(void)
 		delete cma_dev_array;
 		cma_dev_cnt = 0;
 	}
-	if (prov != NULL) {
-		prov->Release();
-		prov = NULL;
+	if (windata.prov != NULL) {
+		ibv_release_windata(&windata, IBV_WINDATA_VERSION);
+		windata.prov = NULL;
 	}
 }
 
@@ -125,9 +115,8 @@ static int ucma_init(void)
 		goto out;
 	}
 
-	prov = ibv_get_winverbs();
-	if (prov == NULL) {
-		ret = -1;
+	ret = ibv_get_windata(&windata, IBV_WINDATA_VERSION);
+	if (ret) {
 		goto err;
 	}
 
@@ -211,75 +200,26 @@ void rdma_free_devices(struct ibv_contex
 __declspec(dllexport)
 struct rdma_event_channel *rdma_create_event_channel(void)
 {
-	struct cma_event_channel *chan;
+	struct rdma_event_channel *channel;
 
 	if (!cma_dev_cnt && ucma_init()) {
 		return NULL;
 	}
 
-	chan = new struct cma_event_channel;
-	if (chan == NULL) {
+	channel = new struct rdma_event_channel;
+	if (channel == NULL) {
 		return NULL;
 	}
 
-	InitializeCriticalSection(&chan->lock);
-	chan->count = 0;
-	chan->channel.timeout = INFINITE;
-
-	return &chan->channel;
+	CompChannelInit(windata.comp_mgr, &channel->channel, INFINITE);
+	return channel;
 }
 
 __declspec(dllexport)
 void rdma_destroy_event_channel(struct rdma_event_channel *channel)
 {
-	struct cma_event_channel *chan;
-
-	chan = CONTAINING_RECORD(channel, struct cma_event_channel, channel);
-	DeleteCriticalSection(&chan->lock);	
-	delete chan;
-}
-
-static int cma_event_channel_insert_id(struct rdma_event_channel *channel,
-									   struct cma_id_private *id_priv)
-{
-	struct cma_event_channel *chan;
-	int ret = 0;
-
-	chan = CONTAINING_RECORD(channel, struct cma_event_channel, channel);
-
-	EnterCriticalSection(&chan->lock);
-	if (chan->count == MAXIMUM_WAIT_OBJECTS) {
-		ret = -1;
-		goto out;
-	}
-
-	chan->id[chan->count] = id_priv;
-	chan->event[chan->count] = id_priv->id.overlap.hEvent;
-	id_priv->channel_index = chan->count++;
-out:
-	LeaveCriticalSection(&chan->lock);
-	return ret;
-}
-
-/*
- * TODO: we cannot call cma_event_channel_remove_id() while another
- * thread is calling rdma_get_event().  If this is needed, then we
- * need to halt the rdma_get_event() thread, modify the event list,
- * then restart the rdma_get_event() thread.
- */
-static void cma_event_channel_remove_id(struct rdma_event_channel *channel,
-										struct cma_id_private *id_priv)
-{
-	struct cma_event_channel *chan;
-
-	chan = CONTAINING_RECORD(channel, struct cma_event_channel, channel);
-
-	EnterCriticalSection(&chan->lock);
-	chan->count--;
-	chan->id[id_priv->channel_index] = chan->id[chan->count];
-	chan->event[id_priv->channel_index] = chan->event[chan->count];
-	chan->id[id_priv->channel_index]->channel_index = id_priv->channel_index;
-	LeaveCriticalSection(&chan->lock);
+	CompChannelCleanup(&channel->channel);
+	delete channel;
 }
 
 __declspec(dllexport)
@@ -301,43 +241,26 @@ int rdma_create_id(struct rdma_event_cha
 	}
 
 	RtlZeroMemory(id_priv, sizeof(struct cma_id_private));
-	id_priv->id.overlap.hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
-	if (id_priv->id.overlap.hEvent == NULL) {
-		goto err1;
-	}
-
 	id_priv->id.context = context;
 	id_priv->id.channel = channel;
 	id_priv->id.ps = ps;
+	CompEntryInit(&channel->channel, &id_priv->id.comp_entry);
 
 	if (ps == RDMA_PS_TCP) {
-		hr = prov->CreateConnectEndpoint(&id_priv->id.ep.connect);
+		hr = windata.prov->CreateConnectEndpoint(&id_priv->id.ep.connect);
 	} else {
-		hr = prov->CreateDatagramEndpoint(&id_priv->id.ep.datagram);
+		hr = windata.prov->CreateDatagramEndpoint(&id_priv->id.ep.datagram);
 	}
 	if (FAILED(hr)) {
-		goto err2;
-	}
-
-	hr = cma_event_channel_insert_id(channel, id_priv);
-	if (FAILED(hr)) {
-		goto err3;
+		goto err;
 	}
 
 	*id = &id_priv->id;
 	return 0;
 
-err3:
-	if (ps == RDMA_PS_TCP) {
-		id_priv->id.ep.connect->Release();
-	} else {
-		id_priv->id.ep.datagram->Release();
-	}
-err2:
-	CloseHandle(id_priv->id.overlap.hEvent);
-err1:
+err:
 	delete id_priv;
-	return -1;
+	return hr;
 }
 
 static void ucma_destroy_listen(struct cma_id_private *id_priv)
@@ -354,7 +277,7 @@ static void ucma_destroy_listen(struct c
 __declspec(dllexport)
 int rdma_destroy_id(struct rdma_cm_id *id)
 {
-	struct cma_id_private	*id_priv;
+	struct cma_id_private *id_priv;
 
 	id_priv = CONTAINING_RECORD(id, struct cma_id_private, id);
 	if (id->ps == RDMA_PS_TCP) {
@@ -363,7 +286,7 @@ int rdma_destroy_id(struct rdma_cm_id *i
 		id->ep.datagram->CancelOverlappedRequests();
 	}
 
-	cma_event_channel_remove_id(id->channel, id_priv);
+	CompChannelRemoveEntry(&id->channel->channel, &id->comp_entry);
 
 	if (id_priv->backlog > 0) {
 		ucma_destroy_listen(id_priv);
@@ -538,7 +461,8 @@ int rdma_resolve_addr(struct rdma_cm_id 
 	RtlCopyMemory(&id->route.addr.dst_addr, dst_addr, ucma_addrlen(dst_addr));
 	id_priv = CONTAINING_RECORD(id, struct cma_id_private, id);
 	id_priv->state = cma_addr_resolve;
-	SetEvent(id->overlap.hEvent);
+
+	CompEntryPost(&id->comp_entry);
 	return 0;
 }
 
@@ -563,7 +487,8 @@ int rdma_resolve_route(struct rdma_cm_id
 
 	id_priv = CONTAINING_RECORD(id, struct cma_id_private, id);
 	id_priv->state = cma_route_resolve;
-	SetEvent(id->overlap.hEvent);
+
+	CompEntryPost(&id->comp_entry);
 	return 0;
 }
 
@@ -583,7 +508,7 @@ static int ucma_modify_qp_init(struct cm
 		return hr;
 	}
 
-	qp_attr.pkey_index = (uint16_t) index;
+	qp_attr.pkey_index = index;
 	return ibv_modify_qp(qp, &qp_attr, (enum ibv_qp_attr_mask)
 						 (IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT));
 }
@@ -689,7 +614,7 @@ int rdma_connect(struct rdma_cm_id *id, 
 
 	id_priv->state = cma_active_connect;
 	hr = id->ep.connect->Connect(id->qp->conn_handle, &id->route.addr.dst_addr,
-								 &attr, &id->overlap);
+								 &attr, &id->comp_entry.Overlap);
 	if (FAILED(hr) && hr != WV_IO_PENDING) {
 		id_priv->state = cma_route_resolve;
 		return hr;
@@ -715,13 +640,16 @@ static int ucma_get_request(struct cma_i
 
 	if (listen->id.ps == RDMA_PS_TCP) {
 		hr = listen->id.ep.connect->GetRequest(id_priv->id.ep.connect,
-											   &id_priv->id.overlap);
+											   &id_priv->id.comp_entry.Overlap);
 	} else {
 		hr = listen->id.ep.datagram->GetRequest(id_priv->id.ep.datagram,
-												&id_priv->id.overlap);
+												&id_priv->id.comp_entry.Overlap);
+	}
+	if (FAILED(hr) && hr != WV_IO_PENDING) {
+		return hr;
 	}
 
-	return (FAILED(hr) && hr != WV_IO_PENDING) ? hr : 0;
+	return 0;
 }
 
 __declspec(dllexport)
@@ -731,7 +659,7 @@ int rdma_listen(struct rdma_cm_id *id, i
 	HRESULT hr;
 	int i;
 
-	if (backlog <= 0 || backlog > CMA_DEFAULT_BACKLOG) {
+	if (backlog <= 0) {
 		backlog = CMA_DEFAULT_BACKLOG;
 	}
 
@@ -784,7 +712,8 @@ int rdma_accept(struct rdma_cm_id *id, s
 	}
 
 	id_priv->state = cma_accepting;
-	hr = id->ep.connect->Accept(id->qp->conn_handle, &attr, &id->overlap);
+	hr = id->ep.connect->Accept(id->qp->conn_handle, &attr,
+								&id->comp_entry.Overlap);
 	if (FAILED(hr) && hr != WV_IO_PENDING) {
 		id_priv->state = cma_disconnected;
 		return hr;
@@ -848,9 +777,10 @@ int rdma_ack_cm_event(struct rdma_cm_eve
 static int ucma_process_conn_req(struct cma_event *event)
 {
 	struct cma_id_private *listen;
-	HRESULT hr = 0;
+	struct cma_event_channel *chan;
 
 	listen = (struct cma_id_private *) event->id_priv->id.context;
+	ucma_get_request(listen, event->id_priv->index);
 
 	if (SUCCEEDED(event->event.status)) {
 		event->event.status = ucma_query_connect(&event->id_priv->id,
@@ -860,25 +790,18 @@ static int ucma_process_conn_req(struct 
 	if (SUCCEEDED(event->event.status)) {
 		event->event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
 		event->id_priv->state = cma_passive_connect;
-
-		listen->req_list[event->id_priv->index] = NULL;
-		ucma_get_request(listen, event->id_priv->index);
 	} else {
-		hr = listen->id.ep.connect->GetRequest(event->id_priv->id.ep.connect,
-											   &event->id_priv->id.overlap);
-		if (hr == WV_IO_PENDING) {
-			hr = 0;
-		}
+		rdma_destroy_id(&event->id_priv->id);
 	}
 
-	return hr;
+	return event->event.status;
 }
 
 static int ucma_process_conn_resp(struct cma_event *event)
 {
-	struct rdma_cm_id	*id;
-	WV_CONNECT_PARAM	attr;
-	HRESULT				hr;
+	struct rdma_cm_id *id;
+	WV_CONNECT_PARAM attr;
+	HRESULT hr;
 
 	if (FAILED(event->event.status)) {
 		goto err;
@@ -888,7 +811,8 @@ static int ucma_process_conn_resp(struct
 	event->id_priv->state = cma_accepting;
 
 	id = &event->id_priv->id;
-	hr = id->ep.connect->Accept(id->qp->conn_handle, &attr, &id->overlap);
+	hr = id->ep.connect->Accept(id->qp->conn_handle, &attr,
+								&id->comp_entry.Overlap);
 	if (FAILED(hr) && hr != WV_IO_PENDING) {
 		event->event.status = hr;
 		goto err;
@@ -906,16 +830,18 @@ err:
 
 static void ucma_process_establish(struct cma_event *event)
 {
+	struct cma_id_private *id_priv = event->id_priv;
+
 	if (SUCCEEDED(event->event.status)) {
-		event->event.status = ucma_query_connect(&event->id_priv->id,
+		event->event.status = ucma_query_connect(&id_priv->id,
 												 &event->event.param.conn);
 	}
 
 	if (SUCCEEDED(event->event.status)) {
 		event->event.event = RDMA_CM_EVENT_ESTABLISHED;
-		event->id_priv->state = cma_connected;
 
-		event->id_priv->id.ep.connect->NotifyDisconnect(&event->id_priv->id.overlap);
+		id_priv->state = cma_connected;
+		id_priv->id.ep.connect->NotifyDisconnect(&id_priv->id.comp_entry.Overlap);
 	} else {
 		event->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
 		event->id_priv->state = cma_disconnected;
@@ -962,12 +888,10 @@ __declspec(dllexport)
 int rdma_get_cm_event(struct rdma_event_channel *channel,
 					  struct rdma_cm_event **event)
 {
-	struct cma_event_channel *chan;
 	struct cma_event *evt;
-	struct cma_id_private *id_priv;
 	struct rdma_cm_id *id;
-	DWORD bytes;
-	HRESULT hr;
+	COMP_ENTRY *entry;
+	DWORD bytes, ret;
 
 	evt = new struct cma_event;
 	if (evt == NULL) {
@@ -977,28 +901,20 @@ int rdma_get_cm_event(struct rdma_event_
 	do {
 		RtlZeroMemory(evt, sizeof(struct cma_event));
 
-		chan = CONTAINING_RECORD(channel, struct cma_event_channel, channel);
-		hr = WaitForMultipleObjects(chan->count, chan->event, FALSE,
-									chan->channel.timeout);
-		if (hr == WAIT_TIMEOUT) {
-			return hr;
-		} else if (hr == WAIT_FAILED) {
-			return HRESULT_FROM_WIN32(GetLastError());
+		ret = CompChannelPoll(&channel->channel, &entry);
+		if (ret) {
+			return ret;
 		}
 
-		EnterCriticalSection(&chan->lock);
-		evt->id_priv = chan->id[hr];
-		LeaveCriticalSection(&chan->lock);
-
-		id = &evt->id_priv->id;
+		id = CONTAINING_RECORD(entry, struct rdma_cm_id, comp_entry);
+		evt->id_priv = CONTAINING_RECORD(id, struct cma_id_private, id);
 		evt->event.id = id;
 		evt->event.param.conn.private_data = evt->private_data;
-		if (id->ep.connect->GetOverlappedResult(&id->overlap, &bytes, FALSE) == 0) {
-			evt->event.status = HRESULT_FROM_WIN32(GetLastError());
-		}
+		evt->event.status = id->ep.connect->
+							GetOverlappedResult(&entry->Overlap, &bytes, FALSE);
 
-		hr = ucma_process_event(evt);
-	} while (FAILED(hr));
+		ret = ucma_process_event(evt);
+	} while (ret);
 	
 	*event = &evt->event;
 	return 0;
@@ -1069,16 +985,6 @@ int rdma_set_option(struct rdma_cm_id *i
 __declspec(dllexport)
 int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
 {
-	struct cma_id_private *id_priv;
-
-	id_priv = CONTAINING_RECORD(id, struct cma_id_private, id);
-	cma_event_channel_remove_id(id->channel, id_priv);
-	/*
-	 * TODO: To support calling this routine while processing events on the old
-	 * channel, we need to wait for all old events to be acknowledged.
-	 */
 	id->channel = channel;
-	cma_event_channel_insert_id(channel, id_priv);
-
 	return 0;
 }





More information about the ofw mailing list