[ofw] winverbs ND provider

Sean Hefty sean.hefty at intel.com
Wed Jan 13 10:22:33 PST 2010


>Just making sure, we are talking about 2008 64 bits right?

yes - here's the code I was using:

@@ -172,7 +173,11 @@ BindAddress(SOCKADDR* pAddress)
 	DWORD				bytes;
 	int					len;
 	HRESULT				hr;
+LARGE_INTEGER start_time, end_time;
+LARGE_INTEGER freq;
+double run_time;
 
+QueryPerformanceFrequency(&freq);
 	if (pAddress->sa_family == AF_INET) {
 		any = (((SOCKADDR_IN *) pAddress)->sin_addr.S_un.S_addr ==
INADDR_ANY);
 		bytes = sizeof(SOCKADDR_IN);
@@ -184,34 +189,54 @@ BindAddress(SOCKADDR* pAddress)
 	if (any) {
 		RtlZeroMemory(&attr.Device, sizeof attr.Device);
 	} else {
+QueryPerformanceCounter(&start_time);
 		hr = m_pProvider->TranslateAddress(pAddress, (WV_DEVICE_ADDRESS
*) &attr.Device);
 		if (FAILED(hr)) {
 			return hr;
 		}
+QueryPerformanceCounter(&end_time);
+run_time = (double) (end_time.QuadPart - start_time.QuadPart) / (double)
freq.QuadPart;
+printf("translate address: %.6f us\n", run_time * 1000000);
 	}
 
+QueryPerformanceCounter(&start_time);
 	m_Socket = socket(pAddress->sa_family, SOCK_STREAM, IPPROTO_TCP);
 	if (m_Socket == INVALID_SOCKET) {
 		return WvConvertWSAStatus(WSAGetLastError());
 	}
+QueryPerformanceCounter(&end_time);
+run_time = (double) (end_time.QuadPart - start_time.QuadPart) / (double)
freq.QuadPart;
+printf("socket: %.6f us\n", run_time * 1000000);
 
+QueryPerformanceCounter(&start_time);
 	hr = bind(m_Socket, pAddress, bytes);
 	if (FAILED(hr)) {
 		goto get_err;
 	}
+QueryPerformanceCounter(&end_time);
+run_time = (double) (end_time.QuadPart - start_time.QuadPart) / (double)
freq.QuadPart;
+printf("socket bind: %.6f us\n", run_time * 1000000);
 
 	attr.Id = m_Id;
 	len = sizeof attr.Address;
+QueryPerformanceCounter(&start_time);
 	hr = getsockname(m_Socket, (sockaddr *) &attr.Address, &len);
 	if (FAILED(hr)) {
 		goto get_err;
 	}
+QueryPerformanceCounter(&end_time);
+run_time = (double) (end_time.QuadPart - start_time.QuadPart) / (double)
freq.QuadPart;
+printf("getsockname: %.6f us\n", run_time * 1000000);
 
+QueryPerformanceCounter(&start_time);
 	if (!WvDeviceIoControl(m_hFile, WV_IOCTL_EP_BIND, &attr, sizeof attr,
 						   &attr, sizeof attr, &bytes,
NULL)) {
 		hr = HRESULT_FROM_WIN32(GetLastError());
 		goto err;
 	}
+QueryPerformanceCounter(&end_time);
+run_time = (double) (end_time.QuadPart - start_time.QuadPart) / (double)
freq.QuadPart;
+printf("ep bind ioctl: %.6f us\n", run_time * 1000000);
 
 	return WV_SUCCESS;
 
diff --git a/trunk/ulp/netdirect/user/nd_connect.cpp
b/trunk/ulp/netdirect/user/nd_connect.cpp
index aa46ada..e2c59b8 100644
--- a/trunk/ulp/netdirect/user/nd_connect.cpp
+++ b/trunk/ulp/netdirect/user/nd_connect.cpp
@@ -131,6 +131,11 @@ Connect(INDEndpoint* pEndpoint,
 	WV_CONNECT_PARAM attr;
 	IBAT_PATH_BLOB path;
 	HRESULT hr;
+LARGE_INTEGER start_time, end_time;
+LARGE_INTEGER freq;
+double run_time;
+
+QueryPerformanceFrequency(&freq);
 
 	RtlCopyMemory(&addr, &m_pAdapter->m_Address, AddressLength);
 	if (addr.Sa.sa_family == AF_INET) {
@@ -138,20 +143,32 @@ Connect(INDEndpoint* pEndpoint,
 	} else {
 		addr.Sin6.sin6_port = LocalPort;
 	}
+QueryPerformanceCounter(&start_time);
 	hr = m_pWvConnEp->BindAddress(&addr.Sa);
 	if (FAILED(hr)) {
 		goto out;
 	}
+QueryPerformanceCounter(&end_time);
+run_time = (double) (end_time.QuadPart - start_time.QuadPart) / (double)
freq.QuadPart;
+printf("bind address: %.6f us\n", run_time * 1000000);
 

The time to call BindAddress was 43 milliseconds, which is substantially more
than the sum of the times reported inside the call.  Maybe some thread switching
was going on, but the numbers were consistent between multiple runs.

>As for the ibat resolve:
>
>I have created a very simple "resolve" program and started playing with
>it.
>Without any change resolve takes ~33us
>
>I have changed the resolve code to open the ibat device once and then
>only do the resolve many times; this took ~27us. I guess that the nd code
>can do a similar thing.

I'm sure we can add several optimizations - just not sure that it's worth it.
We could probably cache data in the libraries as well and avoid some of the
calls completely.  This would definitely help micro-benchmark performance, like
ndconn or rdma_cmatose.

>and this only takes me ~22us to complete, which is very far from the
>numbers that you have measured (socket: 75 us, bind: 32 us,
>getsockname: 9 us).

The numbers I gave were from a single run.  I was running ndpingpong and output
the measurements.

>By the way, the entire socket thing doesn't really look like a must to
>me. If we want to create a "unique numbers generator" we can definitely
>make something much more efficient. Does the socket / bind / getsockname
>sequence have another reason?

The intent was to reserve port numbers, preventing the host stack from
allocating the same numbers (needed for iWarp support).  It turns out that
this breaks MS-MPI, so I'm working on a different approach.

>I have also noticed that when running I get the following error:
>[AL]:kal_cep_get_context() !ERROR!: CEP callback mismatch for cid 2326,
>h_al FFFFFA802529A010

I've been running with the free build, but haven't seen this.  I believe that
kal_cep_get_context() is currently only called by the ND support code in the
IBAL driver.  (I have a patch which extends its use.)  I'm not convinced this
error indicates a real problem; it may be an expected situation.

- Sean 




More information about the ofw mailing list