[ofw] [PATCH v2] Add smarter IB address translation

Sean Hefty sean.hefty at intel.com
Wed Jul 2 12:43:03 PDT 2008


>+namespace IBAT
>+{
>+
>+    const IN6_ADDR x_DefaultGid = {0xFE,0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
>+
>+class H
>+{
>+public:
>+    H( HANDLE h = INVALID_HANDLE_VALUE ) : m_h( h ) {};
>+    ~H(){ if( m_h != INVALID_HANDLE_VALUE ) CloseHandle( m_h ); }
>+
>+    H& operator =(HANDLE h){ CloseHandle( m_h ); m_h = h; }
>+    operator HANDLE() const { return m_h; }
>+
>+private:
>+    HANDLE m_h;
>+};
>+
>+#if WINVER >= 0x600
>+HRESULT
>+Resolve(
>+    __in const struct sockaddr* pSrcAddr,
>+    __in const struct sockaddr* pDestAddr,
>+    __out IN6_ADDR* pSrcGid,
>+    __out IN6_ADDR* pDestGid,
>+    __out USHORT* pPkey
>+    )
>+{
>+    if( pSrcAddr->sa_family != pDestAddr->sa_family )
>+        return E_INVALIDARG;
>+
>+    H hIbatDev = CreateFileW( IBAT_WIN32_NAME,
>+        MAXIMUM_ALLOWED, 0, NULL,
>+        OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL );
>+    if( hIbatDev == INVALID_HANDLE_VALUE )
>+        return HRESULT_FROM_WIN32( GetLastError() );
>+
>+    bool fLoopback;
>+    IOCTL_IBAT_IP_TO_PORT_IN port_in;

Variable declarations in the middle of functions make it more difficult to find
where a variable is declared, and requires reading through the entire function
to see if a variable name is already in use.

Why StudlyCaps for one variable, but not the other?  And why the 'f' prefix -
used here and elsewhere on bools?

>+    port_in.Version = IBAT_IOCTL_VERSION;
>+    if( pSrcAddr->sa_family == AF_INET )
>+    {
>+        port_in.Address.IpVersion = 4;
>+        RtlCopyMemory(
>+            &port_in.Address.Address[12],
>+            &((struct sockaddr_in*)pSrcAddr)->sin_addr,
>+            sizeof( ((struct sockaddr_in*)pSrcAddr)->sin_addr ) );
>+
>+        fLoopback = ((struct sockaddr_in*)pDestAddr)->sin_addr.s_addr ==
>+            ((struct sockaddr_in*)pSrcAddr)->sin_addr.s_addr;

Is this sufficient?  What about 127.0.0.1 to a local IP address?

>+    }
>+    else
>+    {
>+        port_in.Address.IpVersion = 6;
>+        RtlCopyMemory(
>+            port_in.Address.Address,
>+            &((struct sockaddr_in6*)pSrcAddr)->sin6_addr,
>+            sizeof(port_in.Address.Address) );
>+        fLoopback = IN6_ADDR_EQUAL(
>+            &((struct sockaddr_in6*)pDestAddr)->sin6_addr,
>+            &((struct sockaddr_in6*)pSrcAddr)->sin6_addr
>+            ) == TRUE;

Can't you just assign IN6_ADDR_EQUAL directly, without the '== TRUE'?

>+    }
>+
>+    IBAT_PORT_RECORD port_out;
>+    DWORD size;
>+    BOOL fSuccess = DeviceIoControl( hIbatDev, IOCTL_IBAT_IP_TO_PORT,
>+        &port_in, sizeof(port_in), &port_out, sizeof(port_out), &size, NULL );
>+
>+    if( !fSuccess )
>+        return HRESULT_FROM_WIN32( GetLastError() );
>+
>+    // Check for loopback.
>+    if( fLoopback )
>+    {
>+        *pSrcGid = x_DefaultGid;

See comment further down about GIDs and IB routers.

>+        RtlCopyMemory(
>+            &pSrcGid->u.Byte[8],
>+            &port_out.PortGuid,
>+            sizeof(port_out.PortGuid) );
>+        *pDestGid = *pSrcGid;
>+        *pPkey = port_out.PKey;
>+        return S_OK;
>+    }
>+
>+    NET_LUID luid;
>+    DWORD ret;
>+    do
>+    {
>+        DWORD iIf;
>+        ret = GetBestInterfaceEx( (struct sockaddr*)pSrcAddr, &iIf );
>+        if( ret != NO_ERROR )
>+            return HRESULT_FROM_WIN32( ret );
>+
>+        // Interface indexes are not constant, so get the LUID mapping for the
>+        // returned interface for use in the rest of the function.
>+        ret = ConvertInterfaceIndexToLuid( iIf, &luid );
>+
>+    } while( ret != NO_ERROR );
>+
>+    SOCKADDR_INET src;
>+    MIB_IPNET_ROW2 net = {0};
>+    net.InterfaceLuid = luid;
>+    switch( pDestAddr->sa_family )
>+    {
>+    case AF_INET:
>+        net.Address.si_family = src.si_family = AF_INET;
>+        net.Address.Ipv4 = *(struct sockaddr_in*)pDestAddr;
>+        src.Ipv4 = *(struct sockaddr_in*)pSrcAddr;
>+        break;
>+
>+    case AF_INET6:
>+        net.Address.si_family = src.si_family = AF_INET6;
>+        net.Address.Ipv6 = *(struct sockaddr_in6*)pDestAddr;
>+        src.Ipv6 = *(struct sockaddr_in6*)pSrcAddr;
>+        break;
>+
>+    default:
>+        return E_INVALIDARG;
>+    }
>+
>+    bool fRetry = true;
>+retry:
>+    ret = GetIpNetEntry2( &net );
>+    if( ret != NO_ERROR )
>+        return HRESULT_FROM_WIN32( ret );
>+
>+    switch( net.State )
>+    {
>+    default:
>+    case NlnsUnreachable:
>+        ret = ResolveIpNetEntry2( &net, &src );
>+        if( ret == ERROR_BAD_NET_NAME && fRetry )
>+        {
>+            fRetry = false;
>+            goto retry;
>+        }
>+        else if( ret != NO_ERROR )
>+        {
>+            return HRESULT_FROM_WIN32( ret );
>+        }
>+        break;
>+
>+    case NlnsReachable:
>+    case NlnsPermanent:
>+        break;
>+
>+    case NlnsIncomplete:
>+        return E_PENDING;
>+    }
>+
>+    if( net.PhysicalAddressLength > 6 )
>+        return E_UNEXPECTED;

Why this check?

>+
>+    IOCTL_IBAT_MAC_TO_GID_IN mac_in;
>+    mac_in.Version = IBAT_IOCTL_VERSION;
>+    mac_in.PortGuid = port_out.PortGuid;
>+    RtlCopyMemory( mac_in.DestMac, net.PhysicalAddress, IBAT_MAC_LEN );
>+
>+    fSuccess = DeviceIoControl( hIbatDev, IOCTL_IBAT_MAC_TO_GID,
>+        &mac_in, sizeof(mac_in), pDestGid, sizeof(*pDestGid), &size, NULL );
>+    if( !fSuccess )
>+        return HRESULT_FROM_WIN32( GetLastError() );
>+
>+    // Use the same subnet prefix as the destination.

There are simple IB routers today that allow IPoIB to span IB subnets.  If we
know the local port GUID, we should be able to lookup any usable GID from that
port using verbs.

>+    *pSrcGid = *pDestGid;
>+    RtlCopyMemory( &pSrcGid->u.Byte[8], &port_out.PortGuid,
>sizeof(port_out.PortGuid) );
>+    *pPkey = port_out.PKey;
>+    return S_OK;
>+}
>+#else   // Back compatibility with Windows Server 2003
>+
>+
>+static HRESULT
>+GetDestMac(
>+    __in struct sockaddr_in* pDestAddr,
>+    __out BYTE* pDestMac
>+    )
>+{
>+    DWORD ret;
>+
>+    MIB_IPNETTABLE* pTable = NULL;
>+    ULONG len = 0;
>+    do
>+    {
>+        ret = GetIpNetTable( pTable, &len, FALSE );
>+        if( ret != ERROR_INSUFFICIENT_BUFFER )
>+            break;
>+
>+        if( pTable != NULL )
>+        {
>+            HeapFree( GetProcessHeap(), 0, pTable );
>+        }

sometimes there are curly braces after one-line if statements, sometimes there
aren't...

>+
>+        pTable = (MIB_IPNETTABLE*)HeapAlloc( GetProcessHeap(), 0, len );
>+    } while( ret == ERROR_INSUFFICIENT_BUFFER );
>+
>+    if( ret != NO_ERROR )
>+    {
>+        if( pTable != NULL )
>+        {
>+            HeapFree( GetProcessHeap(), 0, pTable );
>+        }
>+        return HRESULT_FROM_WIN32( ret );
>+    }
>+
>+    ret = ERROR_NOT_SUPPORTED;
>+    DWORD i;
>+    for( i = 0; i < pTable->dwNumEntries; i++ )
>+    {
>+        if( pTable->table[i].dwType == MIB_IPNET_TYPE_OTHER ||
>+            pTable->table[i].dwType == MIB_IPNET_TYPE_INVALID )
>+        {
>+            continue;
>+        }
>+
>+        if( pTable->table[i].dwAddr !=
>+            ((struct sockaddr_in*)pDestAddr)->sin_addr.s_addr )
>+        {
>+            continue;
>+        }
>+
>+        if( pTable->table[i].dwPhysAddrLen != IBAT_MAC_LEN )
>+        {
>+            continue;
>+        }
>+
>+        RtlCopyMemory( pDestMac, pTable->table[i].bPhysAddr, IBAT_MAC_LEN );
>+        ret = S_OK;
>+        break;
>+    }
>+    HeapFree( GetProcessHeap(), 0, pTable );
>+
>+    return HRESULT_FROM_WIN32( ret );
>+}
>+
>+HRESULT
>+Resolve(
>+    __in const struct sockaddr* pSrcAddr,
>+    __in const struct sockaddr* pDestAddr,
>+    __out IN6_ADDR* pSrcGid,
>+    __out IN6_ADDR* pDestGid,
>+    __out USHORT* pPkey
>+    )
>+{
>+    if( pDestAddr->sa_family != AF_INET )
>+        return E_NOTIMPL;
>+
>+    H hIbatDev = CreateFileW( IBAT_WIN32_NAME,
>+        MAXIMUM_ALLOWED, 0, NULL,
>+        OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL );
>+    if( hIbatDev == INVALID_HANDLE_VALUE )
>+        return HRESULT_FROM_WIN32( GetLastError() );
>+
>+    IOCTL_IBAT_IP_TO_PORT_IN port_in;
>+    port_in.Version = IBAT_IOCTL_VERSION;
>+    port_in.Address.IpVersion = 4;
>+    RtlCopyMemory(
>+        &port_in.Address.Address[12],
>+        &((struct sockaddr_in*)pSrcAddr)->sin_addr,
>+        sizeof( ((struct sockaddr_in*)pSrcAddr)->sin_addr ) );
>+
>+    IBAT_PORT_RECORD port_out;
>+    DWORD size;
>+    BOOL fSuccess = DeviceIoControl( hIbatDev, IOCTL_IBAT_IP_TO_PORT,
>+        &port_in, sizeof(port_in), &port_out, sizeof(port_out), &size, NULL );
>+
>+    if( !fSuccess )
>+        return HRESULT_FROM_WIN32( GetLastError() );
>+
>+    // Check for loopback.
>+    if( ((struct sockaddr_in*)pDestAddr)->sin_addr.s_addr ==
>+        ((struct sockaddr_in*)pSrcAddr)->sin_addr.s_addr )
>+    {
>+        *pSrcGid = x_DefaultGid;
>+        RtlCopyMemory(
>+            &pSrcGid->u.Byte[8],
>+            &port_out.PortGuid,
>+            sizeof(port_out.PortGuid) );

this format for calling RtlCopyMemory differs from.. :

>+        *pDestGid = *pSrcGid;
>+        *pPkey = port_out.PKey;
>+        return S_OK;
>+    }
>+
>+    IOCTL_IBAT_MAC_TO_GID_IN mac_in;
>+    mac_in.Version = IBAT_IOCTL_VERSION;
>+    mac_in.PortGuid = port_out.PortGuid;
>+    HRESULT hr = GetDestMac( (struct sockaddr_in*)pDestAddr, mac_in.DestMac );
>+    if( FAILED( hr ) )
>+    {
>+        ULONG len = sizeof(mac_in.DestMac);
>+        DWORD ret = SendARP(
>+            ((struct sockaddr_in*)pDestAddr)->sin_addr.s_addr,
>+            ((struct sockaddr_in*)pSrcAddr)->sin_addr.s_addr,
>+            (ULONG*)mac_in.DestMac,
>+            &len
>+            );
>+        if( ret != NO_ERROR )
>+            return HRESULT_FROM_WIN32( ret );
>+    }
>+
>+    fSuccess = DeviceIoControl( hIbatDev, IOCTL_IBAT_MAC_TO_GID,
>+        &mac_in, sizeof(mac_in), pDestGid, sizeof(*pDestGid), &size, NULL );
>+    if( !fSuccess )
>+        return HRESULT_FROM_WIN32( GetLastError() );
>+
>+    // Use the same subnet prefix as the destination.
>+    *pSrcGid = *pDestGid;
>+    RtlCopyMemory( &pSrcGid->u.Byte[8], &port_out.PortGuid,
>sizeof(port_out.PortGuid) );

..this format

I prefer the more condensed format, to keep the focus on the flow of the code
and overall function, rather than on the parameters to a single call.

>+    *pPkey = port_out.PKey;
>+    return S_OK;
>+}
>+
>+#endif
>+}
>+
>+extern "C"
>+{
>+
>+HRESULT
>+IbatResolve(
>+    __in const struct sockaddr* pSrcAddr,
>+    __in const struct sockaddr* pDestAddr,
>+    __out IN6_ADDR* pSrcGid,
>+    __out IN6_ADDR* pDestGid,
>+    __out USHORT* pPkey
>+    )
>+{
>+    return IBAT::Resolve( pSrcAddr, pDestAddr, pSrcGid, pDestGid, pPkey );
>+}
>+
>+} /* extern "C" */


>diff -up -N -r -X trunk\docs\dontdiff.txt -I \$Id:
>old\hw\mthca\kernel\mthca_log.rc trunk\hw\mthca\kernel\mthca_log.rc
>--- old\hw\mthca\kernel\mthca_log.rc    Wed Jul 02 11:53:02 2008
>+++ trunk\hw\mthca\kernel\mthca_log.rc  Wed Dec 31 16:00:00 1969
>@@ -1,2 +0,0 @@
>-LANGUAGE 0x9,0x1
>-1 11 MSG00001.bin

auto-generated file ended up in the patch...

>diff -up -N -r -X trunk\docs\dontdiff.txt -I \$Id: old\inc\iba\ib_at_ioctl.h
>trunk\inc\iba\ib_at_ioctl.h
>--- old\inc\iba\ib_at_ioctl.h   Wed Jul 02 11:53:02 2008
>+++ trunk\inc\iba\ib_at_ioctl.h Wed Jul 02 11:50:14 2008
>@@ -1,6 +1,7 @@
> /*
>  * Copyright (c) 2005 Mellanox Technologies.  All rights reserved.
>  * Copyright (c) 2005 SilverStorm Technologies.  All rights reserved.
>+ * Portions Copyright (c) 2008 Microsoft Corporation.  All rights reserved.
>  *
>  * This software is available to you under the OpenIB.org BSD license
>  * below:
>@@ -39,7 +40,7 @@
> #define        _IB_AT_IOCTL_H_
>
>
>-#define        IBAT_IOCTL_VERSION              4
>+#define        IBAT_IOCTL_VERSION              5
>
> #define        IBAT_MAC_LEN                    6
>
>@@ -56,6 +57,7 @@ typedef struct _IBAT_PORT_RECORD
> {
>        UINT64                          CaGuid;
>        UINT64                          PortGuid;
>+       UINT16                          PKey;
>        UINT8                           PortNum;

It would be better to pad out all IOCTL stuctures to 64 bits, since the compiler
will do it for us anyway, and we get reserved space for later use.

>+    LONGLONG StartTime = GetElapsedTime();
>+    for( int i = 0; i < 2000; i++ )
>+    {
>+        HRESULT hr = IBAT::Resolve(
>+            (struct sockaddr*)&srcAddr,
>+            (struct sockaddr*)&destAddr,
>+            &srcGid,
>+            &destGid,
>+            &pkey
>+            );
>+        if( FAILED( hr ) )
>+        {
>+            printf( "Resolve returned %08x.\n", hr );
>+            return hr;
>+        }
>+    }
>+    LONGLONG RunTime = GetElapsedTime() - StartTime;
>+    double Rate = 2000.0 / ((double)RunTime / (double)GetFrequency());
>+    printf( "%7.2f lookups per second\n", Rate );

It would be nice to get the time for the first lookup too, to get a rough idea
of possible ARP processing time.  Not a big deal.

>@@ -142,6 +142,7 @@ __ibat_get_ports(
>                pAdapter = CONTAINING_RECORD( pItem, ipoib_adapter_t, entry );
>                pOut->Ports[pOut->NumPorts].CaGuid = pAdapter->guids.ca_guid;
>                pOut->Ports[pOut->NumPorts].PortGuid = pAdapter-
>>guids.port_guid.guid;
>+               pOut->Ports[pOut->NumPorts].PKey = IB_DEFAULT_PKEY;

We should be able to read this from verbs, or maybe the ipoib multicast group.

- Sean




More information about the ofw mailing list