[ofw] [PATCH] ipoib-CM 5 of 13

Smith, Stan stan.smith at intel.com
Tue Jan 11 18:31:06 PST 2011


signed-off-by: stan smith <stan.smith at intel.com>

--- A/ulp/ipoib_NDIS6_CM/kernel/ipoib_port.cpp  Tue Jan 11 17:55:39 2011
+++ B/ulp/ipoib_NDIS6_CM/kernel/ipoib_port.cpp  Tue Jan 11 17:52:47 2011
@@ -45,6 +45,9 @@
 #include "wdm.h"
 #include <ntddk.h>

+#include <kernel\ip_packet.h>
+#include <netiodef.h>
+
 extern ULONG g_ipoib_send;
 extern ULONG g_ipoib_send_ack;
 extern ULONG g_ipoib_send_SW;
@@ -55,10 +58,6 @@
 extern ULONG g_ipoib_send_SG_failed;
 extern ULONG g_ipoib_send_reset;

-
-
-
-
 ib_gid_t       bcast_mgid_template = {
        0xff,                                                           /* multicast field */
        0x12,                                                           /* scope (to be filled in) */
@@ -75,13 +74,23 @@
 #endif

 static KDEFERRED_ROUTINE __port_mcast_garbage_dpc;
-static void __port_mcast_garbage_dpc(KDPC *p_gc_dpc,void *context,void *s_arg1, void *s_arg2);

-static void __port_do_mcast_garbage(ipoib_port_t* const        p_port );
+static void
+__port_mcast_garbage_dpc(KDPC *p_gc_dpc, void *context, void *s_arg1, void *s_arg2);
+
+static void
+__port_do_mcast_garbage(ipoib_port_t* const    p_port );
+
+static ipoib_endpt_t *
+ipoib_mac_to_endpt(
+       IN                              ipoib_port_t* const                     p_port,
+       IN              const   mac_addr_t*                                     mac );
+
+#if IPOIB_CM && DBG

-#if 0
 #ifndef _IPOIB_DEBUG_NDIS6
 #define _IPOIB_DEBUG_NDIS6
+
 CL_INLINE void CL_API
 cl_qlist_check_validity(
        IN      cl_qlist_t* const       p_list )
@@ -105,6 +114,7 @@
 }
 #endif
 #endif
+
 /******************************************************************************
 *
 * Declarations
@@ -137,9 +147,6 @@
        IN              ipoib_port_t* const                                     p_port,
        IN              ib_ca_attr_t**                                          pp_ca_attrs );

-static void
-__srq_async_event_cb(
-IN                     ib_async_event_rec_t            *p_event_rec );

 /******************************************************************************
 *
@@ -193,7 +200,7 @@
        IN                              void*                                           context,
                OUT                     cl_pool_item_t** const          pp_pool_item );

-#if !IPOIB_INLINE_RECV
+#if !IPOIB_INLINE_RECV || UD_NBL_IN_DESC
 static void
 __recv_dtor(
        IN              const   cl_pool_item_t* const           p_pool_item,
@@ -249,7 +256,7 @@
        IN                              ipoib_port_t* const                     p_port );

 /* Posts receive buffers to the receive queue. */
-int32_t
+static int32_t
 __recv_mgr_repost(
        IN                              ipoib_port_t* const                     p_port );

@@ -388,7 +395,7 @@
        IN                              eth_hdr_t* const                        p_eth_hdr,
        IN                              MDL* const                                      p_mdl,
        IN              const   size_t                                          mdl_len,
-       IN                              ipoib_send_NB_SG                        *s_buf);
+       IN                              ipoib_send_NB_SG                        *s_buf );


 static void
@@ -488,33 +495,33 @@

 }

-//TODO CM Restore
-#if 0
+#if IPOIB_CM

-static NDIS_STATUS
-__send_fragments(
-       IN              ipoib_port_t* const                                     p_port,
-       IN              ipoib_send_desc_t* const                        p_desc,
-       IN              eth_hdr_t* const                                        p_eth_hdr,
-       IN              ip_hdr_t* const                                         p_ip_hdr,
-       IN              uint32_t                                                        buf_len,
-       IN              NDIS_BUFFER*                                            p_ndis_buf );
+static uint32_t dump_sgl( IN  PSCATTER_GATHER_LIST     p_sgl, int verbose );

+static NDIS_STATUS
+__build_ipv4_fragments(
+       IN              ipoib_send_NB_SG*                       s_buf,
+       IN              ip_hdr_t* const                         p_ip_hdr,
+       IN              uint32_t                                        buf_len,
+       IN              uint32_t                                        ip_packet_len,
+       IN              MDL*                                            p_mdl );

 static void
 __update_fragment_ip_hdr(
-IN             ip_hdr_t* const         p_ip_hdr,
-IN             uint16_t                        fragment_size,
-IN             uint16_t                        fragment_offset,
-IN             BOOLEAN                         more_fragments );
+       IN              ip_hdr_t* const         p_ip_hdr,
+       IN              uint16_t                        fragment_size,
+       IN              uint16_t                        fragment_offset,
+       IN              BOOLEAN                         more_fragments );

 static void
 __copy_ip_options(
-IN             uint8_t*                        p_buf,
-IN             uint8_t*                        p_options,
-IN             uint32_t                        options_len,
-IN             BOOLEAN                         copy_all );
+       IN              uint8_t*                        p_buf,
+       IN              uint8_t*                        p_options,
+       IN              uint32_t                        options_len,
+       IN              BOOLEAN                         copy_all );
 #endif
+
 /******************************************************************************
 *
 * Endpoint manager operations
@@ -641,6 +648,10 @@
        IN                              void                                            *context );


+#if DBG
+char *ref_cnt_str(int type);
+#endif
+
 static int
 __gid_cmp(
        IN              const   void* const                                     p_key1,
@@ -653,14 +664,13 @@
 inline void ipoib_port_ref( ipoib_port_t * p_port, int type )
 {
        cl_obj_ref( &p_port->obj );
-#if DBG
-       cl_atomic_inc( &p_port->ref[type % ref_mask] );
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
-                       ("Port[%d] refcount raised to %d\n", p_port->port_num, p_port->obj.ref_cnt));
-
-       if ((p_port->obj.ref_cnt % 20)==0)
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
-               ("ref type %d ref_cnt %d\n", type, p_port->obj.ref_cnt) );
+#if _PORT_REFS
+       int32_t r = cl_atomic_inc( &p_port->ref[type % ref_mask] );
+
+       if( ((p_port->obj.ref_cnt % 20) == 0) || p_port->obj.ref_cnt < 10 )
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
+                       ("ref type %d '%s' refs %d port refs %d\n",
+                               type, ref_cnt_str(type), r, p_port->obj.ref_cnt) );
 #else
        UNREFERENCED_PARAMETER(type);
 #endif
@@ -670,14 +680,13 @@
 inline void ipoib_port_deref(ipoib_port_t * p_port, int type)
 {
        cl_obj_deref( &p_port->obj );
-#if DBG
-       cl_atomic_dec( &p_port->ref[type % ref_mask] );
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
-               ("Port[%d] refcount-- to %d\n", p_port->port_num, p_port->obj.ref_cnt));
-
-       if ((p_port->obj.ref_cnt % 20) == 0)
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
-               ("deref type %d ref_cnt %d\n", type, p_port->obj.ref_cnt) );
+#if _PORT_REFS
+       int32_t r = cl_atomic_dec( &p_port->ref[type % ref_mask] );
+
+       if( ((p_port->obj.ref_cnt % 20) == 0) || p_port->obj.ref_cnt < 10 )
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
+                       ("deref type %d '%s' refs %d port refs %d\n",
+                               type, ref_cnt_str(type), r, p_port->obj.ref_cnt) );
 #else
        UNREFERENCED_PARAMETER(type);
 #endif
@@ -759,6 +768,15 @@
        CL_ASSERT( p_port->p_adapter );
        CL_ASSERT( !p_port->p_adapter->p_port );

+#if DBG
+       if( p_port->obj.ref_cnt > 0 )
+       {
+               dmp_ipoib_port_refs( p_port, "port_destroy()" );
+               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ALL,
+                       ("*** port[%d] ref cnt %d > 0\n",
+                               p_port->port_num, p_port->obj.ref_cnt) );
+       }
+#endif
        cl_obj_destroy( &p_port->obj );

        IPOIB_EXIT( IPOIB_DBG_INIT );
@@ -785,6 +803,7 @@
        __endpt_mgr_construct( p_port );

        p_port->pPoWorkItem = NULL;
+       p_port->pPoWorkItemCM = NULL;

        KeInitializeEvent( &p_port->sa_event, NotificationEvent, TRUE );
        KeInitializeEvent( &p_port->leave_mcast_event, NotificationEvent, TRUE );
@@ -815,7 +834,15 @@
                        ("IoAllocateWorkItem returned NULL\n") );
                return IB_ERROR;
        }
-
+#if IPOIB_CM
+       p_port->pPoWorkItemCM = IoAllocateWorkItem(p_adapter->pdo);
+       if( p_port->pPoWorkItemCM == NULL )
+       {
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("IoAllocateWorkItem returned NULL for CM?\n") );
+               return IB_ERROR;
+       }
+#endif
        cl_status = cl_spinlock_init( &p_port->send_lock );
        if( cl_status != CL_SUCCESS )
        {
@@ -882,18 +909,13 @@
        }

         /* Initialize multicast garbage collector timer and DPC object */
-       KeInitializeDpc(&p_port->gc_dpc,(PKDEFERRED_ROUTINE)__port_mcast_garbage_dpc,p_port);
+       KeInitializeDpc(&p_port->gc_dpc,
+                                       (PKDEFERRED_ROUTINE)__port_mcast_garbage_dpc,p_port);
        KeInitializeTimerEx(&p_port->gc_timer,SynchronizationTimer);

        /* We only ever destroy from the PnP callback thread. */
-       cl_status = cl_obj_init( &p_port->obj, CL_DESTROY_SYNC,
-               __port_destroying, __port_cleanup, __port_free );
-
-#if 0
-       cl_atomic_inc( &p_port->ref[ref_init] );
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
-               ("ref type %d ref_cnt %d\n", ref_init, p_port->obj.ref_cnt) );
-#endif
+       cl_status = cl_obj_init( &p_port->obj, CL_DESTROY_SYNC, __port_destroying,
+                                                               __port_cleanup, __port_free );

        if( cl_status != CL_SUCCESS )
        {
@@ -911,11 +933,6 @@
                return IB_ERROR;
        }

-#if 0
-       cl_atomic_inc( &p_port->ref[ref_init] );
-       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_OBJ,
-               ("ref type %d ref_cnt %d\n", ref_init, p_port->obj.ref_cnt) );
-#endif
        /* The steps of the initialization are as depicted below:
         I.     adapter_init() calls shutter_init(), the shutter counter is set to 0
        II.     ipoib_pnp_cb() calls to ipoib_port_init() that calls to __port_init() that SHOULD set shutter counter to -MAX_OPERATIONS
@@ -960,19 +977,20 @@

        __endpt_mgr_remove_all( p_port );

-#if 0
        if( p_port->p_adapter->params.cm_enabled )
        {
-               endpt_cm_buf_mgr_destroy(  p_port );
+               endpt_cm_buf_mgr_destroy( p_port );
                ipoib_port_srq_destroy( p_port );
                p_port->endpt_mgr.thread_is_done = 1;
                cl_event_signal( &p_port->endpt_mgr.event );
        }
-#endif
+
        cl_spinlock_acquire(&p_port->send_lock);
        ipoib_port_resume( p_port, FALSE );
        cl_spinlock_release(&p_port->send_lock);

+       dmp_ipoib_port_refs( p_port, "port_destroying()" );
+
        IPOIB_EXIT( IPOIB_DBG_INIT );
 }

@@ -1029,6 +1047,9 @@
        }

        IoFreeWorkItem( p_port->pPoWorkItem );
+       if( p_port->pPoWorkItemCM )
+               IoFreeWorkItem( p_port->pPoWorkItemCM );
+
        cl_free( p_port );

        IPOIB_EXIT( IPOIB_DBG_INIT );
@@ -1061,6 +1082,7 @@
        uint64_t                        vaddr;
        net32_t                         rkey;
        ib_qp_attr_t            qp_attr;
+       boolean_t                       cm_enabled = p_port->p_adapter->params.cm_enabled;

        IPOIB_ENTER( IPOIB_DBG_INIT );

@@ -1087,20 +1109,6 @@
                        ("Query CA attributes failed\n" ) );
                return status;
        }
-#if 0
-       if( p_port->p_adapter->params.cm_enabled )
-       {
-               uint32_t payload_mtu = __port_attr_to_mtu_size(
-                                       p_port->p_ca_attrs->p_port_attr[p_port->port_num - 1].mtu )
-                                       - sizeof(ipoib_hdr_t);
-
-               /* adjust ipoib UD payload MTU to actual port MTU size. */
-               p_port->p_adapter->params.payload_mtu =
-                                                                               max( DEFAULT_PAYLOAD_MTU, payload_mtu );
-               p_port->p_adapter->params.xfer_block_size =
-                       (sizeof(eth_hdr_t) + p_port->p_adapter->params.payload_mtu);
-       }
-#endif

 #if IPOIB_USE_DMA
        /* init DMA only once while running MiniportInitialize */
@@ -1285,23 +1293,25 @@
                return status;
        }

-       status = ipoib_port_srq_init( p_port );
-       if( status != IB_SUCCESS )
+       if( p_port->p_adapter->params.cm_enabled )
        {
-               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("ipoib_port_srq_init failed %s\n",
-                       p_port->p_adapter->p_ifc->get_err_str( status )) );
-               /* disable further CM initialization */
-               p_port->p_adapter->params.cm_enabled = FALSE;
+               /* Create a Shared Recv Queue for CM */
+               status = ipoib_port_srq_init( p_port );
+               if( status != IB_SUCCESS )
+               {
+                       IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("ipoib_port_srq_init failed %s\n",
+                               p_port->p_adapter->p_ifc->get_err_str( status )) );
+                       /* disable further CM initialization */
+                       p_port->p_adapter->params.cm_enabled = FALSE;

-               NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
+                       NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
                                EVENT_IPOIB_CONNECTED_MODE_ERR, 1, 0xbadc0de1 );

-       }
-//CM
-#if 0
-       if( p_port->p_adapter->params.cm_enabled )
-       {
+                       p_port->ib_mgr.h_srq = NULL;
+                       goto cm_xit;
+               }
+
                status = endpt_cm_buf_mgr_init( p_port );
                if( status != IB_SUCCESS )
                {
@@ -1312,21 +1322,29 @@

                        NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
                                EVENT_IPOIB_CONNECTED_MODE_ERR, 1, 0xbadc0de2 );
+                       p_port->p_adapter->params.cm_enabled  = 0; // disable CM mode.
                }
                else
                {
                        if ( p_port->p_adapter->params.send_chksum_offload )
                                p_port->p_adapter->params.send_chksum_offload = CSUM_DISABLED;
                }
+cm_xit:
+               if(     cm_enabled && p_port->p_adapter->params.cm_enabled == 0 )
+               {       /* problems in CM resource allocation - release CM resources */
+                       endpt_cm_buf_mgr_destroy( p_port );
+                       ipoib_port_srq_destroy( p_port );
+                       status = IB_SUCCESS;    // good to go in UD mode.
+               }
        }
-#endif
+
        IPOIB_EXIT( IPOIB_DBG_INIT );
        return IB_SUCCESS;
 }

 static void
 __srq_async_event_cb(
-IN                     ib_async_event_rec_t            *p_event_rec )
+       IN              ib_async_event_rec_t            *p_event_rec )
 {
        ipoib_port_t* p_port =
                (ipoib_port_t *)p_event_rec->context;
@@ -1354,8 +1372,9 @@
                        break;
        default:
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                               ("ASYNC EVENT CODE ARRIVED %d(%#x)\n",
-                               p_event_rec->code, p_event_rec->code ) );
+                               ("ASYNC EVENT CODE ARRIVED %s vendor code %#I64d\n",
+                                       ib_get_async_event_str( p_event_rec->code ),
+                                       p_event_rec->vendor_specific) );
        }
 }

@@ -1367,23 +1386,32 @@
        ib_srq_handle_t         h_srq;
        ib_srq_attr_t           srq_attr;

-       IPOIB_ENTER( IPOIB_DBG_INIT );
+       IPOIB_ENTER( IPOIB_DBG_CM );

        if( !p_port->p_adapter->params.cm_enabled )
                return IB_SUCCESS;

-       srq_attr.max_sge = min( 2, p_port->p_ca_attrs->max_srq_sges );
-       srq_attr.srq_limit = 10;
-       srq_attr.max_wr =
-               min( (uint32_t)p_port->p_adapter->params.rq_depth * 8,
-                               p_port->p_ca_attrs->max_srq_wrs/2 );
-
-       ib_status = p_port->p_adapter->p_ifc->create_srq(
-                                                               p_port->ib_mgr.h_pd,
-                                                               &srq_attr,
-                                                               p_port,
-                                                               __srq_async_event_cb,
-                                                               &h_srq );
+       srq_attr.max_sge = MAX_CM_RECV_SGE;
+
+       // if below threshold, then hardware fires async event
+       srq_attr.srq_limit = SRQ_LOW_WATER;
+       srq_attr.max_wr =
+               min( ((uint32_t)p_port->p_adapter->params.rq_depth * 8),
+                               (p_port->p_ca_attrs->max_srq_wrs/2) );
+
+       DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_CONN,
+               ("SRQ max_WR %u = MIN( rq_depth x8 %u, ca.max_srq_wrs/2 %u)\n",
+                       srq_attr.max_wr,
+                       (uint32_t)p_port->p_adapter->params.rq_depth * 8,
+                       (p_port->p_ca_attrs->max_srq_wrs/2)) );
+
+       p_port->ib_mgr.srq_qp_cnt = 0;
+
+       ib_status = p_port->p_adapter->p_ifc->create_srq( p_port->ib_mgr.h_pd,
+                                                                                                         &srq_attr,
+                                                                                                         p_port,
+                                                                                                         __srq_async_event_cb,
+                                                                                                         &h_srq );
        if( ib_status != IB_SUCCESS )
        {
                NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
@@ -1395,7 +1423,7 @@
        }
        p_port->ib_mgr.h_srq = h_srq;

-       IPOIB_EXIT( IPOIB_DBG_INIT );
+       IPOIB_EXIT( IPOIB_DBG_CM );

        return ib_status;
 }
@@ -1458,10 +1486,26 @@

        if( p_port->ib_mgr.h_srq )
        {
-               status =
-                       p_port->p_adapter->p_ifc->destroy_srq( p_port->ib_mgr.h_srq, NULL );
-               CL_ASSERT( status == IB_SUCCESS );
+               int loops=1000;
+               BOOLEAN dispatch = (KeGetCurrentIrql() == DISPATCH_LEVEL);
+
+               /* wait for SRQ-bound QPs to be destroyed */
+               for(; loops > 0 && p_port->ib_mgr.srq_qp_cnt > 0; loops-- )
+               {
+                       if( !dispatch )
+                                       cl_thread_suspend(2);
+               }
+
+               status = p_port->p_adapter->p_ifc->destroy_srq( p_port->ib_mgr.h_srq, NULL );
+               if( status != IB_SUCCESS )
+               {
+                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("Port[%d] destroy_srq() %s\n",
+                                       p_port->port_num,
+                                       p_port->p_adapter->p_ifc->get_err_str( status )) );
+               }
                p_port->ib_mgr.h_srq = NULL;
+               p_port->ib_mgr.srq_qp_cnt = 0;
        }
 }

@@ -1475,8 +1519,7 @@

        if( p_port->ib_mgr.h_ca )
        {
-               status =
-                       p_port->p_adapter->p_ifc->close_ca( p_port->ib_mgr.h_ca, NULL );
+               status = p_port->p_adapter->p_ifc->close_ca( p_port->ib_mgr.h_ca, NULL );
                CL_ASSERT( status == IB_SUCCESS );
                p_port->ib_mgr.h_ca = NULL;
        }
@@ -1513,6 +1556,9 @@
        IN                              ipoib_port_t* const                     p_port )
 {
        cl_status_t             cl_status;
+#if UD_NBL_IN_DESC
+       ib_api_status_t ib_status;
+#endif
        ipoib_params_t  *p_params;
        NET_BUFFER_LIST_POOL_PARAMETERS pool_parameters;

@@ -1523,6 +1569,55 @@

        p_params = &p_port->p_adapter->params;

+#if    UD_NBL_IN_DESC
+
+       /* Allocate the NET BUFFER list pools for receive indication. */
+       memset(&pool_parameters, 0, sizeof(NET_BUFFER_LIST_POOL_PARAMETERS));
+    pool_parameters.Header.Type = NDIS_OBJECT_TYPE_DEFAULT;
+    pool_parameters.Header.Revision = NET_BUFFER_LIST_POOL_PARAMETERS_REVISION_1;
+    pool_parameters.Header.Size = sizeof(pool_parameters);
+    pool_parameters.ProtocolId = NDIS_PROTOCOL_ID_DEFAULT;
+    pool_parameters.ContextSize = 0;
+    pool_parameters.fAllocateNetBuffer = TRUE;
+    pool_parameters.PoolTag = 'CRPI';
+       pool_parameters.DataSize = 0;
+
+    p_port->buf_mgr.h_packet_pool = NdisAllocateNetBufferListPool(
+                                                                                               p_port->p_adapter->h_adapter,
+                                                                                               &pool_parameters );
+
+       if( !p_port->buf_mgr.h_packet_pool )
+       {
+               NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
+                       EVENT_IPOIB_RECV_PKT_POOL, 1, NDIS_STATUS_RESOURCES  );
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("NdisAllocatePacketPool returned %08X\n", (UINT)NDIS_STATUS_RESOURCES) );
+               return IB_INSUFFICIENT_RESOURCES;
+       }
+
+       /* Allocate the receive descriptor pool */
+       cl_status = cl_qpool_init( &p_port->buf_mgr.recv_pool,
+                                                          p_params->rq_depth * p_params->recv_pool_ratio,
+                                                          0,
+                                                          0,
+                                                          sizeof(ipoib_recv_desc_t),
+                                                          __recv_ctor,
+                                                          __recv_dtor,
+                                                          p_port );
+
+       if( cl_status != CL_SUCCESS )
+       {
+               NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
+                       EVENT_IPOIB_RECV_POOL, 1, cl_status );
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("cl_qpool_init for recvs returned %#x\n",
+                       cl_status) );
+               ib_status = IB_INSUFFICIENT_MEMORY;
+               goto pkt_pool_failed;
+       }
+
+#else  !UD_NBL_IN_DESC
+
        /* Allocate the receive descriptor pool */
        cl_status = cl_qpool_init( &p_port->buf_mgr.recv_pool,
                                                           p_params->rq_depth * p_params->recv_pool_ratio,
@@ -1552,7 +1647,7 @@
     pool_parameters.Header.Type = NDIS_OBJECT_TYPE_DEFAULT;
     pool_parameters.Header.Revision = NET_BUFFER_LIST_POOL_PARAMETERS_REVISION_1;
     pool_parameters.Header.Size = sizeof(pool_parameters);
-    pool_parameters.ProtocolId = 0;
+    pool_parameters.ProtocolId = NDIS_PROTOCOL_ID_DEFAULT;
     pool_parameters.ContextSize = 0;
     pool_parameters.fAllocateNetBuffer = TRUE;
     pool_parameters.PoolTag = 'CRPI';
@@ -1570,6 +1665,7 @@
                        ("NdisAllocatePacketPool returned %08X\n", (UINT)NDIS_STATUS_RESOURCES) );
                return IB_INSUFFICIENT_RESOURCES;
        }
+#endif

        /* Allocate the NET buffer list pool for send formatting. */
     pool_parameters.PoolTag = 'XTPI';
@@ -1584,11 +1680,31 @@
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("NdisAllocatePacketPool returned %08X\n",
                        (UINT)NDIS_STATUS_RESOURCES) );
+#if UD_NBL_IN_DESC
+               ib_status = IB_INSUFFICIENT_RESOURCES;
+               goto pkt_pool_failed;
+#else
                return IB_INSUFFICIENT_RESOURCES;
+#endif
        }

        IPOIB_EXIT( IPOIB_DBG_INIT );
        return IB_SUCCESS;
+
+#if UD_NBL_IN_DESC
+pkt_pool_failed:
+       NdisFreeNetBufferListPool( p_port->buf_mgr.h_packet_pool );
+       p_port->buf_mgr.h_packet_pool = NULL;
+       cl_qpool_destroy( &p_port->buf_mgr.recv_pool );
+       if( p_port->buf_mgr.h_send_pkt_pool)
+       {
+               NdisFreeNetBufferListPool( p_port->buf_mgr.h_send_pkt_pool );
+               p_port->buf_mgr.h_send_pkt_pool = NULL;
+       }
+
+       IPOIB_EXIT( IPOIB_DBG_INIT );
+       return ib_status;
+#endif
 }


@@ -1607,11 +1723,19 @@
                NdisFreeNetBufferListPool ( p_port->buf_mgr.h_send_pkt_pool );

        /* Destroy the receive packet and buffer pools. */
+#if UD_NBL_IN_DESC
+       /* Free the receive and send descriptors. */
+       cl_qpool_destroy( &p_port->buf_mgr.recv_pool );
+
+       if( p_port->buf_mgr.h_packet_pool )
+               NdisFreeNetBufferListPool ( p_port->buf_mgr.h_packet_pool );
+#else
        if( p_port->buf_mgr.h_packet_pool )
                NdisFreeNetBufferListPool ( p_port->buf_mgr.h_packet_pool );

        /* Free the receive and send descriptors. */
        cl_qpool_destroy( &p_port->buf_mgr.recv_pool );
+#endif

        /* Free the lookaside list of scratch buffers. */
        NdisDeleteNPagedLookasideList( &p_port->buf_mgr.send_buf_list );
@@ -1672,7 +1796,7 @@
                p_desc->local_ds[1].length = sizeof(recv_buf_t) - ds0_len;
                p_desc->wr.num_ds = 2;
        }
-#else  /* IPOIB_INLINE_RECV */
+#else  /* ! IPOIB_INLINE_RECV */
        /* Allocate the receive buffer. */
        p_desc->p_buf = (recv_buf_t*)cl_zalloc( sizeof(recv_buf_t) );
        if( !p_desc->p_buf )
@@ -1693,14 +1817,68 @@
        p_desc->wr.num_ds = 1;
 #endif /* IPOIB_INLINE_RECV */

+       p_desc->type = PKT_TYPE_UCAST;
+       p_desc->recv_mode = RECV_UD;
+
+#if    UD_NBL_IN_DESC
+       /* setup NDIS NetworkBufferList and MemoryDescriptorList for this Recv desc */
+       p_desc->p_mdl = NdisAllocateMdl(p_port->p_adapter->h_adapter,
+#if IPOIB_INLINE_RECV
+                                                                       &p_desc->buf.eth.pkt,
+#else
+                                                                       p_desc->p_buf,
+#endif
+                                                                       sizeof(ipoib_pkt_t) + sizeof(ib_grh_t) );
+       if( !p_desc->p_mdl )
+       {
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("Failed to allocate MDL\n") );
+               goto err1;
+       }
+
+       p_desc->p_NBL = NdisAllocateNetBufferAndNetBufferList(
+                                               p_port->buf_mgr.h_packet_pool,
+                                               0,
+                                               0,
+                                               p_desc->p_mdl,
+                                               0,
+                                               0);
+
+       if( !p_desc->p_NBL )
+       {
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("Failed to allocate NET_BUFFER_LIST\n") );
+               goto err2;
+       }
+
+       NET_BUFFER_LIST_NEXT_NBL(p_desc->p_NBL) = NULL;
+       IPOIB_PORT_FROM_NBL( p_desc->p_NBL ) = p_port;
+       IPOIB_RECV_FROM_NBL( p_desc->p_NBL ) = p_desc;
+       p_desc->p_NBL->SourceHandle = p_port->p_adapter->h_adapter;
+#endif
+
        *pp_pool_item = &p_desc->item;

        IPOIB_EXIT( IPOIB_DBG_ALLOC );
        return CL_SUCCESS;
-}

+#if UD_NBL_IN_DESC
+err2:
+       NdisFreeMdl( p_desc->p_mdl );
+       p_desc->p_mdl = NULL;

+err1:
 #if !IPOIB_INLINE_RECV
+       cl_free( p_desc->p_buf );
+       p_desc->p_buf = NULL;
+#endif
+       IPOIB_EXIT( IPOIB_DBG_ALLOC );
+       return CL_INSUFFICIENT_MEMORY;
+#endif
+}
+
+
+#if !IPOIB_INLINE_RECV || UD_NBL_IN_DESC
 static void
 __recv_dtor(
        IN              const   cl_pool_item_t* const           p_pool_item,
@@ -1714,9 +1892,23 @@

        p_desc = PARENT_STRUCT( p_pool_item, ipoib_recv_desc_t, item );

+#if UD_NBL_IN_DESC
+       if( p_desc->p_mdl )
+       {
+               NdisFreeMdl( p_desc->p_mdl );
+               p_desc->p_mdl = NULL;
+       }
+       if( p_desc->p_NBL)
+       {
+               NdisFreeNetBufferList(p_desc->p_NBL);
+               p_desc->p_NBL = NULL;
+       }
+#endif
+
+#if !IPOIB_INLINE_RECV
        if( p_desc->p_buf )
                cl_free( p_desc->p_buf );
-
+#endif
        IPOIB_EXIT( IPOIB_DBG_ALLOC );
 }
 #endif
@@ -1727,26 +1919,27 @@
        IN                              ipoib_port_t* const                     p_port )
 {
        ipoib_recv_desc_t       *p_desc;
-       IPOIB_ENTER( IPOIB_DBG_RECV );
+
+       XIPOIB_ENTER( IPOIB_DBG_BUF );
+
        p_desc = (ipoib_recv_desc_t*)cl_qpool_get( &p_port->buf_mgr.recv_pool );

        /* Reference the port object for the recv. */
        if( p_desc )
        {
                ipoib_port_ref( p_port, ref_get_recv );
+               CL_ASSERT( p_desc->recv_mode == RECV_UD );
                CL_ASSERT( p_desc->wr.wr_id == (uintn_t)p_desc );
 #if IPOIB_INLINE_RECV
-               CL_ASSERT( p_desc->local_ds[0].vaddr ==
-                       cl_get_physaddr( &p_desc->buf ) );
-#else  /* IPOIB_INLINE_RECV */
-               CL_ASSERT( p_desc->local_ds[0].vaddr ==
-                       cl_get_physaddr( p_desc->p_buf ) );
+               CL_ASSERT( p_desc->local_ds[0].vaddr == cl_get_physaddr( &p_desc->buf ) );
+#else
+               CL_ASSERT( p_desc->local_ds[0].vaddr == cl_get_physaddr( p_desc->p_buf ) );
                CL_ASSERT( p_desc->local_ds[0].length ==
-                       (sizeof(ipoib_pkt_t) + sizeof(ib_grh_t)) );
-#endif /* IPOIB_INLINE_RECV */
+                                                                               (sizeof(ipoib_pkt_t) + sizeof(ib_grh_t)) );
+#endif
                CL_ASSERT( p_desc->local_ds[0].lkey == p_port->ib_mgr.lkey );
        }
-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       XIPOIB_EXIT( IPOIB_DBG_BUF );
        return p_desc;
 }

@@ -1758,12 +1951,25 @@
        IN                              ipoib_recv_desc_t* const        p_desc,
        IN                              NET_BUFFER_LIST* const          p_net_buffer_list OPTIONAL )
 {
-       NET_BUFFER              *p_buf = NULL;
-       MDL                             *p_mdl = NULL;
-       IPOIB_ENTER(IPOIB_DBG_RECV );
+       IPOIB_ENTER(IPOIB_DBG_BUF );

        if( p_net_buffer_list )
        {
+               NET_BUFFER      *p_buf = NULL;
+               MDL                     *p_mdl = NULL;
+#if UD_NBL_IN_DESC
+               ASSERT( p_desc->p_NBL );
+               ASSERT( p_desc->p_mdl );
+               ASSERT( p_net_buffer_list == p_desc->p_NBL );
+
+               NET_BUFFER_LIST_NEXT_NBL(p_net_buffer_list) = NULL;
+
+               p_buf = NET_BUFFER_LIST_FIRST_NB( p_net_buffer_list );
+               p_mdl = NET_BUFFER_FIRST_MDL( p_buf );
+
+               ASSERT( p_mdl == p_desc->p_mdl );
+               ASSERT( NET_BUFFER_CURRENT_MDL( p_buf ) == p_mdl );
+#else
                NET_BUFFER_LIST_NEXT_NBL(p_net_buffer_list) = NULL;
                p_buf = NET_BUFFER_LIST_FIRST_NB(p_net_buffer_list);
                CL_ASSERT( p_buf );
@@ -1771,7 +1977,19 @@
                CL_ASSERT( p_mdl );
                NdisFreeMdl(p_mdl);
                NdisFreeNetBufferList(p_net_buffer_list);
+#endif
        }
+#if DBG
+       if (p_desc->recv_mode != RECV_UD )
+       {
+               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("!RECV_UD? p_desc %p pkt_type %s cmode %s\n",
+                               p_desc,
+                               get_ipoib_pkt_type_str(p_desc->type),
+                               get_ib_recv_mode_str(p_desc->recv_mode)) );
+       }
+#endif
+       CL_ASSERT( p_desc->recv_mode == RECV_UD );

        /* Return the descriptor to its pools. */
        cl_qpool_put( &p_port->buf_mgr.recv_pool, &p_desc->item );
@@ -1780,7 +1998,7 @@
         * Dereference the port object since the receive is no longer outstanding.
         */
        ipoib_port_deref( p_port, ref_get_recv );
-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       IPOIB_EXIT( IPOIB_DBG_BUF );
 }


@@ -1800,8 +2018,26 @@
 {
        NET_BUFFER_LIST                 *p_net_buffer_list;
        MDL                                             *p_mdl;
+#if UD_NBL_IN_DESC
+       PNET_BUFFER                     NetBuffer;
+       UNREFERENCED_PARAMETER(p_port);
+
+       IPOIB_ENTER( IPOIB_DBG_BUF );
+       ASSERT( p_desc->p_NBL );
+       ASSERT( p_desc->p_mdl );
+       p_net_buffer_list = p_desc->p_NBL;
+
+       NetBuffer = NET_BUFFER_LIST_FIRST_NB( p_net_buffer_list );
+       p_mdl = NET_BUFFER_FIRST_MDL( NetBuffer );
+
+       ASSERT( p_mdl == p_desc->p_mdl );
+       ASSERT( NET_BUFFER_CURRENT_MDL( NetBuffer ) == p_mdl );

-       IPOIB_ENTER(  IPOIB_DBG_RECV );
+       NET_BUFFER_DATA_LENGTH( NetBuffer ) = p_desc->len;
+       NdisAdjustMdlLength( p_mdl, p_desc->len );
+
+#else  // !UD_NBL_IN_DESC
+       IPOIB_ENTER( IPOIB_DBG_BUF );

        p_mdl = NdisAllocateMdl(p_port->p_adapter->h_adapter,
                                                        &p_desc->buf.eth.pkt,
@@ -1833,8 +2069,9 @@
        IPOIB_PORT_FROM_NBL( p_net_buffer_list ) = p_port;
        IPOIB_RECV_FROM_NBL( p_net_buffer_list ) = p_desc;
        p_net_buffer_list->SourceHandle = p_port->p_adapter->h_adapter;
+#endif // !UD_NBL_IN_DESC

-       IPOIB_EXIT(  IPOIB_DBG_RECV );
+       IPOIB_EXIT( IPOIB_DBG_BUF );
        return p_net_buffer_list;
 }

@@ -1898,30 +2135,35 @@
  * that the value is signed, and can go negative.  All tests must
  * be for > 0.
  */
-int32_t
+static int32_t
 __recv_mgr_repost(
-       IN                              ipoib_port_t* const                     p_port )
+       IN                      ipoib_port_t* const                     p_port )
 {
        ipoib_recv_desc_t       *p_head = NULL, *p_tail = NULL, *p_next;
        ib_api_status_t         status;
        ib_recv_wr_t            *p_failed;
+       size_t                          rx_cnt=0;
+       int                                     rx_wanted;
        PERF_DECLARE( GetRecv );
        PERF_DECLARE( PostRecv );

-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       XIPOIB_ENTER( IPOIB_DBG_RECV );

        CL_ASSERT( p_port );
        cl_obj_lock( &p_port->obj );
        if( p_port->state != IB_QPS_RTS )
        {
                cl_obj_unlock( &p_port->obj );
-               IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
-                       ("Port in invalid state.  Not reposting.\n") );
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_RECV,
+                       ("Port[%d] in invalid state; Not reposting.\n", p_port->port_num) );
+
                return 0;
        }
        ipoib_port_ref( p_port, ref_repost );
        cl_obj_unlock( &p_port->obj );

+       rx_wanted = p_port->p_adapter->params.rq_depth - p_port->recv_mgr.depth;
+
        while( p_port->recv_mgr.depth < p_port->p_adapter->params.rq_depth )
        {
                /* Pull receives out of the pool and chain them up. */
@@ -1931,7 +2173,8 @@
                if( !p_next )
                {
                        IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_RECV,
-                               ("Out of receive descriptors! recv queue depth 0x%x\n",p_port->recv_mgr.depth) );
+                               ("Out of UD receive descriptors! cur recv Q depth %d Max %d\n",
+                                       p_port->recv_mgr.depth,p_port->p_adapter->params.rq_depth) );
                        break;
                }

@@ -1948,13 +2191,15 @@
                p_head = p_next;

                p_port->recv_mgr.depth++;
+               rx_cnt++;
        }

        if( p_head )
        {
                cl_perf_start( PostRecv );
-               status = p_port->p_adapter->p_ifc->post_recv(
-                       p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );
+               status = p_port->p_adapter->p_ifc->post_recv( p_port->ib_mgr.h_qp,
+                                                                                                         &p_head->wr,
+                                                                                                         &p_failed );
                cl_perf_stop( &p_port->p_adapter->perf, PostRecv );

                if( status != IB_SUCCESS )
@@ -1972,22 +2217,35 @@
                                p_port->recv_mgr.depth--;
                        }
                }
+#if DBG
+               else
+               {
+                       if( (size_t)rx_wanted != rx_cnt )
+                       {
+                               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_BUF,
+                                       ("UD RX bufs: wanted %d posted %d\n",rx_wanted,rx_cnt) );
+                       }
+               }
+#endif
        }

        ipoib_port_deref( p_port, ref_repost );
-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       XIPOIB_EXIT( IPOIB_DBG_RECV );
        return p_port->p_adapter->params.rq_low_watermark - p_port->recv_mgr.depth;
 }

-inline ULONG __free_received_NBL (
-       IN ipoib_port_t         *p_port,
-       IN NET_BUFFER_LIST      *p_net_buffer_lists )
-{
+//     p_port->recv_lock held by caller.

+static inline ULONG __free_received_NBL(
+       IN              ipoib_port_t                    *p_port,
+       IN              NET_BUFFER_LIST                 *p_net_buffer_lists )
+{
        ipoib_recv_desc_t       *p_desc;
        NET_BUFFER_LIST         *cur_net_buffer_list, *next_net_buffer_list;
        LONG                            NBL_cnt = 0;

+       PERF_DECLARE( ReturnPutRecv );
+
        for (cur_net_buffer_list = p_net_buffer_lists;
                 cur_net_buffer_list != NULL;
                 cur_net_buffer_list = next_net_buffer_list)
@@ -1999,43 +2257,44 @@
                CL_ASSERT(p_port == IPOIB_PORT_FROM_NBL( cur_net_buffer_list ));
                p_desc = IPOIB_RECV_FROM_NBL( cur_net_buffer_list );

-#if 0 //TODO CM flow
-               if( p_desc->type == PKT_TYPE_CM_UCAST )
-               {
-                       int32_t                         discarded;
-                       uint32_t                        NBL_cnt = 0;
-
-                       ib_api_status_t         status = IB_NOT_DONE;
-
-
-                       NDIS_BUFFER             *p_buf;
-
-                       /* Unchain the NDIS buffer. */
-                       NdisUnchainBufferAtFront( p_packet, &p_buf );
-                       CL_ASSERT( p_buf );
-                       /* Return the NDIS packet and NDIS buffer to their pools. */
-                       NdisDprFreePacketNonInterlocked( p_packet );
-                       NdisFreeBuffer( p_buf );
-
-                       endpt_cm_buf_mgr_put_recv( &p_port->cm_buf_mgr, (ipoib_cm_desc_t *)p_desc );
-                       status = endpt_cm_post_recv( p_port );
-                       if(  status != IB_SUCCESS )
-                       {
-                               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                                       ("Post Recv QP failed\n" ) );
-                       }
-                       cl_spinlock_release( &p_port->recv_lock );
-                       return;
-               }
-#endif
+               CL_ASSERT(p_desc->recv_mode == RECV_UD || p_desc->recv_mode == RECV_RC);

                cl_perf_start( ReturnPutRecv );
-               __buf_mgr_put_recv( p_port, p_desc, cur_net_buffer_list );
+               if( p_desc->recv_mode == RECV_RC )
+               {
+                       ipoib_cm_recv_desc_t *p_cm_desc = (ipoib_cm_recv_desc_t*)p_desc;
+#if DBG
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_BUF,
+                               ("RC NBL %p pkt_type %s recv_mode %s\n",
+                                       cur_net_buffer_list,
+                                       get_ipoib_pkt_type_str(p_desc->type),
+                                       get_ib_recv_mode_str(p_desc->recv_mode)) );
+#endif
+                       ipoib_cm_buf_mgr_put_recv( p_port, p_cm_desc, cur_net_buffer_list );
+               }
+               else
+               {
+                       __buf_mgr_put_recv( p_port, p_desc, cur_net_buffer_list );
+               }
                cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );
        }
        return NBL_cnt;
 }

+/* Non-static wrapper: forwards both arguments to the file-local __free_received_NBL() and returns its result. NOTE(review): presumably p_port->recv_lock must be held, as noted for the helper - confirm. */
+ULONG ipoib_free_received_NBL(
+       IN              ipoib_port_t                    *p_port,
+       IN              NET_BUFFER_LIST                 *p_net_buffer_lists )
+{
+       return __free_received_NBL( p_port, p_net_buffer_lists );
+}
+
+
+/* Called by MiniPortDriver->MiniportReturnNetBufferLists(),
+ * hence the NBL can come from UD or RC send/recv.
+ * See ipoib_driver.cpp for reference.
+ */
+
 void
 ipoib_return_net_buffer_list(
        IN                              NDIS_HANDLE                                     adapter_context,
@@ -2047,12 +2306,9 @@
        LONG                            NBL_cnt = 0;

        PERF_DECLARE( ReturnPacket );
-       PERF_DECLARE( ReturnPutRecv );
        PERF_DECLARE( ReturnRepostRecv );
-       PERF_DECLARE( ReturnPreparePkt );
-       PERF_DECLARE( ReturnNdisIndicate );

-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       IPOIB_ENTER( IPOIB_DBG_BUF );

        UNUSED_PARAM( return_flags );

@@ -2079,15 +2335,15 @@
        cl_spinlock_release( &p_port->recv_lock );
        cl_perf_stop( &p_port->p_adapter->perf, ReturnPacket );

-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       IPOIB_EXIT( IPOIB_DBG_BUF );
 }

 static BOOLEAN
 __recv_cb_internal(
        IN              const   ib_cq_handle_t                          h_cq,
        IN                              void                                            *cq_context,
-       IN                              uint32_t                                         *p_recv_cnt
-       );
+    IN              uint32_t*                    p_recv_cnt );
+

 static IO_WORKITEM_ROUTINE __iopoib_WorkItem;

@@ -2129,7 +2385,7 @@
 __recv_cb_internal(
        IN              const   ib_cq_handle_t                          h_cq,
        IN                              void                                            *cq_context,
-       IN                              uint32_t*                                        p_recv_cnt)
+    IN              uint32_t                    *p_recv_cnt )
 {
        ipoib_port_t            *p_port;
        ib_api_status_t         status;
@@ -2161,6 +2417,10 @@

        p_port = (ipoib_port_t*)cq_context;

+#if DBG
+       if( h_cq ) {ASSERT( h_cq == p_port->ib_mgr.h_recv_cq );}
+#endif
+
        cl_qlist_init( &done_list );
        cl_qlist_init( &bad_list );

@@ -2216,7 +2476,6 @@

        do
        {
-               //int32_t cnt;
                /* Repost ASAP so we don't starve the RQ. */
                cl_perf_start( RepostRecv );
                shortage = __recv_mgr_repost( p_port );
@@ -2265,8 +2524,8 @@
                                cl_spinlock_release( &p_port->recv_lock );
                        }
                        cl_perf_stop( &p_port->p_adapter->perf, RecvNdisIndicate );
-               }
-               else
+               }
+               else
                {
                        /* If shortage >0,  IPoIB driver should regain
                           ownership of the NET_BUFFER_LIST structures immediately.
@@ -2304,7 +2563,6 @@
                        }
                        __recv_mgr_repost( p_port );
                        cl_spinlock_release( &p_port->recv_lock );
-
                }
                cl_spinlock_acquire( &p_port->recv_lock );

@@ -2326,13 +2584,15 @@

                cl_perf_stop( &p_port->p_adapter->perf, RearmRecv );
                CL_ASSERT( status == IB_SUCCESS );
-
        } else {
-               if (h_cq) {
+               if(h_cq) {
                        // increment reference to ensure no one release the object while work
                        // item is queued
                        ipoib_port_ref( p_port, ref_recv_cb );
-                       IoQueueWorkItem( p_port->pPoWorkItem, __iopoib_WorkItem, DelayedWorkQueue, p_port);
+                       IoQueueWorkItem( p_port->pPoWorkItem,
+                                                        (PIO_WORKITEM_ROUTINE) __iopoib_WorkItem,
+                                                        DelayedWorkQueue,
+                                                        p_port );
                        WorkToDo = FALSE;
                } else {
                        WorkToDo = TRUE;
@@ -2371,7 +2631,7 @@
        PERF_DECLARE( GetEndptByLid );
        PERF_DECLARE( EndptInsert );

-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       XIPOIB_ENTER( IPOIB_DBG_RECV );

        /* Setup our shortcut pointers based on whether GRH is valid. */
        if( p_wc->recv.ud.recv_opt & IB_RECV_OPT_GRH_VALID )
@@ -2430,7 +2690,7 @@
                        }

                        /* Create the endpoint. */
-                       *pp_src = ipoib_endpt_create(
+                       *pp_src = ipoib_endpt_create(
                                                                                p_port,
 #if IPOIB_INLINE_RECV
                                                                                &p_desc->buf.ib.grh.src_gid,
@@ -2445,19 +2705,21 @@
                                        ("ipoib_endpt_create failed\n") );
                                return;
                        }
+#if DBG
+                       ipoib_port_ref( p_port, ref_endpt_track );
+#endif
                        cl_perf_start( EndptInsert );
                        cl_obj_lock( &p_port->obj );
                        status = __endpt_mgr_insert( p_port, mac, *pp_src );
+                       cl_obj_unlock( &p_port->obj );
                        if( status != IB_SUCCESS )
                        {
-                               cl_obj_unlock( &p_port->obj );
                                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("__endpt_mgr_insert returned %s\n",
                                        p_port->p_adapter->p_ifc->get_err_str( status )) );
                                *pp_src = NULL;
                                return;
                        }
-                       cl_obj_unlock( &p_port->obj );
                        cl_perf_stop( &p_port->p_adapter->perf, EndptInsert );
                }
        }
@@ -2479,28 +2741,19 @@
        {
                /* Update the QPN for the endpoint. */
                IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
-                       ("Updating QPN for MAC: %02X-%02X-%02X-%02X-%02X-%02X\n",
-                       (*pp_src )->mac.addr[0], (*pp_src )->mac.addr[1],
-                       (*pp_src )->mac.addr[2], (*pp_src )->mac.addr[3],
-                       (*pp_src )->mac.addr[4], (*pp_src )->mac.addr[5]) );
-//             (*pp_src)->qpn = p_wc->recv.ud.remote_qp;
+                       ("Updating QPN for MAC: %s\n", mk_mac_str(&(*pp_src)->mac)) );
+               (*pp_src)->qpn = p_wc->recv.ud.remote_qp;
        }

        if( *pp_src && *pp_dst )
        {
                IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_RECV,
-                       ("Recv:\n"
-                       "\tsrc MAC: %02X-%02X-%02X-%02X-%02X-%02X\n"
-                       "\tdst MAC: %02X-%02X-%02X-%02X-%02X-%02X\n",
-                       (*pp_src )->mac.addr[0], (*pp_src )->mac.addr[1],
-                       (*pp_src )->mac.addr[2], (*pp_src )->mac.addr[3],
-                       (*pp_src )->mac.addr[4], (*pp_src )->mac.addr[5],
-                       (*pp_dst )->mac.addr[0], (*pp_dst )->mac.addr[1],
-                       (*pp_dst )->mac.addr[2], (*pp_dst )->mac.addr[3],
-                       (*pp_dst )->mac.addr[4], (*pp_dst )->mac.addr[5]) );
+                       ("\n\tsrc-EP %s MAC %s dst-EP %s MAC %s \n",
+                               (*pp_src)->tag, mk_mac_str(&(*pp_src)->mac),
+                               (*pp_dst)->tag, mk_mac_str2(&(*pp_dst)->mac)) );
        }

-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       XIPOIB_EXIT( IPOIB_DBG_RECV );
 }


@@ -2526,7 +2779,7 @@
        PERF_DECLARE( RecvDhcp );
        PERF_DECLARE( RecvArp );

-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       XIPOIB_ENTER( IPOIB_DBG_RECV );

        for( p_wc = p_done_wc_list; p_wc; p_wc = p_wc->p_next )
        {
@@ -2582,7 +2835,8 @@
                }

                /* Successful completion.  Get the receive information. */
-               p_desc->ndis_csum.Value = ( ( p_wc->recv.ud.recv_opt & IB_RECV_OPT_CSUM_MASK ) >> 8 );
+               p_desc->ndis_csum.Value =
+                                       ( ( p_wc->recv.ud.recv_opt & IB_RECV_OPT_CSUM_MASK ) >> 8 );
                p_desc->len = len + 14 - 4 ;
                cl_perf_start( GetRecvEndpts );
                __recv_get_endpts( p_port, p_desc, p_wc, &p_src, &p_dst );
@@ -2803,7 +3057,7 @@
                }
        }

-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       XIPOIB_EXIT( IPOIB_DBG_RECV );
        return recv_cnt;
 }

@@ -2815,7 +3069,7 @@
        IN                              ipoib_endpt_t* const            p_src,
        IN                              ipoib_endpt_t* const            p_dst )
 {
-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       XIPOIB_ENTER( IPOIB_DBG_RECV );

        if( !p_src || !p_dst )
        {
@@ -2843,7 +3097,7 @@
        if (p_dst->h_mcast)
                p_dst->is_in_use = TRUE;

-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       XIPOIB_EXIT( IPOIB_DBG_RECV );
        return IB_SUCCESS;
 }

@@ -2972,7 +3226,7 @@
                if( p_dhcp->htype != DHCP_HW_TYPE_IB )
                {
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                               ("Invalid hardware address type.\n") );
+                               ("msg %d Invalid hardware address type %d\n",msg,p_dhcp->htype) );
                        return IB_INVALID_SETTING;
                }
                break;
@@ -3028,8 +3282,8 @@
                if( p_cid[1] != coIPoIB_CID_Len )
                {
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                               ("Client-identifier length is not equal to %d as required.\n",
-                                       coIPoIB_CID_Len) );
+                               ("Client-identifier length(%u) is not equal to %d as required.\n",
+                                       p_cid[1], coIPoIB_CID_Len) );
                        return IB_INVALID_SETTING;
                }
                if( p_cid[2] != coIPoIB_HwTypeIB)
@@ -3078,8 +3332,9 @@
        mac_addr_t                              mac;
        ipoib_hw_addr_t                 null_hw = {0};
        uint8_t                                 cm_capable = 0;
+       boolean_t                               queue_rc_conn = FALSE;

-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       IPOIB_ENTER( IPOIB_DBG_ARP );

        if( !p_dst )
        {
@@ -3202,18 +3457,26 @@
                                ("ipoib_endpt_create failed\n") );
                        return status;
                }
-
+#if DBG
+               ipoib_port_ref( p_port, ref_endpt_track );
+#endif
                cl_obj_lock( &p_port->obj );
                status = __endpt_mgr_insert( p_port, mac, *pp_src );
+               cl_obj_unlock( &p_port->obj );
                if( status != IB_SUCCESS )
                {
-                       cl_obj_unlock( &p_port->obj );
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("__endpt_mgr_insert return %s \n",
                                p_port->p_adapter->p_ifc->get_err_str( status )) );
                        return status;
                }
-               cl_obj_unlock( &p_port->obj );
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                       ("Created EP %p %s CM_cap %s cstate[%s] %s\n",
+                               (*pp_src),
+                               (*pp_src)->tag,
+                               (cm_capable == IPOIB_CM_FLAG_RC ? "1" : "0"),
+                               cm_get_state_str(endpt_cm_get_state((*pp_src))),
+                               mk_mac_str(&(*pp_src)->mac)) );
        }

        (*pp_src)->cm_flag = cm_capable;
@@ -3222,35 +3485,34 @@
                &(*pp_src)->dgid, &p_ib_arp->src_hw.gid, sizeof(ib_gid_t) ) );
        CL_ASSERT( ipoib_is_voltaire_router_gid( &(*pp_src)->dgid ) ||
                (*pp_src)->qpn == ipoib_addr_get_qpn( &p_ib_arp->src_hw ) );
-#if 0
+
        if( p_port->p_adapter->params.cm_enabled &&
-               p_ib_arp->op == ARP_OP_REQ &&
+               p_ib_arp->op == ARP_OP_REP &&
                cm_capable == IPOIB_CM_FLAG_RC )
        {
-               /* if we've got ARP request and RC flag is set,
-               save SID for connect REQ to be sent in ARP reply
-               when requestor's path get resolved */
-               if( endpt_cm_get_state( (*pp_src) ) == IPOIB_CM_DISCONNECTED )
-               {
-                       (*pp_src)->cm_flag = cm_capable;
-                       ipoib_addr_set_sid(
-                               &(*pp_src)->conn.service_id,
-                               ipoib_addr_get_qpn( &p_ib_arp->src_hw ) );
-               }
+               /* Sender of this ARP reply is CM (RC) capable: save its service ID and flag an RC connect. */
+               ipoib_addr_set_sid( &(*pp_src)->conn.service_id,
+                                                       ipoib_addr_get_qpn( &p_ib_arp->src_hw ) );
+               queue_rc_conn = TRUE;
        }
-#endif
-#if 0 //DBG
-       if( p_port->p_adapter->params.cm_enabled )
+#if DBG
        {
-               IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                       (" ARP %s from ENDPT[%p] state %d CM cap: %d QPN: %#x MAC: %02x:%02x:%02x:%02x:%02x:%02x\n",
-                       ((p_ib_arp->op == ARP_OP_REQ )? "REQUEST" : "REPLY"),
-                       *pp_src, endpt_cm_get_state( *pp_src ),
-                       ((cm_capable == IPOIB_CM_FLAG_RC)? 1: 0),
-                       cl_ntoh32( ipoib_addr_get_qpn( &p_ib_arp->src_hw ) ),
-                       (*pp_src)->mac.addr[0], (*pp_src)->mac.addr[1],
-                       (*pp_src)->mac.addr[2], (*pp_src)->mac.addr[3],
-                       (*pp_src)->mac.addr[4], (*pp_src)->mac.addr[5] ));
+               char ip_src[16];
+               char ip_dst[16];
+
+               RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->src_ip, ip_src );
+               RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->dst_ip, ip_dst );
+
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                       ("Rx ARP-%s Src[%s] EP %s CM_cap[%s] %s SID %I64x Dst[%s] %s\n",
+                               (p_ib_arp->op == ARP_OP_REQ ? "REQ":"REPL"),
+                               ip_src,
+                               (*pp_src)->tag,
+                               ((*pp_src)->cm_flag == IPOIB_CM_FLAG_RC ? "1" : "0"),
+                               cm_get_state_str(endpt_cm_get_state((*pp_src))),
+                               (*pp_src)->conn.service_id,
+                               ip_dst,
+                               p_dst->tag) );
        }
 #endif

@@ -3292,12 +3554,74 @@
                        p_arp->dst_hw = p_dst->mac;
                        p_arp->dst_ip = p_ib_arp->dst_ip;
                        CL_ASSERT( p_dst->qpn == ipoib_addr_get_qpn( &p_ib_arp->dst_hw ) );
+#if IPOIB_CM
+#if DBG
+                       {
+                               char ip_src[16];
+                               char ip_dst[16];
+
+                               RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->src_ip, ip_src );
+                               RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->dst_ip, ip_dst );
+
+                               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                                       ("Recv'ed BC? ARP-%s Src[%s] EP %s CM_cap[%s] %s Dst[%s] %s\n",
+                                               (p_ib_arp->op == ARP_OP_REQ ? "REQ":"REPL"),
+                                               ip_src,
+                                               (*pp_src)->tag,
+                                               ((*pp_src)->cm_flag == IPOIB_CM_FLAG_RC ? "1" : "0"),
+                                               cm_get_state_str(endpt_cm_get_state((*pp_src))),
+                                               ip_dst,
+                                               p_dst->tag) );
+                       }
+#endif
+                       if( queue_rc_conn )
+                       {
+                               /* Received our ARP reply and the remote RC flag is set,
+                                * Queue an active RC connection to the remote EP.
+                                */
+                               if( endpt_cm_get_state( (*pp_src) ) == IPOIB_CM_DISCONNECTED )
+                               {
+#if DBG
+                                       {
+                                               char ip_src[16];
+
+                                               RtlIpv4AddressToStringA((IN_ADDR*)&p_ib_arp->src_ip, ip_src);
+                                               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
+                                                       ("Queue RC CONNECT EP %s [%s] MAC %s SID %#I64x\n",
+                                                               (*pp_src)->tag, ip_src,
+                                                               mk_mac_str(&(*pp_src)->mac),
+                                                               (*pp_src)->conn.service_id) );
+                                       }
+#endif
+                                       endpt_queue_cm_connection( p_port, *pp_src );
+                               }
+                       }
+#endif // IPOIB_CM
                }
        }
        else /* we got ARP request */
        {
                memset( &p_arp->dst_hw, 0, sizeof(mac_addr_t) );
                p_arp->dst_ip = p_ib_arp->dst_ip;
+#if IPOIB_CM && DBG && 0
+               {
+                       char ip_src[16];
+                       char ip_dst[16];
+
+                       RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->src_ip, ip_src );
+                       RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->dst_ip, ip_dst );
+
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                               ("Recv'ed BC ARP-%s Src[%s] EP %s CM_cap[%s] %s Dst[%s] %s\n",
+                                       (p_ib_arp->op == ARP_OP_REQ ? "REQ":"REPL"),
+                                       ip_src,
+                                       (*pp_src)->tag,
+                                       (cm_capable == IPOIB_CM_FLAG_RC ? "1" : "0"),
+                                       cm_get_state_str(endpt_cm_get_state((*pp_src))),
+                                       ip_dst,
+                                       p_dst->tag) );
+               }
+#endif
        }

        /*
@@ -3314,7 +3638,7 @@
                return status;
        }

-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       IPOIB_EXIT( IPOIB_DBG_ARP );
        return IB_SUCCESS;
 }

@@ -3466,7 +3790,7 @@

        PERF_DECLARE( PreparePkt );

-       IPOIB_ENTER( IPOIB_DBG_RECV );
+       XIPOIB_ENTER( IPOIB_DBG_RECV );

        *p_discarded = 0;

@@ -3518,7 +3842,7 @@
                p_item = cl_qlist_remove_head( p_done_list );
        }

-       IPOIB_EXIT( IPOIB_DBG_RECV );
+       XIPOIB_EXIT( IPOIB_DBG_RECV );
        return i;
 }

@@ -3534,13 +3858,11 @@
 __send_mgr_construct(
        IN                              ipoib_port_t* const                     p_port )
 {
-       IPOIB_ENTER( IPOIB_DBG_SEND );
        p_port->send_mgr.depth = 0;
        cl_qlist_init( &p_port->send_mgr.pending_list );
        cl_qpool_construct(&p_port->send_mgr.sg_pool);
        cl_qpool_construct( &p_port->send_mgr.send_pool );
        p_port->p_desc = NULL;
-       IPOIB_EXIT( IPOIB_DBG_SEND );
 }

 static ib_api_status_t
@@ -3600,12 +3922,16 @@
        //This send descriptor can't be allocated on the stack because of boundary
        // violation !!!
        p_port->p_desc = (ipoib_send_desc_t *)
-               ExAllocatePoolWithTag(NonPagedPool ,sizeof (ipoib_send_desc_t), 'XMXA');
+                                               ExAllocatePoolWithTag( NonPagedPool,
+                                                                                          sizeof(ipoib_send_desc_t),
+                                                                                          'XMXA');
        if (!p_port->p_desc) {
                NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
-                       EVENT_IPOIB_RECV_POOL, 1, CL_INSUFFICIENT_MEMORY); //TODO EVENT_IPOIB_SEND_POOL
+                                                               EVENT_IPOIB_RECV_POOL,
+                                                               1,
+                                                               CL_INSUFFICIENT_MEMORY); //TODO EVENT_IPOIB_SEND_POOL
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("Allocation of  send descriptor failed\n") );
+                       ("Allocation of port[%d] send descriptor failed\n", p_port->port_num) );
                cl_qpool_destroy(&p_port->send_mgr.send_pool);
                cl_qpool_destroy(&p_port->send_mgr.sg_pool);
                return IB_INSUFFICIENT_MEMORY;
@@ -3653,10 +3979,11 @@
        cl_qpool_destroy(&p_port->send_mgr.send_pool);
        cl_qpool_destroy(&p_port->send_mgr.sg_pool);

-       //Now, free port descriptor
-       if (p_port->p_desc) {
+       //Now, free port send descriptor
+       if( p_port->p_desc )
+       {
                ExFreePoolWithTag(p_port->p_desc, 'XMXA');
-               p_port->p_desc = NULL;
+               p_port->p_desc = NULL;
        }

        //Lookaside list will be destroyed in __buf_mgr_destroy
@@ -3664,6 +3991,106 @@
        IPOIB_EXIT( IPOIB_DBG_SEND );
 }

+#if 0
+
+static char *get_ipv6_nxt_hdr_str(UINT8 NH)
+{
+       static char what[28];   // fallback text buffer; static, so shared by every call
+       // Map NH to a fixed protocol-name literal; values not listed use the fallback below.
+       switch( NH )
+       {
+         case IPPROTO_HOPOPTS:
+               return "Hop-by-Hop";
+
+         case IPPROTO_ICMP:
+               return "ICMP";
+
+         case IPPROTO_IGMP:
+               return "IGMP";
+
+         case IPPROTO_GGP:
+               return "GGP";
+
+         case IPPROTO_IPV4:
+               return "IPV4";
+
+         case IPPROTO_ST:
+               return "ST";
+
+         case IPPROTO_TCP:
+               return "TCP";
+
+         case IPPROTO_CBT:
+               return "CBT";
+
+         case IPPROTO_EGP:
+               return "EGP";
+
+         case IPPROTO_IGP:
+               return "IGP";
+
+         case IPPROTO_PUP:
+               return "PUP";
+
+         case IPPROTO_UDP:
+               return "UDP";
+
+         case IPPROTO_IDP:
+               return "IDP";
+
+         case IPPROTO_RDP:
+               return "RDP";
+
+         case IPPROTO_IPV6:
+               return "IPV6";
+
+         case IPPROTO_ROUTING:
+               return "ROUTING";
+
+         case IPPROTO_FRAGMENT:
+               return "FRAGMENT";
+
+         case IPPROTO_ESP:
+               return "ESP";
+
+         case IPPROTO_AH:
+               return "AH";
+
+         case IPPROTO_ICMPV6:
+               return "ICMPV6";
+
+         case IPPROTO_NONE:
+               return "NONE";
+
+         case IPPROTO_DSTOPTS:
+               return "DSTOPTS";
+
+         case IPPROTO_ND:
+               return "ND";
+
+         case IPPROTO_ICLFXBM:
+               return "ICLFXBM";
+
+         case IPPROTO_PIM:
+               return "PIM";
+
+         case IPPROTO_PGM:
+               return "PGM";
+
+         case IPPROTO_L2TP:
+               return "L2TP";
+
+         case IPPROTO_SCTP:
+               return "SCTP";
+         // Any other value: leave the switch and format the number into 'what'.
+         default:
+               break;
+       }
+       StringCchPrintf(what,sizeof(what),"Unknown Proto %u",NH);
+       return what;    // contents are overwritten by the next call that reaches this fallback
+}
+#endif // 0 - get_ipv6_nxt_hdr_str() is compiled out
+

 static NDIS_STATUS
 __send_mgr_filter(
@@ -3673,18 +4100,20 @@
        IN      OUT                     ipoib_send_NB_SG                        *s_buf )
 {
        NDIS_STATUS             status;
+       //IPV6_HEADER           *p_ip6_hdr;

        PERF_DECLARE( FilterIp );
+       PERF_DECLARE( FilterIpV6 );
        PERF_DECLARE( FilterArp );
        PERF_DECLARE( SendGen );

        IPOIB_ENTER( IPOIB_DBG_SEND );

+       ASSERT( s_buf->p_send_desc );
        /*
         * We already checked the ethernet header length, so we know it's safe
         * to decrement the buf_len without underflowing.
         */
-       ipoib_send_desc_t *p_desc = s_buf->p_port->p_desc;
        buf_len -= sizeof(eth_hdr_t);

        switch( p_eth_hdr->type )
@@ -3694,37 +4123,39 @@
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                        ("Current header type is IPv6\n") );
                status = __send_mgr_filter_ip( p_eth_hdr, p_mdl, buf_len, s_buf );
-               cl_perf_stop( &p_port->p_adapter->perf, FilterIpV6 );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, FilterIpV6 );
                break;
+
        case ETH_PROT_TYPE_IP:
                cl_perf_start( FilterIp );
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                        ("Current header type is IPv4\n") );
                status = __send_mgr_filter_ip( p_eth_hdr, p_mdl, buf_len, s_buf );
-               cl_perf_stop( &p_port->p_adapter->perf, FilterIp );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, FilterIp );
                break;

        case ETH_PROT_TYPE_ARP:
                cl_perf_start( FilterArp );
+               // UD only
+               CL_ASSERT( s_buf->p_send_desc->send_qp == s_buf->p_port->ib_mgr.h_qp );
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                        ("Current header type is ARP\n") );
                status = __send_mgr_filter_arp( p_eth_hdr, p_mdl, buf_len, s_buf );
-               p_desc->send_dir = SEND_UD_QP;
-               cl_perf_stop( &p_port->p_adapter->perf, FilterArp );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, FilterArp );
                break;

        default:
                /*
                 * The IPoIB spec doesn't define how to send non IP or ARP packets.
-                * Just send the payload and hope for the best.
+                * Just send the payload UD and hope for the best.
                 */
-                IPOIB_PRINT( TRACE_LEVEL_WARNING, IPOIB_DBG_SEND,
-                       ("Unrecognized header type: %d\n",p_eth_hdr->type ) );
+               IPOIB_PRINT_EXIT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("UD Send non ETH IP/ARP packet type 0x%X\n",
+                               cl_ntoh16(p_eth_hdr->type)));

-               p_desc->send_dir = SEND_UD_QP;
                cl_perf_start( SendGen );
                status = __send_gen( s_buf, 0 );
-               cl_perf_stop( &p_port->p_adapter->perf, SendGen );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, SendGen );
                break;
        }

@@ -3733,13 +4164,10 @@
 }


-// TODO: move the p_desc to be on the send buffer
-
 ULONG
 CopyNetBuffer(
-    PNET_BUFFER NetBuffer,
-    PUCHAR              pDest
-    )
+    IN PNET_BUFFER             NetBuffer,
+    IN PUCHAR                  pDest )
 {
     ULONG  BytesCopied = 0;

@@ -3780,7 +4208,7 @@
                 CurrLength = DataLength;
             }
             DataLength -= CurrLength;
-            memcpy(pDest, pSrc, CurrLength);
+            memcpy( pDest, pSrc, CurrLength );
             BytesCopied += CurrLength;

             pDest += CurrLength;
@@ -3806,7 +4234,7 @@
 #else
         ASSERT((((uint32_t)pSrc % PAGE_SIZE) + DataLength) <= PAGE_SIZE);
 #endif
-        memcpy(pDest, pSrc, DataLength);
+        memcpy( pDest, pSrc, DataLength );
         BytesCopied += DataLength;
     }

@@ -3869,6 +4297,81 @@
        return NDIS_STATUS_SUCCESS;
 }

+
+/*
+ * Return a printable name for an Ethernet protocol type (network byte order).
+ * Unrecognized types are formatted into a static buffer shared by all
+ * callers, so that result is only valid until the next unrecognized lookup.
+ */
+char *get_eth_packet_type_str(net16_t pkt_type)
+{
+       static char what[32];   // "Unknown Eth packet type 0xFFFF" + NUL needs 31 bytes
+       switch( pkt_type )
+       {
+         case ETH_PROT_TYPE_IP:
+               return "ETH_PROT_IP";
+         case ETH_PROT_TYPE_IPV6:
+               return "ETH_PROT_IPV6";
+         case ETH_PROT_TYPE_ARP:
+               return "ETH_PROT_ARP";
+         case ETH_PROT_TYPE_RARP:
+               return "ETH_PROT_RARP";
+         case ETH_PROT_VLAN_TAG:
+               return "ETH_PROT_VLAN_TAG";
+         default:
+               break;
+       }
+       // Convert to host order so the hex value reads like the on-wire ethertype.
+       StringCchPrintf(what,sizeof(what),"Unknown Eth packet type 0x%x",cl_ntoh16(pkt_type));
+       return what;
+}
+
+/*
+ * Return a printable name for an IP protocol number.
+ * Each label spells out the IPPROTO_* constant it was matched against.
+ * Unrecognized numbers are formatted into a static buffer shared by all
+ * callers, so that result is only valid until the next unrecognized lookup.
+ */
+char *get_IP_protocol_str(uint8_t proto)
+{
+       static char what[28];   // "Unknown IP protocol 255" + NUL needs 24 bytes
+       switch( proto )
+       {
+         case IPPROTO_HOPOPTS:
+               return "IPPROTO_HOPOPTS";
+
+         case IPPROTO_IPV4:
+               return "IPPROTO_IPV4";
+         case IPPROTO_IPV6:
+               return "IPPROTO_IPV6";
+
+         case IPPROTO_TCP:
+               return "IPPROTO_TCP";
+         case IPPROTO_UDP:
+               return "IPPROTO_UDP";
+
+         case IPPROTO_IGMP:
+               return "IPPROTO_IGMP";
+         case IPPROTO_ICMP:
+               return "IPPROTO_ICMP";
+         case IPPROTO_ICMPV6:
+               return "IPPROTO_ICMPV6";
+
+         case IPPROTO_NONE:
+               return "IPPROTO_NONE";
+         case IPPROTO_DSTOPTS:
+               return "IPPROTO_DSTOPTS";
+
+         case IPPROTO_SCTP:
+               return "IPPROTO_SCTP";
+
+         default:
+               break;
+       }
+       StringCchPrintf(what,sizeof(what),"Unknown IP protocol %d",proto);
+       return what;
+}
+
 static inline NDIS_STATUS
 __send_mgr_get_eth_hdr(
        IN                              PNET_BUFFER                                     p_net_buffer,
@@ -3881,7 +4384,7 @@

        *pp_mdl = NET_BUFFER_FIRST_MDL(p_net_buffer);

-       NdisQueryMdl(*pp_mdl,&p_head,p_mdl_len,NormalPagePriority);
+       NdisQueryMdl( *pp_mdl, &p_head, p_mdl_len, NormalPagePriority );
        if( ! p_head )
        {
                /* Failed to get first buffer. */
@@ -3902,23 +4405,15 @@

        *pp_eth_hdr = (eth_hdr_t*)(p_head + MdlDataOffset);

-       IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
-               ("Ethernet header:\n"
-               "\tsrc MAC: %02X-%02X-%02X-%02X-%02X-%02X\n"
-               "\tdst MAC: %02X-%02X-%02X-%02X-%02X-%02X\n"
-               "\tprotocol type: %04X\n",
-               (*pp_eth_hdr)->src.addr[0], (*pp_eth_hdr)->src.addr[1],
-               (*pp_eth_hdr)->src.addr[2], (*pp_eth_hdr)->src.addr[3],
-               (*pp_eth_hdr)->src.addr[4], (*pp_eth_hdr)->src.addr[5],
-               (*pp_eth_hdr)->dst.addr[0], (*pp_eth_hdr)->dst.addr[1],
-               (*pp_eth_hdr)->dst.addr[2], (*pp_eth_hdr)->dst.addr[3],
-               (*pp_eth_hdr)->dst.addr[4], (*pp_eth_hdr)->dst.addr[5],
-               cl_ntoh16( (*pp_eth_hdr)->type )) );
+       IPOIB_PRINT_EXIT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
+               ("Ethernet header:\n\tsrc MAC: %s\n\tdst MAC: %s\n\tprotocol: %s\n",
+               mk_mac_str(&(*pp_eth_hdr)->src),
+               mk_mac_str2(&(*pp_eth_hdr)->dst),
+               get_eth_packet_type_str((*pp_eth_hdr)->type)) );

        return NDIS_STATUS_SUCCESS;
 }

-
 #if !IPOIB_USE_DMA

 /* Send using the MDL's page information rather than the SGL. */
@@ -3939,8 +4434,7 @@

        IPOIB_ENTER( IPOIB_DBG_SEND );
        p_net_buf = NET_BUFFER_LIST_FIRST_NB(p_desc->p_netbuf_list);
-       NdisQueryBuffer( p_net_buf, &num_pages, NULL, &p_mdl,
-               &tot_len );
+       NdisQueryBuffer( p_net_buf, &num_pages, NULL, &p_mdl, &tot_len );

        if( !p_mdl )
        {
@@ -3953,7 +4447,8 @@
        if( num_pages >=  p_port->max_sq_sge_supported )
        {
                IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
-                       ("Too many buffers to fit in WR ds_array.  Copying data.\n") );
+                       ("Too many buffers(%d) to fit in WR ds_array(%u); Copying data.\n",
+                               num_pages, p_port->max_sq_sge_supported) );
                status = __send_copy( p_port, p_desc );
                IPOIB_EXIT( IPOIB_DBG_SEND );
                return status;
@@ -4079,6 +4574,7 @@
        eth_hdr_t                               *p_eth_hdr;
        UINT                                    mdl_len;
        bool                                    ret = false;
+       bool                                    dst_is_MULTICAST, dst_is_BROADCAST;

        ib_send_wr_t                    *p_wr_failed;
        NET_BUFFER_LIST                 *p_net_buffer_list;
@@ -4096,7 +4592,6 @@
        UNREFERENCED_PARAMETER(pDO);
        UNREFERENCED_PARAMETER(pIrp);

-       PERF_DECLARE( SendCopy );
        PERF_DECLARE( BuildSendDesc );
        PERF_DECLARE( GetEthHdr );
        PERF_DECLARE( QueuePacket );
@@ -4104,17 +4599,18 @@
        PERF_DECLARE( PostSend );
        PERF_DECLARE( ProcessFailedSends );
        PERF_DECLARE( GetEndpt );
-
+
        ++g_ipoib_send_SG_real;

        //Read Data from the buffer passed as a context
-       s_buf =                         (ipoib_send_NB_SG *)context;
-       p_net_buffer_list = s_buf->p_nbl;
-       p_netbuf =                      s_buf->p_curr_nb;
-       p_port =                        s_buf->p_port;
-       p_desc =                        p_port->p_desc;
+       s_buf                           = (ipoib_send_NB_SG *)context;
+       p_net_buffer_list       = s_buf->p_nbl;
+       p_netbuf                        = s_buf->p_curr_nb;
+       p_port                          = s_buf->p_port;
+       p_desc                          = s_buf->p_send_desc;
+       p_desc->send_qp         = s_buf->p_port->ib_mgr.h_qp; // assume UD send.

-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_BUF,
+       XIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_BUF,
                ("Processing netbuffer list %p s_buf %p\n", p_net_buffer_list, s_buf) );

        //TODO Define this function as void if we are not in DBG mode
@@ -4125,13 +4621,12 @@

        CL_ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL);

-       p_port->p_desc->p_netbuf_list = p_net_buffer_list;
-       p_port->p_desc->p_endpt = NULL;
+       p_desc->p_netbuf_list = p_net_buffer_list;
        s_buf->p_send_buf = NULL;
-       p_port->p_desc->num_wrs = 1;
+       p_desc->num_wrs = 1;

-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
-                       ("\nRECEIVED NBL= 0x%p, NB= 0x%p with SG= %p\n********\n", p_net_buffer_list, p_netbuf, p_sgl) );
+       DIPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
+                       ("\nRECEIVED NB= %p with SG= %p\n********\n", p_netbuf, p_sgl) );

        /* Get the ethernet header so we can find the endpoint. */
        cl_perf_start( GetEthHdr );
@@ -4153,6 +4648,9 @@
                goto send_end;
        }

+       dst_is_MULTICAST = ETH_IS_MULTICAST( p_eth_hdr->dst.addr );
+       dst_is_BROADCAST = ETH_IS_BROADCAST( p_eth_hdr->dst.addr );
+
        from_queue = (boolean_t)(s_buf->p_sgl != NULL);

        if (from_queue)
@@ -4160,10 +4658,10 @@
                cl_perf_start( GetEndpt );
                status = __endpt_mgr_ref( p_port,
                                                                  p_eth_hdr->dst,
-                                                                 &(p_port->p_desc->p_endpt) );
+                                                                 &(s_buf->p_endpt) );

-               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-                               ("__endpt_mgr_ref called on EP %p\n", p_port->p_desc->p_endpt));
+               XIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+                               ("__endpt_mgr_ref called on EP %p\n", s_buf->p_endpt));

                cl_perf_stop( &p_port->p_adapter->perf, GetEndpt );
                if( status == NDIS_STATUS_PENDING )
@@ -4181,13 +4679,15 @@
                {
                        ASSERT( status == NDIS_STATUS_NO_ROUTE_TO_DESTINATION );

-                       if( ETH_IS_MULTICAST( p_eth_hdr->dst.addr ) )
+                       if( dst_is_MULTICAST )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
                                        ("recived a mc packet (from the queue) %p\n",
                                                p_net_buffer_list));
-                               if( ipoib_port_join_mcast( p_port, p_eth_hdr->dst,
-                                       IB_MC_REC_STATE_FULL_MEMBER) == IB_SUCCESS )
+                               ib_status = ipoib_port_join_mcast( p_port,
+                                                                                                  p_eth_hdr->dst,
+                                                                                                  IB_MC_REC_STATE_FULL_MEMBER);
+                               if( ib_status == IB_SUCCESS )
                                {
                                        s_buf->p_sgl = p_sgl;
                                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
@@ -4223,9 +4723,9 @@
                s_buf->p_sgl = p_sgl;

                cl_perf_start( SendMgrQueue );
-               if ( ETH_IS_MULTICAST( p_eth_hdr->dst.addr ) &&
-                        p_eth_hdr->type == ETH_PROT_TYPE_IP &&
-                        !ETH_IS_BROADCAST( p_eth_hdr->dst.addr ) )
+               if( dst_is_MULTICAST
+                       && p_eth_hdr->type == ETH_PROT_TYPE_IP
+                       && !dst_is_BROADCAST )
                {
                        ip_hdr_t                        *p_ip_hdr;
                        uint8_t                         *p_tmp;
@@ -4251,18 +4751,19 @@
                                                ("Failed to get IP header buffer.\n") );
                                        goto mc_end;
                                }
-                               NdisQueryMdl(p_ip_hdr_mdl,&p_tmp,&ip_hdr_mdl_len,NormalPagePriority);
+                               NdisQueryMdl( p_ip_hdr_mdl, &p_tmp, &ip_hdr_mdl_len, NormalPagePriority );
                                if( !p_tmp )
                                {
-                                       IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                                ("Failed to get IP header.\n") );
                                        goto mc_end;
                                }
                                if( ip_hdr_mdl_len < sizeof(ip_hdr_t) )
                                {
                                        /* This buffer is done for.  Get the next buffer. */
-                                       IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                                               ("Buffer too small for IP packet.\n") );
+                                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                                               ("ip_hdr_mdl_len(%d) < sizeof(ip_hdr_t)(%d) @ line #%d\n",
+                                               ip_hdr_mdl_len,(int)sizeof(ip_hdr_t),__LINE__) );
                                        goto mc_end;
                                }
                                p_ip_hdr = (ip_hdr_t*)(p_tmp + NET_BUFFER_CURRENT_MDL_OFFSET(p_netbuf));
@@ -4275,7 +4776,7 @@
 mc_end:
                ASSERT(s_buf->p_sgl);

-               status = __send_mgr_queue( p_port, p_eth_hdr, &(p_port->p_desc->p_endpt) );
+               status = __send_mgr_queue( p_port, p_eth_hdr, &s_buf->p_endpt );

                cl_perf_stop( &p_port->p_adapter->perf, SendMgrQueue );
                if( status == NDIS_STATUS_PENDING )
@@ -4283,8 +4784,10 @@
                        /* Queue net buffer list. */
                        cl_perf_start( QueuePacket );

-                       cl_qlist_insert_tail( &p_port->send_mgr.pending_list, (cl_list_item_t*)s_buf );
-                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND, ("Inserting %p NB first time to the pending list\n", p_netbuf));
+                       cl_qlist_insert_tail( &p_port->send_mgr.pending_list,
+                                                                 (cl_list_item_t*)s_buf );
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
+                               ("Inserting NB %p first time to the pending list\n", p_netbuf));

                        cl_perf_stop( &p_port->p_adapter->perf, QueuePacket );
                        ++g_ipoib_send_SG_pending;
@@ -4309,6 +4812,7 @@
                        ret = true;
                        goto send_end;
                }
+               // endpt ref held
        }
        cl_perf_start( BuildSendDesc );
        status = __build_send_desc( p_eth_hdr,
@@ -4320,7 +4824,6 @@

        if( status != NDIS_STATUS_SUCCESS )
        {
-//             ASSERT(FALSE);
                cl_perf_start( ProcessFailedSends );
                __send_complete_net_buffer(s_buf, NDIS_STATUS_FAILURE, complete_flags, TRUE);
                cl_perf_stop( &p_port->p_adapter->perf, ProcessFailedSends );
@@ -4328,25 +4831,64 @@
                goto send_end;
        }

+       if( p_desc->send_qp != p_port->ib_mgr.h_qp &&
+                       p_desc->send_qp != s_buf->p_endpt->conn.h_send_qp )
+       {
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
+                       ("EP %s RESET send_qp to UD\n", s_buf->p_endpt->tag) );
+               p_desc->send_qp = p_port->ib_mgr.h_qp;
+       }
+
+ // XXX+
+#if DBG && 0
+ if( p_desc->send_qp != p_port->ib_mgr.h_qp || p_desc->num_wrs > 1 )
+ {
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
+               ("post_send[%s] EP %s num_wrs %d num_ds %u sgl_size %u\n",
+                       (p_desc->send_qp == p_port->ib_mgr.h_qp ? "UD":"RC"),
+                       s_buf->p_endpt->tag, p_desc->num_wrs,
+                       p_desc->send_wr[0].wr.num_ds,
+                       get_sgl_size(p_sgl)) );
+ }
+#endif
+// XXX-
+
        /* Post the WR. */
        cl_perf_start( PostSend );
-       IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND, ("sending packet with wr-id =0x%p\n",p_net_buffer_list ));
-
-       ib_status = p_port->p_adapter->p_ifc->post_send( p_port->ib_mgr.h_qp, &(p_port->p_desc->send_wr[0].wr), &p_wr_failed );
+       ib_status = p_port->p_adapter->p_ifc->post_send( p_desc->send_qp,
+                                                                                                        &(p_desc->send_wr[0].wr),
+                                                                                                        &p_wr_failed );
        p_port->n_no_progress = 0; // IPoIB can send, reset the failure counter
        ret = true;
        cl_perf_stop( &p_port->p_adapter->perf, PostSend );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("ib_post_send returned %s\n",
-                       p_port->p_adapter->p_ifc->get_err_str( ib_status )) );
+                       ("Port[%d] %s ib_post_send returned %s\n",
+                               p_port->port_num,
+                               (p_desc->send_qp == p_port->ib_mgr.h_qp ? "UD":"RC"),
+                               p_port->p_adapter->p_ifc->get_err_str( ib_status )) );
                cl_perf_start( ProcessFailedSends );
                __send_complete_net_buffer(s_buf, NDIS_STATUS_FAILURE, complete_flags, TRUE);
                cl_perf_stop( &p_port->p_adapter->perf, ProcessFailedSends );
-               /* Flag the adapter as hung since posting is busted. */
-               p_port->p_adapter->hung = TRUE;
-       } else {
+               if( p_desc->send_qp == p_port->ib_mgr.h_qp )
+               {
+                       /* Flag the adapter as hung since posting is busted. */
+                       p_port->p_adapter->hung = TRUE;
+               }
+               else
+               {
+                       /* revert to UD only send */
+                       endpt_cm_set_state( s_buf->p_endpt, IPOIB_CM_DISCONNECT_CLEANUP );
+                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("Port[%d] Release CM Tx resources on EP %s\n",
+                                       p_port->port_num, s_buf->p_endpt->tag) );
+                       cm_release_resources( p_port, s_buf->p_endpt, 1 );
+                       endpt_cm_set_state( s_buf->p_endpt, IPOIB_CM_DISCONNECTED );
+               }
+       }
+       else
+       {
                ++g_ipoib_send;
                cl_atomic_inc( &p_port->send_mgr.depth );
        }
@@ -4359,7 +4901,8 @@
 }


-// This routine called during execution of NdisMAllocateNetBufferSGList().
+// This routine is called (aka callout) from within the execution of the Windows
+// routine NdisMAllocateNetBufferSGList().

 void
 ipoib_process_sg_list(
@@ -4375,13 +4918,23 @@

        ++g_ipoib_send_SG;
        if (g_ipoib_send_SG > 2) {
-               //ASSERT(g_ipoib_send_SG-2 <= g_ipoib_send + g_ipoib_send_mcast+p_port->send_mgr.pending_list.count+ g_ipoib_send_SG_failed);
+#if 0
+               ASSERT( g_ipoib_send_SG-2 <= g_ipoib_send
+                                                                       + g_ipoib_send_mcast
+                                                                       + p_port->send_mgr.pending_list.count
+                                                                       + g_ipoib_send_SG_failed );
+#endif
        }

        ipoib_process_sg_list_real( pDO, pIrp, p_sgl, context );

        if (g_ipoib_send_SG > 1) {
-               //ASSERT(g_ipoib_send_SG-1 <= g_ipoib_send + g_ipoib_send_mcast+p_port->send_mgr.pending_list.count + g_ipoib_send_SG_failed);
+#if 0
+               ASSERT( g_ipoib_send_SG-1 <= g_ipoib_send
+                                                                               + g_ipoib_send_mcast
+                                                                               + p_port->send_mgr.pending_list.count
+                                                                               + g_ipoib_send_SG_failed );
+#endif
        }
        cl_spinlock_release( &p_port->send_lock );
 }
@@ -4399,7 +4952,7 @@
        UINT                    total_offset;

        PSCATTER_GATHER_LIST p_sgl = s_buf->p_sgl;
-       ipoib_send_desc_t *p_desc = s_buf->p_port->p_desc;
+       ipoib_send_desc_t *p_desc = s_buf->p_send_desc;

        PERF_DECLARE( SendCopy );

@@ -4412,7 +4965,7 @@
          * Thus, the calulation should be:
          * Normal send: offset = sizeof ETH header
          * LSO           : offset = sizeof ETH header+sizeof IP header+ sizeof TCP header
-                                               == sizeof LSO header + (sizeof ETH header-sizeof IPoIB header)
+                                       == sizeof LSO header + (sizeof ETH header-sizeof IPoIB header)
          */
        if ( lso_header_size )
        {
@@ -4465,7 +5018,7 @@
        DataOffset= (ULONG)(NET_BUFFER_CURRENT_MDL_OFFSET(s_buf->p_curr_nb));

        /*
-        * Skip the Ethernet or LSO header.  It is contained at N+1 first elements (N>=0),
+        * Skip the Ethernet or LSO header. It is contained at N+1 first elements (N>=0),
         * while (N+1) element may contain only part of it
         */

@@ -4523,6 +5076,10 @@
        size_t                  iph_options_size;
        uint8_t                 prot;
        size_t                  hdr_size;
+       ipoib_send_desc_t *p_desc = s_buf->p_send_desc;
+       boolean_t               dst_is_multicast;
+       boolean_t               dst_is_broadcast;
+       boolean_t               cm_enabled = s_buf->p_port->p_adapter->params.cm_enabled;

        PERF_DECLARE( QueryIp );
        PERF_DECLARE( SendTcp );
@@ -4530,7 +5087,8 @@

        IPOIB_ENTER( IPOIB_DBG_SEND );

-       ipoib_send_desc_t *p_desc = s_buf->p_port->p_desc;
+       CL_ASSERT( p_desc->send_qp == s_buf->p_port->ib_mgr.h_qp );     // start with UD Tx
+
        if( !buf_len )
        {
                cl_perf_start( QueryIp );
@@ -4549,31 +5107,40 @@
                                ("Failed to query IP header buffer.\n") );
                        return NDIS_STATUS_FAILURE;
                }
-               cl_perf_stop( &p_port->p_adapter->perf, QueryIp );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, QueryIp );
        }
        else
        {
                p_ip_hdr = (PVOID) (p_eth_hdr + 1);
        }
-
+
+       dst_is_multicast = ETH_IS_MULTICAST( p_eth_hdr->dst.addr );
+       dst_is_broadcast = ETH_IS_BROADCAST( p_eth_hdr->dst.addr );
+
        if ( p_eth_hdr->type == ETH_PROT_TYPE_IPV6 )
        {
                // BUGBUG: need to add support for extension headers
+               PIPV6_HEADER p_ip6_hdr = (PIPV6_HEADER)p_ip_hdr;
+
                prot = ((ipv6_hdr_t *) p_ip_hdr)->next_header;
                hdr_size = sizeof(ipv6_hdr_t);
-               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,("Got IPV6 Header\n" ) );
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,("Got IPV6 Header\n") );
+               ip_packet_len = cl_ntoh16( p_ip6_hdr->PayloadLength );
        }
        else //IPv4
        {
                prot = ((ip_hdr_t *) p_ip_hdr)->prot;
                hdr_size = sizeof(ip_hdr_t);
+               ip_packet_len = cl_ntoh16( ((ip_hdr_t*)p_ip_hdr)->length );
                ipoib_print_ip_hdr( (ip_hdr_t *) p_ip_hdr );
        }
+
        if( buf_len < hdr_size )
        {
                /* This buffer is done for.  Get the next buffer. */
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("Buffer too small for IP packet.\n") );
+                       ("buf_len(%d) < hdr_size(%d) @ Line #%d\n",
+                               (int)buf_len,(int)hdr_size,__LINE__) );
                return NDIS_STATUS_BUFFER_TOO_SHORT;
        }

@@ -4583,18 +5150,25 @@
                cl_perf_start( FilterUdp );
                status = __send_mgr_filter_udp( p_ip_hdr, p_mdl,
                                                                                (buf_len - hdr_size), p_eth_hdr->type, s_buf );
-               cl_perf_stop( &p_port->p_adapter->perf, FilterUdp );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, FilterUdp );
+
                if( status != NDIS_STATUS_PENDING ) {
                        return status;
                }
                /* not DHCP packet, keep going */
-               if( ETH_IS_MULTICAST( p_eth_hdr->dst.addr ) )
-                       p_desc->send_dir = SEND_UD_QP;
-               else
-                       p_desc->send_dir = SEND_RC_QP;
-               break;
+               if( !dst_is_multicast && cm_enabled && s_buf->p_endpt->conn.h_send_qp )
+                       p_desc->send_qp = s_buf->p_endpt->conn.h_send_qp;
+
+               break;
        case IP_PROT_TCP:
-               p_desc->send_dir = SEND_RC_QP;
+               if( !cm_enabled )
+                       break;
+
+               if( s_buf->p_endpt->conn.h_send_qp &&
+                       ip_packet_len <= s_buf->p_endpt->tx_mtu )
+               {
+                       p_desc->send_qp = s_buf->p_endpt->conn.h_send_qp;       // RC Tx
+               }
                break;
        case IP_PROT_IGMP:
                /*
@@ -4619,49 +5193,63 @@
                                                                                        buf_len );
                if( status != NDIS_STATUS_SUCCESS )
                        return status;
+               break;  // required if ICMP/ICMPV6 can go RC?

        case IP_PROT_ICMP:
        case IP_PROT_ICMPV6:
-               p_desc->send_dir = SEND_UD_QP;
+               break;
+
+       case IPPROTO_HOPOPTS:
+               break;
+
        default:
                break;
        }

-       if( !s_buf->p_port->p_adapter->params.cm_enabled )
-       {
-               p_desc->send_dir = SEND_UD_QP;
-               goto send_gen;
-       }
-       else if( endpt_cm_get_state( p_desc->p_endpt ) != IPOIB_CM_CONNECTED )
+       CL_ASSERT( s_buf->p_endpt );
+
+       if( p_desc->send_qp == s_buf->p_port->ib_mgr.h_qp ) // UD Tx
        {
-               p_desc->send_dir = SEND_UD_QP;
+               if( ip_packet_len > s_buf->p_endpt->tx_mtu )
+               {
+                       DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
+                               ("SEND_UD needs IP fragmentation ip_pkt_len %d mtu %u\n",
+                                       ip_packet_len, s_buf->p_endpt->tx_mtu) );
+
+                       if ( p_eth_hdr->type == ETH_PROT_TYPE_IPV6 )
+                       {
+                               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                                       ("IPv6 packet (len %d) wants IP fragmentation, unsupported.\n",
+                                               ip_packet_len) );
+                               return NDIS_STATUS_FAILURE;
+                       }
+                       status = __build_ipv4_fragments( s_buf,
+                                                                                        (ip_hdr_t* const)p_ip_hdr,
+                                                                                        (uint32_t)buf_len,
+                                                                                        ip_packet_len,
+                                                                                        p_mdl );
+
+                       /* no need for send_gen(,0,0) as wr's & ds have already been setup. */
+                       IPOIB_EXIT( IPOIB_DBG_SEND );
+                       return status;
+               }
        }
-       if( p_desc->send_dir == SEND_UD_QP )
+       else
        {
-               ip_packet_len = cl_ntoh16( ((ip_hdr_t*)p_ip_hdr)->length ); //TODO add IPv6 support for CM flow
-               if( ip_packet_len  > s_buf->p_port->p_adapter->params.payload_mtu )
+               /* want to send RC, can we? */
+               if( ip_packet_len > s_buf->p_endpt->tx_mtu )
                {
-                       //TODO: NDIS60
-                       #if 0
-                       status = __send_fragments( p_port,
-                                                                          p_desc,
-                                                                          (eth_hdr_t* const)p_eth_hdr,
-                                                                          (ip_hdr_t* const)p_ip_hdr,
-                                                                          (uint32_t)buf_len,
-                                                                          p_mdl );
-                       return status;
-                       #endif
-                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                               ("TODO NDIS6 SEND_UD_QP fragments @ line #%d file %s\n",
-                                       __LINE__,__FILE__) );
-                       ASSERT(FALSE);
+                       /* problems? */
+                       IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("ERR: want RC send, IP packet Len %d > Tx payload MTU %u\n",
+                                       ip_packet_len, s_buf->p_endpt->tx_mtu) );
+                       return NDIS_STATUS_INVALID_LENGTH;
                }
        }

-send_gen:
        cl_perf_start( SendTcp );
        status = __send_gen( s_buf, 0 );
-       cl_perf_stop( &p_port->p_adapter->perf, SendTcp );
+       cl_perf_stop( &s_buf->p_port->p_adapter->perf, SendTcp );

        IPOIB_EXIT( IPOIB_DBG_SEND );
        return status;
@@ -4791,6 +5379,12 @@

                break;

+       case IGMP_VERSION3_REPORT_TYPE:
+               XIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
+                    ("IGMP_VERSION3_REPORT_TYPE: 0x%x, unsupported\n",
+                               p_igmp_v2_hdr->type ) );
+               break;
+
        default:
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
                             ("Send Unknown IGMP message: 0x%x \n", p_igmp_v2_hdr->type ) );
@@ -4812,9 +5406,9 @@
        NDIS_STATUS                     status;
        udp_hdr_t                       *p_udp_hdr;
        PERF_DECLARE( QueryUdp );
-       PERF_DECLARE( SendUdp );
        PERF_DECLARE( FilterDhcp );
-       IPOIB_ENTER( IPOIB_DBG_SEND );
+
+       XIPOIB_ENTER( IPOIB_DBG_SEND );

        if( !buf_len )
        {
@@ -4898,7 +5492,8 @@
        memset(s_buf->p_send_buf, 0, s_buf->p_port->buf_mgr.send_buf_len);

        /* Copy the IP and UDP headers. */
-       //TODO: in this case we limited IP size to 20, but it can be bigger, according to GetIpPayloadPtr
+       //TODO: in this case we limited IP size to 20, but it can be bigger, according
+       // to GetIpPayloadPtr
        if ( ethertype == ETH_PROT_TYPE_IPV6 )
        {
                memcpy( &s_buf->p_send_buf->ipv6.hdr, p_ip_hdr , sizeof(ipv6_hdr_t) );
@@ -4907,18 +5502,18 @@
        {
                memcpy( &s_buf->p_send_buf->ip.hdr, p_ip_hdr , sizeof(ip_hdr_t) );
        }
-
+
        memcpy( &s_buf->p_send_buf->ip.prot.udp.hdr, p_udp_hdr, sizeof(udp_hdr_t) );

        cl_perf_start( FilterDhcp );
        status = __send_mgr_filter_dhcp( p_udp_hdr, p_mdl, buf_len, ethertype, s_buf );
-       cl_perf_stop( &p_port->p_adapter->perf, FilterDhcp );
+       cl_perf_stop( &s_buf->p_port->p_adapter->perf, FilterDhcp );

-       IPOIB_EXIT( IPOIB_DBG_SEND );
+       XIPOIB_EXIT( IPOIB_DBG_SEND );
        return status;
 }

-unsigned short ipchksum(unsigned short *ip, int len)
+static unsigned short ipchksum( unsigned short *ip, int len )
 {
     unsigned long sum = 0;

@@ -4947,7 +5542,7 @@

        IPOIB_ENTER( IPOIB_DBG_SEND );

-       ipoib_send_desc_t *p_desc = s_buf->p_port->p_desc;
+       ipoib_send_desc_t *p_desc = s_buf->p_send_desc;

        if( !buf_len )
        {
@@ -5114,7 +5709,7 @@
                                                                                        + sizeof(dhcp_pkt_t);
        p_desc->send_wr[0].local_ds[1].lkey = s_buf->p_port->ib_mgr.lkey;
        p_desc->send_wr[0].wr.num_ds = 2;
-       p_desc->send_dir = SEND_UD_QP;
+       p_desc->send_qp = s_buf->p_port->ib_mgr.h_qp;   // UD Tx
        IPOIB_EXIT( IPOIB_DBG_SEND );
        return NDIS_STATUS_SUCCESS;
 }
@@ -5131,11 +5726,11 @@
        ipoib_arp_pkt_t         *p_ib_arp;
        NDIS_STATUS                     status;
        mac_addr_t                      null_hw = {0};
+       ipoib_send_desc_t       *p_desc = s_buf->p_send_desc;
+       ipoib_port_t            *p_port = s_buf->p_port;

-       IPOIB_ENTER( IPOIB_DBG_SEND );
+       IPOIB_ENTER( IPOIB_DBG_ARP );

-       ipoib_send_desc_t *p_desc = s_buf->p_port->p_desc;
-
        if( !buf_len )
        {
                NdisGetNextMdl( p_mdl, &p_mdl );
@@ -5191,13 +5786,16 @@
        p_ib_arp->prot_size = p_arp->prot_size;
        p_ib_arp->op = p_arp->op;

-       ipoib_addr_set_qpn( &p_ib_arp->src_hw, s_buf->p_port->ib_mgr.qpn );
-       ipoib_addr_set_flags( &p_ib_arp->src_hw, 0 );
-#if 0
+       ipoib_addr_set_qpn( &p_ib_arp->src_hw, s_buf->p_port->ib_mgr.qpn ); /* UD QPN */
+       ipoib_addr_set_flags( &p_ib_arp->src_hw,
+                       (s_buf->p_port->p_adapter->params.cm_enabled ? IPOIB_CM_FLAG_RC : 0) );

-       if( p_port->p_adapter->params.cm_enabled )
+#if DBG
+       if( s_buf->p_port->p_adapter->params.cm_enabled )
        {
-               ipoib_addr_set_flags( &p_ib_arp->src_hw, IPOIB_CM_FLAG_RC );
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                       ("Set CM_FLAG_RC ib_arp.src_hw ARP %s\n",
+                               (p_ib_arp->op == ARP_OP_REQ ? "REQ":"REPL")) );
        }
 #endif

@@ -5222,69 +5820,85 @@
                        return status;
                }
                ipoib_addr_set_qpn( &p_ib_arp->dst_hw, qpn );
-#if 0
+
+#if IPOIB_CM
+#if DBG
+               {
+               char ipa[16];
+               bool req = (p_arp->op == ARP_OP_REQ);
+
+               RtlIpv4AddressToStringA((IN_ADDR*)&p_ib_arp->dst_ip, ipa);
+
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                       ("Sending UD ARP-%s to [%s] EP %s CM_cap %s\n",
+                               (req ? "REQ" : "REP"),
+                               ipa,
+                               s_buf->p_endpt->tag,
+                               (s_buf->p_endpt->cm_flag == IPOIB_CM_FLAG_RC ? "1" : "0")) );
+               }
+#endif
+
+#if 0 // SKIP conn establishment in favor of recv ARP-REPLY conn setup
+
+               /* ARP Reply is not over RC per IPOIB CM spec. */
                if( p_arp->op == ARP_OP_REP &&
-                       p_port->p_adapter->params.cm_enabled &&
-                       p_desc->p_endpt->cm_flag == IPOIB_CM_FLAG_RC )
+                       s_buf->p_port->p_adapter->params.cm_enabled &&
+                       s_buf->p_endpt->cm_flag == IPOIB_CM_FLAG_RC )
                {
-                       cm_state_t      cm_state;
-                       cm_state =
-                               ( cm_state_t )InterlockedCompareExchange( (volatile LONG *)&p_desc->p_endpt->conn.state,
-                                                               IPOIB_CM_CONNECT, IPOIB_CM_DISCONNECTED );
+                       cm_state_t      cm_state = endpt_cm_get_state(s_buf->p_endpt);
+
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                               ("CM state %s\n",cm_get_state_str(cm_state)) );
+
                        switch( cm_state )
                        {
                        case IPOIB_CM_DISCONNECTED:
-                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                                               ("ARP REPLY pending Endpt[%p] QPN %#x MAC %02x:%02x:%02x:%02x:%02x:%02x\n",
-                                               p_desc->p_endpt,
-                                               cl_ntoh32( ipoib_addr_get_qpn( &p_ib_arp->dst_hw )),
-                                               p_desc->p_endpt->mac.addr[0], p_desc->p_endpt->mac.addr[1],
-                                               p_desc->p_endpt->mac.addr[2], p_desc->p_endpt->mac.addr[3],
-                                               p_desc->p_endpt->mac.addr[4], p_desc->p_endpt->mac.addr[5] ) );
-                                       ipoib_addr_set_sid( &p_desc->p_endpt->conn.service_id,
-                                                                               ipoib_addr_get_qpn( &p_ib_arp->dst_hw ) );
-
-                                       NdisFreeToNPagedLookasideList(
-                                               &p_port->buf_mgr.send_buf_list, s_buf->p_send_buf );
-                                       cl_qlist_insert_tail( &p_port->send_mgr.pending_list,
-                                                               IPOIB_LIST_ITEM_FROM_NBL( s_buf->p_send_buf ) );
-                                       NdisInterlockedInsertTailList( &p_port->endpt_mgr.pending_conns,
-                                                                                               &p_desc->p_endpt->list_item,
-                                                                                               &p_port->endpt_mgr.conn_lock );
-                                       cl_event_signal( &p_port->endpt_mgr.event );
-                                       return NDIS_STATUS_PENDING;
+                               cm_state = (cm_state_t)InterlockedCompareExchange(
+                                                               (volatile LONG *)&s_buf->p_endpt->conn.state,
+                                                               IPOIB_CM_QUEUED_TO_CONNECT, IPOIB_CM_DISCONNECTED );
+                               {
+                                       char ipa[16];
+
+                                       RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->src_ip, ipa );
+
+                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                                       ("Queue RC connect[%s], send ARP REPLY 2 EP %s\n",
+                                               ipa, s_buf->p_endpt->tag) );
+                               }
+                               ipoib_addr_set_sid( &s_buf->p_endpt->conn.service_id, qpn);
+                               endpt_queue_cm_connection( p_port, s_buf->p_endpt );
+                               break;

-                       case IPOIB_CM_CONNECT:
-                               /* queue ARP REP packet until connected */
-                               NdisFreeToNPagedLookasideList(
-                                       &p_port->buf_mgr.send_buf_list, s_buf->p_send_buf );
-                               cl_qlist_insert_tail( &p_port->send_mgr.pending_list,
-                                                                               IPOIB_LIST_ITEM_FROM_NBL( s_buf->p_nbl ) );
+                       case IPOIB_CM_CONNECTING:
+                               break;

-                               return NDIS_STATUS_PENDING;
                        default:
                                break;
                        }
                }
-#endif
+#endif // SKIP connect - see recv_cb for conn establishment
+#endif // IPOIB_CM
        }
        else
        {
                memset( &p_ib_arp->dst_hw, 0, sizeof(ipoib_hw_addr_t) );
        }

-#if 0 //DBG
+#if IPOIB_CM
        if( p_port->p_adapter->params.cm_enabled )
        {
-               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-               (" ARP %s SEND to ENDPT[%p] State: %d flag: %#x, QPN: %#x MAC %02x:%02x:%02x:%02x:%02x:%02x\n",
-                       ( p_ib_arp->op == ARP_OP_REP ? "REP": "REQ"),                   p_desc->p_endpt,
-                       endpt_cm_get_state( p_desc->p_endpt ),
-                       p_desc->p_endpt->cm_flag,
-                       cl_ntoh32( ipoib_addr_get_qpn( &p_ib_arp->dst_hw )),
-                       p_desc->p_endpt->mac.addr[0], p_desc->p_endpt->mac.addr[1],
-                       p_desc->p_endpt->mac.addr[2], p_desc->p_endpt->mac.addr[3],
-                       p_desc->p_endpt->mac.addr[4], p_desc->p_endpt->mac.addr[5] ));
+               char ip_dst[16];
+
+               RtlIpv4AddressToStringA( (IN_ADDR*)&p_ib_arp->dst_ip, ip_dst );
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ARP,
+                       ("Send ARP-%s to EP %s [%s] %s RCM_cap %d %s MAC %s\n",
+                       (p_ib_arp->op == ARP_OP_REP ? "REP": "REQ"),
+                       s_buf->p_endpt->tag,
+                       ip_dst,
+                       cm_get_state_str(endpt_cm_get_state(s_buf->p_endpt)),
+                       (s_buf->p_endpt->cm_flag == IPOIB_CM_FLAG_RC ? 1:0),
+                       (p_desc->send_qp == s_buf->p_port->ib_mgr.h_qp ? "UD":"RC"),
+                       mk_mac_str(&s_buf->p_endpt->mac)) );
        }
 #endif

@@ -5332,7 +5946,8 @@

        cl_perf_start( GetEndpt );
        status = __endpt_mgr_ref( p_port, p_eth_hdr->dst, pp_endpt );
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,("__endpt_mgr_ref called for %p\n", *pp_endpt));
+       XIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+                       ("__endpt_mgr_ref called for %p\n", *pp_endpt));
        cl_perf_stop( &p_port->p_adapter->perf, GetEndpt );

        if( status == NDIS_STATUS_NO_ROUTE_TO_DESTINATION &&
@@ -5372,32 +5987,35 @@
        NDIS_STATUS                     status;
        int32_t                         hdr_idx;
        ULONG                           mss = 0;
+       PVOID                           *ppTemp;
+       ipoib_send_desc_t       *p_desc= s_buf->p_port->p_desc;
+       ipoib_endpt_t           *p_endpt = s_buf->p_endpt;

        PNDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO                      p_checksum_list_info = NULL;
        PNDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO       p_lso_info = NULL;
        PERF_DECLARE( SendMgrFilter );

        IPOIB_ENTER( IPOIB_DBG_SEND );
-
-       ipoib_send_desc_t *p_desc = s_buf->p_port->p_desc;

        /* Store context in our reserved area of the packet. */

        ASSERT(s_buf == (ipoib_send_NB_SG *) IPOIB_INFO_FROM_NB(s_buf->p_curr_nb));
-       s_buf->p_endpt = p_desc->p_endpt;
+
        //TODO IMPORTANT: Send buffer should not be allocated within global struct !!!
        // Otherwise, the next send may override its content
        //s_buf->p_send_buf= p_desc->p_buf;

        /* Format the send descriptor. */
-       PVOID* ppTemp           = &NET_BUFFER_LIST_INFO(s_buf->p_nbl, TcpIpChecksumNetBufferListInfo);
-    p_checksum_list_info =
-                        (PNDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO) ((PULONG)ppTemp);
-       // Calculate LSO
+       ppTemp = &NET_BUFFER_LIST_INFO( s_buf->p_nbl, TcpIpChecksumNetBufferListInfo );
+    p_checksum_list_info =
+                               (PNDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO) ((PULONG)ppTemp);
+       // Calculate LSO - no LSO if CM enabled.
        if( s_buf->p_port->p_adapter->params.lso )
        {
-               p_lso_info =
-                       (PNDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO) (PULONG) &NET_BUFFER_LIST_INFO(s_buf->p_nbl, TcpLargeSendNetBufferListInfo);
+               ASSERT( !s_buf->p_port->p_adapter->params.cm_enabled );
+               p_lso_info = (PNDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO)
+                                       (PULONG) &NET_BUFFER_LIST_INFO(s_buf->p_nbl,
+                                                                                               TcpLargeSendNetBufferListInfo);
                ASSERT(p_lso_info);
        ULONG LsoType = p_lso_info->Transmit.Type;

@@ -5410,9 +6028,11 @@
                }
            if(LsoType == NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE)
                {
-               ASSERT(p_lso_info->LsoV2Transmit.Type == NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE);
+               ASSERT(p_lso_info->LsoV2Transmit.Type ==
+                                                                               NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE);
                ASSERT(mss == p_lso_info->LsoV2Transmit.MSS);
-               ASSERT(p_lso_info->LsoV1Transmit.TcpHeaderOffset == p_lso_info->LsoV2Transmit.TcpHeaderOffset);
+               ASSERT(p_lso_info->LsoV1Transmit.TcpHeaderOffset ==
+                                                                       p_lso_info->LsoV2Transmit.TcpHeaderOffset);
            }
        }

@@ -5424,14 +6044,14 @@
        /* Set up IPoIB Header */
        s_buf->p_port->hdr[hdr_idx].type = p_eth_hdr->type;
        s_buf->p_port->hdr[hdr_idx].resv = 0;
-

        //Init send buffer to 0
        s_buf->p_send_buf = NULL;

        if (mss && (p_lso_info->LsoV1Transmit.TcpHeaderOffset != 0))
        { //We have LSO packet
-               ASSERT( mss == (p_lso_info->LsoV1Transmit.MSS & p_lso_info->LsoV2Transmit.MSS));
+               ASSERT( mss == (p_lso_info->LsoV1Transmit.MSS &
+                                                                                               p_lso_info->LsoV2Transmit.MSS));
                //ASSERT ( (mss & (1<<20)) == mss);
                status = __build_lso_desc( s_buf->p_port,
                                                                   mss,
@@ -5467,42 +6087,73 @@
                        return status;
                }

-               if( p_desc->send_dir == SEND_UD_QP )
+               /* want to Transmit over RC connection - is RC connection available & ready?
+                * if endpt RC connection state is OK, then, set
+                * p_desc->send_qp to be RC not UD QP. Otherwise reset send_QP to UD QP.
+                */
+
+               if( s_buf->p_port->p_adapter->params.cm_enabled
+                   && p_desc->send_qp == p_endpt->conn.h_send_qp )     // RC Tx
+               {
+                       cm_state_t      cstate = endpt_cm_get_state( p_endpt );
+
+                       switch( cstate )
+                       {
+                         case IPOIB_CM_CONNECTED:
+
+                               for( i = 0; i < p_desc->num_wrs; i++ )
+                                       p_desc->send_wr[i].wr.send_opt = 0;
+                               goto wr_setup;
+
+                         case IPOIB_CM_CONNECTING:
+                               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
+                                       ("Revert RC to UD, EP %s CONNECTING\n", p_endpt->tag) );
+                               break;
+
+                         case IPOIB_CM_DISCONNECTED:
+                               break;
+
+                         default:
+                               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
+                                       ("Revert RC to UD, RC not CONNECTED  cstate %s EP %s\n",
+                                               cm_get_state_str(cstate), p_endpt->tag) );
+                               break;
+                       }
+                       // Not yet, set UD Tx.
+                       p_desc->send_qp = s_buf->p_port->ib_mgr.h_qp;
+               }
+
+               if( p_desc->send_qp == s_buf->p_port->ib_mgr.h_qp ) // UD Tx ?
                {
-                       ASSERT ( p_desc->num_wrs == 1 );
-                       p_desc->send_qp = s_buf->p_port->ib_mgr.h_qp; // UD QP
                        for( i = 0; i < p_desc->num_wrs; i++ )
                        {
-                               p_desc->send_wr[i].wr.dgrm.ud.remote_qp = p_desc->p_endpt->qpn;
+                               p_desc->send_wr[i].wr.dgrm.ud.remote_qp = p_endpt->qpn;
                                p_desc->send_wr[i].wr.dgrm.ud.remote_qkey = s_buf->p_port->ib_mgr.bcast_rec.qkey;
-                               p_desc->send_wr[i].wr.dgrm.ud.h_av = p_desc->p_endpt->h_av;
+                               p_desc->send_wr[i].wr.dgrm.ud.h_av = p_endpt->h_av;
                                p_desc->send_wr[i].wr.dgrm.ud.pkey_index = s_buf->p_port->pkey_index;
                                p_desc->send_wr[i].wr.dgrm.ud.rsvd = NULL;
                                p_desc->send_wr[i].wr.send_opt = 0;

-                               if( s_buf->p_port->p_adapter->params.send_chksum_offload        &&
-                                       p_checksum_list_info                                                    &&
-                                       ( p_checksum_list_info->Transmit.IsIPv4  ||
-                                       p_checksum_list_info->Transmit.IsIPv6  ))
+                               if( s_buf->p_port->p_adapter->params.send_chksum_offload &&
+                                       p_checksum_list_info &&
+                                       (p_checksum_list_info->Transmit.IsIPv4 ||
+                                       p_checksum_list_info->Transmit.IsIPv6) )
                                {
                                        // Set transmit checksum offloading
                                        if( p_checksum_list_info->Transmit.IpHeaderChecksum )
                                        {
                                                p_desc->send_wr[i].wr.send_opt |= IB_SEND_OPT_TX_IP_CSUM;
                                        }
-                                       if( p_checksum_list_info->Transmit.TcpChecksum || p_checksum_list_info->Transmit.UdpChecksum)
+                                       if( p_checksum_list_info->Transmit.TcpChecksum ||
+                                               p_checksum_list_info->Transmit.UdpChecksum )
                                        {
                                                p_desc->send_wr[i].wr.send_opt |= IB_SEND_OPT_TX_TCP_UDP_CSUM;
                                        }
                                }
                        }
                }
-               else // RC QP
-               {
-                       CL_ASSERT( p_desc->send_dir == SEND_RC_QP );
-                       p_desc->send_qp = p_desc->p_endpt->conn.h_work_qp;
-               }

+wr_setup:
                for( i = 0; i < p_desc->num_wrs; i++ )
                {
                        p_desc->send_wr[i].wr.wr_type = WR_SEND;
@@ -5514,8 +6165,12 @@
                        }
                }

-               p_desc->send_wr[p_desc->num_wrs - 1].wr.wr_id = (uintn_t)s_buf ;
+               p_desc->send_wr[p_desc->num_wrs - 1].wr.wr_id = (uintn_t)s_buf;
                p_desc->send_wr[p_desc->num_wrs - 1].wr.send_opt |= IB_SEND_OPT_SIGNALED;
+
+               if( p_desc->send_qp != s_buf->p_port->ib_mgr.h_qp )     // RC Tx
+                       p_desc->send_wr[p_desc->num_wrs - 1].wr.send_opt |=IB_SEND_OPT_SOLICITED;
+
                p_desc->send_wr[p_desc->num_wrs - 1].wr.p_next = NULL;
        }

@@ -5539,7 +6194,7 @@
        IPOIB_ENTER( IPOIB_DBG_SEND );

        ipoib_send_NB_SG  *s_buf = IPOIB_INFO_FROM_NB(p_netbuf);
-       ipoib_send_desc_t *p_desc = p_port->p_desc;
+       ipoib_send_desc_t *p_desc = s_buf->p_send_desc;

        //TODO What if first NB was inserted to pending list ????
        PNET_BUFFER     FirstBuffer  = NET_BUFFER_LIST_FIRST_NB (s_buf->p_nbl);
@@ -5570,16 +6225,15 @@
        // Tell NDIS how much we will send.
        if(p_lso_info->LsoV1Transmit.Type == NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE)
     {
-               s_buf->tcp_payload = PacketLength-TheLsoData.LsoHeaderSize;
-
+       s_buf->tcp_payload = PacketLength-TheLsoData.LsoHeaderSize;
        }

        p_desc->send_wr[0].wr.dgrm.ud.mss = mss;
        p_desc->send_wr[0].wr.dgrm.ud.header = TheLsoData.LsoBuffers[0].pData;
        p_desc->send_wr[0].wr.dgrm.ud.hlen = TheLsoData.LsoHeaderSize ;//lso_header_size;
-       p_desc->send_wr[0].wr.dgrm.ud.remote_qp = p_desc->p_endpt->qpn;
+       p_desc->send_wr[0].wr.dgrm.ud.remote_qp = s_buf->p_endpt->qpn;
        p_desc->send_wr[0].wr.dgrm.ud.remote_qkey = p_port->ib_mgr.bcast_rec.qkey;
-       p_desc->send_wr[0].wr.dgrm.ud.h_av = p_desc->p_endpt->h_av;
+       p_desc->send_wr[0].wr.dgrm.ud.h_av = s_buf->p_endpt->h_av;
        p_desc->send_wr[0].wr.dgrm.ud.pkey_index = p_port->pkey_index;
        p_desc->send_wr[0].wr.dgrm.ud.rsvd = NULL;

@@ -5591,7 +6245,6 @@

        p_desc->send_wr[0].wr.p_next = NULL;
        p_desc->send_qp = p_port->ib_mgr.h_qp;
-       p_desc->send_dir = SEND_UD_QP;
        status = __send_gen( s_buf, TheLsoData.LsoHeaderSize );

        IPOIB_EXIT( IPOIB_DBG_SEND );
@@ -5783,14 +6436,7 @@

        KIRQL                           old_irql;

-       PERF_DECLARE( GetEthHdr );
-       PERF_DECLARE( BuildSendDesc );
-       PERF_DECLARE( QueuePacket );
-       PERF_DECLARE( SendMgrQueue );
-       PERF_DECLARE( PostSend );
-       PERF_DECLARE( ProcessFailedSends );
-
-       IPOIB_ENTER( IPOIB_DBG_SEND );
+       XIPOIB_ENTER( IPOIB_DBG_SEND );

        if (NDIS_TEST_SEND_AT_DISPATCH_LEVEL(send_flags))
        {
@@ -5843,7 +6489,7 @@
        }

        IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
-                               ("Processing netbuffer list: %p buf_cnt = %d\n", p_net_buffer_list, buf_cnt));
+               ("Processing netbuffer list: %p buf_cnt %d\n", p_net_buffer_list, buf_cnt));

        ASSERT(buf_cnt);

@@ -5872,6 +6518,7 @@
                //Set all the data needed for process_sg_list
                s_buf->p_port = p_port;
                s_buf->p_sgl = NULL;
+               s_buf->p_send_desc = p_port->p_desc;
                s_buf->p_endpt = NULL;
                s_buf->p_nbl = p_net_buffer_list;
                s_buf->p_curr_nb = p_netbuf;
@@ -5887,29 +6534,14 @@

                IPOIB_INFO_FROM_NB(p_netbuf) = s_buf;

-               IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
+               IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_BUF,
                                ("Netbuf to send = %p\n", p_netbuf) );
-#if 0
-                       CHAR *pTemp = (CHAR *) ExAllocatePoolWithTag(NonPagedPool , p_port->p_adapter->sg_list_size, 'abcd');
-                       CL_ASSERT(pTemp != NULL);
-                       p_sgl = pTemp;
-                       CreateFragList(NdisQueryNetBufferPhysicalCount(p_netbuf), p_netbuf, NET_BUFFER_DATA_LENGTH(p_netbuf), (PMP_FRAG_LIST) p_sgl);
-                       IPOIB_FROM_QUEUE(p_netbuf) = NULL;
-                       /*IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                                               ("[%d] Allocation from scratch: Netbuf = %x, found SGL = %x, PhysBufCnt=%ld, NB LEN = %ld, sg_list_size=%ld\n",
-                                               buf_cnt, p_netbuf, p_sgl,NdisQueryNetBufferPhysicalCount(p_netbuf) ,
-                                               NET_BUFFER_DATA_LENGTH(p_netbuf),p_port->p_adapter->sg_list_size) );
-                                               */
-                       ipoib_process_sg_list(NULL, NULL, (PSCATTER_GATHER_LIST)p_sgl, p_netbuf);
-                       status = NDIS_STATUS_SUCCESS;
-#else
-
-                       //cl_qlist_check_validity(&p_port->send_mgr.pending_list);

-                       cl_spinlock_release( &p_port->send_lock );
+               //cl_qlist_check_validity(&p_port->send_mgr.pending_list);

+               cl_spinlock_release( &p_port->send_lock );

-                       ++g_ipoib_send_SW_in_loop;
+               ++g_ipoib_send_SW_in_loop;

                status = NdisMAllocateNetBufferSGList(
                                                        p_port->p_adapter->NdisMiniportDmaHandle,
@@ -5920,7 +6552,6 @@
                                                        p_port->p_adapter->sg_list_size);

                cl_spinlock_acquire( &p_port->send_lock );
-#endif

                if( status != NDIS_STATUS_SUCCESS )
                {
@@ -5943,7 +6574,7 @@

        NDIS_LOWER_IRQL(old_irql, DISPATCH_LEVEL);

-       IPOIB_EXIT( IPOIB_DBG_SEND );
+       XIPOIB_EXIT( IPOIB_DBG_SEND );
 }

 static inline void
@@ -5953,12 +6584,17 @@
        IN      ULONG                           compl_flags,
        IN      boolean_t                       bLock )
 {
+       PERF_DECLARE( FreeSendBuf );
+       PERF_DECLARE( SendComp );
+
        CL_ASSERT( s_buf );

        IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_BUF,
                ("Processing send completion for NBL %p s_buf %p\n",
                        s_buf->p_nbl, s_buf));

+       cl_perf_start( SendComp );
+
        // Free SGL element allocated by NDIS
        // We should do it before freeing the whole NBL
        NdisMFreeNetBufferSGList( s_buf->p_port->p_adapter->NdisMiniportDmaHandle,
@@ -6041,7 +6677,7 @@
                cl_perf_start( FreeSendBuf );
                NdisFreeToNPagedLookasideList( &s_buf->p_port->buf_mgr.send_buf_list,
                        s_buf->p_send_buf );
-               cl_perf_stop( &p_port->p_adapter->perf, FreeSendBuf );
+               cl_perf_stop( &s_buf->p_port->p_adapter->perf, FreeSendBuf );
        }

        /* Dereference the enpoint used for the transfer. */
@@ -6053,18 +6689,30 @@
        if (status == NDIS_STATUS_SUCCESS)
        {
                //++g_ipoib_send_SG_real;
-               ipoib_inc_send_stat( p_port->p_adapter, IP_STAT_SUCCESS, 0 );
+               ipoib_inc_send_stat( s_buf->p_port->p_adapter, IP_STAT_SUCCESS, 0 );
        } else {
                ++g_ipoib_send_SG_failed;
-               ipoib_inc_send_stat( p_port->p_adapter, IP_STAT_ERROR, 0 );
+               ipoib_inc_send_stat( s_buf->p_port->p_adapter, IP_STAT_ERROR, 0 );
        }
 #endif
        //Put back into the pool list structure allocated for the NB
        cl_qpool_put(&s_buf->p_port->send_mgr.send_pool, (cl_pool_item_t* )s_buf);

-       cl_perf_stop( &p_port->p_adapter->perf, SendComp );
+       cl_perf_stop( &s_buf->p_port->p_adapter->perf, SendComp );
 }

+void  /* Externally visible (non-static) wrapper: traces the completion, then calls __send_complete_net_buffer(). */
+ipoib_send_complete_net_buffer(  /* NOTE(review): trace text says "RC"; presumably used by the RC/CM send path - confirm against callers. */
+       IN      ipoib_send_NB_SG        *s_buf,  /* send context; dereferenced for the trace (s_buf->p_nbl), so must be non-NULL */
+       IN      NDIS_STATUS             status,  /* passed through unchanged */
+       IN      ULONG                           compl_flags,  /* passed through unchanged */
+       IN      boolean_t                       bLock )  /* passed through unchanged */
+{
+       IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_BUF,
+               ("RC send completion for NBL %p s_buf %p\n", s_buf->p_nbl, s_buf));
+       /* No work is done here beyond the trace above: all four arguments are forwarded as received. */
+       __send_complete_net_buffer( s_buf, status, compl_flags, bLock );
+}

 void
 ipoib_port_resume(
@@ -6074,31 +6722,33 @@
        cl_list_item_t          *p_item;
        ipoib_send_NB_SG        *s_buf = NULL;
        boolean_t                       b_good_port_state = TRUE;
-
-       PERF_DECLARE( GetEndpt );
-       PERF_DECLARE( BuildSendDesc );
-       PERF_DECLARE( ProcessFailedSends );
-       PERF_DECLARE( PostSend );
-
-       IPOIB_ENTER( IPOIB_DBG_SEND );
+       bool                            continue_sending;
+       KIRQL                           cur_irql = DISPATCH_LEVEL;

        UNUSED_PARAM(b_pending);

+       XIPOIB_ENTER( IPOIB_DBG_SEND );
+
+       if( KeGetCurrentIrql() != DISPATCH_LEVEL )
+       {
+               NDIS_RAISE_IRQL_TO_DISPATCH(&cur_irql);
+       }
+
        ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL);

        cl_obj_lock( &p_port->obj );
        if( p_port->state != IB_QPS_RTS )
        {
-               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_SEND,
-                       ("Invalid port state =%d - Flush pending list\n", p_port->state) );
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
+                       ("Port[%d] Invalid state !IB_QPS_RTS(%d) - Flush pending list\n",
+                               p_port->port_num, p_port->p_adapter->state) );
                b_good_port_state = FALSE;
        }
        cl_obj_unlock( &p_port->obj );

        if (p_port->send_mgr.pending_list.count <= 0)
-       {
-               return;
-       }
+               goto Cleanup;
+
        p_item =  cl_qlist_remove_head( &p_port->send_mgr.pending_list );
        while (p_item != cl_qlist_end(&p_port->send_mgr.pending_list))
        {
@@ -6120,7 +6770,11 @@
                                break;
                        }

-                       bool continue_sending = ipoib_process_sg_list_real(NULL, NULL, (PSCATTER_GATHER_LIST) s_buf->p_sgl, s_buf);
+                       continue_sending = ipoib_process_sg_list_real(
+                                                                                       NULL,
+                                                                                       NULL,
+                                                                                       (PSCATTER_GATHER_LIST) s_buf->p_sgl,
+                                                                                       s_buf );

                        //cl_qlist_check_validity(&p_port->send_mgr.pending_list);

@@ -6135,7 +6789,13 @@
        }

 Cleanup:
-       IPOIB_EXIT( IPOIB_DBG_SEND );
+
+       if (cur_irql != DISPATCH_LEVEL)
+       {
+               NDIS_LOWER_IRQL(cur_irql, DISPATCH_LEVEL);
+       }
+
+       XIPOIB_EXIT( IPOIB_DBG_SEND );
 }


@@ -6159,7 +6819,6 @@
        PERF_DECLARE( SendCb );
        PERF_DECLARE( PollSend );
        PERF_DECLARE( SendComp );
-       PERF_DECLARE( FreeSendBuf );
        PERF_DECLARE( RearmSend );
        PERF_DECLARE( PortResume );

@@ -6201,11 +6860,9 @@
                                                p_wc->wc_type == IB_WC_SEND || p_wc->wc_type == IB_WC_LSO);

                        s_buf = (ipoib_send_NB_SG*)(uintn_t)p_wc->wr_id;
-
-                       IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
-                               ("Send completion for NBL=0x%p, NB=0x%p \n",  s_buf->p_nbl, s_buf->p_curr_nb));
-
                        CL_ASSERT( s_buf );
+                       IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
+                               ("UD send completion NBL %p s_buf %p\n", s_buf->p_nbl, s_buf) );

                        p_endpt = s_buf->p_endpt;

@@ -6293,12 +6950,12 @@
 }


-
 /******************************************************************************
 *
 * Endpoint manager implementation
 *
 ******************************************************************************/
+
 static void
 __endpt_mgr_construct(
        IN                              ipoib_port_t* const                     p_port )
@@ -6308,137 +6965,33 @@
        cl_fmap_init( &p_port->endpt_mgr.gid_endpts, __gid_cmp );
 }

-//TODO Restore CM
-#if 0
-static void
-__endpt_cm_mgr_thread(
-IN             void*           p_context );
-#endif

 static ib_api_status_t
 __endpt_mgr_init(
-       IN                              ipoib_port_t* const                     p_port )
-{
-       IPOIB_ENTER( IPOIB_DBG_INIT );
-#if 0
-       if( p_port->p_adapter->params.cm_enabled )
-       {
-               cl_fmap_init( &p_port->endpt_mgr.conn_endpts, __gid_cmp );
-
-               NdisInitializeListHead( &p_port->endpt_mgr.pending_conns );
-               NdisAllocateSpinLock( &p_port->endpt_mgr.conn_lock );
-               cl_event_init( &p_port->endpt_mgr.event, FALSE );
-
-               NdisInitializeListHead( &p_port->endpt_mgr.remove_conns );
-               NdisAllocateSpinLock( &p_port->endpt_mgr.remove_lock );
-
-               cl_thread_init( &p_port->endpt_mgr.h_thread,
-                                               __endpt_cm_mgr_thread,
-                                               ( const void *)p_port,
-                                               "CmEndPtMgr" );
-       }
-#endif
-       UNUSED_PARAM(p_port);
-       IPOIB_EXIT( IPOIB_DBG_INIT );
-       return IB_SUCCESS;
-}
-
-//TODO CM Restore
-#if 0
-static void
-__endpt_cm_mgr_thread(
-IN             void*           p_context )
-{
-       ib_api_status_t ib_status;
-       LIST_ENTRY              *p_item;
-       ipoib_endpt_t   *p_endpt;
-       ipoib_port_t    *p_port =( ipoib_port_t *)p_context;
-
-       IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-               ("Starting Port [%d] Endpt CM thread \n", p_port->port_num ) );
-
-       while( !p_port->endpt_mgr.thread_is_done )
-       {
-               cl_event_wait_on( &p_port->endpt_mgr.event, EVENT_NO_TIMEOUT, FALSE );
-
-               while( ( p_item = NdisInterlockedRemoveHeadList(
-                                                               &p_port->endpt_mgr.pending_conns,
-                                                               &p_port->endpt_mgr.conn_lock) ) != NULL )
-               {
-
-                       p_endpt = PARENT_STRUCT( p_item, ipoib_endpt_t, list_item );
-                       if( p_port->endpt_mgr.thread_is_done )
-                       {
-                               endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );
-                               continue;
-                       }
-
-                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                               ("Endpt[%p] CONNECT REQ to MAC %02x:%02x:%02x:%02x:%02x:%02x\n",
-                               p_endpt,
-                               p_endpt->mac.addr[0], p_endpt->mac.addr[1],
-                               p_endpt->mac.addr[2], p_endpt->mac.addr[3],
-                               p_endpt->mac.addr[4], p_endpt->mac.addr[5] ) );
-
-                       if( !p_endpt->conn.h_send_qp )
-                       {
-                               ib_status = endpt_cm_create_qp( p_endpt,
-                                                                                               &p_endpt->conn.h_send_qp );
-                               if( ib_status != IB_SUCCESS )
-                               {
-                                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                                               ("Endpt %p CM create QP failed status %#x\n",
-                                               p_endpt, ib_status ) );
-                               }
-                               else
-                               {
-                                       ib_status = ipoib_endpt_connect( p_endpt );
-                                       if( ib_status != IB_SUCCESS && ib_status != IB_PENDING )
-                                       {
-                                               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                                                       ("Endpt %p conn REQ failed status %#x\n",
-                                                       p_endpt, ib_status ) );
-                                       }
-                               }
-                               if( ib_status != IB_SUCCESS && ib_status != IB_PENDING )
-                               {
-                                       endpt_cm_set_state( p_endpt, IPOIB_CM_DESTROY );
-                                       endpt_cm_flush_recv( p_port, p_endpt );
-                                       endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );
-                               }
-                       }
+       IN                              ipoib_port_t* const                     p_port )
+{
+       IPOIB_ENTER( IPOIB_DBG_INIT );

-               }//while( p_item != NULL )
+       if( p_port->p_adapter->params.cm_enabled )
+       {
+               cl_fmap_init( &p_port->endpt_mgr.conn_endpts, __gid_cmp );

-               while( ( p_item = NdisInterlockedRemoveHeadList(
-                                                               &p_port->endpt_mgr.remove_conns,
-                                                               &p_port->endpt_mgr.remove_lock ) ) != NULL )
-               {
-                       p_endpt = PARENT_STRUCT( p_item, ipoib_endpt_t, list_item );
-
-                       endpt_cm_set_state( p_endpt, IPOIB_CM_DESTROY );
-
-                       IPOIB_PRINT( TRACE_LEVEL_WARNING, IPOIB_DBG_INIT,
-                               ("\nDESTROYING Endpt[%p]  MAC %02x:%02x:%02x:%02x:%02x:%02x\n",
-                               p_endpt,
-                               p_endpt->mac.addr[0], p_endpt->mac.addr[1],
-                               p_endpt->mac.addr[2], p_endpt->mac.addr[3],
-                               p_endpt->mac.addr[4], p_endpt->mac.addr[5] ) );
-                       endpt_cm_flush_recv( p_port, p_endpt );
-                       endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );
-                       cl_obj_destroy( &p_endpt->obj );
-               }
-       }
+               NdisInitializeListHead( &p_port->endpt_mgr.pending_conns );
+               NdisAllocateSpinLock( &p_port->endpt_mgr.conn_lock );
+               cl_event_init( &p_port->endpt_mgr.event, FALSE );
+
+               NdisInitializeListHead( &p_port->endpt_mgr.remove_conns );
+               NdisAllocateSpinLock( &p_port->endpt_mgr.remove_lock );

-       p_port->endpt_mgr.thread_is_done++;
+               cl_thread_init( &p_port->endpt_mgr.h_thread,
+                                               ipoib_endpt_cm_mgr_thread,
+                                               ( const void *)p_port,
+                                               "CmEndPtMgr" );
+       }

-       NdisFreeSpinLock( &p_port->endpt_mgr.remove_lock );
-       NdisFreeSpinLock( &p_port->endpt_mgr.conn_lock );
-
-       IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-               (" Port [%d] Endpt_mgr thread is done\n", p_port->port_num ) );
+       IPOIB_EXIT( IPOIB_DBG_INIT );
+       return IB_SUCCESS;
 }
-#endif

 static void
 __endpt_mgr_destroy(
@@ -6448,17 +7001,36 @@
        CL_ASSERT( cl_is_qmap_empty( &p_port->endpt_mgr.mac_endpts ) );
        CL_ASSERT( cl_is_qmap_empty( &p_port->endpt_mgr.lid_endpts ) );
        CL_ASSERT( cl_is_fmap_empty( &p_port->endpt_mgr.gid_endpts ) );
-       UNUSED_PARAM(p_port);
-#if 0
+
        if( p_port->p_adapter->params.cm_enabled )
        {
-               CL_ASSERT( cl_is_fmap_empty( &p_port->endpt_mgr.conn_endpts ) );
+               // Make sure any lingering CM-connected EPs are removed.
+               if( !cl_is_fmap_empty( &p_port->endpt_mgr.conn_endpts ) )
+               {
+                       cl_fmap_item_t                  *p_fmap_item;
+
+                       p_fmap_item = cl_fmap_head( &p_port->endpt_mgr.conn_endpts );
+                       while( p_fmap_item != cl_fmap_end( &p_port->endpt_mgr.conn_endpts ) )
+                       {
+                               ipoib_endpt_t   *p_endpt;
+
+                               p_endpt = PARENT_STRUCT( p_fmap_item, ipoib_endpt_t, conn_item );
+                               DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
+                                       ("conn_endpts: Lingering EP: %s state %s destroy %s flush %s\n",
+                                               p_endpt->tag,
+                                               cm_get_state_str( endpt_cm_get_state( p_endpt ) ),
+                                               (p_endpt->cm_ep_destroy ? "True":"False"),
+                                               (p_endpt->cm_rx_flushing ? "True":"False")) );
+                               cl_fmap_remove_item( &p_port->endpt_mgr.conn_endpts,
+                                                                        &p_endpt->conn_item );
+                               p_fmap_item = cl_fmap_head( &p_port->endpt_mgr.conn_endpts );
+                       }
+               }
        }
-#endif
+
        IPOIB_EXIT( IPOIB_DBG_INIT );
 }

-
 static void
 __endpt_mgr_remove_all(
        IN                              ipoib_port_t* const                     p_port )
@@ -6466,6 +7038,7 @@
        IPOIB_ENTER( IPOIB_DBG_ENDPT );

        cl_obj_lock( &p_port->obj );
+
        /* Wait for all readers to complete. */
        while( p_port->endpt_rdr )
                ;
@@ -6485,10 +7058,9 @@

 static void
 __endpt_mgr_reset_all(
-       IN                              ipoib_port_t* const                     p_port )
+       IN                      ipoib_port_t* const                     p_port )
 {
        cl_map_item_t                   *p_item;
-       cl_fmap_item_t                  *p_fmap_item;
        ipoib_endpt_t                   *p_endpt;
        cl_qlist_t                              mc_list;
        cl_qlist_t                              conn_list;
@@ -6496,73 +7068,71 @@
        NDIS_LINK_STATE                 link_state;
        NDIS_STATUS_INDICATION  status_indication;

-
        IPOIB_ENTER( IPOIB_DBG_ENDPT );

        cl_qlist_init( &mc_list );
        cl_qlist_init( &conn_list );
+
        cl_obj_lock( &p_port->obj );
        /* Wait for all readers to complete. */
        while( p_port->endpt_rdr )
                ;

 #if 0
-                       __endpt_mgr_remove_all(p_port);
+       __endpt_mgr_remove_all(p_port);
 #else
-                       link_state.Header.Revision = NDIS_LINK_STATE_REVISION_1;
-                       link_state.Header.Type = NDIS_OBJECT_TYPE_DEFAULT;
-                       link_state.Header.Size = sizeof(NDIS_LINK_STATE);
-                       link_state.MediaConnectState = MediaConnectStateDisconnected;
-                       link_state.MediaDuplexState = MediaDuplexStateFull;
-                       link_state.XmitLinkSpeed =
-                       link_state.RcvLinkSpeed  =
-                                                       SET_PORT_RATE_BPS( p_port->p_adapter->port_rate );
-
-                       IPOIB_INIT_NDIS_STATUS_INDICATION(&status_indication,
-                                                               p_port->p_adapter->h_adapter,
-                                                               NDIS_STATUS_LINK_STATE,
-                                                               (PVOID)&link_state,
-                                                               sizeof(link_state));
-                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                                               ("Indicate DISCONNECT!\n") );
-                       NdisMIndicateStatusEx( p_port->p_adapter->h_adapter,
-                                                                  &status_indication );
+       link_state.Header.Revision = NDIS_LINK_STATE_REVISION_1;
+       link_state.Header.Type = NDIS_OBJECT_TYPE_DEFAULT;
+       link_state.Header.Size = sizeof(NDIS_LINK_STATE);
+       link_state.MediaConnectState = MediaConnectStateDisconnected;
+       link_state.MediaDuplexState = MediaDuplexStateFull;
+
+       link_state.XmitLinkSpeed =
+       link_state.RcvLinkSpeed  = SET_PORT_RATE_BPS( p_port->p_adapter->port_rate );
+
+       IPOIB_INIT_NDIS_STATUS_INDICATION( &status_indication,
+                                                                          p_port->p_adapter->h_adapter,
+                                                                          NDIS_STATUS_LINK_STATE,
+                                                                          (PVOID)&link_state,
+                                                                          sizeof(link_state) );
+
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
+                                       ("Indicate DISCONNECT!\n") );
+
+       NdisMIndicateStatusEx( p_port->p_adapter->h_adapter, &status_indication );

-                       link_state.MediaConnectState = MediaConnectStateConnected;
-                       IPOIB_INIT_NDIS_STATUS_INDICATION(&status_indication,
-                                                               p_port->p_adapter->h_adapter,
-                                                               NDIS_STATUS_LINK_STATE,
-                                                               (PVOID)&link_state,
-                                                               sizeof(link_state));
-                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                                               ("Indicate Connect\n") );
-                       NdisMIndicateStatusEx( p_port->p_adapter->h_adapter,
-                                                                       &status_indication );
+       link_state.MediaConnectState = MediaConnectStateConnected;
+       IPOIB_INIT_NDIS_STATUS_INDICATION( &status_indication,
+                                                                          p_port->p_adapter->h_adapter,
+                                                                          NDIS_STATUS_LINK_STATE,
+                                                                          (PVOID)&link_state,
+                                                                          sizeof(link_state) );
+
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT, ("Indicate Connect\n") );

+       NdisMIndicateStatusEx( p_port->p_adapter->h_adapter, &status_indication );

-                               //      IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                                       //      ("Link DOWN!\n") );
+       // IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT, ("Link DOWN!\n") );

        if( p_port->p_local_endpt )
        {
-               //TODO: CM RESTORE
-               //ipoib_port_cancel_listen( p_port, p_port->p_local_endpt );
+               if( p_port->p_adapter->params.cm_enabled )
+                       ipoib_port_cancel_listen( p_port->p_local_endpt );

                cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts,
-                       &p_port->p_local_endpt->gid_item );
+                                                        &p_port->p_local_endpt->gid_item );
                cl_qmap_remove_item( &p_port->endpt_mgr.mac_endpts,
-                       &p_port->p_local_endpt->mac_item );
+                                                        &p_port->p_local_endpt->mac_item );
                if( p_port->p_local_endpt->dlid )
                {
                        cl_qmap_remove_item( &p_port->endpt_mgr.lid_endpts,
-                               &p_port->p_local_endpt->lid_item );
+                                                                &p_port->p_local_endpt->lid_item );
                        p_port->p_local_endpt->dlid = 0;
                }

-               cl_qlist_insert_head(
-                       &mc_list, &p_port->p_local_endpt->mac_item.pool_item.list_item );
+               cl_qlist_insert_head( &mc_list,
+                                                         &p_port->p_local_endpt->mac_item.pool_item.list_item );
                local_exist = 1;
-
                p_port->p_local_endpt = NULL;
        }

@@ -6571,67 +7141,67 @@
        {
                p_endpt = PARENT_STRUCT( p_item, ipoib_endpt_t, mac_item );
                p_item = cl_qmap_next( p_item );
+
                if( p_endpt->h_mcast )
                {
                        /*
                         * We destroy MC endpoints since they will get recreated
                         * when the port comes back up and we rejoin the MC groups.
                         */
-                       cl_qmap_remove_item( &p_port->endpt_mgr.mac_endpts,
-                               &p_endpt->mac_item );
-                       cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts,
-                               &p_endpt->gid_item );
+                       cl_qmap_remove_item( &p_port->endpt_mgr.mac_endpts, &p_endpt->mac_item );
+                       cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts, &p_endpt->gid_item );

-                       cl_qlist_insert_tail(
-                               &mc_list, &p_endpt->mac_item.pool_item.list_item );
+                       cl_qlist_insert_tail( &mc_list, &p_endpt->mac_item.pool_item.list_item );
                }
-               /* destroy connected endpoints if any */
-               else if( p_port->p_adapter->params.cm_enabled &&
-                                endpt_cm_get_state( p_endpt ) != IPOIB_CM_DISCONNECTED )
+               /* Queue endpoints that hold CM resources (send/recv CQ) for CM teardown. */
+               if( p_port->p_adapter->params.cm_enabled &&
+                       (p_endpt->conn.h_send_cq || p_endpt->conn.h_recv_cq) )
                {
-                       p_fmap_item = cl_fmap_get( &p_port->endpt_mgr.conn_endpts, &p_endpt->dgid );
-                       if( p_fmap_item != cl_fmap_end( &p_port->endpt_mgr.conn_endpts ) )
-                       {
-                               cl_fmap_remove_item( &p_port->endpt_mgr.conn_endpts,
-                                       &p_endpt->conn_item );
-                       }
-                       cl_qmap_remove_item( &p_port->endpt_mgr.mac_endpts,
-                               &p_endpt->mac_item );
-                       cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts,
-                               &p_endpt->gid_item );
+                       endpt_unmap_conn_dgid( p_port, p_endpt );
+                       cl_qmap_remove_item( &p_port->endpt_mgr.mac_endpts, &p_endpt->mac_item );
+                       cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts, &p_endpt->gid_item );

-                       cl_qlist_insert_tail(
-                               &conn_list, &p_endpt->mac_item.pool_item.list_item );
+                       cl_qlist_insert_tail( &conn_list, &p_endpt->mac_item.pool_item.list_item );
                }
+
                if( p_endpt->h_av )
                {
                        /* Destroy the AV for all other endpoints. */
-                       p_port->p_adapter->p_ifc->destroy_av( p_endpt->h_av );
+                       p_endpt->p_ifc->destroy_av( p_endpt->h_av );
                        p_endpt->h_av = NULL;
                }

                if( p_endpt->dlid )
                {
-                       cl_qmap_remove_item( &p_port->endpt_mgr.lid_endpts,
-                               &p_endpt->lid_item );
-                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+                       cl_qmap_remove_item( &p_port->endpt_mgr.lid_endpts, &p_endpt->lid_item );
+                       IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_ENDPT,
                                ("<__endptr_mgr_reset_all: setting p_endpt->dlid to 0\n"));
                        p_endpt->dlid = 0;
                }
-
        }
 #endif
        cl_obj_unlock( &p_port->obj );

-       //TODO CM
-       /*while( cl_qlist_count( &conn_list ) )
+       if( p_port->p_adapter->params.cm_enabled )
        {
-               endpt_cm_destroy_conn( p_port,
-                       PARENT_STRUCT( cl_qlist_remove_head( &conn_list ),
-                       ipoib_endpt_t, mac_item.pool_item.list_item ) );
-       }*/
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+                       ("Port[%d] conn_list entries %d\n",
+                               p_port->port_num, (int)cl_qlist_count( &conn_list )) );
+
+               while( cl_qlist_count( &conn_list ) )
+               {
+                       BOOLEAN destroy_now;
+
+                       p_endpt = PARENT_STRUCT( cl_qlist_remove_head(&conn_list),
+                                                                        ipoib_endpt_t,
+                                                                        mac_item.pool_item.list_item );
+                       destroy_now = cm_destroy_conn( p_port, p_endpt );
+                       if( destroy_now )
+                               cl_obj_destroy( &p_endpt->obj );
+               }
+       }

-       if(cl_qlist_count( &mc_list ) - local_exist)
+       if( cl_qlist_count( &mc_list ) - local_exist )
        {
                p_port->mcast_cnt =  (uint32_t)cl_qlist_count( &mc_list ) - local_exist;
        }
@@ -6642,15 +7212,14 @@
        }

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-                               ("p_port->mcast_cnt = %d\n", p_port->mcast_cnt - local_exist));
+                               ("p_port->mcast_cnt %d\n", p_port->mcast_cnt - local_exist));

        /* Destroy all multicast endpoints now that we have released the lock. */
        while( cl_qlist_count( &mc_list ) )
        {
                cl_list_item_t  *p_item;
                p_item = cl_qlist_remove_head( &mc_list );
-               p_endpt = PARENT_STRUCT(p_item, ipoib_endpt_t,
-                                                               mac_item.pool_item.list_item);
+               p_endpt = PARENT_STRUCT(p_item, ipoib_endpt_t, mac_item.pool_item.list_item);
                cl_obj_destroy( &p_endpt->obj);
        }

@@ -6669,9 +7238,6 @@
        IN                              ipoib_port_t* const                     p_port,
        IN                              ipoib_endpt_t* const            p_endpt )
 {
-#if 0 //CM
-       cl_fmap_item_t* p_fmap_item;
-#endif
        IPOIB_ENTER( IPOIB_DBG_ENDPT );

        /* This function must be called from the receive path */
@@ -6689,29 +7255,26 @@
         * in the LID map if the GID has the same subnet prefix as us.
         */
        cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts, &p_endpt->gid_item );
-#if 0
-
-       if( p_port->p_adapter->params.cm_enabled )
-       {
-               p_fmap_item = cl_fmap_get( &p_port->endpt_mgr.conn_endpts, &p_endpt->dgid );
-
-               if( p_fmap_item != cl_fmap_end( &p_port->endpt_mgr.conn_endpts ) )
-               {
-                       cl_fmap_remove_item( &p_port->endpt_mgr.conn_endpts,
-                                                                &p_endpt->conn_item );
-               }
-       }
+#if IPOIB_CM
+       endpt_unmap_conn_dgid( p_port, p_endpt );
 #endif
        if( p_endpt->dlid )
        {
                cl_qmap_remove_item( &p_port->endpt_mgr.lid_endpts, &p_endpt->lid_item );
+               p_endpt->dlid = 0;
        }

        cl_obj_unlock( &p_port->obj );
-       cl_obj_destroy( &p_endpt->obj );

-       //TODO CM
-       //endpt_cm_destroy_conn( p_port, p_endpt );
+#if IPOIB_CM
+       if( p_port->p_adapter->params.cm_enabled )
+               cm_destroy_conn( p_port, p_endpt );
+
+       if( !p_endpt->cm_ep_destroy )
+               cl_obj_destroy( &p_endpt->obj );
+#else
+       cl_obj_destroy( &p_endpt->obj );
+#endif

        IPOIB_EXIT( IPOIB_DBG_ENDPT );
 }
@@ -6792,7 +7355,8 @@
                {
                        cl_obj_unlock( &p_port->obj );
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                               ("Failed endpoint lookup.\n") );
+                               ("Port[%d] Failed endpoint lookup MAC %s\n",
+                                       p_port->port_num, mk_mac_str(&mac)) );
                        return STATUS_INVALID_PARAMETER;
                }

@@ -6864,23 +7428,20 @@
                IPOIB_EXIT( IPOIB_DBG_ENDPT );
                return NDIS_STATUS_NO_ROUTE_TO_DESTINATION;
        }
-
        key = 0;
        memcpy( &key, &mac, sizeof(mac_addr_t) );

-       cl_obj_lock( &p_port->obj );
-
-       IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_ENDPT,
-               ("Look for :\t  MAC: %02X-%02X-%02X-%02X-%02X-%02X\n",
-               mac.addr[0], mac.addr[1], mac.addr[2],
-               mac.addr[3], mac.addr[4], mac.addr[5]) );
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+               ("Check MAC %s\n",mk_mac_str(&mac)) );

+       cl_obj_lock( &p_port->obj );
        p_item = cl_qmap_get( &p_port->endpt_mgr.mac_endpts, key );
        if( p_item == cl_qmap_end( &p_port->endpt_mgr.mac_endpts ) )
        {
                cl_obj_unlock( &p_port->obj );
-               IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-                       ("Failed endpoint lookup.\n") );
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_ENDPT,
+                       ("Port[%d] Failed endpoint lookup MAC %s\n",
+                               p_port->port_num, mk_mac_str(&mac)) );
                return NDIS_STATUS_NO_ROUTE_TO_DESTINATION;
        }

@@ -6992,9 +7553,7 @@
        IPOIB_ENTER( IPOIB_DBG_ENDPT );

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-               ("insert  :\t  MAC: %02X-%02X-%02X-%02X-%02X-%02X\n",
-               mac.addr[0], mac.addr[1], mac.addr[2],
-               mac.addr[3], mac.addr[4], mac.addr[5]) );
+               ("insert  :\t  MAC: %s\n", mk_mac_str(&mac)) );

        cl_obj_lock( &p_port->obj );
        while( p_port->endpt_rdr )
@@ -7031,21 +7590,15 @@
                ;

        /* Link the endpoint to the port. */
-       cl_status = cl_obj_insert_rel_parent_locked(
-               &p_endpt->rel, &p_port->obj, &p_endpt->obj );
-
+       cl_status = cl_obj_insert_rel_parent_locked( &p_endpt->rel,
+                                                                                                &p_port->obj,
+                                                                                                &p_endpt->obj );
        if( cl_status != CL_SUCCESS )
        {
                cl_obj_destroy( &p_endpt->obj );
                return IB_INVALID_STATE;
        }

-#if DBG
-       cl_atomic_inc( &p_port->ref[ref_endpt_track] );
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
-               ("ref  type %d ref_cnt %d\n", ref_endpt_track, p_port->obj.ref_cnt) );
-#endif
-
        p_endpt->mac = mac;
        key = 0;
        memcpy( &key, &mac, sizeof(mac_addr_t) );
@@ -7104,7 +7657,11 @@
                        ("ipoib_endpt_create failed.\n") );
                return IB_INSUFFICIENT_RESOURCES;
        }
+#if DBG
        /* set reference to transport to be used while is not attached to the port */
+       ipoib_port_ref( p_port, ref_endpt_track );
+#endif
+       StringCbCopy(p_endpt->tag,sizeof(p_endpt->tag),"<BCast>");
        p_endpt->is_mcast_listener = TRUE;
        status = ipoib_endpt_set_mcast( p_endpt,
                                                                        p_port->ib_mgr.h_pd,
@@ -7134,10 +7691,9 @@
        IN              const   mac_addr_t                                      mac )
 {
        cl_map_item_t   *p_item;
-       //TODO CM
-//     cl_fmap_item_t  *p_fmap_item;
        ipoib_endpt_t   *p_endpt;
        uint64_t                key;
+       BOOLEAN                 destroy_now = TRUE;

        IPOIB_ENTER( IPOIB_DBG_ENDPT );

@@ -7162,39 +7718,24 @@
                 * The enpoints are *ALWAYS* in both the MAC and GID maps. They are only
                 * in the LID map if the GID has the same subnet prefix as us.
                 */
-               cl_fmap_remove_item(
-                       &p_port->endpt_mgr.gid_endpts, &p_endpt->gid_item );
-#if 0
-               if( p_port->p_adapter->params.cm_enabled )
-               {
-                       p_fmap_item = cl_fmap_get( &p_port->endpt_mgr.conn_endpts,
-                                                                               &p_endpt->dgid );
-
-                       if( p_fmap_item != cl_fmap_end( &p_port->endpt_mgr.conn_endpts ) )
-                       {
-                               cl_fmap_remove_item( &p_port->endpt_mgr.conn_endpts,
-                                       &p_endpt->conn_item );
-                       }
-               }
+               cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts, &p_endpt->gid_item );
+#if IPOIB_CM
+               endpt_unmap_conn_dgid( p_port, p_endpt );
 #endif

                if( p_endpt->dlid )
                {
-                       cl_qmap_remove_item(
-                               &p_port->endpt_mgr.lid_endpts, &p_endpt->lid_item );
+                       cl_qmap_remove_item( &p_port->endpt_mgr.lid_endpts, &p_endpt->lid_item );
                        p_endpt->dlid = 0;
                }

                cl_obj_unlock( &p_port->obj );
-               cl_obj_destroy( &p_endpt->obj );
-               //TODO CM
-               //endpt_cm_destroy_conn( p_port, p_endpt );

-#if DBG
-               cl_atomic_dec( &p_port->ref[ref_endpt_track] );
-               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-                       ("ref type %d ref_cnt %d\n", ref_endpt_track, p_port->obj.ref_cnt) );
-#endif
+               if( p_port->p_adapter->params.cm_enabled )
+                       destroy_now = cm_destroy_conn( p_port, p_endpt );
+
+               if( destroy_now )
+                       cl_obj_destroy( &p_endpt->obj );
        }
        else
                cl_obj_unlock( &p_port->obj );
@@ -7352,7 +7893,10 @@
        IPOIB_ENTER( IPOIB_DBG_INIT );

        ib_gid_set_default( &gid, p_port->p_adapter->guids.port_guid.guid );
-       p_endpt = ipoib_endpt_create( p_port, &gid, p_port_info->base_lid, p_port->ib_mgr.qpn );
+       p_endpt = ipoib_endpt_create( p_port,
+                                                                 &gid,
+                                                                 p_port_info->base_lid,
+                                                                 p_port->ib_mgr.qpn );
        if( !p_endpt )
        {
                p_port->p_adapter->hung = TRUE;
@@ -7360,6 +7904,13 @@
                        ("Failed to create local endpt\n") );
                return IB_INSUFFICIENT_MEMORY;
        }
+#if DBG
+       ipoib_port_ref( p_port, ref_endpt_track );
+#endif
+       StringCchPrintf( p_endpt->tag,
+                                        sizeof(p_endpt->tag),
+                                        "Local_EP.lid-%#x",
+                                        cl_ntoh16(p_port_info->base_lid) );

        memset( &av_attr, 0, sizeof(ib_av_attr_t) );
        av_attr.port_num = p_port->port_num;
@@ -7370,8 +7921,9 @@
        av_attr.dlid = p_port_info->base_lid;
        av_attr.static_rate = p_port->ib_mgr.rate;
        av_attr.path_bits = 0;
-       status = p_port->p_adapter->p_ifc->create_av(
-               p_port->ib_mgr.h_pd, &av_attr, &p_endpt->h_av );
+       status = p_port->p_adapter->p_ifc->create_av( p_port->ib_mgr.h_pd,
+                                                                                                 &av_attr,
+                                                                                                 &p_endpt->h_av );
        if( status != IB_SUCCESS )
        {
                cl_obj_destroy( &p_endpt->obj );
@@ -7443,12 +7995,13 @@
                                                                                          &p_port->ib_mgr.h_query );
        if( status != IB_SUCCESS )
        {
-               ipoib_port_deref( p_port, ref_get_bcast );
-//             ASSERT(FALSE);
+               // ref_get_bcast is released below, on both the success and failure paths.
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("ib_query returned %s\n",
-                       p_port->p_adapter->p_ifc->get_err_str( status )) );
+                               p_port->p_adapter->p_ifc->get_err_str( status )) );
        }
+       ipoib_port_deref( p_port, ref_get_bcast );
+
        IPOIB_EXIT( IPOIB_DBG_INIT );
        return status;
 }
@@ -7529,9 +8082,6 @@
        if( p_query_rec->p_result_mad )
                p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );

-       /* Release the reference taken when issuing the member record query. */
-       ipoib_port_deref( p_port, ref_bcast_get_cb );
-
        IPOIB_EXIT( IPOIB_DBG_INIT );
 }

@@ -7570,17 +8120,25 @@
        mcast_req.member_rec = *p_member_rec;

        /* We specify our port GID for the join operation. */
-       status =__port_query_ca_attrs( p_port, &p_ca_attrs);
-       if ( status == IB_SUCCESS )
+       if( p_port->p_ca_attrs )
        {
                mcast_req.member_rec.port_gid.unicast.prefix =
-                       p_ca_attrs->p_port_attr->p_gid_table[0].unicast.prefix;
-               cl_free ( p_ca_attrs );
+                                       p_port->p_ca_attrs->p_port_attr->p_gid_table[0].unicast.prefix;
        }
-       else
-       {
-               ASSERT ( status != IB_SUCCESS );
-               mcast_req.member_rec.port_gid.unicast.prefix = IB_DEFAULT_SUBNET_PREFIX;
+       else
+       {
+               status =__port_query_ca_attrs( p_port, &p_ca_attrs);
+               if ( status == IB_SUCCESS )
+               {
+                       mcast_req.member_rec.port_gid.unicast.prefix =
+                                                       p_ca_attrs->p_port_attr->p_gid_table[0].unicast.prefix;
+                       cl_free ( p_ca_attrs );
+               }
+               else
+               {
+                       ASSERT ( status != IB_SUCCESS );
+                       mcast_req.member_rec.port_gid.unicast.prefix = IB_DEFAULT_SUBNET_PREFIX;
+               }
        }

        mcast_req.member_rec.port_gid.unicast.interface_id =
@@ -7605,8 +8163,7 @@
        /* reference the object for the multicast join request. */
        ipoib_port_ref( p_port, ref_join_bcast );

-       status = p_port->p_adapter->p_ifc->join_mcast(
-               p_port->ib_mgr.h_qp, &mcast_req );
+       status = p_port->p_adapter->p_ifc->join_mcast( p_port->ib_mgr.h_qp, &mcast_req );
        if( status != IB_SUCCESS )
        {
                ipoib_port_deref( p_port, ref_bcast_join_failed );
@@ -7698,24 +8255,35 @@
        cl_spinlock_acquire( &p_port->recv_lock );
        cl_spinlock_acquire( &p_port->send_lock );
        cl_obj_lock( &p_port->obj );
+
+       if( p_port->state == IB_QPS_ERROR )
+       {
+               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("Port[%d] already down(state == IB_QPS_ERROR)? PNP state %s\n",
+                               p_port->port_num, ib_get_pnp_event_str(p_port->p_adapter->state)) );
+       }
+
        p_port->state = IB_QPS_ERROR;

        __pending_list_destroy(p_port);
-       NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
-               EVENT_IPOIB_PORT_DOWN, 0 );
+
+       NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter, EVENT_IPOIB_PORT_DOWN, 0 );

        if( p_port->ib_mgr.h_query )
        {
-               p_port->p_adapter->p_ifc->cancel_query(
-                       p_port->p_adapter->h_al, p_port->ib_mgr.h_query );
+               p_port->p_adapter->p_ifc->cancel_query( p_port->p_adapter->h_al,
+                                                                                               p_port->ib_mgr.h_query );
                p_port->ib_mgr.h_query = NULL;
        }
        cl_obj_unlock( &p_port->obj );
        cl_spinlock_release( &p_port->send_lock );
        cl_spinlock_release( &p_port->recv_lock );

-       KeWaitForSingleObject(
-               &p_port->sa_event, Executive, KernelMode, FALSE, NULL );
+       KeWaitForSingleObject( &p_port->sa_event,
+                                                  Executive,
+                                                  KernelMode,
+                                                  FALSE,
+                                                  NULL );

        /* garbage collector timer is not needed when link is down */
        KeCancelTimer(&p_port->gc_timer);
@@ -7745,8 +8313,11 @@
        /* Reset all endpoints so we don't flush our ARP cache. */
        __endpt_mgr_reset_all( p_port );

-       KeWaitForSingleObject(
-               &p_port->leave_mcast_event, Executive, KernelMode, FALSE, NULL );
+       KeWaitForSingleObject( &p_port->leave_mcast_event,
+                                                  Executive,
+                                                  KernelMode,
+                                                  FALSE,
+                                                  NULL );

        cl_obj_lock( &p_port->p_adapter->obj );
        ipoib_dereg_addrs( p_port->p_adapter );
@@ -7920,7 +8491,8 @@
                ipoib_set_inactive( p_port->p_adapter );
                KeSetEvent( &p_port->sa_event, EVENT_INCREMENT, FALSE );
                IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
-                       ("ipoib_set_active returned %s.\n",p_port->p_adapter->p_ifc->get_err_str( status )));
+                       ("ipoib_set_active returned %s.\n",
+                                       p_port->p_adapter->p_ifc->get_err_str( status )));
                cl_spinlock_acquire( &p_port->recv_lock );
                cl_obj_lock( &p_port->obj );
                p_port->state = IB_QPS_ERROR;
@@ -7944,7 +8516,7 @@
                ipoib_port_deref( p_port, ref_join_bcast );
                return;
        }
-#if IPOIB_CM //CM
+#if IPOIB_CM
        if( p_port->p_adapter->params.cm_enabled &&
                !p_port->p_local_endpt->conn.h_cm_listen )
        {
@@ -8068,12 +8640,7 @@
        IPOIB_ENTER( IPOIB_DBG_MCAST );

        IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
-               ("Join Multicast request: \n"
-               "\tsrc MAC: %02X-%02X-%02X-%02X-%02X-%02X\n",
-               mac.addr[0], mac.addr[1],
-               mac.addr[2], mac.addr[3],
-               mac.addr[4], mac.addr[5]));
-
+               ("Join Multicast request: src MAC: %s\n", mk_mac_str(&mac)) );

        switch( __endpt_mgr_ref( p_port, mac, &p_endpt ) )
        {
@@ -8090,10 +8657,10 @@
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
                        ("<ipoib_port_join_mcast> PENDING\n") );
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-                       ("__endpt_mgr_ref called for %p\n", p_endpt));
+                       ("__endpt_mgr_ref on EP %s\n", p_endpt->tag));
                return IB_SUCCESS;
        }
-       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+       XIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
                ("__endpt_mgr_ref called for %p\n", p_endpt));
        /*
         * Issue the mcast request, using the parameters of the broadcast group.
@@ -8157,6 +8724,9 @@
                        ("ipoib_endpt_create failed.\n") );
                return IB_INSUFFICIENT_MEMORY;
        }
+#if DBG
+       ipoib_port_ref( p_port, ref_endpt_track );
+#endif

        status = __endpt_mgr_insert_locked( p_port, mac, p_endpt );
        if( status != IB_SUCCESS )
@@ -8309,7 +8879,6 @@
        IPOIB_EXIT( IPOIB_DBG_MCAST );
 }

-
 void
 ipoib_leave_mcast_cb(
        IN                              void                            *context )
@@ -8321,7 +8890,7 @@
        p_port = (ipoib_port_t*)context;

        IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_MCAST,
-               ("p_port->mcast_cnt %d\n", p_port->mcast_cnt));
+               ("port[%d] mcast_cnt %d\n", p_port->port_num, p_port->mcast_cnt));

        ipoib_port_deref( p_port, ref_leave_mcast);
        //It happens
@@ -8333,14 +8902,13 @@
                KeSetEvent( &p_port->leave_mcast_event, EVENT_INCREMENT, FALSE );
        }

-       IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
-                       ("Leave mcast callback deref ipoib_port \n") );
+       IPOIB_PRINT_EXIT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_MCAST,
+               ("Leave mcast callback deref ipoib_port[%d] (ref_leave_mcast)\n",
+                       p_port->port_num) );

        IPOIB_EXIT( IPOIB_DBG_MCAST );
 }

-
-
 void
 __leave_error_mcast_cb(
        IN                              void                            *context )
@@ -8413,7 +8981,8 @@
        NdisQueryMdl(pMDL, &pSrc, &CurrLength, NormalPagePriority);

        if (pSrc == NULL) {
-               IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR, ("NdisQueryMdl failed\n"));
+               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("Error processing packets\n") );
                return status;
        }

@@ -8437,12 +9006,19 @@
                FullBuffers++;
                // First buffer was only ethernet
                pNetBuffer = NET_BUFFER_NEXT_NB(pNetBuffer);
-        NdisQueryMdl(NET_BUFFER_CURRENT_MDL(pNetBuffer), &pSrc, &CurrLength, NormalPagePriority);
-               if (pSrc == NULL) {
-                       IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR, ("NdisQueryMdl failed\n"));
+        NdisQueryMdl( NET_BUFFER_CURRENT_MDL(pNetBuffer),
+                                         &pSrc,
+                                         &CurrLength,
+                                         NormalPagePriority );
+               if( pSrc == NULL )
+               {
+                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("NdisQueryMdl failed\n") );
                        return status;
            }
-       } else {
+       }
+       else
+       {
                // This is ETH + IP together (at least)
                pLsoData->LsoBuffers[0].pData = pSrc + (ETH_OFFSET - sizeof (ipoib_hdr_t));

@@ -8456,14 +9032,17 @@
        }
        // we should now be having at least the size of ethernet data
        if (CurrLength < sizeof (ip_hdr_t)) {
-               IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR, ("CurrLength < sizeof (ip_hdr_t)\n"));
+               IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("CurrLength < sizeof (ip_hdr_t)\n"));
                return status;
        }
        IpHdr = (ip_hdr_t UNALIGNED*)pSrc;
        IpHeaderLen = (uint16_t)IP_HEADER_LENGTH(IpHdr);
        ASSERT(IpHdr->prot == IP_PROT_TCP);
-       if (CurrLength < IpHeaderLen) {
-               IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR, ("CurrLength < IpHeaderLe\n"));
+       if( CurrLength < IpHeaderLen )
+       {
+               IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                       ("Error processing packets\n") );
                return status;
        }
        pLsoData->LsoHeaderSize = pLsoData->LsoHeaderSize + IpHeaderLen;
@@ -8485,14 +9064,19 @@
                FullBuffers++;
                IsRegularFlow = FALSE;
                pNetBuffer = NET_BUFFER_NEXT_NB(pNetBuffer);
-               NdisQueryMdl(NET_BUFFER_CURRENT_MDL(pNetBuffer), &pSrc, &CurrLength, NormalPagePriority);
+               NdisQueryMdl( NET_BUFFER_CURRENT_MDL(pNetBuffer),
+                                         &pSrc,
+                                         &CurrLength,
+                                         NormalPagePriority );
                if (pSrc == NULL) {
-                       IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_ERROR, ("NdisQueryMdl failed\n"));
+                       IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_ERROR,
+                               ("NdisQueryMdl failed\n") );
                        return status;
                }
        } else {
-               // if(IsRegularFlow = TRUE ) ==> the ETH and IP and TCP in the same buffer
-               // if(IsRegularFlow = FLASE ) ==> ETH in one buffer , IP+TCP together in the same buffer
+               // IsRegularFlow == TRUE  ==> ETH, IP and TCP headers are all in the same buffer.
+               // IsRegularFlow == FALSE ==> ETH header is in one buffer; IP and TCP headers
+               // are together in the same buffer.
                if (IsRegularFlow) {
                        pLsoData->LsoBuffers[0].Len += IpHeaderLen;
                } else {
@@ -8504,7 +9088,8 @@
                pSrc = pSrc + IpHeaderLen;
        }
        if (CurrLength < sizeof (tcp_hdr_t)) {
-               IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_ERROR, ("Error processing packets\n"));
+               IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_ERROR,
+                       ("Error porcessing packets\n") );
                return status;
        }
        // We have finaly found the TCP header
@@ -8601,18 +9186,16 @@
                p_endpt = PARENT_STRUCT( cl_qlist_remove_head( &destroy_mc_list ),
                                                                 ipoib_endpt_t, mac_item.pool_item.list_item );
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
-                       ("mcast garbage collector: destroying endpoint %02x:%02x:%02x:%02x:%02x:%02x \n",
-                                p_endpt->mac.addr[0],
-                                p_endpt->mac.addr[1],
-                                p_endpt->mac.addr[2],
-                                p_endpt->mac.addr[3],
-                                p_endpt->mac.addr[4],
-                                p_endpt->mac.addr[5]) );
+                       ("mcast garbage collector: destroying EP %p %s %s\n",
+                               p_endpt, p_endpt->tag, mk_mac_str(&p_endpt->mac)) );
+
                cl_obj_destroy( &p_endpt->obj );
        }
 }

-static void __port_mcast_garbage_dpc(KDPC *p_gc_dpc,void *context,void *s_arg1, void *s_arg2)
+
+static void
+__port_mcast_garbage_dpc(KDPC *p_gc_dpc,void *context,void *s_arg1, void *s_arg2)
 {
        ipoib_port_t *p_port = (ipoib_port_t *) context;

@@ -8705,56 +9288,71 @@
        IPOIB_EXIT( IPOIB_DBG_SEND );
 }

+
 /*
-*  Put all fragments into separate WR and chain together.
-*  The last WR will be set to generate CQ Event.
-*  lookaside buffer is used for ipoib and ip headers attached to each WR.
-*  Buffer will be released on last WR send completion.
-*/
-#if 0
+ *  Put all IP fragments into separate WRs and chain together.
+ *  The last WR will be set to generate CQ Event.
+ *  lookaside buffer is used for ip headers attached to each WR.
+ *  Lookaside buffer will be released on last WR send completion.
+ *
+ * IPoIB header is pre-built (by build_send_desc) in
+ * p_desc->send_wr[0].local_ds[0].length/vaddr
+ */
+
 static NDIS_STATUS
-__send_fragments(
-IN             ipoib_port_t* const                     p_port,
-IN             ipoib_send_desc_t* const        p_desc,
-IN             eth_hdr_t* const                        p_eth_hdr,
-IN             ip_hdr_t* const                         p_ip_hdr,
-IN             uint32_t                                        buf_len,
-IN             NDIS_BUFFER*                            p_ndis_buf )
+__build_ipv4_fragments(
+       IN              ipoib_send_NB_SG*                       s_buf,
+       IN              ip_hdr_t* const                         p_ip_hdr,
+       IN              uint32_t                                        buf_len,
+       IN              uint32_t                                        total_ip_len,
+       IN              MDL*                                            p_mdl )
 {
-       uint32_t        ds_idx = 1;
+       uint32_t        ds_idx = 1;             // ds[0] is ipoib header in send_desc[0]
        uint32_t        wr_idx = 0;
-       uint32_t        sgl_idx = 2; //skip eth hdr, ip hdr
+       uint32_t        sgl_idx = 0;
+       uint32_t        sgl_offset;
        uint32_t        options_len = 0;
        uint8_t*        p_options = NULL;
        uint8_t*        p_buf;
        uint32_t        frag_offset = 0;
-       uint32_t        next_sge;
+       uint32_t        last_frag = 0;
+       uint32_t        cur_sge;
        uint32_t        wr_size = 0;
        uint32_t        ip_hdr_len = IP_HEADER_LENGTH( p_ip_hdr );
-       uint32_t        total_ip_len = cl_ntoh16( p_ip_hdr->length );
+       uint32_t        tx_mtu = s_buf->p_endpt->tx_mtu;
+       uint32_t        seg_len;
+       uint64_t        next_sgl_addr;
+       ULONG           DataOffset;
+       uint32_t        need, mtu_avail;
+       int                     mtu_data;
+       uint32_t        frag_cnt=0;
+
+       ipoib_port_t* const                     p_port = s_buf->p_port;
+       ipoib_send_desc_t* const        p_desc = s_buf->p_send_desc;
+       SCATTER_GATHER_LIST*            p_sgl = s_buf->p_sgl;

-       SCATTER_GATHER_LIST             *p_sgl;
-
-       IPOIB_ENTER( IPOIB_DBG_SEND );
+       IPOIB_ENTER( IPOIB_DBG_FRAG );

        if( IP_DONT_FRAGMENT(p_ip_hdr) )
-                       return NDIS_STATUS_INVALID_PACKET;
-
-       p_sgl = NDIS_PER_PACKET_INFO_FROM_PACKET( p_desc->p_pkt, ScatterGatherListPacketInfo );
-       if( !p_sgl )
        {
-               ASSERT( p_sgl );
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("Failed to get SGL from packet.\n") );
-               return NDIS_STATUS_FAILURE;
+                       ("Err: IP hdr: Don't Fragment SET? IP len %u\n",total_ip_len) );
+               return NDIS_STATUS_INVALID_PACKET;
        }
+
+       ASSERT( p_sgl );
+
        if( ( p_sgl->NumberOfElements > MAX_SEND_SGE ||
                p_sgl->Elements[0].Length < sizeof(eth_hdr_t)) )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("Too many SG Elements in packet.\n") );
+                       ("Too many SG Elements(%d) in packet, sgl[0].Len %d\n",
+                               p_sgl->NumberOfElements,
+                               p_sgl->Elements[0].Length) );
                return NDIS_STATUS_FAILURE;
        }
+
+       CL_ASSERT( s_buf->p_send_buf == NULL );
        p_buf = (uint8_t *)
                ExAllocateFromNPagedLookasideList( &p_port->buf_mgr.send_buf_list );
        if( !p_buf )
@@ -8765,129 +9363,203 @@
        }
        s_buf->p_send_buf = (send_buf_t*)p_buf;

+       DataOffset= (ULONG)(NET_BUFFER_CURRENT_MDL_OFFSET(s_buf->p_curr_nb));
+
        if( buf_len < ip_hdr_len )
        {       /* ip options in a separate buffer */
                CL_ASSERT( buf_len == sizeof( ip_hdr_t ) );
-               NdisGetNextBuffer( p_ndis_buf, &p_ndis_buf );
-               if( !p_ndis_buf )
+               NdisGetNextBuffer( p_mdl, &p_mdl );
+               if( !p_mdl )
                {
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Failed to get IP options buffer.\n") );
                        return NDIS_STATUS_FAILURE;
                }
-               NdisQueryBufferSafe( p_ndis_buf, &p_options, &options_len, NormalPagePriority );
+
+               NdisQueryMdl(p_mdl, &p_options, &options_len, NormalPagePriority);
+
                if( !p_options )
                {
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                               ("Failed to query IP options buffer address.\n") );
+                               ("Failed to QueryMdl IP options buffer address.\n") );
                        return NDIS_STATUS_FAILURE;
                }
-               memcpy( p_buf, p_ip_hdr, sizeof( ip_hdr_t ) );
+
+        memcpy( p_buf, p_ip_hdr, sizeof( ip_hdr_t ) );
                if( p_options && options_len )
                {
-                       __copy_ip_options( &p_buf[sizeof(ip_hdr_t)],
-                                                               p_options, options_len, TRUE );
+                       __copy_ip_options( &p_buf[sizeof(ip_hdr_t)],
+                                                          p_options,
+                                                          options_len,
+                                                          TRUE );
                }
                wr_size = buf_len + options_len;
-               sgl_idx++;
+               // sgl_idx++; ??
        }
        else
-       {       /*options probably in the same buffer */
-               memcpy( p_buf, p_ip_hdr, buf_len );
+       {       /* options, if any, are in the same buffer */
+        memcpy( p_buf, p_ip_hdr, buf_len );
                options_len = ip_hdr_len - sizeof( ip_hdr_t );
                if( options_len )
                {
                        p_options = p_buf + sizeof( ip_hdr_t );
                }
-               frag_offset += ( buf_len - ip_hdr_len );
+               //frag_offset += ( buf_len - ip_hdr_len );
                wr_size = buf_len;
        }
-
-       p_desc->send_wr[wr_idx].local_ds[ds_idx].vaddr = cl_get_physaddr( p_buf );
-       p_desc->send_wr[wr_idx].local_ds[ds_idx].lkey = p_port->ib_mgr.lkey;
-       p_desc->send_wr[wr_idx].local_ds[ds_idx].length = wr_size;

-       /* count how much data can be put into the first WR beside IP header.
-        * other protocols headers possibly supplied in subsequent buffers.
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+               ("IP.length %u Opt[%s%s] buf_len %u opt_len %u frag_off %u DataOff %u\n",
+                       total_ip_len,
+                       (options_len ? "Yes":"No"),
+                       (options_len ?
+                               (p_buf[sizeof(ip_hdr_t)] & 0x80 ? " copy: Yes":" copy: No") : ""),
+                       buf_len, options_len, frag_offset, DataOffset) );
+
+       /* local_ds[0] preset to ipoib_hdr_t in port->hdr[x] */
+       CL_ASSERT(p_desc->send_wr[0].local_ds[0].length == sizeof( ipoib_hdr_t ) );
+
+       CL_ASSERT( ds_idx == 1 );
+       p_desc->send_wr[wr_idx].local_ds[1].vaddr = cl_get_physaddr( p_buf );
+       p_desc->send_wr[wr_idx].local_ds[1].lkey = p_port->ib_mgr.lkey;
+       p_desc->send_wr[wr_idx].local_ds[1].length = wr_size;
+
+       /* Ethernet header starts @ sgl[0] + DataOffset.
+        * skip Eth hdr + IP hdr + IP options to IP packet data beyond buf_len.
         */
-       for( sgl_idx; sgl_idx < p_sgl->NumberOfElements; sgl_idx++ )
-       {
-               next_sge = p_sgl->Elements[sgl_idx].Length;
+       sgl_offset = DataOffset + sizeof(eth_hdr_t) + wr_size;
+       next_sgl_addr = p_sgl->Elements[sgl_idx].Address.QuadPart + sgl_offset;
+       cur_sge = p_sgl->Elements[sgl_idx].Length - sgl_offset;

-               /* add sgl if it can fit into the same WR
-               * Note: so far not going to split large SGE between WRs,
-               * so first fragment could be a smaller size.
-               */
-               if( next_sge <= ( p_port->p_adapter->params.payload_mtu - wr_size ) )
-               {
-                       ++ds_idx;
-                       wr_size += next_sge;
-                       frag_offset += next_sge;
-                       p_desc->send_wr[wr_idx].local_ds[ds_idx].vaddr =
-                                                                       p_sgl->Elements[sgl_idx].Address.QuadPart;
-                       p_desc->send_wr[wr_idx].local_ds[ds_idx].length = next_sge;
-                       p_desc->send_wr[wr_idx].local_ds[ds_idx].lkey = p_port->ib_mgr.lkey;
-               }
-               else
-               {
-                       /* fix ip hdr for the first fragment and move on */
-                       __update_fragment_ip_hdr( (ip_hdr_t* const)p_buf,
-                               (uint16_t)wr_size, IP_FRAGMENT_OFFSET(p_ip_hdr), TRUE );
-
-                       p_desc->send_wr[wr_idx].wr.num_ds = ds_idx + 1;
-                       p_buf += ip_hdr_len;
-                       p_buf += (( buf_len > ip_hdr_len ) ? ( buf_len - ip_hdr_len ): 0);
-                       frag_offset += ( (IP_FRAGMENT_OFFSET(p_ip_hdr)) << 3 );
-                       ++wr_idx;
-                       ds_idx = 0;
-                       break;
-               }
+       if( cur_sge == 0 )
+       {
+               cur_sge = p_sgl->Elements[++sgl_idx].Length;
+               next_sgl_addr = p_sgl->Elements[sgl_idx].Address.QuadPart;
+               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                       ("cur_sge == 0 Next sge[%u] cur_sge %u\n", sgl_idx, cur_sge) );
        }
+
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+               ("sgl[%u].Len %u cur_sge %u  wr_size %u mtu_left %u\n\n",
+                       sgl_idx, p_sgl->Elements[sgl_idx].Length, cur_sge, wr_size,
+                       (tx_mtu - wr_size)) );
+
        total_ip_len -= wr_size;
-       wr_size = 0;
+       ds_idx++;
+       ASSERT( ds_idx == 2 );
+       frag_cnt = 0;
+
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+               ("main:\n  wr[%u] ds_idx %u cur_sge %u tot_ip_len %u wr_size %u "
+                "mtu_avail %u frag_offset %u\n\n",
+                       wr_idx, ds_idx, cur_sge, total_ip_len, wr_size, (tx_mtu - wr_size),
+                       frag_offset) );

-       for( sgl_idx, wr_idx; sgl_idx < p_sgl->NumberOfElements; sgl_idx++ )
+       for( ; sgl_idx < p_sgl->NumberOfElements; sgl_idx++ )
        {
-               uint32_t        seg_len;
-               uint64_t        next_sgl_addr;
-
                if( wr_idx >= ( MAX_WRS_PER_MSG - 1 ) )
+               {
+                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("ResourceErr: wr_idx %d >= MAX_WRS_PER_MSG-1 %d\n",
+                                       wr_idx,( MAX_WRS_PER_MSG - 1 )) );
                        return NDIS_STATUS_RESOURCES;
+               }

-               next_sge = p_sgl->Elements[sgl_idx].Length;
-               next_sgl_addr = p_sgl->Elements[sgl_idx].Address.QuadPart;
+               if( cur_sge == 0 )
+               {
+                       cur_sge = p_sgl->Elements[sgl_idx].Length;
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                               ("NEW sge[%u] cur_sge %u wr_size %u total_ip_len %u\n",
+                                       sgl_idx, cur_sge, (tx_mtu - wr_size), total_ip_len) );
+                       next_sgl_addr = p_sgl->Elements[sgl_idx].Address.QuadPart;
+               }

-               while( next_sge )
+               while( cur_sge )
                {
                        if( ds_idx == 0 )
-                       {       /* new ipoib + ip header */
-                               ((ipoib_hdr_t*)p_buf)->type = p_eth_hdr->type;
-                               ((ipoib_hdr_t*)p_buf)->resv = 0;
-                               p_desc->send_wr[wr_idx].local_ds[ds_idx].vaddr = cl_get_physaddr( p_buf );
-                               p_desc->send_wr[wr_idx].local_ds[ds_idx].lkey = p_port->ib_mgr.lkey;
-                               p_desc->send_wr[wr_idx].local_ds[ds_idx].length = sizeof( ipoib_hdr_t );
-                               p_buf += sizeof( ipoib_hdr_t );
+                       {       /* ipoib header preset in send_wr[0] */
+                               p_desc->send_wr[wr_idx].local_ds[0] = p_desc->send_wr[0].local_ds[0];
+
                                ++ds_idx;

-                               memcpy( p_buf, p_ip_hdr, sizeof( ip_hdr_t ) );
+                               /* set IP header */
+                       memcpy( p_buf, p_ip_hdr, sizeof( ip_hdr_t ) );
                                if( p_options && options_len )
                                {
                                        /* copy ip options if needed */
                                        __copy_ip_options( &p_buf[sizeof(ip_hdr_t)],
-                                                               p_options, options_len, FALSE );
+                                                                          p_options,
+                                                                          options_len,
+                                                                          FALSE );
                                }
                                wr_size = ip_hdr_len;
+
+                               /* ds_idx == 1, setup IP header */
+                               p_desc->send_wr[wr_idx].local_ds[1].length = ip_hdr_len;
+                               p_desc->send_wr[wr_idx].local_ds[1].vaddr = cl_get_physaddr( p_buf );
+                               p_desc->send_wr[wr_idx].local_ds[1].lkey = p_port->ib_mgr.lkey;
+                               ++ds_idx;
+                       }
+
+                       mtu_avail = tx_mtu - wr_size;
+                       mtu_data = (int) (mtu_avail  & ~7);
+
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                               ("tot_ip_len %u mtu_avail %u wr_size %u mtu_data %d cur_sge %u\n",
+                                       total_ip_len, mtu_avail, wr_size, mtu_data, cur_sge) );
+
+                       /* IP Packet data must be in 8-byte chunks, except for the last frag. */
+
+                       if( total_ip_len <= mtu_avail && cur_sge <= mtu_avail)
+                       {
+                               IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                                       ("Last Frag(%u)\n", (frag_cnt+1)) );
+                               seg_len = cur_sge;
                        }
-                       if( ds_idx == 1 )
+                       else
+                               seg_len = cur_sge & (~7);
+
+                       if( seg_len == 0 )
                        {
-                               p_desc->send_wr[wr_idx].local_ds[ds_idx].length = ip_hdr_len;
-                               p_desc->send_wr[wr_idx].local_ds[ds_idx].vaddr = cl_get_physaddr( p_buf );
-                               p_desc->send_wr[wr_idx].local_ds[ds_idx].lkey = p_port->ib_mgr.lkey;
-                               ++ds_idx;
+                               if( last_frag )
+                               {
+                                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                                               ("seg_len == 0 && last_frag > 0: last_frag %u need %u\n",
+                                                       last_frag,cur_sge) );
+                                       last_frag += cur_sge;
+                                       seg_len = cur_sge;
+                                       CL_ASSERT(0);
+                               }
+                               else
+                               {
+                                       last_frag = seg_len = cur_sge;
+                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                                               ("last_frag(0): last_frag/seg_len %u\n", seg_len) );
+                               }
+                               seg_len = (seg_len > (uint32_t)mtu_data
+                                                               ? (uint32_t)mtu_data : seg_len );
+                       }
+                       else
+                       {
+                               if( last_frag )
+                               {        // frag unaligned
+                                       need = 8 - last_frag;
+                                       seg_len = (seg_len - 8) + need;
+
+                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                                               ("last_frag_ADJ: last %u need %u cur_sge %u "
+                                                "adj_seg_len %u\n", last_frag, need, cur_sge, seg_len) );
+                                       last_frag = 0;
+                               }
+                               else
+                                       need = 0;
+
+                               seg_len = (seg_len > (uint32_t)mtu_data
+                                                               ? ((uint32_t)mtu_data + need) : seg_len);
                        }

-                       seg_len = ( next_sge > ( p_port->p_adapter->params.payload_mtu - wr_size ) )?
-                               ( p_port->p_adapter->params.payload_mtu - wr_size ) : next_sge;
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                               ("Set wr[%d].ds[%d] seg_len %u\n", wr_idx, ds_idx, seg_len) );

                        p_desc->send_wr[wr_idx].local_ds[ds_idx].vaddr = next_sgl_addr;
                        p_desc->send_wr[wr_idx].local_ds[ds_idx].length = seg_len;
@@ -8896,46 +9568,97 @@

                        wr_size += seg_len;
                        total_ip_len -= seg_len;
+                       mtu_data -= seg_len;
+
+                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                               ("mtu_data %d wr_size %u tx_mtu %u total_ip_len %u\n",
+                                       mtu_data, wr_size, tx_mtu, total_ip_len) );
+
+                       if( (int)mtu_data <= 0 || wr_size >= tx_mtu || total_ip_len == 0 )
+                       {       /* fix ip hdr for current fragment */
+                               if( frag_cnt == 0 )
+                               {
+                                       /* fix ip hdr for the first fragment and continue */
+                                       __update_fragment_ip_hdr( (ip_hdr_t* const)p_buf,
+                                                                                         (uint16_t)wr_size,
+                                                                                         IP_FRAGMENT_OFFSET(p_ip_hdr),
+                                                                                         TRUE );
+                                       p_buf += ip_hdr_len;
+                                       p_buf += ((buf_len > ip_hdr_len) ? ( buf_len - ip_hdr_len ): 0);
+                                       frag_offset += (wr_size - ip_hdr_len);
+                                       p_desc->send_wr[wr_idx].wr.num_ds = ds_idx;
+
+                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                                               ("Finalize frag-0 wr[%d] total_ip_len %u frag_offset %u\n",
+                                                       wr_idx, total_ip_len, frag_offset) );
+                               }
+                               else
+                               {
+                                       __update_fragment_ip_hdr( (ip_hdr_t* const)p_buf,
+                                                                                         (uint16_t)wr_size,
+                                                                                         ((uint16_t)(frag_offset >> 3 )),
+                                                                                         (BOOLEAN)(( total_ip_len > 0 ) ||
+                                                                                                       IP_MORE_FRAGMENTS(p_ip_hdr)) );
+                                       p_buf += ip_hdr_len;
+                                       p_desc->send_wr[wr_idx].wr.num_ds = ds_idx;
+                                       frag_offset += (wr_size - ip_hdr_len);
+#if DBG
+                                       if( total_ip_len > 0 )
+                                       {
+                                               CL_ASSERT( (frag_offset & 7) == 0 );
+                                       }
+#endif
+                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                                               ("Finalize frag-%d wr[%d] total_ip_len %u\n",
+                                                       frag_cnt+1, wr_idx, total_ip_len) );
+                               }
+                               frag_cnt++;

-                       if( wr_size >= p_port->p_adapter->params.payload_mtu || total_ip_len == 0 )
-                       {       /* fix ip hdr for that fragment */
-                               __update_fragment_ip_hdr( (ip_hdr_t* const)p_buf, (uint16_t)wr_size,
-                                       ((uint16_t)(frag_offset >> 3 )),
-                                       (BOOLEAN)(( total_ip_len > 0 ) || IP_MORE_FRAGMENTS( p_ip_hdr)) );
-                               p_desc->send_wr[wr_idx].wr.num_ds = ds_idx;
                                if( total_ip_len > 0 )
                                {
                                        ++wr_idx;
-                                       frag_offset += (wr_size - ip_hdr_len);
                                        wr_size = 0;
                                        ds_idx = 0;
-                                       p_buf += ip_hdr_len;
+                                       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+                                               ("New wr[%d] sgl[%d] < Max %d, total_ip_len %u cur_sge %u "
+                                                "seg_len %u\n",
+                                                       wr_idx,sgl_idx,p_sgl->NumberOfElements,
+                                                       total_ip_len, (cur_sge-seg_len), seg_len) );
+                               }
+                               else
+                               {
+                                       CL_ASSERT( (cur_sge - seg_len) == 0 );
+                                       DIPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_FRAG,
+                                               ("IP bytes remaining %d\n", (cur_sge - seg_len)) );
                                }
                        }
-                       next_sge -= seg_len;
-                       if( next_sge > 0 )
-                       {
+                       cur_sge -= seg_len;
+                       if( cur_sge > 0 )
                                next_sgl_addr += seg_len;
-                       }
                }
        }
-       p_desc->num_wrs += wr_idx;
-
-       IPOIB_EXIT( IPOIB_DBG_SEND );
+       p_desc->num_wrs = wr_idx + 1;
+
+       IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_FRAG,
+               ("Exit - num_wrs %d frag_cnt %u\n",p_desc->num_wrs,frag_cnt) );
+
+       IPOIB_EXIT( IPOIB_DBG_FRAG );
        return NDIS_STATUS_SUCCESS;
 }


 static void
 __update_fragment_ip_hdr(
-IN             ip_hdr_t* const         p_ip_hdr,
-IN             uint16_t                        fragment_size,
-IN             uint16_t                        fragment_offset,
-IN             BOOLEAN                         more_fragments )
+       IN              ip_hdr_t* const         p_ip_hdr,
+       IN              uint16_t                        fragment_size,
+       IN              uint16_t                        fragment_offset,
+       IN              BOOLEAN                         more_fragments )
 {
        uint16_t*       p_hdr = (uint16_t*)p_ip_hdr;
+
        p_ip_hdr->length = cl_hton16( fragment_size ); // bytes
        p_ip_hdr->offset = cl_hton16( fragment_offset ); // 8-byte units
+
        if( more_fragments )
        {
                IP_SET_MORE_FRAGMENTS( p_ip_hdr );
@@ -8950,10 +9673,10 @@

 static void
 __copy_ip_options(
-IN             uint8_t*        p_buf,
-IN             uint8_t*        p_options,
-IN             uint32_t        options_len,
-IN             BOOLEAN         copy_all )
+       IN              uint8_t*        p_buf,
+       IN              uint8_t*        p_options,
+       IN              uint32_t        options_len,
+       IN              BOOLEAN         copy_all )
 {
        uint32_t        option_length;
        uint32_t        total_length = 0;
@@ -9015,4 +9738,3 @@
        }
        return;
 }
-#endif
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ipoib_port.cpp.patch
Type: application/octet-stream
Size: 146634 bytes
Desc: ipoib_port.cpp.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20110111/93e3645d/attachment.obj>


More information about the ofw mailing list