[ofw] [PATCH] ipoib-CM new files 2 of 2

Smith, Stan stan.smith at intel.com
Wed Jan 12 09:06:30 PST 2011


Handle the bulk of Connected Mode operation: connection setup/teardown and the RC send/recv completion callbacks

Signed-off-by: Stan Smith <stan.smith at intel.com>

/*
 * Copyright (c) 2011 Intel Corporation.  All rights reserved.
 * Copyright (c) 2008 QLogic Corporation.  All rights reserved.
 *
 * This software is available to you under the OpenIB.org BSD license
 * below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id:
 */

#include <precompile.h>

#include <complib/cl_math.h>    // for ROUNDUP
#include <inaddr.h>
#include <ip2string.h>

#if defined(EVENT_TRACING)
#ifdef offsetof
#undef offsetof
#endif
#include "ipoib_cm.tmh"
#endif
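
/* Forward declarations of the CM callbacks, SRQ/recv buffer management and
 * RC connection setup/teardown routines defined below.
 */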


static void
__cm_recv_mgr_reset(
        IN              ipoib_port_t* const             p_port,
        IN              ipoib_endpt_t* const    p_endpt );

static void
__cm_buf_mgr_put_recv(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_cm_recv_desc_t* const     p_desc,
        IN              BOOLEAN                                         update,
        IN              NET_BUFFER_LIST* const          p_net_buffer_list OPTIONAL );

static void
__cm_buf_mgr_put_recv_list(
        IN              ipoib_port_t* const             p_port,
        IN              cl_qlist_t* const               p_list );

static ib_api_status_t
__cm_post_srq_recv(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt );

static void
__cm_send_cb(
        IN              const   ib_cq_handle_t          h_cq,
        IN                              void*                           cq_context );

static int32_t
__cm_recv_mgr_filter(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt,
        IN              ib_wc_t* const                          p_done_wc_list,
        OUT             cl_qlist_t* const                       p_done_list,
        OUT             cl_qlist_t* const                       p_bad_list );

static BOOLEAN
__cm_recv_internal(
        IN              const   ib_cq_handle_t          h_cq,
        IN                              void*                           cq_context,
        IN                              uint32_t                        *p_recv_cnt );

static void
__cm_recv_cb(
        IN              const   ib_cq_handle_t          h_cq,
        IN                              void*                           cq_context );

/* callback for connect reply (REP) */
static void
__conn_reply_cb(
        IN              ib_cm_rep_rec_t*                p_cm_rep );

/* callback on REQ arrival while in listen() */
static void
__conn_req_cb(
        IN              ib_cm_req_rec_t*                p_cm_req );

/* MRA callback */
static void
__conn_mra_cb(
        IN              ib_cm_mra_rec_t*                p_mra_rec );

/* RTU callback */
static void
__conn_rtu_cb(
        IN              ib_cm_rtu_rec_t*                p_rtu_rec );

/* REJ callback */
static void
__conn_rej_cb(
        IN              ib_cm_rej_rec_t*                p_rej_rec );

/* callback on DREQ (Disconnect Request) arrival */
static void
__active_conn_dreq_cb(
         IN     ib_cm_dreq_rec_t*                       p_dreq_rec );

static void
__passive_conn_dreq_cb(
         IN     ib_cm_dreq_rec_t*                       p_dreq_rec );

static ib_api_status_t
__conn_accept(
        IN              ipoib_port_t* const             p_port,
        IN              ipoib_endpt_t*                  p_endpt,
        IN              ib_cm_req_rec_t*                p_cm_req,
        IN              ib_recv_wr_t*                   p_recv_wr );

static void
__conn_reject(
        IN              ipoib_port_t* const             p_port,
        IN              ib_cm_handle_t                  h_cm_handle,
        IN              ib_rej_status_t                 rej_status );

static void
__conn_send_dreq(
        IN              ipoib_port_t*   const   p_port,
        IN              ipoib_endpt_t*  const   p_endpt );

static void
__cq_async_event_cb(
        IN                      ib_async_event_rec_t            *p_event_rec );

static void
__srq_qp_async_event_cb(
        IN              ib_async_event_rec_t            *p_event_rec );

static void
__queue_tx_resource_free(
        IN              ipoib_port_t*           p_port,
        IN              ipoib_endpt_t*          p_endpt );


static void
__endpt_cm_buf_mgr_construct(
        IN              cm_buf_mgr_t * const            p_buf_mgr );

static cl_status_t
__cm_recv_desc_ctor(
        IN              void* const                                     p_object,
        IN              void*                                           context,
        OUT             cl_pool_item_t** const          pp_pool_item );

static void
__cm_recv_desc_dtor(
        IN              const   cl_pool_item_t* const           p_pool_item,
        IN                              void                                            *context );

static boolean_t
__cm_recv_is_dhcp(
        IN              const ipoib_pkt_t* const        p_ipoib );

static ib_api_status_t
__endpt_cm_recv_arp(
        IN              ipoib_port_t* const                     p_port,
        IN              const   ipoib_pkt_t* const      p_ipoib,
        OUT             eth_pkt_t* const                        p_eth,
        IN              ipoib_endpt_t* const            p_src_endpt );

static ib_api_status_t
__endpt_cm_recv_udp(
        IN              ipoib_port_t* const                     p_port,
        IN              ib_wc_t* const                          p_wc,
        IN              const   ipoib_pkt_t* const      p_ipoib,
        OUT             eth_pkt_t* const                        p_eth,
        IN              ipoib_endpt_t* const            p_src_endpt );

static void
cm_start_conn_destroy(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt,
        IN              int                                                     which_res );

void
endpt_queue_cm_connection(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt );

void
cm_release_resources(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt,
        IN              int                                                     which_res );


static BOOLEAN
cm_start_conn_destruction(
        IN                      ipoib_port_t* const                     p_port,
        IN                      ipoib_endpt_t* const            p_endpt );



char *cm_get_state_str( cm_state_t s )
{
        static char what[28];

        switch( s )
        {
          case IPOIB_CM_DISCONNECTED:
                return "CM_DISCONNECTED";
          case IPOIB_CM_QUEUED_TO_CONNECT:
                return "CM_QUEUED_TO_CONNECT";
          case IPOIB_CM_CONNECTING:
                return "CM_CONNECTING";
          case IPOIB_CM_CONNECTED:
                return "CM_CONNECTED";
          case IPOIB_CM_LISTEN:
                return "CM_LISTEN";
          case IPOIB_CM_DREP_SENT:
                return "CM_DREP_SENT";
          case IPOIB_CM_DREQ_SENT:
                return "CM_DREQ_SENT";
          case IPOIB_CM_DISCONNECT_CLEANUP:
                return "CM_DISCONNECT_CLEANUP";
          case IPOIB_CM_DESTROY:
                return "CM_DESTROY";
          default:
                break;
        }
        //_snprintf(what,sizeof(what),"Unknown CM state %d(%#x)",s,s);
        (void) StringCchPrintf(what,sizeof(what),"Unknown CM state %d(%#x)",s,s);
        return what;
}

#if DBG
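
/* decode_enet_pkt: debug helper that dumps the Ethernet header and, for IPv4
 * packets, the protocol, IP length and source/destination addresses.
 */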

void
decode_enet_pkt(char *preFix, void *hdr, int len, char *postFix)
{
        eth_hdr_t *eh=(eth_hdr_t*)hdr;
        ip_hdr_t *p_ip_hdr =(ip_hdr_t*)(eh + 1);
        char ipp[120];
        char ip_addrs[60];
        char ip_src[16], ip_dst[16];

#if !defined(DBG) || DBG == 0
        UNREFERENCED_PARAMETER(preFix);
        UNREFERENCED_PARAMETER(postFix);
        UNREFERENCED_PARAMETER(len);
#endif

        if (eh->type == ETH_PROT_TYPE_IP)
        {

                ip_addrs[0] = '\0';
                //if (p_ip_hdr->prot == IP_PROT_TCP)
                {
                        RtlIpv4AddressToStringA( (IN_ADDR*)&p_ip_hdr->src_ip, ip_src );
                        RtlIpv4AddressToStringA( (IN_ADDR*)&p_ip_hdr->dst_ip, ip_dst );

                        StringCchPrintf( ip_addrs, sizeof(ip_addrs), " %s --> %s",
                                                         ip_src, ip_dst );
                }
                StringCchPrintf( ipp, sizeof(ipp), "IP_proto(len %d) %s%s",
                                cl_ntoh16(p_ip_hdr->length),
                                get_IP_protocol_str(p_ip_hdr->prot),
                                (ip_addrs[0] ? ip_addrs : "") );
        }
        else
        {
                StringCchPrintf( ipp, sizeof(ipp), "?Unknown Eth proto %#x? ", eh->type);
        }

        cl_dbg_out("%sEnet hdr(calc pkt_len %d):\n\tsrc MAC: %s\n"
                                "\tdst MAC: %s\n\tEnet-proto: %s\n\t%s%s",
                                (preFix ? preFix:"\n"),
                                len,
                                mk_mac_str(&eh->src),
                                mk_mac_str2(&eh->dst),
                                get_eth_packet_type_str(eh->type),
                                ipp,
                                (postFix ? postFix:"\n") );
}
#endif


#if 0

static void
decode_NBL(
        IN              char const                              *preFix,
        IN              ipoib_port_t                    *p_port,
        IN              NET_BUFFER_LIST                 *p_net_buffer_list )
{
        ipoib_cm_recv_desc_t    *p_desc;
        NET_BUFFER_LIST                 *cur_NBL, *next_NBL;
        LONG                                    NBL_cnt = 0;
        PNET_BUFFER                             NB;
        ULONG                                   len, off, i;

        IPOIB_ENTER(IPOIB_DBG_RECV);

        for (cur_NBL = p_net_buffer_list; cur_NBL != NULL; cur_NBL = next_NBL, NBL_cnt++)
        {
                next_NBL = NET_BUFFER_LIST_NEXT_NBL(cur_NBL);
                /* Get the port and descriptor from the NET_BUFFER_LIST. */
                CL_ASSERT(p_port == IPOIB_PORT_FROM_NBL(cur_NBL));
                p_desc = IPOIB_CM_RECV_FROM_NBL(cur_NBL);
#if 0
                decode_enet_pkt( "\n%s", preFix,
                                                 p_desc->p_alloc_buf,
                                                 p_desc->len,
                                                 NULL );
#endif
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("%s[%d] curNBL %p p_desc->len %d NblFlags %#x\n",
                                preFix,
                                NBL_cnt,
                                cur_NBL,
                                p_desc->len,
                                NET_BUFFER_LIST_NBL_FLAGS(cur_NBL)) );

                NB = NET_BUFFER_LIST_FIRST_NB(cur_NBL);
                for(i = 1; NB;i++)
                {
                        MDL             *p_mdl;
                        PUCHAR  p_head;
                        UINT    mdl_len;

                        p_head=NULL;
                        p_mdl = NET_BUFFER_FIRST_MDL(NB);
                        NdisQueryMdl( p_mdl, &p_head, &mdl_len, NormalPagePriority );
                        if( p_head )
                        {
                                len = NET_BUFFER_DATA_LENGTH(NB);
                                off = NET_BUFFER_DATA_OFFSET(NB);
                                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                        (" NB[%d] off %lu len %lu mdl_len %u\n",
                                                i,off,len,mdl_len) );
                                                //(p_head+off),
                                                //(p_desc->p_buf - DATA_OFFSET)) );
                                CL_ASSERT( len == p_desc->len );
                                CL_ASSERT( (p_head+off) == (p_desc->p_buf - DATA_OFFSET));
                                decode_enet_pkt( "\nEdata:", (p_head + off), mdl_len, NULL );
                        }
                        NB=NET_BUFFER_NEXT_NB(NB);
                }
        }
}
#endif
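
/*
 * __cm_create_qp: create the per-endpoint RC send or recv QP, plus its CQ if one
 * does not already exist (each CQ creation takes an endpoint reference).  Recv QPs
 * are bound to the port-wide SRQ, so no RQ depth/SGE attributes are set for them.
 * On failure the corresponding CQ is destroyed and its endpoint reference released.
 */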

static ib_api_status_t
__cm_create_qp(
        IN              ipoib_port_t*                   p_port,
        IN              ipoib_endpt_t*  const   p_endpt,
        IN              boolean_t                               send_qp )
{
        ib_qp_create_t  create_qp;
        ib_cq_create_t  create_cq;
    ib_api_status_t     ib_status;
        ib_qp_handle_t  h_qp = NULL;

        IPOIB_ENTER( IPOIB_DBG_CM_CONN );

        if( send_qp == TRUE && !p_endpt->conn.h_send_cq )
        {
                memset( &create_cq, 0, sizeof( ib_cq_create_t ) );
                create_cq.size = p_port->p_adapter->params.sq_depth;
                create_cq.pfn_comp_cb = __cm_send_cb;

                ib_status = p_endpt->p_ifc->create_cq( p_port->ib_mgr.h_ca,
                                                                                           &create_cq,
                                                                                           p_endpt,
                                                                                           __cq_async_event_cb,
                                                                                           &p_endpt->conn.h_send_cq );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("ERR: create send CQ %s\n",
                                        p_endpt->p_ifc->get_err_str( ib_status )) );
                        goto err_exit;
                }
                ipoib_endpt_ref( p_endpt );
        }

        /* Creating a Recv QP? */
        if( send_qp == FALSE && !p_endpt->conn.h_recv_cq )
        {
                memset( &create_cq, 0, sizeof( ib_cq_create_t ) );
                create_cq.size = p_port->p_adapter->params.rq_depth;
                create_cq.pfn_comp_cb = __cm_recv_cb;

                ib_status = p_endpt->p_ifc->create_cq( p_port->ib_mgr.h_ca,
                                                                                           &create_cq,
                                                                                           p_endpt,
                                                                                           __cq_async_event_cb,
                                                                                           &p_endpt->conn.h_recv_cq );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Failed Create RECV CQ %s\n",
                                        p_endpt->p_ifc->get_err_str( ib_status )) );
                        goto err_exit;
                }
                ipoib_endpt_ref( p_endpt );
        }

        memset( &create_qp, 0, sizeof( ib_qp_create_t ) );

        create_qp.qp_type = IB_QPT_RELIABLE_CONN;

        if( send_qp == TRUE )
        {
                create_qp.sq_signaled   = TRUE;
                create_qp.h_sq_cq               = p_endpt->conn.h_send_cq;
                create_qp.sq_depth              = p_port->p_adapter->params.sq_depth;
                create_qp.sq_sge                = min( MAX_SEND_SGE, p_port->max_sq_sge_supported );

                /* not used, IBAL requires a CQ */
                create_qp.h_rq_cq               = p_endpt->conn.h_send_cq;

                DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_CONN,
                        ("create_qp[Send]: sq_sge %u sq_depth %u\n",
                                create_qp.sq_sge, create_qp.sq_depth) );
        }
        else
        {
                create_qp.sq_signaled   = TRUE;
                create_qp.h_rq_cq               = p_endpt->conn.h_recv_cq;
                /* QP create error if Recv Queue attributes set and SRQ attached */

                ASSERT( p_port->ib_mgr.h_srq );
                create_qp.h_srq                 = p_port->ib_mgr.h_srq;

                /* not used, IBAL requires a CQ */
                create_qp.h_sq_cq               = p_endpt->conn.h_recv_cq;

                DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_CONN,
                        ("create_qp[Recv]: rq_sge %u rq_depth %u\n",
                                create_qp.rq_sge, create_qp.rq_depth) );
        }

        ib_status = p_endpt->p_ifc->create_qp( p_port->ib_mgr.h_pd,
                                                                                   &create_qp,
                                                                                   p_endpt,
                                                                                        __srq_qp_async_event_cb,
                                                                                   &h_qp );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Create RC [%s] QP failed status %s\n",
                                (send_qp ? "Send" : "Recv"),
                                p_endpt->p_ifc->get_err_str( ib_status )) );
                goto err_exit;
        }
        else
        {
                IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_CM_CONN,
                        ("Created CM %s QP %p for EP %s\n",
                                (send_qp ? "Send" : "Recv"), h_qp, p_endpt->tag) );
        }

        if( send_qp )
                p_endpt->conn.h_send_qp = h_qp;
        else
        {
                p_endpt->conn.h_recv_qp = h_qp;
                cl_atomic_inc( &p_port->ib_mgr.srq_qp_cnt );
        }

err_exit:
        if( ib_status != IB_SUCCESS )
        {
                if( !send_qp && p_endpt->conn.h_recv_cq )
                {
                        p_endpt->p_ifc->destroy_cq( p_endpt->conn.h_recv_cq, NULL );
                        p_endpt->conn.h_recv_cq = NULL;
                        ipoib_endpt_deref( p_endpt );
                }

                if( send_qp && p_endpt->conn.h_send_cq )
                {
                        p_endpt->p_ifc->destroy_cq( p_endpt->conn.h_send_cq, NULL );
                        p_endpt->conn.h_send_cq = NULL;
                        ipoib_endpt_deref( p_endpt );
                }
        }

        IPOIB_EXIT( IPOIB_DBG_CM_CONN );
        return ib_status;
}


/*
 * destroy Endpoint RC connection resources.
 * Returns:
 *      TRUE = caller can destroy the endpoint: all RC connection resources released.
 *      FALSE = RC connection release will destroy the endpoint object later.
 */
BOOLEAN
cm_destroy_conn(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt )
{
        cm_state_t      cm_state;
        BOOLEAN         status;

        IPOIB_ENTER( IPOIB_DBG_CM_DCONN );

        cm_state = endpt_cm_get_state( p_endpt );

        IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_CM_DCONN,
                ("EP %s %s\n", p_endpt->tag, cm_get_state_str(cm_state)) );

        if( cm_state == IPOIB_CM_QUEUED_TO_CONNECT || cm_state == IPOIB_CM_CONNECTING )
        {
                p_endpt->cm_ep_destroy++;
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Port[%d] EP %s CM_CONNECTING - abort connect operation.\n",
                                p_port->port_num, p_endpt->tag) );
                return FALSE;
        }

        if( cm_state == IPOIB_CM_DESTROY )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Port[%d] EP %s CM_DESTROY - been here before, abort.\n",
                                p_port->port_num, p_endpt->tag) );
                return TRUE;
        }

        if( cm_state == IPOIB_CM_DISCONNECTED && !p_endpt->cm_ep_destroy )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_DCONN,
                        ("Port[%d] EP %s CM_DISCONNECTED (!cm_ep_destroy) - nothing to do.\n",
                                p_port->port_num, p_endpt->tag) );
                return TRUE;
        }

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_DCONN,
                ("Port[%d] Destroying Endpoint %s MAC %s %s\n",
                        p_port->port_num, p_endpt->tag, mk_mac_str(&p_endpt->mac),
                        cm_get_state_str(cm_state)) );

        status = cm_start_conn_destruction( p_port, p_endpt );

        IPOIB_EXIT( IPOIB_DBG_CM_DCONN );
        return status;
}
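
/*
 * endpt_cm_connect: active-side connect.  Creates the RC send QP/CQ, resolves a
 * path from the endpoint's MAC, and sends a CM REQ whose private data carries our
 * UD QPN and receive MTU.  The REP/MRA/REJ callbacks registered here complete or
 * abort the connection.
 */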


ib_api_status_t
endpt_cm_connect(
         IN                     ipoib_endpt_t* const            p_endpt )
{
        ib_api_status_t         ib_status = IB_SUCCESS;
        ib_cm_req_t                     creq;
        ipoib_port_t*           p_port;
        ib_path_rec_t           path_rec;

        IPOIB_ENTER( IPOIB_DBG_CM );

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("RC Connecting to MAC %s via EP %s\n",
                        mk_mac_str(&p_endpt->mac), p_endpt->tag) );

        p_port = ipoib_endpt_parent( p_endpt );

        if( !p_port->p_adapter->params.cm_enabled )
                return IB_UNSUPPORTED;

        if( p_port->p_adapter->state != IB_PNP_PORT_ACTIVE )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("EP %s Port[%d] NOT active\n", p_endpt->tag, p_port->port_num ) );
                return IB_INVALID_STATE;
        }
        if( p_endpt->cm_ep_destroy )
                return IB_INVALID_STATE;

        ib_status = __cm_create_qp( p_port, p_endpt, TRUE );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Endpt %p CM create send QP/CQ failed %s\n",
                                p_endpt, p_endpt->p_ifc->get_err_str( ib_status )) );
                return ib_status;
        }
        if( p_endpt->cm_ep_destroy )
                return IB_INVALID_STATE;

        memset( &creq, 0, sizeof(ib_cm_req_t) );
        memset( &path_rec, 0, sizeof(ib_path_rec_t) );

        p_endpt->conn.private_data.ud_qpn = p_port->ib_mgr.qpn;
        p_endpt->conn.private_data.recv_mtu =
                cl_hton32( p_port->p_adapter->params.cm_payload_mtu + sizeof(ipoib_hdr_t) );

        creq.svc_id                     = p_endpt->conn.service_id;
        creq.max_cm_retries     = 5;

        if( ipoib_mac_to_path(p_port, p_endpt->mac, &path_rec) != STATUS_SUCCESS )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("ipoib_mac_to_path failed\n" ) );
                return IB_INVALID_PARAMETER;
        }
        if( p_endpt->cm_ep_destroy )
                return IB_INVALID_STATE;

        creq.p_primary_path = (ib_path_rec_t*)&path_rec;
        creq.p_req_pdata        = (uint8_t *)&p_endpt->conn.private_data;
        creq.req_length         = (uint8_t) sizeof( cm_private_data_t );

        creq.qp_type            = IB_QPT_RELIABLE_CONN;
        creq.h_qp                       = p_endpt->conn.h_send_qp;

        //creq.resp_res         = 1;
        //creq.init_depth       = 1;

        creq.remote_resp_timeout = ib_path_rec_pkt_life(&path_rec) + 1;
        creq.flow_ctrl                   = FALSE;       // srq attached qp does not support FC
        creq.local_resp_timeout  = ib_path_rec_pkt_life(&path_rec) + 1;
        creq.rnr_nak_timeout     = 7;
        creq.rnr_retry_cnt               = 1; /* IPoIB CM RFC draft warns against retries */
        creq.retry_cnt                   = 1; /* IPoIB CM RFC draft warns against retries */

        //creq.pfn_cm_req_cb    = (ib_pfn_cm_req_cb_t)NULL; no peer connections
        creq.pfn_cm_rep_cb      = __conn_reply_cb;
        creq.pfn_cm_mra_cb      = __conn_mra_cb;
        creq.pfn_cm_rej_cb      = __conn_rej_cb;

        creq.h_al                       = p_port->p_adapter->h_al;
        creq.pkey                       = path_rec.pkey;

        ib_status = p_endpt->p_ifc->cm_req( &creq );
        if( ib_status != IB_SUCCESS && ib_status != IB_PENDING )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("ib_cm_req failed status %s\n",
                                p_endpt->p_ifc->get_err_str( ib_status )) );
        }
#if DBG
        else
        {
                IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_CM,
                        ("CM REQ sent to EP %s UD QPN: %#x SID: %#I64x\n",
                                p_endpt->tag, cl_ntoh32(p_endpt->qpn), p_endpt->conn.service_id) );
        }
#endif

        IPOIB_EXIT( IPOIB_DBG_CM );
        return ib_status;
}
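
/*
 * ipoib_port_listen: passive side.  Initializes the SRQ if needed, marks the local
 * endpoint as listening, and registers a CM listen on the service ID derived from
 * this port's UD QPN; __conn_req_cb handles incoming REQs.
 */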

ib_api_status_t
ipoib_port_listen(
         IN                     ipoib_port_t* const             p_port )
{
        ib_api_status_t         ib_status;
        ib_cm_listen_t          cm_listen;

        IPOIB_ENTER( IPOIB_DBG_CM );

        if( !p_port->p_adapter->params.cm_enabled )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                (" CONNECTED MODE IS NOT ENABLED\n" ) );
                return IB_UNSUPPORTED;
        }

        if( p_port->p_adapter->state != IB_PNP_PORT_ACTIVE )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                (" Port state IS NOT ACTIVE\n" ) );
                return IB_INVALID_STATE;
        }

        if( !p_port->ib_mgr.h_srq )
        {
                ib_status = ipoib_port_srq_init( p_port );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("SRQ Init failed status %s\n",
                        p_port->p_adapter->p_ifc->get_err_str( ib_status )) );
                        return ib_status;
                }
        }

        endpt_cm_set_state( p_port->p_local_endpt , IPOIB_CM_LISTEN );

        memset( &cm_listen, 0, sizeof( ib_cm_listen_t ) );

        ipoib_addr_set_sid( &cm_listen.svc_id, p_port->ib_mgr.qpn );

        cm_listen.qp_type = IB_QPT_RELIABLE_CONN;
        cm_listen.ca_guid = p_port->p_adapter->guids.ca_guid;
        cm_listen.port_guid = p_port->p_adapter->guids.port_guid.guid;
        cm_listen.lid =  IB_ALL_LIDS;
        cm_listen.pkey = p_port->p_adapter->guids.port_guid.pkey;

        cm_listen.pfn_cm_req_cb = __conn_req_cb;

        p_port->p_local_endpt->conn.service_id = cm_listen.svc_id;

        ib_status = p_port->p_adapter->p_ifc->cm_listen(
                                                                                p_port->p_adapter->h_al,
                                                                                &cm_listen,
                                                                                (const void *)p_port,
                                                                                &p_port->p_local_endpt->conn.h_cm_listen );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("cm_listen failed status %#x\n", ib_status ) );

                endpt_cm_buf_mgr_destroy( p_port );
                ipoib_port_srq_destroy( p_port );
        }
#if DBG
        else
        {
                IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_CM,
                        ("\n\tPort[%d] CREATED LISTEN CEP. SID: %#I64x\n",
                                p_port->port_num, p_port->p_local_endpt->conn.service_id ) );
        }
#endif

        IPOIB_EXIT( IPOIB_DBG_CM );
        return ib_status;
}
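
/* ipoib_port_cancel_listen: cancel an outstanding CM listen (if any) and return
 * the endpoint to the CM_DISCONNECTED state.
 */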


ib_api_status_t
ipoib_port_cancel_listen(
        IN      ipoib_endpt_t*  const   p_endpt )
{
        ib_api_status_t  ibs = IB_SUCCESS;

        IPOIB_ENTER( IPOIB_DBG_CM );

        if( p_endpt->conn.h_cm_listen )
        {
                ibs = p_endpt->p_ifc->cm_cancel( p_endpt->conn.h_cm_listen, NULL );

                endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );

                p_endpt->conn.h_cm_listen = NULL;
        }

        IPOIB_EXIT( IPOIB_DBG_CM );

        return ibs;
}


/*
 * received a connection request (REQ) while listening.
 */
static void
__conn_req_cb(
        IN                      ib_cm_req_rec_t                         *p_cm_req )
{
        ib_api_status_t         ib_status = IB_ERROR;
        ipoib_endpt_t*          p_endpt;
        ipoib_port_t*           p_port;
        cm_private_data_t       private_data;
        uint32_t                        mtu;
        ib_rej_status_t         rej_status = IB_REJ_INSUFFICIENT_RESP_RES;
        cm_state_t                      cm_state;
        ib_recv_wr_t*           p_recv_wr=NULL;

        IPOIB_ENTER( IPOIB_DBG_CM );

        CL_ASSERT( p_cm_req );

        p_port = (ipoib_port_t*) p_cm_req->context;
        p_endpt = ipoib_endpt_get_by_gid( p_port, &p_cm_req->primary_path.dgid );

        if( !p_endpt )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("No matching Endpoint by gid?\n") );
                return;
        }

        cm_state = endpt_cm_get_state(p_endpt);
        if ( cm_state > IPOIB_CM_LISTEN )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("REQ while EP %s being destroyed, Reject.\n", p_endpt->tag) );
                rej_status = IB_REJ_STALE_CONN;
                goto conn_exit;
        }

        IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_CM,
                ("Recv'ed conn REQ in listen() from EP %s\n", p_endpt->tag) );

        if( p_endpt->conn.h_recv_qp )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("EP %s conn.h_recv_qp != null? - Rejecting.\n", p_endpt->tag) );
                // XXX no REJ_CONSUMER defined per spec?
                rej_status = IB_REJ_STALE_CONN;
                goto conn_exit;
        }

        /* copy private data and parse */
        private_data = (*(cm_private_data_t *)p_cm_req->p_req_pdata);

        if( private_data.ud_qpn != p_endpt->qpn )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("EP %s BAD Private_Data, INVALID REMOTE QPN %#x EXPECT %#x, Rejected\n",
                                p_endpt->tag, cl_ntoh32( private_data.ud_qpn ),
                                cl_ntoh32( p_endpt->qpn ) ));
                rej_status = IB_REJ_INVALID_COMM_ID;
                goto conn_exit;
        }

        if( !p_endpt->conn.service_id )
        {
                p_endpt->cm_flag = IPOIB_CM_FLAG_RC;
                ipoib_addr_set_sid( &p_endpt->conn.service_id, private_data.ud_qpn );
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("EP %s service_id set %#I64x QPN %#x\n",
                                p_endpt->tag, p_endpt->conn.service_id,
                                cl_ntoh32( private_data.ud_qpn )) );
        }

        mtu = p_port->p_adapter->params.cm_payload_mtu + sizeof(ipoib_hdr_t);

        if( cl_ntoh32( private_data.recv_mtu ) > mtu )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("EP %s INVALID REMOTE MTU: %u. MAX EXPECT: %u, Rejected\n",
                                p_endpt->tag, cl_ntoh32( private_data.recv_mtu ), mtu) );
                rej_status = IB_REJ_INVALID_MTU;
                goto conn_exit;
        }

        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("EP %s MTU: REMOTE %u. Local %u\n",
                        p_endpt->tag, cl_ntoh32( private_data.recv_mtu ), mtu) );

        ib_status = __cm_create_qp( p_port, p_endpt, FALSE );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("EP %s CM create recv QP failed, Reject\n", p_endpt->tag ) );
                rej_status = IB_REJ_INSUF_RESOURCES;
                goto conn_exit;
        }

        ib_status = __cm_post_srq_recv( p_port, p_endpt );

        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("failed to Post recv WRs\n" ) );
                goto conn_exit;
        }

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("Rx CM REQ(Accepting) port[%d] EP %s %s\n",
                        p_port->port_num, p_endpt->tag,
                        cm_get_state_str(endpt_cm_get_state(p_endpt))) );

        ib_status = __conn_accept( p_port, p_endpt, p_cm_req, p_recv_wr );

        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("EP %s CM accept failed '%s'\n",
                                p_endpt->tag, p_endpt->p_ifc->get_err_str(ib_status)) );
                goto conn_exit2; /* IBAL has already rejected REQ */
        }

        ib_status = p_endpt->p_ifc->rearm_cq( p_endpt->conn.h_recv_cq, FALSE );

        if( ib_status == IB_SUCCESS )
        {
                IPOIB_EXIT( IPOIB_DBG_CM );
                return;
        }

        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                ("rearm Recv CQ failed status %s\n",
                        p_endpt->p_ifc->get_err_str(ib_status)) );

conn_exit:

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("REJECTING CM Rx connection port[%d] Endpoint %s\n",
                        p_port->port_num, p_endpt->tag) );

        __conn_reject( p_port, p_cm_req->h_cm_req, rej_status );

conn_exit2:
        cm_release_resources( p_port, p_endpt, 2 ); // release Rx resources.

        IPOIB_EXIT( IPOIB_DBG_CM );
}
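
/*
 * __conn_accept: build and send the CM REP for an incoming REQ.  Private data
 * advertises our UD QPN and receive MTU; the passive-side recv QP is handed to
 * the CM along with the REJ/MRA/RTU/DREQ callbacks that drive the rest of the
 * passive connection.
 */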


static ib_api_status_t
__conn_accept(
        IN              ipoib_port_t* const             p_port,
        IN              ipoib_endpt_t*                  p_endpt,
        IN              ib_cm_req_rec_t                 *p_cm_req,
        IN              ib_recv_wr_t*                   p_recv_wr )
{
        ib_api_status_t         ib_status;
        ib_cm_rep_t                     cm_reply;
        cm_private_data_t       private_data;
        ib_recv_wr_t*           p_failed_wc=NULL;

        IPOIB_ENTER( IPOIB_DBG_CM );

        memset( &cm_reply, 0, sizeof( cm_reply ) );

        private_data.ud_qpn = p_port->ib_mgr.qpn;
        private_data.recv_mtu =
                cl_hton32( p_port->p_adapter->params.cm_payload_mtu + sizeof(ipoib_hdr_t) );

        cm_reply.p_rep_pdata = (uint8_t*)&private_data;
        cm_reply.rep_length = (uint8_t) sizeof( private_data );

        cm_reply.h_qp = p_endpt->conn.h_recv_qp;
        cm_reply.qp_type = IB_QPT_RELIABLE_CONN;

        cm_reply.access_ctrl = IB_AC_LOCAL_WRITE;
        cm_reply.target_ack_delay       = 10;
        cm_reply.failover_accepted = IB_FAILOVER_ACCEPT_UNSUPPORTED;
        cm_reply.flow_ctrl                      = p_cm_req->flow_ctrl;
        cm_reply.rnr_nak_timeout        = 7;
        cm_reply.rnr_retry_cnt          = p_cm_req->rnr_retry_cnt;

        cm_reply.pfn_cm_rej_cb          = __conn_rej_cb;
        cm_reply.pfn_cm_mra_cb          = __conn_mra_cb;
        cm_reply.pfn_cm_rtu_cb          = __conn_rtu_cb;
        cm_reply.pfn_cm_dreq_cb         = __passive_conn_dreq_cb;

        if( p_recv_wr )
        {
                cm_reply.p_recv_wr              = p_recv_wr;
                cm_reply.pp_recv_failure = &p_failed_wc;
        }

        ib_status = p_endpt->p_ifc->cm_rep( p_cm_req->h_cm_req, &cm_reply );

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("EP %s sending conn-REP, private_data.recv_mtu %d status %#x\n",
                        p_endpt->tag, cl_hton32(private_data.recv_mtu), ib_status) );

        IPOIB_EXIT( IPOIB_DBG_CM );
        return ib_status;
}


/* received a CM REPLY in response to our sending a connection REQ, next send RTU. */

static void
__conn_reply_cb(
        IN              ib_cm_rep_rec_t                 *p_cm_rep )
{
        ib_api_status_t ib_status = IB_ERROR;
        ipoib_endpt_t*  p_endpt ;
        ipoib_port_t*   p_port;
        ib_cm_rtu_t             cm_rtu;

        IPOIB_ENTER( IPOIB_DBG_CM );
        CL_ASSERT( p_cm_rep );

        p_endpt = (ipoib_endpt_t* ) p_cm_rep->qp_context;
        if( ! p_endpt )
                return;

        p_port  = ipoib_endpt_parent( p_endpt );
        ASSERT( p_port );

        if( endpt_cm_get_state( p_endpt ) != IPOIB_CM_CONNECTING )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Port[%d] EP %s Wrong state %s ?, reject - Active conn Abort.\n",
                                p_port->port_num, p_endpt->tag,
                                cm_get_state_str(endpt_cm_get_state(p_endpt))) );
                goto done;
        }

        ib_status = p_endpt->p_ifc->rearm_cq( p_endpt->conn.h_send_cq, FALSE );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("failed Rearm CM Send CQ %s\n",
                                p_endpt->p_ifc->get_err_str(ib_status)) );
                goto done;
        }

        memset( &cm_rtu, 0, sizeof( ib_cm_rtu_t ) );
        cm_rtu.access_ctrl = IB_AC_LOCAL_WRITE;
        cm_rtu.pfn_cm_dreq_cb = __active_conn_dreq_cb;
        cm_rtu.p_rtu_pdata = (uint8_t*)&p_endpt->conn.private_data;
        cm_rtu.rtu_length = sizeof( cm_private_data_t );

        ib_status = p_endpt->p_ifc->cm_rtu(p_cm_rep->h_cm_rep, &cm_rtu );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Endpoint %s DLID %#x Connect failed (cm_rtu) status %s\n",
                                p_endpt->tag, cl_ntoh16(p_endpt->dlid),
                                p_endpt->p_ifc->get_err_str(ib_status)) );
                goto done;
        }

        /* does somebody else want this EP to go away? */
        if( endpt_cm_get_state( p_endpt ) != IPOIB_CM_CONNECTING )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Endpoint %s Connect Aborted\n", p_endpt->tag) );
                ib_status = IB_INVALID_STATE;
                goto done;
        }

        cl_obj_lock( &p_port->obj );
        cl_fmap_insert( &p_port->endpt_mgr.conn_endpts,
                                        &p_endpt->dgid,
                                        &p_endpt->conn_item );
        cl_obj_unlock( &p_port->obj );

        p_endpt->tx_mtu = p_port->p_adapter->params.cm_payload_mtu + sizeof(ipoib_hdr_t);
        endpt_cm_set_state(p_endpt, IPOIB_CM_CONNECTED);

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("Active RC CONNECTED to EP %s\n", p_endpt->tag) );
done:
        if( ib_status != IB_SUCCESS )
        {
                __conn_reject( p_port, p_cm_rep->h_cm_rep, IB_REJ_INSUF_RESOURCES );
                if( !p_endpt->cm_ep_destroy )
                {
                        cm_release_resources( p_port, p_endpt, 1 );
                        endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );
                }
        }

        IPOIB_EXIT( IPOIB_DBG_CM );
}
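
/* MRA received: the peer asked for more time to respond; nothing to do here. */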


static void
__conn_mra_cb(
        IN              ib_cm_mra_rec_t                 *p_mra_rec )
{
        IPOIB_ENTER( IPOIB_DBG_CM );
        UNUSED_PARAM( p_mra_rec );
        IPOIB_EXIT( IPOIB_DBG_CM );
}

/* RTU (Ready To Use) CM message arrived for a passive/listen() connection, after
 * this side has sent a CM Reply message.
 */

static void
__conn_rtu_cb(
        IN              ib_cm_rtu_rec_t                 *p_rtu_rec )
{
        ipoib_endpt_t*  p_endpt;
        ipoib_port_t*   p_port;

        IPOIB_ENTER( IPOIB_DBG_CM );

        CL_ASSERT( p_rtu_rec );
        p_endpt = (ipoib_endpt_t *)p_rtu_rec->qp_context;
        CL_ASSERT( p_endpt );
        p_port = ipoib_endpt_parent( p_endpt );
        CL_ASSERT( p_port );

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("RTU arrived: Passive RC Connected to EP %s posted %d Active RC %s\n",
                        p_endpt->tag,
                        p_port->cm_buf_mgr.posted,
                        cm_get_state_str(endpt_cm_get_state(p_endpt))) );

        if ( endpt_cm_get_state(p_endpt) == IPOIB_CM_DISCONNECTED )
                endpt_queue_cm_connection( p_port, p_endpt );

        IPOIB_EXIT( IPOIB_DBG_CM );
}
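
/*
 * endpt_queue_cm_connection: mark the endpoint CM_QUEUED_TO_CONNECT and put it on
 * the endpoint manager's pending-connection list so its thread initiates the
 * active RC connect.  Requires a CM-capable endpoint (IPOIB_CM_FLAG_RC) with a
 * service ID already set.
 */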


void
endpt_queue_cm_connection(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt )
{
        if( p_endpt->cm_flag != IPOIB_CM_FLAG_RC )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                        ("Unable to queue EP %s for RC connection: EP not CM capable?\n",
                                p_endpt->tag) );
                return;
        }

        if( !p_endpt->conn.service_id )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                        ("Unable to queue EP %s for RC connection: service_id not set?\n",
                                p_endpt->tag) );
                return;
        }

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                ("Queue EP %s for Active RC connection\n", p_endpt->tag) );

        endpt_cm_set_state( p_endpt, IPOIB_CM_QUEUED_TO_CONNECT );

        /* Queue for endpt mgr to RC connect */
        NdisInterlockedInsertTailList( &p_port->endpt_mgr.pending_conns,
                                                                   &p_endpt->list_item,
                                                                   &p_port->endpt_mgr.conn_lock );

        cl_event_signal( &p_port->endpt_mgr.event );
}


/*
 * Queue endpt for CM Tx resource release, endpoint remains UD valid.
 */
static void
__queue_tx_resource_free(
        IN              ipoib_port_t*           p_port,
        IN              ipoib_endpt_t*          p_endpt )
{
        cm_state_t      old_state;

        ASSERT( p_port );
        p_endpt->tx_mtu = p_port->p_adapter->params.payload_mtu;
        old_state = endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECT_CLEANUP );
        if( old_state == IPOIB_CM_CONNECTED )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("EP %s previous stat %s \n",
                                p_endpt->tag, cm_get_state_str(old_state)) );
        }

        if( !p_port->endpt_mgr.thread_is_done )
        {
                DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("Queue EP %s for CM Tx resource cleanup\n", p_endpt->tag) );

                NdisInterlockedInsertTailList( &p_port->endpt_mgr.remove_conns,
                                                                           &p_endpt->list_item,
                                                                           &p_port->endpt_mgr.remove_lock );
                cl_event_signal( &p_port->endpt_mgr.event );
        }
        else
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("EP %s for CM Tx resource cleanup, EP thread not running?\n",
                                p_endpt->tag) );
        }
}
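
/*
 * __conn_rej_cb: our REQ (or REP) was rejected.  Log the rejection reason, release
 * the active-side (Tx) CM resources and return the endpoint to CM_DISCONNECTED.
 */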


static void
__conn_rej_cb(
        IN              ib_cm_rej_rec_t                 *p_rej_rec )
{
        ipoib_endpt_t*  p_endpt;
        ipoib_port_t*   p_port;

        IPOIB_ENTER( IPOIB_DBG_CM );

        CL_ASSERT( p_rej_rec );

        p_endpt = (ipoib_endpt_t* )p_rej_rec->qp_context;
        p_port = ipoib_endpt_parent( p_endpt );

        CL_ASSERT( p_endpt->conn.h_send_qp == p_rej_rec->h_qp ||
                                p_endpt->conn.h_recv_qp == p_rej_rec->h_qp );

        if( p_rej_rec->rej_status == IB_REJ_USER_DEFINED )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Connect REQ Rejected User defined ARI: %d\n",
                                ((uint16_t)(*(p_rej_rec->p_ari+1)))) );
        }
        else
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                ("Connect REQ Rejected Status: %d\n", cl_ntoh16( p_rej_rec->rej_status )) );
        }

        /* endpt not RC connected, release active (Tx) resources */
        cm_release_resources( p_port, p_endpt, 1 );
        endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );

        IPOIB_EXIT( IPOIB_DBG_CM );
}


/* received a conn-DREQ, send conn-DREP */

static void
__active_conn_dreq_cb(
         IN     ib_cm_dreq_rec_t                        *p_dreq_rec )
{
        ib_api_status_t         ib_status;
        ib_cm_drep_t            cm_drep;
        ipoib_endpt_t*          p_endpt ;
        ipoib_port_t*           p_port;
        cm_state_t                      cm_state;

        IPOIB_ENTER( IPOIB_DBG_CM );

        CL_ASSERT( p_dreq_rec );

        p_endpt = (ipoib_endpt_t *)p_dreq_rec->qp_context;

        if( !p_endpt )
                return;

        cm_state = endpt_cm_get_state( p_endpt );

        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("Received DREQ for EP %s %s, return to CM_DISCONNECTED.\n",
                        p_endpt->tag, cm_get_state_str(cm_state) ) );

        p_port = ipoib_endpt_parent( p_endpt );
        ASSERT( p_port );
        p_endpt->tx_mtu = p_port->p_adapter->params.payload_mtu;

        if( cm_state == IPOIB_CM_CONNECTED )
        {
                cm_state = endpt_cm_set_state( p_endpt, IPOIB_CM_DREP_SENT );

                cm_drep.drep_length = 0;
                cm_drep.p_drep_pdata = NULL;

                ib_status = p_endpt->p_ifc->cm_drep( p_dreq_rec->h_cm_dreq, &cm_drep );
                if ( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Failed DREP send EP %s %s\n",
                                        p_endpt->tag, p_endpt->p_ifc->get_err_str(ib_status)) );
                }
#if DBG
                else
                {
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                ("DREP sent to EP %s prev-cstate %s\n",
                                        p_endpt->tag, cm_get_state_str(cm_state)) );
                }
#endif
                cl_obj_lock( &p_port->obj );
                endpt_unmap_conn_dgid( p_port, p_endpt);
                cl_obj_unlock( &p_port->obj );
        }
        cm_release_resources( p_port, p_endpt, 1 );
        endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );

        IPOIB_EXIT( IPOIB_DBG_CM );
}


/* CM conn state is ONLY for an Active connection item. Passive/listen() connections
 * are identified by a non-null endpt->conn.h_recv_qp.
 */
static void
__passive_conn_dreq_cb(
         IN     ib_cm_dreq_rec_t                        *p_dreq_rec )
{
        ib_api_status_t         ib_status;
        ib_cm_drep_t            cm_drep;
        ipoib_endpt_t*          p_endpt ;
        ipoib_port_t*           p_port;

        IPOIB_ENTER( IPOIB_DBG_CM );

        CL_ASSERT( p_dreq_rec );

        p_endpt = (ipoib_endpt_t *)p_dreq_rec->qp_context;

        if( !p_endpt )
                return;

        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("Received DREQ for EP %s Passive conn [%s], release Rx resources.\n",
                        p_endpt->tag, cm_get_state_str(endpt_cm_get_state(p_endpt))) );

        p_port = ipoib_endpt_parent( p_endpt );
        ASSERT( p_port );

        cm_drep.drep_length = 0;
        cm_drep.p_drep_pdata = NULL;

        ib_status = p_endpt->p_ifc->cm_drep( p_dreq_rec->h_cm_dreq, &cm_drep );
        if ( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Failed DREP send %s\n",
                                p_endpt->p_ifc->get_err_str(ib_status)) );
        }
        cm_release_resources( p_port, p_endpt, 2 );     // release Rx resources.

        IPOIB_EXIT( IPOIB_DBG_CM );
}

/*
 * If send & recv QPs are present they are in the ERROR state.
 */
void
cm_destroy_recv_resources(
        IN                              ipoib_port_t* const             p_port,
        IN                              ipoib_endpt_t* const    p_endpt )
{
        ib_api_status_t                 ib_status = IB_SUCCESS;
        ib_wc_t                                 *p_done_wc;
        ib_wc_t                                 wc[MAX_CM_RECV_WC];
        ib_wc_t                                 *p_free_wc;
        ib_wc_t                                 *p_wc;
        ipoib_cm_recv_desc_t    *p_desc;
        int                                             flush_cnt=0;
        int                                             loops;

        IPOIB_ENTER( IPOIB_DBG_CM_DCONN );

        p_endpt->cm_rx_flushing = FALSE;

        if( p_endpt->conn.h_recv_qp )
        {
                BOOLEAN dispatch;

                for( p_free_wc=wc; p_free_wc < &wc[MAX_CM_RECV_WC - 1]; p_free_wc++ )
                        p_free_wc->p_next = p_free_wc + 1;
                p_free_wc->p_next = NULL;

                dispatch = (KeGetCurrentIrql() == DISPATCH_LEVEL);
                loops = p_port->p_adapter->params.rq_depth;
                do
                {
                        p_free_wc = wc;
                        ib_status = p_endpt->p_ifc->poll_cq( p_endpt->conn.h_recv_cq,
                                                                                                 &p_free_wc,
                                                                                                 &p_done_wc );

                        if( ib_status != IB_SUCCESS && ib_status != IB_NOT_FOUND )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("Poll Recv CQ failed: %s\n",
                                                p_endpt->p_ifc->get_err_str( ib_status )) );
                                loops = 1;
                                break;
                        }
                        if( ib_status == IB_SUCCESS )
                        {
                                cl_spinlock_acquire( &p_port->recv_lock );
                                for( p_wc = p_done_wc; p_wc; p_wc = p_wc->p_next )
                                {
                                        p_desc = (ipoib_cm_recv_desc_t *)(uintn_t)p_wc->wr_id;
                                        __cm_buf_mgr_put_recv( p_port, p_desc, TRUE, NULL );
                                        flush_cnt++;
                                }
                                cl_spinlock_release( &p_port->recv_lock );
                        }
                        else if( !dispatch )
                                        cl_thread_suspend(0);

                } while( --loops > 0 );

                if( flush_cnt > 0 )
                {
                        IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                ("Flushed %d Passive RC recv buffers, destroying recv QP\n",
                                        flush_cnt) );
                }

                ib_status = p_endpt->p_ifc->destroy_qp( p_endpt->conn.h_recv_qp, NULL );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Destroy Recv QP failed %s\n",
                                        p_endpt->p_ifc->get_err_str( ib_status )) );
                }
                p_endpt->conn.h_recv_qp = NULL;
                cl_atomic_dec( &p_port->ib_mgr.srq_qp_cnt );
        }

        if( p_endpt->conn.h_recv_cq )
        {
                ib_status = p_endpt->p_ifc->destroy_cq( p_endpt->conn.h_recv_cq, NULL );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Destroy Recv CQ failed: %s\n",
                                        p_endpt->p_ifc->get_err_str( ib_status )) );
                }
                p_endpt->conn.h_recv_cq = NULL;
                ipoib_endpt_deref( p_endpt );
        }

        dmp_ipoib_port_refs( p_port, "cm_destroy_recv_resources()" );

        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_DCONN,
                ("Port[%u] EP %s Rx resources released.\n",
                        p_port->port_num, p_endpt->tag) );

        IPOIB_EXIT( IPOIB_DBG_CM_DCONN );
}


/* Transition QP into ERR state
 * which_res:
 * -1 == both Tx & Rx QPs
 *      0 == both Tx & Rx QPs resources
 *      1 == only Tx QP
 *      2 == only Rx QP
 *
 * Side Effects:
 *  cm_rx_flushing == TRUE, expect async error callback to destroy Rx resources.
 */
static void
cm_start_conn_teardown(
        IN                      ipoib_port_t* const                     p_port,
        IN                      ipoib_endpt_t* const            p_endpt,
        IN                      int                                                     which_res )
{
        ib_api_status_t ib_status;
        ib_qp_mod_t             mod_attr;

        ASSERT( p_port );
        ASSERT( p_endpt );

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_DCONN,
                ("Port[%d] EP %s which_res %d\n",
                        p_port->port_num, p_endpt->tag, which_res) );

        cl_obj_lock( &p_endpt->obj );

        if( (which_res == 0 || which_res == 1) && p_endpt->conn.h_send_qp )
        {
                ib_qp_handle_t h_send_qp = p_endpt->conn.h_send_qp;

                p_endpt->conn.h_send_qp = NULL; // prevent Tx on invalid QP
                p_endpt->conn.h_send_qp_err = h_send_qp;        // save for later destroy.

                memset( &mod_attr, 0, sizeof( mod_attr ) );
                mod_attr.req_state = IB_QPS_ERROR;
                ib_status = p_endpt->p_ifc->modify_qp( h_send_qp, &mod_attr );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("modify_qp(send: IB_QPS_ERROR) %s\n",
                                        p_endpt->p_ifc->get_err_str( ib_status )) );
                }
        }

        if( (which_res == 0 || which_res == 2) && p_endpt->conn.h_recv_qp )
        {
                //if( !p_endpt->cm_rx_flushing )
                {
                        p_endpt->cm_rx_flushing++;
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_CM_DCONN,
                                ("EP %s FLUSHING\n", p_endpt->tag) );
                        memset( &mod_attr, 0, sizeof( mod_attr ) );
                        mod_attr.req_state = IB_QPS_ERROR;
                        ib_status = p_endpt->p_ifc->modify_qp( p_endpt->conn.h_recv_qp,
                                                                                                   &mod_attr );
                        if( ib_status != IB_SUCCESS )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("modify_qp(recv:IB_QPS_ERROR) %s\n",
                                                p_endpt->p_ifc->get_err_str( ib_status )) );
                        }
                        /* QP async error handler will finish the Rx QP destruction task */
                }
        }
        cl_obj_unlock( &p_endpt->obj );
}

/*
 * Begin RC connection destruction.
 * If the Rx QP exists, endpoint object destruction is deferred until the
 * QP's async event callback fires after the CQ has been flushed.
 * Returns:
 *  TRUE  - caller may destroy the endpoint object.
 *  FALSE - the Rx QP async event callback will destroy the endpoint object.
 */
static BOOLEAN
cm_start_conn_destruction(
        IN                      ipoib_port_t* const                     p_port,
        IN                      ipoib_endpt_t* const            p_endpt )
{
        BOOLEAN         status=TRUE;

        IPOIB_ENTER( IPOIB_DBG_CM_DCONN );

        cl_obj_lock( &p_port->obj );
        endpt_unmap_conn_dgid( p_port, p_endpt );
        cl_obj_unlock( &p_port->obj );

        cm_release_resources( p_port, p_endpt, 1 );     // release Tx now.

        ASSERT( !p_endpt->conn.h_send_qp );
        ASSERT( !p_endpt->conn.h_send_qp_err );
        ASSERT( !p_endpt->conn.h_send_cq );

        if( p_endpt->conn.h_recv_qp )
        {
                // flag endpoint object is to be destroyed in Rx qp async error handler.
                p_endpt->cm_ep_destroy++;
                status = FALSE;
                cm_start_conn_teardown( p_port, p_endpt, 2 );   // release Rx
        }
        else
        {
                cm_destroy_recv_resources( p_port, p_endpt );
                ASSERT( !p_endpt->conn.h_recv_qp );
                ASSERT( !p_endpt->conn.h_recv_cq );
        }
        IPOIB_EXIT( IPOIB_DBG_CM_DCONN );
        return status;
}


/* release/free an EP's CM resources:
 * which_res: (<0 implies the routine was called from the async error handler).
 * -2 == only Rx/Passive resources, Rx QP in ERROR state.
 * -1 == both Tx & Rx CM resources, TX & Rx QPs in ERROR state.
 *      0 == both Tx & Rx CM resources
 *      1 == only Tx/Active resources
 *      2 == only Rx/Passive resources
 */
void
cm_release_resources(
        IN                              ipoib_port_t* const             p_port,
        IN                              ipoib_endpt_t* const    p_endpt,
        IN                              int                                             which_res )
{
        ib_api_status_t ib_status = IB_SUCCESS;

        IPOIB_ENTER( IPOIB_DBG_CM_DCONN );

        ASSERT( p_port );
        ASSERT( p_endpt );

        if( which_res >= 0 )
        {
                cm_start_conn_teardown( p_port, p_endpt, which_res);
                cl_obj_lock( &p_endpt->obj );
        }

        if( which_res == -1 || which_res == 1 )
        {
                ib_qp_handle_t h_send_qp = (p_endpt->conn.h_send_qp
                                                                        ? p_endpt->conn.h_send_qp
                                                                        : p_endpt->conn.h_send_qp_err);
                if( h_send_qp )
                {
                        ib_status = p_endpt->p_ifc->destroy_qp( h_send_qp, NULL );
                        if( ib_status != IB_SUCCESS )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("Destroy RC Send QP failed: %s\n",
                                                p_endpt->p_ifc->get_err_str( ib_status )) );
                        }
                        p_endpt->conn.h_send_qp = p_endpt->conn.h_send_qp_err = NULL;
                }
                p_endpt->tx_mtu = p_port->p_adapter->params.payload_mtu;

                if( p_endpt->conn.h_send_cq )
                {
                        ib_status = p_endpt->p_ifc->destroy_cq( p_endpt->conn.h_send_cq, NULL );
                        if( ib_status != IB_SUCCESS )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("Destroy Send CQ failed status %s\n",
                                                p_endpt->p_ifc->get_err_str( ib_status )) );
                        }
                        p_endpt->conn.h_send_cq = NULL;
                        ipoib_endpt_deref( p_endpt );
                        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_DCONN,
                                ("Port[%u] EP %s Tx resources released.\n",
                                        p_port->port_num, p_endpt->tag) );
                }
        }

        /* see srq_async */
        if( which_res < 0 )
                cm_destroy_recv_resources( p_port, p_endpt );
        else
                cl_obj_unlock( &p_endpt->obj );

        IPOIB_EXIT( IPOIB_DBG_CM_DCONN );
}


static void
__cm_send_cb(
        IN              const   ib_cq_handle_t                  h_cq,
        IN                              void                                    *cq_context )
{
        ipoib_port_t            *p_port;
        ib_api_status_t         ib_status;
        ib_wc_t                         wc[MAX_SEND_WC], *p_wc, *p_free;
        cl_qlist_t                      done_list;
        ipoib_endpt_t           *p_endpt;
        ib_api_status_t         send_failed = IB_SUCCESS;
        ip_stat_sel_t           type;
        NET_BUFFER                      *p_netbuffer = NULL;
        ipoib_send_NB_SG        *s_buf;
        cl_perf_t                       *perf;
        ib_al_ifc_t                     *p_ibal;
        NDIS_STATUS                     status = NDIS_STATUS_FAILURE;

        PERF_DECLARE( CMSendCompBundle );
        PERF_DECLARE( CMSendCb );
        PERF_DECLARE( CMPollSend );
        PERF_DECLARE( CMFreeSendBuf );

        IPOIB_ENTER( IPOIB_DBG_SEND );

        ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL);
        cl_perf_clr( CMSendCompBundle );
        cl_perf_start( CMSendCb );
        cl_qlist_init( &done_list );

        p_endpt = (ipoib_endpt_t *)cq_context;
        ASSERT( p_endpt );
        ipoib_endpt_ref( p_endpt );

#if DBG
        if( h_cq ) { ASSERT( h_cq == p_endpt->conn.h_send_cq ); }
#else
        UNUSED_PARAM( h_cq );
#endif

        p_port = ipoib_endpt_parent( p_endpt );
        perf = &p_port->p_adapter->perf;
        p_ibal = p_port->p_adapter->p_ifc;

        for( p_free=wc; p_free < &wc[MAX_SEND_WC - 1]; p_free++ )
                p_free->p_next = p_free + 1;
        p_free->p_next = NULL;
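        /* The local wc array is now a singly linked free list; poll_cq() below
         * consumes entries from it and returns completed WCs via p_wc. */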

        cl_spinlock_acquire( &p_port->send_lock );

        do
        {
                p_free = wc;
                cl_perf_start( CMPollSend );
                ib_status = p_ibal->poll_cq( h_cq, &p_free, &p_wc );
                cl_perf_stop( perf, CMPollSend );
                CL_ASSERT( ib_status == IB_SUCCESS || ib_status == IB_NOT_FOUND );

                while( p_wc )
                {
                        CL_ASSERT(p_wc->status != IB_WCS_SUCCESS || p_wc->wc_type == IB_WC_SEND);
                        s_buf = (ipoib_send_NB_SG*)(uintn_t)p_wc->wr_id;
                        CL_ASSERT( s_buf );

                        DIPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_SEND,
                                ("RC send completion: s_buf %p s_buf->p_nbl %p endpt %s\n",
                                        s_buf, s_buf->p_nbl, s_buf->p_endpt->tag) );

                        ASSERT( p_endpt == s_buf->p_endpt );

                        status = NDIS_STATUS_FAILURE;

                        switch( p_wc->status )
                        {
                        case IB_WCS_SUCCESS:
                                if( p_endpt->h_mcast )
                                {
                                        if( p_endpt->dgid.multicast.raw_group_id[11] == 0xFF &&
                                                p_endpt->dgid.multicast.raw_group_id[10] == 0xFF &&
                                                p_endpt->dgid.multicast.raw_group_id[12] == 0xFF &&
                                                p_endpt->dgid.multicast.raw_group_id[13] == 0xFF )
                                        {
                                                type = IP_STAT_BCAST_BYTES;
                                        }
                                        else
                                                type = IP_STAT_MCAST_BYTES;
                                }
                                else
                                        type = IP_STAT_UCAST_BYTES;

                                p_netbuffer = s_buf->p_curr_nb;
                                ipoib_inc_send_stat( p_port->p_adapter, type,
                                                                                NET_BUFFER_DATA_LENGTH(p_netbuffer) );
                                status = NDIS_STATUS_SUCCESS;
                                break;

                        case IB_WCS_WR_FLUSHED_ERR:
                                IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_SEND,
                                        ("RC Flushed send completion.\n") );
                                        ipoib_inc_send_stat(p_port->p_adapter, IP_STAT_DROPPED, 0);
                                status = NDIS_STATUS_RESET_IN_PROGRESS;
                                if( !send_failed )
                                        send_failed = (ib_api_status_t)p_wc->status;
                                break;

                        default:
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("RC Send failed with %s (vendor specific %#x)\n",
                                        p_ibal->get_wc_status_str( p_wc->status ),
                                        (int)p_wc->vendor_specific) );
                                        ipoib_inc_send_stat( p_port->p_adapter, IP_STAT_ERROR, 0 );
                                status = NDIS_STATUS_FAILURE;
                                send_failed = (ib_api_status_t)p_wc->status;
                        }

                        cl_perf_start( CMFreeSendBuf );
                        ipoib_send_complete_net_buffer( s_buf,
                                                                                        status,
                                                                                        NDIS_SEND_COMPLETE_FLAGS_DISPATCH_LEVEL,
                                                                                        TRUE );
                        cl_perf_stop( perf, CMFreeSendBuf );

                        cl_atomic_dec( &p_port->send_mgr.depth );

                        p_wc = p_wc->p_next;
                        cl_perf_inc( CMSendCompBundle );
                }
                /* If we didn't use up every WC, break out. */
        } while( !p_free );
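        /* At this point the send CQ has been drained.  Any send completion error
         * (including flushes) marks this RC Tx path as unusable: the send QP
         * handle is stashed for later destruction, the Tx resource free is
         * queued and the endpoint reverts to UD; otherwise the CQ is rearmed. */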

        if ( send_failed )
        {
                /* revert to UD only */
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Port[%d] start CM Tx resources release EP %s\n",
                                p_port->port_num, p_endpt->tag) );
                p_endpt->conn.h_send_qp_err = p_endpt->conn.h_send_qp; // for later destroy.
                p_endpt->conn.h_send_qp = NULL; // prevent Tx on invalid QP
                __queue_tx_resource_free( p_port, p_endpt );
                endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECT_CLEANUP );
        }
        else
        {
                /* Rearm the CQ. */
                ib_status = p_ibal->rearm_cq( h_cq, FALSE );
                CL_ASSERT( ib_status == IB_SUCCESS );
        }

        ipoib_endpt_deref( p_endpt );

        cl_perf_stop( perf, CMSendCb );

        cl_perf_update_ctr( perf, CMSendCompBundle );

        cl_spinlock_release( &p_port->send_lock );

        IPOIB_EXIT( IPOIB_DBG_SEND );
}


static inline NET_BUFFER_LIST*
__cm_buf_mgr_get_NBL(
                IN                              ipoib_port_t* const                     p_port,
                IN                              ipoib_cm_recv_desc_t* const     p_desc )
{
        NET_BUFFER_LIST         *p_net_buffer_list;
        MDL                                     *p_mdl;
        PNET_BUFFER                     NetBuffer;
#if !DBG
        UNUSED_PARAM( p_port );
#endif

        CL_ASSERT( p_desc->p_NBL );
        CL_ASSERT( p_desc->p_mdl );
        CL_ASSERT( p_port == IPOIB_PORT_FROM_NBL( p_desc->p_NBL ) );

        p_net_buffer_list = p_desc->p_NBL;

        NetBuffer = NET_BUFFER_LIST_FIRST_NB( p_net_buffer_list );
        p_mdl = NET_BUFFER_FIRST_MDL( NetBuffer );

        CL_ASSERT( p_mdl == p_desc->p_mdl );
        CL_ASSERT( NET_BUFFER_CURRENT_MDL( NetBuffer ) == p_mdl );

        NET_BUFFER_DATA_LENGTH( NetBuffer ) = p_desc->len;
        NdisAdjustMdlLength( p_mdl, p_desc->len );
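        /* The NB data length and MDL byte count now reflect only the bytes
         * actually received, so NDIS will not see stale buffer contents. */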

        return p_net_buffer_list;
}


static ib_api_status_t
__cm_recv_mgr_prepare_NBL(
        IN                      ipoib_port_t* const                             p_port,
        IN                      ipoib_cm_recv_desc_t* const             p_desc,
        OUT                     NET_BUFFER_LIST** const                 pp_net_buffer_list )
{
        NDIS_STATUS             status;
        uint32_t                pkt_filter;
        PNET_BUFFER             NetBuffer;

        PERF_DECLARE( CMGetNdisPkt );

        XIPOIB_ENTER( IPOIB_DBG_RECV );

        pkt_filter = p_port->p_adapter->packet_filter;

        CL_ASSERT( p_desc->recv_mode == RECV_RC );

        /* Check the packet filter. */
        switch( p_desc->type )
        {
        default:
        case PKT_TYPE_UCAST:

                if( pkt_filter & NDIS_PACKET_TYPE_PROMISCUOUS ||
                        pkt_filter & NDIS_PACKET_TYPE_ALL_FUNCTIONAL ||
                        pkt_filter & NDIS_PACKET_TYPE_SOURCE_ROUTING ||
                        pkt_filter & NDIS_PACKET_TYPE_DIRECTED )
                {
                        /* OK to report. */
                        status = NDIS_STATUS_SUCCESS;
                        IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_RECV,
                                ("Received RC UCAST PKT.\n") );
                }
                else
                {
                        status = NDIS_STATUS_FAILURE;
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_RECV,
                                ("Received UCAST PKT with ERROR !!!!\n"));
                }
                break;

        case PKT_TYPE_BCAST:
                if( pkt_filter & NDIS_PACKET_TYPE_PROMISCUOUS ||
                        pkt_filter & NDIS_PACKET_TYPE_BROADCAST )
                {
                        /* OK to report. */
                        status = NDIS_STATUS_SUCCESS;
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                ("Received BCAST PKT.\n"));
                }
                else
                {
                        status = NDIS_STATUS_FAILURE;
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Received BCAST PKT with ERROR !!!!\n"));
                }
                break;

        case PKT_TYPE_MCAST:
                if( pkt_filter & NDIS_PACKET_TYPE_PROMISCUOUS ||
                        pkt_filter & NDIS_PACKET_TYPE_ALL_MULTICAST ||
                        pkt_filter & NDIS_PACKET_TYPE_MULTICAST )
                {
                        /* OK to report. */
                        status = NDIS_STATUS_SUCCESS;
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                ("Received MCAST PKT.\n"));
                }
                else
                {
                        status = NDIS_STATUS_FAILURE;
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Received MCAST PKT with ERROR !!!!\n"));
                }
                break;
        }

        if( status != NDIS_STATUS_SUCCESS )
        {
                /* Return the receive descriptor to the pool. */
                __cm_buf_mgr_put_recv( p_port, p_desc, FALSE, NULL );
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Packet filter doesn't match receive.  Dropping.\n") );
                /*
                 * Return IB_NOT_DONE since the packet has been completed,
                 * but has not consumed an array entry.
                 */
                return IB_NOT_DONE;
        }

        cl_perf_start( CMGetNdisPkt );
        *pp_net_buffer_list = __cm_buf_mgr_get_NBL( p_port, p_desc );
        cl_perf_stop( &p_port->p_adapter->perf, CMGetNdisPkt );
        if( !*pp_net_buffer_list )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("__cm_buf_mgr_get_NBL failed\n") );
                return IB_INSUFFICIENT_RESOURCES;
        }

        NetBuffer = NET_BUFFER_LIST_FIRST_NB(*pp_net_buffer_list);
        NET_BUFFER_DATA_LENGTH(NetBuffer) = p_desc->len;
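        /* Fill in the checksum offload info for this NBL according to the
         * configured mode: DISABLED clears the results, ENABLED passes the
         * hardware results through, BYPASS marks all checksums as succeeded. */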

        switch( p_port->p_adapter->params.recv_chksum_offload )
        {
          default:
                CL_ASSERT( FALSE );
                break;

          case CSUM_DISABLED:
                p_desc->csum.Value = 0;
                NET_BUFFER_LIST_INFO(*pp_net_buffer_list, TcpIpChecksumNetBufferListInfo) =
                (PVOID)p_desc->csum.Value;
                break;

          case CSUM_ENABLED:
                /* Take the checksum results directly from the completion info;
                   none of the checksum results can be false here.
                   If the hardware checksum failed or was not calculated, NDIS
                   will recompute it. */
                NET_BUFFER_LIST_INFO(*pp_net_buffer_list, TcpIpChecksumNetBufferListInfo) =
                (PVOID)p_desc->csum.Value;
                break;

          case CSUM_BYPASS:
                p_desc->csum.Value = 0;
                /* Flag the checksums as having been calculated. */
                p_desc->csum.Receive.TcpChecksumSucceeded =
                p_desc->csum.Receive.UdpChecksumSucceeded =
                p_desc->csum.Receive.IpChecksumSucceeded = TRUE;
                NET_BUFFER_LIST_INFO(*pp_net_buffer_list, TcpIpChecksumNetBufferListInfo) =
                (PVOID)p_desc->csum.Value;
                break;
        }

        NET_BUFFER_LIST_STATUS( *pp_net_buffer_list ) = NDIS_STATUS_SUCCESS;

        XIPOIB_EXIT( IPOIB_DBG_RECV );
        return IB_SUCCESS;
}


static uint32_t
__cm_recv_mgr_build_NBL_list(
        IN              ipoib_port_t* const             p_port,
        IN              ipoib_endpt_t* const    p_endpt,
        IN              cl_qlist_t*                     p_done_list,
        IN OUT  int32_t* const                  p_discarded,
        IN OUT  int32_t* const                  p_bytes_recv,
        IN OUT  NET_BUFFER_LIST** const p_NBL_head )
{
        cl_list_item_t                  *p_item;
        ipoib_cm_recv_desc_t    *p_desc;
        uint32_t                                i = 0;
        ib_api_status_t                 status;
        NET_BUFFER_LIST                 *p_NBL;
        NET_BUFFER_LIST                 *p_tail=NULL;

        PERF_DECLARE( CMPreparePkt );

        XIPOIB_ENTER( IPOIB_DBG_RECV );

        *p_discarded = 0;
        *p_bytes_recv = 0;
        *p_NBL_head = NULL;

        /* Move any existing receives to the head of p_done_list to preserve ordering */
        if ( p_done_list ) {
                cl_qlist_insert_list_head( p_done_list, &p_endpt->cm_recv.done_list );
        } else {
                p_done_list = &p_endpt->cm_recv.done_list;
        }

        p_item = cl_qlist_remove_head( p_done_list );
        while( p_item != cl_qlist_end( p_done_list ) )
        {
                p_desc = (ipoib_cm_recv_desc_t*)p_item;

                CL_ASSERT( p_desc->recv_mode == RECV_RC );
                cl_perf_start( CMPreparePkt );
                status = __cm_recv_mgr_prepare_NBL( p_port, p_desc, &p_NBL );
                cl_perf_stop( &p_port->p_adapter->perf, CMPreparePkt );

                if( status == IB_SUCCESS )
                {
                        if ( i == 0 )
                                *p_NBL_head = p_tail = p_NBL;
                        else
                        {       // not 1st NBL, Link NBLs together
                                NET_BUFFER_LIST_NEXT_NBL(p_tail) = p_NBL;
                                p_tail = p_NBL;
                        }
                        i++;
                        *p_bytes_recv += p_desc->len;
                        IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_CM,
                                ("NBL[%d] len %d\n",(i-1),p_desc->len) );
                }
                else if( status == IB_NOT_DONE )
                {
                        IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("__cm_recv_mgr_prepare_NBL returned IB_NOT_DONE, discard pkt.\n") );
                        (*p_discarded)++;
                }
                else
                {
                        IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("__cm_recv_mgr_prepare_NBL returned %s\n",
                                        p_endpt->p_ifc->get_err_str( status )) );

                        /* Put the remaining completed receives back on the endpoint's done list. */
                        if ( p_done_list != &p_endpt->cm_recv.done_list)
                        {
                                cl_qlist_insert_tail( &p_endpt->cm_recv.done_list, p_item );
                                cl_qlist_insert_list_tail( &p_endpt->cm_recv.done_list, p_done_list );
                        } else
                                cl_qlist_insert_head( &p_endpt->cm_recv.done_list, p_item );

                        break;
                }
                p_item = cl_qlist_remove_head( p_done_list );
        }

        XIPOIB_EXIT( IPOIB_DBG_RECV );
        return i;
}


/* cm_buf_mgr.lock held */

static ipoib_cm_recv_desc_t*
__cm_buf_mgr_get_recv_locked(
        IN               ipoib_port_t* const            p_port,
        IN               ipoib_endpt_t* const           p_endpt )
{
        ipoib_cm_recv_desc_t    *p_desc;

        XIPOIB_ENTER( IPOIB_DBG_RECV );
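        /* Descriptors come from the pre-allocated recv_pool; 'posted' counts the
         * descriptors currently out for SRQ posting, and the oop_list appears to
         * track every descriptor while it is out of the pool. */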

        p_desc = (ipoib_cm_recv_desc_t*)cl_qpool_get( &p_port->cm_buf_mgr.recv_pool );
        if( p_desc )
        {
                p_desc->p_endpt = p_endpt;
                InterlockedIncrement( &p_port->cm_buf_mgr.posted );
                cl_qlist_insert_tail( &p_port->cm_buf_mgr.oop_list, &p_desc->list_item );
                CL_ASSERT( p_desc->wr.wr_id == (uintn_t)p_desc );
                CL_ASSERT( p_desc->local_ds[0].vaddr == cl_get_physaddr(p_desc->p_buf) );
                CL_ASSERT( p_desc->local_ds[0].length > 0 );
        }

        XIPOIB_EXIT( IPOIB_DBG_RECV );
        return p_desc;
}


static ib_api_status_t
__cm_post_srq_recv(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt )
{
        ib_api_status_t                 ib_status = IB_SUCCESS;
        ipoib_cm_recv_desc_t    *p_head_desc = NULL;
        ipoib_cm_recv_desc_t    *p_tail_desc = NULL;
        ipoib_cm_recv_desc_t    *p_next_desc;
        ib_recv_wr_t                    *p_failed_wc = NULL;
        int                                             wanted;
        int                                             rx_cnt;
        int                                             posted;

        IPOIB_ENTER( IPOIB_DBG_RECV );

        posted = p_port->cm_buf_mgr.posted;
        wanted = p_port->p_adapter->params.rq_depth - posted;

#if DBG
        IPOIB_PRINT( TRACE_LEVEL_VERBOSE, IPOIB_DBG_RECV,
                ("Port[%d] posting %d RC bufs of limit(rq_depth %d) posted %d max %d\n",
                        p_port->port_num, wanted, p_port->p_adapter->params.rq_depth,
                        posted, p_port->cm_buf_mgr.recv_pool_depth) );
#endif

        cl_spinlock_acquire( &p_port->cm_buf_mgr.lock);
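        /* Chain up receive descriptors (each new descriptor becomes the head) so
         * the SRQ can be replenished below with a single post_srq_recv() call. */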

        for( rx_cnt=posted; rx_cnt < p_port->p_adapter->params.rq_depth; rx_cnt++)
        {
                /* Pull receives out of the pool to chain them up. */
                p_next_desc = __cm_buf_mgr_get_recv_locked( p_port, p_endpt );
                if( !p_next_desc )
                {
                        IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                ("RX descriptor pool exhausted! wanted %d provided %d\n",
                                        wanted, (rx_cnt - posted)) );
                        break;
                }

                if( !p_tail_desc )
                {
                        p_tail_desc = p_next_desc;
                        p_next_desc->wr.p_next = NULL;
                }
                else
                        p_next_desc->wr.p_next = &p_head_desc->wr;

                p_head_desc = p_next_desc;
        }
        cl_spinlock_release( &p_port->cm_buf_mgr.lock);

        if( p_head_desc )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                        ("Posting %d SRQ desc, already posted %d\n",
                                (rx_cnt - posted), posted) );

                ib_status = p_endpt->p_ifc->post_srq_recv( p_port->ib_mgr.h_srq,
                                                                                                   &p_head_desc->wr,
                                                                                                   &p_failed_wc );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("post_srq_recv() returned %s\n",
                                p_endpt->p_ifc->get_err_str( ib_status )) );

                        /* put descriptors back to the pool */
                        while( p_failed_wc )
                        {
                                p_head_desc = PARENT_STRUCT( p_failed_wc, ipoib_cm_recv_desc_t, wr );
                                p_failed_wc = p_failed_wc->p_next;
                                __cm_buf_mgr_put_recv( p_port, p_head_desc, TRUE, NULL );
                        }
                }
        }

        IPOIB_EXIT( IPOIB_DBG_RECV );
        return( ib_status );
}


/*
 * Posts receive buffers to the receive queue and returns the number
 * of receives needed to bring the RQ to its low water mark.  Note
 * that the value is signed, and can go negative.  All tests must
 * be for > 0.
 */
static int32_t
__cm_recv_mgr_repost(
        IN              ipoib_port_t* const             p_port,
        IN              ipoib_endpt_t* const    p_endpt )
{
        ipoib_cm_recv_desc_t    *p_head = NULL, *p_tail = NULL, *p_next;
        ib_api_status_t                 status;
        ib_recv_wr_t                    *p_failed;
        int                                             rx_cnt=0;
        int                                             rx_wanted;

        PERF_DECLARE( GetRecv );
        PERF_DECLARE( PostRecv );

        IPOIB_ENTER( IPOIB_DBG_RECV );

        CL_ASSERT( p_port );

        cl_obj_lock( &p_port->obj );
        if( p_port->state != IB_QPS_RTS )
        {
                cl_obj_unlock( &p_port->obj );
                IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                        ("Port in invalid state.  Not reposting.\n") );
                return 0;
        }

        ipoib_port_ref( p_port, ref_repost );
        cl_obj_unlock( &p_port->obj );

        cl_spinlock_acquire( &p_port->cm_buf_mgr.lock);
        rx_wanted = p_port->p_adapter->params.rq_depth - p_port->cm_buf_mgr.posted;

        while( p_port->cm_buf_mgr.posted < p_port->p_adapter->params.rq_depth )
        {
                /* Pull receives out of the pool and chain them up. */
                cl_perf_start( GetRecv );
                p_next = __cm_buf_mgr_get_recv_locked( p_port, p_endpt );
                cl_perf_stop( &p_port->p_adapter->perf, GetRecv );
                if( !p_next )
                {
                        IPOIB_PRINT(TRACE_LEVEL_VERBOSE, IPOIB_DBG_RECV,
                                ("Out of receive descriptors! recv queue depth %d\n",
                                p_port->cm_buf_mgr.posted) );
                        break;
                }

                if( !p_tail )
                {
                        p_tail = p_next;
                        p_next->wr.p_next = NULL;
                }
                else
                        p_next->wr.p_next = &p_head->wr;

                p_head = p_next;
                rx_cnt++;
        }
        cl_spinlock_release( &p_port->cm_buf_mgr.lock);

        if( p_head )
        {
                cl_perf_start( PostRecv );
                status = p_endpt->p_ifc->post_srq_recv( p_port->ib_mgr.h_srq,
                                                                                            &p_head->wr,
                                                                                            &p_failed );
                cl_perf_stop( &p_port->p_adapter->perf, PostRecv );

                if( status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("posting %d recv desc returned %s\n",
                                        rx_cnt, p_endpt->p_ifc->get_err_str( status )) );
                        /* return the descriptors to the pool */
                        while( p_failed )
                        {
                                p_head = PARENT_STRUCT( p_failed, ipoib_cm_recv_desc_t, wr );
                                p_failed = p_failed->p_next;
                                __cm_buf_mgr_put_recv( p_port, p_head, TRUE, NULL );
                        }
                }
                else
                {
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                ("CM RX bufs: wanted %d posted %d\n", rx_wanted, rx_cnt) );
                }
        }

        cl_spinlock_acquire( &p_port->cm_buf_mgr.lock);
        rx_cnt = p_port->p_adapter->params.rq_low_watermark - p_port->cm_buf_mgr.posted;
        cl_spinlock_release( &p_port->cm_buf_mgr.lock);

        ipoib_port_deref( p_port, ref_repost );

        IPOIB_EXIT( IPOIB_DBG_RECV );
        return rx_cnt;
}


/*
 * Post the specified number of receive buffers to the SRQ.
 * Returns the count of buffers that could not be posted (0 == full success).
 */

static int
__cm_recv_mgr_repost_grow(
        IN              ipoib_port_t* const             p_port,
        IN              ipoib_endpt_t* const    p_endpt,
        IN              int                                             grow_cnt )
{
        ipoib_cm_recv_desc_t    *p_head = NULL, *p_next;
        ib_api_status_t                 status;
        ib_recv_wr_t                    *p_failed;
        int                                             buf_cnt=0;
        int                                             buf_wanted=grow_cnt;

        PERF_DECLARE( GetRecv );
        PERF_DECLARE( PostRecv );

        IPOIB_ENTER( IPOIB_DBG_CM );

        CL_ASSERT( p_port );

        cl_obj_lock( &p_port->obj );
        if( p_port->state != IB_QPS_RTS )
        {
                cl_obj_unlock( &p_port->obj );
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Port state invalid; Not reposting for Rx pool Growth.\n") );
                return grow_cnt;
        }
        ipoib_port_ref( p_port, ref_repost );
        cl_obj_unlock( &p_port->obj );

        cl_spinlock_acquire( &p_port->cm_buf_mgr.lock);

        while (grow_cnt > 0)
        {
                /* Pull receives out of the pool and chain them up. */
                cl_perf_start( GetRecv );
                p_next = __cm_buf_mgr_get_recv_locked( p_port, p_endpt );
                cl_perf_stop( &p_port->p_adapter->perf, GetRecv );
                if( !p_next )
                {
                        IPOIB_PRINT(TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Out of receive descriptors! recv queue depth %d\n",
                                        p_port->cm_buf_mgr.posted) );
                        break;
                }

                if( !p_head )
                {
                        p_head = p_next;
                        p_head->wr.p_next = NULL;
                }
                else
                        p_next->wr.p_next = &p_head->wr;

                p_head = p_next;
                grow_cnt--;
                buf_cnt++;
        }
        cl_spinlock_release( &p_port->cm_buf_mgr.lock);

        if( p_head )
        {
                cl_perf_start( PostRecv );
                status = p_endpt->p_ifc->post_srq_recv( p_port->ib_mgr.h_srq,
                                                                                            &p_head->wr,
                                                                                            &p_failed );
                cl_perf_stop( &p_port->p_adapter->perf, PostRecv );

                if( status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("SRQ posting %d recv desc returned %s\n",
                                        buf_cnt, p_endpt->p_ifc->get_err_str( status )) );
                        /* return the descriptors to the pool */
                        while( p_failed )
                        {
                                p_head = PARENT_STRUCT( p_failed, ipoib_cm_recv_desc_t, wr );
                                p_failed = p_failed->p_next;
                                __cm_buf_mgr_put_recv( p_port, p_head, TRUE, NULL );
                        }
                }
                else
                {
                        if( (buf_wanted - buf_cnt) != 0 )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("SRQ growth: wanted %d posted %d, shortage %d\n",
                                                buf_wanted, buf_cnt, (buf_wanted - buf_cnt)) );
                        }
                }
        }
        ipoib_port_deref( p_port, ref_repost );

        IPOIB_EXIT( IPOIB_DBG_CM );
        return (buf_wanted - buf_cnt);
}


static IO_WORKITEM_ROUTINE __WorkItemCM;

static void
__WorkItemCM(
        IN                              DEVICE_OBJECT*                          p_dev_obj,
        IN                              void*                                           context )
{
        ipoib_port_t *p_port;
        ipoib_endpt_t *p_endpt;
        BOOLEAN WorkToDo = TRUE;
        KIRQL irql;
        uint32_t recv_cnt = 0;
        uint32_t total_recv_cnt = 0;

        UNREFERENCED_PARAMETER(p_dev_obj);
        IPOIB_ENTER( IPOIB_DBG_CM );

        p_endpt = (ipoib_endpt_t*)context;
        p_port = ipoib_endpt_parent( p_endpt );
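        /* Drain the endpoint's recv CQ at DISPATCH_LEVEL (the receive path
         * expects to run there), capping the work at 512 receives per pass; if
         * more work remains, the work item is re-queued rather than looping on
         * indefinitely. */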

        while (WorkToDo && total_recv_cnt < 512)
        {
                irql = KeRaiseIrqlToDpcLevel();
                WorkToDo = __cm_recv_internal( NULL, p_endpt, &recv_cnt );
                KeLowerIrql(irql);
                total_recv_cnt += recv_cnt;
        }

        if (WorkToDo)
        {
                IoQueueWorkItem( p_port->pPoWorkItemCM,
                                                (PIO_WORKITEM_ROUTINE) __WorkItemCM,
                                                 DelayedWorkQueue,
                                                 p_endpt );
        }
        else
        {
                // Release the reference that was taken when the work item was queued.
                ipoib_port_deref( p_port, ref_recv_cb );
        }
        IPOIB_EXIT( IPOIB_DBG_CM );
}


static BOOLEAN
__cm_recv_internal(
        IN              const   ib_cq_handle_t                  h_cq,   // can be NULL
        IN                              void                                    *cq_context,
        IN                              uint32_t                                *p_recv_cnt )
{
        ipoib_port_t            *p_port;
        ipoib_endpt_t           *p_endpt;
        ib_api_status_t         status;
        ib_wc_t                         wc[MAX_CM_RECV_WC], *p_free, *p_done_wc;
        int32_t                         NBL_cnt, recv_cnt = 0, shortage, discarded, bytes_received;
        cl_qlist_t                      done_list, bad_list;
        size_t                          bad_list_cnt;
        ULONG                           recv_complete_flags = 0;
        BOOLEAN                         res;
        cl_perf_t                       *p_perf;
        ib_al_ifc_t                     *p_ibal;
        BOOLEAN                         WorkToDo = FALSE;

        PERF_DECLARE( CMRecvCb );
        PERF_DECLARE( CMPollRecv );
        PERF_DECLARE( CMRepostRecv );
        PERF_DECLARE( CMFilterRecv );
        PERF_DECLARE( CMBuildNBLArray );
        PERF_DECLARE( CMRecvNdisIndicate );

        IPOIB_ENTER( IPOIB_DBG_RECV );

        cl_perf_start( CMRecvCb );

        /* This routine always runs at DISPATCH_LEVEL (CQ callback or raised-IRQL
         * work item), so indicate receives with the dispatch-level flag set. */
        recv_complete_flags |= NDIS_RECEIVE_FLAGS_DISPATCH_LEVEL;

        p_endpt = (ipoib_endpt_t*)cq_context;
        ASSERT( p_endpt );
#if DBG
        if( h_cq ) {ASSERT( h_cq == p_endpt->conn.h_recv_cq );}
#endif
        p_port = ipoib_endpt_parent( p_endpt );

        p_perf = &p_port->p_adapter->perf;
        p_ibal = p_port->p_adapter->p_ifc;

        cl_qlist_init( &done_list );
        cl_qlist_init( &bad_list );

        ipoib_port_ref( p_port, ref_cm_recv_cb );

        for( p_free=wc; p_free < &wc[MAX_CM_RECV_WC - 1]; p_free++ )
                p_free->p_next = p_free + 1;
        p_free->p_next = NULL;

        /*
         * We'll be accessing the endpoint map so take a reference
         * on it to prevent modifications.
         */
        cl_obj_lock( &p_port->obj );
        cl_atomic_inc( &p_port->endpt_rdr );
        cl_obj_unlock( &p_port->obj );

        do
        {
                /* If we get here, then the list of WCs is intact. */
                p_free = wc;
                cl_perf_start( CMPollRecv );
                status = p_ibal->poll_cq( p_endpt->conn.h_recv_cq, &p_free, &p_done_wc );
                cl_perf_stop( p_perf, CMPollRecv );
                if( status != IB_SUCCESS && status != IB_NOT_FOUND )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Port[%d] EP %s poll_cq() %s\n",
                                        p_port->port_num, p_endpt->tag,
                                        p_port->p_adapter->p_ifc->get_err_str(status)) );
                        break;
                }

                /* Look at the payload now and filter ARP and DHCP packets. */

                cl_perf_start( CMFilterRecv );
                recv_cnt += __cm_recv_mgr_filter( p_port,
                                                                                  p_endpt,
                                                                                  p_done_wc,
                                                                                  &done_list,
                                                                                  &bad_list );
                cl_perf_stop( p_perf, CMFilterRecv );

        } while( (!p_free) && (recv_cnt < 128) );

        /* We're done looking at the endpoint map, release the reference. */
        cl_atomic_dec( &p_port->endpt_rdr );

        bad_list_cnt = cl_qlist_count( &bad_list );
        if( bad_list_cnt > 0 )
                recv_cnt = (uint32_t) cl_qlist_count( &done_list );

        *p_recv_cnt = (uint32_t)recv_cnt;

        /* Return any discarded receives to the pool */
        if( bad_list_cnt > 0 )
        {
                /* RC connection - we are hosed. */
                IPOIB_PRINT(TRACE_LEVEL_ERROR,IPOIB_DBG_ERROR,
                        ("bad_list %d done list %d\n", (int)bad_list_cnt, recv_cnt) );
                __cm_buf_mgr_put_recv_list( p_port, &bad_list );
        }

        if( recv_cnt == 0 )
        {
                IPOIB_PRINT_EXIT(TRACE_LEVEL_ERROR,IPOIB_DBG_ERROR, ("recv_cnt == 0 ?\n") );
                ipoib_port_deref( p_port, ref_cm_recv_cb );
                IPOIB_EXIT( IPOIB_DBG_RECV );
                return FALSE;
        }
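        /* These completions are no longer outstanding on the SRQ; drop the
         * posted count before the shortage calculation below. */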

        cl_spinlock_acquire( &p_port->cm_buf_mgr.lock);

        if( recv_cnt > 0 )
                (void) InterlockedExchangeAdd( &p_port->cm_buf_mgr.posted, -recv_cnt );

        cl_spinlock_release( &p_port->cm_buf_mgr.lock);

        //cl_spinlock_acquire( &p_port->recv_lock );

        do
        {
                /* Estimate the number of receive buffers that must be posted to bring
                 * the SRQ back above the low-water mark; normally this is a large
                 * negative number.  Without holding the lock it is only a guess,
                 * which is acceptable here.
                 */
                shortage =
                        p_port->p_adapter->params.rq_low_watermark - p_port->cm_buf_mgr.posted;

                if( shortage > 0 )
                {
                        cl_perf_start( CMRepostRecv );
                        /* Repost ASAP so we don't starve the RQ. */
                        shortage = __cm_recv_mgr_repost_grow( p_port, p_endpt, shortage );
                        cl_perf_stop( p_perf, CMRepostRecv );

                        if( shortage > 0 )
                        {
                                recv_complete_flags |= NDIS_RECEIVE_FLAGS_RESOURCES;
                                cl_dbg_out("CM Rx SHORTAGE=%d\n",shortage);
                        }
                }

                cl_perf_start( CMBuildNBLArray );
                /* Notify NDIS of any and all possible receive buffers. */
                NBL_cnt = __cm_recv_mgr_build_NBL_list( p_port,
                                                                                                p_endpt,
                                                                                                &done_list,
                                                                                                &discarded,
                                                                                                &bytes_received,
                                                                                                &p_endpt->cm_recv.NBL );
                cl_perf_stop( p_perf, CMBuildNBLArray );

                /* If we discarded packets while still short on buffers, repost now. */
                if( discarded && shortage > 0 )
                {
                        /* We may have thrown away packets, and have a shortage */
                        cl_perf_start( CMRepostRecv );
                        __cm_recv_mgr_repost( p_port, p_endpt );
                        cl_perf_stop( p_perf, CMRepostRecv );
                }

                if( !NBL_cnt )
                {
                        /* normal all-done loop exit */
                        break;
                }
                //cl_spinlock_release( &p_port->recv_lock );

                cl_perf_start( CMRecvNdisIndicate );
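                /* Two indication paths follow: with no shortage, NDIS owns the
                 * indicated NBLs until it returns them; with a shortage, the NBLs are
                 * indicated with NDIS_RECEIVE_FLAGS_RESOURCES set and reclaimed
                 * immediately afterwards.  Either way a recv_shutter reference must
                 * be obtained first; if shutter_add() fails the NBLs are freed
                 * rather than indicated. */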

                if (shortage <= 0)      // normal case of posted RX > low water mark.
                {
                        res = shutter_add( &p_port->p_adapter->recv_shutter, NBL_cnt );
                        if (res)
                        {
                                ipoib_inc_recv_stat( p_port->p_adapter,
                                                                         IP_STAT_UCAST_BYTES,
                                                                         bytes_received,
                                                                         NBL_cnt );

                                NdisMIndicateReceiveNetBufferLists(
                                                                                        p_port->p_adapter->h_adapter,
                                                                                        p_endpt->cm_recv.NBL,
                                                                                        NDIS_DEFAULT_PORT_NUMBER,
                                                                                        NBL_cnt,
                                                                                        recv_complete_flags );
                        }
                        else
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("Port[%d] res == 0, free NBL\n",p_port->port_num) );
                                cl_spinlock_acquire( &p_port->recv_lock );
                                ipoib_free_received_NBL( p_port, p_endpt->cm_recv.NBL );
                                cl_spinlock_release( &p_port->recv_lock );
                        }
                }
                else
                {
                        /* shortage > 0: NDIS_RECEIVE_FLAGS_RESOURCES has already been set,
                           so the IPoIB driver regains ownership of the NET_BUFFER_LIST
                           structures as soon as the indication call returns.
                         */
                        res = shutter_add( &p_port->p_adapter->recv_shutter, 1 );
                        if (res)
                        {
                                ipoib_inc_recv_stat( p_port->p_adapter,
                                                                         IP_STAT_UCAST_BYTES,
                                                                         bytes_received,
                                                                         NBL_cnt );

                                NdisMIndicateReceiveNetBufferLists(
                                                                                        p_port->p_adapter->h_adapter,
                                                                                        p_endpt->cm_recv.NBL,
                                                                                        NDIS_DEFAULT_PORT_NUMBER,
                                                                                        NBL_cnt,
                                                                                        recv_complete_flags );
                                shutter_sub( &p_port->p_adapter->recv_shutter, -1 );
                        }
                        cl_perf_stop( p_perf, CMRecvNdisIndicate );

                        /*
                         * Cap the number of receives to put back to what we just indicated
                         * with NDIS_STATUS_RESOURCES.
                         */
                        cl_dbg_out("CM Rx SHORTAGE reposting all NBLs\n");
                        /* repost all NBLs */
                        cl_spinlock_acquire( &p_port->recv_lock );
                        ipoib_free_received_NBL( p_port, p_endpt->cm_recv.NBL );
                        cl_spinlock_release( &p_port->recv_lock );
                }
                //cl_spinlock_acquire( &p_port->recv_lock );

        } while( NBL_cnt );

        //cl_spinlock_release( &p_port->recv_lock );

        if (p_free )
        {
                /*
                 * Rearm after filtering to prevent contention on the end-point maps
                 * and to eliminate the possibility of a call to __endpt_cm_mgr_insert
                 * finding a duplicate.
                 */
                ASSERT( WorkToDo == FALSE );
                status = p_ibal->rearm_cq( p_endpt->conn.h_recv_cq, FALSE );
                CL_ASSERT( status == IB_SUCCESS );
        }
        else
        {
                if( h_cq && bad_list_cnt == 0 )
                {
                        // Take a reference to ensure no one releases the object while the
                        // work item is queued.
                        ipoib_port_ref( p_port, ref_recv_cb );
                        IoQueueWorkItem( p_port->pPoWorkItemCM,
                                                         (PIO_WORKITEM_ROUTINE) __WorkItemCM,
                                                         DelayedWorkQueue,
                                                         p_endpt );
                        WorkToDo = FALSE;
                }
                else
                        WorkToDo = TRUE;
        }
        cl_perf_stop( p_perf, CMRecvCb );

        ipoib_port_deref( p_port, ref_cm_recv_cb );

        IPOIB_EXIT( IPOIB_DBG_RECV );
        return WorkToDo;
}

static void
__cm_recv_cb(
        IN              const   ib_cq_handle_t                          h_cq,
        IN                              void                                            *cq_context )
{
        uint32_t        recv_cnt;
        boolean_t       WorkToDo;

        do {
                WorkToDo = __cm_recv_internal(h_cq, cq_context, &recv_cnt);
        } while( WorkToDo );
}


static void
__conn_reject(
        IN              ipoib_port_t* const             p_port,
        IN              ib_cm_handle_t                  h_cm_handle,
        IN              ib_rej_status_t                 rej_status )
{
        ib_api_status_t         ib_status;
        ib_cm_rej_t                     cm_rej;
        uint16_t                        ari_info;
        cm_private_data_t       private_data;

        IPOIB_ENTER( IPOIB_DBG_CM );

        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                ("CM REJECT SEND with rej_status %d\n", cl_ntoh16( rej_status ) ) );

        memset( &cm_rej, 0, sizeof( ib_cm_rej_t ) );
        cm_rej.rej_status = IB_REJ_USER_DEFINED;
        cm_rej.ari_length = sizeof( uint16_t );
        ari_info = rej_status;
        cm_rej.p_ari = (ib_ari_t *)&ari_info;

        private_data.ud_qpn = p_port->ib_mgr.qpn;
        private_data.recv_mtu =
                cl_hton32( p_port->p_adapter->params.cm_payload_mtu + sizeof(ipoib_hdr_t) );

        cm_rej.p_rej_pdata = (uint8_t *)&private_data;
        cm_rej.rej_length = sizeof( cm_private_data_t );

        ib_status = p_port->p_adapter->p_ifc->cm_rej( h_cm_handle, &cm_rej );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("cm_rej failed status %s\n",
                                p_port->p_adapter->p_ifc->get_err_str(ib_status)) );
        }

        IPOIB_EXIT( IPOIB_DBG_CM );
}


static void
__cq_async_event_cb(
        IN              ib_async_event_rec_t            *p_event_rec )
{
        ipoib_endpt_t*  p_endpt;
        ipoib_port_t*   p_port;

        p_endpt = (ipoib_endpt_t *)p_event_rec->context;
        p_port = ipoib_endpt_parent( p_endpt );

        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                ("SRQ CQ AsyncEvent EP %s event '%s' vendor code %#I64d\n",
                        p_endpt->tag, ib_get_async_event_str(p_event_rec->code),
                        p_event_rec->vendor_specific) );
}



static void
__srq_qp_async_event_cb(
        IN              ib_async_event_rec_t            *p_event_rec )
{
        ipoib_endpt_t*  p_endpt;
        ipoib_port_t*   p_port;

        IPOIB_ENTER( IPOIB_DBG_CM );

        p_endpt = (ipoib_endpt_t *)p_event_rec->context;
        ASSERT( p_endpt );
        p_port = ipoib_endpt_parent( p_endpt );
        ASSERT( p_port );

        switch( p_event_rec->code )
        {
          case IB_AE_SRQ_LIMIT_REACHED:
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("SRQ low-water mark reached(%d) EP %s grow posted by %d\n",
                                SRQ_LOW_WATER, p_endpt->tag, SRQ_MIN_GROWTH) );
                __cm_recv_mgr_repost_grow( p_port, p_endpt, SRQ_MIN_GROWTH );
                // TODO: consider queuing to endpoint thread an SRQ grow request.
                break;

          case IB_AE_SRQ_QP_LAST_WQE_REACHED:
                /* LAST_WQE_REACHED is normal for an SRQ QP: the endpoint's CQ has been
                 * flushed, so destroy the QP. */
                if( p_event_rec->handle.h_qp == p_endpt->conn.h_recv_qp )
                {
                        int how = -2; // always flush - tidy bowl spirit.

                        if( p_endpt->cm_ep_destroy )
                                how = -1;
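                        /* Negative which_res values tell cm_release_resources() it is being
                         * called from the async error path with the QP already in the ERR
                         * state: -2 releases only the Rx resources, -1 releases Tx & Rx. */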

                        cm_release_resources( p_port, p_endpt, how ); // QP already in ERR state.

                        if( p_endpt->cm_ep_destroy )
                        {
                                endpt_cm_set_state( p_endpt, IPOIB_CM_DISCONNECTED );
                                if( p_endpt->cm_ep_destroy == 1 )
                                {
                                        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                                ("Destroy Obj EP %s\n", p_endpt->tag) );
                                        cl_obj_destroy( &p_endpt->obj );
                                }
#if DBG
                                else
                                {
                                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                                ("EP %s OBJ destroy cnt %u ?\n",
                                                        p_endpt->tag, p_endpt->cm_ep_destroy) );
                                }
#endif
                        }
                }
                break;

          default:
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ERROR,
                        ("EP %s SRQ ASYNC EVENT(%d) '%s' vendor code %#I64d\n",
                                p_endpt->tag, p_event_rec->code,
                                ib_get_async_event_str(p_event_rec->code),
                                p_event_rec->vendor_specific) );
                break;
        }
        IPOIB_EXIT( IPOIB_DBG_CM );
}

void
endpt_cm_disconnect(
        IN              ipoib_endpt_t*  const   p_endpt )
{
        ib_api_status_t         ib_status;
        ib_cm_dreq_t            cm_dreq;
        cm_state_t                      cm_state;

        IPOIB_ENTER( IPOIB_DBG_CM );

        cm_state = endpt_cm_get_state( p_endpt );

        if( cm_state != IPOIB_CM_CONNECTED && cm_state != IPOIB_CM_DESTROY )
        {
                IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                        ("EP[%p] %s DREQ not sent, Incorrect conn state %s\n",
                                        p_endpt, p_endpt->tag, cm_get_state_str(cm_state)) );
                IPOIB_EXIT( IPOIB_DBG_CM );
                return;
        }

        endpt_cm_set_state( p_endpt, IPOIB_CM_DREQ_SENT );
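        /* A DREQ is issued for each RC QP this endpoint owns; the Tx (active)
         * and Rx (passive) sides are separate CM connections, so both may need
         * to be disconnected. */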

        if( p_endpt->conn.h_send_qp )
        {
                cm_dreq.h_qp = p_endpt->conn.h_send_qp;
                cm_dreq.qp_type = IB_QPT_RELIABLE_CONN;
                cm_dreq.p_dreq_pdata = NULL;
                cm_dreq.dreq_length = 0;
                cm_dreq.flags = 0;

                ib_status = p_endpt->p_ifc->cm_dreq( &cm_dreq );

                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                (" SEND QP Disconnect ENDPT %s QP status %s\n",
                                        p_endpt->tag, p_endpt->p_ifc->get_err_str(ib_status)) );
                }
#if DBG
                else
                {
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                (" SEND QP disconnect(DREQ) to EP %s prev %s\n",
                                        p_endpt->tag, cm_get_state_str(cm_state)) );
                }
#endif
        }

        if( p_endpt->conn.h_recv_qp )
        {
                cm_dreq.h_qp = p_endpt->conn.h_recv_qp;
                ib_status = p_endpt->p_ifc->cm_dreq( &cm_dreq );
                if( ib_status != IB_SUCCESS )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                (" RECV QP Disconnected(DREQ) EP %s QP status %s\n",
                                        p_endpt->tag, p_endpt->p_ifc->get_err_str(ib_status)) );
                }
#if DBG
                else
                {
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM,
                                (" RECV QP disconnect(DREQ) sent to EP %s\n",
                                        p_endpt->tag) );
                }
#endif
        }
        IPOIB_EXIT( IPOIB_DBG_CM );
}


#if IPOIB_CM

static void
__cm_buf_mgr_construct(
        IN              cm_buf_mgr_t * const            p_buf_mgr )
{
        cl_qpool_construct( &p_buf_mgr->recv_pool );

        cl_spinlock_construct( &p_buf_mgr->lock );

        p_buf_mgr->h_nbl_pool = NULL;
        p_buf_mgr->pool_init = FALSE;
}


ib_api_status_t
endpt_cm_buf_mgr_init(
        IN              ipoib_port_t* const                     p_port )
{
        cl_status_t             cl_status;
        ib_api_status_t ib_status=IB_SUCCESS;
        NET_BUFFER_LIST_POOL_PARAMETERS pool_parameters;

        IPOIB_ENTER( IPOIB_DBG_INIT );

        if( p_port->cm_buf_mgr.pool_init )
                return ib_status;

        __cm_buf_mgr_construct( &p_port->cm_buf_mgr );

        p_port->cm_buf_mgr.recv_pool_depth =
                min( (uint32_t) p_port->p_adapter->params.rq_depth * 8,
                                p_port->p_ca_attrs->max_srq_wrs/2 );

        DIPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_INIT,
                        ("Port[%d] cm_recv_mgr.recv_pool_depth %d max_srq_wrs/2 %d\n",
                        p_port->port_num,
                        p_port->cm_buf_mgr.recv_pool_depth,
                        p_port->p_ca_attrs->max_srq_wrs/2 ) );

        cl_qlist_init( &p_port->cm_buf_mgr.oop_list );
        cl_status = cl_spinlock_init( &p_port->cm_buf_mgr.lock );
        if( cl_status != CL_SUCCESS )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("cl_spinlock_init returned %#x\n", cl_status) );
                return IB_ERROR;
        }
        p_port->cm_buf_mgr.posted = 0;

	/* Allocate the NET_BUFFER_LIST pool for receive indications.
	 * In the recv_pool ctor routine an NBL & MDL are allocated and attached
	 * to the recv_desc.
	 */
	memset( &pool_parameters, 0, sizeof(NET_BUFFER_LIST_POOL_PARAMETERS) );
	pool_parameters.Header.Type = NDIS_OBJECT_TYPE_DEFAULT;
	pool_parameters.Header.Revision = NET_BUFFER_LIST_POOL_PARAMETERS_REVISION_1;
	pool_parameters.Header.Size = sizeof(pool_parameters);
	pool_parameters.ProtocolId = NDIS_PROTOCOL_ID_DEFAULT;
	pool_parameters.ContextSize = 0;
	pool_parameters.fAllocateNetBuffer = TRUE;
	pool_parameters.PoolTag = 'PRPI';
	pool_parameters.DataSize = 0;

	p_port->cm_buf_mgr.h_nbl_pool = NdisAllocateNetBufferListPool(
										p_port->p_adapter->h_adapter,
										&pool_parameters );
        if( !p_port->cm_buf_mgr.h_nbl_pool )
        {
                NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
                        EVENT_IPOIB_RECV_PKT_POOL, 1, NDIS_STATUS_RESOURCES );

                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("NdisAllocatePacketPool(cm_packet_pool)\n") );

                return IB_INSUFFICIENT_RESOURCES;
        }

        /* Allocate the receive descriptors pool */
        cl_status = cl_qpool_init( &p_port->cm_buf_mgr.recv_pool,
                                                           p_port->cm_buf_mgr.recv_pool_depth,
                                                           0,
                                                           0,
                                                           sizeof( ipoib_cm_recv_desc_t ),
                                                           __cm_recv_desc_ctor,
                                                           __cm_recv_desc_dtor,
                                                           p_port );

        if( cl_status != CL_SUCCESS )
        {
                NdisWriteErrorLogEntry( p_port->p_adapter->h_adapter,
                                                                EVENT_IPOIB_RECV_POOL, 1, cl_status );
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("cl_qpool_init(cm_buf_mgr.recv_pool) returned %#x\n", cl_status) );
                ib_status =  IB_INSUFFICIENT_MEMORY;
                goto pkt_pool_failed;
        }

        p_port->cm_buf_mgr.pool_init = TRUE;

        IPOIB_EXIT( IPOIB_DBG_INIT );

        return IB_SUCCESS;

pkt_pool_failed:

        NdisFreeNetBufferListPool( p_port->cm_buf_mgr.h_nbl_pool );
        p_port->cm_buf_mgr.h_nbl_pool = NULL;

        IPOIB_EXIT( IPOIB_DBG_INIT );
        return ib_status;
}
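
/*
 * Sizing note (illustrative numbers only, not taken from any spec):
 * recv_pool_depth above is min( rq_depth * 8, max_srq_wrs / 2 ).  Assuming,
 * for example, a UD receive queue depth of 512 and an HCA reporting 16K
 * max_srq_wrs, the CM receive pool holds min(4096, 8192) == 4096 descriptors;
 * an HCA limited to 4K max_srq_wrs would cap the pool at 2048 instead.
 */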


static void
__cm_buf_mgr_reset(
        IN              ipoib_port_t* const             p_port )
{
        cl_list_item_t          *p_item;

        IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                ("Port[%d] pool elements outstanding %d oop_list count %d\n",
                        p_port->port_num, p_port->cm_buf_mgr.posted,
                        (int)cl_qlist_count( &p_port->cm_buf_mgr.oop_list)) );

        if( cl_qlist_count( &p_port->cm_buf_mgr.oop_list ) )
        {
                ipoib_cm_recv_desc_t*   p_desc;

                for( p_item = cl_qlist_remove_head( &p_port->cm_buf_mgr.oop_list );
                        p_item != cl_qlist_end( &p_port->cm_buf_mgr.oop_list );
                        p_item =  cl_qlist_remove_head( &p_port->cm_buf_mgr.oop_list ) )
                {
                        p_desc = PARENT_STRUCT( p_item, ipoib_cm_recv_desc_t, list_item );
                        cl_qpool_put( &p_port->cm_buf_mgr.recv_pool, &p_desc->item );
                        InterlockedDecrement( &p_port->cm_buf_mgr.posted );
                }
        }

#if DBG
        if( p_port->cm_buf_mgr.posted )
        {
                IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                        ("Port[%d] CM Recv pool buffers outstanding %d?\n",
                                        p_port->port_num, p_port->cm_buf_mgr.posted) );
        }
#endif
}


void
endpt_cm_buf_mgr_destroy(
        IN              ipoib_port_t* const             p_port )
{
        IPOIB_ENTER(IPOIB_DBG_INIT );

        CL_ASSERT( p_port );

        /* Free the receive descriptors. */
	if( !p_port->cm_buf_mgr.pool_init )
	{
		IPOIB_EXIT( IPOIB_DBG_INIT );
		return;
	}

        p_port->cm_buf_mgr.pool_init = FALSE;

        /* return CM recv pool elements (descriptors) to the recv pool */
        __cm_buf_mgr_reset( p_port );

        cl_qpool_destroy( &p_port->cm_buf_mgr.recv_pool );

        if( p_port->cm_buf_mgr.h_nbl_pool )
        {
                NdisFreeNetBufferListPool( p_port->cm_buf_mgr.h_nbl_pool );
                p_port->cm_buf_mgr.h_nbl_pool = NULL;
        }

        cl_spinlock_destroy( &p_port->cm_buf_mgr.lock );

        IPOIB_EXIT(  IPOIB_DBG_INIT );
}
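
/*
 * Teardown ordering note for the routine above: cl_qpool_destroy() invokes
 * __cm_recv_desc_dtor() for every descriptor, and that dtor returns each
 * descriptor's NBL with NdisFreeNetBufferList(); all NBLs allocated from
 * cm_buf_mgr.h_nbl_pool must be freed before the pool itself can be freed.
 * The NBL pool is therefore released only after the qpool has been destroyed,
 * and the buffer-manager spinlock is destroyed last.
 */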

static cl_status_t
__cm_recv_desc_ctor(
        IN                              void* const                                     p_object,
        IN                              void*                                           context,
                OUT                     cl_pool_item_t** const          pp_pool_item )
{
        ipoib_cm_recv_desc_t*   p_desc;
        ipoib_port_t*                   p_port;
        int                                             lds, bytes;
        uint8_t*                                kva;

        CL_ASSERT( p_object );
        CL_ASSERT( context );

        p_desc = (ipoib_cm_recv_desc_t*)p_object;
        p_port = (ipoib_port_t*)context;

        /*
         * Allocate Rx buffer (PAGE_SIZE min).
	 * Extra space is allocated for the prefixed Ethernet header, which is
	 * synthesized prior to the NDIS receive indication.
         */
#define BUF_ALIGN 16
        p_desc->alloc_buf_size = p_port->p_adapter->params.cm_xfer_block_size + BUF_ALIGN;

        if( p_desc->alloc_buf_size < PAGE_SIZE )
                p_desc->alloc_buf_size = ROUNDUP( p_desc->alloc_buf_size, PAGE_SIZE );
        else if( p_desc->alloc_buf_size & (BUF_ALIGN - 1) )
                p_desc->alloc_buf_size = ROUNDUP( p_desc->alloc_buf_size, BUF_ALIGN );

        CL_ASSERT( (p_desc->alloc_buf_size / PAGE_SIZE) <= MAX_CM_RECV_SGE );

        p_desc->p_alloc_buf = (uint8_t *)ExAllocatePoolWithTag( NonPagedPool,
                                                                                                                        p_desc->alloc_buf_size,
                                                                                                                        'DOMC' );
        if( p_desc->p_alloc_buf == NULL )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Failed to allocate CM recv buffer size %d bytes.\n",
                        p_desc->alloc_buf_size ) );
                return CL_INSUFFICIENT_MEMORY;
        }

        p_desc->p_buf = p_desc->p_alloc_buf + DATA_OFFSET;
        p_desc->buf_size = p_desc->alloc_buf_size - DATA_OFFSET;

	/* Set up the RC recv local data segments, mapped by physical page.
	 * The UD memory key (which maps all physical memory) is reused, so each
	 * mapped page must be page aligned; no separate memory registration is
	 * needed.
	 */
        ASSERT( (((uintn_t)p_desc->p_alloc_buf) & (PAGE_SIZE-1)) == 0 ); // Page aligned?
        kva = p_desc->p_alloc_buf;
        bytes = p_desc->alloc_buf_size;

        for( lds=0; bytes > 0; lds++ )
        {
                p_desc->local_ds[lds].vaddr = cl_get_physaddr( (void*)kva );
                p_desc->local_ds[lds].lkey = p_port->ib_mgr.lkey;
                if( lds == 0 )
                {
			/* Advance to the next PAGE boundary: ds[0] is shortened because
			 * DATA_OFFSET bytes are reserved for the synthesized Ethernet
			 * header.
			 */
                        p_desc->local_ds[lds].vaddr += DATA_OFFSET;
                        if( bytes >= PAGE_SIZE )
                        {
                                p_desc->local_ds[lds].length = PAGE_SIZE - DATA_OFFSET;
                                bytes -= PAGE_SIZE;
                        }
                        else
                        {
                                p_desc->local_ds[lds].length = bytes;
                                bytes = 0;
                        }
                }
                else
                {
                        if( bytes >= PAGE_SIZE )
                        {
                                p_desc->local_ds[lds].length = PAGE_SIZE;
                                bytes -= PAGE_SIZE;
                        }
                        else
                        {
                                p_desc->local_ds[lds].length = bytes;
                                bytes = 0;
                        }
                }
                kva += PAGE_SIZE;
        }
        p_desc->wr.num_ds = lds;
	CL_ASSERT( lds <= MAX_CM_RECV_SGE );	// lds is the final segment count; the for() loop increment runs before exit.

        /* Setup the work request. */
        p_desc->wr.wr_id = (uintn_t)p_desc;
        p_desc->wr.ds_array = p_desc->local_ds;

        p_desc->type = PKT_TYPE_CM_UCAST;
        p_desc->recv_mode = RECV_RC;

        /* setup NDIS NetworkBufferList and MemoryDescriptorList */
        CL_ASSERT( p_port->p_adapter->h_adapter );

        p_desc->p_mdl = NdisAllocateMdl( p_port->p_adapter->h_adapter,
                                                                         p_desc->p_alloc_buf,
                                                                         p_desc->buf_size );
        if( !p_desc->p_mdl )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Failed to allocate MDL\n") );
                goto err1;
        }

        CL_ASSERT( p_port->cm_buf_mgr.h_nbl_pool );
        p_desc->p_NBL = NdisAllocateNetBufferAndNetBufferList(
                                                                                                        p_port->cm_buf_mgr.h_nbl_pool,
                                                                                                        0,
                                                                                                        0,
                                                                                                        p_desc->p_mdl,
                                                                                                        0,
                                                                                                        0 );
        if( !p_desc->p_NBL )
        {
                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Failed to allocate NET_BUFFER_LIST\n") );
                goto err2;
        }

#if NDIS_HINTS
        NdisClearNblFlag( p_desc->p_NBL,
                                          ( NDIS_NBL_FLAGS_IS_IPV4 | NDIS_NBL_FLAGS_IS_IPV6
                                          | NDIS_NBL_FLAGS_IS_TCP | NDIS_NBL_FLAGS_IS_UDP) );
#endif
        NET_BUFFER_LIST_NEXT_NBL(p_desc->p_NBL) = NULL;
        IPOIB_PORT_FROM_NBL( p_desc->p_NBL ) = p_port;
        IPOIB_CM_RECV_FROM_NBL( p_desc->p_NBL ) = p_desc;
        p_desc->p_NBL->SourceHandle = p_port->p_adapter->h_adapter;

        *pp_pool_item = &p_desc->item;

        return CL_SUCCESS;

err2:
        NdisFreeMdl( p_desc->p_mdl );
        p_desc->p_mdl = NULL;
err1:
        ExFreePoolWithTag( p_desc->p_alloc_buf, 'DOMC' );
        p_desc->p_alloc_buf = NULL;
        p_desc->p_buf = NULL;

        return CL_INSUFFICIENT_MEMORY;
}
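
/*
 * Illustrative layout for the descriptor built above (a sketch assuming 4KB
 * pages and DATA_OFFSET == sizeof(eth_hdr_t) - sizeof(ipoib_hdr_t); the real
 * buffer size comes from params.cm_xfer_block_size):
 *
 *   p_alloc_buf (page aligned)      p_buf = p_alloc_buf + DATA_OFFSET
 *   |<---- DATA_OFFSET ---->|<---- wire data: IPoIB header + payload ---->|
 *
 *   local_ds[0]: phys(page 0) + DATA_OFFSET, length PAGE_SIZE - DATA_OFFSET
 *   local_ds[1]: phys(page 1),               length PAGE_SIZE
 *   ...
 *   local_ds[n]: phys(page n),               length = remaining bytes
 *
 * Shortening local_ds[0] keeps every later segment page aligned while
 * reserving DATA_OFFSET bytes in front of the received IPoIB header; the
 * Ethernet header synthesized by __cm_recv_mgr_filter() starts at p_alloc_buf
 * and overlays that reserved space plus the IPoIB header.
 */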


static void
__cm_recv_desc_dtor(
        IN              const   cl_pool_item_t* const           p_pool_item,
        IN                              void                                            *context )
{
        ipoib_cm_recv_desc_t    *p_desc;

        if( p_pool_item == NULL || context == NULL )
                return;

        p_desc = PARENT_STRUCT( p_pool_item, ipoib_cm_recv_desc_t, item );

        if( p_desc->p_mdl )
        {
                NdisFreeMdl( p_desc->p_mdl );
                p_desc->p_mdl = NULL;
        }

        if( p_desc->p_NBL)
        {
                NdisFreeNetBufferList(p_desc->p_NBL);
                p_desc->p_NBL = NULL;
        }

        if( p_desc->p_alloc_buf )
                ExFreePoolWithTag( p_desc->p_alloc_buf, 'DOMC' );

        p_desc->p_alloc_buf = NULL;
        p_desc->p_buf = NULL;
}


static void
__cm_buf_mgr_put_recv(
        IN              ipoib_port_t * const            p_port,
        IN              ipoib_cm_recv_desc_t* const     p_desc,
        IN              BOOLEAN                                         update,
        IN              NET_BUFFER_LIST* const  p_net_buffer_list OPTIONAL )
{
        cm_buf_mgr_t * const    p_buf_mgr = &p_port->cm_buf_mgr;

        IPOIB_ENTER(IPOIB_DBG_RECV );

        if( p_net_buffer_list )
        {
                MDL                     *p_mdl = NULL;
                NET_BUFFER      *p_nbuf = NULL;

                ASSERT( p_desc->p_NBL );
                ASSERT( p_desc->p_mdl );
                ASSERT( p_net_buffer_list == p_desc->p_NBL );

                NET_BUFFER_LIST_NEXT_NBL(p_net_buffer_list) = NULL;

                p_nbuf = NET_BUFFER_LIST_FIRST_NB( p_net_buffer_list );
                p_mdl = NET_BUFFER_FIRST_MDL( p_nbuf );

                ASSERT( p_mdl == p_desc->p_mdl );
                ASSERT( NET_BUFFER_CURRENT_MDL( p_nbuf ) == p_mdl );

                /* reset Avail buffer lengths to full Rx size */
                NET_BUFFER_DATA_LENGTH( p_nbuf ) = p_desc->buf_size;
                NdisAdjustMdlLength( p_mdl, p_desc->buf_size );

#if NDIS_HINTS
                NdisClearNblFlag( p_desc->p_NBL,
                                                  ( NDIS_NBL_FLAGS_IS_IPV4 | NDIS_NBL_FLAGS_IS_IPV6
                                                  | NDIS_NBL_FLAGS_IS_TCP | NDIS_NBL_FLAGS_IS_UDP) );
#endif
        }

        cl_spinlock_acquire( &p_buf_mgr->lock );

	/* Remove the buffer from the oop_list & return the descriptor to its pool. */
        cl_qlist_remove_item( &p_buf_mgr->oop_list, &p_desc->list_item );
        p_desc->p_endpt = NULL;
        cl_qpool_put( &p_buf_mgr->recv_pool, &p_desc->item );

	/* A case exists where the .posted field has already been updated in
	 * cm_recv_cb(); when that accounting was done (update == FALSE), skip the
	 * decrement here, as the recv desc is only now returning to the pool
	 * after being released.
	 */
        if( update )
                InterlockedDecrement( &p_port->cm_buf_mgr.posted );

        cl_spinlock_release( &p_buf_mgr->lock );

        IPOIB_EXIT( IPOIB_DBG_RECV );
}

/*
 * Called when NDIS returns a receive NBL (NET_BUFFER_LIST) and the Rx buffer
 * has been identified, via the p_desc 'recv_mode' field, as a CM Rx buffer.
 * Repost the buffer to the SRQ.
 * See ipoib_port.cpp::__free_received_NBL().
 */
void
ipoib_cm_buf_mgr_put_recv(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_cm_recv_desc_t* const     p_desc,
        IN              NET_BUFFER_LIST* const          p_NBL OPTIONAL )
{
        ib_api_status_t ib_status = IB_SUCCESS;
        ib_recv_wr_t    *p_failed_wc = NULL;

        /* free the Net Buffer List and MDL */
        if( p_NBL )
        {
                NET_BUFFER      *p_nbuf = NULL;
                MDL                     *p_mdl = NULL;

                ASSERT( p_NBL == p_desc->p_NBL );
                NET_BUFFER_LIST_NEXT_NBL(p_NBL) = NULL;
                p_nbuf = NET_BUFFER_LIST_FIRST_NB( p_NBL );
                p_mdl = NET_BUFFER_FIRST_MDL( p_nbuf );
                ASSERT( p_mdl == p_desc->p_mdl );
                ASSERT( NET_BUFFER_CURRENT_MDL( p_nbuf ) == p_mdl );

                /* reset Avail buffer lengths to full Rx size */
                NET_BUFFER_DATA_LENGTH( p_nbuf ) = p_desc->buf_size;
                NdisAdjustMdlLength( p_mdl, p_desc->buf_size );

#if NDIS_HINTS
                NdisClearNblFlag( p_desc->p_NBL,
                                                  ( NDIS_NBL_FLAGS_IS_IPV4 | NDIS_NBL_FLAGS_IS_IPV6
                                                  | NDIS_NBL_FLAGS_IS_TCP | NDIS_NBL_FLAGS_IS_UDP) );
#endif
        }

        /* repost RC Rx desc */
        p_desc->wr.p_next = NULL; // just 1 buffer.

        ib_status = p_port->p_adapter->p_ifc->post_srq_recv( p_port->ib_mgr.h_srq,
                                                                                                                 &p_desc->wr,
                                                                                                                 &p_failed_wc );
        if( ib_status != IB_SUCCESS )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("post_srq_recv() returned %s\n",
                                p_port->p_adapter->p_ifc->get_err_str( ib_status )) );

                /* return descriptor back to the CM RX pool, buffer accounted for in
                 * cm_recv_internal().
                 */
                __cm_buf_mgr_put_recv( p_port, p_desc, FALSE, NULL );
        }
        else
        {
                /* adjust buffer accounting as cm_recv_internal() did the decrement. */
                InterlockedIncrement( &p_port->cm_buf_mgr.posted );
        }
}
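
#if 0
/* Illustrative caller sketch only (hypothetical, not part of this patch):
 * roughly how a return path such as ipoib_port.cpp::__free_received_NBL()
 * might hand a CM Rx buffer back for reposting once NDIS returns the NBL.
 * Variable names here are invented for the example.
 */
	ipoib_port_t			*p_ret_port = IPOIB_PORT_FROM_NBL( p_NBL );
	ipoib_cm_recv_desc_t	*p_ret_desc = IPOIB_CM_RECV_FROM_NBL( p_NBL );

	if( p_ret_desc->recv_mode == RECV_RC )
		ipoib_cm_buf_mgr_put_recv( p_ret_port, p_ret_desc, p_NBL );
#endif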


static void
__cm_buf_mgr_put_recv_list(
        IN              ipoib_port_t* const                     p_port,
        IN              cl_qlist_t* const                       p_list )
{
        cm_buf_mgr_t*           p_buf_mgr = &p_port->cm_buf_mgr;
        ipoib_cm_recv_desc_t*   p_desc;
        cl_list_item_t*                 p_item;

        IPOIB_ENTER( IPOIB_DBG_RECV );

        cl_spinlock_acquire( &p_buf_mgr->lock);
        p_item = cl_qlist_remove_head( p_list );

        while( p_item != cl_qlist_end( p_list ) )
        {
                p_desc = (ipoib_cm_recv_desc_t*)p_item;
                ASSERT( p_desc->p_endpt );
                cl_qlist_remove_item( &p_buf_mgr->oop_list, &p_desc->list_item );
                p_desc->p_endpt = NULL;
			/* Return the descriptor to its global port buffer pool. */
                cl_qpool_put( &p_buf_mgr->recv_pool, &p_desc->item );
                InterlockedDecrement( &p_port->cm_buf_mgr.posted );
                p_item = cl_qlist_remove_head( p_list );
        }
        cl_spinlock_release( &p_buf_mgr->lock);

        IPOIB_EXIT( IPOIB_DBG_RECV );
}


static int32_t
__cm_recv_mgr_filter(
        IN              ipoib_port_t* const                     p_port,
        IN              ipoib_endpt_t* const            p_endpt,
        IN              ib_wc_t* const                          p_done_wc_list,
        OUT             cl_qlist_t* const                       p_done_list,
        OUT             cl_qlist_t* const                       p_bad_list )
{
        ib_api_status_t                 ib_status;
        ipoib_cm_recv_desc_t    *p_desc;
        ib_wc_t                                 *p_wc;
        ipoib_pkt_t                             *p_ipoib;
        eth_pkt_t                               *p_eth;
        int32_t                                 recv_cnt=0;
        uint32_t                                len;

        IPOIB_ENTER( IPOIB_DBG_CM_RECV );

        ASSERT( ipoib_endpt_parent( p_endpt ) == p_port );

        for( p_wc = p_done_wc_list; p_wc; p_wc = p_wc->p_next )
        {
                p_desc = (ipoib_cm_recv_desc_t *)(uintn_t)p_wc->wr_id;
                p_desc->p_endpt = p_endpt; // use correct EndPoint pointer.
                recv_cnt++;
                if(  p_wc->status != IB_WCS_SUCCESS )
                {
                        if( p_wc->status != IB_WCS_WR_FLUSHED_ERR )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("EP %s Failed RC completion %s (vendor specific %#x)\n",
                                        p_endpt->tag, p_endpt->p_ifc->get_wc_status_str( p_wc->status ),
                                        (int)p_wc->vendor_specific) );
                        }
                        else
                        {
                                IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                        ("EP %s Flushed RC completion %s\n",
                                        p_endpt->tag, p_endpt->p_ifc->get_wc_status_str(p_wc->status)));
                        }
                        ipoib_inc_recv_stat( p_port->p_adapter, IP_STAT_ERROR, 0, 0 );
                        cl_qlist_insert_tail( p_bad_list, &p_desc->item.list_item );
                        continue;
                }

                if( p_wc->length < sizeof(ipoib_hdr_t) )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Received ETH packet(%d) < min size(%d)\n",
                                        p_wc->length, sizeof(ipoib_hdr_t)) );
                        ipoib_inc_recv_stat( p_port->p_adapter, IP_STAT_ERROR, 0, 0 );
                        cl_qlist_insert_tail( p_bad_list, &p_desc->item.list_item );
                        continue;
                }

                if( (p_wc->length - sizeof(ipoib_hdr_t)) >
                                        p_port->p_adapter->params.cm_payload_mtu )
                {
                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Received ETH packet len %d > CM payload MTU (%d)\n",
                                (p_wc->length - sizeof(ipoib_hdr_t)),
                                p_port->p_adapter->params.cm_payload_mtu) );
                        ipoib_inc_recv_stat( p_port->p_adapter, IP_STAT_ERROR, 0, 0 );
                        cl_qlist_insert_tail( p_bad_list, &p_desc->item.list_item );
                        continue;
                }

                /*
                 * Successful RX completion.
                 * Setup the ethernet/ip/arp header and queue descriptor for NDIS report.
                 */
                ib_status = IB_SUCCESS;
                p_desc->csum.Value = (PVOID)
                                        (( p_wc->recv.conn.recv_opt & IB_RECV_OPT_CSUM_MASK ) >> 8);

		/* WARNING - the IPoIB header is overwritten with an Ethernet header
		 * for the NDIS Rx indication.  What p_eth points at is not yet
		 * valid; it is initialized further on.
		 */
#if 0 // TODO - IPoIB-CM RFC wants support for GRH?
                if( p_wc->recv.conn.recv_opt & IB_RECV_OPT_GRH_VALID )
                {
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                ("RC Rx with GRH\n") );
                        CL_ASSERT(0);
                        p_ipoib = (ipoib_pkt_t *)(p_desc->p_buf + sizeof(ib_grh_t));
                        p_eth = (eth_pkt_t *)
                                                ((p_desc->p_buf + sizeof(ib_grh_t) + sizeof(ipoib_hdr_t))
                                                        - sizeof(eth_pkt_t));
                }
                else
#endif
                {
                        p_ipoib = (ipoib_pkt_t *) p_desc->p_buf;
                        p_eth = (eth_pkt_t*) p_desc->p_alloc_buf;
                }

                //__debugbreak();

                CL_ASSERT( p_desc->recv_mode == RECV_RC );

                switch( p_ipoib->hdr.type )
                {
                case ETH_PROT_TYPE_ARP:
                        if( p_wc->length < (sizeof(ipoib_hdr_t) + sizeof(ipoib_arp_pkt_t)) )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                ("Received ARP packet too short (wc_len %d)\n", p_wc->length) );
                                ib_status = IB_ERROR;
                                break;
                        }
                        ib_status = __endpt_cm_recv_arp( p_port, p_ipoib, p_eth, p_endpt );
                        break;

                case ETH_PROT_TYPE_IP:
                        if( p_wc->length < (sizeof(ipoib_hdr_t) + sizeof(ip_hdr_t)) )
                        {
                                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                        ("Received IP packet too short (wc_len %d)\n", p_wc->length) );
                                ib_status = IB_ERROR;
                                break;
                        }

#if NDIS_HINTS
                        NdisSetNblFlag( p_desc->p_NBL, NDIS_NBL_FLAGS_IS_IPV4 );
#endif

                        if( p_ipoib->type.ip.hdr.prot == IP_PROT_UDP )
                        {
#if NDIS_HINTS
                                NdisSetNblFlag( p_desc->p_NBL, NDIS_NBL_FLAGS_IS_UDP );
#endif
                                ib_status = __endpt_cm_recv_udp( p_port,
                                                                                                 p_wc,
                                                                                                 p_ipoib,
                                                                                                 p_eth,
                                                                                                 p_endpt );
                        }
                        else if( p_ipoib->type.ip.hdr.prot == IP_PROT_TCP )
                        {
                                len = sizeof(ipoib_hdr_t) + sizeof(ip_hdr_t) + sizeof(tcp_hdr_t);
                                if( p_wc->length < len )
                                {
                                        IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                                                ("Received short TCP packet wc_len %d < %d[ipoib+IP+TCP]\n",
                                                        p_wc->length, len) );
                                        ib_status = IB_ERROR;
                                }
#if NDIS_HINTS
                                NdisSetNblFlag( p_desc->p_NBL, NDIS_NBL_FLAGS_IS_TCP );
#endif
                        }
                        break;

                case ETH_PROT_TYPE_IPV6:
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_CM_RECV,
                                ("RC-Received IPv6\n") );
#if NDIS_HINTS
                        NdisSetNblFlag( p_desc->p_NBL, NDIS_NBL_FLAGS_IS_IPV6 );
#endif
                        break;

                default:
                        IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
                                ("RC-Received ?? ipoib packet (%#x) %s desc->p_buf %p\n",
                                        p_ipoib->hdr.type, get_eth_packet_type_str(p_ipoib->hdr.type),
                                        p_desc->p_buf) );
                        break;
                }

                if( ib_status != IB_SUCCESS )
                {
                        ipoib_inc_recv_stat( p_port->p_adapter, IP_STAT_ERROR, 0, 0 );
                        cl_qlist_insert_tail( p_bad_list, &p_desc->item.list_item );
                        continue;
                }

		/*
		 * Synthesize an Ethernet header for the NDIS receive indication.
		 * WARNING - the Ethernet header overlays the IPoIB header.
		 * The result is an Ethernet header followed by the received
		 * payload (the IPoIB packet minus its IPoIB header).
		 */
                p_eth->hdr.type = p_ipoib->hdr.type;
                p_eth->hdr.src = p_endpt->mac;
                p_eth->hdr.dst = p_port->p_adapter->mac;

		/* Set the Ethernet frame length (note p_buf != p_eth):
		 *   wc_length + (sizeof(eth_hdr_t) - sizeof(ipoib_hdr_t)), i.e.
		 *   wc_length + DATA_OFFSET.
		 */
                p_desc->len = p_wc->length + DATA_OFFSET;

                cl_qlist_insert_tail( p_done_list, &p_desc->item.list_item );
        }

        IPOIB_EXIT( IPOIB_DBG_CM_RECV );
        return recv_cnt;
}
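
/*
 * Worked length example for the header rewrite above (illustrative only,
 * assuming a 14-byte Ethernet header and a 4-byte IPoIB header, i.e.
 * DATA_OFFSET == 10): a completion with wc->length == 1504 carries the 4-byte
 * IPoIB header plus 1500 bytes of payload; once the Ethernet header is
 * written at p_alloc_buf, the frame indicated to NDIS is 14 + 1500 == 1514
 * bytes, which is exactly wc->length + DATA_OFFSET.
 */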


static ib_api_status_t
__endpt_cm_recv_arp(
        IN              ipoib_port_t* const                             p_port,
        IN              const   ipoib_pkt_t* const              p_ipoib,
        OUT             eth_pkt_t* const                                p_eth,
        IN              ipoib_endpt_t* const                    p_src_endpt )
{
        const ipoib_arp_pkt_t   *p_ib_arp;
        arp_pkt_t                               *p_arp;

        IPOIB_ENTER( IPOIB_DBG_ENDPT );

        p_ib_arp = &p_ipoib->type.arp;
        p_arp = &p_eth->type.arp;

        if( p_ib_arp->hw_type != ARP_HW_TYPE_IB ||
                p_ib_arp->hw_size != sizeof(ipoib_hw_addr_t) ||
                p_ib_arp->prot_type != ETH_PROT_TYPE_IP )
        {
                IPOIB_EXIT( IPOIB_DBG_ENDPT );
                return IB_ERROR;
        }

        p_arp->hw_type = ARP_HW_TYPE_ETH;
        p_arp->hw_size = sizeof(mac_addr_t);
        p_arp->src_hw = p_src_endpt->mac;
        p_arp->src_ip = p_ib_arp->src_ip;
        p_arp->dst_hw = p_port->p_local_endpt->mac;
        p_arp->dst_ip = p_ib_arp->dst_ip;

        IPOIB_EXIT( IPOIB_DBG_ENDPT );
        return IB_SUCCESS;
}
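
/*
 * Translation note for __endpt_cm_recv_arp() above: the received ARP carries
 * IB hardware addresses (hw_type ARP_HW_TYPE_IB, hw_size
 * sizeof(ipoib_hw_addr_t)), which NDIS cannot consume, so the routine
 * rewrites it as an Ethernet ARP (hw_type ARP_HW_TYPE_ETH, hw_size
 * sizeof(mac_addr_t)) using the source endpoint's MAC and the local port's
 * MAC, while the IP protocol addresses are copied through unchanged.
 */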

static ib_api_status_t
__endpt_cm_recv_udp(
        IN      ipoib_port_t* const     p_port,
        IN                      ib_wc_t* const                          p_wc,
        IN              const   ipoib_pkt_t* const              p_ipoib,
        OUT                     eth_pkt_t* const                        p_eth,
        IN                      ipoib_endpt_t* const            p_src_endpt )
{
        ib_api_status_t ib_status = IB_SUCCESS;

        IPOIB_ENTER( IPOIB_DBG_ENDPT );

        if( p_wc->length <
                (sizeof(ipoib_hdr_t) + sizeof(ip_hdr_t) + sizeof(udp_hdr_t)) )
        {
                IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
                        ("Received UDP packet too short\n") );
                IPOIB_EXIT( IPOIB_DBG_ENDPT );
                return IB_ERROR;
        }
        if( __cm_recv_is_dhcp( p_ipoib ) )
        {
                ib_status = ipoib_recv_dhcp( p_port,
                                                                         p_ipoib,
                                                                         p_eth,
                                                                         p_src_endpt,
                                                                         p_port->p_local_endpt );
        }

        IPOIB_EXIT( IPOIB_DBG_ENDPT );
        return ib_status;
}

static boolean_t
__cm_recv_is_dhcp(
        IN      const ipoib_pkt_t* const        p_ipoib )
{
        return( (p_ipoib->type.ip.prot.udp.hdr.dst_port == DHCP_PORT_SERVER &&
                                p_ipoib->type.ip.prot.udp.hdr.src_port == DHCP_PORT_CLIENT) ||
                                (p_ipoib->type.ip.prot.udp.hdr.dst_port == DHCP_PORT_CLIENT &&
                                p_ipoib->type.ip.prot.udp.hdr.src_port == DHCP_PORT_SERVER) );
}

#endif /* IPOIB_CM */