[ofw] [RFC] Remove path query from IPoIB

Hal Rosenstock hal.rosenstock at gmail.com
Wed Aug 6 06:55:10 PDT 2008


Hi Fab,

On Tue, Aug 5, 2008 at 6:25 PM, Fab Tillier
<ftillier at windows.microsoft.com> wrote:
> I wanted to get this out for comments before I complete testing.  This change removes path queries from unicast traffic in IPoIB.  It makes use of information from receive work completions (SLID, SGID) and the broadcast group (SL, flow label, hop limit, traffic class, static rate) to form the address vectors.

I don't think it's required that the flow label, hop limit, and
traffic class are the same for unicast and broadcast traffic. (I
forget about whether the same is true for SL). Also, static rate might
be pessimistic for unicast.

-- Hal

> This eliminates a path query during ARP resolution, which in a busy cluster can time out or take longer than the OS will allow for an ARP response.  Without this, IPoIB stops working, and then anything that depends on it (WSD & ND) stop too.
>
> Note that this is only for review and should not be committed yet as my testing isn't complete.
>
> Signed-off-by: Fab Tillier <ftillier at microsoft.com>
>
> Index: ulp/ipoib/kernel/ipoib_endpoint.c
> ===================================================================
> --- ulp/ipoib/kernel/ipoib_endpoint.c   (revision 1408)
> +++ ulp/ipoib/kernel/ipoib_endpoint.c   (working copy)
> @@ -183,6 +183,7 @@ ipoib_endpt_set_mcast(
>                return status;
>        }
>        p_endpt->h_mcast = p_mcast_rec->h_mcast;
> +       p_endpt->dlid = p_mcast_rec->p_member_rec->mlid;
>
>        IPOIB_EXIT( IPOIB_DBG_ENDPT );
>        return IB_SUCCESS;
> @@ -279,9 +280,8 @@ ipoib_endpt_queue(
>  {
>        ib_api_status_t status;
>        ipoib_port_t    *p_port;
> -       ib_query_req_t  query;
> -       ib_user_query_t info;
> -       ib_path_rec_t   path;
> +       ib_av_attr_t    av_attr;
> +       net32_t                 flow_lbl;
>
>        IPOIB_ENTER( IPOIB_DBG_ENDPT );
>
> @@ -291,108 +291,34 @@ ipoib_endpt_queue(
>                return NDIS_STATUS_SUCCESS;
>        }
>
> -       if( p_endpt->h_query ||
> -               p_endpt->qpn == CL_HTON32(0x00FFFFFF) )
> +       if( p_endpt->qpn == CL_HTON32(0x00FFFFFF) )
>        {
> +               /*
> +                * Handle a race between the mcast callback and a receive/send.  The QP
> +                * is joined to the MC group before the MC callback is received, so it
> +                * can receive packets, and NDIS can try to respond.  We need to delay
> +                * a response until the MC callback runs and sets the AV.
> +                */
>                ipoib_endpt_deref( p_endpt );
>                IPOIB_EXIT( IPOIB_DBG_ENDPT );
>                return NDIS_STATUS_PENDING;
>        }
>
> -       /* This is the first packet for this endpoint.  Query the SA. */
> -       p_port = __endpt_parent( p_endpt );
> -
> -       IPOIB_ENTER( IPOIB_DBG_ENDPT );
> -
> -       info.method = IB_MAD_METHOD_GETTABLE;
> -       info.attr_id = IB_MAD_ATTR_PATH_RECORD;
> -       info.attr_size = sizeof(ib_path_rec_t);
> -       info.comp_mask = IB_PR_COMPMASK_DGID | IB_PR_COMPMASK_SGID |
> -               IB_PR_COMPMASK_REVERSIBLE | IB_PR_COMPMASK_NUM_PATH;
> -       info.p_attr = &path;
> -
> -       cl_memclr( &path, sizeof(ib_path_rec_t) );
> -       path.dgid = p_endpt->dgid;
> -       ib_gid_set_default( &path.sgid, p_port->p_adapter->guids.port_guid.guid );
> -       path.num_path = 0x1;
> -
> -       cl_memclr( &query, sizeof(ib_query_req_t) );
> -       query.query_type = IB_QUERY_USER_DEFINED;
> -       query.p_query_input = &info;
> -       query.port_guid = p_port->p_adapter->guids.port_guid.guid;
> -       query.timeout_ms = p_port->p_adapter->params.sa_timeout;
> -       query.retry_cnt = p_port->p_adapter->params.sa_retry_cnt;
> -
> -       query.query_context = p_endpt;
> -       query.pfn_query_cb = __path_query_cb;
> -
> -       status = p_port->p_adapter->p_ifc->query(
> -               p_port->p_adapter->h_al, &query, &p_endpt->h_query );
> -       if( status != IB_SUCCESS )
> -       {
> -               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ENDPT,
> -                       ("ib_query for path returned %s\n",
> -                       p_port->p_adapter->p_ifc->get_err_str( status )) );
> -               ipoib_endpt_deref( p_endpt );
> -               /* Flag the adapter as hung. */
> -               p_port->p_adapter->hung = TRUE;
> -       }
> -
> -       IPOIB_EXIT( IPOIB_DBG_ENDPT );
> -       return NDIS_STATUS_PENDING;
> -}
> -
> -
> -static void
> -__path_query_cb(
> -       IN                              ib_query_rec_t                          *p_query_rec )
> -{
> -       ib_api_status_t status;
> -       ipoib_endpt_t   *p_endpt;
> -       ipoib_port_t    *p_port;
> -       ib_av_attr_t    av_attr;
> -       ib_path_rec_t   *p_path;
> -       net32_t                 flow_lbl;
> -
> -       IPOIB_ENTER( IPOIB_DBG_ENDPT );
> -
> -       p_endpt = (ipoib_endpt_t*)p_query_rec->query_context;
> +       /* This is the first packet for this endpoint.  Create the AV. */
>        p_port = __endpt_parent( p_endpt );
>
> -       cl_obj_lock( &p_endpt->obj );
> -       p_endpt->h_query = NULL;
> -       if( p_endpt->obj.state == CL_DESTROYING )
> -       {
> -               cl_obj_unlock( &p_endpt->obj );
> -               ipoib_endpt_deref( p_endpt );
> -               if( p_query_rec->p_result_mad )
> -                       p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );
> -               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> -                       ("Endpoint destroying, aborting.\n") );
> -               return;
> -       }
> -       cl_obj_unlock( &p_endpt->obj );
> -
> -       if( p_query_rec->status != IB_SUCCESS || !p_query_rec->result_cnt )
> -       {
> -               p_port->p_adapter->hung = TRUE;
> -               ipoib_endpt_deref( p_endpt );
> -               if( p_query_rec->p_result_mad )
> -                       p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );
> -               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> -                       ("Path query failed with %s\n",
> -                       p_port->p_adapter->p_ifc->get_err_str( p_query_rec->status )) );
> -               return;
> -       }
> -
> -       p_path = ib_get_query_path_rec( p_query_rec->p_result_mad, 0 );
> -
>        cl_memclr( &av_attr, sizeof(ib_av_attr_t) );
>
>        av_attr.port_num = p_port->port_num;
>
> -       av_attr.sl = ib_path_rec_sl( p_path );
> -       av_attr.dlid = p_path->dlid;
> +       ib_member_get_sl_flow_hop(
> +               p_port->ib_mgr.bcast_rec.sl_flow_hop,
> +               &av_attr.sl,
> +               &flow_lbl,
> +               &av_attr.grh.hop_limit
> +               );
> +
> +       av_attr.dlid = p_endpt->dlid;
>
>        /*
>         * We always send the GRH so that we preferably lookup endpoints
> @@ -402,38 +328,17 @@ __path_query_cb(
>         * for which there is no match, something that doesn't work when
>         * using LIDs only.
>         */
> -       flow_lbl = ib_path_rec_flow_lbl( p_path );
>        av_attr.grh_valid = TRUE;
>        av_attr.grh.ver_class_flow = ib_grh_set_ver_class_flow(
> -               6, p_path->tclass, flow_lbl );
> +               6, p_port->ib_mgr.bcast_rec.tclass, flow_lbl );
>        av_attr.grh.resv1 = 0;
>        av_attr.grh.resv2 = 0;
> -       av_attr.grh.hop_limit = ib_path_rec_hop_limit( p_path );
> -       av_attr.grh.src_gid = p_path->sgid;
> -       av_attr.grh.dest_gid = p_path->dgid;
> -
> -       cl_obj_lock( &p_port->obj );
> -       if( !p_endpt->dlid )
> -       {
> -               cl_map_item_t   *p_qitem;
> +       ib_gid_set_default( &av_attr.grh.src_gid, p_port->p_adapter->guids.port_guid.guid );
> +       av_attr.grh.dest_gid = p_endpt->dgid;
>
> -               /* This is a subnet local endpoint that does not have its LID set. */
> -               p_endpt->dlid = p_path->dlid;
> -               /*
> -                * Insert the item in the LID map so that locally routed unicast
> -                * traffic will resolve it properly.
> -                */
> -               p_qitem = cl_qmap_insert( &p_port->endpt_mgr.lid_endpts,
> -                       p_endpt->dlid, &p_endpt->lid_item );
> -               CL_ASSERT( p_qitem == &p_endpt->lid_item );
> -       }
> -       cl_obj_unlock( &p_port->obj );
> -       av_attr.static_rate = ib_path_rec_rate( p_path );
> +       av_attr.static_rate = p_port->ib_mgr.bcast_rec.rate;
>        av_attr.path_bits = 0;
>
> -       /* Done with the path record.  Release the MAD. */
> -       p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );
> -
>        /* Create the AV. */
>        status = p_port->p_adapter->p_ifc->create_av(
>                p_port->ib_mgr.h_pd, &av_attr, &p_endpt->h_av );
> @@ -445,13 +350,9 @@ __path_query_cb(
>                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>                        ("ib_create_av failed with %s\n",
>                        p_port->p_adapter->p_ifc->get_err_str( status )) );
> -               return;
> +               return NDIS_STATUS_FAILURE;
>        }
>
> -       /* Try to send all pending sends. */
> -       ipoib_port_resume( p_port );
> -
> -       /* Release the reference taken for the SA query. */
> -       ipoib_endpt_deref( p_endpt );
>        IPOIB_EXIT( IPOIB_DBG_ENDPT );
> +       return NDIS_STATUS_SUCCESS;
>  }
> Index: ulp/ipoib/kernel/ipoib_port.c
> ===================================================================
> --- ulp/ipoib/kernel/ipoib_port.c       (revision 1408)
> +++ ulp/ipoib/kernel/ipoib_port.c       (working copy)
> @@ -1812,7 +1812,7 @@ __recv_get_endpts(
>  #else  /* IPOIB_INLINE_RECV */
>                        *pp_src = ipoib_endpt_create( &p_desc->p_buf->ib.grh.src_gid,
>  #endif /* IPOIB_INLINE_RECV */
> -                               0, p_wc->recv.ud.remote_qp );
> +                               p_wc->recv.ud.remote_lid, p_wc->recv.ud.remote_qp );
>                        if( !*pp_src )
>                        {
>                                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> @@ -2392,13 +2392,10 @@ __recv_arp(
>                        return status;
>                }
>                /*
> -                * Create the endpoint.  Note that the LID is left blank and will be
> -                * resolved by a path query as needed.  This is done because the
> -                * remote LID/GID from the work completion may not be the original
> -                * initiator.
> +                * Create the endpoint.
>                 */
>                *pp_src = ipoib_endpt_create( &p_ib_arp->src_hw.gid,
> -                       0, (p_ib_arp->src_hw.flags_qpn & CL_HTON32(0x00FFFFFF)) );
> +                       p_wc->recv.ud.remote_lid, (p_ib_arp->src_hw.flags_qpn & CL_HTON32(0x00FFFFFF)) );
>
>                if( !*pp_src )
>                {
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>



More information about the ofw mailing list