[ofw] [PATCH 1/4] Avoid the SM

Hal Rosenstock halr at obsidianresearch.com
Thu Aug 28 07:51:53 PDT 2008


Hi Fab,

Fab Tillier wrote:
> This patch removes the path query from IPoIB when trying to send a unicast packet.  Instead it uses fields form the broadcast group and the work completion to form an address vector.  This speeds ARP responses.
>
> The limitation of this patch is that the rate for the address vector will be that of the broadcast group, which may be suboptimal in mixed SDR/DDR/QDR fabrics.
>   
Maybe I'm wrong about this but I think another limitation may be that 
this only works intra IB subnet (when the IPoIB subnet is contained 
within a single IB subnet). What I mean by that if the IPoIB subnet 
(e.g. broadcast group) spans more than 1 IB subnet, I don't think this 
works, right ? I know that is an "experimental" configuration currently. 
Has that been tried ?

-- Hal
> Signed-off-by: Fab Tillier <ftillier at microsoft.com>
>
> diff -up -r -X trunk\docs\dontdiff.txt -I \$Id: old\ulp\ipoib\kernel\ipoib_endpoint.c trunk\ulp\ipoib\kernel\ipoib_endpoint.c
> --- old\ulp\ipoib\kernel\ipoib_endpoint.c       Mon Jun 30 11:09:26 2008
> +++ trunk\ulp\ipoib\kernel\ipoib_endpoint.c     Thu Aug 21 08:41:29 2008
> @@ -183,6 +183,7 @@ ipoib_endpt_set_mcast(
>                 return status;
>         }
>         p_endpt->h_mcast = p_mcast_rec->h_mcast;
> +       p_endpt->dlid = p_mcast_rec->p_member_rec->mlid;
>
>         IPOIB_EXIT( IPOIB_DBG_ENDPT );
>         return IB_SUCCESS;
> @@ -279,9 +280,8 @@ ipoib_endpt_queue(
>  {
>         ib_api_status_t status;
>         ipoib_port_t    *p_port;
> -       ib_query_req_t  query;
> -       ib_user_query_t info;
> -       ib_path_rec_t   path;
> +       ib_av_attr_t    av_attr;
> +       net32_t                 flow_lbl;
>
>         IPOIB_ENTER( IPOIB_DBG_ENDPT );
>
> @@ -291,108 +291,34 @@ ipoib_endpt_queue(
>                 return NDIS_STATUS_SUCCESS;
>         }
>
> -       if( p_endpt->h_query ||
> -               p_endpt->qpn == CL_HTON32(0x00FFFFFF) )
> +       if( p_endpt->qpn == CL_HTON32(0x00FFFFFF) )
>         {
> +               /*
> +                * Handle a race between the mcast callback and a receive/send.  The QP
> +                * is joined to the MC group before the MC callback is received, so it
> +                * can receive packets, and NDIS can try to respond.  We need to delay
> +                * a response until the MC callback runs and sets the AV.
> +                */
>                 ipoib_endpt_deref( p_endpt );
>                 IPOIB_EXIT( IPOIB_DBG_ENDPT );
>                 return NDIS_STATUS_PENDING;
>         }
>
> -       /* This is the first packet for this endpoint.  Query the SA. */
> -       p_port = __endpt_parent( p_endpt );
> -
> -       IPOIB_ENTER( IPOIB_DBG_ENDPT );
> -
> -       info.method = IB_MAD_METHOD_GETTABLE;
> -       info.attr_id = IB_MAD_ATTR_PATH_RECORD;
> -       info.attr_size = sizeof(ib_path_rec_t);
> -       info.comp_mask = IB_PR_COMPMASK_DGID | IB_PR_COMPMASK_SGID |
> -               IB_PR_COMPMASK_REVERSIBLE | IB_PR_COMPMASK_NUM_PATH;
> -       info.p_attr = &path;
> -
> -       cl_memclr( &path, sizeof(ib_path_rec_t) );
> -       path.dgid = p_endpt->dgid;
> -       ib_gid_set_default( &path.sgid, p_port->p_adapter->guids.port_guid.guid );
> -       path.num_path = 0x1;
> -
> -       cl_memclr( &query, sizeof(ib_query_req_t) );
> -       query.query_type = IB_QUERY_USER_DEFINED;
> -       query.p_query_input = &info;
> -       query.port_guid = p_port->p_adapter->guids.port_guid.guid;
> -       query.timeout_ms = p_port->p_adapter->params.sa_timeout;
> -       query.retry_cnt = p_port->p_adapter->params.sa_retry_cnt;
> -
> -       query.query_context = p_endpt;
> -       query.pfn_query_cb = __path_query_cb;
> -
> -       status = p_port->p_adapter->p_ifc->query(
> -               p_port->p_adapter->h_al, &query, &p_endpt->h_query );
> -       if( status != IB_SUCCESS )
> -       {
> -               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ENDPT,
> -                       ("ib_query for path returned %s\n",
> -                       p_port->p_adapter->p_ifc->get_err_str( status )) );
> -               ipoib_endpt_deref( p_endpt );
> -               /* Flag the adapter as hung. */
> -               p_port->p_adapter->hung = TRUE;
> -       }
> -
> -       IPOIB_EXIT( IPOIB_DBG_ENDPT );
> -       return NDIS_STATUS_PENDING;
> -}
> -
> -
> -static void
> -__path_query_cb(
> -       IN                              ib_query_rec_t                          *p_query_rec )
> -{
> -       ib_api_status_t status;
> -       ipoib_endpt_t   *p_endpt;
> -       ipoib_port_t    *p_port;
> -       ib_av_attr_t    av_attr;
> -       ib_path_rec_t   *p_path;
> -       net32_t                 flow_lbl;
> -
> -       IPOIB_ENTER( IPOIB_DBG_ENDPT );
> -
> -       p_endpt = (ipoib_endpt_t*)p_query_rec->query_context;
> +       /* This is the first packet for this endpoint.  Create the AV. */
>         p_port = __endpt_parent( p_endpt );
>
> -       cl_obj_lock( &p_endpt->obj );
> -       p_endpt->h_query = NULL;
> -       if( p_endpt->obj.state == CL_DESTROYING )
> -       {
> -               cl_obj_unlock( &p_endpt->obj );
> -               ipoib_endpt_deref( p_endpt );
> -               if( p_query_rec->p_result_mad )
> -                       p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );
> -               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> -                       ("Endpoint destroying, aborting.\n") );
> -               return;
> -       }
> -       cl_obj_unlock( &p_endpt->obj );
> -
> -       if( p_query_rec->status != IB_SUCCESS || !p_query_rec->result_cnt )
> -       {
> -               p_port->p_adapter->hung = TRUE;
> -               ipoib_endpt_deref( p_endpt );
> -               if( p_query_rec->p_result_mad )
> -                       p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );
> -               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> -                       ("Path query failed with %s\n",
> -                       p_port->p_adapter->p_ifc->get_err_str( p_query_rec->status )) );
> -               return;
> -       }
> -
> -       p_path = ib_get_query_path_rec( p_query_rec->p_result_mad, 0 );
> -
>         cl_memclr( &av_attr, sizeof(ib_av_attr_t) );
>
>         av_attr.port_num = p_port->port_num;
>
> -       av_attr.sl = ib_path_rec_sl( p_path );
> -       av_attr.dlid = p_path->dlid;
> +       ib_member_get_sl_flow_hop(
> +               p_port->ib_mgr.bcast_rec.sl_flow_hop,
> +               &av_attr.sl,
> +               &flow_lbl,
> +               &av_attr.grh.hop_limit
> +               );
> +
> +       av_attr.dlid = p_endpt->dlid;
>
>         /*
>          * We always send the GRH so that we preferably lookup endpoints
> @@ -402,38 +328,17 @@ __path_query_cb(
>          * for which there is no match, something that doesn't work when
>          * using LIDs only.
>          */
> -       flow_lbl = ib_path_rec_flow_lbl( p_path );
>         av_attr.grh_valid = TRUE;
>         av_attr.grh.ver_class_flow = ib_grh_set_ver_class_flow(
> -               6, p_path->tclass, flow_lbl );
> +               6, p_port->ib_mgr.bcast_rec.tclass, flow_lbl );
>         av_attr.grh.resv1 = 0;
>         av_attr.grh.resv2 = 0;
> -       av_attr.grh.hop_limit = ib_path_rec_hop_limit( p_path );
> -       av_attr.grh.src_gid = p_path->sgid;
> -       av_attr.grh.dest_gid = p_path->dgid;
> -
> -       cl_obj_lock( &p_port->obj );
> -       if( !p_endpt->dlid )
> -       {
> -               cl_map_item_t   *p_qitem;
> +       ib_gid_set_default( &av_attr.grh.src_gid, p_port->p_adapter->guids.port_guid.guid );
> +       av_attr.grh.dest_gid = p_endpt->dgid;
>
> -               /* This is a subnet local endpoint that does not have its LID set. */
> -               p_endpt->dlid = p_path->dlid;
> -               /*
> -                * Insert the item in the LID map so that locally routed unicast
> -                * traffic will resolve it properly.
> -                */
> -               p_qitem = cl_qmap_insert( &p_port->endpt_mgr.lid_endpts,
> -                       p_endpt->dlid, &p_endpt->lid_item );
> -               CL_ASSERT( p_qitem == &p_endpt->lid_item );
> -       }
> -       cl_obj_unlock( &p_port->obj );
> -       av_attr.static_rate = ib_path_rec_rate( p_path );
> +       av_attr.static_rate = p_port->ib_mgr.bcast_rec.rate;
>         av_attr.path_bits = 0;
>
> -       /* Done with the path record.  Release the MAD. */
> -       p_port->p_adapter->p_ifc->put_mad( p_query_rec->p_result_mad );
> -
>         /* Create the AV. */
>         status = p_port->p_adapter->p_ifc->create_av(
>                 p_port->ib_mgr.h_pd, &av_attr, &p_endpt->h_av );
> @@ -445,13 +350,9 @@ __path_query_cb(
>                 IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>                         ("ib_create_av failed with %s\n",
>                         p_port->p_adapter->p_ifc->get_err_str( status )) );
> -               return;
> +               return NDIS_STATUS_FAILURE;
>         }
>
> -       /* Try to send all pending sends. */
> -       ipoib_port_resume( p_port );
> -
> -       /* Release the reference taken for the SA query. */
> -       ipoib_endpt_deref( p_endpt );
>         IPOIB_EXIT( IPOIB_DBG_ENDPT );
> +       return NDIS_STATUS_SUCCESS;
>  }
> diff -up -r -X trunk\docs\dontdiff.txt -I \$Id: old\ulp\ipoib\kernel\ipoib_port.c trunk\ulp\ipoib\kernel\ipoib_port.c
> --- old\ulp\ipoib\kernel\ipoib_port.c   Thu Aug 21 08:40:18 2008
> +++ trunk\ulp\ipoib\kernel\ipoib_port.c Thu Aug 21 08:41:29 2008
> @@ -1833,7 +1833,7 @@ __recv_get_endpts(
>  #else  /* IPOIB_INLINE_RECV */
>                         *pp_src = ipoib_endpt_create( &p_desc->p_buf->ib.grh.src_gid,
>  #endif /* IPOIB_INLINE_RECV */
> -                               0, p_wc->recv.ud.remote_qp );
> +                               p_wc->recv.ud.remote_lid, p_wc->recv.ud.remote_qp );
>                         if( !*pp_src )
>                         {
>                                 IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> @@ -2423,13 +2423,10 @@ __recv_arp(
>                         return status;
>                 }
>                 /*
> -                * Create the endpoint.  Note that the LID is left blank and will be
> -                * resolved by a path query as needed.  This is done because the
> -                * remote LID/GID from the work completion may not be the original
> -                * initiator.
> +                * Create the endpoint.
>                  */
>                 *pp_src = ipoib_endpt_create( &p_ib_arp->src_hw.gid,
> -                       0, (p_ib_arp->src_hw.flags_qpn & CL_HTON32(0x00FFFFFF)) );
> +                       p_wc->recv.ud.remote_lid, (p_ib_arp->src_hw.flags_qpn & CL_HTON32(0x00FFFFFF)) );
>
>                 if( !*pp_src )
>                 {
>   
> ------------------------------------------------------------------------
>
> _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw




More information about the ofw mailing list