[ofw] [PATCH 3/7] ib/cm: export CM only interface

Mon Jan 19 22:39:57 PST 2009

> +static NTSTATUS
> +cm_send_req(iba_cm_id *p_id, iba_cm_req *p_req)
> +{
> +       ib_qp_mod_t             attr;
> +       ib_api_status_t ib_status;
> +
> +       attr.state.rtr.rnr_nak_timeout = 0;

Why set rnr_nak_timeout?

> +       ib_status = kal_cep_pre_req(gh_al, p_id->cid, p_req, &attr);

So the attr parameter is marked as in/out, but it only seems to be used as an input parameter, and then only the rnr_nak_timeout is used - why not just pass in the rnr_nak_timeout directly rather than via the structure?

Note that kal_cep_pre_rep, though the declaration lists the QP attributes as an out parameter, it is used as an input parameter too.  For both of these, I would prefer to see the rnr_nak_timeout passed in as an input parameter, and the QP attributes become an optional output parameter that returns the attributes for performing the RESET->INIT QP transition.  I would anticipate that all users of the INIT attributes could easily be changed to either come up with the settings themselves (especially when it comes to partition support which is pretty spotty at the moment), or call al_cep_get_init_attr.  So I think it might make sense to just eliminate this now, and clean up the interface a little.

> +       if (ib_status != IB_SUCCESS) {
> +               return ib_to_ntstatus(ib_status);
> +       }
> +
> +       ib_status = al_cep_send_req(gh_al, p_id->cid);
> +       return ib_to_ntstatus(ib_status);
> +}

<snip>

> +static NTSTATUS
> +cm_send_mra(iba_cm_id *p_id, uint8_t service_timeout,
> +                       void *p_pdata, uint8_t pdata_len)
> +{
> +       ib_cm_mra_t             mra;
> +       ib_api_status_t ib_status;
> +
> +       mra.svc_timeout = service_timeout;
> +       mra.p_mra_pdata = p_pdata;
> +       mra.mra_length = pdata_len;
> +
> +       ib_status = al_cep_mra(gh_al, p_id->cid, &mra);

Why not change the al_cep_mra function to take these parameters explicitly rather than through the ib_cm_mra_t structure?

> +       return ib_to_ntstatus(ib_status);
> +}
> +
> +static NTSTATUS
> +cm_send_lap(iba_cm_id *p_id, iba_cm_lap *p_lap)
> +{
> +       ib_cm_lap_t             lap;
> +       ib_api_status_t ib_status;
> +
> +       RtlZeroMemory(&lap, sizeof lap);
> +       lap.p_lap_pdata = p_lap->p_pdata;
> +       lap.lap_length = p_lap->pdata_len;
> +       lap.remote_resp_timeout = p_lap->remote_resp_timeout;
> +       lap.p_alt_path = p_lap->p_alt_path;
> +
> +       ib_status = al_cep_lap(gh_al, p_id->cid, &lap);

Same here, just change al_cep_lap to take the params explicitly.  We expect the common case going forward to be the RDMA CM, so we should optimize for that.

> +       return ib_to_ntstatus(ib_status);
> +}
> +
> +static NTSTATUS
> +cm_send_apr(iba_cm_id *p_id, iba_cm_apr *p_apr)
> +{
> +       ib_cm_apr_t             apr;
> +       ib_qp_mod_t             attr;
> +       ib_api_status_t ib_status;
> +
> +       RtlZeroMemory(&apr, sizeof apr);
> +       apr.p_apr_pdata = p_apr->p_pdata;
> +       apr.apr_length = p_apr->pdata_len;
> +       apr.apr_status = p_apr->status;
> +       apr.info_length = p_apr->info_length;
> +       apr.p_info = p_apr->p_info;
> +
> +       ib_status = al_cep_pre_apr(gh_al, p_id->cid, &apr, &attr);

Make attr an optional parameter.

I would change al_cep_pre_apr to take the iba_cm_apr structure rather than the ib_cm_apr_t structure.

<snip>

> diff -up -r -X trunk\docs\dontdiff.txt -I '\$Id:'
> trunk/core/al/kernel/al_cm_cep.c
> branches\winverbs/core/al/kernel/al_cm_cep.c
> --- trunk/core/al/kernel/al_cm_cep.c    2008-07-16 08:53:56.469125000 -
> 0700
> +++ branches\winverbs/core/al/kernel/al_cm_cep.c        2009-01-13
> 23:13:27.250000000 -0800
> @@ -3650,6 +3652,60 @@ __bind_cep(
>         cl_spinlock_release( &h_al->obj.lock );
>  }
>
> +ib_api_status_t

Can you make this return NTSTATUS?  The more of these calls you can switch to NTSTATUS the better IMO.  Eventually we should get rid of the ib_status_t altogether (already done for the cl_status_t)

> +kal_alloc_cep(
> +       IN                              ib_al_handle_t
> h_al,
> +       IN      OUT                     net32_t* const
> p_cid )
> +{
> +       kcep_t                          *p_cep;
> +       KLOCK_QUEUE_HANDLE      hdl;
> +       ib_api_status_t         status = IB_SUCCESS;

You don't need status as a variable at all.

> +
> +       KeAcquireInStackQueuedSpinLock( &gp_cep_mgr->lock, &hdl );
> +       p_cep = __create_cep();
> +       KeReleaseInStackQueuedSpinLock( &hdl );
> +
> +       if( p_cep )
> +       {
> +               __bind_cep(p_cep, h_al, NULL, NULL);
> +               *p_cid = p_cep->cid;
> +       }
> +       else
> +       {
> +               status = IB_INSUFFICIENT_MEMORY;

Just return in the error case (return early).  Allows outdenting the body of the if above.

> +       }
> +
> +       return status;
> +}
> +
> +ib_api_status_t
> +kal_config_cep(
> +       IN                              ib_al_handle_t
> h_al,
> +       IN                              net32_t
> cid,
> +       IN                              al_pfn_cep_cb_t
> pfn_cb,
> +       IN                              void*
> context,
> +       IN                              ib_pfn_destroy_cb_t
> pfn_destroy_cb )
> +{
> +       kcep_t                          *p_cep;
> +       KLOCK_QUEUE_HANDLE      hdl;
> +       ib_api_status_t         status = IB_SUCCESS;
> +
> +       KeAcquireInStackQueuedSpinLock( &gp_cep_mgr->lock, &hdl );
> +       p_cep = __lookup_cep( h_al, cid );
> +       if (!p_cep )
> +       {
> +               status = IB_INVALID_PARAMETER;

Most of the other calls return INVALID_HANDLE here.

> +               goto out;
> +       }
> +
> +       p_cep->pfn_cb = pfn_cb;
> +       p_cep->context = context;
> +       p_cep->pfn_destroy_cb = pfn_destroy_cb;
> +
> +out:
> +       KeReleaseInStackQueuedSpinLock( &hdl );
> +       return status;
> +}
>
>  static inline void
>  __unbind_cep(

<snip>

> +/*
> + * Called after polling a MAD from a CEP to parse the received CM
> message
> + * into readable event data.
> + */
> +void
> +kal_cep_format_event(
> +       IN                              ib_al_handle_t
> h_al,
> +       IN                              net32_t
> cid,
> +       IN                              ib_mad_element_t
> *p_mad,
> +       IN      OUT                     iba_cm_event
> *p_event)
> +{
> +       KLOCK_QUEUE_HANDLE      hdl;
> +       kcep_t                          *p_cep;
> +
> +       KeAcquireInStackQueuedSpinLock( &gp_cep_mgr->lock, &hdl );
> +       p_cep = __lookup_cep( h_al, cid );
> +       KeReleaseInStackQueuedSpinLock( &hdl );

You can't release the lock here.  The CEP could be freed before you reference it below.

> +
> +       switch (p_mad->p_mad_buf->attr_id) {
> +       case CM_REQ_ATTR_ID:
> +               if (p_mad->status == IB_SUCCESS) {
> +                       p_event->type = iba_cm_req_received;
> +                       __format_event_req(p_cep, (mad_cm_req_t*)
> p_mad->p_mad_buf, &p_event->data.req);
> +               } else {
> +                       p_event->type = iba_cm_req_error;
> +               }
> +               break;
> +       case CM_REP_ATTR_ID:
> +               if (p_mad->status == IB_SUCCESS) {
> +                       p_event->type = iba_cm_rep_received;
> +                       __format_event_rep((mad_cm_rep_t*) p_mad-
> >p_mad_buf, &p_event->data.rep);
> +               } else {
> +                       p_event->type = iba_cm_rep_error;
> +               }
> +               break;
> +       case CM_RTU_ATTR_ID:
> +               p_event->type = iba_cm_rtu_received;
> +               __format_event_pdata(p_cep, &p_event->data.rtu);
> +               break;
> +       case CM_DREQ_ATTR_ID:
> +               if (p_mad->status == IB_SUCCESS) {
> +                       p_event->type = iba_cm_dreq_received;
> +                       __format_event_pdata(p_cep, &p_event-
> >data.dreq);
> +               } else {
> +                       p_event->type = iba_cm_dreq_error;
> +               }
> +               break;
> +       case CM_DREP_ATTR_ID:
> +               p_event->type = iba_cm_drep_received;
> +               __format_event_pdata(p_cep, &p_event->data.drep);
> +               break;
> +       case CM_REJ_ATTR_ID:
> +               p_event->type = iba_cm_rej_received;
> +               __format_event_rej((mad_cm_rej_t*) p_mad->p_mad_buf,
> &p_event->data.rej);
> +               break;
> +       case CM_MRA_ATTR_ID:
> +               p_event->type = iba_cm_mra_received;
> +               __format_event_mra((mad_cm_mra_t*) p_mad->p_mad_buf,
> &p_event->data.mra);
> +               break;
> +       case CM_LAP_ATTR_ID:
> +               if (p_mad->status == IB_SUCCESS) {
> +                       p_event->type = iba_cm_lap_received;
> +                       // TODO: format lap event
> +               } else {
> +                       p_event->type = iba_cm_lap_error;
> +               }
> +               break;
> +       case CM_APR_ATTR_ID:
> +               p_event->type = iba_cm_apr_received;;
> +               // TODO: format apr event
> +               break;
> +       case CM_SIDR_REQ_ATTR_ID:
> +               if (p_mad->status == IB_SUCCESS) {
> +                       p_event->type = iba_cm_sidr_req_received;
> +                       // TODO: format sidr req event
> +               } else {
> +                       p_event->type = iba_cm_sidr_req_error;
> +               }
> +               break;
> +       case CM_SIDR_REP_ATTR_ID:
> +               p_event->type = iba_cm_sidr_rep_received;
> +               // TODO: format sidr rep event
> +               break;
> +       default:
> +               CL_ASSERT(0);
> +       }
> +}
> diff -up -r -X trunk\docs\dontdiff.txt -I '\$Id:'
> trunk/core/al/kernel/al_ndi_cm.c
> branches\winverbs/core/al/kernel/al_ndi_cm.c
> --- trunk/core/al/kernel/al_ndi_cm.c    2008-10-29 11:23:54.390625000 -
> 0700
> +++ branches\winverbs/core/al/kernel/al_ndi_cm.c        2009-01-12
> 01:53:28.062500000 -0800
> @@ -1469,24 +1469,20 @@ static void
>  __ndi_fill_cm_rep(
>         IN              ib_qp_handle_t  const
> h_qp,
>         IN              ual_ndi_rep_cm_ioctl_in_t
> *p_rep,
> -               OUT     ib_cm_rep_t
> *p_cm_rep)
> +               OUT     iba_cm_rep
> *p_cm_rep)
>  {
>         AL_ENTER( AL_DBG_NDI );
>
> -       memset( p_cm_rep, 0, sizeof(ib_cm_rep_t) );
> +       memset( p_cm_rep, 0, sizeof(iba_cm_rep) );

Why not just use sizeof(*p_cm_rep) to avoid this if things change again in the future?

> -       p_cm_rep->p_rep_pdata = p_rep->pdata;
> -       p_cm_rep->rep_length = sizeof(p_rep->pdata);
> +       p_cm_rep->p_pdata = p_rep->pdata;
> +       p_cm_rep->pdata_len = sizeof(p_rep->pdata);
>
> -       p_cm_rep->qp_type = IB_QPT_RELIABLE_CONN;
> -       p_cm_rep->h_qp = h_qp;
> +       p_cm_rep->qpn = h_qp->num;
>
> -       p_cm_rep->access_ctrl = IB_AC_RDMA_READ | IB_AC_RDMA_WRITE |
> IB_AC_LOCAL_WRITE;
>         p_cm_rep->init_depth = p_rep->init_depth;
> -       p_cm_rep->target_ack_delay = 10;
>         p_cm_rep->failover_accepted = IB_FAILOVER_ACCEPT_UNSUPPORTED;
>         p_cm_rep->flow_ctrl = TRUE;     /* HCAs must support end-to-end
> flow control. */
> -       p_cm_rep->rnr_nak_timeout = QP_ATTRIB_RNR_NAK_TIMEOUT;
>         p_cm_rep->rnr_retry_cnt = QP_ATTRIB_RNR_RETRY;
>
>         AL_EXIT( AL_DBG_NDI );
> @@ -1499,7 +1495,7 @@ __ndi_send_rep(
>         IN              PIRP
> p_irp )
>  {
>         IO_STACK_LOCATION       *p_io_stack;
> -       ib_cm_rep_t cm_rep;
> +       iba_cm_rep cm_rep;
>         ib_qp_mod_t qp_mod;
>         ib_api_status_t status;
>         ual_ndi_rep_cm_ioctl_in_t *p_rep =
> @@ -1524,13 +1520,18 @@ __ndi_send_rep(
>
>         /* Format ib_cm_req_t structure */
>         __ndi_fill_cm_rep( h_qp, p_rep, &cm_rep );
> +       qp_mod.state.rtr.rnr_nak_timeout = QP_ATTRIB_RNR_NAK_TIMEOUT;
>
>         ref_al_obj( &h_qp->obj ); /* Take CEP reference. */
>
>         /* prepare Passive CEP for connection */
> -       status = al_cep_pre_rep_ex(
> -               qp_get_al( h_qp ), p_rep->cid, __ndi_cm_handler, h_qp,
> deref_al_obj,
> -               &cm_rep, &((al_conn_qp_t*)h_qp)->cid, &qp_mod );
> +       status = kal_config_cep(qp_get_al( h_qp ), p_rep->cid,
> __ndi_cm_handler,
> +                                                       h_qp,
> deref_al_obj);

The al_cep_pre_rep_ex verified that the QP's CID was available, preventing a QP that was in process of being destroyed, or one that was already in a connection, from being used.  Basically the original function atomically associated the QP with the CEP (through the CID member), and the CEP with the QP (through the CEP context).

> +       if ( status == IB_SUCCESS )
> +       {
> +               status = kal_cep_pre_rep(qp_get_al( h_qp ), p_rep->cid,
> +
> &cm_rep, &qp_mod );
> +       }
>         if( status != IB_SUCCESS )
>         {
>                 IoFreeWorkItem( p_irp->Tail.Overlay.DriverContext[1] );
> @@ -1539,7 +1540,7 @@ __ndi_send_rep(
>                 al_destroy_cep( qp_get_al( h_qp ), &p_rep->cid, FALSE
> );
>                 deref_al_obj( &h_qp->obj ); /* Release CEP reference.

This dereference isn't always appropriate anymore.  If kal_config_cep succeeded then the reference on the QP will be released when the CEP is destroyed.  If kal_config_cep failed then the CEP isn't bound to the QP.

> */
>                 AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR,
> -                       ("al_cep_pre_rep_ex returned %s.\n",
> ib_get_err_str( status )) );
> +                       ("kal_cep_pre_rep returned %s.\n",
> ib_get_err_str( status )) );
>                 switch (status)
>                 {
>                         case IB_INVALID_HANDLE:
> @@ -1552,6 +1553,10 @@ __ndi_send_rep(
>                                 return ib_to_ntstatus( status );
>                 }
>         }
> +       else
> +       {
> +               ((al_conn_qp_t*)h_qp)->cid = p_rep->cid;

You leak CEPs here if the QP gets destroyed by another thread while in this call.  The QP won't be freed due to the reference held for the current IOCTL, but the CID member could get set after the 'destroying_qp' function runs and checks the CID.  This leaks a CEP until the application exits.  I don't know off the top of my head if there are other races introduced by this.  I think the really nasty destruction race is still covered (see revision 1356 if you're curious.)

Adding the in/out parameter for the QP's assigned CID back into the call would fix all these issues.

> +       }
>
>         AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_NDI,
>                 ("Prepared Passive CEP with cid %d, h_al %p, context
> %p\n",