[openib-general] Re: [PATCH 3/3] iWARP CM
Sean Hefty
mshefty at ichips.intel.com
Thu Mar 16 12:47:18 PST 2006
Tom Tucker wrote:
> +struct iwcm_id_private {
> + struct iw_cm_id id;
> +
> + spinlock_t lock;
> + atomic_t refcount;
> +};
> +
> +struct iwcm_work {
> + struct work_struct work;
> + struct iwcm_id_private *cm_id;
> + struct iw_cm_event event;
> +};
> +
> +/* Release a reference on cm_id and delete the object if the
> +* reference count goes to zero.
> + */
> +static inline void iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
> +{
> + if (atomic_dec_and_test(&cm_id_priv->refcount))
> + kfree(cm_id_priv);
> +}
> +
> +static void cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
> +
> +struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
> + iw_cm_handler cm_handler,
> + void *context)
> +{
> + struct iwcm_id_private *iwcm_id_priv;
> +
> + iwcm_id_priv = kzalloc(sizeof *iwcm_id_priv, GFP_KERNEL);
> + if (!iwcm_id_priv)
> + return ERR_PTR(-ENOMEM);
> +
> + iwcm_id_priv->id.state = IW_CM_STATE_IDLE;
> + iwcm_id_priv->id.device = device;
> + iwcm_id_priv->id.cm_handler = cm_handler;
> + iwcm_id_priv->id.context = context;
> + iwcm_id_priv->id.event_handler = cm_event_handler;
> +
> + spin_lock_init(&iwcm_id_priv->lock);
> + atomic_set(&iwcm_id_priv->refcount, 1);
> +
> + return &iwcm_id_priv->id;
> +
nit: extra blank line after the return.
> +}
> +EXPORT_SYMBOL(iw_create_cm_id);
> +
> +
> +static int iwcm_modify_qp_err(struct ib_qp *qp)
> +{
> + struct ib_qp_attr qp_attr;
> +
> + if (!qp)
> + return 0;
> +
> + qp_attr.qp_state = IB_QPS_ERR;
> + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
> +}
I don't think that we need to check for a NULL qp pointer. The state checks
where this is called should ensure that a QP is present.
> +static int iwcm_modify_qp_sqd(struct ib_qp *qp)
> +{
> + struct ib_qp_attr qp_attr;
> +
> + if (!qp)
> + return 0;
> +
> + qp_attr.qp_state = IB_QPS_SQD;
> + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
> +}
Same comment as previous one.
> +
> +/* CM_ID <-- CLOSING
> + *
> + * - If we are established move to CLOSING and modify the QP state
> + * based on the abrupt flag
> + * - If the connection is already in the CLOSING state, the peer is
> + * disconnecting concurrently with us and we've already seen the
> + * DISCONNECT event -- ignore the request and return 0
> + * - If the connection was never established, return -EINVAL
> + */
> +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
> +{
> + struct iwcm_id_private *iwcm_id_priv;
> + unsigned long flags;
> + struct ib_qp *qp;
> + int ret = 0;
> +
> + iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> + switch (cm_id->state) {
> + case IW_CM_STATE_ESTABLISHED:
> + cm_id->state = IW_CM_STATE_CLOSING;
> + qp = cm_id->qp;
Is there some significance to saving the QP pointer under lock?
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + if (abrupt)
> + ret = iwcm_modify_qp_err(qp);
> + else
> + ret = iwcm_modify_qp_sqd(qp);
> + break;
> + case IW_CM_STATE_CONN_SENT:
> + case IW_CM_STATE_LISTEN:
> + case IW_CM_STATE_CONN_RECV:
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + ret = -EINVAL;
> + break;
> + case IW_CM_STATE_CLOSING:
> + default:
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_disconnect);
> +
> +/* Clean up all resources associated with the connection. Set the
> + * cm_id state to DESTROYING so queued events will be ignored and
> + * subsequent events will not be queued.
> + */
> +void iw_destroy_cm_id(struct iw_cm_id *cm_id)
> +{
> + struct iwcm_id_private *iwcm_id_priv;
> + unsigned long flags;
> + struct ib_qp *qp;
> + int ret = 0;
> +
> + iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +
> + spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> + switch (cm_id->state) {
> + case IW_CM_STATE_LISTEN:
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + ret = cm_id->device->iwcm->destroy_listen(cm_id);
> + break;
> + case IW_CM_STATE_CONN_RECV:
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + /* Need to call reject on behalf of client */
> + (void)iw_cm_reject(cm_id, NULL, 0);
> + break;
> + case IW_CM_STATE_CONN_SENT:
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + cm_id->qp = NULL;
Why is QP being cleared?
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + break;
> + case IW_CM_STATE_ESTABLISHED:
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + qp = cm_id->qp;
And this may be related, but is there some significance to saving the qp pointer
under lock?
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + ret = iwcm_modify_qp_err(qp);
> + break;
> + case IW_CM_STATE_IDLE:
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + break;
> + case IW_CM_STATE_CLOSING:
> + /* Lost race with cm_disconnect_handler. Need to
> + * release 'connect' reference, since setting state to
> + * DESTROYING will cause the CLOSE event to be ignored.
> + */
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + iwcm_deref_id(iwcm_id_priv);
This seems problematic. How do we know that this is the thread that took the
reference?
> + break;
> + default:
> + /* Lost race with cm_close_handler. */
> + cm_id->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> + }
> + iwcm_deref_id(iwcm_id_priv);
Since there's no call to the provider, how can we be sure that they won't
generate additional events after this call returns? How does a provider know
when to cleanup its resources?
> +}
> +EXPORT_SYMBOL(iw_destroy_cm_id);
> +
{snip}
> +/*
> + * Passive Side: new CM_ID <-- CONN_RECV
> + *
> + * Handles an inbound connect request. The function creates a new
> + * iw_cm_id to represent the new connection and inherits the client
> + * callback function and other attributes from the listening parent.
> + *
> + * The work item contains a pointer to the listen_cm_id and the event. The
> + * listen_cm_id contains the client cm_handler, context and
> + * device. These are copied when the device is cloned. The event
> + * contains the new four tuple.
> + *
> + * An error on the child should not affect the parent, so this
> + * function does not return a value.
> + */
> +static void cm_conn_req_handler(struct iwcm_work *work)
> +{
> + struct iw_cm_id *cm_id;
> + struct iwcm_id_private *cm_id_priv;
> + struct iwcm_id_private *listen_id_priv = work->cm_id;
> + iw_cm_handler cm_handler;
> + void *context;
> + struct ib_device *device;
> + unsigned long flags;
> + int ret;
> +
> + /* If the status was not successful, ignore request */
> + BUG_ON(work->event.status);
> +
> + /* We could be destroying the listening id. If so, ignore this
> + * upcall. */
> + spin_lock_irqsave(&listen_id_priv->lock, flags);
> + if (listen_id_priv->id.state != IW_CM_STATE_LISTEN) {
> + spin_unlock_irqrestore(&listen_id_priv->lock, flags);
This check is fine, but note that the listen_id state could change at any point
after we release the lock. Destruction of the listen must block until we
complete processing of the new request.
> + return;
> + }
> + device = listen_id_priv->id.device;
> + cm_handler = listen_id_priv->id.cm_handler;
> + context = listen_id_priv->id.context;
Is there some significance to saving the listen_id's device, cm_handler, and
context under lock?
> + spin_unlock_irqrestore(&listen_id_priv->lock, flags);
> +
> + cm_id = iw_create_cm_id(device, cm_handler, context);
> +
> + /* If the cm_id could not be created, ignore the request */
> + if (IS_ERR(cm_id))
> + return;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + cm_id_priv->id.local_addr = work->event.local_addr;
> + cm_id_priv->id.remote_addr = work->event.remote_addr;
> + cm_id_priv->id.provider_id = work->event.provider_id;
> + cm_id_priv->id.state = IW_CM_STATE_CONN_RECV;
> +
> + /* Call the client CM handler */
> + atomic_inc(&cm_id_priv->refcount);
> + ret = cm_id->cm_handler(cm_id, &work->event);
> + iwcm_deref_id(cm_id_priv);
> + if (ret) {
> + cm_id->state = IW_CM_STATE_IDLE;
> + iw_destroy_cm_id(cm_id);
> + }
> +
> + return;
nit: return is unnecessary.
> +}
> +
> +/*
> + * Passive Side: CM_ID <-- ESTABLISHED
> + *
> + * If we're in the wrong state, ignore the request. This could
> + * happen if we got a close from the peer.
> + */
> +static int cm_conn_est_handler(struct iwcm_work *work)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> +
> + cm_id_priv = work->cm_id;
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->id.state != IW_CM_STATE_CONN_RECV) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + return 0;
> + }
> +
> + cm_id_priv = work->cm_id;
> + atomic_inc(&cm_id_priv->refcount); /* close decrements */
I'm not understanding how this increment is used, or what protection it
provides. Immediately after we release the spinlock below, the user can destroy
the cm_id, which will release this reference.
> + cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> + /* Call the client CM handler */
> + return cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +}
> +
> +/*
> + * Active Side: CM_ID <-- ESTABLISHED
> + *
> + * Handles the reply to our connect request:
> + * - If the cm_id is in the wrong state when the event is
> + * delivered, the event is ignored.
> + * - If the remote peer accepts the connection, we update the 4-tuple
> + * in the cm_id with the remote peer info, move the cm_id to the
> + * ESTABLISHED state and deliver the event to the client.
> + * - If the remote peer rejects the connection, or there is some
> + * connection error, move the cm_id to the IDLE state, and deliver
> + * the event to the client.
> + */
> +static int cm_conn_rep_handler(struct iwcm_work *work)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> +
> + cm_id_priv = work->cm_id;
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->id.state != IW_CM_STATE_CONN_SENT) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + return 0;
> + }
> +
> + if (work->event.status == IW_CM_EVENT_STATUS_ACCEPTED) {
> + cm_id_priv = work->cm_id;
> + cm_id_priv->id.local_addr = work->event.local_addr;
> + cm_id_priv->id.remote_addr = work->event.remote_addr;
> + atomic_inc(&cm_id_priv->refcount); /* close decrements */
Similar concern about reference taken here as mentioned previously.
> + cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> + } else
> + cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> + /* Call the client CM handler */
> + return cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +}
> +
> +/*
> + * CM_ID <-- CLOSING
> + *
> + * If in the ESTABLISHED state, move to closing and disassociate the
> + * CM_ID from the QP. The provider will have already moved the QP to
> + * CLOSING.
It's confusing trying to figure out who's responsible for QP transitions. In
some places, the iw_cm performs the transitions. In others, the provider is
expected to have performed them. Is there a way to make this consistent, so
that all QP transitions are initiated from the same location? It seems that
we're open to bugs if different modules drive the QP state without strict
coordination between them.
> + */
> +static void cm_disconnect_handler(struct iwcm_work *work)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> +
> + cm_id_priv = work->cm_id;
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->id.state == IW_CM_STATE_ESTABLISHED) {
> + cm_id_priv->id.state = IW_CM_STATE_CLOSING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + } else
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
nit: just drop the else and always unlock.
Why do we use a work handler just to change the state variable?
> +}
> +
> +/*
> + * CM_ID <-- IDLE
> + *
> + * If in the ESTABLISHED state, the QP will have been moved by the
> + * provider to the ERR state. Disassociate the CM_ID from the QP,
> + * move to IDLE, and remove the 'connected' reference.
> + *
> + * If in the CLOSING state, the QP has already been disassociated,
> + * move to IDLE and remove 'connected' reference.
> + *
> + */
> +static int cm_close_handler(struct iwcm_work *work)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> +
> + cm_id_priv = work->cm_id;
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + cm_id_priv->id.qp = NULL;
Should we check the state before clearing the QP pointer?
> + switch (cm_id_priv->id.state) {
> + case IW_CM_STATE_ESTABLISHED:
> + case IW_CM_STATE_CLOSING:
> + cm_id_priv->id.state = IW_CM_STATE_IDLE;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + iwcm_deref_id(cm_id_priv);
gurgle... Maintaining an extra reference on the object from one state to
another is confusing. This can't drop the last reference on the cm_id, since we
access it below. And if another thread can release this reference, then how do
we even know that the cm_id is valid at the top of this function? It appears
that we have something like this:
1. object.state = X, object.ref++
2. if object.state == X, then object.state = Y, object.ref--
For multiple threads to process step 2 at the same time, each thread must
already hold reference on the object. (Or we'll end up crashing while checking
the state.)
This makes me think that we don't need this reference at all. And if the
reference is supposed to provide some sort of protection, then there's likely a
bug somewhere.
> + break;
> + default:
> + /* lost race with iw_cm_disconnect */
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + return 0;
> + }
> +
> + /* Call the client CM handler */
> + return cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +}
> +
> +static void cm_work_handler(void *arg)
> +{
> + struct iwcm_work *work = (struct iwcm_work*)arg;
> + struct iwcm_id_private *cm_id_priv = work->cm_id;
> + int ret = 0;
> +
> + /* Ignore scheduled events on a destroyed connection */
> + if (cm_id_priv->id.state != IW_CM_STATE_DESTROYING) {
Leaving this check is fine, as long as we can agree that it isn't needed. :)
> + switch (work->event.event) {
> + case IW_CM_EVENT_CONNECT_REQUEST:
> + cm_conn_req_handler(work);
> + break;
> + case IW_CM_EVENT_CONNECT_REPLY:
> + ret = cm_conn_rep_handler(work);
> + break;
> + case IW_CM_EVENT_ESTABLISHED:
> + ret = cm_conn_est_handler(work);
> + break;
> + case IW_CM_EVENT_DISCONNECT:
> + cm_disconnect_handler(work);
> + break;
> + case IW_CM_EVENT_CLOSE:
> + ret = cm_close_handler(work);
> + break;
> + default:
> + BUG_ON(1);
> + }
> + }
> + kfree(work);
> +
> + /* The reference was done in cm_event_handler. */
> + iwcm_deref_id(cm_id_priv);
> +
> + if (ret)
> + iw_destroy_cm_id(&cm_id_priv->id);
Okay - I think this is where the problems start. We can have several events
queued behind this one when we try to destroy the cm_id. If destroy blocks, we
deadlock as you mention. If destroy doesn't block, then the user can receive a
callback after destroy returns. Also note that if we don't use a single thread
work queue, then a user could get multiple callbacks simultaneously, which is
difficult to deal with, since events could now be reported out of order.
How about taking a different approach to event processing? We can add struct
work_struct and struct list_head fields to struct iwcm_id_private. New events
would be queued on the list. If the list was empty when a new event was added,
the work item would be queued for processing. If there was already an item on
the list, the new event is simply queued at the end. Something like:
destroy_cm_id()
{
normal destruction
wait for refcount = 0
cleanup queued events
}
cm_work_handler()
{
lock
while event list is not empty {
read event at head of list
unlock
ret = process event
if ret {
destroy_cm_id()
return
}
lock
remove event from list
}
unlock
}
cm_event_handler()
{
lock
if event list is empty {
queue event
queue work item
} else
queue event
unlock
}
This would allow for multi-threaded event handling as well.
> +/* IW CM provider event callback handler. This function is called on
> + * interrupt context. Schedule events on the rdma_wq thread to allow
> + * callback functions to downcall into the CM and/or block.
nit: comment text should start down one line -- this appears in a couple of spots
> + *
> + * If the cm_id is being destroyed, ignore all events except CLOSE. In
> + * this case, dump the ESTABLISHED reference and allow the cm_id
> + * to be freed.
> + */
> +static void cm_event_handler(struct iw_cm_id *cm_id,
> + struct iw_cm_event *iw_event)
> +{
> + struct iwcm_work *work;
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
How are we assured that the cm_id is valid? What indicates to the provider that
it's been destroyed? Some QP state change?
- Sean
More information about the general
mailing list