[openib-general] Re: [PATCH 3/3] iWARP CM

Sean Hefty mshefty at ichips.intel.com
Thu Mar 16 12:47:18 PST 2006


Tom Tucker wrote:
> +struct iwcm_id_private {
> +	struct iw_cm_id	id;
> +
> +	spinlock_t lock;
> +	atomic_t refcount;
> +};
> +
> +struct iwcm_work {
> +	struct work_struct work;
> +	struct iwcm_id_private *cm_id;
> +	struct iw_cm_event event;
> +};
> +
> +/* Release a reference on cm_id and delete the object if the
> +* reference count goes to zero. 
> + */
> +static inline void iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
> +{
> +	if (atomic_dec_and_test(&cm_id_priv->refcount))
> +		kfree(cm_id_priv);
> +}
> +
> +static void cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
> +
> +struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
> +				 iw_cm_handler cm_handler,
> +				 void *context)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +
> +	iwcm_id_priv = kzalloc(sizeof *iwcm_id_priv, GFP_KERNEL);
> +	if (!iwcm_id_priv)
> +		return ERR_PTR(-ENOMEM);
> +
> +	iwcm_id_priv->id.state = IW_CM_STATE_IDLE;
> +	iwcm_id_priv->id.device = device;
> +	iwcm_id_priv->id.cm_handler = cm_handler;
> +	iwcm_id_priv->id.context = context;
> +	iwcm_id_priv->id.event_handler = cm_event_handler;
> +
> +	spin_lock_init(&iwcm_id_priv->lock);
> +	atomic_set(&iwcm_id_priv->refcount, 1);
> +
> +	return &iwcm_id_priv->id;
> +

nit: extra blank line after the return.

> +}
> +EXPORT_SYMBOL(iw_create_cm_id);
> +
> +
> +static int iwcm_modify_qp_err(struct ib_qp *qp)
> +{
> +	struct ib_qp_attr qp_attr;
> +
> +	if (!qp)
> +		return 0;
> +
> +	qp_attr.qp_state = IB_QPS_ERR;
> +	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
> +}

I don't think that we need to check for a NULL qp pointer.  The state checks 
where this is called should ensure that a QP is present.

> +static int iwcm_modify_qp_sqd(struct ib_qp *qp)
> +{
> +	struct ib_qp_attr qp_attr;
> +
> +	if (!qp)
> +		return 0;
> +
> +	qp_attr.qp_state = IB_QPS_SQD;
> +	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
> +}

Same comment as previous one.

> +
> +/* CM_ID <-- CLOSING
> + *
> + * - If we are established move to CLOSING and modify the QP state
> + *   based on the abrupt flag 
> + * - If the connection is already in the CLOSING state, the peer is
> + *   disconnecting concurrently with us and we've already seen the
> + *   DISCONNECT event -- ignore the request and return 0
> + * - If the connection was never established, return -EINVAL
> + */
> +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	struct ib_qp *qp;
> +	int ret = 0;
> +
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_ESTABLISHED:
> +		cm_id->state = IW_CM_STATE_CLOSING;
> +		qp = cm_id->qp;

Is there some significance to saving the QP pointer under lock?

> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		if (abrupt)
> +			ret = iwcm_modify_qp_err(qp);
> +		else
> +			ret = iwcm_modify_qp_sqd(qp);
> +		break;
> +	case IW_CM_STATE_CONN_SENT:
> +	case IW_CM_STATE_LISTEN:
> +	case IW_CM_STATE_CONN_RECV:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = -EINVAL;
> +		break;
> +	case IW_CM_STATE_CLOSING:
> +	default:		
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_disconnect);
> +
> +/* Clean up all resources associated with the connection. Set the
> + * cm_id state to DESTROYING so queued events will be ignored and
> + * subsequent events will not be queued. 
> + */
> +void iw_destroy_cm_id(struct iw_cm_id *cm_id)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	struct ib_qp *qp;
> +	int ret = 0;
> +
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_LISTEN:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = cm_id->device->iwcm->destroy_listen(cm_id);
> +		break;
> +	case IW_CM_STATE_CONN_RECV:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		/* Need to call reject on behalf of client */
> +		(void)iw_cm_reject(cm_id, NULL, 0);
> +		break;
> +	case IW_CM_STATE_CONN_SENT:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		cm_id->qp = NULL;

Why is QP being cleared?

> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		break;
> +	case IW_CM_STATE_ESTABLISHED:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		qp = cm_id->qp;

And this may be related, but is there some significance to saving the qp pointer 
under lock?

> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = iwcm_modify_qp_err(qp);
> +		break;
> +	case IW_CM_STATE_IDLE:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		break;
> +	case IW_CM_STATE_CLOSING:
> +		/* Lost race with cm_disconnect_handler. Need to 
> +		 * release 'connect' reference, since setting state to
> +		 * DESTROYING will cause the CLOSE event to be ignored.
> +		 */
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		iwcm_deref_id(iwcm_id_priv);

This seems problematic.  How do we know that this is the thread that took the 
reference?

> +		break;
> +	default:
> +		/* Lost race with cm_close_handler. */ 
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +	}
> +	iwcm_deref_id(iwcm_id_priv);

Since there's no call to the provider, how can we be sure that they won't 
generate additional events after this call returns?  How does a provider know 
when to cleanup its resources?

> +}
> +EXPORT_SYMBOL(iw_destroy_cm_id);
> +
{snip}
> +/*
> + * Passive Side: new CM_ID <-- CONN_RECV
> + *
> + * Handles an inbound connect request. The function creates a new
> + * iw_cm_id to represent the new connection and inherits the client
> + * callback function and other attributes from the listening parent. 
> + * 
> + * The work item contains a pointer to the listen_cm_id and the event. The
> + * listen_cm_id contains the client cm_handler, context and
> + * device. These are copied when the device is cloned. The event
> + * contains the new four tuple.
> + *
> + * An error on the child should not affect the parent, so this
> + * function does not return a value.
> + */
> +static void cm_conn_req_handler(struct iwcm_work *work)
> +{
> +	struct iw_cm_id *cm_id;
> +	struct iwcm_id_private *cm_id_priv;
> +	struct iwcm_id_private *listen_id_priv = work->cm_id;
> +	iw_cm_handler cm_handler;
> +	void *context;
> +	struct ib_device *device;
> +	unsigned long flags;
> +	int ret;
> +
> +	/* If the status was not successful, ignore request */
> +	BUG_ON(work->event.status);
> +
> +	/* We could be destroying the listening id. If so, ignore this
> +	 * upcall. */
> +	spin_lock_irqsave(&listen_id_priv->lock, flags);
> +	if (listen_id_priv->id.state != IW_CM_STATE_LISTEN) {
> +		spin_unlock_irqrestore(&listen_id_priv->lock, flags);

This check is fine, but note that the listen_id state could change at any point 
after we release the lock.  Destruction of the listen must block until we 
complete processing of the new request.

> +		return;
> +	}
> +	device = listen_id_priv->id.device;
> +	cm_handler = listen_id_priv->id.cm_handler;
> +	context = listen_id_priv->id.context;

Is there some significance to saving the listen_id's device, cm_handler, and 
context under lock?

> +	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
> +
> +	cm_id = iw_create_cm_id(device,	cm_handler, context);
> +
> +	/* If the cm_id could not be created, ignore the request */
> +	if (IS_ERR(cm_id)) 
> +		return;
> +
> +	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	cm_id_priv->id.local_addr = work->event.local_addr;
> +	cm_id_priv->id.remote_addr = work->event.remote_addr;
> +	cm_id_priv->id.provider_id = work->event.provider_id;
> +	cm_id_priv->id.state = IW_CM_STATE_CONN_RECV;
> +
> +	/* Call the client CM handler */
> +	atomic_inc(&cm_id_priv->refcount);
> +	ret = cm_id->cm_handler(cm_id, &work->event);
> +	iwcm_deref_id(cm_id_priv);
> +	if (ret) {
> +		cm_id->state = IW_CM_STATE_IDLE;
> +		iw_destroy_cm_id(cm_id);
> +	}
> +
> +	return;

nit: return is unnecessary.

> +}
> +
> +/*
> + * Passive Side: CM_ID <-- ESTABLISHED
> + * 
> + * If we're in the wrong state, ignore the request. This could
> + * happen if we got a close from the peer.
> + */
> +static int cm_conn_est_handler(struct iwcm_work *work)
> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +	
> +	cm_id_priv = work->cm_id;
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state != IW_CM_STATE_CONN_RECV) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		return 0;
> +	}
> +
> +	cm_id_priv = work->cm_id;
> +	atomic_inc(&cm_id_priv->refcount); /* close decrements */

I'm not understanding how this increment is used, or what protection it 
provides.  Immediately after we release the spinlock below, the user can destroy 
the cm_id, which will release this reference.

> +	cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	/* Call the client CM handler */
> +	return cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +}
> +
> +/*
> + * Active Side: CM_ID <-- ESTABLISHED
> + *
> + * Handles the reply to our connect request:
> + * - If the cm_id is in the wrong state when the event is
> + *   delivered, the event is ignored. 
> + * - If the remote peer accepts the connection, we update the 4-tuple
> + *   in the cm_id with the remote peer info, move the cm_id to the
> + *   ESTABLISHED state and deliver the event to the client.
> + * - If the remote peer rejects the connection, or there is some
> + *   connection error, move the cm_id to the IDLE state, and deliver
> + *   the event to the client.
> + */
> +static int cm_conn_rep_handler(struct iwcm_work *work)
> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +	
> +	cm_id_priv = work->cm_id;
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state != IW_CM_STATE_CONN_SENT) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		return 0;
> +	}
> +
> +	if (work->event.status == IW_CM_EVENT_STATUS_ACCEPTED) {
> +		cm_id_priv = work->cm_id;
> +		cm_id_priv->id.local_addr = work->event.local_addr;
> +		cm_id_priv->id.remote_addr = work->event.remote_addr;
> +		atomic_inc(&cm_id_priv->refcount); /* close decrements */

Similar concern about reference taken here as mentioned previously.

> +		cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> +	} else 
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	/* Call the client CM handler */
> +	return cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +}
> +
> +/*
> + * CM_ID <-- CLOSING 
> + *
> + * If in the ESTABLISHED state, move to closing and disassociate the
> + * CM_ID from the QP. The provider will have already moved the QP to
> + * CLOSING.

It's confusing trying to figure out who's responsible for QP transitions.  In 
some places, the iw_cm performs the transitions.  In others, the provider is 
expected to have performed them.  Is there a way to make this consistent, so 
that all QP transitions are initiated from the same location?  It seems that 
we're open to bugs if different modules drive the QP state without strict 
coordination between them.

> + */
> +static void cm_disconnect_handler(struct iwcm_work *work) 
> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +	
> +	cm_id_priv = work->cm_id;
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state == IW_CM_STATE_ESTABLISHED) {
> +		cm_id_priv->id.state = IW_CM_STATE_CLOSING;
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +	} else
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);

nit: just drop the else and always unlock.

Why do we use a work handler just to change the state variable?

> +}
> +
> +/*
> + * CM_ID <-- IDLE
> + *
> + * If in the ESTABLISHED state, the QP will have been moved by the
> + * provider to the ERR state. Disassociate the CM_ID from the QP, 
> + * move to IDLE, and remove the 'connected' reference.
> + *
> + * If in the CLOSING state, the QP has already been disassociated,
> + * move to IDLE and remove 'connected' reference. 
> + *
> + */
> +static int cm_close_handler(struct iwcm_work *work)
> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +
> +	cm_id_priv = work->cm_id;
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	cm_id_priv->id.qp = NULL;

Should we check the state before clearing the QP pointer?

> +	switch (cm_id_priv->id.state) {
> +	case IW_CM_STATE_ESTABLISHED:
> +	case IW_CM_STATE_CLOSING:
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		iwcm_deref_id(cm_id_priv);

gurgle...  Maintaining an extra reference on the object from one state to 
another is confusing.  This can't drop the last reference on the cm_id, since we 
access it below.  And if another thread can release this reference, then how do 
we even know that the cm_id is valid at the top of this function?  It appears 
that we have something like this:

1. object.state = X, object.ref++
2. if object.state == X, then object.state = Y, object.ref--

For multiple threads to process step 2 at the same time, each thread must 
already hold reference on the object.  (Or we'll end up crashing while checking 
the state.)

This makes me think that we don't need this reference at all.  And if the 
reference is supposed to provide some sort of protection, then there's likely a 
bug somewhere.


> +		break;
> +	default:
> +		/* lost race with iw_cm_disconnect */
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		return 0;
> +	}
> +
> +	/* Call the client CM handler */
> +	return cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +}
> +
> +static void cm_work_handler(void *arg)
> +{
> +	struct iwcm_work *work = (struct iwcm_work*)arg;
> +	struct iwcm_id_private *cm_id_priv = work->cm_id;
> +	int ret = 0;
> +
> +	/* Ignore scheduled events on a destroyed connection */
> +	if (cm_id_priv->id.state != IW_CM_STATE_DESTROYING) {

Leaving this check is fine, as long as we can agree that it isn't needed.  :)

> +		switch (work->event.event) {
> +		case IW_CM_EVENT_CONNECT_REQUEST:
> +			cm_conn_req_handler(work);
> +			break;
> +		case IW_CM_EVENT_CONNECT_REPLY:
> +			ret = cm_conn_rep_handler(work);
> +			break;
> +		case IW_CM_EVENT_ESTABLISHED:
> +			ret = cm_conn_est_handler(work);
> +			break;
> +		case IW_CM_EVENT_DISCONNECT:
> +			cm_disconnect_handler(work);
> +			break;
> +		case IW_CM_EVENT_CLOSE:
> +			ret = cm_close_handler(work);
> +			break;
> +		default:
> +			BUG_ON(1);
> +		}
> +	}
> +	kfree(work);
> +
> +	/* The reference was done in cm_event_handler. */
> +	iwcm_deref_id(cm_id_priv);
> +
> +	if (ret)
> +		iw_destroy_cm_id(&cm_id_priv->id);

Okay - I think this is where the problems start.  We can have several events 
queued behind this one when we try to destroy the cm_id.  If destroy blocks, we 
deadlock as you mention.  If destroy doesn't block, then the user can receive a 
callback after destroy returns.  Also note that if we don't use a single thread 
work queue, then a user could get multiple callbacks simultaneously, which is 
difficult to deal with, since events could now be reported out of order.

How about taking a different approach to event processing?  We can add struct 
work_struct and struct list_head fields to struct iwcm_id_private.  New events 
would be queued on the list.  If the list was empty when a new event was added, 
the work item would be queued for processing.  If there was already an item on 
the list, the new event is simply queued at the end.  Something like:

destroy_cm_id()
{
	normal destruction
	wait for refcount = 0
	cleanup queued events
}

cm_work_handler()
{
	lock
	while event list is not empty {
		read event at head of list
		unlock
		ret = process event
		if ret {
			destroy_cm_id()
			return
		}
		lock
		remove event from list
	}
	unlock
}

cm_event_handler()
{
	lock
	if event list is empty {
		queue event
		queue work item
	} else
		queue event
	unlock
}

This would allow for multi-threaded event handling as well.

> +/* IW CM provider event callback handler. This function is called on
> + * interrupt context. Schedule events on the rdma_wq thread to allow
> + * callback functions to downcall into the CM and/or block. 

nit: comment text should start down one line -- this appears in a couple of spots

> + *
> + * If the cm_id is being destroyed, ignore all events except CLOSE. In
> + * this case, dump the ESTABLISHED reference and allow the cm_id
> + * to be freed.
> + */
> +static void cm_event_handler(struct iw_cm_id *cm_id,
> +			     struct iw_cm_event *iw_event) 
> +{
> +	struct iwcm_work *work;
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;

How are we assured that the cm_id is valid?  What indicates to the provider that 
it's been destroyed?  Some QP state change?

- Sean



More information about the general mailing list