[openib-general] Re: [PATCH 3/3] iWARP CM - iWARP Connection Manager

Sean Hefty mshefty at ichips.intel.com
Tue Mar 21 18:26:31 PST 2006


Thanks for putting up with the comments.  :)

This is getting close; I think we can get it a little simpler -- something like 
what's shown below, where we key destruction only off the refcount and always 
perform it in iw_destroy_cm_id().

The one area that I'm still not clear on is the CLOSE event.  Right now, it 
seems that the cm_id is required to hang out until a CLOSE event is received, 
but only if the cm_id gets connected.  Because of the complexity, I'm not 
comfortable that the destruction is error-free or easily maintainable.

What's needed is a simple way to know that a provider will not invoke a callback 
referencing an invalid context.  This code pushes that responsibility up to the 
owner of the context (i.e. here), rather than down into the provider, where it 
really belongs.

Somewhere the provider is allocating its own context that gets associated with 
the cm_id.  Can we make this explicit in the code?  Could we also make the 
de-allocation explicit?  Right now we have something similar to an asynchronous 
destruction model, where the user does something (a QP transition?) that results 
in a callback (the CLOSE event), but only in specific cases.
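
For illustration only -- these names are made up and don't exist in the patch -- 
explicit entry points might look something like:

	/* Hypothetical sketch: make the provider's per-cm_id context
	 * allocation and de-allocation explicit.  On return from
	 * destroy_ctx(), the provider must guarantee that no further
	 * callbacks will reference the cm_id. */
	struct iw_cm_ctx_ops {
		void *(*create_ctx)(struct iw_cm_id *cm_id);
		void (*destroy_ctx)(struct iw_cm_id *cm_id, void *ctx);
	};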

I'm not sure if the changes below are possible without doing something to ensure 
that provider callbacks reference a valid cm_id.

- Sean


Tom Tucker wrote:
> +struct iwcm_id_private {
> +	struct iw_cm_id	id;
> +
> +	unsigned long destroy_flags;

remove destroy_flags

> +	wait_queue_head_t destroy_wait;
> +	
> +	struct list_head work_list;
> +
> +	spinlock_t lock;
> +	atomic_t refcount;
> +};
> +#define IWCM_DESTROY_F_CALLBACK   1

remove IWCM_DESTROY_F_CALLBACK

> +
> +struct iwcm_work {
> +	struct work_struct work;
> +	struct iwcm_id_private *cm_id;
> +	struct list_head list;
> +	struct iw_cm_event event;
> +};
> +
> +/* 
> + * Release a reference on cm_id. If the last reference is being removed
> + * and iw_destroy_cm_id is waiting, wake up the waiting thread.
> + */
> +static inline int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
> +{
> +	int ret = 0;
> +
> +	if (atomic_dec_and_test(&cm_id_priv->refcount)) {
> +		BUG_ON(!list_empty(&cm_id_priv->work_list));
> +		if (waitqueue_active(&cm_id_priv->destroy_wait)) {
> +			BUG_ON(cm_id_priv->id.state != IW_CM_STATE_DESTROYING);
> +			BUG_ON(test_bit(IWCM_DESTROY_F_CALLBACK,
> +					&cm_id_priv->destroy_flags));
> +			ret = 1;
> +			wake_up(&cm_id_priv->destroy_wait);
> +		}
> +	}
> +
> +	return ret;
> +}

change function to:

static void iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
	if (atomic_dec_and_test(&cm_id_priv->refcount))
		wake_up(&cm_id_priv->destroy_wait);
}

It looks like there's the potential (with the waitqueue_active() check in the 
original) to miss signaling the event if the reference count drops to 0 before 
the destroy thread has started waiting on the event.
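
For reference, the simplified pair avoids that race because wait_event() 
re-checks its condition before sleeping:

	/* waker: no waitqueue_active() test, so the wake_up() is never
	 * skipped */
	if (atomic_dec_and_test(&cm_id_priv->refcount))
		wake_up(&cm_id_priv->destroy_wait);

	/* waiter: wait_event() evaluates the condition before sleeping,
	 * so a refcount that already reached 0 is seen without needing
	 * a wakeup */
	wait_event(cm_id_priv->destroy_wait,
		   !atomic_read(&cm_id_priv->refcount));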

{snip}

> +static void destroy_cm_id(struct iw_cm_id *cm_id)

rename this function to iw_destroy_cm_id(struct iw_cm_id *cm_id)

> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	struct ib_qp *qp;
> +	int ret;
> +
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_LISTEN:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		/* destroy the listening endpoint */
> +		ret = cm_id->device->iwcm->destroy_listen(cm_id);
> +		break;
> +	case IW_CM_STATE_CONN_RECV:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		/* Need to call reject on behalf of client */
> +		(void)iw_cm_reject(cm_id, NULL, 0);
> +		break;
> +	case IW_CM_STATE_ESTABLISHED:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		qp = cm_id->qp;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		/* Abrupt close of the connection */
> +		ret = iwcm_modify_qp_err(qp);
> +		break;
> +	default:
> +		cm_id->state = IW_CM_STATE_DESTROYING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +	}
> +
> +	(void)iwcm_deref_id(iwcm_id_priv);

replace iwcm_deref_id() with:

	atomic_dec(&iwcm_id_priv->refcount);
	wait_event(iwcm_id_priv->destroy_wait,
		   !atomic_read(&iwcm_id_priv->refcount));

	while (!list_empty(&iwcm_id_priv->work_list)) {
		work = list_entry(iwcm_id_priv->work_list.next,
				  struct iwcm_work, list);
		list_del(&work->list);
		kfree(work);
	}

> +}
> +
> +/* 
> + * This function is only called by the application thread and cannot
> + * be called by the event thread. The function will wait for all
> + * references to be released on the cm_id and then kfree the cm_id
> + * object. 
> + */
> +void iw_destroy_cm_id(struct iw_cm_id *cm_id)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +
> +	destroy_cm_id(cm_id);
> +
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	wait_event(iwcm_id_priv->destroy_wait, 
> +		   !atomic_read(&iwcm_id_priv->refcount));
> +
> +	kfree(iwcm_id_priv);
> +}
> +EXPORT_SYMBOL(iw_destroy_cm_id);

replaced by the renamed function above

> +static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, 
> +				struct iw_cm_event *iw_event)
> +{
> +	unsigned long flags;
> +	struct iw_cm_id *cm_id;
> +	struct iwcm_id_private *cm_id_priv;
> +	iw_cm_handler cm_handler;
> +	void *context;
> +	struct ib_device *device;
> +	int ret;
> +
> +	/* The provider should never generate a connection request
> +	 * event with a bad status. 
> +	 */
> +	BUG_ON(iw_event->status);
> +
> +	/* We could be destroying the listening id. If so, ignore this
> +	 * upcall. */
> +	spin_lock_irqsave(&listen_id_priv->lock, flags);
> +	if (listen_id_priv->id.state != IW_CM_STATE_LISTEN) {
> +		spin_unlock_irqrestore(&listen_id_priv->lock, flags);
> +		return;
> +	}
> +	/* The listen_id can be destroyed the moment we release the
> +	 * lock, so take the state we need to inherit before releasing
> +	 * the lock 
> +	*/
> +	device = listen_id_priv->id.device;
> +	cm_handler = listen_id_priv->id.cm_handler;
> +	context = listen_id_priv->id.context;
> +	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
> +
> +	cm_id = iw_create_cm_id(device,	cm_handler, context);
> +
> +	/* If the cm_id could not be created, ignore the request */
> +	if (IS_ERR(cm_id)) 
> +		return;
> +
> +	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	cm_id_priv->id.local_addr = iw_event->local_addr;
> +	cm_id_priv->id.remote_addr = iw_event->remote_addr;
> +	cm_id_priv->id.provider_id = iw_event->provider_id;
> +	cm_id_priv->id.state = IW_CM_STATE_CONN_RECV;
> +
> +	/* Call the client CM handler */
> +	// XXX: atomic_inc(&cm_id_priv->refcount);
> +	ret = cm_id->cm_handler(cm_id, iw_event);
> +	// xxx: (void)iwcm_deref_id(cm_id_priv);

some dead code here

> +	if (ret) {
> +		set_bit(IWCM_DESTROY_F_CALLBACK, &cm_id_priv->destroy_flags);
> +		destroy_cm_id(cm_id);
> +		if (atomic_read(&cm_id_priv->refcount)==0)
> +			kfree(cm_id);

replace if fragment with:

	if (ret)
		iw_destroy_cm_id(cm_id);

> +	}
> +}
> +
> +/*
> + * Passive Side: CM_ID <-- ESTABLISHED
> + * 
> + * The provider generated an ESTABLISHED event which means that 
> + * the MPA negotiation has completed successfully and we are now in MPA
> + * FPDU mode. 
> + */
> +static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, 
> +			       struct iw_cm_event *iw_event)
> +{
> +	unsigned long flags;
> +
> +	/* If the cm_id is in the wrong state, ignore the
> +	 * request. This could happen if the app called disconnect or
> +	 * destroy. 
> +	 */
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state != IW_CM_STATE_CONN_RECV) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		return 0;
> +	}
> +
> +	/* Take a reference on behalf of the provider. This reference
> +	 * will be removed when the provider delivers the CLOSE event.
> +	 */
> +	atomic_inc(&cm_id_priv->refcount);

This seems problematic.  How do we know that the close event wasn't already 
handed to us before we entered this routine?  There has to be a cleaner way to 
deal with this.
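
One way to make this safer -- a sketch only; 'connected' would be a new field in 
iwcm_id_private, set under the lock at the same point where the ESTABLISHED 
reference is taken -- is to track the connected reference explicitly, so a CLOSE 
that arrives early (or twice) never drops a reference that was not taken:

	int drop_ref;

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	cm_id_priv->id.qp = NULL;
	/* 'connected' (hypothetical) is set when the reference is
	 * taken on the transition to ESTABLISHED */
	drop_ref = cm_id_priv->connected;
	cm_id_priv->connected = 0;
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);

	if (drop_ref)
		iwcm_deref_id(cm_id_priv);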

> +	cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	/* Call the client CM handler */
> +	return cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
> +}
> +
> +/*
> + * Active Side: CM_ID <-- ESTABLISHED
> + *
> + * Handles the reply to our connect request:
> + * - If the cm_id is in the wrong state when the event is
> + *   delivered, the event is ignored. 
> + * - If the remote peer accepted the connection, we update the 4-tuple
> + *   in the cm_id with the remote peer info, move the cm_id to the
> + *   ESTABLISHED state and deliver the event to the client.
> + * - If the remote peer rejected the connection, or there is some
> + *   connection error, move the cm_id to the IDLE state, and deliver
> + *   the bad news to the client.
> + */
> +static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, 
> +			       struct iw_cm_event *iw_event)
> +{
> +	unsigned long flags;
> +
> +	/* If the cm_id is in the wrong state, ignore the
> +	 * request. This could happen if the app called disconnect or
> +	 * destroy. 
> +	 */
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state != IW_CM_STATE_CONN_SENT) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		return 0;
> +	}
> +	if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
> +		cm_id_priv->id.local_addr = iw_event->local_addr;
> +		cm_id_priv->id.remote_addr = iw_event->remote_addr;
> +		/* Take a reference on behalf of the provider. This reference
> +		 * will be removed when the provider delivers the CLOSE event.
> +		 */
> +		atomic_inc(&cm_id_priv->refcount); 

see above

> +		cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> +	} else 
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	/* Call the client CM handler */
> +	return cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
> +}
> +
> +/*
> + * CM_ID <-- CLOSING 
> + *
> + * If in the ESTABLISHED state, move to CLOSING.
> + */
> +static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, 
> +				  struct iw_cm_event *iw_event)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state == IW_CM_STATE_ESTABLISHED)
> +		cm_id_priv->id.state = IW_CM_STATE_CLOSING;
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +}
> +
> +/*
> + * CM_ID <-- IDLE
> + *
> + * If in the ESTABLISHED or CLOSING states, the QP will have been
> + * moved by the provider to the ERR state. Disassociate the CM_ID from
> + * the QP, move to IDLE, and remove the 'connected' reference.
> + * 
> + * If in some other state, the cm_id was destroyed asynchronously.
> + * This is the last reference that will result in waking up
> + * the app thread blocked in iw_destroy_cm_id.
> + */
> +static int cm_close_handler(struct iwcm_id_private *cm_id_priv, 
> +				  struct iw_cm_event *iw_event)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +
> +	/* Disassociate the QP from the cm_id */
> +	cm_id_priv->id.qp = NULL;
> +
> +	/* Regardless of the state, we remove the close reference. It
> +	 * cannot be the last one, because the event reference has not
> +	 * yet been removed.*/
> +	(void)iwcm_deref_id(cm_id_priv);

see above

> +	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
> +
> +	switch (cm_id_priv->id.state) {
> +	case IW_CM_STATE_ESTABLISHED:
> +	case IW_CM_STATE_CLOSING:
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
> +		break;
> +	default:
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		ret = 0;
> +	}
> +
> +	return ret;
> +}
> +
> +static int process_event(struct iwcm_id_private *cm_id_priv, 
> +			 struct iw_cm_event *iw_event)
> +{
> +	int ret = 0;
> +
> +	switch (iw_event->event) {
> +	case IW_CM_EVENT_CONNECT_REQUEST:
> +		cm_conn_req_handler(cm_id_priv, iw_event);
> +		break;
> +	case IW_CM_EVENT_CONNECT_REPLY:
> +		ret = cm_conn_rep_handler(cm_id_priv, iw_event);
> +		break;
> +	case IW_CM_EVENT_ESTABLISHED:
> +		ret = cm_conn_est_handler(cm_id_priv, iw_event);
> +		break;
> +	case IW_CM_EVENT_DISCONNECT:
> +		cm_disconnect_handler(cm_id_priv, iw_event);
> +		break;
> +	case IW_CM_EVENT_CLOSE:
> +		ret = cm_close_handler(cm_id_priv, iw_event);
> +		break;
> +	default:
> +		BUG_ON(1);
> +	}
> +
> +	return ret;
> +}
> +
> +/* 
> + * Process events on the work_list for the cm_id. If the callback
> + * function requests that the cm_id be deleted, a flag is set in the
> + * cm_id destroy_flags to indicate that when the last reference is
> + * removed, the cm_id is to be destroyed. This is necessary to
> + * distinguish between an object that will be destroyed by the app
> + * thread asleep on the destroy_wait list vs. an object destroyed
> + * here synchronously when the last reference is removed.
> + */
> +static void cm_work_handler(void *arg)
> +{
> +	struct iwcm_work *work = (struct iwcm_work*)arg;
> +	struct iwcm_id_private *cm_id_priv = work->cm_id;
> +	unsigned long flags;
> +	int ret = 0;
> +
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	while (!list_empty(&cm_id_priv->work_list)) {
> +		work = list_entry(cm_id_priv->work_list.next, 
> +				  struct iwcm_work, list);
> +		list_del_init(&work->list);
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +		ret = process_event(cm_id_priv, &work->event);
> +		kfree(work);
> +		if (ret) {
> +			set_bit(IWCM_DESTROY_F_CALLBACK, 
> +				&cm_id_priv->destroy_flags);
> +			destroy_cm_id(&cm_id_priv->id);
> +		}
> +		BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
> +		if (iwcm_deref_id(cm_id_priv))
> +			return;
> +		
> +		if (atomic_read(&cm_id_priv->refcount)==0
> +		    && test_bit(IWCM_DESTROY_F_CALLBACK, 
> +				&cm_id_priv->destroy_flags)) {
> +			kfree(cm_id_priv);
> +			return;
> +		}
> +		spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	}
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +}


change the function body to:

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	while (!list_empty(&cm_id_priv->work_list)) {
		work = list_entry(cm_id_priv->work_list.next,
				  struct iwcm_work, list);
		spin_unlock_irqrestore(&cm_id_priv->lock, flags);

		ret = process_event(cm_id_priv, &work->event);
		if (ret) {
			atomic_dec(&cm_id_priv->refcount);
		iw_destroy_cm_id(&cm_id_priv->id);
		}
		spin_lock_irqsave(&cm_id_priv->lock, flags);
		list_del(&work->list);
		kfree(work);
	}
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
	iwcm_deref_id(cm_id_priv);


> +static void cm_event_handler(struct iw_cm_id *cm_id,
> +			     struct iw_cm_event *iw_event) 
> +{
> +	struct iwcm_work *work;
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +
> +	work = kmalloc(sizeof *work, GFP_ATOMIC);
> +	if (!work)
> +		return;
> +
> +	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	atomic_inc(&cm_id_priv->refcount);

move atomic_inc() from here...

> +	
> +	INIT_WORK(&work->work, cm_work_handler, work);
> +	work->cm_id = cm_id_priv;
> +	work->event = *iw_event;
> +
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (list_empty(&cm_id_priv->work_list)) {
> +		list_add_tail(&work->list, &cm_id_priv->work_list);

... to here -- to associate the reference with the queued work item, rather 
than with the event

> +		queue_work(rdma_wq, &work->work);
> +	} else
> +		list_add_tail(&work->list, &cm_id_priv->work_list);
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +}
> +
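
Putting the two comments above together, the tail of cm_event_handler() would 
read something like:

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	if (list_empty(&cm_id_priv->work_list)) {
		/* this reference belongs to the queued work item and is
		 * dropped by iwcm_deref_id() when cm_work_handler()
		 * finishes draining the list */
		atomic_inc(&cm_id_priv->refcount);
		list_add_tail(&work->list, &cm_id_priv->work_list);
		queue_work(rdma_wq, &work->work);
	} else
		list_add_tail(&work->list, &cm_id_priv->work_list);
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);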


