[openib-general] [PATCH 3/3] iWARP CM

Wed Mar 8 12:41:31 PST 2006

Tom Tucker wrote:
> +#include <rdma/ib_cache.h>
> +#include <rdma/ib_cm.h>

Is this needed?

> +#include <rdma/iw_cm.h>
> +
> +#include "cm_msgs.h"

Is this needed?

> +struct iwcm_id_private;

I think this can be removed.

> +struct iwcm_device;
> +struct iwcm_port {
> +	struct iwcm_device *iwcm_dev;
> +	struct sockaddr_in local_addr;
> +	u8 port_num;
> +};

Can the port have more than one address?

> +/* Called whenever a reference added for a cm_id */
> +static inline void iwcm_addref_id(struct iwcm_id_private *cm_id_priv)
> +{
> +	atomic_inc(&cm_id_priv->refcount);
> +}

Is a wrapper around atomic_inc() needed?

> +struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
> +				 iw_cm_handler cm_handler,
> +				 void *context)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +
> +	iwcm_id_priv = kmalloc(sizeof *iwcm_id_priv, GFP_KERNEL);
> +	if (!iwcm_id_priv)
> +		return ERR_PTR(-ENOMEM);
> +
> +	memset(iwcm_id_priv, 0, sizeof *iwcm_id_priv);

Use kzalloc().

> +	iwcm_id_priv->id.state = IW_CM_STATE_IDLE;
> +	iwcm_id_priv->id.device = device;
> +	iwcm_id_priv->id.cm_handler = cm_handler;
> +	iwcm_id_priv->id.context = context;
> +	iwcm_id_priv->id.event_handler = cm_event_handler;
> +
> +	spin_lock_init(&iwcm_id_priv->lock);
> +	init_waitqueue_head(&iwcm_id_priv->wait);
> +	atomic_set(&iwcm_id_priv->refcount, 1);
> +
> +	return &iwcm_id_priv->id;
> +
> +}
> +EXPORT_SYMBOL(iw_create_cm_id);

> +void iw_destroy_cm_id(struct iw_cm_id *cm_id)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	int ret = 0;
> +
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_LISTEN:
> +		cm_id->state = IW_CM_STATE_IDLE;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = cm_id->device->iwcm->destroy_listen(cm_id);
> +		break;
> +
> +	case IW_CM_STATE_CONN_RECV:
> +	case IW_CM_STATE_CONN_SENT:
> +	case IW_CM_STATE_ESTABLISHED:
> +		cm_id->state = IW_CM_STATE_IDLE;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = cm_id->device->iwcm->disconnect(cm_id,1);
> +		break;
> +
> +	case IW_CM_STATE_IDLE:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		break;
> +		
> +	default:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		printk(KERN_ERR "%s:%s:%u Illegal state %d for iw_cm_id.\n", 
> +		       __FILE__, __FUNCTION__, __LINE__, cm_id->state);
> +		;

random ';'

Nit: my preference would be to remove the extra blank lines after the break 
statements.

> +	}
> +
> +	atomic_dec(&iwcm_id_priv->refcount);
> +	wait_event(iwcm_id_priv->wait, !atomic_read(&iwcm_id_priv->refcount));
> +
> +	kfree(iwcm_id_priv);
> +}
> +EXPORT_SYMBOL(iw_destroy_cm_id);

> +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	int ret = 0;
> +
> +	if (cm_id->device == 0 || cm_id->device->iwcm == 0)
> +		return -EINVAL;

Use NULL in place of '0'.  Are these checks needed?
> +	
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	if (cm_id->state != IW_CM_STATE_IDLE) {
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		return -EBUSY;

-EINVAL seems like a better return code here.

> +	}
> +	cm_id->state = IW_CM_STATE_LISTEN;
> +	spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +
> +	ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
> +	if (ret != 0) 

Nit: how about just if (ret)?

> +		cm_id->state = IW_CM_STATE_IDLE;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_listen);

> +int iw_cm_getpeername(struct iw_cm_id *cm_id,
> +		      struct sockaddr_in* local_addr,
> +		      struct sockaddr_in* remote_addr)
> +{
> +	if (cm_id->device == 0)
> +		return -EINVAL;
> +
> +	if (cm_id->device->iwcm == 0)
> +		return -EINVAL;

Use NULL - are checks needed given state check below?  Is synchronization around 
this call needed, since it checks the state?
> +
> +	/* Make sure there's a connection */
> +	if (cm_id->state != IW_CM_STATE_ESTABLISHED)
> +		return -ENOTCONN;
> +
> +	return cm_id->device->iwcm->getpeername(cm_id, local_addr, remote_addr);
> +}
> +EXPORT_SYMBOL(iw_cm_getpeername);

> +int iw_cm_reject(struct iw_cm_id *cm_id,
> +		 const void *private_data,
> +		 u8 private_data_len)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	int ret;
> +
> +	if (cm_id->device == 0 || cm_id->device->iwcm == 0)
> +		return -EINVAL;

Use NULL - are checks needed?

> +		
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_CONN_RECV:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = cm_id->device->iwcm->reject(cm_id, private_data, 
> +						  private_data_len);
> +		cm_id->state = IW_CM_STATE_IDLE;

This changes the state after the lock has been released.

> +		break;
> +	default:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_reject);

> +int iw_cm_accept(struct iw_cm_id *cm_id,
> +		 const void *private_data,
> +		 u8 private_data_len)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	int ret;
> +
> +	if (cm_id->device == 0 || cm_id->device->iwcm == 0)
> +		return -EINVAL;

ditto...

> +		
> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_CONN_RECV:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = cm_id->device->iwcm->accept(cm_id, private_data, 
> +						  private_data_len);
> +		if (ret == 0) {
> +			struct iw_cm_event event;
> +			event.event = IW_CM_EVENT_ESTABLISHED;
> +			event.provider_id = cm_id->provider_id;
> +			event.status = 0;
> +			event.local_addr = cm_id->local_addr;
> +			event.remote_addr = cm_id->remote_addr;

If the cm_id contains the provider_id, local_addr, and remote_addr, does it need 
to be returned as part of the event?

> +			event.private_data = 0;
> +			event.private_data_len = 0;
> +			cm_event_handler(cm_id, &event);

Does this result in a callback being invoked from the calling thread?  If so, 
this behavior differs from other callbacks used throughout the stack.

Also, the state is left as CONN_RECV.  Is this correct?

> +		}
> +		
> +		break;
> +	default:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_accept);

> +int iw_cm_connect(struct iw_cm_id *cm_id, 
> +		  const void *pdata,
> +		  u8 pdata_len)
> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	int ret = 0;
> +	unsigned long flags;
> +
> +	if (cm_id->device == 0 || cm_id->device->iwcm == 0)
> +		return -EINVAL;

ditto...

> +	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id->state != IW_CM_STATE_IDLE) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		return -EBUSY;

-EINVAL seems like a better return code.

> +	}
> +	cm_id->state = IW_CM_STATE_CONN_SENT;
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	ret = cm_id->device->iwcm->connect(cm_id, pdata, pdata_len);
> +	if (ret != 0)

if (ret)?  -- just my preference

> +		cm_id->state = IW_CM_STATE_IDLE;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_connect);

> +int iw_cm_disconnect(struct iw_cm_id *cm_id)
> +{
> +	struct iwcm_id_private *iwcm_id_priv;
> +	unsigned long flags;
> +	int ret;
> +
> +	if (cm_id->device == 0 || cm_id->device->iwcm == 0 || cm_id->qp == 0)
> +		return -EINVAL;

ditto...

> +	iwcm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	spin_lock_irqsave(&iwcm_id_priv->lock, flags);
> +	switch (cm_id->state) {
> +	case IW_CM_STATE_ESTABLISHED:
> +		cm_id->state = IW_CM_STATE_CLOSING;
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = cm_id->device->iwcm->disconnect(cm_id, 1);
> +		if (ret == 0) {
> +			struct iw_cm_event event;
> +			event.event = IW_CM_EVENT_LLP_DISCONNECT;
> +			event.provider_id = cm_id->provider_id;
> +			event.status = 0;
> +			event.local_addr = cm_id->local_addr;
> +			event.remote_addr = cm_id->remote_addr;
> +			event.private_data = 0;
> +			event.private_data_len = 0;
> +			cm_event_handler(cm_id, &event);

See comments in iw_cm_accept().

> +		}
> +		cm_id->state = IW_CM_STATE_IDLE;

Why is state set to CLOSING above, then set it to IDLE?  There may be a valid 
reason, I'm just trying to understand why.

> +		break;
> +	default:
> +		spin_unlock_irqrestore(&iwcm_id_priv->lock, flags);
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_disconnect);
> +
> +static void iwcm_add_one(struct ib_device *device)
> +{
> +	struct iwcm_device *iwcm_dev;
> +	struct iwcm_port *port;
> +	unsigned long flags;
> +	u8 i;
> +
> +	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IWARP)
> +		return;
> +
> +	iwcm_dev = kmalloc(sizeof(*iwcm_dev) + sizeof(*port) *
> +			 device->phys_port_cnt, GFP_KERNEL);
> +	if (!iwcm_dev)
> +		return;
> +
> +	iwcm_dev->device = device;
> +
> +	for (i = 1; i <= device->phys_port_cnt; i++) {
> +		port = &iwcm_dev->port[i-1];
> +		port->iwcm_dev = iwcm_dev;
> +		port->port_num = i;
> +	}
> +
> +	ib_set_client_data(device, &iwcm_client, iwcm_dev);
> +
> +	write_lock_irqsave(&iwcm.device_lock, flags);
> +	list_add_tail(&iwcm_dev->list, &iwcm.device_list);
> +	write_unlock_irqrestore(&iwcm.device_lock, flags);
> +	return;

Can drop 'return'.

{snip}

> +static int cm_conn_req_handler(struct iwcm_work *work)

Return code is ignored.

> +{
> +	struct iw_cm_id *cm_id;
> +	struct iwcm_id_private *cm_id_priv;
> +	int rc;
> +
> +	/* If the status was not successful, ignore request */
> +	if (work->event.status) {
> +		printk(KERN_ERR "%s:%d Bad status=%d for connection request. "
> +		       "Should be filtered by provider\n",
> +		       __FUNCTION__, __LINE__,
> +		       work->event.status);
> +		return work->event.status;
> +	}

Can status really ever be non-zero here?

> +	cm_id = iw_create_cm_id(work->cm_id->id.device,	
> +				work->cm_id->id.cm_handler,
> +				work->cm_id->id.context);
> +	if (IS_ERR(cm_id)) 
> +		return PTR_ERR(cm_id);

Do we still need to kfree(work) before returning?  Actually, see next comment.

> +
> +	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	cm_id_priv->id.local_addr = work->event.local_addr;
> +	cm_id_priv->id.remote_addr = work->event.remote_addr;
> +	cm_id_priv->id.provider_id = work->event.provider_id;
> +	cm_id_priv->id.state = IW_CM_STATE_CONN_RECV;
> +
> +	/* Call the client CM handler */
> +	iwcm_addref_id(cm_id_priv);
> +	rc = cm_id->cm_handler(cm_id, &work->event);
> +	iwcm_deref_id(cm_id_priv);
> +	if (rc) {
> +		cm_id->state = IW_CM_STATE_IDLE;
> +		iw_destroy_cm_id(cm_id);
> +	}
> +	kfree(work);

Can we move the free into cm_work_handler()?

> +	return 0;
> +}
> +
> +/*
> + * Handles the transition to established state on the passive side.
> + */
> +static int cm_conn_est_handler(struct iwcm_work *work)

Return code is ignored.

> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +	int ret = 0;
> +	
> +	cm_id_priv = work->cm_id;
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state != IW_CM_STATE_CONN_RECV) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		printk(KERN_ERR "%s:%d Invalid cm_id state=%d "
> +		       "for established event\n", __FUNCTION__, __LINE__, 
> +		       cm_id_priv->id.state);
> +		ret = -EINVAL;
> +		goto error_out;
> +	}

If the user destroys the cm_id, the state could change before this callback is 
invoked.  This shouldn't be an error.

> +
> +	if (work->event.status == 0) {
> +		cm_id_priv = work->cm_id;
> +		cm_id_priv->id.local_addr = work->event.local_addr;
> +		cm_id_priv->id.remote_addr = work->event.remote_addr;
> +		cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> +	} else 
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	/* Call the client CM handler */
> +	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +	if (ret) {
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +		iw_destroy_cm_id(&cm_id_priv->id);

I think that we still have a reference on the cm_id from cm_event_handler(), 
which will cause destroy to block forever.

> +	}
> +
> + error_out:
> +	kfree(work);

Move free to cm_work_handler().

> +	return ret;
> +}
> +
> +/*
> + * Handles the reply to our connect request. There are three
> + * possibilities: 
> + * - If the cm_id is in the wrong state when the event is
> + *   delivered, the event is ignored. [What should we do when the
> + *   provider does something crazy?]

The cm_id can be in the wrong state if being destroyed.  Crazy providers just 
need to be fixed.   :)

> + * - If the remote peer accepts the connection, we update the 4-tuple
> + *   in the cm_id with the remote peer info, move the cm_id to the
> + *   ESTABLISHED state and deliver the event to the client.
> + * - If the remote peer rejects the connection, or there is some
> + *   connection error, move the cm_id to the IDLE state, and deliver
> + *   the event to the client.
> + */
> +static int cm_conn_rep_handler(struct iwcm_work *work)

Return code is ignored.

> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	unsigned long flags;
> +	int ret = 0;
> +	
> +	cm_id_priv = work->cm_id;
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	if (cm_id_priv->id.state != IW_CM_STATE_CONN_SENT) {
> +		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +		printk(KERN_ERR "%s:%d Invalid cm_id state=%d "
> +		       "for connect reply event\n", __FUNCTION__, __LINE__, 
> +	 	       cm_id_priv->id.state);
> +		ret = -EINVAL;
> +		goto error_out;
> +	}

See comment in cm_conn_est_handler.

> +
> +	if (work->event.status == 0) {
> +		cm_id_priv = work->cm_id;
> +		cm_id_priv->id.local_addr = work->event.local_addr;
> +		cm_id_priv->id.remote_addr = work->event.remote_addr;
> +		cm_id_priv->id.state = IW_CM_STATE_ESTABLISHED;
> +	} else 
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> +	/* Call the client CM handler */
> +	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +	if (ret) {
> +		cm_id_priv->id.state = IW_CM_STATE_IDLE;
> +		iw_destroy_cm_id(&cm_id_priv->id);

I think we're still holding a reference on the cm_id.

> +	}
> +
> + error_out:
> +	kfree(work);

Move free to cm_work_handler().

> +	return ret;
> +}
> +
> +static int cm_disconnect_handler(struct iwcm_work *work) 

Return value is ignored.

> +{
> +	struct iwcm_id_private *cm_id_priv;
> +	int ret = 0;
> +	
> +	cm_id_priv = work->cm_id;
> +
> +	cm_id_priv->id.state = IW_CM_STATE_IDLE;

Do we need to verify that we haven't already started to destroy the cm_id before 
changing its state?

> +
> +	/* Call the client CM handler */
> +	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->event);
> +	if (ret)
> +		iw_destroy_cm_id(&cm_id_priv->id);
> +
> +	kfree(work);

Same comments as above about free and holding reference apply here.

> +	return ret;
> +}
> +
> +static void cm_work_handler(void *arg)
> +{
> +	struct iwcm_work *work = (struct iwcm_work*)arg;
> +	struct iwcm_id_private *cm_id_priv = work->cm_id;
> +	int rc;
> +
> +	switch (work->event.event) {
> +	case IW_CM_EVENT_CONNECT_REQUEST:
> +		rc = cm_conn_req_handler(work);
> +		break;
> +	case IW_CM_EVENT_CONNECT_REPLY:
> +		rc = cm_conn_rep_handler(work);
> +		break;
> +	case IW_CM_EVENT_ESTABLISHED:
> +		rc = cm_conn_est_handler(work);
> +		break;
> +	case IW_CM_EVENT_LLP_DISCONNECT:
> +	case IW_CM_EVENT_LLP_TIMEOUT:
> +	case IW_CM_EVENT_LLP_RESET:
> +	case IW_CM_EVENT_CLOSE:
> +		rc = cm_disconnect_handler(work);
> +		break;
> +	}
> +
> +	/* The reference was done in cm_event_handler. */
> +	iwcm_deref_id(cm_id_priv);

The cm_*_handlers need to release the reference in order to process destruction. 
  That or use their return code to determine if the reference should be 
released, or if cleanup needs to be invoked.

> +/* IW CM provider event callback handler. This function is called on
> + * interrupt context. The function builds a work queue element
> + * and enqueues it for processing on a work queue thread. This allows
> + * CM client callback functions to block.
> + */
> +static void cm_event_handler(struct iw_cm_id *cm_id,
> +			     struct iw_cm_event *event) 
> +{
> +	struct iwcm_work *work;
> +	struct iwcm_id_private *cm_id_priv;
> +
> +	work = kmalloc(sizeof *work, GFP_ATOMIC);
> +	if (!work)
> +		return;
> +
> +	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> +	
> +	/* 
> +	 * Reference cm_id_priv until the work is completed in 
> +	 * cm_work_handler.
> +	 */
> +	iwcm_addref_id(cm_id_priv);
> +	INIT_WORK(&work->work, cm_work_handler, work);
> +	work->cm_id = cm_id_priv;
> +	work->event = *event;
> +	queue_work(iwcm.wq, &work->work);

Can you use the rdma_wq instead of creating a new one?  We'll need to be careful 
that we don't end up with a deadlock condition, but I think that it's safe to 
use it.

> +static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
> +				  struct ib_qp_attr *qp_attr,
> +				  int *qp_attr_mask)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	spin_lock_irqsave(&cm_id_priv->lock, flags);
> +	switch (cm_id_priv->id.state) {
> +	case IW_CM_STATE_IDLE:
> +	case IW_CM_STATE_CONN_SENT:
> +	case IW_CM_STATE_CONN_RECV:
> +	case IW_CM_STATE_ESTABLISHED:
> +		*qp_attr_mask = IB_QP_STATE;

Do you need to set what the new state should be?

> +		ret = 0;
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +	return ret;
> +}

- Sean