[openib-general] Re: [PATCH 1/2] iWARP Connection Manager.
Sean Hefty
mshefty at ichips.intel.com
Wed May 31 15:22:24 PDT 2006
Steve Wise wrote:
> +/*
> + * Release a reference on cm_id. If the last reference is being removed
> + * and iw_destroy_cm_id is waiting, wake up the waiting thread.
> + */
> +static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
> +{
> + int ret = 0;
> +
> + BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
> + if (atomic_dec_and_test(&cm_id_priv->refcount)) {
> + BUG_ON(!list_empty(&cm_id_priv->work_list));
> + if (waitqueue_active(&cm_id_priv->destroy_wait)) {
> + BUG_ON(cm_id_priv->state != IW_CM_STATE_DESTROYING);
> + BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY,
> + &cm_id_priv->flags));
> + ret = 1;
> + wake_up(&cm_id_priv->destroy_wait);
We recently changed the RDMA CM, IB CM, and a couple of other modules from using
wait objects to completions. This avoids a race condition between decrementing
the reference count, which allows destruction to proceed, and calling wake_up on
a freed cm_id. My guess is that you may need to do the same.
Can you also explain the use of the return value here? It's ignored below in
rem_ref() and destroy_cm_id().
> +static void add_ref(struct iw_cm_id *cm_id)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + atomic_inc(&cm_id_priv->refcount);
> +}
> +
> +static void rem_ref(struct iw_cm_id *cm_id)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + iwcm_deref_id(cm_id_priv);
> +}
> +
> +/*
> + * CM_ID <-- CLOSING
> + *
> + * Block if a passive or active connection is currenlty being processed. Then
> + * process the event as follows:
> + * - If we are ESTABLISHED, move to CLOSING and modify the QP state
> + * based on the abrupt flag
> + * - If the connection is already in the CLOSING or IDLE state, the peer is
> + * disconnecting concurrently with us and we've already seen the
> + * DISCONNECT event -- ignore the request and return 0
> + * - Disconnect on a listening endpoint returns -EINVAL
> + */
> +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> + int ret = 0;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + /* Wait if we're currently in a connect or accept downcall */
> + wait_event(cm_id_priv->connect_wait,
> + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
Am I understanding this check correctly? You're checking to see if the user has
called iw_cm_disconnect() at the same time that they called iw_cm_connect() or
iw_cm_accept(). Are connect / accept blocking, or are you just waiting for an
event?
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + switch (cm_id_priv->state) {
> + case IW_CM_STATE_ESTABLISHED:
> + cm_id_priv->state = IW_CM_STATE_CLOSING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + if (cm_id_priv->qp) { /* QP could be <nul> for user-mode client */
> + if (abrupt)
> + ret = iwcm_modify_qp_err(cm_id_priv->qp);
> + else
> + ret = iwcm_modify_qp_sqd(cm_id_priv->qp);
> + /*
> + * If both sides are disconnecting the QP could
> + * already be in ERR or SQD states
> + */
> + ret = 0;
> + }
> + else
> + ret = -EINVAL;
> + break;
> + case IW_CM_STATE_LISTEN:
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + ret = -EINVAL;
> + break;
> + case IW_CM_STATE_CLOSING:
> + /* remote peer closed first */
> + case IW_CM_STATE_IDLE:
> + /* accept or connect returned !0 */
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + break;
> + case IW_CM_STATE_CONN_RECV:
> + /*
> + * App called disconnect before/without calling accept after
> + * connect_request event delivered.
> + */
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + break;
> + case IW_CM_STATE_CONN_SENT:
> + /* Can only get here if wait above fails */
> + default:
> + BUG_ON(1);
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_disconnect);
> +static void destroy_cm_id(struct iw_cm_id *cm_id)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> + int ret;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + /* Wait if we're currently in a connect or accept downcall. A
> + * listening endpoint should never block here. */
> + wait_event(cm_id_priv->connect_wait,
> + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
Same question/comment as above.
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + switch (cm_id_priv->state) {
> + case IW_CM_STATE_LISTEN:
> + cm_id_priv->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + /* destroy the listening endpoint */
> + ret = cm_id->device->iwcm->destroy_listen(cm_id);
> + break;
> + case IW_CM_STATE_ESTABLISHED:
> + cm_id_priv->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + /* Abrupt close of the connection */
> + (void)iwcm_modify_qp_err(cm_id_priv->qp);
> + break;
> + case IW_CM_STATE_IDLE:
> + case IW_CM_STATE_CLOSING:
> + cm_id_priv->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + break;
> + case IW_CM_STATE_CONN_RECV:
> + /*
> + * App called destroy before/without calling accept after
> + * receiving connection request event notification.
> + */
> + cm_id_priv->state = IW_CM_STATE_DESTROYING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + break;
> + case IW_CM_STATE_CONN_SENT:
> + case IW_CM_STATE_DESTROYING:
> + default:
> + BUG_ON(1);
> + break;
> + }
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
As an alternative, you could hold the lock from above, and let the LISTEN /
ESTABLISHED state checks release and reacquire.
> + if (cm_id_priv->qp) {
> + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
> + cm_id_priv->qp = NULL;
> + }
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> + (void)iwcm_deref_id(cm_id_priv);
> +}
> +
> +/*
> + * This function is only called by the application thread and cannot
> + * be called by the event thread. The function will wait for all
> + * references to be released on the cm_id and then kfree the cm_id
> + * object.
> + */
> +void iw_destroy_cm_id(struct iw_cm_id *cm_id)
> +{
> + struct iwcm_id_private *cm_id_priv;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
> +
> + destroy_cm_id(cm_id);
> +
> + wait_event(cm_id_priv->destroy_wait,
> + !atomic_read(&cm_id_priv->refcount));
> +
> + kfree(cm_id_priv);
> +}
> +EXPORT_SYMBOL(iw_destroy_cm_id);
> +
> +/*
> + * CM_ID <-- LISTEN
> + *
> + * Start listening for connect requests. Generates one CONNECT_REQUEST
> + * event for each inbound connect request.
> + */
> +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> + int ret = 0;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + switch (cm_id_priv->state) {
> + case IW_CM_STATE_IDLE:
> + cm_id_priv->state = IW_CM_STATE_LISTEN;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
> + if (ret)
> + cm_id_priv->state = IW_CM_STATE_IDLE;
> + break;
> + default:
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + ret = -EINVAL;
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_listen);
> +
> +/*
> + * CM_ID <-- IDLE
> + *
> + * Rejects an inbound connection request. No events are generated.
> + */
> +int iw_cm_reject(struct iw_cm_id *cm_id,
> + const void *private_data,
> + u8 private_data_len)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + unsigned long flags;
> + int ret;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + wake_up_all(&cm_id_priv->connect_wait);
> + return -EINVAL;
> + }
> + cm_id_priv->state = IW_CM_STATE_IDLE;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> + ret = cm_id->device->iwcm->reject(cm_id, private_data,
> + private_data_len);
> +
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + wake_up_all(&cm_id_priv->connect_wait);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_reject);
> +
> +/*
> + * CM_ID <-- ESTABLISHED
> + *
> + * Accepts an inbound connection request and generates an ESTABLISHED
> + * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
> + * until the ESTABLISHED event is received from the provider.
> + */
This makes it sound like we're just waiting for an event.
> +int iw_cm_accept(struct iw_cm_id *cm_id,
> + struct iw_cm_conn_param *iw_param)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + struct ib_qp *qp;
> + unsigned long flags;
> + int ret;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + wake_up_all(&cm_id_priv->connect_wait);
> + return -EINVAL;
> + }
> + /* Get the ib_qp given the QPN */
> + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
> + if (!qp) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + return -EINVAL;
> + }
> + cm_id->device->iwcm->add_ref(qp);
> + cm_id_priv->qp = qp;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> + ret = cm_id->device->iwcm->accept(cm_id, iw_param);
> + if (ret) {
> + /* An error on accept precludes provider events */
> + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
> + cm_id_priv->state = IW_CM_STATE_IDLE;
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->qp) {
> + cm_id->device->iwcm->rem_ref(qp);
> + cm_id_priv->qp = NULL;
> + }
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + printk("Accept failed, ret=%d\n", ret);
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + wake_up_all(&cm_id_priv->connect_wait);
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_accept);
> +
> +/*
> + * Active Side: CM_ID <-- CONN_SENT
> + *
> + * If successful, results in the generation of a CONNECT_REPLY
> + * event. iw_cm_disconnect and iw_cm_destroy will block until the
> + * CONNECT_REPLY event is received from the provider.
> + */
> +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
> +{
> + struct iwcm_id_private *cm_id_priv;
> + int ret = 0;
> + unsigned long flags;
> + struct ib_qp *qp;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->state != IW_CM_STATE_IDLE) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + wake_up_all(&cm_id_priv->connect_wait);
> + return -EINVAL;
> + }
> +
> + /* Get the ib_qp given the QPN */
> + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
> + if (!qp) {
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + return -EINVAL;
> + }
> + cm_id->device->iwcm->add_ref(qp);
> + cm_id_priv->qp = qp;
> + cm_id_priv->state = IW_CM_STATE_CONN_SENT;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +
> + ret = cm_id->device->iwcm->connect(cm_id, iw_param);
> + if (ret) {
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->qp) {
> + cm_id->device->iwcm->rem_ref(qp);
> + cm_id_priv->qp = NULL;
> + }
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
> + cm_id_priv->state = IW_CM_STATE_IDLE;
> + printk("Connect failed, ret=%d\n", ret);
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + wake_up_all(&cm_id_priv->connect_wait);
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(iw_cm_connect);
> +
> +/*
> + * Passive Side: new CM_ID <-- CONN_RECV
> + *
> + * Handles an inbound connect request. The function creates a new
> + * iw_cm_id to represent the new connection and inherits the client
> + * callback function and other attributes from the listening parent.
> + *
> + * The work item contains a pointer to the listen_cm_id and the event. The
> + * listen_cm_id contains the client cm_handler, context and
> + * device. These are copied when the device is cloned. The event
> + * contains the new four tuple.
> + *
> + * An error on the child should not affect the parent, so this
> + * function does not return a value.
> + */
> +static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
> + struct iw_cm_event *iw_event)
> +{
> + unsigned long flags;
> + struct iw_cm_id *cm_id;
> + struct iwcm_id_private *cm_id_priv;
> + int ret;
> +
> + /* The provider should never generate a connection request
> + * event with a bad status.
> + */
> + BUG_ON(iw_event->status);
> +
> + /* We could be destroying the listening id. If so, ignore this
> + * upcall. */
> + spin_lock_irqsave(&listen_id_priv->lock, flags);
> + if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
> + spin_unlock_irqrestore(&listen_id_priv->lock, flags);
> + return;
> + }
> + spin_unlock_irqrestore(&listen_id_priv->lock, flags);
> +
> + cm_id = iw_create_cm_id(listen_id_priv->id.device,
> + listen_id_priv->id.cm_handler,
> + listen_id_priv->id.context);
> + /* If the cm_id could not be created, ignore the request */
> + if (IS_ERR(cm_id))
> + return;
> +
> + cm_id->provider_data = iw_event->provider_data;
> + cm_id->local_addr = iw_event->local_addr;
> + cm_id->remote_addr = iw_event->remote_addr;
> +
> + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
> + cm_id_priv->state = IW_CM_STATE_CONN_RECV;
> +
> + /* Call the client CM handler */
> + ret = cm_id->cm_handler(cm_id, iw_event);
> + if (ret) {
> + printk("destroying child id %p, ret=%d\n",
> + cm_id, ret);
We probably don't always want to print a message here.
> + set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
> + destroy_cm_id(cm_id);
> + if (atomic_read(&cm_id_priv->refcount)==0)
> + kfree(cm_id);
> + }
> +}
> +
> +/*
> + * Passive Side: CM_ID <-- ESTABLISHED
> + *
> + * The provider generated an ESTABLISHED event which means that
> + * the MPA negotion has completed successfully and we are now in MPA
> + * FPDU mode.
> + *
> + * This event can only be received in the CONN_RECV state. If the
> + * remote peer closed, the ESTABLISHED event would be received followed
> + * by the CLOSE event. If the app closes, it will block until we wake
> + * it up after processing this event.
> + */
> +static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
> + struct iw_cm_event *iw_event)
> +{
> + unsigned long flags;
> + int ret = 0;
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> +
> + /* We clear the CONNECT_WAIT bit here to allow the callback
> + * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
> + * from a callback handler is not allowed */
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + switch (cm_id_priv->state) {
> + case IW_CM_STATE_CONN_RECV:
> + cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
> + break;
> + default:
> + BUG_ON(1);
Can just BUG_ON the state and avoid the switch. Same comment applies below.
> + }
> + wake_up_all(&cm_id_priv->connect_wait);
> +
> + return ret;
> +}
> +
> +/*
> + * Active Side: CM_ID <-- ESTABLISHED
> + *
> + * The app has called connect and is waiting for the established event to
> + * post it's requests to the server. This event will wake up anyone
> + * blocked in iw_cm_disconnect or iw_destroy_id.
> + */
> +static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
> + struct iw_cm_event *iw_event)
> +{
> + unsigned long flags;
> + int ret = 0;
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + /* Clear the connect wait bit so a callback function calling
> + * iw_cm_disconnect will not wait and deadlock this thread */
> + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
> + switch (cm_id_priv->state) {
> + case IW_CM_STATE_CONN_SENT:
> + if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
> + cm_id_priv->id.local_addr = iw_event->local_addr;
> + cm_id_priv->id.remote_addr = iw_event->remote_addr;
> + cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
> + } else {
> + /* REJECTED or RESET */
> + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
> + cm_id_priv->qp = NULL;
> + cm_id_priv->state = IW_CM_STATE_IDLE;
> + }
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
> + break;
> + default:
> + BUG_ON(1);
> + }
> + /* Wake up waiters on connect complete */
> + wake_up_all(&cm_id_priv->connect_wait);
> +
> + return ret;
> +}
> +
> +/*
> + * CM_ID <-- CLOSING
> + *
> + * If in the ESTABLISHED state, move to CLOSING.
> + */
> +static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
> + struct iw_cm_event *iw_event)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&cm_id_priv->lock, flags);
> + if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
> + cm_id_priv->state = IW_CM_STATE_CLOSING;
> + spin_unlock_irqrestore(&cm_id_priv->lock, flags);
> +}
> +
> +/*
> + * CM_ID <-- IDLE
> + *
> + * If in the ESTBLISHED or CLOSING states, the QP will have have been
> + * moved by the provider to the ERR state. Disassociate the CM_ID from
> + * the QP, move to IDLE, and remove the 'connected' reference.
> + *
> + * If in some other state, the cm_id was destroyed asynchronously.
> + * This is the last reference that will result in waking up
> + * the app thread blocked in iw_destroy_cm_id.
> + */
> +static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
> + struct iw_cm_event *iw_event)
> +{
> + unsigned long flags;
> + int ret = 0;
> + /* TT */printk("%s:%d cm_id_priv=%p, state=%d\n",
> + __FUNCTION__, __LINE__,
> + cm_id_priv,cm_id_priv->state);
Will want to remove this.
- Sean
More information about the general
mailing list