[ofa-general] [PATCH] rdma/cm: add locking around QP accesses
Sean Hefty
sean.hefty at intel.com
Fri Oct 5 10:19:51 PDT 2007
Rick, have you had a chance to test out this patch?
- Sean
>If a user allocates a QP on an rdma_cm_id, the rdma_cm will automatically
>transition the QP through its states (RTR, RTS, error, etc.). While the
>QP state transitions are occurring, the QP itself must remain valid.
>Provide locking around the QP pointer to prevent the QP from being
>destroyed while the pointer is being accessed.
>
>This fixes an issue reported by Olaf Kirch from Oracle that resulted in
>a system crash:
>
>"An incoming connection arrives and we decide to tear down the nascent
> connection. The remote end decides to do the same. We start to shut
> down the connection, and call rdma_destroy_qp on our cm_id. ... Now
> apparently a 'connect reject' message comes in from the other host,
> and cma_ib_handler() is called with an event of IB_CM_REJ_RECEIVED.
> It calls cma_modify_qp_err, which for some odd reason tries to modify
> the exact same QP we just destroyed."
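
To make the locking scheme concrete, here is a minimal user-space sketch of
the same pattern, using pthreads in place of the kernel mutex API. The names
(struct demo_id, demo_destroy_qp, demo_modify_qp_err) are hypothetical
stand-ins rather than rdma_cm symbols: destroy clears the QP pointer under
qp_mutex, and every accessor re-checks the pointer under the same mutex, so a
late reject handler finds NULL instead of a freed QP. (A mutex rather than
the existing spinlock is presumably used because the modify-QP path can
sleep.)

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_qp { int state; };            /* stand-in for struct ib_qp */

struct demo_id {
        pthread_mutex_t qp_mutex;
        struct demo_qp *qp;               /* NULL once destroyed */
};

static void demo_destroy_qp(struct demo_id *id)
{
        pthread_mutex_lock(&id->qp_mutex);
        free(id->qp);
        id->qp = NULL;                    /* later accessors see "no QP" */
        pthread_mutex_unlock(&id->qp_mutex);
}

static int demo_modify_qp_err(struct demo_id *id)
{
        int ret = 0;

        pthread_mutex_lock(&id->qp_mutex);
        if (!id->qp)                      /* QP already gone: nothing to do */
                goto out;
        id->qp->state = -1;               /* stands in for ib_modify_qp(..., IB_QPS_ERR) */
out:
        pthread_mutex_unlock(&id->qp_mutex);
        return ret;
}

int main(void)
{
        struct demo_id id = { PTHREAD_MUTEX_INITIALIZER, NULL };

        id.qp = calloc(1, sizeof(*id.qp));
        demo_destroy_qp(&id);
        /* The "late REJ" path from the report above: now harmless. */
        printf("modify after destroy returned %d\n", demo_modify_qp_err(&id));
        return 0;
}
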
>
>Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>---
>Rick, can you please test this patch and let me know if it fixes your problem?
>
> drivers/infiniband/core/cma.c | 90 +++++++++++++++++++++++++++--------------
> 1 files changed, 60 insertions(+), 30 deletions(-)
>
>diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
>index 9ffb998..c6a6dba 100644
>--- a/drivers/infiniband/core/cma.c
>+++ b/drivers/infiniband/core/cma.c
>@@ -120,6 +120,8 @@ struct rdma_id_private {
>
> enum cma_state state;
> spinlock_t lock;
>+ struct mutex qp_mutex;
>+
> struct completion comp;
> atomic_t refcount;
> wait_queue_head_t wait_remove;
>@@ -387,6 +389,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
> id_priv->id.event_handler = event_handler;
> id_priv->id.ps = ps;
> spin_lock_init(&id_priv->lock);
>+ mutex_init(&id_priv->qp_mutex);
> init_completion(&id_priv->comp);
> atomic_set(&id_priv->refcount, 1);
> init_waitqueue_head(&id_priv->wait_remove);
>@@ -472,61 +475,86 @@ EXPORT_SYMBOL(rdma_create_qp);
>
> void rdma_destroy_qp(struct rdma_cm_id *id)
> {
>- ib_destroy_qp(id->qp);
>+ struct rdma_id_private *id_priv;
>+
>+ id_priv = container_of(id, struct rdma_id_private, id);
>+ mutex_lock(&id_priv->qp_mutex);
>+ ib_destroy_qp(id_priv->id.qp);
>+ id_priv->id.qp = NULL;
>+ mutex_unlock(&id_priv->qp_mutex);
> }
> EXPORT_SYMBOL(rdma_destroy_qp);
>
>-static int cma_modify_qp_rtr(struct rdma_cm_id *id)
>+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
> {
> struct ib_qp_attr qp_attr;
> int qp_attr_mask, ret;
>
>- if (!id->qp)
>- return 0;
>+ mutex_lock(&id_priv->qp_mutex);
>+ if (!id_priv->id.qp) {
>+ ret = 0;
>+ goto out;
>+ }
>
> /* Need to update QP attributes from default values. */
> qp_attr.qp_state = IB_QPS_INIT;
>- ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
> if (ret)
>- return ret;
>+ goto out;
>
>- ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
> if (ret)
>- return ret;
>+ goto out;
>
> qp_attr.qp_state = IB_QPS_RTR;
>- ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
> if (ret)
>- return ret;
>+ goto out;
>
>- return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>+out:
>+ mutex_unlock(&id_priv->qp_mutex);
>+ return ret;
> }
>
>-static int cma_modify_qp_rts(struct rdma_cm_id *id)
>+static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
> {
> struct ib_qp_attr qp_attr;
> int qp_attr_mask, ret;
>
>- if (!id->qp)
>- return 0;
>+ mutex_lock(&id_priv->qp_mutex);
>+ if (!id_priv->id.qp) {
>+ ret = 0;
>+ goto out;
>+ }
>
> qp_attr.qp_state = IB_QPS_RTS;
>- ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
> if (ret)
>- return ret;
>+ goto out;
>
>- return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>+out:
>+ mutex_unlock(&id_priv->qp_mutex);
>+ return ret;
> }
>
>-static int cma_modify_qp_err(struct rdma_cm_id *id)
>+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
> {
> struct ib_qp_attr qp_attr;
>+ int ret;
>
>- if (!id->qp)
>- return 0;
>+ mutex_lock(&id_priv->qp_mutex);
>+ if (!id_priv->id.qp) {
>+ ret = 0;
>+ goto out;
>+ }
>
> qp_attr.qp_state = IB_QPS_ERR;
>- return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
>+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
>+out:
>+ mutex_unlock(&id_priv->qp_mutex);
>+ return ret;
> }
>
> static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
>@@ -855,11 +883,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
> {
> int ret;
>
>- ret = cma_modify_qp_rtr(&id_priv->id);
>+ ret = cma_modify_qp_rtr(id_priv);
> if (ret)
> goto reject;
>
>- ret = cma_modify_qp_rts(&id_priv->id);
>+ ret = cma_modify_qp_rts(id_priv);
> if (ret)
> goto reject;
>
>@@ -869,7 +897,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
>
> return 0;
> reject:
>- cma_modify_qp_err(&id_priv->id);
>+ cma_modify_qp_err(id_priv);
> ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
> NULL, 0, NULL, 0);
> return ret;
>@@ -945,7 +973,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
> /* ignore event */
> goto out;
> case IB_CM_REJ_RECEIVED:
>- cma_modify_qp_err(&id_priv->id);
>+ cma_modify_qp_err(id_priv);
> event.status = ib_event->param.rej_rcvd.reason;
> event.event = RDMA_CM_EVENT_REJECTED;
> event.param.conn.private_data = ib_event->private_data;
>@@ -2236,7 +2264,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
> sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
> cm_id->remote_addr = *sin;
>
>- ret = cma_modify_qp_rtr(&id_priv->id);
>+ ret = cma_modify_qp_rtr(id_priv);
> if (ret)
> goto out;
>
>@@ -2303,7 +2331,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
> int qp_attr_mask, ret;
>
> if (id_priv->id.qp) {
>- ret = cma_modify_qp_rtr(&id_priv->id);
>+ ret = cma_modify_qp_rtr(id_priv);
> if (ret)
> goto out;
>
>@@ -2342,7 +2370,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
> struct iw_cm_conn_param iw_param;
> int ret;
>
>- ret = cma_modify_qp_rtr(&id_priv->id);
>+ ret = cma_modify_qp_rtr(id_priv);
> if (ret)
> return ret;
>
>@@ -2414,7 +2442,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
>
> return 0;
> reject:
>- cma_modify_qp_err(id);
>+ cma_modify_qp_err(id_priv);
> rdma_reject(id, NULL, 0);
> return ret;
> }
>@@ -2484,7 +2512,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
>
> switch (rdma_node_get_transport(id->device->node_type)) {
> case RDMA_TRANSPORT_IB:
>- ret = cma_modify_qp_err(id);
>+ ret = cma_modify_qp_err(id_priv);
> if (ret)
> goto out;
> /* Initiate or respond to a disconnect. */
>@@ -2515,9 +2543,11 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
> cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
> return 0;
>
>+ mutex_lock(&id_priv->qp_mutex);
> if (!status && id_priv->id.qp)
> status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
> multicast->rec.mlid);
>+ mutex_unlock(&id_priv->qp_mutex);
>
> memset(&event, 0, sizeof event);
> event.status = status;
>