[ofa-general] [PATCH] rdma/cm: add locking around QP accesses

Sean Hefty sean.hefty at intel.com
Fri Oct 5 10:19:51 PDT 2007


Rick, have you had a chance to test out this patch?

- Sean


>If a user allocates a QP on an rdma_cm_id, the rdma_cm will automatically
>transition the QP through its states (RTR, RTS, error, etc.)  While the
>QP state transitions are occurring, the QP itself must remain valid.
>Provide locking around the QP pointer to prevent its destruction while
>accessing the pointer.
>
>This fixes an issue reported by Olaf Kirch from Oracle that resulted in
>a system crash:
>
>"An incoming connection arrives and we decide to tear down the nascent
> connection.  The remote ends decides to do the same.  We start to shut
> down the connection, and call rdma_destroy_qp on our cm_id. ... Now
> apparently a 'connect reject' message comes in from the other host,
> and cma_ib_handler() is called with an event of IB_CM_REJ_RECEIVED.
> It calls cma_modify_qp_err, which for some odd reason tries to modify
> the exact same QP we just destroyed."
>
>Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>---
>Rick, can you please test this patch and let me know if it fixes your problem?
>
> drivers/infiniband/core/cma.c |   90 +++++++++++++++++++++++++++--------------
> 1 files changed, 60 insertions(+), 30 deletions(-)
>
>diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
>index 9ffb998..c6a6dba 100644
>--- a/drivers/infiniband/core/cma.c
>+++ b/drivers/infiniband/core/cma.c
>@@ -120,6 +120,8 @@ struct rdma_id_private {
>
> 	enum cma_state		state;
> 	spinlock_t		lock;
>+	struct mutex		qp_mutex;
>+
> 	struct completion	comp;
> 	atomic_t		refcount;
> 	wait_queue_head_t	wait_remove;
>@@ -387,6 +389,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler
>event_handler,
> 	id_priv->id.event_handler = event_handler;
> 	id_priv->id.ps = ps;
> 	spin_lock_init(&id_priv->lock);
>+	mutex_init(&id_priv->qp_mutex);
> 	init_completion(&id_priv->comp);
> 	atomic_set(&id_priv->refcount, 1);
> 	init_waitqueue_head(&id_priv->wait_remove);
>@@ -472,61 +475,86 @@ EXPORT_SYMBOL(rdma_create_qp);
>
> void rdma_destroy_qp(struct rdma_cm_id *id)
> {
>-	ib_destroy_qp(id->qp);
>+	struct rdma_id_private *id_priv;
>+
>+	id_priv = container_of(id, struct rdma_id_private, id);
>+	mutex_lock(&id_priv->qp_mutex);
>+	ib_destroy_qp(id_priv->id.qp);
>+	id_priv->id.qp = NULL;
>+	mutex_unlock(&id_priv->qp_mutex);
> }
> EXPORT_SYMBOL(rdma_destroy_qp);
>
>-static int cma_modify_qp_rtr(struct rdma_cm_id *id)
>+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
> {
> 	struct ib_qp_attr qp_attr;
> 	int qp_attr_mask, ret;
>
>-	if (!id->qp)
>-		return 0;
>+	mutex_lock(&id_priv->qp_mutex);
>+	if (!id_priv->id.qp) {
>+		ret = 0;
>+		goto out;
>+	}
>
> 	/* Need to update QP attributes from default values. */
> 	qp_attr.qp_state = IB_QPS_INIT;
>-	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
> 	if (ret)
>-		return ret;
>+		goto out;
>
>-	ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
> 	if (ret)
>-		return ret;
>+		goto out;
>
> 	qp_attr.qp_state = IB_QPS_RTR;
>-	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
> 	if (ret)
>-		return ret;
>+		goto out;
>
>-	return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>+out:
>+	mutex_unlock(&id_priv->qp_mutex);
>+	return ret;
> }
>
>-static int cma_modify_qp_rts(struct rdma_cm_id *id)
>+static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
> {
> 	struct ib_qp_attr qp_attr;
> 	int qp_attr_mask, ret;
>
>-	if (!id->qp)
>-		return 0;
>+	mutex_lock(&id_priv->qp_mutex);
>+	if (!id_priv->id.qp) {
>+		ret = 0;
>+		goto out;
>+	}
>
> 	qp_attr.qp_state = IB_QPS_RTS;
>-	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
>+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
> 	if (ret)
>-		return ret;
>+		goto out;
>
>-	return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
>+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
>+out:
>+	mutex_unlock(&id_priv->qp_mutex);
>+	return ret;
> }
>
>-static int cma_modify_qp_err(struct rdma_cm_id *id)
>+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
> {
> 	struct ib_qp_attr qp_attr;
>+	int ret;
>
>-	if (!id->qp)
>-		return 0;
>+	mutex_lock(&id_priv->qp_mutex);
>+	if (!id_priv->id.qp) {
>+		ret = 0;
>+		goto out;
>+	}
>
> 	qp_attr.qp_state = IB_QPS_ERR;
>-	return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
>+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
>+out:
>+	mutex_unlock(&id_priv->qp_mutex);
>+	return ret;
> }
>
> static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
>@@ -855,11 +883,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
> {
> 	int ret;
>
>-	ret = cma_modify_qp_rtr(&id_priv->id);
>+	ret = cma_modify_qp_rtr(id_priv);
> 	if (ret)
> 		goto reject;
>
>-	ret = cma_modify_qp_rts(&id_priv->id);
>+	ret = cma_modify_qp_rts(id_priv);
> 	if (ret)
> 		goto reject;
>
>@@ -869,7 +897,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
>
> 	return 0;
> reject:
>-	cma_modify_qp_err(&id_priv->id);
>+	cma_modify_qp_err(id_priv);
> 	ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
> 		       NULL, 0, NULL, 0);
> 	return ret;
>@@ -945,7 +973,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct
>ib_cm_event *ib_event)
> 		/* ignore event */
> 		goto out;
> 	case IB_CM_REJ_RECEIVED:
>-		cma_modify_qp_err(&id_priv->id);
>+		cma_modify_qp_err(id_priv);
> 		event.status = ib_event->param.rej_rcvd.reason;
> 		event.event = RDMA_CM_EVENT_REJECTED;
> 		event.param.conn.private_data = ib_event->private_data;
>@@ -2236,7 +2264,7 @@ static int cma_connect_iw(struct rdma_id_private
>*id_priv,
> 	sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
> 	cm_id->remote_addr = *sin;
>
>-	ret = cma_modify_qp_rtr(&id_priv->id);
>+	ret = cma_modify_qp_rtr(id_priv);
> 	if (ret)
> 		goto out;
>
>@@ -2303,7 +2331,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
> 	int qp_attr_mask, ret;
>
> 	if (id_priv->id.qp) {
>-		ret = cma_modify_qp_rtr(&id_priv->id);
>+		ret = cma_modify_qp_rtr(id_priv);
> 		if (ret)
> 			goto out;
>
>@@ -2342,7 +2370,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
> 	struct iw_cm_conn_param iw_param;
> 	int ret;
>
>-	ret = cma_modify_qp_rtr(&id_priv->id);
>+	ret = cma_modify_qp_rtr(id_priv);
> 	if (ret)
> 		return ret;
>
>@@ -2414,7 +2442,7 @@ int rdma_accept(struct rdma_cm_id *id, struct
>rdma_conn_param *conn_param)
>
> 	return 0;
> reject:
>-	cma_modify_qp_err(id);
>+	cma_modify_qp_err(id_priv);
> 	rdma_reject(id, NULL, 0);
> 	return ret;
> }
>@@ -2484,7 +2512,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
>
> 	switch (rdma_node_get_transport(id->device->node_type)) {
> 	case RDMA_TRANSPORT_IB:
>-		ret = cma_modify_qp_err(id);
>+		ret = cma_modify_qp_err(id_priv);
> 		if (ret)
> 			goto out;
> 		/* Initiate or respond to a disconnect. */
>@@ -2515,9 +2543,11 @@ static int cma_ib_mc_handler(int status, struct
>ib_sa_multicast *multicast)
> 	    cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
> 		return 0;
>
>+	mutex_lock(&id_priv->qp_mutex);
> 	if (!status && id_priv->id.qp)
> 		status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
> 					 multicast->rec.mlid);
>+	mutex_unlock(&id_priv->qp_mutex);
>
> 	memset(&event, 0, sizeof event);
> 	event.status = status;
>
>_______________________________________________
>general mailing list
>general at lists.openfabrics.org
>http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
>To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



More information about the general mailing list