[openib-general] [PATCH] fix 2 race conditions in ib_destroy_cm_id

Michael S. Tsirkin mst at mellanox.co.il
Sun May 7 08:54:47 PDT 2006


Fix two issues in CM.
1. crash if cm id is destroyed from handler because of non-0 return code,
   and at the same time from user thread by direct call to ib_destroy_cm_id.
2. use after free if ib_destroy_cm_id tests the refcount after cm_deref_id has
   decremented the reference count but before it has called wake_up.

I'm sure the first one has caused crashes for me, and I suspect
the second one caused a system hang.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: linux-2.6.16/drivers/infiniband/core/cm.c
===================================================================
--- linux-2.6.16.orig/drivers/infiniband/core/cm.c	2006-05-07 20:41:38.000000000 +0300
+++ linux-2.6.16/drivers/infiniband/core/cm.c	2006-05-07 20:43:28.000000000 +0300
@@ -159,8 +159,12 @@ static void cm_work_handler(void *data);
 
 static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (atomic_dec_and_test(&cm_id_priv->refcount))
 		wake_up(&cm_id_priv->wait);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 }
 
 static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
@@ -320,13 +324,22 @@ static int cm_alloc_id(struct cm_id_priv
 	return ret;
 }
 
-static void cm_free_id(__be32 local_id)
+static int cm_free_id(__be32 local_id)
 {
+	struct cm_id_private *cm_id_priv;
 	unsigned long flags;
+	int rc;
 
 	spin_lock_irqsave(&cm.lock, flags);
-	idr_remove(&cm.local_id_table, (__force int) local_id);
+	cm_id_priv = idr_find(&cm.local_id_table, (__force int) local_id);
+	if (unlikely(!cm_id_priv))
+		rc = -EINVAL;
+	else {
+		rc = 0;
+		idr_remove(&cm.local_id_table, (__force int) local_id);
+	}
 	spin_unlock_irqrestore(&cm.lock, flags);
+	return rc;
 }
 
 static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
@@ -710,11 +723,12 @@ static void cm_reset_to_idle(struct cm_i
 	}
 }
 
-void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+static void __ib_destroy_cm_id(struct ib_cm_id *cm_id, int flush)
 {
 	struct cm_id_private *cm_id_priv;
 	struct cm_work *work;
 	unsigned long flags;
+	int rc;
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 retest:
@@ -775,15 +789,32 @@ retest:
 		break;
 	}
 
-	cm_free_id(cm_id->local_id);
-	atomic_dec(&cm_id_priv->refcount);
-	wait_event(cm_id_priv->wait, !atomic_read(&cm_id_priv->refcount));
+	rc = cm_free_id(cm_id->local_id);
+	cm_deref_id(cm_id_priv);
+	if (unlikely(rc) && flush) {
+		/* A handler has removed the id from the idr.
+		   Make sure it is not still running. */
+		flush_workqueue(&cm.wq);
+		return;
+	}
+
+	wait_event(cm_id_priv->wait, ({
+				      spin_lock_irq(&cm_id_priv->lock);
+				      rc = !atomic_read(&cm_id_priv->refcount)
+				      spin_unlock_irq(&cm_id_priv->lock);
+				      rc;
+				      }));
 	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
 		cm_free_work(work);
 	kfree(cm_id_priv->compare_data);
 	kfree(cm_id_priv->private_data);
 	kfree(cm_id_priv);
 }
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+	__ib_destroy_cm_id(cm_id, 1);
+}
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
 int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
@@ -1171,8 +1202,8 @@ static void cm_process_work(struct cm_id
 		cm_free_work(work);
 	}
 	cm_deref_id(cm_id_priv);
-	if (ret)
-		ib_destroy_cm_id(&cm_id_priv->id);
+	if (ret) /* don't flush: we are running from cm.wq */
+		__ib_destroy_cm_id(&cm_id_priv->id, 0);
 }
 
 static void cm_format_mra(struct cm_mra_msg *mra_msg,

-- 
MST



More information about the general mailing list