[ofa-general] [PATCH] ib_core: avoid race condition between sysfs access and low-level module unload

Jack Morgenstein jackm at dev.mellanox.co.il
Sun Feb 22 08:04:21 PST 2009


In newer kernels, a low-level module will not be unloaded
while its sysfs interface is being accessed, so its code pages will be available
for the sysfs access. However, nothing prevents the low-level module from freeing
its memory resources during such access.  This can cause a kernel Oops.

To avoid this, we protect the device reg_state with a mutex, and perform
all sysfs operations (show, store) atomically within this mutex by locking the
mutex, testing whether the device is still "alive", and only if it is, invoking
low-level module functions -- and finally, freeing the mutex.

Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>

---

Roland,
I think this patch is a reasonable solution to the sysfs problem of a low-level
driver module being unloaded while sysfs is being accessed for the device.

ib_unregister_device() is always called before the device driver frees up its
resources.  Since this patch makes sysfs accesses atomic wrt the device registration
state, it solves the problem of the race between freeing device resources and
accessing the low-level to retrieve device data.

(I ran checkpatch.pl on this, and I do have several lines slightly more than
 80 chars long -- but that's all).

Jack

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7913b80..6254202 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -172,9 +172,14 @@ static int end_port(struct ib_device *device)
  */
 struct ib_device *ib_alloc_device(size_t size)
 {
+	struct ib_device *ibdev;
+
 	BUG_ON(size < sizeof (struct ib_device));
 
-	return kzalloc(size, GFP_KERNEL);
+	ibdev = kzalloc(size, GFP_KERNEL);
+	if (ibdev)
+		mutex_init(&ibdev->sysfs_mutex);
+	return ibdev;
 }
 EXPORT_SYMBOL(ib_alloc_device);
 
@@ -305,9 +310,10 @@ int ib_register_device(struct ib_device *device)
 		goto out;
 	}
 
+	mutex_lock(&device->sysfs_mutex);
 	list_add_tail(&device->core_list, &device_list);
-
 	device->reg_state = IB_DEV_REGISTERED;
+	mutex_unlock(&device->sysfs_mutex);
 
 	{
 		struct ib_client *client;
@@ -353,7 +359,9 @@ void ib_unregister_device(struct ib_device *device)
 		kfree(context);
 	spin_unlock_irqrestore(&device->client_data_lock, flags);
 
+	mutex_lock(&device->sysfs_mutex);
 	device->reg_state = IB_DEV_UNREGISTERED;
+	mutex_unlock(&device->sysfs_mutex);
 }
 EXPORT_SYMBOL(ib_unregister_device);
 
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index b43f7d3..29f0ce1 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -94,7 +94,7 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
 			  char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
+	ssize_t ret = -ENODEV;
 
 	static const char *state_name[] = {
 		[IB_PORT_NOP]		= "NOP",
@@ -105,26 +105,33 @@ static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
 		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
 	};
 
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%d: %s\n", attr.state,
-		       attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
-		       state_name[attr.state] : "UNKNOWN");
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%d: %s\n", attr.state,
+				      attr.state >= 0 &&
+				      attr.state < ARRAY_SIZE(state_name) ?
+				      state_name[attr.state] : "UNKNOWN");
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
 			char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%x\n", attr.lid);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "0x%x\n", attr.lid);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t lid_mask_count_show(struct ib_port *p,
@@ -132,52 +139,64 @@ static ssize_t lid_mask_count_show(struct ib_port *p,
 				   char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "%d\n", attr.lmc);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%d\n", attr.lmc);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
 			   char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%x\n", attr.sm_lid);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "0x%x\n", attr.sm_lid);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
 			  char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "%d\n", attr.sm_sl);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%d\n", attr.sm_sl);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
 			     char *buf)
 {
 	struct ib_port_attr attr;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret)
+			ret = sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
@@ -186,24 +205,33 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
 	struct ib_port_attr attr;
 	char *speed = "";
 	int rate;
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	switch (attr.active_speed) {
-	case 2: speed = " DDR"; break;
-	case 4: speed = " QDR"; break;
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret) {
+			switch (attr.active_speed) {
+			case 2: speed = " DDR"; break;
+			case 4: speed = " QDR"; break;
+			}
+
+			rate = 25 * ib_width_enum_to_int(attr.active_width) *
+				attr.active_speed;
+			if (rate < 0) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+				      rate / 10, rate % 10 ? ".5" : "",
+				      ib_width_enum_to_int(attr.active_width),
+				      speed);
+		}
 	}
-
-	rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
-	if (rate < 0)
-		return -EINVAL;
-
-	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
-		       rate / 10, rate % 10 ? ".5" : "",
-		       ib_width_enum_to_int(attr.active_width), speed);
+out:
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
@@ -211,22 +239,26 @@ static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
 {
 	struct ib_port_attr attr;
 
-	ssize_t ret;
-
-	ret = ib_query_port(p->ibdev, p->port_num, &attr);
-	if (ret)
-		return ret;
-
-	switch (attr.phys_state) {
-	case 1:  return sprintf(buf, "1: Sleep\n");
-	case 2:  return sprintf(buf, "2: Polling\n");
-	case 3:  return sprintf(buf, "3: Disabled\n");
-	case 4:  return sprintf(buf, "4: PortConfigurationTraining\n");
-	case 5:  return sprintf(buf, "5: LinkUp\n");
-	case 6:  return sprintf(buf, "6: LinkErrorRecovery\n");
-	case 7:  return sprintf(buf, "7: Phy Test\n");
-	default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_port(p->ibdev, p->port_num, &attr);
+		if (!ret) {
+			switch (attr.phys_state) {
+			case 1:  ret = sprintf(buf, "1: Sleep\n");
+			case 2:  ret = sprintf(buf, "2: Polling\n");
+			case 3:  ret = sprintf(buf, "3: Disabled\n");
+			case 4:  ret = sprintf(buf, "4: PortConfigurationTraining\n");
+			case 5:  ret = sprintf(buf, "5: LinkUp\n");
+			case 6:  ret = sprintf(buf, "6: LinkErrorRecovery\n");
+			case 7:  ret = sprintf(buf, "7: Phy Test\n");
+			default: ret = sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+			}
+		}
 	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static PORT_ATTR_RO(state);
@@ -256,13 +288,16 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
 	struct port_table_attribute *tab_attr =
 		container_of(attr, struct port_table_attribute, attr);
 	union ib_gid gid;
-	ssize_t ret;
-
-	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "%pI6\n", gid.raw);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+		if (!ret)
+			ret = sprintf(buf, "%pI6\n", gid.raw);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
@@ -271,13 +306,16 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
 	struct port_table_attribute *tab_attr =
 		container_of(attr, struct port_table_attribute, attr);
 	u16 pkey;
-	ssize_t ret;
-
-	ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
-	if (ret)
-		return ret;
+	ssize_t ret = -ENODEV;
 
-	return sprintf(buf, "0x%04x\n", pkey);
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+		if (!ret)
+			ret = sprintf(buf, "0x%04x\n", pkey);
+	}
+	mutex_unlock(&p->ibdev->sysfs_mutex);
+	return ret;
 }
 
 #define PORT_PMA_ATTR(_name, _counter, _width, _offset)			\
@@ -300,6 +338,12 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
 	if (!p->ibdev->process_mad)
 		return sprintf(buf, "N/A (no PMA)\n");
 
+	mutex_lock(&p->ibdev->sysfs_mutex);
+	if (ibdev_is_alive(p->ibdev)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
 	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
 	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
 	if (!in_mad || !out_mad) {
@@ -346,7 +390,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
 out:
 	kfree(in_mad);
 	kfree(out_mad);
-
+	mutex_unlock(&p->ibdev->sysfs_mutex);
 	return ret;
 }
 
@@ -579,20 +623,20 @@ static ssize_t show_sys_image_guid(struct device *device,
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 	struct ib_device_attr attr;
-	ssize_t ret;
-
-	if (!ibdev_is_alive(dev))
-		return -ENODEV;
-
-	ret = ib_query_device(dev, &attr);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
-		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&dev->sysfs_mutex);
+	if (ibdev_is_alive(dev)) {
+		ret = ib_query_device(dev, &attr);
+		if (!ret)
+			ret = sprintf(buf, "%04x:%04x:%04x:%04x\n",
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
+				      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+	}
+	mutex_unlock(&dev->sysfs_mutex);
+	return ret;
 }
 
 static ssize_t show_node_guid(struct device *device,
@@ -624,17 +668,20 @@ static ssize_t set_node_desc(struct device *device,
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 	struct ib_device_modify desc = {};
-	int ret;
+	int ret = -ENODEV;
 
 	if (!dev->modify_device)
 		return -EIO;
 
 	memcpy(desc.node_desc, buf, min_t(int, count, 64));
-	ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
-	if (ret)
-		return ret;
-
-	return count;
+	mutex_lock(&dev->sysfs_mutex);
+	if (ibdev_is_alive(dev)) {
+		ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+		if (!ret)
+			ret = count;
+	}
+	mutex_unlock(&dev->sysfs_mutex);
+	return ret;
 }
 
 static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
@@ -662,14 +709,18 @@ static ssize_t show_protocol_stat(const struct device *device,
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 	union rdma_protocol_stats stats;
-	ssize_t ret;
-
-	ret = dev->get_protocol_stats(dev, &stats);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%llu\n",
-		       (unsigned long long) ((u64 *) &stats)[offset]);
+	ssize_t ret = -ENODEV;
+
+	mutex_lock(&dev->sysfs_mutex);
+	if (ibdev_is_alive(dev)) {
+		ret = dev->get_protocol_stats(dev, &stats);
+		if (!ret)
+			ret = sprintf(buf, "%llu\n",
+				      (unsigned long long)
+				      ((u64 *) &stats)[offset]);
+	}
+	mutex_unlock(&dev->sysfs_mutex);
+	return ret;
 }
 
 /* generate a read-only iwarp statistics attribute */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 936e333..3b2768c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -47,6 +47,7 @@
 #include <linux/list.h>
 #include <linux/rwsem.h>
 #include <linux/scatterlist.h>
+#include <linux/mutex.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -1143,6 +1144,7 @@ struct ib_device {
 		IB_DEV_REGISTERED,
 		IB_DEV_UNREGISTERED
 	}                            reg_state;
+	struct mutex		     sysfs_mutex;
 
 	u64			     uverbs_cmd_mask;
 	int			     uverbs_abi_ver;



More information about the general mailing list