[ofa-general] [RFC V3 PATCH 4/5] rdma/cma: implement RDMA_CM_EVENT_NETDEV_CHANGE notification

Or Gerlitz ogerlitz at voltaire.com
Tue May 27 08:51:34 PDT 2008


The RDMA_CM_EVENT_NETDEV_CHANGE event can be used by rdma-cm consumers that wish
to have their RDMA sessions always use the same links (e.g. <hca/port>) as the
IP stack does. In the current code, this does not happen when bonding is used
and a fail-over has occurred while the IB link used by an already existing
session keeps operating fine.

Use the netdevice notification to sense that a change has happened in the IP stack,
then scan the rdma-cm ID list for IDs that are "misaligned" with the IP stack in
that respect, and deliver RDMA_CM_EVENT_NETDEV_CHANGE for each such ID.
The user can act on the event or just ignore it.

Signed-off-by: Or Gerlitz <ogerlitz at voltaire.com>

This patch should be applied on top of the previous patch ("simplify locking needed
for serialization of callbacks") and the first two patches of the series I have posted,
which remain unchanged at this point:

[RFC v2 PATCH 1/5] net/bonding: announce fail-over for the active-backup mode
http://lists.openfabrics.org/pipermail/general/2008-May/050285.html

[RFC v2 PATCH 2/5] rdma/addr: keep the name of the netdevice in struct rdma_dev_addr
http://lists.openfabrics.org/pipermail/general/2008-May/050286.html

main changes from v2 -

- took the approach of unconditionally notifying the user
- use the handler_mutex of the ID to serialize with other callbacks

As for the locking issues, the double loop in cma_netdev_callback() is still
wrapped with the rdma-cm global mutex held.

The loop on devices has to be under this lock because the device removal code
in cma_remove_one() removes the device from the global linked list of devices
this code loops on.

The loop on IDs has to be under this lock because the device removal code in
cma_process_remove() removes IDs from the device ID list this code loops on.

Index: linux-2.6.26-rc3/drivers/infiniband/core/cma.c
===================================================================
--- linux-2.6.26-rc3.orig/drivers/infiniband/core/cma.c	2008-05-27 13:46:48.000000000 +0300
+++ linux-2.6.26-rc3/drivers/infiniband/core/cma.c	2008-05-27 13:46:58.000000000 +0300
@@ -164,6 +164,12 @@ struct cma_work {
 	struct rdma_cm_event	event;
 };

+struct cma_ndev_work {	/* deferred netdev-change notification for one rdma-cm ID */
+	struct work_struct	work;	/* run by cma_ndev_work_handler() on cma_wq */
+	struct rdma_id_private	*id;	/* ID to notify; its refcount is bumped when queued */
+	struct rdma_cm_event	event;	/* event passed to the ID's event_handler */
+};
+
 union cma_ip_addr {
 	struct in6_addr ip6;
 	struct {
@@ -1601,6 +1607,26 @@ out:
 	kfree(work);
 }

+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+	struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+	/* Work handler: hand work->event to the ID's event_handler callback. */
+	mutex_lock(&id_priv->handler_mutex);
+	/* A non-zero return from the handler makes us destroy the ID below. */
+	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		destroy = 1;
+	}
+	/* NOTE(review): no mutex_unlock(handler_mutex) is visible here; presumably cma_enable_remove() releases it -- confirm. */
+	cma_enable_remove(id_priv);
+	cma_deref_id(id_priv);	/* presumably drops the reference taken when the work was queued */
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
 static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
 {
 	struct rdma_route *route = &id_priv->id.route;
@@ -2726,6 +2752,89 @@ void rdma_leave_multicast(struct rdma_cm
 }
 EXPORT_SYMBOL(rdma_leave_multicast);

+/*
+ * Queue an RDMA_CM_EVENT_NETDEV_CHANGE work item for @id_priv if the ID is
+ * bound to @ndev by name but the hardware address stored in the ID differs
+ * from the one @ndev currently reports.
+ *
+ * Returns 0 if no notification was needed or the work was queued, -ENOMEM if
+ * the work item could not be allocated.
+ */
+static int cma_netdev_align_id(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct cma_ndev_work *work;
+
+	dev_addr = &id_priv->id.route.addr.dev_addr;
+
+	/* Names are strings: ignore whatever bytes follow the terminating NUL. */
+	if (strncmp(dev_addr->src_dev_name, ndev->name, IFNAMSIZ) ||
+	    !memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len))
+		return 0;
+
+	printk(KERN_ERR "addr change for device %s used by id %p, notifying\n",
+			ndev->name, &id_priv->id);
+
+	/* The caller holds a mutex, so we are allowed to sleep here. */
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	INIT_WORK(&work->work, cma_ndev_work_handler);
+	work->id = id_priv;
+	work->event.event = RDMA_CM_EVENT_NETDEV_CHANGE;
+	/* Hold the ID until the work handler has run. */
+	atomic_inc(&id_priv->refcount);
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+}
+
+/*
+ * Netdevice notifier: on a bonding fail-over of a bonding master in init_net,
+ * walk every rdma-cm ID of every cma device and queue a netdev change
+ * notification for each ID that cma_netdev_align_id() finds out of sync.
+ *
+ * Always returns NOTIFY_DONE: a negative errno has NOTIFY_STOP_MASK set and
+ * would cut the notifier chain short for the remaining subscribers.
+ */
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+	void *ctx)
+{
+	struct net_device *ndev = (struct net_device *)ctx;
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+
+	if (dev_net(ndev) != &init_net)
+		return NOTIFY_DONE;
+
+	if (event != NETDEV_BONDING_FAILOVER)
+		return NOTIFY_DONE;
+
+	if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+		return NOTIFY_DONE;
+
+	/*
+	 * The global lock keeps dev_list and each id_list stable against
+	 * concurrent device removal.
+	 */
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			/* Out of memory: stop scanning all lists, not just this one. */
+			if (cma_netdev_align_id(ndev, id_priv))
+				goto out;
+		}
+out:
+	mutex_unlock(&lock);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cma_nb = {
+	.notifier_call = cma_netdev_callback
+};
+
 static void cma_add_one(struct ib_device *device)
 {
 	struct cma_device *cma_dev;
@@ -2834,6 +2915,7 @@ static int cma_init(void)

 	ib_sa_register_client(&sa_client);
 	rdma_addr_register_client(&addr_client);
+	register_netdevice_notifier(&cma_nb);

 	ret = ib_register_client(&cma_client);
 	if (ret)
@@ -2841,6 +2923,7 @@ static int cma_init(void)
 	return 0;

 err:
+	unregister_netdevice_notifier(&cma_nb);
 	rdma_addr_unregister_client(&addr_client);
 	ib_sa_unregister_client(&sa_client);
 	destroy_workqueue(cma_wq);
@@ -2850,6 +2933,7 @@ err:
 static void cma_cleanup(void)
 {
 	ib_unregister_client(&cma_client);
+	unregister_netdevice_notifier(&cma_nb);
 	rdma_addr_unregister_client(&addr_client);
 	ib_sa_unregister_client(&sa_client);
 	destroy_workqueue(cma_wq);
Index: linux-2.6.26-rc3/include/rdma/rdma_cm.h
===================================================================
--- linux-2.6.26-rc3.orig/include/rdma/rdma_cm.h	2008-05-27 13:44:53.000000000 +0300
+++ linux-2.6.26-rc3/include/rdma/rdma_cm.h	2008-05-27 13:46:58.000000000 +0300
@@ -53,7 +53,8 @@ enum rdma_cm_event_type {
 	RDMA_CM_EVENT_DISCONNECTED,
 	RDMA_CM_EVENT_DEVICE_REMOVAL,
 	RDMA_CM_EVENT_MULTICAST_JOIN,
-	RDMA_CM_EVENT_MULTICAST_ERROR
+	RDMA_CM_EVENT_MULTICAST_ERROR,
+	RDMA_CM_EVENT_NETDEV_CHANGE
 };

 enum rdma_port_space {



More information about the general mailing list