[openib-general] [PATCH 3/4] SA path record caching

Sean Hefty sean.hefty at intel.com
Wed Jan 25 11:47:06 PST 2006


Add a local SA database for path records to eliminate queries to the SA
for connection establishment.

Signed-off-by: Sean Hefty <sean.hefty at intel.com>

---

Index: core/local_sa.c
===================================================================
--- core/local_sa.c	(revision 0)
+++ core/local_sa.c	(revision 0)
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+/* XXX : fixme when 2.6.16 released */
+#include <linux/mutex-backport.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+
+#include <rdma/index.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_local_sa.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand subnet administration caching");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Module parameters.  All times are given in milliseconds; cache_timeout,
+ * hold_time and update_delay are converted to jiffies in sa_db_init().
+ */
+static int retry_timer = 5000; /* 5 sec */
+module_param(retry_timer, int, 0444);
+MODULE_PARM_DESC(retry_timer, "Time in ms between retried requests.");
+
+static int retries = 3;
+module_param(retries, int, 0444);
+MODULE_PARM_DESC(retries, "Number of times to retry a request.");
+
+static unsigned long cache_timeout = 15 * 60 * 1000; /* 15 min */
+module_param(cache_timeout, ulong, 0444);
+MODULE_PARM_DESC(cache_timeout, "Time in ms between cache updates.");
+
+static unsigned long hold_time = 30 * 1000; /* 30 sec */
+module_param(hold_time, ulong, 0444);
+/* The description must be keyed on the real parameter name, hold_time. */
+MODULE_PARM_DESC(hold_time, "Minimal time in ms between cache updates.");
+
+static unsigned long update_delay = 3000; /* 3 sec */
+module_param(update_delay, ulong, 0444);
+MODULE_PARM_DESC(update_delay, "Delay in ms between an event and an update.");
+
+static void sa_db_add_one(struct ib_device *device);
+static void sa_db_remove_one(struct ib_device *device);
+
+/* IB client registration: per-device setup and teardown callbacks. */
+static struct ib_client sa_db_client = {
+	.name   = "local_sa",
+	.add    = sa_db_add_one,
+	.remove = sa_db_remove_one
+};
+
+/* List of sa_db_device structures, one per device added to this client. */
+static LIST_HEAD(dev_list);
+/* Serializes access to dev_list and to every port's path record index. */
+static DEFINE_MUTEX(lock);
+
+/* Per-port cache state; one entry per physical port of a device. */
+struct sa_db_port {
+	struct sa_db_device *dev;	/* owning device */
+	struct ib_mad_agent *agent;	/* GSI MAD agent used to query the SA */
+	struct index_root index;	/* cached path records, keyed by DGID */
+	unsigned long update_time;	/* jiffies when last update was requested */
+	struct work_struct work;	/* delayed work that runs update_cache() */
+	union ib_gid gid;		/* cached GID at table index 0 */
+	int port_num;			/* 1-based port number */
+	u16 pkey;			/* cached P_Key at table index 0 */
+};
+
+/* Per-device state, allocated with one trailing sa_db_port per physical port. */
+struct sa_db_device {
+	struct list_head list;			/* entry in dev_list */
+	struct ib_device *device;		/* underlying IB device */
+	struct ib_event_handler event_handler;	/* invokes handle_event() */
+	struct sa_db_port port[0];		/* indexed by port number - 1 */
+};
+
+/*
+ * Define path record format to enable needed checks against MAD data.
+ * Only the fields inspected directly in the received (network order) data
+ * are named; the rest are padding so that sizeof gives the 64 bytes that
+ * update_path_rec() copies per attribute.
+ */
+struct ib_path_rec {
+	u8	reserved[8];
+	u8	dgid[16];
+	u8	sgid[16];
+	__be16	dlid;
+	__be16	slid;
+	u8	reserved2[20];
+};
+
+/*
+ * Send completion callback for the MAD agent.  Whatever the outcome of the
+ * send, release the address handle attached to the request and then the
+ * send buffer itself.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+
+	ib_destroy_ah(msg->ah);
+	ib_free_send_mad(msg);
+}
+
+/*
+ * Copy a path record from a received MAD and insert it into our index.
+ * The path record in the MAD is in network order, so must be swapped.  It
+ * can also span multiple MADs, just to make our life hard.
+ *
+ * @port: port whose index receives the records.
+ * @mad_recv_wc: receive completion; every buffer on its rmpp_list is walked.
+ *
+ * Processing stops silently at the first record with a zero SLID, on
+ * allocation failure, or if a new record cannot be inserted.
+ */
+static void update_path_rec(struct sa_db_port *port,
+			    struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_recv_buf *recv_buf;
+	struct ib_sa_mad *mad = (void *) mad_recv_wc->recv_buf.mad;
+	struct ib_sa_path_rec *sa_path, *old_path;
+	struct ib_path_rec ib_path, *path = NULL;
+	int i, attr_size, left, offset = 0;
+
+	/* attr_offset is scaled by 8 to obtain the per-attribute size in bytes. */
+	attr_size = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
+	if (attr_size < sizeof ib_path)
+		return;
+
+	list_for_each_entry(recv_buf, &mad_recv_wc->rmpp_list, list) {
+		/* i is the byte offset of the next attribute within this MAD. */
+		for (i = 0; i < IB_MGMT_SA_DATA;) {
+			mad = (struct ib_sa_mad *) recv_buf->mad;
+
+			left = IB_MGMT_SA_DATA - i;
+			if (left < sizeof ib_path) {
+				/* copy first piece of the attribute */
+				memcpy(&ib_path, &mad->data[i], left);
+				path = &ib_path;
+				offset = left;
+				/* remainder is at the start of the next MAD */
+				break;
+			} else if (offset) {
+				/* copy the second piece of the attribute */
+				memcpy((void*) path + offset, &mad->data[i],
+				       sizeof ib_path - offset);
+				i += attr_size - offset;
+				offset = 0;
+			} else {
+				/* whole record is in this MAD; use it in place */
+				path = (void *) &mad->data[i];
+				i += attr_size;
+			}
+			/*
+			 * NOTE(review): if attr_size > sizeof ib_path and an
+			 * attribute straddles two MADs with left >= sizeof
+			 * ib_path, i overshoots and the next MAD is parsed
+			 * from offset 0 -- confirm this cannot happen for
+			 * path records.
+			 */
+
+			/* A zero SLID is treated as the end of valid data. */
+			if (!path->slid)
+				return;
+
+			sa_path = kmalloc(sizeof *sa_path, GFP_KERNEL);
+			if (!sa_path)
+				return;
+
+			ib_sa_unpack_attr(sa_path, path, IB_SA_ATTR_PATH_REC);
+
+			/*
+			 * Replace an existing record for this DGID, or insert
+			 * a new one.  The index takes ownership of sa_path on
+			 * success; the displaced record is freed here.
+			 */
+			mutex_lock(&lock);
+			old_path = index_find_replace(&port->index, sa_path,
+						      sa_path->dgid.raw);
+			if (old_path)
+				kfree(old_path);
+			else if (index_insert(&port->index, sa_path,
+					      sa_path->dgid.raw)) {
+				mutex_unlock(&lock);
+				kfree(sa_path);
+				return;
+			}
+			mutex_unlock(&lock);
+		}
+	}
+}
+
+/*
+ * Receive callback for the MAD agent.  Responses that carry a non-zero
+ * status are dropped; path record responses are fed into the cache of the
+ * port that was given as the agent context at registration.  The receive
+ * completion is always released before returning.
+ */
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_mad *mad = (void *) mad_recv_wc->recv_buf.mad;
+
+	if (mad->mad_hdr.status)
+		goto done;
+
+	/* attr_id is big-endian on the wire; convert it to host order. */
+	switch (be16_to_cpu(mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_PATH_REC:
+		update_path_rec(mad_agent->context, mad_recv_wc);
+		break;
+	default:
+		break;
+	}
+done:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+/*
+ * Allocate a send MAD addressed to the subnet manager of @port.
+ *
+ * Returns NULL if the port cannot be queried, is not active, or if the
+ * send buffer or address handle cannot be created.  On success the caller
+ * owns both the returned buffer and the address handle stored in it.
+ */
+static struct ib_mad_send_buf* get_sa_msg(struct sa_db_port *port)
+{
+	struct ib_mad_send_buf	*send_buf;
+	struct ib_port_attr	attr;
+	struct ib_ah_attr	sm_ah;
+
+	if (ib_query_port(port->dev->device, port->port_num, &attr))
+		return NULL;
+	if (attr.state != IB_PORT_ACTIVE)
+		return NULL;
+
+	send_buf = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
+				      IB_MGMT_SA_DATA, GFP_KERNEL);
+	if (IS_ERR(send_buf))
+		return NULL;
+
+	/* Address the request using the SM LID and SL reported for the port. */
+	memset(&sm_ah, 0, sizeof sm_ah);
+	sm_ah.port_num = port->port_num;
+	sm_ah.dlid = attr.sm_lid;
+	sm_ah.sl = attr.sm_sl;
+
+	send_buf->ah = ib_create_ah(port->agent->qp->pd, &sm_ah);
+	if (IS_ERR(send_buf->ah)) {
+		ib_free_send_mad(send_buf);
+		return NULL;
+	}
+
+	send_buf->context[0] = port;
+	send_buf->retries = retries;
+	send_buf->timeout_ms = retry_timer;
+	return send_buf;
+}
+
+/*
+ * Build the transaction ID for @msg: the agent's hi_tid in the upper 32
+ * bits and the low 32 bits of the send buffer address in the lower 32 bits,
+ * returned in network byte order.
+ */
+static __be64 form_tid(struct ib_mad_send_buf *msg)
+{
+	u64 tid;
+
+	tid = (u64) msg->mad_agent->hi_tid << 32;
+	tid |= (u32) (unsigned long) msg;
+	return cpu_to_be64(tid);
+}
+
+/*
+ * Fill in @msg as an SA GetTable request for path records whose source GID
+ * and P_Key match those cached for @port, asking for one path per
+ * destination.
+ */
+static void format_path_req(struct sa_db_port *port,
+			    struct ib_mad_send_buf *msg)
+{
+	struct ib_sa_mad *mad = msg->mad;
+	struct ib_sa_path_rec path_rec;
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class	   = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+	mad->mad_hdr.method	   = IB_SA_METHOD_GET_TABLE;
+	mad->mad_hdr.attr_id	   = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->mad_hdr.tid	   = form_tid(msg);
+
+	mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY |
+				IB_SA_PATH_REC_NUMB_PATH;
+
+	/*
+	 * The whole record is packed into the MAD, not only the fields
+	 * selected by comp_mask, so clear it first to avoid sending
+	 * uninitialized stack contents on the wire.
+	 */
+	memset(&path_rec, 0, sizeof path_rec);
+	path_rec.sgid = port->gid;
+	path_rec.pkey = port->pkey;
+	path_rec.numb_path = 1;
+	ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
+}
+
+/*
+ * Work handler: send a path record query for the port passed in @data and,
+ * if the send was posted, re-arm the work to run again after cache_timeout.
+ * Nothing is rescheduled when the request cannot be built or posted.
+ */
+static void update_cache(void *data)
+{
+	struct sa_db_port *port = data;
+	struct ib_mad_send_buf *req = get_sa_msg(port);
+
+	if (!req)
+		return;
+
+	format_path_req(port, req);
+	if (ib_post_send_mad(req, NULL))
+		goto err;
+
+	/*
+	 * Stamp the port with the time of the request, not of the eventual
+	 * response, so that an event arriving while this update is still in
+	 * flight can trigger another one.
+	 */
+	port->update_time = jiffies;
+	queue_delayed_work(rdma_wq, &port->work, cache_timeout);
+	return;
+
+err:
+	ib_destroy_ah(req->ah);
+	ib_free_send_mad(req);
+}
+
+/*
+ * (Re)schedule a cache update for @port.  If the hold time since the last
+ * request has already passed, the update runs after update_delay; otherwise
+ * it is deferred until the hold time expires.  Any pending update is
+ * cancelled first.
+ */
+static void schedule_update(struct sa_db_port *port)
+{
+	unsigned long now = jiffies;
+	unsigned long earliest = port->update_time + hold_time;
+	unsigned long wait;
+
+	wait = time_after(now, earliest) ? update_delay : earliest - now;
+
+	cancel_delayed_work(&port->work);
+	queue_delayed_work(rdma_wq, &port->work, wait);
+}
+
+/*
+ * Asynchronous event callback.  Port state, LID, P_Key and SM changes
+ * schedule a cache update for the affected port; all other events are
+ * ignored.
+ */
+static void handle_event(struct ib_event_handler *event_handler,
+			 struct ib_event *event)
+{
+	struct sa_db_device *dev;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+		dev = container_of(event_handler, typeof(*dev), event_handler);
+		/* Event port numbers are 1-based; the port array is 0-based. */
+		schedule_update(&dev->port[event->element.port_num - 1]);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Look up a cached path record.
+ *
+ * Returns 0 and copies the record to @rec on success, -ENODEV if @device
+ * has no local SA data, -EINVAL if @port_num is not a valid port of
+ * @device, and -ENODATA if @sgid / @pkey do not match the values cached
+ * for the port or no record for @dgid is cached.
+ */
+int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
+		    union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+	struct ib_sa_path_rec *path_rec;
+	int ret = 0;
+
+	mutex_lock(&lock);
+	dev = ib_get_client_data(device, &sa_db_client);
+	if (!dev) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	/* Port numbers are 1-based; reject anything outside the port array. */
+	if (port_num < 1 || port_num > device->phys_port_cnt) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+	port = &dev->port[port_num - 1];
+
+	/* The cache only holds paths for the port's cached GID and P_Key. */
+	if (memcmp(&port->gid, sgid, sizeof *sgid) || port->pkey != pkey) {
+		ret = -ENODATA;
+		goto unlock;
+	}
+
+	path_rec = index_find(&port->index, dgid->raw);
+	if (!path_rec) {
+		ret = -ENODATA;
+		goto unlock;
+	}
+
+	/* Copy under the lock; the cached entry may be replaced and freed. */
+	*rec = *path_rec;
+unlock:
+	mutex_unlock(&lock);
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_path_rec);
+
+/*
+ * Callback handed to index_remove_all(): free one cached entry.  The
+ * context argument is not used.
+ */
+static void sa_db_free_data(void *unused_context, void *entry)
+{
+	kfree(entry);
+}
+
+/*
+ * Client "add" callback: allocate per-device state, set up each physical
+ * port (index, cached GID/P_Key, MAD agent), register for asynchronous
+ * events and schedule an initial cache update for every port.  On any
+ * failure everything set up so far is undone and the device is left
+ * without client data.
+ */
+static void sa_db_add_one(struct ib_device *device)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+	int i;
+
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	for (i = 1; i <= device->phys_port_cnt; i++) {
+		port = &dev->port[i-1];
+		port->dev = dev;
+		port->port_num = i;
+		/* Pretend the hold time has already elapsed. */
+		port->update_time = jiffies - hold_time;
+		INIT_WORK(&port->work, update_cache, port);
+		index_init(&port->index, sizeof (union ib_gid), GFP_KERNEL);
+
+		if (ib_get_cached_gid(device, i, 0, &port->gid) ||
+		    ib_get_cached_pkey(device, i, 0, &port->pkey))
+			goto err;
+
+		port->agent = ib_register_mad_agent(device, i, IB_QPT_GSI,
+						    NULL, IB_MGMT_RMPP_VERSION,
+						    send_handler, recv_handler,
+						    port);
+		if (IS_ERR(port->agent))
+			goto err;
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &sa_db_client, dev);
+
+	mutex_lock(&lock);
+	list_add_tail(&dev->list, &dev_list);
+	mutex_unlock(&lock);
+
+	/* Initialization must be complete before cache updates can occur. */
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+	ib_register_event_handler(&dev->event_handler);
+
+	/* Force an update now. */
+	for (i = 1; i <= device->phys_port_cnt; i++)
+		schedule_update(&dev->port[i-1]);
+	return;
+err:
+	/*
+	 * Port i failed after its index was initialized but before it got a
+	 * usable agent: release only its index, then fully unwind the ports
+	 * that were set up before it.
+	 */
+	index_destroy(&dev->port[i-1].index);
+	while (--i) {
+		ib_unregister_mad_agent(dev->port[i-1].agent);
+		index_destroy(&dev->port[i-1].index);
+	}
+	kfree(dev);
+}
+
+/*
+ * Client "remove" callback: stop event delivery and pending updates, then
+ * tear down every port (MAD agent, cached records, index) and free the
+ * per-device state.
+ */
+static void sa_db_remove_one(struct ib_device *device)
+{
+	struct sa_db_device *dev;
+	int i, pass;
+
+	dev = ib_get_client_data(device, &sa_db_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+
+	/*
+	 * update_cache() re-arms its own delayed work, so an update that was
+	 * already running during the first cancel leaves a fresh timer behind
+	 * once the flush completes.  Cancel and flush a second time so that
+	 * no timer remains pending on memory that is about to be freed.
+	 * NOTE(review): this assumes cache_timeout is long enough that the
+	 * re-armed timer has not fired again before the second pass.
+	 */
+	for (pass = 0; pass < 2; pass++) {
+		for (i = 0; i < device->phys_port_cnt; i++)
+			cancel_delayed_work(&dev->port[i].work);
+		flush_workqueue(rdma_wq);
+	}
+
+	for (i = 0; i < device->phys_port_cnt; i++) {
+		ib_unregister_mad_agent(dev->port[i].agent);
+		index_remove_all(&dev->port[i].index, sa_db_free_data, NULL);
+		index_destroy(&dev->port[i].index);
+	}
+
+	mutex_lock(&lock);
+	list_del(&dev->list);
+	mutex_unlock(&lock);
+	kfree(dev);
+}
+
+/*
+ * Module init: the time parameters are supplied in milliseconds, so convert
+ * them in place to jiffies before registering with the IB core.
+ */
+static int __init sa_db_init(void)
+{
+	update_delay = msecs_to_jiffies(update_delay);
+	hold_time = msecs_to_jiffies(hold_time);
+	cache_timeout = msecs_to_jiffies(cache_timeout);
+
+	return ib_register_client(&sa_db_client);
+}
+
+/* Module exit: unregister this client from the IB core. */
+static void __exit sa_db_cleanup(void)
+{
+	ib_unregister_client(&sa_db_client);
+}
+
+module_init(sa_db_init);
+module_exit(sa_db_cleanup);
Index: include/rdma/ib_local_sa.h
===================================================================
--- include/rdma/ib_local_sa.h	(revision 0)
+++ include/rdma/ib_local_sa.h	(revision 0)
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_LOCAL_SA_H
+#define IB_LOCAL_SA_H
+
+#include <rdma/ib_sa.h>
+
+/**
+ * ib_get_path_rec - Query the local SA database for path information.
+ * @device: The local device to query.
+ * @port_num: The port of the local device being queried.
+ * @sgid: The source GID of the path record.
+ * @dgid: The destination GID of the path record.
+ * @pkey: The protection key of the path record.
+ * @rec: A reference to a path record structure that will receive a copy of
+ *   the response.
+ *
+ * Returns a copy of a path record meeting the specified criteria to the
+ * location referenced by @rec.  Returns 0 on success.  A return value < 0
+ * indicates that an error occurred processing the request, or no path
+ * record was found.
+ */
+int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
+		    union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec);
+
+#endif /* IB_LOCAL_SA_H */
Index: core/Makefile
===================================================================
--- core/Makefile	(revision 5098)
+++ core/Makefile	(working copy)
@@ -1,10 +1,13 @@
 EXTRA_CFLAGS += -Idrivers/infiniband/include -Idrivers/infiniband/ulp/ipoib
 
 obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_ping.o ib_cm.o \
-					ib_sa.o ib_at.o ib_addr.o rdma_cm.o
+					ib_sa.o ib_at.o ib_addr.o rdma_cm.o \
+					ib_local_sa.o findex.o
 obj-$(CONFIG_INFINIBAND_USER_MAD) += 	ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o ib_uat.o rdma_ucm.o
 
+findex-y :=			index.o
+
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
 				device.o fmr_pool.o cache.o
 
@@ -22,6 +25,8 @@ ib_addr-y :=			addr.o
 
 ib_sa-y :=			sa_query.o
 
+ib_local_sa-y :=		local_sa.o
+
 ib_umad-y :=			user_mad.o
 
 ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_mem.o \






More information about the general mailing list