[openib-general] [PATCH 3/4] SA path record caching
Sean Hefty
sean.hefty at intel.com
Wed Jan 25 11:47:06 PST 2006
Add a local SA database for path records to eliminate queries to the SA
for connection establishment.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
---
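Note (not part of the patch): below is a minimal sketch of how a consumer such
as the CM or rdma_cm might use the new ib_get_path_rec() call, trying the local
cache first and only falling back to a normal SA query over the fabric on a
miss.  The resolve_path() helper is hypothetical and only illustrates the
intended calling pattern.

#include <rdma/ib_local_sa.h>

/*
 * Illustrative sketch only -- resolve_path() is a hypothetical helper.
 * It checks the local SA cache first and leaves the fabric query as a
 * fallback for the caller when the cache has no matching entry.
 */
static int resolve_path(struct ib_device *device, u8 port_num,
			union ib_gid *sgid, union ib_gid *dgid,
			u16 pkey, struct ib_sa_path_rec *rec)
{
	int ret;

	/* Fast path: copy the record out of the local cache. */
	ret = ib_get_path_rec(device, port_num, sgid, dgid, pkey, rec);
	if (!ret)
		return 0;

	/*
	 * -ENODEV: device not tracked by the cache; -ENODATA: no entry.
	 * Fall back to querying the SA (e.g. via ib_sa) here.
	 */
	return ret;
}
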
Index: core/local_sa.c
===================================================================
--- core/local_sa.c (revision 0)
+++ core/local_sa.c (revision 0)
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+/* XXX : fixme when 2.6.16 released */
+#include <linux/mutex-backport.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+
+#include <rdma/index.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_local_sa.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand subnet administration caching");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static int retry_timer = 5000; /* 5 sec */
+module_param(retry_timer, int, 0444);
+MODULE_PARM_DESC(retry_timer, "Time in ms between retried requests.");
+
+static int retries = 3;
+module_param(retries, int, 0444);
+MODULE_PARM_DESC(retries, "Number of times to retry a request.");
+
+static unsigned long cache_timeout = 15 * 60 * 1000; /* 15 min */
+module_param(cache_timeout, ulong, 0444);
+MODULE_PARM_DESC(cache_timeout, "Time in ms between cache updates.");
+
+static unsigned long hold_time = 30 * 1000; /* 30 sec */
+module_param(hold_time, ulong, 0444);
+MODULE_PARM_DESC(hold_time, "Minimum time in ms between cache updates.");
+
+static unsigned long update_delay = 3000; /* 3 sec */
+module_param(update_delay, ulong, 0444);
+MODULE_PARM_DESC(update_delay, "Delay in ms between an event and an update.");
+
+static void sa_db_add_one(struct ib_device *device);
+static void sa_db_remove_one(struct ib_device *device);
+
+static struct ib_client sa_db_client = {
+ .name = "local_sa",
+ .add = sa_db_add_one,
+ .remove = sa_db_remove_one
+};
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(lock);
+static unsigned long hold_time, update_delay;
+
+struct sa_db_port {
+ struct sa_db_device *dev;
+ struct ib_mad_agent *agent;
+ struct index_root index;
+ unsigned long update_time;
+ struct work_struct work;
+ union ib_gid gid;
+ int port_num;
+ u16 pkey;
+};
+
+struct sa_db_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ struct sa_db_port port[0];
+};
+
+/* Define path record format to enable needed checks against MAD data. */
+struct ib_path_rec {
+ u8 reserved[8];
+ u8 dgid[16];
+ u8 sgid[16];
+ __be16 dlid;
+ __be16 slid;
+ u8 reserved2[20];
+};
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+/*
+ * Copy a path record from a received MAD and insert it into our index.
+ * The path record in the MAD is in network order, so must be swapped. It
+ * can also span multiple MADs, just to make our life hard.
+ */
+static void update_path_rec(struct sa_db_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *recv_buf;
+ struct ib_sa_mad *mad = (void *) mad_recv_wc->recv_buf.mad;
+ struct ib_sa_path_rec *sa_path, *old_path;
+ struct ib_path_rec ib_path, *path = NULL;
+ int i, attr_size, left, offset = 0;
+
+ attr_size = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
+ if (attr_size < sizeof ib_path)
+ return;
+
+ list_for_each_entry(recv_buf, &mad_recv_wc->rmpp_list, list) {
+ for (i = 0; i < IB_MGMT_SA_DATA;) {
+ mad = (struct ib_sa_mad *) recv_buf->mad;
+
+ left = IB_MGMT_SA_DATA - i;
+ if (left < sizeof ib_path) {
+ /* copy first piece of the attribute */
+ memcpy(&ib_path, &mad->data[i], left);
+ path = &ib_path;
+ offset = left;
+ break;
+ } else if (offset) {
+ /* copy the second piece of the attribute */
+ memcpy((void *) path + offset, &mad->data[i],
+ sizeof ib_path - offset);
+ i += attr_size - offset;
+ offset = 0;
+ } else {
+ path = (void *) &mad->data[i];
+ i += attr_size;
+ }
+
+ if (!path->slid)
+ return;
+
+ sa_path = kmalloc(sizeof *sa_path, GFP_KERNEL);
+ if (!sa_path)
+ return;
+
+ ib_sa_unpack_attr(sa_path, path, IB_SA_ATTR_PATH_REC);
+
+ mutex_lock(&lock);
+ old_path = index_find_replace(&port->index, sa_path,
+ sa_path->dgid.raw);
+ if (old_path)
+ kfree(old_path);
+ else if (index_insert(&port->index, sa_path,
+ sa_path->dgid.raw)) {
+ mutex_unlock(&lock);
+ kfree(sa_path);
+ return;
+ }
+ mutex_unlock(&lock);
+ }
+ }
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_mad *mad = (void *) mad_recv_wc->recv_buf.mad;
+
+ if (mad->mad_hdr.status)
+ goto done;
+
+ switch (be16_to_cpu(mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_PATH_REC:
+ update_path_rec(mad_agent->context, mad_recv_wc);
+ break;
+ default:
+ break;
+ }
+done:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static struct ib_mad_send_buf* get_sa_msg(struct sa_db_port *port)
+{
+ struct ib_port_attr port_attr;
+ struct ib_ah_attr ah_attr;
+ struct ib_mad_send_buf *msg;
+ int ret;
+
+ ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
+ if (ret || port_attr.state != IB_PORT_ACTIVE)
+ return NULL;
+
+ msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
+ IB_MGMT_SA_DATA, GFP_KERNEL);
+ if (IS_ERR(msg))
+ return NULL;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = port_attr.sm_lid;
+ ah_attr.sl = port_attr.sm_sl;
+ ah_attr.port_num = port->port_num;
+
+ msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(msg->ah)) {
+ ib_free_send_mad(msg);
+ return NULL;
+ }
+
+ msg->timeout_ms = retry_timer;
+ msg->retries = retries;
+ msg->context[0] = port;
+ return msg;
+}
+
+static __be64 form_tid(struct ib_mad_send_buf *msg)
+{
+ u64 hi_tid, low_tid;
+
+ hi_tid = ((u64) msg->mad_agent->hi_tid) << 32;
+ low_tid = (u32)(unsigned long)(msg);
+ return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void format_path_req(struct sa_db_port *port,
+ struct ib_mad_send_buf *msg)
+{
+ struct ib_sa_mad *mad = msg->mad;
+ struct ib_sa_path_rec path_rec;
+
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+ mad->mad_hdr.method = IB_SA_METHOD_GET_TABLE;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->mad_hdr.tid = form_tid(msg);
+
+ mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY |
+ IB_SA_PATH_REC_NUMB_PATH;
+
+ path_rec.sgid = port->gid;
+ path_rec.pkey = port->pkey;
+ path_rec.numb_path = 1;
+ ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
+}
+
+static void update_cache(void *data)
+{
+ struct sa_db_port *port = data;
+ struct ib_mad_send_buf *msg;
+
+ msg = get_sa_msg(port);
+ if (!msg)
+ return;
+
+ format_path_req(port, msg);
+
+ if (ib_post_send_mad(msg, NULL)) {
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ return;
+ }
+
+ /*
+ * We record the time that we requested the update, rather than use the
+ * time that the update occurred. This allows us to generate a new
+ * update if an event occurs while we're still processing this one.
+ */
+ port->update_time = jiffies;
+ queue_delayed_work(rdma_wq, &port->work, cache_timeout);
+}
+
+static void schedule_update(struct sa_db_port *port)
+{
+ unsigned long time, delay;
+
+ time = jiffies;
+ if (time_after(time, port->update_time + hold_time))
+ delay = update_delay;
+ else
+ delay = port->update_time + hold_time - time;
+
+ cancel_delayed_work(&port->work);
+ queue_delayed_work(rdma_wq, &port->work, delay);
+}
+
+static void handle_event(struct ib_event_handler *event_handler,
+ struct ib_event *event)
+{
+ struct sa_db_device *dev;
+ dev = container_of(event_handler, typeof(*dev), event_handler);
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE)
+ schedule_update(&dev->port[event->element.port_num - 1]);
+}
+
+int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
+ union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec)
+{
+ struct sa_db_device *dev;
+ struct sa_db_port *port;
+ struct ib_sa_path_rec *path_rec;
+ int ret = 0;
+
+ mutex_lock(&lock);
+ dev = ib_get_client_data(device, &sa_db_client);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ port = &dev->port[port_num - 1];
+
+ if (memcmp(&port->gid, sgid, sizeof *sgid) || port->pkey != pkey) {
+ ret = -ENODATA;
+ goto unlock;
+ }
+
+ path_rec = index_find(&port->index, dgid->raw);
+ if (!path_rec) {
+ ret = -ENODATA;
+ goto unlock;
+ }
+
+ memcpy(rec, path_rec, sizeof *path_rec);
+unlock:
+ mutex_unlock(&lock);
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_path_rec);
+
+static void sa_db_free_data(void *context, void *data)
+{
+ kfree(data);
+}
+
+static void sa_db_add_one(struct ib_device *device)
+{
+ struct sa_db_device *dev;
+ struct sa_db_port *port;
+ int i;
+
+ dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+ GFP_KERNEL);
+ if (!dev)
+ return;
+
+ for (i = 1; i <= device->phys_port_cnt; i++) {
+ port = &dev->port[i-1];
+ port->dev = dev;
+ port->port_num = i;
+ port->update_time = jiffies - hold_time;
+ INIT_WORK(&port->work, update_cache, port);
+ index_init(&port->index, sizeof (union ib_gid), GFP_KERNEL);
+
+ if (ib_get_cached_gid(device, i, 0, &port->gid) ||
+ ib_get_cached_pkey(device, i, 0, &port->pkey))
+ goto err;
+
+ port->agent = ib_register_mad_agent(device, i, IB_QPT_GSI,
+ NULL, IB_MGMT_RMPP_VERSION,
+ send_handler, recv_handler,
+ port);
+ if (IS_ERR(port->agent))
+ goto err;
+ }
+
+ dev->device = device;
+ ib_set_client_data(device, &sa_db_client, dev);
+
+ mutex_lock(&lock);
+ list_add_tail(&dev->list, &dev_list);
+ mutex_unlock(&lock);
+
+ /* Initialization must be complete before cache updates can occur. */
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+ ib_register_event_handler(&dev->event_handler);
+
+ /* Force an update now. */
+ for (i = 1; i <= device->phys_port_cnt; i++)
+ schedule_update(&dev->port[i-1]);
+ return;
+err:
+ while (--i) {
+ ib_unregister_mad_agent(dev->port[i-1].agent);
+ index_destroy(&dev->port[i-1].index);
+ }
+ kfree(dev);
+}
+
+static void sa_db_remove_one(struct ib_device *device)
+{
+ struct sa_db_device *dev;
+ int i;
+
+ dev = ib_get_client_data(device, &sa_db_client);
+ if (!dev)
+ return;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ for (i = 0; i < device->phys_port_cnt; i++)
+ cancel_delayed_work(&dev->port[i].work);
+ flush_workqueue(rdma_wq);
+
+ for (i = 0; i < device->phys_port_cnt; i++) {
+ ib_unregister_mad_agent(dev->port[i].agent);
+ index_remove_all(&dev->port[i].index, sa_db_free_data, NULL);
+ index_destroy(&dev->port[i].index);
+ }
+
+ mutex_lock(&lock);
+ list_del(&dev->list);
+ mutex_unlock(&lock);
+ kfree(dev);
+}
+
+static int __init sa_db_init(void)
+{
+ cache_timeout = msecs_to_jiffies(cache_timeout);
+ hold_time = msecs_to_jiffies(hold_time);
+ update_delay = msecs_to_jiffies(update_delay);
+ return ib_register_client(&sa_db_client);
+}
+
+static void __exit sa_db_cleanup(void)
+{
+ ib_unregister_client(&sa_db_client);
+}
+
+module_init(sa_db_init);
+module_exit(sa_db_cleanup);
Index: include/rdma/ib_local_sa.h
===================================================================
--- include/rdma/ib_local_sa.h (revision 0)
+++ include/rdma/ib_local_sa.h (revision 0)
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_LOCAL_SA_H
+#define IB_LOCAL_SA_H
+
+#include <rdma/ib_sa.h>
+
+/**
+ * ib_get_path_rec - Query the local SA database for path information.
+ * @device: The local device to query.
+ * @port_num: The port of the local device being queried.
+ * @sgid: The source GID of the path record.
+ * @dgid: The destination GID of the path record.
+ * @pkey: The partition key (P_Key) of the path record.
+ * @rec: A reference to a path record structure that will receive a copy of
+ * the response.
+ *
+ * Returns a copy of a path record meeting the specified criteria to the
+ * location referenced by @rec. A return value < 0 indicates that an error
+ * occurred processing the request or that no matching path record was found.
+ */
+int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
+ union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec);
+
+#endif /* IB_LOCAL_SA_H */
Index: core/Makefile
===================================================================
--- core/Makefile (revision 5098)
+++ core/Makefile (working copy)
@@ -1,10 +1,13 @@
EXTRA_CFLAGS += -Idrivers/infiniband/include -Idrivers/infiniband/ulp/ipoib
obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_ping.o ib_cm.o \
- ib_sa.o ib_at.o ib_addr.o rdma_cm.o
+ ib_sa.o ib_at.o ib_addr.o rdma_cm.o \
+ ib_local_sa.o findex.o
obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o ib_uat.o rdma_ucm.o
+findex-y := index.o
+
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
device.o fmr_pool.o cache.o
@@ -22,6 +25,8 @@ ib_addr-y := addr.o
ib_sa-y := sa_query.o
+ib_local_sa-y := local_sa.o
+
ib_umad-y := user_mad.o
ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_mem.o \