[ofa-general] [RFC] [PATCH 2/3] 2.6.22 or 23 ib/sa: add path record caching
Sean Hefty
sean.hefty at intel.com
Thu Apr 19 17:05:08 PDT 2007
IB/sa: Add local SA path record caching.
From: Sean Hefty <sean.hefty at intel.com>
Query and store path records locally to decrease path record query time
and avoid SA flooding during the start-up of large clustered jobs.
Signed-off-by: Sean Hefty <sean.hefty at intel.com>
---
drivers/infiniband/core/Makefile | 3
drivers/infiniband/core/local_sa.c | 1136 ++++++++++++++++++++++++++++++++++++
drivers/infiniband/core/sa_query.c | 26 +
include/rdma/ib_local_sa.h | 84 +++
include/rdma/ib_sa.h | 3
5 files changed, 1252 insertions(+), 0 deletions(-)
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 2e9c4b2..b2a6354 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -2,6 +2,7 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o
user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \
+ ib_local_sa.o \
ib_cm.o iw_cm.o $(infiniband-y)
obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
@@ -14,6 +15,8 @@ ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
ib_sa-y := sa_query.o multicast.o notice.o
+ib_local_sa-y := local_sa.o
+
ib_cm-y := cm.o
iw_cm-y := iwcm.o
diff --git a/drivers/infiniband/core/local_sa.c b/drivers/infiniband/core/local_sa.c
new file mode 100644
index 0000000..1598be1
--- /dev/null
+++ b/drivers/infiniband/core/local_sa.c
@@ -0,0 +1,1136 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/pci.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_local_sa.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand subnet administration caching");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Module-wide limits: the largest accepted paths_per_dest value and the
+ * bounds of the query response timeout (used as millisecond values).
+ */
+enum {
+ SA_DB_MAX_PATHS_PER_DEST = 0x7F,
+ SA_DB_MIN_RETRY_TIMER = 4000, /* 4 sec */
+ SA_DB_MAX_RETRY_TIMER = 256000 /* 256 sec */
+};
+
+/*
+ * Number of paths requested per destination in each path record query.
+ * A value of 0 makes process_updates() tear the port cache down instead of
+ * querying.  Also writable at runtime through the paths_per_dest device
+ * attribute, which clamps it to SA_DB_MAX_PATHS_PER_DEST.
+ */
+static unsigned long paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+module_param(paths_per_dest, ulong, 0444);
+MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
+ "to each destination (DGID). Set to 0 "
+ "to disable cache.");
+
+/* Initial response timeout (timeout_ms) given to each query MAD. */
+static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;
+
+/*
+ * Policy used by ib_get_path_rec() to choose among several cached paths.
+ * SA_DB_LOOKUP_MAX is the number of valid policies, not a policy itself.
+ */
+enum sa_db_lookup_method {
+ SA_DB_LOOKUP_LEAST_USED,
+ SA_DB_LOOKUP_RANDOM,
+ SA_DB_LOOKUP_MAX
+};
+
+/* Currently selected policy; set through the lookup_method device attribute. */
+static unsigned long lookup_method;
+
+static void sa_db_add_dev(struct ib_device *device);
+static void sa_db_remove_dev(struct ib_device *device);
+
+/* IB client whose add/remove callbacks create and destroy per-device state. */
+static struct ib_client sa_db_client = {
+ .name = "local_sa",
+ .add = sa_db_add_dev,
+ .remove = sa_db_remove_dev
+};
+
+/* Misc device that hosts the refresh/paths_per_dest/lookup_method attributes. */
+static struct miscdevice local_sa_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "ib_local_sa",
+};
+
+/* All sa_db_device instances, linked through sa_db_device.list. */
+static LIST_HEAD(dev_list);
+/* Taken around accesses to dev_list, port state, update lists and path trees. */
+static DECLARE_RWSEM(lock);
+/* Single-threaded workqueue that runs port_work_handler(). */
+static struct workqueue_struct *sa_wq;
+/* SA client handle used for the inform-info (trap) registrations. */
+static struct ib_sa_client sa_client;
+
+/*
+ * Per-port state:
+ *   SA_DB_IDLE    - no update is being processed
+ *   SA_DB_REFRESH - process_updates() has been started for the port
+ *   SA_DB_DESTROY - the port is being torn down; handlers must do nothing
+ */
+enum sa_db_state {
+ SA_DB_IDLE,
+ SA_DB_REFRESH,
+ SA_DB_DESTROY
+};
+
+/*
+ * Per-port cache state.  Instances live in the port[] array that trails
+ * struct sa_db_device and are set up by init_port().
+ */
+struct sa_db_port {
+ struct sa_db_device *dev;
+ struct ib_mad_agent *agent;
+ /* Limit number of outstanding MADs to SA to reduce SA flooding */
+ struct ib_mad_send_buf *msg;
+ /* SM address, copied from the port attributes by update_port_info(). */
+ u16 sm_lid;
+ u8 sm_sl;
+ /* Registrations for the GID in-service / out-of-service traps. */
+ struct ib_inform_info *in_info;
+ struct ib_inform_info *out_info;
+ /* Tree of struct ib_sa_attr_list, ordered by GID. */
+ struct rb_root paths;
+ /* Pending struct update_info entries. */
+ struct list_head update_list;
+ /* Incremented by update_path_db() for every full update. */
+ unsigned long update_id;
+ enum sa_db_state state;
+ /* Queued on sa_wq; runs port_work_handler(). */
+ struct work_struct work;
+ /* GID at index 0 of the port's cached GID table; used as query SGID. */
+ union ib_gid gid;
+ int port_num;
+};
+
+/*
+ * Per-device state, allocated by sa_db_add_dev() together with one
+ * sa_db_port per port.  port[i] describes port number start_port + i.
+ */
+struct sa_db_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int port_count;
+ struct sa_db_port port[0];
+};
+
+/*
+ * Singly linked list link.  It is embedded as the first member of each
+ * cached attribute, so ib_get_next_sa_attr() can return the data that
+ * immediately follows it.
+ */
+struct ib_sa_iterator {
+ struct ib_sa_iterator *next;
+};
+
+/*
+ * List of cached attributes that share one GID.  Lists are kept in the
+ * port's rb-tree, ordered by @gid.
+ *
+ * @iter:      list head; iter.next is the first cached attribute
+ * @tail:      last element of the list (the head itself when empty)
+ * @update_id: update generation that last filled the list.  It is compared
+ *             against the port's unsigned long update_id, so it uses the
+ *             same type to avoid truncation.
+ * @gid:       key of the list
+ * @node:      linkage in the rb-tree
+ */
+struct ib_sa_attr_list {
+	struct ib_sa_iterator iter;
+	struct ib_sa_iterator *tail;
+	unsigned long update_id;
+	union ib_gid gid;
+	struct rb_node node;
+};
+
+/*
+ * Cached path record.
+ *
+ * Maintain field order for ib_get_next_sa_attr(): that function returns the
+ * memory directly behind the iterator, so @rec must follow @iter.
+ * @lookups counts how often get_next_path() has handed out this record.
+ */
+struct ib_path_rec_info {
+ struct ib_sa_iterator iter;
+ struct ib_sa_path_rec rec;
+ unsigned long lookups;
+};
+
+/*
+ * Cursor over the attributes contained in a received SA response.
+ *
+ * @recv_wc:     the response being walked
+ * @recv_buf:    receive buffer currently being read
+ * @attr_size:   number of bytes copied/exposed per attribute
+ * @attr_offset: number of bytes deducted from @data_left per attribute
+ * @data_offset: read position inside the current buffer's data area
+ * @data_left:   bytes of response data not yet consumed
+ * @attr:        attribute returned by the last ib_sa_iter_next() call
+ * @attr_data:   scratch space used to reassemble an attribute that spans
+ *               two receive buffers
+ */
+struct ib_sa_iter {
+ struct ib_mad_recv_wc *recv_wc;
+ struct ib_mad_recv_buf *recv_buf;
+ int attr_size;
+ int attr_offset;
+ int data_offset;
+ int data_left;
+ void *attr;
+ u8 attr_data[0];
+};
+
+/*
+ * Kind of cache update:
+ *   SA_UPDATE_FULL   - query all paths from the port and drop stale entries
+ *   SA_UPDATE_ADD    - query paths to one destination GID
+ *   SA_UPDATE_REMOVE - drop the cached paths of one GID (no query is sent)
+ */
+enum sa_update_type {
+ SA_UPDATE_FULL,
+ SA_UPDATE_ADD,
+ SA_UPDATE_REMOVE
+};
+
+/*
+ * One pending update, linked on sa_db_port.update_list.  @gid is only
+ * filled in when the update was created with a GID (see add_update()).
+ */
+struct update_info {
+ struct list_head list;
+ union ib_gid gid;
+ enum sa_update_type type;
+};
+
+static void process_updates(struct sa_db_port *port);
+
+/*
+ * Free every attribute chained behind the list head and reset the list to
+ * empty.  The head (embedded in @attr_list) is left in place.
+ */
+static void free_attr_list(struct ib_sa_attr_list *attr_list)
+{
+	struct ib_sa_iterator *entry, *following;
+
+	entry = attr_list->iter.next;
+	while (entry) {
+		following = entry->next;
+		attr_list->iter.next = following;
+		kfree(entry);
+		entry = following;
+	}
+	attr_list->tail = &attr_list->iter;
+}
+
+/* Unlink @attr_list from @root and free it together with its attributes. */
+static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
+{
+	struct ib_sa_iterator *entry;
+
+	rb_erase(&attr_list->node, root);
+
+	/* Release the chained attributes; the head dies with attr_list. */
+	while ((entry = attr_list->iter.next)) {
+		attr_list->iter.next = entry->next;
+		kfree(entry);
+	}
+	kfree(attr_list);
+}
+
+/* Empty the tree, freeing every attribute list it holds. */
+static void remove_all_attrs(struct rb_root *root)
+{
+	struct rb_node *cur = rb_first(root);
+
+	while (cur) {
+		/* Fetch the successor before the current node is erased. */
+		struct rb_node *following = rb_next(cur);
+
+		remove_attr(root, rb_entry(cur, struct ib_sa_attr_list, node));
+		cur = following;
+	}
+}
+
+/*
+ * Remove every attribute list whose update generation differs from
+ * @update_id.
+ */
+static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
+{
+	struct rb_node *cur = rb_first(root);
+
+	while (cur) {
+		struct ib_sa_attr_list *candidate;
+		/* Fetch the successor before the current node may be erased. */
+		struct rb_node *following = rb_next(cur);
+
+		candidate = rb_entry(cur, struct ib_sa_attr_list, node);
+		if (candidate->update_id != update_id)
+			remove_attr(root, candidate);
+		cur = following;
+	}
+}
+
+/*
+ * Link @attr_list into the tree at the position given by its GID.
+ *
+ * Returns NULL when the list was inserted.  If the tree already contains a
+ * list with the same GID, nothing is inserted and that existing list is
+ * returned.
+ */
+static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
+					struct ib_sa_attr_list *attr_list)
+{
+	struct rb_node **slot = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*slot) {
+		struct ib_sa_attr_list *existing;
+		int order;
+
+		parent = *slot;
+		existing = rb_entry(parent, struct ib_sa_attr_list, node);
+		order = memcmp(&existing->gid, &attr_list->gid,
+			       sizeof attr_list->gid);
+		if (!order)
+			return existing;
+
+		slot = (order < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&attr_list->node, parent, slot);
+	rb_insert_color(&attr_list->node, root);
+	return NULL;
+}
+
+/*
+ * Look up the attribute list stored for the raw GID @gid.
+ * Returns the list, or NULL when the tree has no entry for that GID.
+ */
+static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
+{
+	struct rb_node *cur;
+
+	for (cur = root->rb_node; cur; ) {
+		struct ib_sa_attr_list *candidate;
+		int order;
+
+		candidate = rb_entry(cur, struct ib_sa_attr_list, node);
+		order = memcmp(&candidate->gid, gid, sizeof candidate->gid);
+		if (!order)
+			return candidate;
+
+		/* Same ordering convention as insert_attr_list(). */
+		cur = (order < 0) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * insert_attr - append an attribute to the list cached for a GID.
+ * @root:      tree of attribute lists to update
+ * @update_id: generation of the update that produced @iter
+ * @key:       raw GID selecting the list
+ * @iter:      link embedded at the start of the attribute to append
+ *
+ * A list is created for @key when none exists.  A list left over from a
+ * different update generation is emptied first, so a list only ever holds
+ * attributes from a single update.
+ *
+ * Returns 0 on success, -ENOMEM if a new list cannot be allocated, or
+ * -EEXIST if the tree unexpectedly already holds a list for @key.
+ */
+static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
+		       struct ib_sa_iterator *iter)
+{
+	struct ib_sa_attr_list *attr_list;
+
+	attr_list = find_attr_list(root, key);
+	if (!attr_list) {
+		attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
+		if (!attr_list)
+			return -ENOMEM;
+
+		attr_list->iter.next = NULL;
+		attr_list->tail = &attr_list->iter;
+		attr_list->update_id = update_id;
+		memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);
+
+		/*
+		 * insert_attr_list() reports a duplicate by returning the
+		 * existing entry, which is a valid pointer and not an
+		 * ERR_PTR, so it must not be decoded with PTR_ERR().
+		 */
+		if (insert_attr_list(root, attr_list)) {
+			kfree(attr_list);
+			return -EEXIST;
+		}
+	} else if (attr_list->update_id != update_id) {
+		free_attr_list(attr_list);
+		attr_list->update_id = update_id;
+	}
+
+	/* Append at the tail to keep the attributes in arrival order. */
+	attr_list->tail->next = iter;
+	iter->next = NULL;
+	attr_list->tail = iter;
+	return 0;
+}
+
+/*
+ * Allocate a cursor for walking the path records carried by a received SA
+ * response.
+ *
+ * Returns ERR_PTR(-EINVAL) when the attribute offset announced in the SA
+ * header is smaller than a path record, ERR_PTR(-ENOMEM) on allocation
+ * failure.  Release the cursor with ib_sa_iter_free().
+ */
+static struct ib_sa_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_mad *sa_mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+	int rec_len = 64;	/* path record length */
+	/* The header value is multiplied by 8 to obtain a byte count. */
+	int stride = be16_to_cpu(sa_mad->sa_hdr.attr_offset) * 8;
+	struct ib_sa_iter *cursor;
+
+	if (stride < rec_len)
+		return ERR_PTR(-EINVAL);
+
+	/* Trailing rec_len bytes back attr_data[] for reassembly. */
+	cursor = kzalloc(sizeof *cursor + rec_len, GFP_KERNEL);
+	if (!cursor)
+		return ERR_PTR(-ENOMEM);
+
+	cursor->recv_wc = mad_recv_wc;
+	cursor->recv_buf = &mad_recv_wc->recv_buf;
+	cursor->attr_size = rec_len;
+	cursor->attr_offset = stride;
+	cursor->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
+	return cursor;
+}
+
+/*
+ * Release a cursor obtained from ib_sa_iter_create().  The received MAD it
+ * refers to is not touched.
+ */
+static void ib_sa_iter_free(struct ib_sa_iter *iter)
+{
+	kfree(iter);
+}
+
+/*
+ * Return a pointer to the next attribute of the response, or NULL once
+ * fewer than attr_offset bytes of data remain.
+ *
+ * The returned pointer refers either directly into the current receive
+ * buffer or, for an attribute split across two buffers, to the cursor's
+ * attr_data scratch area.  In both cases it is only valid until the next
+ * call.
+ *
+ * NOTE(review): data_left is reduced by attr_offset per attribute while
+ * data_offset advances by attr_size; this looks like it assumes
+ * attr_offset == attr_size - confirm against responses with padding.
+ */
+static void *ib_sa_iter_next(struct ib_sa_iter *iter)
+{
+ struct ib_sa_mad *mad;
+ /* offset != 0 means the first 'offset' bytes are already in attr_data. */
+ int left, offset = 0;
+
+ while (iter->data_left >= iter->attr_offset) {
+ while (iter->data_offset < IB_MGMT_SA_DATA) {
+ mad = (struct ib_sa_mad *) iter->recv_buf->mad;
+
+ left = IB_MGMT_SA_DATA - iter->data_offset;
+ if (left < iter->attr_size) {
+ /* copy first piece of the attribute */
+ iter->attr = &iter->attr_data;
+ memcpy(iter->attr,
+ &mad->data[iter->data_offset], left);
+ offset = left;
+ /* leave the inner loop to move on to the next buffer */
+ break;
+ } else if (offset) {
+ /* copy the second piece of the attribute */
+ memcpy(iter->attr + offset, &mad->data[0],
+ iter->attr_size - offset);
+ iter->data_offset = iter->attr_size - offset;
+ offset = 0;
+ } else {
+ /* attribute lies entirely inside this buffer */
+ iter->attr = &mad->data[iter->data_offset];
+ iter->data_offset += iter->attr_size;
+ }
+
+ iter->data_left -= iter->attr_offset;
+ goto out;
+ }
+ /* current buffer exhausted: continue at the start of the next one */
+ iter->data_offset = 0;
+ iter->recv_buf = list_entry(iter->recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ }
+ iter->attr = NULL;
+out:
+ return iter->attr;
+}
+
+/*
+ * Copy path records from a received response and insert them into our cache.
+ * Path records in the MADs are in network order, packed, and may span
+ * multiple MAD buffers, just to make our life hard.
+ *
+ * For a full update the port's update generation is advanced before the
+ * records are inserted and, once the response has been consumed, every
+ * destination that was not refreshed is dropped from the cache.
+ *
+ * Processing stops silently at the first allocation or insertion failure.
+ */
+static void update_path_db(struct sa_db_port *port,
+			   struct ib_mad_recv_wc *mad_recv_wc,
+			   enum sa_update_type type)
+{
+	struct ib_sa_iter *iter;
+	struct ib_path_rec_info *path_info;
+	void *attr;
+	int ret;
+
+	iter = ib_sa_iter_create(mad_recv_wc);
+	if (IS_ERR(iter))
+		return;
+
+	port->update_id += (type == SA_UPDATE_FULL);
+
+	while ((attr = ib_sa_iter_next(iter)) &&
+	       (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {
+
+		ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);
+		/*
+		 * kmalloc() does not zero memory and unpacking only fills
+		 * ->rec; the usage counter read by get_next_path() must
+		 * start from a defined value.
+		 */
+		path_info->lookups = 0;
+
+		down_write(&lock);
+		ret = insert_attr(&port->paths, port->update_id,
+				  path_info->rec.dgid.raw, &path_info->iter);
+		up_write(&lock);
+
+		if (ret) {
+			kfree(path_info);
+			break;
+		}
+	}
+	ib_sa_iter_free(iter);
+
+	if (type == SA_UPDATE_FULL) {
+		down_write(&lock);
+		remove_old_attrs(&port->paths, port->update_id);
+		up_write(&lock);
+	}
+}
+
+/*
+ * Allocate a send buffer addressed to the SM recorded for @port and attach
+ * a freshly created address handle to it.  @port and @update are stored in
+ * the buffer's context slots for the completion handlers.
+ *
+ * Returns NULL if either the buffer or the address handle cannot be created.
+ * On success the caller owns both the buffer and buf->ah.
+ */
+static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
+					  struct update_info *update)
+{
+	struct ib_mad_send_buf *buf;
+	struct ib_ah_attr attr;
+
+	buf = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
+				 IB_MGMT_SA_DATA, GFP_KERNEL);
+	if (IS_ERR(buf))
+		return NULL;
+
+	memset(&attr, 0, sizeof attr);
+	attr.port_num = port->port_num;
+	attr.sl = port->sm_sl;
+	attr.dlid = port->sm_lid;
+
+	buf->ah = ib_create_ah(port->agent->qp->pd, &attr);
+	if (IS_ERR(buf->ah)) {
+		ib_free_send_mad(buf);
+		return NULL;
+	}
+
+	buf->context[0] = port;
+	buf->context[1] = update;
+	buf->retries = 0;
+	buf->timeout_ms = retry_timer;
+	return buf;
+}
+
+/*
+ * Build a transaction ID: @hi_tid in the upper 32 bits and a counter that
+ * is incremented on every call in the lower 32 bits, in big-endian order.
+ */
+static __be64 form_tid(u32 hi_tid)
+{
+	static atomic_t tid;
+	u32 low = (u32) atomic_inc_return(&tid);
+	u64 full = (((u64) hi_tid) << 32) | low;
+
+	return cpu_to_be64(full);
+}
+
+/*
+ * Fill @msg with a GetTable(PathRecord) request.
+ *
+ * The query always selects on SGID (the port GID) and NumbPath
+ * (paths_per_dest).  For an SA_UPDATE_ADD update it additionally selects on
+ * the DGID carried by @update; otherwise paths to all destinations are
+ * requested.
+ */
+static void format_path_req(struct sa_db_port *port,
+			    struct update_info *update,
+			    struct ib_mad_send_buf *msg)
+{
+	struct ib_sa_mad *mad = msg->mad;
+	struct ib_sa_path_rec path_rec;
+
+	mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+	mad->mad_hdr.method = IB_SA_METHOD_GET_TABLE;
+	mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->mad_hdr.tid = form_tid(msg->mad_agent->hi_tid);
+
+	mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;
+
+	/*
+	 * The whole record is packed into the MAD below, not just the
+	 * fields named in the component mask.  Zero it first so that no
+	 * uninitialised stack contents are sent on the wire.
+	 */
+	memset(&path_rec, 0, sizeof path_rec);
+	path_rec.sgid = port->gid;
+	path_rec.numb_path = paths_per_dest;
+
+	if (update->type == SA_UPDATE_ADD) {
+		mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
+		memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
+	}
+
+	ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
+}
+
+/*
+ * Build and post the path record query for @update.
+ *
+ * On success the posted buffer is recorded in port->msg and is released by
+ * send_handler() when the send completes.  On failure nothing is left
+ * allocated and port->msg is NULL, so a later ib_cancel_mad() on port->msg
+ * never receives a pointer to freed memory.
+ *
+ * Returns 0 on success, -ENOMEM if the message cannot be allocated, or the
+ * error returned by ib_post_send_mad().
+ */
+static int send_query(struct sa_db_port *port,
+		      struct update_info *update)
+{
+	int ret;
+
+	port->msg = get_sa_msg(port, update);
+	if (!port->msg)
+		return -ENOMEM;
+
+	format_path_req(port, update, port->msg);
+
+	ret = ib_post_send_mad(port->msg, NULL);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	ib_destroy_ah(port->msg->ah);
+	ib_free_send_mad(port->msg);
+	port->msg = NULL;
+	return ret;
+}
+
+/*
+ * Queue an update of @type for @port and start processing if the port is
+ * idle.
+ *
+ * @gid may be NULL (as for a full update), in which case the update's gid
+ * is left unset.  If the update cannot be allocated it is dropped, but
+ * processing of already queued updates is still started.
+ *
+ * Updates are appended to the tail of the list.  process_updates() and the
+ * MAD handlers treat the head of the list as the update currently in
+ * flight; adding at the head would displace an outstanding query, so that
+ * its response is discarded and the query is sent again later.
+ */
+static void add_update(struct sa_db_port *port, u8 *gid,
+		       enum sa_update_type type)
+{
+	struct update_info *update;
+
+	update = kmalloc(sizeof *update, GFP_KERNEL);
+	if (update) {
+		if (gid)
+			memcpy(&update->gid, gid, sizeof update->gid);
+		update->type = type;
+		list_add_tail(&update->list, &port->update_list);
+	}
+
+	if (port->state == SA_DB_IDLE) {
+		port->state = SA_DB_REFRESH;
+		process_updates(port);
+	}
+}
+
+/* Discard and free every update still queued on @port. */
+static void clean_update_list(struct sa_db_port *port)
+{
+	struct update_info *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &port->update_list, list) {
+		list_del(&cur->list);
+		kfree(cur);
+	}
+}
+
+/*
+ * Callback for both inform-info registrations of a port.
+ *
+ * The trap number of @info tells whether the callback belongs to the
+ * GID-in-service registration (updates are of type SA_UPDATE_ADD) or the
+ * GID-out-of-service registration (SA_UPDATE_REMOVE).
+ *
+ * - With a @notice, an update for the reported GID is queued.
+ * - Without a notice and status -ENETRESET, the registration pointer is
+ *   cleared so that process_updates() registers again.
+ * - Otherwise the registration pointer is marked as failed when @status is
+ *   non-zero, pending updates are dropped, the port returns to idle and a
+ *   refresh is scheduled on the workqueue.
+ *
+ * Nothing is done once the port is being destroyed.  Returns @status, or 0
+ * for a port in SA_DB_DESTROY state.
+ */
+static int notice_handler(int status, struct ib_inform_info *info,
+			  struct ib_sa_notice *notice)
+{
+	struct sa_db_port *port = info->context;
+	struct ib_sa_notice_data_gid *gid_data;
+	struct ib_inform_info **pinfo;
+	enum sa_update_type type;
+
+	if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
+		pinfo = &port->in_info;
+		type = SA_UPDATE_ADD;
+	} else {
+		pinfo = &port->out_info;
+		type = SA_UPDATE_REMOVE;
+	}
+
+	down_write(&lock);
+	if (port->state == SA_DB_DESTROY) {
+		up_write(&lock);
+		return 0;
+	}
+
+	if (notice) {
+		gid_data = (struct ib_sa_notice_data_gid *)
+			   &notice->data_details;
+		add_update(port, gid_data->gid, type);
+		up_write(&lock);
+	} else if (status == -ENETRESET) {
+		*pinfo = NULL;
+		up_write(&lock);
+	} else {
+		if (status)
+			*pinfo = ERR_PTR(-EINVAL);
+		port->state = SA_DB_IDLE;
+		clean_update_list(port);
+		up_write(&lock);
+		queue_work(sa_wq, &port->work);
+	}
+
+	return status;
+}
+
+/*
+ * Register for GID-in-service traps on @port.  The result, including an
+ * ERR_PTR on failure, is stored in port->in_info.
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int reg_in_info(struct sa_db_port *port)
+{
+	port->in_info = ib_sa_register_inform_info(&sa_client,
+						   port->dev->device,
+						   port->port_num,
+						   IB_SA_SM_TRAP_GID_IN_SERVICE,
+						   GFP_KERNEL, notice_handler,
+						   port);
+	if (!IS_ERR(port->in_info))
+		return 0;
+
+	return PTR_ERR(port->in_info);
+}
+
+/*
+ * Register for GID-out-of-service traps on @port.  The result, including an
+ * ERR_PTR on failure, is stored in port->out_info.
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int reg_out_info(struct sa_db_port *port)
+{
+	port->out_info = ib_sa_register_inform_info(&sa_client,
+						    port->dev->device,
+						    port->port_num,
+						    IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
+						    GFP_KERNEL, notice_handler,
+						    port);
+	if (!IS_ERR(port->out_info))
+		return 0;
+
+	return PTR_ERR(port->out_info);
+}
+
+/*
+ * Drop everything cached or pending for @port: unregister both trap
+ * registrations, wait for queued work, discard pending updates and empty
+ * the path tree.
+ *
+ * NOTE(review): this flushes sa_wq, and port_work_handler() takes 'lock'
+ * for writing.  process_updates() calls this function and is itself reached
+ * from paths that hold 'lock' - confirm this cannot deadlock.
+ */
+static void cleanup_port(struct sa_db_port *port)
+{
+ /* Skip registrations that never happened (NULL) or failed (ERR_PTR). */
+ if (port->in_info && !IS_ERR(port->in_info))
+ ib_sa_unregister_inform_info(port->in_info);
+
+ if (port->out_info && !IS_ERR(port->out_info))
+ ib_sa_unregister_inform_info(port->out_info);
+
+ /* NULL lets process_updates() attempt the registrations again. */
+ port->out_info = NULL;
+ port->in_info = NULL;
+
+ flush_workqueue(sa_wq);
+
+ clean_update_list(port);
+ remove_all_attrs(&port->paths);
+}
+
+/*
+ * Refresh the SM LID and SL recorded for @port from the current port
+ * attributes.
+ *
+ * Returns 0 on success, the error from ib_query_port(), or -ENODATA if the
+ * port is not active (in which case the recorded values are left alone).
+ */
+static int update_port_info(struct sa_db_port *port)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	err = ib_query_port(port->dev->device, port->port_num, &attr);
+	if (!err) {
+		if (attr.state == IB_PORT_ACTIVE) {
+			port->sm_lid = attr.sm_lid;
+			port->sm_sl = attr.sm_sl;
+		} else {
+			err = -ENODATA;
+		}
+	}
+	return err;
+}
+
+/*
+ * Work through the port's update list.
+ *
+ * The function returns early, leaving the port in its current state, as soon
+ * as it has started something asynchronous: a trap registration or a path
+ * record query.  Presumably processing then resumes from notice_handler()
+ * (which schedules the port work) or send_handler() (which calls back into
+ * this function) - TODO confirm the registration callback is always
+ * delivered.  When nothing is left to do the port is set to SA_DB_IDLE.
+ *
+ * NOTE(review): callers visible in this file hold 'lock' for writing while
+ * calling this; cleanup_port() below flushes sa_wq, whose work handler
+ * takes the same lock - confirm this cannot deadlock.
+ */
+static void process_updates(struct sa_db_port *port)
+{
+ struct update_info *update;
+ struct ib_sa_attr_list *attr_list;
+ int ret;
+
+ /* Cache disabled, or port attributes unavailable / port not active. */
+ if (!paths_per_dest || update_port_info(port)) {
+ cleanup_port(port);
+ goto out;
+ }
+
+ /* Event registration is an optimization, so ignore failures. */
+ if (!port->out_info) {
+ ret = reg_out_info(port);
+ /* registration started: stop here until its callback runs */
+ if (!ret)
+ return;
+ }
+
+ if (!port->in_info) {
+ ret = reg_in_info(port);
+ if (!ret)
+ return;
+ }
+
+ while (!list_empty(&port->update_list)) {
+ update = list_entry(port->update_list.next,
+ struct update_info, list);
+
+ if (update->type == SA_UPDATE_REMOVE) {
+ /* removals need no query: drop the cached paths now */
+ attr_list = find_attr_list(&port->paths,
+ update->gid.raw);
+ if (attr_list)
+ remove_attr(&port->paths, attr_list);
+ } else {
+ ret = send_query(port, update);
+ /* query posted: the update stays queued until it completes */
+ if (!ret)
+ return;
+
+ }
+ /* update handled, or its query could not be sent: discard it */
+ list_del(&update->list);
+ kfree(update);
+ }
+out:
+ port->state = SA_DB_IDLE;
+}
+
+/*
+ * Request a full refresh of the cache of @port.  Updates that are pending
+ * or in flight are abandoned first.  Does nothing for a port that is being
+ * destroyed.
+ */
+static void refresh_port_db(struct sa_db_port *port)
+{
+	switch (port->state) {
+	case SA_DB_DESTROY:
+		return;
+	case SA_DB_REFRESH:
+		/* Drop queued updates and cancel the outstanding query. */
+		clean_update_list(port);
+		ib_cancel_mad(port->agent, port->msg);
+		break;
+	default:
+		break;
+	}
+
+	add_update(port, NULL, SA_UPDATE_FULL);
+}
+
+/* Request a full refresh on every port of @dev. */
+static void refresh_dev_db(struct sa_db_device *dev)
+{
+	int idx = 0;
+
+	while (idx < dev->port_count) {
+		refresh_port_db(&dev->port[idx]);
+		idx++;
+	}
+}
+
+/* Request a full refresh on every port of every device on dev_list. */
+static void refresh_db(void)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &dev_list)
+		refresh_dev_db(list_entry(pos, struct sa_db_device, list));
+}
+
+/*
+ * Store handler of the write-only "refresh" attribute: any write triggers
+ * a full refresh of all ports.  The written data is ignored and reported
+ * as fully consumed.
+ */
+static ssize_t do_refresh(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t count)
+{
+	ssize_t consumed = count;
+
+	down_write(&lock);
+	refresh_db();
+	up_write(&lock);
+
+	return consumed;
+}
+static DEVICE_ATTR(refresh, S_IWUSR, NULL, do_refresh);
+
+/*
+ * Show handler of the "lookup_method" attribute: prints one line per
+ * available method and marks the active one with '*'.
+ */
+static ssize_t get_lookup_method(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	char least_used_mark, random_mark;
+
+	least_used_mark = (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ';
+	random_mark = (lookup_method == SA_DB_LOOKUP_RANDOM) ? '*' : ' ';
+
+	return sprintf(buf,
+		       "%c %d round robin\n"
+		       "%c %d random\n",
+		       least_used_mark, SA_DB_LOOKUP_LEAST_USED,
+		       random_mark, SA_DB_LOOKUP_RANDOM);
+}
+
+/*
+ * Store handler of the "lookup_method" attribute.
+ *
+ * The written number selects the lookup method.  Anything that is not a
+ * valid method falls back to SA_DB_LOOKUP_LEAST_USED.  SA_DB_LOOKUP_MAX is
+ * the number of methods and therefore itself out of range.  The value is
+ * validated before it is published so that readers never observe an
+ * invalid method.
+ *
+ * Always reports the whole input as consumed.
+ */
+static ssize_t set_lookup_method(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	unsigned long method = simple_strtoul(buf, NULL, 0);
+
+	if (method >= SA_DB_LOOKUP_MAX)
+		method = SA_DB_LOOKUP_LEAST_USED;
+
+	down_write(&lock);
+	lookup_method = method;
+	up_write(&lock);
+
+	return count;
+}
+static DEVICE_ATTR(lookup_method, S_IRUGO | S_IWUSR,
+		   get_lookup_method, set_lookup_method);
+
+/* Show handler of the "paths_per_dest" attribute: prints the current value. */
+static ssize_t get_paths_per_dest(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int written = sprintf(buf, "%lu\n", paths_per_dest);
+
+	return written;
+}
+
+/*
+ * Store handler of the "paths_per_dest" attribute: takes over the written
+ * number, limited to SA_DB_MAX_PATHS_PER_DEST, and triggers a full refresh
+ * of all ports.  Always reports the whole input as consumed.
+ */
+static ssize_t set_paths_per_dest(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	down_write(&lock);
+
+	paths_per_dest = simple_strtoul(buf, NULL, 0);
+	paths_per_dest = min_t(unsigned long, paths_per_dest,
+			       SA_DB_MAX_PATHS_PER_DEST);
+	refresh_db();
+
+	up_write(&lock);
+
+	return count;
+}
+static DEVICE_ATTR(paths_per_dest, S_IRUGO | S_IWUSR,
+		   get_paths_per_dest, set_paths_per_dest);
+
+/* Workqueue handler: refresh the cache of the port that owns @work. */
+static void port_work_handler(struct work_struct *work)
+{
+	struct sa_db_port *port = container_of(work, struct sa_db_port, work);
+
+	down_write(&lock);
+	refresh_port_db(port);
+	up_write(&lock);
+}
+
+/*
+ * Asynchronous event callback.  For the port events listed below, the
+ * refresh work of the affected port is queued.  All other events are
+ * ignored.
+ */
+static void handle_event(struct ib_event_handler *event_handler,
+			 struct ib_event *event)
+{
+	struct sa_db_device *dev;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_PORT_ACTIVE:
+		break;
+	default:
+		return;
+	}
+
+	dev = container_of(event_handler, struct sa_db_device, event_handler);
+	queue_work(sa_wq,
+		   &dev->port[event->element.port_num - dev->start_port].work);
+}
+
+/*
+ * Pick one of the paths behind @iter that match @sgid and @pkey.
+ *
+ * Every match draws a fresh random number and replaces the current pick
+ * when that number is divisible by the count of matches seen so far, so
+ * the first match is always taken and later ones may displace it.
+ *
+ * Returns NULL if no path matches.
+ */
+static struct ib_sa_path_rec *get_random_path(struct ib_sa_iterator *iter,
+					      union ib_gid *sgid, u16 pkey)
+{
+	struct ib_sa_path_rec *cur, *chosen = NULL;
+	int rnd, matches = 0;
+
+	while ((cur = ib_get_next_sa_attr(&iter))) {
+		if (pkey != cur->pkey ||
+		    memcmp(sgid, cur->sgid.raw, sizeof *sgid))
+			continue;
+
+		get_random_bytes(&rnd, sizeof rnd);
+		matches++;
+		if (!(rnd % matches))
+			chosen = cur;
+	}
+
+	return chosen;
+}
+
+/*
+ * Among the paths behind @iter that match @sgid and @pkey, return the one
+ * with the lowest lookup count (the first such path on a tie) and increment
+ * its count.
+ *
+ * Returns NULL if no matching path has a count below the initial maximum.
+ */
+static struct ib_sa_path_rec *get_next_path(struct ib_sa_iterator *iter,
+					    union ib_gid *sgid, u16 pkey)
+{
+	struct ib_path_rec_info *candidate, *least_used = NULL;
+	struct ib_sa_path_rec *cur;
+	unsigned long fewest = ~0;
+
+	while ((cur = ib_get_next_sa_attr(&iter))) {
+		if (pkey != cur->pkey ||
+		    memcmp(sgid, cur->sgid.raw, sizeof *sgid))
+			continue;
+
+		/* iter now points at the link embedded in this record. */
+		candidate = container_of(iter, struct ib_path_rec_info, iter);
+		if (candidate->lookups < fewest) {
+			fewest = candidate->lookups;
+			least_used = candidate;
+		}
+	}
+
+	if (!least_used)
+		return NULL;
+
+	least_used->lookups++;
+	return &least_used->rec;
+}
+
+/*
+ * Copy a cached path record for @dgid that matches @sgid and @pkey into
+ * @rec.  The record is chosen at random when the lookup method is
+ * SA_DB_LOOKUP_RANDOM and by lowest usage count otherwise.
+ *
+ * Returns 0 on success, the error from ib_create_path_iter() if no list is
+ * cached for @dgid, or -ENODATA if none of the cached paths match.
+ */
+int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
+		    union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec)
+{
+	struct ib_sa_iterator *iter;
+	struct ib_sa_path_rec *found;
+	int ret = -ENODATA;
+
+	iter = ib_create_path_iter(device, port_num, dgid);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	found = (lookup_method == SA_DB_LOOKUP_RANDOM) ?
+		get_random_path(iter, sgid, pkey) :
+		get_next_path(iter, sgid, pkey);
+
+	if (found) {
+		memcpy(rec, found, sizeof *rec);
+		ret = 0;
+	}
+
+	/* Drops the read lock taken by ib_create_path_iter(). */
+	ib_free_sa_iter(iter);
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_path_rec);
+
+/*
+ * ib_create_path_iter - start walking the cached path records for @dgid.
+ * @device:   local device to look at
+ * @port_num: port of the local device
+ * @dgid:     destination GID of the wanted path records
+ *
+ * On success the head of the cached list is returned with the global read
+ * lock held.  The caller must release it with ib_free_sa_iter().
+ *
+ * Returns ERR_PTR(-ENODEV) if the device is not tracked by this module,
+ * ERR_PTR(-EINVAL) if @port_num is not a port of the device, or
+ * ERR_PTR(-ENODATA) if nothing is cached for @dgid.  The lock is not held
+ * in any error case.
+ */
+struct ib_sa_iterator *ib_create_path_iter(struct ib_device *device,
+					   u8 port_num, union ib_gid *dgid)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+	struct ib_sa_attr_list *list;
+	int idx, ret;
+
+	down_read(&lock);
+	dev = ib_get_client_data(device, &sa_db_client);
+	if (!dev) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	/* Reject port numbers that would index outside dev->port[]. */
+	idx = port_num - dev->start_port;
+	if (idx < 0 || idx >= dev->port_count) {
+		ret = -EINVAL;
+		goto err;
+	}
+	port = &dev->port[idx];
+
+	list = find_attr_list(&port->paths, dgid->raw);
+	if (!list) {
+		ret = -ENODATA;
+		goto err;
+	}
+
+	/* Success: return with the read lock still held. */
+	return &list->iter;
+err:
+	up_read(&lock);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_create_path_iter);
+
+/*
+ * Finish a walk started with ib_create_path_iter().  Only the read lock is
+ * released; @iter itself is not used.
+ */
+void ib_free_sa_iter(struct ib_sa_iterator *iter)
+{
+	up_read(&lock);
+}
+EXPORT_SYMBOL(ib_free_sa_iter);
+
+/*
+ * Advance *@iter to the next list element and return the attribute data
+ * stored directly behind that element's link, or NULL at the end of the
+ * list.
+ */
+void *ib_get_next_sa_attr(struct ib_sa_iterator **iter)
+{
+	struct ib_sa_iterator *following = (*iter)->next;
+
+	*iter = following;
+	if (!following)
+		return NULL;
+
+	/* The attribute starts right after the embedded link. */
+	return (void *) (following + 1);
+}
+EXPORT_SYMBOL(ib_get_next_sa_attr);
+
+/*
+ * MAD receive callback: feed a query response into the path cache.
+ *
+ * The response is only used if the port is not being destroyed and the
+ * update it answers is still at the head of the port's update list.
+ * The received MAD is freed in every case.
+ */
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	struct sa_db_port *port;
+	struct update_info *update;
+	enum sa_update_type type = SA_UPDATE_FULL;
+	int current_update;
+
+	/* The work request ID carries the send buffer of the request. */
+	msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
+	port = msg->context[0];
+	update = msg->context[1];
+
+	down_write(&lock);
+	current_update = port->state != SA_DB_DESTROY &&
+			 update == list_entry(port->update_list.next,
+					      struct update_info, list);
+	/* Read the type under the lock; update is only valid if current. */
+	if (current_update)
+		type = update->type;
+	up_write(&lock);
+
+	if (current_update)
+		update_path_db(mad_agent->context, mad_recv_wc, type);
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+/*
+ * MAD send-completion callback for path record queries.
+ *
+ * If the completed query belongs to the update at the head of the list and
+ * it timed out, it is posted again with a doubled timeout until the timeout
+ * reaches SA_DB_MAX_RETRY_TIMER; in that case the buffer stays in use and
+ * the function returns early.  Otherwise the update is retired, the next
+ * update is processed, and the send buffer and its address handle are
+ * released.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct sa_db_port *port;
+ struct update_info *update;
+ int ret;
+
+ msg = mad_send_wc->send_buf;
+ port = msg->context[0];
+ update = msg->context[1];
+
+ down_write(&lock);
+ /* port is going away: only release the buffer */
+ if (port->state == SA_DB_DESTROY)
+ goto unlock;
+
+ /*
+ * update may point to memory already freed by clean_update_list();
+ * it is only dereferenced when it is still the head of the list.
+ */
+ if (update == list_entry(port->update_list.next,
+ struct update_info, list)) {
+
+ if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
+ msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
+
+ /* back off and retry with the same buffer */
+ msg->timeout_ms <<= 1;
+ ret = ib_post_send_mad(msg, NULL);
+ if (!ret) {
+ up_write(&lock);
+ return;
+ }
+ }
+ list_del(&update->list);
+ kfree(update);
+ }
+ /* may post a new query, which replaces port->msg but not local msg */
+ process_updates(port);
+unlock:
+ up_write(&lock);
+
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+}
+
+/*
+ * Initialise the sa_db_port slot for @port_num: set up its work item, path
+ * tree and update list, read the port GID (index 0) and register a MAD
+ * agent on the GSI QP.
+ *
+ * Returns 0 on success, the error from ib_get_cached_gid(), or the error
+ * encoded in the pointer returned by ib_register_mad_agent().
+ */
+static int init_port(struct sa_db_device *dev, int port_num)
+{
+	struct sa_db_port *port = &dev->port[port_num - dev->start_port];
+	int ret;
+
+	INIT_LIST_HEAD(&port->update_list);
+	port->paths = RB_ROOT;
+	INIT_WORK(&port->work, port_work_handler);
+	port->port_num = port_num;
+	port->dev = dev;
+
+	ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
+	if (ret)
+		return ret;
+
+	port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
+					    NULL, IB_MGMT_RMPP_VERSION,
+					    send_handler, recv_handler, port);
+	if (IS_ERR(port->agent))
+		return PTR_ERR(port->agent);
+
+	return 0;
+}
+
+/*
+ * Tear down a port.  The port is marked SA_DB_DESTROY first so that
+ * handlers still running back off.  Then the MAD agent is unregistered and
+ * all cached and pending state is released.
+ */
+static void destroy_port(struct sa_db_port *port)
+{
+	/* Publish the state change under the lock the handlers take. */
+	down_write(&lock);
+	port->state = SA_DB_DESTROY;
+	up_write(&lock);
+
+	ib_unregister_mad_agent(port->agent);
+
+	cleanup_port(port);
+}
+
+/*
+ * ib_client "add" callback: set up caching for a newly registered device.
+ *
+ * Devices that do not use the IB transport are ignored.  One sa_db_port is
+ * created per port; a switch is handled as a single port numbered 0.  If
+ * any port fails to initialise, the ports initialised so far are destroyed
+ * and the device is not tracked at all.
+ */
+static void sa_db_add_dev(struct ib_device *device)
+{
+ struct sa_db_device *dev;
+ struct sa_db_port *port;
+ int s, e, i, ret;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ /* s..e is the inclusive range of port numbers of the device */
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ /* device structure and its trailing port array in one allocation */
+ dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
+ if (!dev)
+ return;
+
+ dev->start_port = s;
+ dev->port_count = e - s + 1;
+ dev->device = device;
+ for (i = 0; i < dev->port_count; i++) {
+ ret = init_port(dev, s + i);
+ if (ret)
+ goto err;
+ }
+
+ ib_set_client_data(device, &sa_db_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+
+ /* make the device visible and start the initial full query */
+ down_write(&lock);
+ list_add_tail(&dev->list, &dev_list);
+ refresh_dev_db(dev);
+ up_write(&lock);
+
+ /* NOTE(review): the return value is ignored - confirm that is intended */
+ ib_register_event_handler(&dev->event_handler);
+ return;
+err:
+ /* i is the index of the failed port; undo ports i-1 down to 0 */
+ while (i--)
+ destroy_port(&dev->port[i]);
+ kfree(dev);
+}
+
+/*
+ * ib_client "remove" callback: stop event delivery, wait for queued work,
+ * destroy all ports and free the per-device state.  Devices that were
+ * never tracked are ignored.
+ */
+static void sa_db_remove_dev(struct ib_device *device)
+{
+	struct sa_db_device *dev = ib_get_client_data(device, &sa_db_client);
+	int idx;
+
+	if (!dev)
+		return;
+
+	/* No new work can be queued by events after this point. */
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(sa_wq);
+
+	idx = 0;
+	while (idx < dev->port_count) {
+		destroy_port(&dev->port[idx]);
+		idx++;
+	}
+
+	down_write(&lock);
+	list_del(&dev->list);
+	up_write(&lock);
+
+	kfree(dev);
+}
+
+/*
+ * Module initialisation: create the workqueue, register as SA client and
+ * IB client, register the misc device and create its three attributes.
+ * Every step that has already succeeded is undone if a later one fails.
+ */
+static int __init sa_db_init(void)
+{
+	struct device *misc_dev;
+	int ret;
+
+	sa_wq = create_singlethread_workqueue("local_sa");
+	if (!sa_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+	ret = ib_register_client(&sa_db_client);
+	if (ret)
+		goto undo_sa_client;
+
+	ret = misc_register(&local_sa_misc);
+	if (ret)
+		goto undo_ib_client;
+
+	/* this_device is valid once misc_register() has succeeded. */
+	misc_dev = local_sa_misc.this_device;
+
+	ret = device_create_file(misc_dev, &dev_attr_refresh);
+	if (ret)
+		goto undo_misc;
+
+	ret = device_create_file(misc_dev, &dev_attr_paths_per_dest);
+	if (ret)
+		goto undo_refresh_attr;
+
+	ret = device_create_file(misc_dev, &dev_attr_lookup_method);
+	if (ret)
+		goto undo_paths_attr;
+
+	return 0;
+
+undo_paths_attr:
+	device_remove_file(misc_dev, &dev_attr_paths_per_dest);
+undo_refresh_attr:
+	device_remove_file(misc_dev, &dev_attr_refresh);
+undo_misc:
+	misc_deregister(&local_sa_misc);
+undo_ib_client:
+	ib_unregister_client(&sa_db_client);
+undo_sa_client:
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(sa_wq);
+	return ret;
+}
+
+/* Module exit: undo sa_db_init() in reverse order. */
+static void __exit sa_db_cleanup(void)
+{
+	struct device *misc_dev = local_sa_misc.this_device;
+
+	device_remove_file(misc_dev, &dev_attr_lookup_method);
+	device_remove_file(misc_dev, &dev_attr_paths_per_dest);
+	device_remove_file(misc_dev, &dev_attr_refresh);
+	misc_deregister(&local_sa_misc);
+
+	ib_unregister_client(&sa_db_client);
+	ib_sa_unregister_client(&sa_client);
+
+	destroy_workqueue(sa_wq);
+}
+
+module_init(sa_db_init);
+module_exit(sa_db_cleanup);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 8de4ad8..1dd8063 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -464,6 +464,32 @@ static const struct ib_field notice_table[] = {
.size_bits = 128 },
};
+/*
+ * Pack the attribute structure at @src into wire format at @dst.
+ * Only IB_SA_ATTR_PATH_REC is supported; any other @attr_id yields -EINVAL.
+ * Returns 0 on success.
+ */
+int ib_sa_pack_attr(void *dst, void *src, int attr_id)
+{
+	if (attr_id != IB_SA_ATTR_PATH_REC)
+		return -EINVAL;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+	return 0;
+}
+EXPORT_SYMBOL(ib_sa_pack_attr);
+
+/*
+ * Unpack the wire-format attribute at @src into the structure at @dst.
+ * Only IB_SA_ATTR_PATH_REC is supported; any other @attr_id yields -EINVAL.
+ * Returns 0 on success.
+ */
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id)
+{
+	if (attr_id != IB_SA_ATTR_PATH_REC)
+		return -EINVAL;
+
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+	return 0;
+}
+EXPORT_SYMBOL(ib_sa_unpack_attr);
+
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
diff --git a/include/rdma/ib_local_sa.h b/include/rdma/ib_local_sa.h
new file mode 100644
index 0000000..0ce084b
--- /dev/null
+++ b/include/rdma/ib_local_sa.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_LOCAL_SA_H
+#define IB_LOCAL_SA_H
+
+#include <rdma/ib_sa.h>
+
+/**
+ * ib_get_path_rec - Query the local SA database for path information.
+ * @device: The local device to query.
+ * @port_num: The port of the local device being queried.
+ * @sgid: The source GID of the path record.
+ * @dgid: The destination GID of the path record.
+ * @pkey: The protection key of the path record.
+ * @rec: A reference to a path record structure that will receive a copy of
+ *   the response.
+ *
+ * Copies a cached path record meeting the specified criteria to the
+ * location referenced by %rec and returns 0.  A return value < 0 indicates
+ * that an error occurred processing the request, or that no matching path
+ * record was found; %rec is not written in that case.
+ */
+int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
+ union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec);
+
+/**
+ * ib_create_path_iter - Create an iterator that may be used to walk through
+ *   a list of path records.
+ * @device: The local device to retrieve path records for.
+ * @port_num: The port of the local device.
+ * @dgid: The destination GID of the path record.
+ *
+ * This call returns an iterator that is used to walk through a list of
+ * cached path records.  All path records accessed by the iterator will have
+ * the specified DGID.  The cache stays read-locked for as long as the
+ * iterator exists, so the user should not hold the iterator for an extended
+ * period of time, and must release it by calling ib_free_sa_iter.
+ *
+ * On failure an ERR_PTR value is returned and nothing needs to be released.
+ */
+struct ib_sa_iterator *ib_create_path_iter(struct ib_device *device,
+ u8 port_num, union ib_gid *dgid);
+
+/**
+ * ib_free_sa_iter - Release an iterator.
+ * @iter: The iterator to free.
+ *
+ * Must be called exactly once for every iterator successfully returned by
+ * ib_create_path_iter.  Path records obtained through the iterator must not
+ * be accessed afterwards.
+ */
+void ib_free_sa_iter(struct ib_sa_iterator *iter);
+
+/**
+ * ib_get_next_sa_attr - Retrieve the next SA attribute referenced by an
+ *   iterator.
+ * @iter: A reference to an iterator that points to the next attribute to
+ *   retrieve.
+ *
+ * Advances the iterator and returns a pointer to the attribute it now
+ * refers to, or NULL when the end of the list has been reached.
+ */
+void *ib_get_next_sa_attr(struct ib_sa_iterator **iter);
+
+#endif /* IB_LOCAL_SA_H */
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 46b52fd..1e5e630 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -552,4 +552,7 @@ ib_sa_register_inform_info(struct ib_sa_client *client,
*/
void ib_sa_unregister_inform_info(struct ib_inform_info *info);
+/*
+ * Convert an SA attribute between its structure form and its wire format.
+ * Return 0 on success or -EINVAL for an unsupported attr_id.
+ */
+int ib_sa_pack_attr(void *dst, void *src, int attr_id);
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id);
+
#endif /* IB_SA_H */
More information about the general
mailing list