[ewg] [PATCH 3/8 v3] ib_core: Add RDMAoE SA support

Eli Cohen eli at mellanox.co.il
Mon Jul 13 11:14:57 PDT 2009


Add support for resolving paths and joining multicast groups for RDMAoE ports.
For Ethernet links, path resolution completes immediately, but the callback is
invoked from a workqueue context to avoid deadlocks. Multicast joins are
handled in nearly the same way as IB multicast joins in multicast.c; however,
they are handled entirely at the host and no MADs are involved. This allows a
client to create groups and dictate the qkey used in each group. The code is
put in rdmaoe_sa.c, which handles both multicast joins/leaves and path
resolution.

The following files were added:
drivers/infiniband/core/multicast.h
drivers/infiniband/core/rdmaoe_sa.c
include/rdma/rdmaoe_sa.h

Changes are made to drivers/infiniband/core/multicast.c,
drivers/infiniband/core/sa_query.c, and drivers/infiniband/core/sa.h to allow
sharing of data structures. New API functions are added for RDMAoE, and
consumers that want to use this API need to be updated accordingly.

Signed-off-by: Eli Cohen <eli at mellanox.co.il>
---
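Note for reviewers: the sketch below shows how a consumer might call the new
API. It is illustrative only and not part of this patch; the callback names
and the example() wrapper are made up, and the component masks are just one
plausible choice.

	#include <linux/err.h>
	#include <rdma/rdmaoe_sa.h>

	static struct ib_sa_client my_sa_client;

	/* Returning non-zero asks the core to free the membership. */
	static int my_join_done(int status, struct ib_sa_multicast *mc)
	{
		if (!status)
			pr_info("joined, qkey 0x%x\n",
				be32_to_cpu(mc->rec.qkey));
		return status;
	}

	/* Always called from workqueue context, never inline. */
	static void my_path_done(int status, struct ib_sa_path_rec *resp,
				 void *context)
	{
	}

	static int example(struct ib_device *device, u8 port_num,
			   struct ib_sa_mcmember_rec *mc_rec,
			   struct ib_sa_path_rec *path_rec)
	{
		struct ib_sa_multicast *mc;
		struct ib_sa_query *query;

		ib_sa_register_client(&my_sa_client);

		mc = rdmaoe_sa_join_multicast(&my_sa_client, device, port_num,
					      mc_rec,
					      IB_SA_MCMEMBER_REC_MGID |
					      IB_SA_MCMEMBER_REC_JOIN_STATE,
					      GFP_KERNEL, my_join_done, NULL);
		if (IS_ERR(mc))
			return PTR_ERR(mc);

		return rdmaoe_sa_path_rec_get(&my_sa_client, device, port_num,
					      path_rec,
					      IB_SA_PATH_REC_DGID |
					      IB_SA_PATH_REC_SGID,
					      1000, GFP_KERNEL, my_path_done,
					      NULL, &query);
	}

A real consumer would keep mc around and eventually call
rdmaoe_sa_free_multicast() on it. On Ethernet links the path callback fires
almost immediately, but always from workqueue context rather than from inside
rdmaoe_sa_path_rec_get() itself.
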
 drivers/infiniband/core/Makefile    |    2 +-
 drivers/infiniband/core/multicast.c |   43 +--
 drivers/infiniband/core/multicast.h |   79 +++
 drivers/infiniband/core/rdmaoe_sa.c |  941 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/sa.h        |   24 +
 drivers/infiniband/core/sa_query.c  |   26 +-
 include/rdma/rdmaoe_sa.h            |   66 +++
 7 files changed, 1116 insertions(+), 65 deletions(-)
 create mode 100644 drivers/infiniband/core/multicast.h
 create mode 100644 drivers/infiniband/core/rdmaoe_sa.c
 create mode 100644 include/rdma/rdmaoe_sa.h

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index cb1ab3e..96db705 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -2,7 +2,7 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= ib_addr.o rdma_cm.o
 user_access-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_ucm.o
 
 obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_sa.o \
-					ib_cm.o iw_cm.o $(infiniband-y)
+					ib_cm.o iw_cm.o rdmaoe_sa.o $(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 					$(user_access-y)
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 107f170..727a55a 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -39,6 +39,7 @@
 
 #include <rdma/ib_cache.h>
 #include "sa.h"
+#include "multicast.h"
 
 static void mcast_add_one(struct ib_device *device);
 static void mcast_remove_one(struct ib_device *device);
@@ -72,52 +73,10 @@ struct mcast_device {
 	struct mcast_port	port[0];
 };
 
-enum mcast_state {
-	MCAST_JOINING,
-	MCAST_MEMBER,
-	MCAST_ERROR,
-};
-
-enum mcast_group_state {
-	MCAST_IDLE,
-	MCAST_BUSY,
-	MCAST_GROUP_ERROR,
-	MCAST_PKEY_EVENT
-};
-
 enum {
 	MCAST_INVALID_PKEY_INDEX = 0xFFFF
 };
 
-struct mcast_member;
-
-struct mcast_group {
-	struct ib_sa_mcmember_rec rec;
-	struct rb_node		node;
-	struct mcast_port	*port;
-	spinlock_t		lock;
-	struct work_struct	work;
-	struct list_head	pending_list;
-	struct list_head	active_list;
-	struct mcast_member	*last_join;
-	int			members[3];
-	atomic_t		refcount;
-	enum mcast_group_state	state;
-	struct ib_sa_query	*query;
-	int			query_id;
-	u16			pkey_index;
-};
-
-struct mcast_member {
-	struct ib_sa_multicast	multicast;
-	struct ib_sa_client	*client;
-	struct mcast_group	*group;
-	struct list_head	list;
-	enum mcast_state	state;
-	atomic_t		refcount;
-	struct completion	comp;
-};
-
 static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
 			 void *context);
 static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
diff --git a/drivers/infiniband/core/multicast.h b/drivers/infiniband/core/multicast.h
new file mode 100644
index 0000000..17eb9fe
--- /dev/null
+++ b/drivers/infiniband/core/multicast.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MULTICAST_H
+#define MULTICAST_H
+
+enum mcast_state {
+	MCAST_JOINING,
+	MCAST_MEMBER,
+	MCAST_ERROR,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
+	MCAST_BUSY,
+	MCAST_GROUP_ERROR,
+	MCAST_PKEY_EVENT
+};
+
+struct mcast_member;
+
+struct mcast_group {
+	struct ib_sa_mcmember_rec rec;
+	struct rb_node		node;
+	struct mcast_port	*port;
+	spinlock_t		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	struct list_head	active_list;
+	struct mcast_member	*last_join;
+	int			members[3];
+	atomic_t		refcount;
+	enum mcast_group_state	state;
+	struct ib_sa_query	*query;
+	int			query_id;
+	u16			pkey_index;
+};
+
+struct mcast_member {
+	struct ib_sa_multicast	multicast;
+	struct ib_sa_client	*client;
+	struct mcast_group	*group;
+	struct list_head	list;
+	enum mcast_state	state;
+	atomic_t		refcount;
+	struct completion	comp;
+};
+
+#endif /* MULTICAST_H */
+
diff --git a/drivers/infiniband/core/rdmaoe_sa.c b/drivers/infiniband/core/rdmaoe_sa.c
new file mode 100644
index 0000000..3548a56
--- /dev/null
+++ b/drivers/infiniband/core/rdmaoe_sa.c
@@ -0,0 +1,941 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/workqueue.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cache.h>
+#include <rdma/rdmaoe_sa.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+#include "sa.h"
+#include "multicast.h"
+
+MODULE_AUTHOR("Eli Cohen");
+MODULE_DESCRIPTION("RDMAoE SA emulation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void rdmaoe_sa_add_one(struct ib_device *device);
+static void rdmaoe_sa_remove_one(struct ib_device *device);
+static void rdmaoe_mcast_add_one(struct ib_device *device);
+static void rdmaoe_mcast_remove_one(struct ib_device *device);
+
+static struct workqueue_struct	*mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+	struct mcast_device	*dev;
+	spinlock_t		lock;
+	struct rb_root		table;
+	atomic_t		refcount;
+	struct completion	comp;
+	u8			port_num;
+	struct socket		*sock;
+	int			bound;
+	int			ifidx;
+};
+
+struct mcast_device {
+	struct ib_device	*device;
+	struct ib_event_handler	event_handler;
+	int			start_port;
+	int			end_port;
+	struct mcast_port	port[0];
+};
+
+enum {
+	MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context);
+
+static struct ib_client sa_client = {
+	.name   = "rdmaoe_sa",
+	.add    = rdmaoe_sa_add_one,
+	.remove = rdmaoe_sa_remove_one
+};
+
+static struct ib_client mcast_client = {
+	.name   = "rdmaoe_multicast",
+	.add    = rdmaoe_mcast_add_one,
+	.remove = rdmaoe_mcast_remove_one
+};
+
+struct rdmaoe_sa_port {
+	u8	port_num;
+};
+
+struct rdmaoe_sa_device {
+	int                     start_port, end_port;
+	struct ib_event_handler event_handler;
+	struct rdmaoe_sa_port port[0];
+};
+struct eth_work {
+	struct work_struct	 work;
+	struct mcast_member	*member;
+	struct ib_device	*device;
+	u8			 port_num;
+};
+
+static int start_igmp6(struct mcast_port *port)
+{
+	struct socket *sock;
+	int err;
+
+	err = sock_create(AF_INET6, SOCK_DGRAM, 0, &sock);
+	if (err)
+		return err;
+
+	port->sock = sock;
+
+	return 0;
+}
+
+static void stop_igmp6(struct mcast_port *port)
+{
+	if (port->sock) {
+		sock_release(port->sock);
+		port->sock = NULL;
+	}
+}
+
+static int get_if_idx(union ib_gid *gid)
+{
+	struct net_device *dev;
+	struct in6_addr addr;
+
+	memcpy(&addr, gid, sizeof *gid);
+	for_each_netdev(&init_net, dev) {
+		if (ipv6_chk_addr(&init_net, &addr, dev, 1))
+			return dev->ifindex;
+	}
+
+	return -1;
+}
+
+static int attach_socket(struct mcast_port *port, union ib_gid *mgid)
+{
+	union ib_gid gid;
+	struct sockaddr_in6 addr = {0};
+	int err;
+	struct ipv6_mreq mcast;
+
+	if (!port->sock)
+		return -EINVAL;
+
+	if (!port->bound) {
+		err = ib_query_gid(port->dev->device, port->port_num, 0, &gid);
+		if (err)
+			return err;
+
+		memcpy(addr.sin6_addr.s6_addr, &gid, sizeof gid);
+
+		addr.sin6_scope_id = get_if_idx(&gid);
+		if (addr.sin6_scope_id == -1)
+			return -EINVAL;
+
+		addr.sin6_family = AF_INET6;
+		err = kernel_bind(port->sock, (struct sockaddr *)&addr,
+				  sizeof addr);
+		if (err)
+			return err;
+
+		port->bound = 1;
+		port->ifidx = addr.sin6_scope_id;
+	}
+	mcast.ipv6mr_ifindex = port->ifidx;
+	memcpy(mcast.ipv6mr_multiaddr.s6_addr, mgid, sizeof *mgid);
+	err = kernel_setsockopt(port->sock, SOL_IPV6,
+				IPV6_ADD_MEMBERSHIP, (char *)&mcast, sizeof mcast);
+
+	return err;
+}
+
+static int detach_socket(struct mcast_port *port, union ib_gid *mgid)
+{
+	struct ipv6_mreq mcast;
+
+	if (!port->sock)
+		return -EINVAL;
+
+	mcast.ipv6mr_ifindex = port->ifidx;
+	memcpy(mcast.ipv6mr_multiaddr.s6_addr, mgid, sizeof *mgid);
+
+	return kernel_setsockopt(port->sock, SOL_IPV6, IPV6_DROP_MEMBERSHIP,
+				 (char *)&mcast, sizeof mcast);
+}
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+				      union ib_gid *mgid)
+{
+	struct rb_node *node = port->table.rb_node;
+	struct mcast_group *group;
+	int ret;
+
+	while (node) {
+		group = rb_entry(node, struct mcast_group, node);
+		ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+		if (!ret)
+			return group;
+
+		if (ret < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+					struct mcast_group *group,
+					int allow_duplicates)
+{
+	struct rb_node **link = &port->table.rb_node;
+	struct rb_node *parent = NULL;
+	struct mcast_group *cur_group;
+	int ret;
+
+	while (*link) {
+		parent = *link;
+		cur_group = rb_entry(parent, struct mcast_group, node);
+
+		ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+			     sizeof group->rec.mgid);
+		if (ret < 0)
+			link = &(*link)->rb_left;
+		else if (ret > 0)
+			link = &(*link)->rb_right;
+		else if (allow_duplicates)
+			link = &(*link)->rb_left;
+		else
+			return cur_group;
+	}
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
+
+static void deref_port(struct mcast_port *port)
+{
+	if (atomic_dec_and_test(&port->refcount))
+		complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+	struct mcast_port *port = group->port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (atomic_dec_and_test(&group->refcount)) {
+		rb_erase(&group->node, &port->table);
+		spin_unlock_irqrestore(&port->lock, flags);
+		kfree(group);
+		deref_port(port);
+	} else
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+	if (atomic_dec_and_test(&member->refcount))
+		complete(&member->comp);
+}
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 leave_state = 0;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		if (!group->members[i])
+			leave_state |= (0x1 << i);
+
+	return leave_state & group->rec.join_state;
+}
+
+static void queue_join(struct mcast_member *member)
+{
+	struct mcast_group *group = member->group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&group->lock, flags);
+	list_add_tail(&member->list, &group->pending_list);
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		atomic_inc(&group->refcount);
+		queue_work(mcast_wq, &group->work);
+	}
+	spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non-member, and
+ * send-only member.  We need to keep track of the number of members of each
+ * type based on their join state.  Adjust the number of members that belong
+ * to the specified join states.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int i;
+
+	for (i = 0; i < 3; i++, join_state >>= 1)
+		if (join_state & 0x1)
+			group->members[i] += inc;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+			  ib_sa_comp_mask selector_mask,
+			  ib_sa_comp_mask value_mask,
+			  u8 selector, u8 src_value, u8 dst_value)
+{
+	int err;
+
+	if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+		return 0;
+
+	switch (selector) {
+	case IB_SA_GT:
+		err = (src_value <= dst_value);
+		break;
+	case IB_SA_LT:
+		err = (src_value >= dst_value);
+		break;
+	case IB_SA_EQ:
+		err = (src_value != dst_value);
+		break;
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+		   struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	/* MGID must already match */
+
+	if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+	    memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+		return -EINVAL;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+			   IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+			   src->mtu, dst->mtu))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+	    src->traffic_class != dst->traffic_class)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+		return -EINVAL;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+			   IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+			   src->rate, dst->rate))
+		return -EINVAL;
+	if (check_selector(comp_mask,
+			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+			   dst->packet_life_time_selector,
+			   src->packet_life_time, dst->packet_life_time))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+	    src->flow_label != dst->flow_label)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+	    src->hop_limit != dst->hop_limit)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+		return -EINVAL;
+
+	/* join_state checked separately, proxy_join ignored */
+
+	return 0;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+		       u8 join_state)
+{
+	member->state = MCAST_MEMBER;
+	adjust_membership(group, join_state, 1);
+	group->rec.join_state |= join_state;
+	member->multicast.rec = group->rec;
+	member->multicast.rec.join_state = join_state;
+	list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+		     int status)
+{
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+	return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+	struct mcast_member *member;
+	int ret = 0;
+	u16 pkey_index;
+
+	if (group->state == MCAST_PKEY_EVENT)
+		ret = ib_find_pkey(group->port->dev->device,
+				   group->port->port_num,
+				   be16_to_cpu(group->rec.pkey), &pkey_index);
+
+	spin_lock_irq(&group->lock);
+	if (group->state == MCAST_PKEY_EVENT && !ret &&
+	    group->pkey_index == pkey_index)
+		goto out;
+
+	while (!list_empty(&group->active_list)) {
+		member = list_entry(group->active_list.next,
+				    struct mcast_member, list);
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		adjust_membership(group, member->multicast.rec.join_state, -1);
+		member->state = MCAST_ERROR;
+		spin_unlock_irq(&group->lock);
+
+		ret = member->multicast.callback(-ENETRESET,
+						 &member->multicast);
+		deref_member(member);
+		if (ret)
+			rdmaoe_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	group->rec.join_state = 0;
+out:
+	group->state = MCAST_BUSY;
+	spin_unlock_irq(&group->lock);
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+	group->last_join = member;
+	member->multicast.rec.pkey = cpu_to_be16(0xffff);
+	join_handler(0, &member->multicast.rec, group);
+	return 0;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+	struct ib_sa_mcmember_rec rec;
+
+	rec = group->rec;
+	rec.join_state = leave_state;
+	leave_handler(0, &rec, group);
+	return 0;
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+	struct mcast_group *group;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int status, ret;
+	u8 join_state;
+
+	group = container_of(work, typeof(*group), work);
+retest:
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->pending_list) ||
+	       (group->state != MCAST_BUSY)) {
+
+		if (group->state != MCAST_BUSY) {
+			spin_unlock_irq(&group->lock);
+			process_group_error(group);
+			goto retest;
+		}
+
+		member = list_entry(group->pending_list.next,
+				    struct mcast_member, list);
+		multicast = &member->multicast;
+		join_state = multicast->rec.join_state;
+		atomic_inc(&member->refcount);
+
+		if (join_state == (group->rec.join_state & join_state)) {
+			status = cmp_rec(&group->rec, &multicast->rec,
+					 multicast->comp_mask);
+			if (!status)
+				join_group(group, member, join_state);
+			else
+				list_del_init(&member->list);
+			spin_unlock_irq(&group->lock);
+			ret = multicast->callback(status, multicast);
+		} else {
+			spin_unlock_irq(&group->lock);
+			status = send_join(group, member);
+			if (!status) {
+				deref_member(member);
+				return;
+			}
+			ret = fail_join(group, member, status);
+		}
+
+		deref_member(member);
+		if (ret)
+			rdmaoe_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	join_state = get_leave_state(group);
+	if (join_state) {
+		group->rec.join_state &= ~join_state;
+		spin_unlock_irq(&group->lock);
+		detach_socket(group->port, &group->rec.mgid);
+		if (send_leave(group, join_state))
+			goto retest;
+	} else {
+		group->state = MCAST_IDLE;
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context)
+{
+	struct mcast_group *group = context;
+	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+	ib_find_pkey(group->port->dev->device, group->port->port_num,
+		     be16_to_cpu(rec->pkey), &pkey_index);
+
+	spin_lock_irq(&group->port->lock);
+	group->rec = *rec;
+	if (group->state == MCAST_BUSY &&
+	    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+		group->pkey_index = pkey_index;
+	if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+		rb_erase(&group->node, &group->port->table);
+		mcast_insert(group->port, group, 1);
+	}
+	spin_unlock_irq(&group->port->lock);
+	mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context)
+{
+	struct mcast_group *group = context;
+
+	mcast_work_handler(&group->work);
+}
+
+static struct mcast_group *acquire_group(struct mcast_port *port,
+					 union ib_gid *mgid, gfp_t gfp_mask)
+{
+	struct mcast_group *group, *cur_group;
+	unsigned long flags;
+	int is_mgid0;
+	int need_attach = 0;
+
+	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+	if (!is_mgid0) {
+		spin_lock_irqsave(&port->lock, flags);
+		group = mcast_find(port, mgid);
+		if (group)
+			goto found;
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return NULL;
+
+	group->port = port;
+	group->rec.mgid = *mgid;
+	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->active_list);
+	INIT_WORK(&group->work, mcast_work_handler);
+	spin_lock_init(&group->lock);
+
+	spin_lock_irqsave(&port->lock, flags);
+	cur_group = mcast_insert(port, group, is_mgid0);
+	if (cur_group) {
+		kfree(group);
+		group = cur_group;
+	} else
+		atomic_inc(&port->refcount);
+
+	if (!is_mgid0)
+		need_attach = 1;
+
+found:
+	atomic_inc(&group->refcount);
+	spin_unlock_irqrestore(&port->lock, flags);
+	if (need_attach && attach_socket(port, mgid)) {
+		release_group(group);
+		group = NULL;
+	}
+
+	return group;
+}
+
+struct ib_sa_multicast *
+rdmaoe_sa_join_multicast(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 struct ib_sa_mcmember_rec *rec,
+			 ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+			 int (*callback)(int status,
+					 struct ib_sa_multicast *multicast),
+			 void *context)
+{
+	struct mcast_device *dev;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int ret;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kmalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->multicast.rec = *rec;
+	member->multicast.comp_mask = comp_mask;
+	member->multicast.callback = callback;
+	member->multicast.context = context;
+	init_completion(&member->comp);
+	atomic_set(&member->refcount, 1);
+	member->state = MCAST_JOINING;
+
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      &rec->mgid, gfp_mask);
+	if (!member->group) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * The user will get the multicast structure in their callback.  They
+	 * could then free the multicast structure before we can return from
+	 * this routine.  So we save the pointer to return before queuing
+	 * any callback.
+	 */
+	multicast = &member->multicast;
+	queue_join(member);
+	return multicast;
+
+err:
+	ib_sa_client_put(client);
+	kfree(member);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(rdmaoe_sa_join_multicast);
+
+void rdmaoe_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+	struct mcast_member *member;
+	struct mcast_group *group;
+
+	member = container_of(multicast, struct mcast_member, multicast);
+	group = member->group;
+	spin_lock_irq(&group->lock);
+	if (member->state == MCAST_MEMBER)
+		adjust_membership(group, multicast->rec.join_state, -1);
+
+	list_del_init(&member->list);
+
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* Continue to hold reference on group until callback */
+		queue_work(mcast_wq, &group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	deref_member(member);
+	wait_for_completion(&member->comp);
+	ib_sa_client_put(member->client);
+	kfree(member);
+}
+EXPORT_SYMBOL(rdmaoe_sa_free_multicast);
+
+struct eth_work_container {
+	struct work_struct work;
+	struct ib_sa_path_query *query;
+};
+
+static void resolve_callback(struct work_struct *work)
+{
+	struct eth_work_container *eth =
+		container_of(work, struct eth_work_container, work);
+	struct ib_sa_path_query *query = eth->query;
+	struct ib_sa_path_rec res = {};
+
+	res.dgid = query->dgid;
+	res.sgid = query->sgid;
+	res.hop_limit = 2; /* TBD fix this */
+	res.mtu = IB_MTU_1024; /* TBD fix me */
+	query->callback(0, &res, query->context);
+
+	ib_sa_client_put(query->sa_query.client);
+	kfree(query);
+	kfree(eth);
+}
+
+int rdmaoe_sa_path_rec_get(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   struct ib_sa_path_rec *rec,
+			   ib_sa_comp_mask comp_mask,
+			   int timeout_ms, gfp_t gfp_mask,
+			   void (*callback)(int status,
+					    struct ib_sa_path_rec *resp,
+					    void *context),
+			   void *context,
+			   struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_query *query;
+	struct eth_work_container *eth;
+
+	query = kzalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	eth = kzalloc(sizeof *eth, gfp_mask);
+	if (!eth) {
+		kfree(query);
+		return -ENOMEM;
+	}
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+	query->dgid            = rec->dgid;
+	query->sgid            = rec->sgid;
+
+	*sa_query = &query->sa_query;
+
+	eth->query = query;
+	INIT_WORK(&eth->work, resolve_callback);
+	schedule_work(&eth->work);
+
+	return 0;
+}
+EXPORT_SYMBOL(rdmaoe_sa_path_rec_get);
+
+static void rdmaoe_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	printk(KERN_NOTICE "%s: got event %d\n", __func__, event->event);
+}
+
+static void rdmaoe_sa_add_one(struct ib_device *device)
+{
+	struct rdmaoe_sa_device *sa_dev;
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return;
+
+	s = 1;
+	e = device->phys_port_cnt;
+
+	sa_dev = kmalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof sa_dev->port[0], GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i)
+		sa_dev->port[i].port_num = i + s;
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, rdmaoe_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler))
+		goto err;
+
+	return;
+
+err:
+	kfree(sa_dev);
+
+	return;
+}
+
+static void rdmaoe_sa_remove_one(struct ib_device *device)
+{
+	struct rdmaoe_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+
+	kfree(sa_dev);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+}
+
+static void rdmaoe_mcast_add_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return;
+
+	dev = kzalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	dev->start_port = 1;
+	dev->end_port = device->phys_port_cnt;
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		port = &dev->port[i];
+		port->dev = dev;
+		port->port_num = dev->start_port + i;
+		spin_lock_init(&port->lock);
+		port->table = RB_ROOT;
+		init_completion(&port->comp);
+		atomic_set(&port->refcount, 1);
+		start_igmp6(port);
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &mcast_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+static void rdmaoe_mcast_remove_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(mcast_wq);
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		port = &dev->port[i];
+		deref_port(port);
+		wait_for_completion(&port->comp);
+		stop_igmp6(port);
+	}
+
+	kfree(dev);
+}
+
+static int __init rdmaoe_sa_init(void)
+{
+	int err;
+
+	mcast_wq = create_singlethread_workqueue("rdmaoe_mcast");
+	if (!mcast_wq)
+		return -ENOMEM;
+
+	err = ib_register_client(&sa_client);
+	if (err) {
+		printk(KERN_ERR "Couldn't register rdmaoe_sa client\n");
+		goto reg_fail;
+	}
+
+	err = ib_register_client(&mcast_client);
+	if (err)
+		goto reg_mcast_fail;
+
+
+	return 0;
+
+reg_mcast_fail:
+	ib_unregister_client(&sa_client);
+reg_fail:
+	destroy_workqueue(mcast_wq);
+	return err;
+}
+
+static void __exit rdmaoe_sa_cleanup(void)
+{
+	ib_unregister_client(&mcast_client);
+	ib_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+}
+
+module_init(rdmaoe_sa_init);
+module_exit(rdmaoe_sa_cleanup);
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
index b1d4bbf..da780cf 100644
--- a/drivers/infiniband/core/sa.h
+++ b/drivers/infiniband/core/sa.h
@@ -37,6 +37,30 @@
 
 #include <rdma/ib_sa.h>
 
+struct ib_sa_query {
+	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	void (*release)(struct ib_sa_query *);
+	struct ib_sa_client    *client;
+	struct ib_sa_port      *port;
+	struct ib_mad_send_buf *mad_buf;
+	struct ib_sa_sm_ah     *sm_ah;
+	int			id;
+};
+
+struct ib_sa_service_query {
+	void (*callback)(int, struct ib_sa_service_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+	void (*callback)(int, struct ib_sa_path_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+	union ib_gid dgid;
+	union ib_gid sgid;
+};
+
 static inline void ib_sa_client_get(struct ib_sa_client *client)
 {
 	atomic_inc(&client->users);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 1865049..0625e10 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -46,6 +46,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
 #include "sa.h"
+#include "multicast.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
@@ -72,28 +73,6 @@ struct ib_sa_device {
 	struct ib_sa_port port[0];
 };
 
-struct ib_sa_query {
-	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
-	void (*release)(struct ib_sa_query *);
-	struct ib_sa_client    *client;
-	struct ib_sa_port      *port;
-	struct ib_mad_send_buf *mad_buf;
-	struct ib_sa_sm_ah     *sm_ah;
-	int			id;
-};
-
-struct ib_sa_service_query {
-	void (*callback)(int, struct ib_sa_service_rec *, void *);
-	void *context;
-	struct ib_sa_query sa_query;
-};
-
-struct ib_sa_path_query {
-	void (*callback)(int, struct ib_sa_path_rec *, void *);
-	void *context;
-	struct ib_sa_query sa_query;
-};
-
 struct ib_sa_mcmember_query {
 	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
 	void *context;
@@ -363,6 +342,9 @@ static void update_sm_ah(struct work_struct *work)
 	struct ib_port_attr port_attr;
 	struct ib_ah_attr   ah_attr;
 
+	if (ib_get_port_link_type(port->agent->device, port->port_num) != PORT_LINK_IB)
+		return;
+
 	if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
 		printk(KERN_WARNING "Couldn't query port\n");
 		return;
diff --git a/include/rdma/rdmaoe_sa.h b/include/rdma/rdmaoe_sa.h
new file mode 100644
index 0000000..2a93235
--- /dev/null
+++ b/include/rdma/rdmaoe_sa.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMAOE_SA_H
+#define RDMAOE_SA_H
+
+#include <rdma/ib_sa.h>
+
+struct rdmaoe_sa_client {
+	atomic_t users;
+	struct completion comp;
+};
+
+struct ib_sa_multicast *
+rdmaoe_sa_join_multicast(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 struct ib_sa_mcmember_rec *rec,
+			 ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+			 int (*callback)(int status,
+					 struct ib_sa_multicast *multicast),
+			 void *context);
+
+void rdmaoe_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+int rdmaoe_sa_path_rec_get(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   struct ib_sa_path_rec *rec,
+			   ib_sa_comp_mask comp_mask,
+			   int timeout_ms, gfp_t gfp_mask,
+			   void (*callback)(int status,
+					    struct ib_sa_path_rec *resp,
+					    void *context),
+			   void *context,
+			   struct ib_sa_query **sa_query);
+
+#endif /* RDMAOE_SA_H */
+
-- 
1.6.3.3
