[openib-general] [PATCH 1/7 v2] for 2.6.20 ib/ib_sa: add tracking of multicast join / leave requests
Michael S. Tsirkin
mst at mellanox.co.il
Mon Nov 6 02:35:44 PST 2006
Quoting Sean Hefty <sean.hefty at intel.com>:
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 3faa182..d90f804 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -60,14 +60,11 @@ static DEFINE_MUTEX(mcast_mutex);
> /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
> struct ipoib_mcast {
> struct ib_sa_mcmember_rec mcmember;
> + struct ib_sa_multicast *mc;
> struct ipoib_ah *ah;
>
> struct rb_node rb_node;
> struct list_head list;
> - struct completion done;
> -
> - int query_id;
> - struct ib_sa_query *query;
>
> unsigned long created;
> unsigned long backoff;
> @@ -299,18 +296,22 @@ static int ipoib_mcast_join_finish(struc
> return 0;
> }
>
> -static void
> +static int
> ipoib_mcast_sendonly_join_complete(int status,
> - struct ib_sa_mcmember_rec *mcmember,
> - void *mcast_ptr)
> + struct ib_sa_multicast *multicast)
> {
> - struct ipoib_mcast *mcast = mcast_ptr;
> + struct ipoib_mcast *mcast = multicast->context;
> struct net_device *dev = mcast->dev;
> struct ipoib_dev_priv *priv = netdev_priv(dev);
>
> + /* We trap for port events ourselves. */
> + if (status == -ENETRESET)
> + return 0;
> +
> if (!status)
> - ipoib_mcast_join_finish(mcast, mcmember);
> - else {
> + status = ipoib_mcast_join_finish(mcast, &multicast->rec);
> +
> + if (status) {
> if (mcast->logcount++ < 20)
> ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
> IPOIB_GID_FMT ", status %d\n",
> @@ -325,11 +326,10 @@ ipoib_mcast_sendonly_join_complete(int s
> spin_unlock_irq(&priv->tx_lock);
>
> /* Clear the busy flag so we try again */
> - clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
> - mcast->query = NULL;
> + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
> + &mcast->flags);
> }
> -
> - complete(&mcast->done);
> + return status;
> }
>
> static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
> @@ -359,35 +359,33 @@ #endif
> rec.port_gid = priv->local_gid;
> rec.pkey = cpu_to_be16(priv->pkey);
>
> - init_completion(&mcast->done);
> -
> - ret = ib_sa_mcmember_rec_set(&ipoib_sa_client, priv->ca, priv->port, &rec,
> - IB_SA_MCMEMBER_REC_MGID |
> - IB_SA_MCMEMBER_REC_PORT_GID |
> - IB_SA_MCMEMBER_REC_PKEY |
> - IB_SA_MCMEMBER_REC_JOIN_STATE,
> - 1000, GFP_ATOMIC,
> - ipoib_mcast_sendonly_join_complete,
> - mcast, &mcast->query);
> - if (ret < 0) {
> - ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n",
> + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
> + priv->port, &rec,
> + IB_SA_MCMEMBER_REC_MGID |
> + IB_SA_MCMEMBER_REC_PORT_GID |
> + IB_SA_MCMEMBER_REC_PKEY |
> + IB_SA_MCMEMBER_REC_JOIN_STATE,
> + GFP_ATOMIC,
> + ipoib_mcast_sendonly_join_complete,
> + mcast);
> + if (IS_ERR(mcast->mc)) {
> + ret = PTR_ERR(mcast->mc);
> + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
> + ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
> ret);
> } else {
> ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT
> ", starting join\n",
> IPOIB_GID_ARG(mcast->mcmember.mgid));
> -
> - mcast->query_id = ret;
> }
>
> return ret;
> }
>
> -static void ipoib_mcast_join_complete(int status,
> - struct ib_sa_mcmember_rec *mcmember,
> - void *mcast_ptr)
> +static int ipoib_mcast_join_complete(int status,
> + struct ib_sa_multicast *multicast)
> {
> - struct ipoib_mcast *mcast = mcast_ptr;
> + struct ipoib_mcast *mcast = multicast->context;
> struct net_device *dev = mcast->dev;
> struct ipoib_dev_priv *priv = netdev_priv(dev);
>
> @@ -395,23 +393,24 @@ static void ipoib_mcast_join_complete(in
> " (status %d)\n",
> IPOIB_GID_ARG(mcast->mcmember.mgid), status);
>
> - if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) {
> + /* We trap for port events ourselves. */
> + if (status == -ENETRESET)
> + return 0;
> +
> + if (!status)
> + status = ipoib_mcast_join_finish(mcast, &multicast->rec);
> +
> + if (!status) {
> mcast->backoff = 1;
> mutex_lock(&mcast_mutex);
> if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
> queue_work(ipoib_workqueue, &priv->mcast_task);
> mutex_unlock(&mcast_mutex);
> - complete(&mcast->done);
> - return;
> - }
> -
> - if (status == -EINTR) {
> - complete(&mcast->done);
> - return;
> + return 0;
> }
>
> - if (status && mcast->logcount++ < 20) {
> - if (status == -ETIMEDOUT || status == -EINTR) {
> + if (mcast->logcount++ < 20) {
> + if (status == -ETIMEDOUT) {
> ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
> ", status %d\n",
> IPOIB_GID_ARG(mcast->mcmember.mgid),
> @@ -428,23 +427,18 @@ static void ipoib_mcast_join_complete(in
> if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
> mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
>
> - mutex_lock(&mcast_mutex);
> + /* Clear the busy flag so we try again */
> + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
>
> + mutex_lock(&mcast_mutex);
> spin_lock_irq(&priv->lock);
> - mcast->query = NULL;
> -
> - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) {
> - if (status == -ETIMEDOUT)
> - queue_work(ipoib_workqueue, &priv->mcast_task);
> - else
> - queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
> - mcast->backoff * HZ);
> - } else
> - complete(&mcast->done);
> + if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
> + queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
> + mcast->backoff * HZ);
> spin_unlock_irq(&priv->lock);
> mutex_unlock(&mcast_mutex);
>
> - return;
> + return status;
> }
>
> static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
> @@ -493,15 +487,14 @@ static void ipoib_mcast_join(struct net_
> rec.hop_limit = priv->broadcast->mcmember.hop_limit;
> }
>
> - init_completion(&mcast->done);
> -
> - ret = ib_sa_mcmember_rec_set(&ipoib_sa_client, priv->ca, priv->port,
> - &rec, comp_mask, mcast->backoff * 1000,
> - GFP_ATOMIC, ipoib_mcast_join_complete,
> - mcast, &mcast->query);
> -
> - if (ret < 0) {
> - ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret);
> + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
> + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
> + &rec, comp_mask, GFP_KERNEL,
> + ipoib_mcast_join_complete, mcast);
> + if (IS_ERR(mcast->mc)) {
> + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
> + ret = PTR_ERR(mcast->mc);
> + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
>
> mcast->backoff *= 2;
> if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
> @@ -513,8 +506,7 @@ static void ipoib_mcast_join(struct net_
> &priv->mcast_task,
> mcast->backoff * HZ);
> mutex_unlock(&mcast_mutex);
> - } else
> - mcast->query_id = ret;
> + }
> }
>
> void ipoib_mcast_join_task(void *dev_ptr)
> @@ -538,7 +530,7 @@ void ipoib_mcast_join_task(void *dev_ptr
> priv->local_rate = attr.active_speed *
> ib_width_enum_to_int(attr.active_width);
> } else
> - ipoib_warn(priv, "ib_query_port failed\n");
> + ipoib_warn(priv, "ib_query_port failed\n");
> }
>
> if (!priv->broadcast) {
> @@ -565,7 +557,8 @@ void ipoib_mcast_join_task(void *dev_ptr
> }
>
> if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
> - ipoib_mcast_join(dev, priv->broadcast, 0);
> + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
> + ipoib_mcast_join(dev, priv->broadcast, 0);
> return;
> }
>
> @@ -620,26 +613,9 @@ int ipoib_mcast_start_thread(struct net_
> return 0;
> }
>
> -static void wait_for_mcast_join(struct ipoib_dev_priv *priv,
> - struct ipoib_mcast *mcast)
> -{
> - spin_lock_irq(&priv->lock);
> - if (mcast && mcast->query) {
> - ib_sa_cancel_query(mcast->query_id, mcast->query);
> - mcast->query = NULL;
> - spin_unlock_irq(&priv->lock);
> - ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
> - IPOIB_GID_ARG(mcast->mcmember.mgid));
> - wait_for_completion(&mcast->done);
> - }
> - else
> - spin_unlock_irq(&priv->lock);
> -}
> -
> int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
> {
> struct ipoib_dev_priv *priv = netdev_priv(dev);
> - struct ipoib_mcast *mcast;
>
> ipoib_dbg_mcast(priv, "stopping multicast thread\n");
>
> @@ -655,52 +631,27 @@ int ipoib_mcast_stop_thread(struct net_d
> if (flush)
> flush_workqueue(ipoib_workqueue);
>
> - wait_for_mcast_join(priv, priv->broadcast);
> -
> - list_for_each_entry(mcast, &priv->multicast_list, list)
> - wait_for_mcast_join(priv, mcast);
> -
> return 0;
> }
>
> static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
> {
> struct ipoib_dev_priv *priv = netdev_priv(dev);
> - struct ib_sa_mcmember_rec rec = {
> - .join_state = 1
> - };
> int ret = 0;
>
> - if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags))
> - return 0;
> -
> - ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
> - IPOIB_GID_ARG(mcast->mcmember.mgid));
> -
> - rec.mgid = mcast->mcmember.mgid;
> - rec.port_gid = priv->local_gid;
> - rec.pkey = cpu_to_be16(priv->pkey);
> + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
> + ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
> + IPOIB_GID_ARG(mcast->mcmember.mgid));
>
> - /* Remove ourselves from the multicast group */
> - ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
> - &mcast->mcmember.mgid);
> - if (ret)
> - ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
> + /* Remove ourselves from the multicast group */
> + ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
> + &mcast->mcmember.mgid);
> + if (ret)
> + ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
> + }
>
> - /*
> - * Just make one shot at leaving and don't wait for a reply;
> - * if we fail, too bad.
> - */
> - ret = ib_sa_mcmember_rec_delete(&ipoib_sa_client, priv->ca, priv->port, &rec,
> - IB_SA_MCMEMBER_REC_MGID |
> - IB_SA_MCMEMBER_REC_PORT_GID |
> - IB_SA_MCMEMBER_REC_PKEY |
> - IB_SA_MCMEMBER_REC_JOIN_STATE,
> - 0, GFP_ATOMIC, NULL,
> - mcast, &mcast->query);
> - if (ret < 0)
> - ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed "
> - "for leave (result = %d)\n", ret);
> + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
> + ib_sa_free_multicast(mcast->mc);
>
> return 0;
> }
> @@ -753,7 +704,7 @@ void ipoib_mcast_send(struct net_device
> dev_kfree_skb_any(skb);
> }
>
> - if (mcast->query)
> + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
> ipoib_dbg_mcast(priv, "no address vector, "
> "but multicast join already started\n");
> else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
> @@ -910,7 +861,6 @@ void ipoib_mcast_restart_task(void *dev_
>
> /* We have to cancel outside of the spinlock */
> list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
> - wait_for_mcast_join(priv, mcast);
> ipoib_mcast_leave(mcast->dev, mcast);
> ipoib_mcast_free(mcast);
> }
> diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
> index 97715b0..3b957e5 100644
> --- a/include/rdma/ib_sa.h
> +++ b/include/rdma/ib_sa.h
> @@ -285,18 +285,6 @@ int ib_sa_path_rec_get(struct ib_sa_clie
> void *context,
> struct ib_sa_query **query);
>
> -int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
> - struct ib_device *device, u8 port_num,
> - u8 method,
> - struct ib_sa_mcmember_rec *rec,
> - ib_sa_comp_mask comp_mask,
> - int timeout_ms, gfp_t gfp_mask,
> - void (*callback)(int status,
> - struct ib_sa_mcmember_rec *resp,
> - void *context),
> - void *context,
> - struct ib_sa_query **query);
> -
> int ib_sa_service_rec_query(struct ib_sa_client *client,
> struct ib_device *device, u8 port_num,
> u8 method,
> @@ -309,93 +297,87 @@ int ib_sa_service_rec_query(struct ib_sa
> void *context,
> struct ib_sa_query **sa_query);
>
> +struct ib_sa_multicast {
> + struct ib_sa_mcmember_rec rec;
> + ib_sa_comp_mask comp_mask;
> + int (*callback)(int status,
> + struct ib_sa_multicast *multicast);
> + void *context;
> +};
> +
> /**
> - * ib_sa_mcmember_rec_set - Start an MCMember set query
> - * @client:SA client
> - * @device:device to send query on
> - * @port_num: port number to send query on
> - * @rec:MCMember Record to send in query
> - * @comp_mask:component mask to send in query
> - * @timeout_ms:time to wait for response
> - * @gfp_mask:GFP mask to use for internal allocations
> - * @callback:function called when query completes, times out or is
> - * canceled
> - * @context:opaque user context passed to callback
> - * @sa_query:query context, used to cancel query
> + * ib_sa_join_multicast - Initiates a join request to the specified multicast
> + * group.
> + * @client: SA client
> + * @device: Device associated with the multicast group.
> + * @port_num: Port on the specified device to associate with the multicast
> + * group.
> + * @rec: SA multicast member record specifying group attributes.
> + * @comp_mask: Component mask indicating which group attributes of %rec are
> + * valid.
> + * @gfp_mask: GFP mask for memory allocations.
> + * @callback: User callback invoked once the join operation completes.
> + * @context: User specified context stored with the ib_sa_multicast structure.
> *
> - * Send an MCMember Set query to the SA (eg to join a multicast
> - * group). The callback function will be called when the query
> - * completes (or fails); status is 0 for a successful response, -EINTR
> - * if the query is canceled, -ETIMEDOUT is the query timed out, or
> - * -EIO if an error occurred sending the query. The resp parameter of
> - * the callback is only valid if status is 0.
> + * This call initiates a multicast join request with the SA for the specified
> + * multicast group. If the join operation is started successfully, it returns
> + * an ib_sa_multicast structure that is used to track the multicast operation.
> + * Users must free this structure by calling ib_free_multicast, even if the
> + * join operation later fails. (The callback status is non-zero.)
> *
> - * If the return value of ib_sa_mcmember_rec_set() is negative, it is
> - * an error code. Otherwise it is a query ID that can be used to
> - * cancel the query.
> + * If the join operation fails; status will be non-zero, with the following
> + * failures possible:
> + * -ETIMEDOUT: The request timed out.
> + * -EIO: An error occurred sending the query.
> + * -EINVAL: The MCMemberRecord values differed from the existing group's.
> + * -ENETRESET: Indicates that an fatal error has occurred on the multicast
> + * group, and the user must rejoin the group to continue using it.
> */
> -static inline int
> -ib_sa_mcmember_rec_set(struct ib_sa_client *client,
> - struct ib_device *device, u8 port_num,
> - struct ib_sa_mcmember_rec *rec,
> - ib_sa_comp_mask comp_mask,
> - int timeout_ms, gfp_t gfp_mask,
> - void (*callback)(int status,
> - struct ib_sa_mcmember_rec *resp,
> - void *context),
> - void *context,
> - struct ib_sa_query **query)
> -{
> - return ib_sa_mcmember_rec_query(client, device, port_num,
> - IB_MGMT_METHOD_SET,
> - rec, comp_mask,
> - timeout_ms, gfp_mask, callback,
> - context, query);
> -}
> +struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
> + struct ib_device *device, u8 port_num,
> + struct ib_sa_mcmember_rec *rec,
> + ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
> + int (*callback)(int status,
> + struct ib_sa_multicast
> + *multicast),
> + void *context);
>
> /**
> - * ib_sa_mcmember_rec_delete - Start an MCMember delete query
> - * @client:SA client
> - * @device:device to send query on
> - * @port_num: port number to send query on
> - * @rec:MCMember Record to send in query
> - * @comp_mask:component mask to send in query
> - * @timeout_ms:time to wait for response
> - * @gfp_mask:GFP mask to use for internal allocations
> - * @callback:function called when query completes, times out or is
> - * canceled
> - * @context:opaque user context passed to callback
> - * @sa_query:query context, used to cancel query
> + * ib_free_multicast - Frees the multicast tracking structure, and releases
> + * any reference on the multicast group.
> + * @multicast: Multicast tracking structure allocated by ib_join_multicast.
> *
> - * Send an MCMember Delete query to the SA (eg to leave a multicast
> - * group). The callback function will be called when the query
> - * completes (or fails); status is 0 for a successful response, -EINTR
> - * if the query is canceled, -ETIMEDOUT is the query timed out, or
> - * -EIO if an error occurred sending the query. The resp parameter of
> - * the callback is only valid if status is 0.
> + * This call blocks until the multicast identifier is destroyed. It may
> + * not be called from within the multicast callback; however, returning a non-
> + * zero value from the callback will result in destroying the multicast
> + * tracking structure.
> + */
> +void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
> +
> +/**
> + * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and
> + * returns it if found.
> + * @device: Device associated with the multicast group.
> + * @port_num: Port on the specified device to associate with the multicast
> + * group.
> + * @mgid: optional MGID of multicast group.
> + * @rec: Location to copy SA multicast member record.
> *
> - * If the return value of ib_sa_mcmember_rec_delete() is negative, it
> - * is an error code. Otherwise it is a query ID that can be used to
> - * cancel the query.
> + * If an MGID is specified, returns an existing multicast member record if
> + * one is found for the local port. If no MGID is specified, or the specified
> + * MGID is 0, returns a multicast member record filled in with default values
> + * that may be used to create a new multicast group.
> */
> -static inline int
> -ib_sa_mcmember_rec_delete(struct ib_sa_client *client,
> - struct ib_device *device, u8 port_num,
> - struct ib_sa_mcmember_rec *rec,
> - ib_sa_comp_mask comp_mask,
> - int timeout_ms, gfp_t gfp_mask,
> - void (*callback)(int status,
> - struct ib_sa_mcmember_rec *resp,
> - void *context),
> - void *context,
> - struct ib_sa_query **query)
> -{
> - return ib_sa_mcmember_rec_query(client, device, port_num,
> - IB_SA_METHOD_DELETE,
> - rec, comp_mask,
> - timeout_ms, gfp_mask, callback,
> - context, query);
> -}
> +int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
> + union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
> +
> +/**
> + * ib_init_ah_from_mcmember - Initialize address handle attributes based on
> + * an SA multicast member record.
> + */
> +int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
> + struct ib_sa_mcmember_rec *rec,
> + struct ib_ah_attr *ah_attr);
>
> /**
> * ib_init_ah_from_path - Initialize address handle attributes based on an SA
>
OK, I went over this again. Some questions:
So instead of ib_sa_mcmember_rec_set which returned the query by pointer we now
have ib_sa_join_multicast which returns the pointer. This part looks OK I guess, but
I still do not understand why does the patch tinker with logic (e.g.
setting/clearing IPOIB_MCAST_FLAG_BUSY) in the IPoIB code.
*All* the new API was supposed to do is add reference counting on top of
join/leave queries. So why the need to rework the logic? Is it possible that
what is missing is the analog of ib_sa_cancel_query - a non-blocking call which
would guarantee that join callback is invoked soon? If yes, I think we should
just add that to the new API.
Sean, it seems like you are trying to push some unrelated re-factoring in the IPoIB
code, which might be fine, but should be done separately from the update to
the new API - could be before, or after the multicast change.
--
MST
More information about the general
mailing list