[ofiwg] API for Libfabric Overrides

Ziemba, Ian ian.ziemba at hpe.com
Mon May 18 11:59:37 PDT 2020


I am following up from a previous OFIWG meeting in which I presented a proposal for allowing libfabric users to provide override functions for common libfabric provider operations. This stems from a requirement to override host/device memory movement calls a provider may use.

The following is a proposal based on the presentation in the OFIWG meeting. It consists of the user defining an override structure and assigning it to a fid through a new fid function call. The following is an example override structure a user would define.

/*
 * User-defined override table for provider operations.
 *
 * Each callback is optional: when the user does not set an override, the
 * provider is expected to fall back to its internal implementation.
 * Because the callbacks are generic across HMEM interfaces, an
 * implementation must handle every hmem_iface value the provider may
 * pass to it.
 */
struct fi_override_ops {
       /* NOTE(review): presumably sizeof(struct fi_override_ops), as with
        * the size member of other ops tables -- confirm. */
       size_t size;

       /*
        * Copy from an HMEM iov to a host destination buffer.
        *
        * dest/size describe the host buffer being written; hmem_iov,
        * hmem_iov_count and hmem_iov_offset describe the HMEM data being
        * read, and hmem_iface identifies which HMEM interface owns it.
        * NOTE(review): return value presumably bytes copied or a negative
        * error code -- confirm.
        */
       ssize_t (*copy_from_hmem_iov)(void *dest, size_t size,
                                     const struct iovec *hmem_iov,
                                     enum fi_hmem_iface hmem_iface,
                                     size_t hmem_iov_count,
                                     uint64_t hmem_iov_offset);

       /*
        * Copy from a host source buffer to an HMEM iov.
        *
        * src/size describe the host buffer being read; hmem_iov,
        * hmem_iov_count and hmem_iov_offset describe the HMEM region being
        * written, and hmem_iface identifies which HMEM interface owns it.
        * NOTE(review): return value presumably bytes copied or a negative
        * error code -- confirm.
        */
       ssize_t (*copy_to_hmem_iov)(const struct iovec *hmem_iov,
                                   enum fi_hmem_iface hmem_iface,
                                   size_t hmem_iov_count,
                                   uint64_t hmem_iov_offset, void *src,
                                   size_t size);
};

A couple of things to note about the structure:

  *   Through internal discussion, it was determined it would be beneficial to expose a generic HMEM IOV copy to/from override to the user instead of a per HMEM interface override. This places the requirement on the user to implement the copy routine for all HMEM interfaces a provider will encounter.
  *   Overrides are optional. If a user does not set an override, the provider should use its internal implementation.

The user would register the override structure to a fid using a new fi_set_ops function. The following is an example of this.

/* Identifies the kind of ops structure being registered on a fid. */
enum fi_set_ops {
       /* The ops argument carries the user's override structure. */
       FI_OVERRIDE_OPS = 0,
};

/*
 * Per-fid operations table, shown here with the proposed ops_set entry
 * appended as the final member.
 */
struct fi_ops {
       size_t size;

       int (*close)(struct fid *fid);
       int (*bind)(struct fid *fid, struct fid *bfid, uint64_t flags);
       int (*control)(struct fid *fid, int command, void *arg);
       int (*ops_open)(struct fid *fid, const char *name, uint64_t flags,
                       void **ops, void *context);
       int (*tostr)(const struct fid *fid, char *buf, size_t len);

       /* Proposed addition: the entry point fi_set_ops() dispatches to. */
       int (*ops_set)(struct fid *fid, enum fi_set_ops ops_type,
                      void *ops, uint64_t flags);
};

/*
 * Register an ops structure of kind ops_type on fid.
 *
 * fid and fid->ops are dereferenced unconditionally, so both must be valid.
 *
 * Returns -FI_ENOSYS when the fid's ops table has no ops_set entry;
 * otherwise returns whatever the provider's ops_set callback returns.
 */
static inline int
fi_set_ops(struct fid *fid, enum fi_set_ops ops_type, void *ops, uint64_t flags)
{
       /* A missing callback means the provider has no override support. */
       if (!fid->ops->ops_set)
              return -FI_ENOSYS;

       return fid->ops->ops_set(fid, ops_type, ops, flags);
}

The main issue with this approach is how to provide meaningful feedback to the user if a provider does not support a requested override. I do not think the approach presented above can accomplish this, because a single fi_set_ops call registers the whole structure and can only return one status for all of the overrides in it. Instead, I think minor adjustments could be made to provide users this feedback. Here is a second approach to address overrides.

All override operations are defined in a union instead of a structure.

/*
 * A single override callback. Exactly one member is meaningful per
 * fi_set_op() call; which one is selected by the accompanying
 * enum fi_set_op value.
 */
union fi_override_op {

       /*
        * Copy from an HMEM iov to a host destination buffer.
        *
        * dest/size describe the host buffer being written; hmem_iov,
        * hmem_iov_count and hmem_iov_offset describe the HMEM data being
        * read, and hmem_iface identifies which HMEM interface owns it.
        * NOTE(review): return value presumably bytes copied or a negative
        * error code -- confirm.
        */
       ssize_t (*copy_from_hmem_iov)(void *dest, size_t size,
                                     const struct iovec *hmem_iov,
                                     enum fi_hmem_iface hmem_iface,
                                     size_t hmem_iov_count,
                                     uint64_t hmem_iov_offset);

       /*
        * Copy from a host source buffer to an HMEM iov.
        *
        * src/size describe the host buffer being read; hmem_iov,
        * hmem_iov_count and hmem_iov_offset describe the HMEM region being
        * written, and hmem_iface identifies which HMEM interface owns it.
        * NOTE(review): return value presumably bytes copied or a negative
        * error code -- confirm.
        */
       ssize_t (*copy_to_hmem_iov)(const struct iovec *hmem_iov,
                                   enum fi_hmem_iface hmem_iface,
                                   size_t hmem_iov_count,
                                   uint64_t hmem_iov_offset, void *src,
                                   size_t size);
};

Instead of a single enum value identifying the whole override structure, an enum value would be defined for each override.

/* Identifies which individual operation a fi_set_op() call overrides. */
enum fi_set_op {
       /* Selects the copy_from_hmem_iov member of union fi_override_op. */
       FI_OVERRIDE_COPY_FROM_HMEM_IOV = 0,
       /* Selects the copy_to_hmem_iov member of union fi_override_op. */
       FI_OVERRIDE_COPY_TO_HMEM_IOV = 1,
};

/*
 * Per-fid operations table, shown here with the proposed op_set entry
 * (one override per call) appended as the final member.
 */
struct fi_ops {
       size_t size;

       int (*close)(struct fid *fid);
       int (*bind)(struct fid *fid, struct fid *bfid, uint64_t flags);
       int (*control)(struct fid *fid, int command, void *arg);
       int (*ops_open)(struct fid *fid, const char *name, uint64_t flags,
                       void **ops, void *context);
       int (*tostr)(const struct fid *fid, char *buf, size_t len);

       /* Proposed addition: the entry point fi_set_op() dispatches to. */
       int (*op_set)(struct fid *fid, enum fi_set_op op_type,
                     union fi_override_op *op, uint64_t flags);
};

/*
 * Register a single override, selected by op_type, on fid.
 *
 * fid and fid->ops are dereferenced unconditionally, so both must be valid.
 *
 * Returns -FI_ENOSYS when the fid's ops table has no op_set entry;
 * otherwise returns whatever the provider's op_set callback returns, which
 * lets the provider report support for each override individually.
 */
static inline int
fi_set_op(struct fid *fid, enum fi_set_op op_type, union fi_override_op *op,
          uint64_t flags)
{
       /* A missing callback means the provider has no override support. */
       if (!fid->ops->op_set)
              return -FI_ENOSYS;

       return fid->ops->op_set(fid, op_type, op, flags);
}

A user would have to call fi_set_op once for each operation they wish to override. This would allow a provider to report support for each override individually. If a provider does not support a given override, or overrides in general, -FI_ENOSYS would be returned. Users should not treat -FI_ENOSYS as a fatal error.

Thoughts?

Thanks,

Ian
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofiwg/attachments/20200518/845d8727/attachment-0001.htm>


More information about the ofiwg mailing list