[openib-general] [PATCH] FMR support in mthca

Michael S. Tsirkin mst at mellanox.co.il
Sun Mar 27 07:31:13 PST 2005


OK, here's an updated version of the patch. This passed basic
tests: allocate/free, map/remap/unmap.
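
For reference, here is roughly the consumer-side sequence those tests
exercise, written against the ib_verbs FMR calls this patch hooks up
(a sketch only: the pd, the page array and the access flags are
placeholders, and error handling is abbreviated).

/* Sketch of an FMR consumer.  Assumes the ib_verbs FMR API
 * (ib_alloc_fmr/ib_map_phys_fmr/ib_unmap_fmr/ib_dealloc_fmr) and that
 * attr.page_size is a log2 page size, as in this patch.  "pages" must
 * hold page-aligned DMA addresses. */
static int fmr_smoke_test(struct ib_pd *pd, u64 *pages, int npages,
			  u64 iova)
{
	struct ib_fmr_attr attr = {
		.max_pages = npages,
		.max_maps  = 32,
		.page_size = PAGE_SHIFT,	/* usually 12, i.e. 4K pages */
	};
	LIST_HEAD(fmr_list);
	struct ib_fmr *fmr;
	int err;

	fmr = ib_alloc_fmr(pd, IB_ACCESS_LOCAL_WRITE |
			   IB_ACCESS_REMOTE_WRITE, &attr);
	if (IS_ERR(fmr))
		return PTR_ERR(fmr);

	/* map, then remap without an intervening unmap */
	err = ib_map_phys_fmr(fmr, pages, npages, iova);
	if (!err)
		err = ib_map_phys_fmr(fmr, pages, npages, iova);

	/* unmap is batched: every FMR on the list is invalidated at once
	 * (a single SYNC_TPT in the mthca case) */
	list_add_tail(&fmr->list, &fmr_list);
	ib_unmap_fmr(&fmr_list);

	ib_dealloc_fmr(fmr);
	return err;
}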

For Tavor, MTTs for FMR are separate from regular MTTs, and are reserved
at driver initialization. This is done to limit the amount of
virtual memory needed to map the MTTs.
For Arbel, there's no such limitation, and all MTTs and MPTs may be used
either for FMRs or for regular MRs.
It would be easy to remove this limitation for Tavor on 64-bit systems, where
it's feasible to ioremap the whole MTT table. Let me know if this is
of interest.
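
To put rough numbers on the virtual-memory cost (assuming 64-byte MTT
segments, MTHCA_MTT_SEG_SIZE = 0x40, and 64-byte MPT entries, as in the
current driver): with the default profile of 1 << 20 MTT segments,
ioremapping the whole MTT table on Tavor would take 64 MB of kernel
virtual address space; with the default fmr_reserved_mtts of 1 << 18,
only 16 MB of MTT space (plus 16 MB of MPT entries) is mapped for FMRs.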

Please comment.

MST

Add FMR support to mthca. Both Tavor and Arbel native are supported.
For Tavor, FMR support is disabled if DDR is hidden.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: hw/mthca/mthca_dev.h
===================================================================
--- hw/mthca/mthca_dev.h	(revision 2050)
+++ hw/mthca/mthca_dev.h	(working copy)
@@ -61,7 +61,8 @@ enum {
 	MTHCA_FLAG_SRQ        = 1 << 2,
 	MTHCA_FLAG_MSI        = 1 << 3,
 	MTHCA_FLAG_MSI_X      = 1 << 4,
-	MTHCA_FLAG_NO_LAM     = 1 << 5
+	MTHCA_FLAG_NO_LAM     = 1 << 5,
+	MTHCA_FLAG_FMR        = 1 << 6
 };
 
 enum {
@@ -134,6 +135,7 @@ struct mthca_limits {
 	int      reserved_eqs;
 	int      num_mpts;
 	int      num_mtt_segs;
+	int      fmr_reserved_mtts;
 	int      reserved_mtts;
 	int      reserved_mrws;
 	int      reserved_uars;
@@ -170,13 +172,25 @@ struct mthca_pd_table {
 	struct mthca_alloc alloc;
 };
 
+struct mthca_buddy {
+	unsigned long **bits;
+	int max_order;
+	spinlock_t lock;
+};
+
 struct mthca_mr_table {
 	struct mthca_alloc      mpt_alloc;
-	int                     max_mtt_order;
-	unsigned long         **mtt_buddy;
+	struct mthca_buddy      mtt_buddy;
+	struct mthca_buddy     *fmr_mtt_buddy;
 	u64                     mtt_base;
+	u64                     mpt_base;
 	struct mthca_icm_table *mtt_table;
 	struct mthca_icm_table *mpt_table;
+	struct {
+		void __iomem   *mpt_base;
+		void __iomem   *mtt_base;
+		struct mthca_buddy mtt_buddy;
+	} tavor_fmr;
 };
 
 struct mthca_eq_table {
@@ -375,7 +389,20 @@ int mthca_mr_alloc_phys(struct mthca_dev
 			u64 *buffer_list, int buffer_size_shift,
 			int list_len, u64 iova, u64 total_size,
 			u32 access, struct mthca_mr *mr);
-void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+			u32 access, struct mthca_fmr *fmr);
+
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+				    int list_len, u64 iova);
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+				    int list_len, u64 iova);
+
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr);
 
 int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
 void mthca_unmap_eq_icm(struct mthca_dev *dev);
Index: hw/mthca/mthca_main.c
===================================================================
--- hw/mthca/mthca_main.c	(revision 2050)
+++ hw/mthca/mthca_main.c	(working copy)
@@ -81,6 +81,7 @@ static struct mthca_profile default_prof
 	.num_mtt    = 1 << 20,
 	.num_udav   = 1 << 15,	/* Tavor only */
 	.uarc_size  = 1 << 18,	/* Arbel only */
+	.fmr_reserved_mtts = 1 << 18, /* Tavor only */
 };
 
 static int __devinit mthca_tune_pci(struct mthca_dev *mdev)
Index: hw/mthca/mthca_memfree.h
===================================================================
--- hw/mthca/mthca_memfree.h	(revision 2050)
+++ hw/mthca/mthca_memfree.h	(working copy)
@@ -90,6 +90,9 @@ int mthca_table_get_range(struct mthca_d
 void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 			   int start, int end);
 
+void *mthca_table_find(struct mthca_dev *dev, struct mthca_icm_table *table,
+		       int obj);
+
 static inline void mthca_icm_first(struct mthca_icm *icm,
 				   struct mthca_icm_iter *iter)
 {
Index: hw/mthca/mthca_provider.c
===================================================================
--- hw/mthca/mthca_provider.c	(revision 2050)
+++ hw/mthca/mthca_provider.c	(working copy)
@@ -574,6 +574,75 @@ static int mthca_dereg_mr(struct ib_mr *
 	return 0;
 }
 
+static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			   struct ib_fmr_attr *fmr_attr)
+{
+	struct mthca_fmr *fmr;
+	int err;
+	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+	if (!fmr)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
+	err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
+			     convert_access(mr_access_flags), fmr);
+
+	if (err) {
+		kfree(fmr);
+		return ERR_PTR(err);
+	}
+	return &fmr->ibmr;
+}
+
+static int mthca_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct mthca_fmr *mfmr = to_mfmr(fmr);
+	int err;
+
+	err = mthca_free_fmr(to_mdev(fmr->device), mfmr);
+	if (err)
+		return err;
+
+	kfree(mfmr);
+	return 0;
+}
+
+static int mthca_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *fmr;
+	int err;
+	u8 status;
+	struct mthca_dev *mdev = NULL;
+
+	list_for_each_entry(fmr, fmr_list, list) {
+		mdev = to_mdev(fmr->device);
+		break;
+	}
+
+	if (!mdev)
+		return 0;
+
+	if (mdev->hca_type == ARBEL_NATIVE) {
+		list_for_each_entry(fmr, fmr_list, list) {
+			BUG_ON(fmr->device != &mdev->ib_dev);
+			mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr));
+		}
+
+		wmb();
+	} else
+		list_for_each_entry(fmr, fmr_list, list) {
+			BUG_ON(fmr->device != &mdev->ib_dev);
+			mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr));
+		}
+
+	err = mthca_SYNC_TPT(mdev, &status);
+	if (err)
+		return err;
+	if (status)
+		return -EINVAL;
+	return 0;
+}
+
 static ssize_t show_rev(struct class_device *cdev, char *buf)
 {
 	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
@@ -637,6 +706,17 @@ int mthca_register_device(struct mthca_d
 	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
 	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
 	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
+
+	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
+		dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
+		dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
+		dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
+		if (dev->hca_type == ARBEL_NATIVE)
+			dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+		else
+			dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+	}
+
 	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
 	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
 	dev->ib_dev.process_mad          = mthca_process_mad;
Index: hw/mthca/mthca_provider.h
===================================================================
--- hw/mthca/mthca_provider.h	(revision 2050)
+++ hw/mthca/mthca_provider.h	(working copy)
@@ -60,6 +60,24 @@ struct mthca_mr {
 	u32 first_seg;
 };
 
+struct mthca_fmr {
+	struct ib_fmr ibmr;
+	struct ib_fmr_attr attr;
+	int order;
+	u32 first_seg;
+	int maps;
+	union {
+		struct {
+			struct mthca_mpt_entry __iomem *mpt;
+			u64 __iomem *mtts;
+		} tavor;
+		struct {
+			struct mthca_mpt_entry *mpt;
+			__be64 *mtts;
+		} arbel;
+	} mem;
+};
+
 struct mthca_pd {
 	struct ib_pd    ibpd;
 	u32             pd_num;
@@ -218,6 +236,11 @@ struct mthca_sqp {
 	dma_addr_t      header_dma;
 };
 
+static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
+{
+	return container_of(ibmr, struct mthca_fmr, ibmr);
+}
+
 static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
 {
 	return container_of(ibmr, struct mthca_mr, ibmr);
Index: hw/mthca/mthca_profile.c
===================================================================
--- hw/mthca/mthca_profile.c	(revision 2050)
+++ hw/mthca/mthca_profile.c	(working copy)
@@ -223,9 +223,10 @@ u64 mthca_make_profile(struct mthca_dev 
 			init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
 			break;
 		case MTHCA_RES_MPT:
-			dev->limits.num_mpts = profile[i].num;
-			init_hca->mpt_base   = profile[i].start;
-			init_hca->log_mpt_sz = profile[i].log_num;
+			dev->limits.num_mpts   = profile[i].num;
+			dev->mr_table.mpt_base = profile[i].start;
+			init_hca->mpt_base     = profile[i].start;
+			init_hca->log_mpt_sz   = profile[i].log_num;
 			break;
 		case MTHCA_RES_MTT:
 			dev->limits.num_mtt_segs = profile[i].num;
@@ -259,6 +260,16 @@ u64 mthca_make_profile(struct mthca_dev 
 	 */
 	dev->limits.num_pds = MTHCA_NUM_PDS;
 
+	/* For Tavor, FMR MPTs and MTTs need to be ioremapped. On 32-bit systems
+	 * it may be too expensive to map all MTT memory, so we reserve some
+	 * MTTs for FMR access, taking them out of the MR pool. They don't take
+	 * additional memory, but we assign them as part of the HCA profile
+	 * anyway. */
+	if (dev->hca_type == ARBEL_NATIVE)
+		dev->limits.fmr_reserved_mtts = 0;
+	else
+		dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts;
+
 	kfree(profile);
 	return total_size;
 }
Index: hw/mthca/mthca_cmd.c
===================================================================
--- hw/mthca/mthca_cmd.c	(revision 2050)
+++ hw/mthca/mthca_cmd.c	(working copy)
@@ -1384,6 +1384,12 @@ int mthca_HW2SW_MPT(struct mthca_dev *de
 	return err;
 }
 
+int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B, status);
+}
+
+
 int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
 		    int num_mtt, u8 *status)
 {
Index: hw/mthca/mthca_profile.h
===================================================================
--- hw/mthca/mthca_profile.h	(revision 2050)
+++ hw/mthca/mthca_profile.h	(working copy)
@@ -48,6 +48,7 @@ struct mthca_profile {
 	int num_udav;
 	int num_uar;
 	int uarc_size;
+	int fmr_reserved_mtts;
 };
 
 u64 mthca_make_profile(struct mthca_dev *mdev,
Index: hw/mthca/mthca_doorbell.h
===================================================================
--- hw/mthca/mthca_doorbell.h	(revision 2050)
+++ hw/mthca/mthca_doorbell.h	(working copy)
@@ -51,6 +51,11 @@
 #define MTHCA_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
 #define MTHCA_GET_DOORBELL_LOCK(ptr)      (NULL)
 
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writeq((__force u64) val, dest);
+}
+
 static inline void mthca_write64(u32 val[2], void __iomem *dest,
 				 spinlock_t *doorbell_lock)
 {
@@ -74,6 +79,12 @@ static inline void mthca_write_db_rec(u3
 #define MTHCA_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
 #define MTHCA_GET_DOORBELL_LOCK(ptr)      (ptr)
 
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writel(((__force u32 *) &val)[0], dest);
+	__raw_writel(((__force u32 *) &val)[1], dest + 4);
+}
+
 static inline void mthca_write64(u32 val[2], void __iomem *dest,
 				 spinlock_t *doorbell_lock)
 {
Index: hw/mthca/mthca_cmd.h
===================================================================
--- hw/mthca/mthca_cmd.h	(revision 2050)
+++ hw/mthca/mthca_cmd.h	(working copy)
@@ -276,6 +276,7 @@ int mthca_HW2SW_MPT(struct mthca_dev *de
 		    int mpt_index, u8 *status);
 int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
 		    int num_mtt, u8 *status);
+int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status);
 int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
 		 int eq_num, u8 *status);
 int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
Index: hw/mthca/mthca_mr.c
===================================================================
--- hw/mthca/mthca_mr.c	(revision 2050)
+++ hw/mthca/mthca_mr.c	(working copy)
@@ -72,60 +72,107 @@ struct mthca_mpt_entry {
  * through the bitmaps)
  */
 
-static u32 __mthca_alloc_mtt(struct mthca_dev *dev, int order)
+static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order)
 {
 	int o;
 	int m;
 	u32 seg;
 
-	spin_lock(&dev->mr_table.mpt_alloc.lock);
+	spin_lock(&buddy->lock);
 
-	for (o = order; o <= dev->mr_table.max_mtt_order; ++o) {
-		m = 1 << (dev->mr_table.max_mtt_order - o);
-		seg = find_first_bit(dev->mr_table.mtt_buddy[o], m);
+	for (o = order; o <= buddy->max_order; ++o) {
+		m = 1 << (buddy->max_order - o);
+		seg = find_first_bit(buddy->bits[o], m);
 		if (seg < m)
 			goto found;
 	}
 
-	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+	spin_unlock(&buddy->lock);
 	return -1;
 
  found:
-	clear_bit(seg, dev->mr_table.mtt_buddy[o]);
+	clear_bit(seg, buddy->bits[o]);
 
 	while (o > order) {
 		--o;
 		seg <<= 1;
-		set_bit(seg ^ 1, dev->mr_table.mtt_buddy[o]);
+		set_bit(seg ^ 1, buddy->bits[o]);
 	}
 
-	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+	spin_unlock(&buddy->lock);
 
 	seg <<= order;
 
 	return seg;
 }
 
-static void __mthca_free_mtt(struct mthca_dev *dev, u32 seg, int order)
+static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order)
 {
 	seg >>= order;
 
-	spin_lock(&dev->mr_table.mpt_alloc.lock);
+	spin_lock(&buddy->lock);
 
-	while (test_bit(seg ^ 1, dev->mr_table.mtt_buddy[order])) {
-		clear_bit(seg ^ 1, dev->mr_table.mtt_buddy[order]);
+	while (test_bit(seg ^ 1, buddy->bits[order])) {
+		clear_bit(seg ^ 1, buddy->bits[order]);
 		seg >>= 1;
 		++order;
 	}
 
-	set_bit(seg, dev->mr_table.mtt_buddy[order]);
+	set_bit(seg, buddy->bits[order]);
 
-	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+	spin_unlock(&buddy->lock);
 }
 
-static u32 mthca_alloc_mtt(struct mthca_dev *dev, int order)
+static int __devinit mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
 {
-	u32 seg = __mthca_alloc_mtt(dev, order);
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	buddy->bits = kmalloc((buddy->max_order + 1) * sizeof (long *),
+			      GFP_KERNEL);
+	if (!buddy->bits)
+		goto err_out;
+
+	memset(buddy->bits, 0, (buddy->max_order + 1) * sizeof (long *));
+
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+		buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+		if (!buddy->bits[i])
+			goto err_out_free;
+		bitmap_zero(buddy->bits[i],
+			    1 << (buddy->max_order - i));
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+
+	return 0;
+
+err_out_free:
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+err_out:
+	return -ENOMEM;
+}
+
+static void __devexit mthca_buddy_cleanup(struct mthca_buddy *buddy)
+{
+	int i;
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+}
+
+
+static u32 mthca_alloc_mtt(struct mthca_dev *dev, int order,
+			   struct mthca_buddy *buddy)
+{
+	u32 seg = mthca_buddy_alloc(buddy, order);
 
 	if (seg == -1)
 		return -1;
@@ -133,36 +180,57 @@ static u32 mthca_alloc_mtt(struct mthca_
 	if (dev->hca_type == ARBEL_NATIVE)
 		if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg,
 					  seg + (1 << order) - 1)) {
-			__mthca_free_mtt(dev, seg, order);
+			mthca_buddy_free(buddy, seg, order);
 			seg = -1;
 		}
 
 	return seg;
 }
 
-static void mthca_free_mtt(struct mthca_dev *dev, u32 seg, int order)
+static void mthca_free_mtt(struct mthca_dev *dev, u32 seg, int order,
+			   struct mthca_buddy *buddy)
 {
-	__mthca_free_mtt(dev, seg, order);
+	mthca_buddy_free(buddy, seg, order);
 
 	if (dev->hca_type == ARBEL_NATIVE)
 		mthca_table_put_range(dev, dev->mr_table.mtt_table, seg,
 				      seg + (1 << order) - 1);
 }
 
+static inline u32 tavor_hw_index_to_key(u32 ind)
+{
+	return ind;
+}
+
+static inline u32 tavor_key_to_hw_index(u32 key)
+{
+	return key;
+}
+
+static inline u32 arbel_hw_index_to_key(u32 ind)
+{
+	return (ind >> 24) | (ind << 8);
+}
+
+static inline u32 arbel_key_to_hw_index(u32 key)
+{
+	return (key << 24) | (key >> 8);
+}
+
 static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
 {
 	if (dev->hca_type == ARBEL_NATIVE)
-		return (ind >> 24) | (ind << 8);
+		return arbel_hw_index_to_key(ind);
 	else
-		return ind;
+		return tavor_hw_index_to_key(ind);
 }
 
 static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
 {
 	if (dev->hca_type == ARBEL_NATIVE)
-		return (key << 24) | (key >> 8);
+		return arbel_key_to_hw_index(key);
 	else
-		return key;
+		return tavor_key_to_hw_index(key);
 }
 
 int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
@@ -268,7 +336,8 @@ int mthca_mr_alloc_phys(struct mthca_dev
 	     i <<= 1, ++mr->order)
 		; /* nothing */
 
-	mr->first_seg = mthca_alloc_mtt(dev, mr->order);
+	mr->first_seg = mthca_alloc_mtt(dev, mr->order,
+				       	&dev->mr_table.mtt_buddy);
 	if (mr->first_seg == -1)
 		goto err_out_table;
 
@@ -361,7 +430,7 @@ err_out_mailbox_free:
 	kfree(mailbox);
 
 err_out_free_mtt:
-	mthca_free_mtt(dev, mr->first_seg, mr->order);
+	mthca_free_mtt(dev, mr->first_seg, mr->order, &dev->mr_table.mtt_buddy);
 
 err_out_table:
 	if (dev->hca_type == ARBEL_NATIVE)
@@ -372,6 +441,19 @@ err_out_mpt_free:
 	return err;
 }
 
+/* Free mr or fmr */
+static void mthca_free_region(struct mthca_dev *dev, u32 lkey, int order,
+			      u32 first_seg, struct mthca_buddy *buddy)
+{
+	if (order >= 0)
+		mthca_free_mtt(dev, first_seg, order, buddy);
+
+	if (dev->hca_type == ARBEL_NATIVE)
+		mthca_table_put(dev, dev->mr_table.mpt_table,
+				arbel_key_to_hw_index(lkey));
+	mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
+}
+
 void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
 {
 	int err;
@@ -389,85 +471,411 @@ void mthca_free_mr(struct mthca_dev *dev
 		mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
 			   status);
 
-	if (mr->order >= 0)
-		mthca_free_mtt(dev, mr->first_seg, mr->order);
+	mthca_free_region(dev, mr->ibmr.lkey, mr->order, mr->first_seg,
+			  &dev->mr_table.mtt_buddy);
+}
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+			u32 access, struct mthca_fmr *mr)
+{
+	struct mthca_mpt_entry *mpt_entry;
+	void *mailbox;
+	u64 mtt_seg;
+	u32 key, idx;
+	u8 status;
+	int i, err = -ENOMEM, list_len = mr->attr.max_pages;
+
+	might_sleep();
+
+	if (mr->attr.page_size < 12 || mr->attr.page_size >= 32)
+		return -EINVAL;
+
+	/* For Arbel, all MTTs must fit in the same page. */
+	if (dev->hca_type == ARBEL_NATIVE &&
+	    mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
+		return -EINVAL;
+
+	mr->maps = 0;
+
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+
+	idx = key & (dev->limits.num_mpts - 1);
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+		if (err)
+			goto err_out_mpt_free;
+
+		mr->mem.arbel.mpt =
+		       	mthca_table_find(dev, dev->mr_table.mpt_table, key);
+
+		BUG_ON(!mr->mem.arbel.mpt);
+	} else
+		mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
+		       	sizeof *(mr->mem.tavor.mpt) * idx;
+
+	for (i = MTHCA_MTT_SEG_SIZE / 8, mr->order = 0;
+	     i < list_len;
+	     i <<= 1, ++mr->order)
+		; /* nothing */
+
+	mr->first_seg = mthca_alloc_mtt(dev, mr->order,
+				       	dev->mr_table.fmr_mtt_buddy);
+	if (mr->first_seg == -1)
+		goto err_out_table;
+
+	mtt_seg = mr->first_seg * MTHCA_MTT_SEG_SIZE;
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		mr->mem.arbel.mtts = mthca_table_find(dev,
+						      dev->mr_table.mtt_table,
+						      mr->first_seg);
+		BUG_ON(!mr->mem.arbel.mtts);
+	} else
+		mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
+
+	mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out_free_mtt;
+
+	mpt_entry = MAILBOX_ALIGN(mailbox);
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+
+	mpt_entry->page_size = cpu_to_be32(mr->attr.page_size - 12);
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	memset(&mpt_entry->start, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
+	mpt_entry->mtt_seg   = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mpt_entry,
+			      key & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_out_mailbox_free;
+	} else if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n", status);
+		err = -EINVAL;
+		goto err_out_mailbox_free;
+	}
+
+	kfree(mailbox);
+	return 0;
+
+err_out_mailbox_free:
+	kfree(mailbox);
+
+err_out_free_mtt:
+	mthca_free_mtt(dev, mr->first_seg, mr->order,
+		       dev->mr_table.fmr_mtt_buddy);
 
+err_out_table:
 	if (dev->hca_type == ARBEL_NATIVE)
-		mthca_table_put(dev, dev->mr_table.mpt_table,
-				key_to_hw_index(dev, mr->ibmr.lkey));
-	mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
+		mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, key);
+	return err;
+}
+
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	if (fmr->maps)
+		return -EBUSY;
+
+	mthca_free_region(dev, fmr->ibmr.lkey, fmr->order, fmr->first_seg,
+			  dev->mr_table.fmr_mtt_buddy);
+	return 0;
+}
+
+#define MTHCA_MPT_STATUS_SW 0xF0
+#define MTHCA_MPT_STATUS_HW 0x00
+
+static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
+				    int list_len, u64 iova)
+{
+	int i, page_mask;
+
+	if (list_len > fmr->attr.max_pages)
+		return -EINVAL;
+
+	page_mask = (1 << fmr->attr.page_size) - 1;
+
+	/* We are getting page lists, so va must be page aligned. */
+	if (iova & page_mask)
+		return -EINVAL;
+
+	/* Trust the user not to pass misaligned data in page_list */
+	if (0)
+		for (i = 0; i < list_len; ++i) {
+			if (page_list[i] & ~page_mask)
+				return -EINVAL;
+		}
+
+	if (fmr->maps >= fmr->attr.max_maps)
+		return -EINVAL;
+
+	return 0;
+}
+
+
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+				    int list_len, u64 iova)
+{
+	struct mthca_fmr *fmr = to_mfmr(ibfmr);
+	struct mthca_dev *dev = to_mdev(ibfmr->device);
+	struct mthca_mpt_entry mpt_entry;
+	u32 key;
+	int i, err;
+
+	if ((err = mthca_check_fmr(fmr, page_list, list_len, iova)))
+		return err;
+
+	fmr->maps++;
+
+	key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+	key += dev->limits.num_mpts;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+
+	for (i = 0; i < list_len; ++i) {
+		__be64 mtt_entry = cpu_to_be64(page_list[i] |
+					       MTHCA_MTT_FLAG_PRESENT);
+		mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
+	}
+
+	mpt_entry.lkey = cpu_to_be32(key);
+	mpt_entry.length = cpu_to_be64(((u64)list_len) *
+				       (1 << fmr->attr.page_size));
+	mpt_entry.start = cpu_to_be64(iova);
+
+	writel(mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
+	memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start, 
+		    offsetof(struct mthca_mpt_entry, window_count) -
+		    offsetof(struct mthca_mpt_entry, start));
+
+	writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
+
+	return 0;
+}
+
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+				    int list_len, u64 iova)
+{
+	struct mthca_fmr *fmr = to_mfmr(ibfmr);
+	struct mthca_dev *dev = to_mdev(ibfmr->device);
+	struct mthca_mpt_entry *mpt_entry;
+	u8 *mpt_status;
+	u32 key;
+	int i, err;
+
+	if ((err = mthca_check_fmr(fmr, page_list, list_len, iova)))
+		return err;
+
+	fmr->maps++;
+
+	key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+	key += dev->limits.num_mpts;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
+	mpt_status = (u8 *)fmr->mem.arbel.mpt;
+	*mpt_status = MTHCA_MPT_STATUS_SW;
+
+	wmb();
+
+	for (i = 0; i < list_len; ++i) {
+		fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
+				    MTHCA_MTT_FLAG_PRESENT);
+	}
+
+	mpt_entry = fmr->mem.arbel.mpt;
+	fmr->mem.arbel.mpt->lkey = mpt_entry->key = cpu_to_be32(key);
+	fmr->mem.arbel.mpt->length = cpu_to_be64(((u64)list_len) *
+						 (1 << fmr->attr.page_size));
+	fmr->mem.arbel.mpt->start = cpu_to_be64(iova);
+
+	wmb();
+
+	*mpt_status = MTHCA_MPT_STATUS_HW;
+
+	wmb();
+
+	return 0;
+}
+
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	u32 key;
+
+	if (!fmr->maps)
+		return;
+
+	key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+	key &= dev->limits.num_mpts - 1;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+	fmr->maps = 0;
+
+	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+}
+
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	u32 key;
+	u8 *mpt_status;
+
+	if (!fmr->maps)
+		return;
+
+	key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+	key &= dev->limits.num_mpts - 1;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
+	fmr->maps = 0;
+
+	mpt_status = (u8 *) fmr->mem.arbel.mpt;
+	*mpt_status = MTHCA_MPT_STATUS_SW;
 }
 
 int __devinit mthca_init_mr_table(struct mthca_dev *dev)
 {
-	int err;
-	int i, s;
+	int err, i;
+
 
 	err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
 			       dev->limits.num_mpts,
 			       ~0, dev->limits.reserved_mrws);
 	if (err)
-		return err;
+		goto err_mpt_alloc;
 
-	err = -ENOMEM;
+	if (dev->hca_type != ARBEL_NATIVE &&
+	    (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN))
+		dev->limits.fmr_reserved_mtts = 0;
+	else
+		dev->mthca_flags |= MTHCA_FLAG_FMR;
 
-	for (i = 1, dev->mr_table.max_mtt_order = 0;
-	     i < dev->limits.num_mtt_segs;
-	     i <<= 1, ++dev->mr_table.max_mtt_order)
-		; /* nothing */
+	i = fls(dev->limits.num_mtt_segs - 1);
+	err = mthca_buddy_init(&dev->mr_table.mtt_buddy, i);
 
-	dev->mr_table.mtt_buddy = kmalloc((dev->mr_table.max_mtt_order + 1) *
-					  sizeof (long *),
-					  GFP_KERNEL);
-	if (!dev->mr_table.mtt_buddy)
-		goto err_out;
+	if (err)
+		goto err_mtt_buddy;
 
-	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
-		dev->mr_table.mtt_buddy[i] = NULL;
+	dev->mr_table.tavor_fmr.mpt_base = NULL;
+	dev->mr_table.tavor_fmr.mtt_base = NULL;
 
-	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i) {
-		s = BITS_TO_LONGS(1 << (dev->mr_table.max_mtt_order - i));
-		dev->mr_table.mtt_buddy[i] = kmalloc(s * sizeof (long),
-						     GFP_KERNEL);
-		if (!dev->mr_table.mtt_buddy[i])
-			goto err_out_free;
-		bitmap_zero(dev->mr_table.mtt_buddy[i],
-			    1 << (dev->mr_table.max_mtt_order - i));
-	}
+	if (dev->limits.fmr_reserved_mtts) {
+		i = fls(dev->limits.fmr_reserved_mtts - 1);
 
-	set_bit(0, dev->mr_table.mtt_buddy[dev->mr_table.max_mtt_order]);
+		if (i >= 31) {
+			mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n");
+			err = -EINVAL;
+			goto err_fmr_mpt;
+		}
 
-	for (i = 0; i < dev->mr_table.max_mtt_order; ++i)
-		if (1 << i >= dev->limits.reserved_mtts)
-			break;
+		dev->mr_table.tavor_fmr.mpt_base =
+		       	ioremap(dev->mr_table.mpt_base,
+				(1 << i) * sizeof(struct mthca_mpt_entry));
+
+		if (!dev->mr_table.tavor_fmr.mpt_base) {
+			mthca_warn(dev, "MPT ioremap for FMR failed.\n");
+			err = -ENOMEM;
+			goto err_fmr_mpt;
+		}
 
-	if (i == dev->mr_table.max_mtt_order) {
-		mthca_err(dev, "MTT table of order %d is "
-			  "too small.\n", i);
-		goto err_out_free;
-	}
+		dev->mr_table.tavor_fmr.mtt_base =
+			ioremap(dev->mr_table.mtt_base,
+				(1 << i) * MTHCA_MTT_SEG_SIZE);
+		if (!dev->mr_table.tavor_fmr.mtt_base) {
+			mthca_warn(dev, "MTT ioremap for FMR failed.\n");
+			err = -ENOMEM;
+			goto err_fmr_mtt;
+		}
 
-	(void) mthca_alloc_mtt(dev, i);
+		err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, i);
+		if (err)
+			goto err_fmr_mtt_buddy;
+
+		/* Prevent regular MRs from using FMR keys */
+		err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, i);
+		if (err)
+			goto err_reserve_fmr;
+
+		dev->mr_table.fmr_mtt_buddy =
+		       	&dev->mr_table.tavor_fmr.mtt_buddy;
+	} else
+		dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy;
+
+	/* FMR MTTs come first, so take reserved MTTs out of that region */
+	if (dev->limits.reserved_mtts) {
+		int seg;
+		i = fls(dev->limits.reserved_mtts - 1);
+		seg = mthca_alloc_mtt(dev, i, dev->mr_table.fmr_mtt_buddy);
+
+		if (seg == -1) {
+			mthca_warn(dev, "MTT table of order %d is too small.\n",
+				  dev->mr_table.fmr_mtt_buddy->max_order);
+			err = -ENOMEM;
+			goto err_reserve_mtts;
+		}
+	}
 
 	return 0;
 
- err_out_free:
-	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
-		kfree(dev->mr_table.mtt_buddy[i]);
+err_reserve_mtts:
+err_reserve_fmr:
+
+	if (dev->limits.fmr_reserved_mtts)
+		mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+err_fmr_mtt_buddy:
+
+	if (dev->mr_table.tavor_fmr.mtt_base)
+		iounmap(dev->mr_table.tavor_fmr.mtt_base);
+err_fmr_mtt:
+
+	if (dev->mr_table.tavor_fmr.mpt_base)
+		iounmap(dev->mr_table.tavor_fmr.mpt_base);
+err_fmr_mpt:
+
+	mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+err_mtt_buddy:
 
- err_out:
 	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+err_mpt_alloc:
 
 	return err;
 }
 
 void __devexit mthca_cleanup_mr_table(struct mthca_dev *dev)
 {
-	int i;
-
 	/* XXX check if any MRs are still allocated? */
-	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
-		kfree(dev->mr_table.mtt_buddy[i]);
-	kfree(dev->mr_table.mtt_buddy);
+	if (dev->limits.fmr_reserved_mtts)
+		mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+	mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+	if (dev->mr_table.tavor_fmr.mtt_base)
+		iounmap(dev->mr_table.tavor_fmr.mtt_base);
+	if (dev->mr_table.tavor_fmr.mpt_base)
+		iounmap(dev->mr_table.tavor_fmr.mpt_base);
 	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
 }
Index: hw/mthca/mthca_memfree.c
===================================================================
--- hw/mthca/mthca_memfree.c	(revision 2050)
+++ hw/mthca/mthca_memfree.c	(working copy)
@@ -192,6 +192,48 @@ void mthca_table_put(struct mthca_dev *d
 	up(&table->mutex);
 }
 
+void *mthca_table_find(struct mthca_dev *dev,
+			      struct mthca_icm_table *table, int obj)
+{
+	int idx, offset, i;
+	struct mthca_icm_chunk *chunk;
+	struct mthca_icm *icm;
+	struct page *page = NULL;
+	void *p = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	down(&table->mutex);
+
+	idx = (obj & (table->num_obj - 1)) * table->obj_size;
+	icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
+	offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+
+	if (!icm)
+		goto out;
+
+	/* Linear scan of ICM on each access. Eventually we may want to
+	 * rearrange things to use some kind of tree. */
+
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			if (chunk->mem[i].length > offset) {
+				page = chunk->mem[i].page;
+				goto found;
+			}
+			offset -= chunk->mem[i].length;
+		}
+	}
+
+found:
+	p = page ? lowmem_page_address(page) + offset : NULL;
+
+out:
+	up(&table->mutex);
+	return p;
+}
+
 int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 			  int start, int end)
 {

-- 
MST - Michael S. Tsirkin


