[openib-general] [PATCH] fmr support in mthca

Roland Dreier roland at topspin.com
Thu Mar 17 19:59:31 PST 2005


Thanks for implementing this.  You saved me a lot of work and Libor
will be very happy when he gets back next week.

Some comments from my first read through:

 > This patch implements FMR support. I also rolled into it two fixes
 > for regular mrs that I posted previously, let me know if it's a problem.

No problem, although I'll apply them separately.

 > This seems to be working fine for me, although I only did relatively basic
 > tests. Both Tavor and Arbel Native modes are supported. I made some tradeoffs
 > for simplicity, let me know what you think:
 > - for tavor, I keep for each fmr two pages mapped: one for mpt and one for
 >   mtt access. This uses more kernel virtual memory than necessary,
 >   since many mpts could share one page. Alternatives are:
 >   map/unmap io memory on each fmr map/unmap request, or
 >   keep an intermediate table to map each page only once.

I don't think this is acceptable.  Each ioremap has to map at least
one page plus a guard page.  With two ioremaps per FMR, every FMR is
using 16K (or more) of vmalloc space.  On 64 bit archs, this doesn't
matter, but on a large memory i386 machine, there's less than 128 MB
of vmalloc space available (possibly a lot less if someone is using a
video card with a big frame buffer or something).  That means we're
limited to a few thousand FMRs, which isn't enough.

What if we just reserve something like 64K MPTs and MTTs for FMRs and
ioremap everything at driver startup?  That would only use a few MB of
vmalloc space and probably simplify the code too.
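
Roughly what I have in mind (untested sketch; "mthca_num_fmrs" and the
tavor_fmr fields are made-up names):

	/* at driver init: map the MPT/MTT space reserved for FMRs exactly once */
	dev->mr_table.tavor_fmr.mpt_base =
		ioremap(dev->mr_table.mpt_base,
			mthca_num_fmrs * sizeof (struct mthca_mpt_entry));
	dev->mr_table.tavor_fmr.mtt_base =
		ioremap(dev->mr_table.mtt_base,
			mthca_num_fmrs * dev->limits.mtt_seg_size);

	/* mthca_fmr_alloc() then just computes pointers into those mappings,
	 * assuming FMR keys/MTT segments come from the reserved range */
	mr->mpt  = dev->mr_table.tavor_fmr.mpt_base + idx;
	mr->mtts = dev->mr_table.tavor_fmr.mtt_base +
		mr->first_seg * dev->limits.mtt_seg_size / sizeof (u64);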

 > - icm that has the mpts/mtts is linearly scanned and this is repeated
 >   for each mtt on each fmr map. This may be improved somewhat
 >   with some kind of an iterator, but to really speed things up
 >   the icm data structure (list of arrays) would have to
 >   be replaced by some kind of tree.

I don't understand this.  I'm probably missing something but the
addresses don't change after we allocate the FMR, right?  It seems we
could just store the MPT/MTT address in the FMR structure the same way
we do for Tavor mode.
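
That is, something like this in mthca_fmr_alloc() for the Arbel case
(sketch only; the cached pointers would want their own non-__iomem
fields in struct mthca_fmr):

	mr->arbel_mpt  = mthca_table_find(dev, dev->mr_table.mpt_table, key);
	mr->arbel_mtts = mthca_table_find(dev, dev->mr_table.mtt_table,
					  mr->first_seg);
	if (!mr->arbel_mpt || !mr->arbel_mtts)
		goto err_out_free_mtt;

and then mthca_arbel_fmr_map() could just dereference the cached
pointers instead of rescanning the ICM for every MTT entry.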

Some more nitpicky comments below...

 > 
 > Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
 > 
 > Index: hw/mthca/mthca_dev.h
 > ===================================================================
 > --- hw/mthca/mthca_dev.h	(revision 2012)
 > +++ hw/mthca/mthca_dev.h	(working copy)
 > @@ -163,6 +163,7 @@ struct mthca_mr_table {
 >  	int                     max_mtt_order;
 >  	unsigned long         **mtt_buddy;
 >  	u64                     mtt_base;
 > +	u64                     mpt_base; /* Tavor only */
 >  	struct mthca_icm_table *mtt_table;
 >  	struct mthca_icm_table *mpt_table;
 >  };
 > @@ -363,7 +364,17 @@ int mthca_mr_alloc_phys(struct mthca_dev
 >  			u64 *buffer_list, int buffer_size_shift,
 >  			int list_len, u64 iova, u64 total_size,
 >  			u32 access, struct mthca_mr *mr);
 > -void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
 > +void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
 > +
 > +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 > +			u32 access, struct mthca_fmr *fmr);
 > +
 > +int mthca_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +		  u64 *page_list, int list_len, u64 iova);
 > +
 > +void mthca_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
 > +
 > +void mthca_free_fmr(struct mthca_dev *dev,  struct mthca_fmr *fmr);
 >  
 >  int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
 >  void mthca_unmap_eq_icm(struct mthca_dev *dev);
 > Index: hw/mthca/mthca_memfree.h
 > ===================================================================
 > --- hw/mthca/mthca_memfree.h	(revision 2012)
 > +++ hw/mthca/mthca_memfree.h	(working copy)
 > @@ -90,6 +90,11 @@ int mthca_table_get_range(struct mthca_d
 >  void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 >  			   int start, int end);
 >  
 > +/* Nonblocking. Callers must make sure the object exists by serializing against
 > + * callers of get/put. */
 > +void *mthca_table_find(struct mthca_dev *dev, struct mthca_icm_table *table,
 > +		       int obj);

Can we just make this use the table mutex and only call it when
allocating an FMR?
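
I.e. something like (sketch):

	void *mthca_table_find(struct mthca_dev *dev,
			       struct mthca_icm_table *table, int obj)
	{
		void *addr = NULL;

		down(&table->mutex);
		/* ... existing lowmem check and chunk scan, computing addr ... */
		up(&table->mutex);

		return addr;
	}

and then only call it from mthca_fmr_alloc(), where sleeping is fine.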

 > +
 >  static inline void mthca_icm_first(struct mthca_icm *icm,
 >  				   struct mthca_icm_iter *iter)
 >  {
 > Index: hw/mthca/mthca_provider.c
 > ===================================================================
 > --- hw/mthca/mthca_provider.c	(revision 2012)
 > +++ hw/mthca/mthca_provider.c	(working copy)
 > @@ -568,8 +568,101 @@ static struct ib_mr *mthca_reg_phys_mr(s
 >  
 >  static int mthca_dereg_mr(struct ib_mr *mr)
 >  {
 > -	mthca_free_mr(to_mdev(mr->device), to_mmr(mr));
 > -	kfree(mr);
 > +	struct mthca_mr *mmr = to_mmr(mr);
 > +	mthca_free_mr(to_mdev(mr->device), mmr);
 > +	kfree(mmr);
 > +	return 0;
 > +}
 > +
 > +static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 > +			   struct ib_fmr_attr *fmr_attr)
 > +{
 > +	struct mthca_fmr *fmr;
 > +	int err;
 > +	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
 > +	if (!fmr)
 > +		return ERR_PTR(-ENOMEM);
 > +
 > +	memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
 > +	err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
 > +			      convert_access(mr_access_flags), fmr);
 > +
 > +	if (err) {
 > +		kfree(fmr);
 > +		return ERR_PTR(err);
 > +	}
 > +	return &fmr->ibmr;
 > +}
 > +
 > +static int mthca_dealloc_fmr(struct ib_fmr *fmr)
 > +{
 > +	struct mthca_fmr *mfmr = to_mfmr(fmr);
 > +	mthca_free_fmr(to_mdev(fmr->device), mfmr);
 > +	kfree(mfmr);
 > +	return 0;
 > +}
 > +
 > +static int mthca_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len,
 > +			      u64 iova)
 > +{
 > +	int i, page_mask;
 > +	struct mthca_fmr *mfmr;
 > +
 > +	mfmr = to_mfmr(fmr);
 > +
 > +	if (list_len > mfmr->attr.max_pages) {
 > +		mthca_warn(to_mdev(fmr->device), "Attempt to map list length "
 > +			   "%d to fmr with max %d pages\n",
 > +			   list_len, mfmr->attr.max_pages);
 > +		return -EINVAL;
 > +	}
 > +
 > +	page_mask = (1 << mfmr->attr.page_size) - 1;
 > +
 > +	/* We are getting page lists, so va must be page aligned. */
 > +	if (iova & page_mask) {
 > +		mthca_warn(to_mdev(fmr->device), "Attempt to map fmr with page "
 > +			   "shift %d at misaligned virtual address %016llx\n",
 > +			   mfmr->attr.page_size, iova);
 > +		return -EINVAL;
 > +	}
 > +
 > +	/* Trust the user not to pass misaligned data in page_list */
 > +	if (0)
 > +		for (i = 0; i < list_len; ++i) {
 > +			if (page_list[i] & page_mask) {
 > +				mthca_warn(to_mdev(fmr->device), "Attempt to "
 > +					   "map fmr with page shift %d at "
 > +					   "address %016llx\n",
 > +					   mfmr->attr.page_size, page_list[i]);
 > +				return -EINVAL;
 > +			}
 > +		}
 > +
 > +	return mthca_fmr_map(to_mdev(fmr->device), to_mfmr(fmr), page_list,
 > +			     list_len, iova);
 > +}
 > +
 > +static int mthca_unmap_fmr(struct list_head *fmr_list)
 > +{
 > +	struct mthca_dev* mdev = NULL;
 > +	struct ib_fmr *fmr;
 > +	int err;
 > +	u8 status;
 > +
 > +	list_for_each_entry(fmr, fmr_list, list) {
 > +		mdev = to_mdev(fmr->device);
 > +		mthca_fmr_unmap(mdev, to_mfmr(fmr));
 > +	}
 > +
 > +	if (!mdev)
 > +		return 0;
 > +
 > +	err = mthca_SYNC_TPT(mdev, &status);
 > +	if (err)
 > +		return err;
 > +	if (status)
 > +		return -EINVAL;
 >  	return 0;
 >  }
 >  
 > @@ -636,6 +729,15 @@ int mthca_register_device(struct mthca_d
 >  	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
 >  	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
 >  	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
 > +
 > +	if (dev->hca_type == ARBEL_NATIVE ||
 > +	    !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
 > +		dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
 > +		dev->ib_dev.map_phys_fmr         = mthca_map_phys_fmr;
 > +		dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
 > +		dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
 > +	}
 > +
 >  	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
 >  	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
 >  	dev->ib_dev.process_mad          = mthca_process_mad;
 > Index: hw/mthca/mthca_provider.h
 > ===================================================================
 > --- hw/mthca/mthca_provider.h	(revision 2012)
 > +++ hw/mthca/mthca_provider.h	(working copy)
 > @@ -60,6 +60,16 @@ struct mthca_mr {
 >  	u32 first_seg;
 >  };
 >  
 > +struct mthca_fmr {
 > +	struct ib_fmr ibmr;
 > +	struct ib_fmr_attr attr;
 > +	int order;
 > +	u32 first_seg;
 > +	int maps;
 > +	struct mthca_mpt_entry __iomem *mpt;  /* Tavor only */
 > +	u64 __iomem *mtts; /* Tavor only */
 > +};
 > +
 >  struct mthca_pd {
 >  	struct ib_pd    ibpd;
 >  	u32             pd_num;
 > @@ -218,6 +228,11 @@ struct mthca_sqp {
 >  	dma_addr_t      header_dma;
 >  };
 >  
 > +static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
 > +{
 > +	return container_of(ibmr, struct mthca_fmr, ibmr);
 > +}
 > +
 >  static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
 >  {
 >  	return container_of(ibmr, struct mthca_mr, ibmr);
 > Index: hw/mthca/mthca_profile.c
 > ===================================================================
 > --- hw/mthca/mthca_profile.c	(revision 2012)
 > +++ hw/mthca/mthca_profile.c	(working copy)
 > @@ -223,9 +223,10 @@ u64 mthca_make_profile(struct mthca_dev 
 >  			init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
 >  			break;
 >  		case MTHCA_RES_MPT:
 > -			dev->limits.num_mpts = profile[i].num;
 > -			init_hca->mpt_base   = profile[i].start;
 > -			init_hca->log_mpt_sz = profile[i].log_num;
 > +			dev->limits.num_mpts   = profile[i].num;
 > +			dev->mr_table.mpt_base = profile[i].start;
 > +			init_hca->mpt_base     = profile[i].start;
 > +			init_hca->log_mpt_sz   = profile[i].log_num;
 >  			break;
 >  		case MTHCA_RES_MTT:
 >  			dev->limits.num_mtt_segs = profile[i].num;
 > Index: hw/mthca/mthca_cmd.c
 > ===================================================================
 > --- hw/mthca/mthca_cmd.c	(revision 2012)
 > +++ hw/mthca/mthca_cmd.c	(working copy)
 > @@ -1406,6 +1406,11 @@ int mthca_WRITE_MTT(struct mthca_dev *de
 >  	return err;
 >  }
 >  
 > +int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status)
 > +{
 > +	return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B, status);
 > +}
 > +
 >  int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
 >  		 int eq_num, u8 *status)
 >  {
 > Index: hw/mthca/mthca_cmd.h
 > ===================================================================
 > --- hw/mthca/mthca_cmd.h	(revision 2012)
 > +++ hw/mthca/mthca_cmd.h	(working copy)
 > @@ -277,6 +277,7 @@ int mthca_HW2SW_MPT(struct mthca_dev *de
 >  		    int mpt_index, u8 *status);
 >  int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
 >  		    int num_mtt, u8 *status);
 > +int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status);
 >  int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
 >  		 int eq_num, u8 *status);
 >  int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
 > Index: hw/mthca/mthca_mr.c
 > ===================================================================
 > --- hw/mthca/mthca_mr.c	(revision 2012)
 > +++ hw/mthca/mthca_mr.c	(working copy)
 > @@ -368,11 +368,13 @@ err_out_table:
 >  		mthca_table_put(dev, dev->mr_table.mpt_table, key);
 >  
 >  err_out_mpt_free:
 > -	mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
 > +	mthca_free(&dev->mr_table.mpt_alloc, key);
 >  	return err;
 >  }
 >  
 > -void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
 > +/* Free mr or fmr */
 > +static void mthca_free_region(struct mthca_dev *dev, u32 lkey, int order,
 > +			      u32 first_seg)
 >  {
 >  	int err;
 >  	u8 status;
 > @@ -380,7 +382,7 @@ void mthca_free_mr(struct mthca_dev *dev
 >  	might_sleep();
 >  
 >  	err = mthca_HW2SW_MPT(dev, NULL,
 > -			      key_to_hw_index(dev, mr->ibmr.lkey) &
 > +			      key_to_hw_index(dev, lkey) &
 >  			      (dev->limits.num_mpts - 1),
 >  			      &status);
 >  	if (err)
 > @@ -389,15 +391,276 @@ void mthca_free_mr(struct mthca_dev *dev
 >  		mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
 >  			   status);
 >  
 > -	if (mr->order >= 0)
 > -		mthca_free_mtt(dev, mr->first_seg, mr->order);
 > +	if (order >= 0)
 > +		mthca_free_mtt(dev, first_seg, order);
 >  
 >  	if (dev->hca_type == ARBEL_NATIVE)
 >  		mthca_table_put(dev, dev->mr_table.mpt_table,
 > -				key_to_hw_index(dev, mr->ibmr.lkey));
 > -	mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
 > +				key_to_hw_index(dev, lkey));
 > +	mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
 > +}
 > +
 > +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
 > +{
 > +	mthca_free_region(dev, mr->ibmr.lkey, mr->order, mr->first_seg);
 > +}
 > +
 > +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 > +			u32 access, struct mthca_fmr *mr)
 > +{
 > +	void *mailbox;
 > +	u64 mtt_seg;
 > +	struct mthca_mpt_entry *mpt_entry;
 > +	u32 key, idx;
 > +	int err = -ENOMEM;
 > +	u8 status;
 > +	int i;
 > +	int list_len = mr->attr.max_pages;
 > +
 > +	might_sleep();
 > +
 > +	BUG_ON(mr->attr.page_size < 12);
 > +	WARN_ON(mr->attr.page_size >= 32);

Why is one a BUG and one a WARN?  Why not just return an error in both
cases?  Something like:

	if (mr->attr.page_size < 12 || mr->attr.page_size >= 32)
		return -EINVAL;

 > +
 > +	mr->maps = 0;
 > +
 > +	key = mthca_alloc(&dev->mr_table.mpt_alloc);
 > +	if (key == -1)
 > +		return -ENOMEM;
 > +	idx = key & (dev->limits.num_mpts - 1);
 > +	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
 > +
 > +	if (dev->hca_type == ARBEL_NATIVE) {
 > +		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
 > +		if (err)
 > +			goto err_out_mpt_free;
 > +	}
 > +	for (i = dev->limits.mtt_seg_size / 8, mr->order = 0;
 > +	     i < list_len;
 > +	     i <<= 1, ++mr->order)
 > +		; /* nothing */
 > +
 > +	mr->first_seg = mthca_alloc_mtt(dev, mr->order);
 > +	if (mr->first_seg == -1)
 > +		goto err_out_table;
 > +
 > +	mtt_seg = dev->mr_table.mtt_base +
 > +		mr->first_seg * dev->limits.mtt_seg_size;
 > +
 > +	mr->mpt = NULL;
 > +	mr->mtts = NULL;
 > +
 > +	if (dev->hca_type != ARBEL_NATIVE) {
 > +		mr->mpt = ioremap(dev->mr_table.mpt_base +
 > +				  sizeof *(mr->mpt) * idx, sizeof *(mr->mpt));
 > +		if (!mr->mpt) {
 > +			mthca_dbg(dev, "Couldn't map MPT entry for fmr %08x.\n",
 > +				  mr->ibmr.lkey);
 > +			goto err_out_free_mtt;
 > +		}
 > +		mr->mtts = ioremap(mtt_seg, list_len * sizeof *(mr->mtts));
 > +		if (!mr->mtts) {
 > +			mthca_dbg(dev, "Couldn't map MTT entry %016llx "
 > +				  "(size %x) for fmr %08x.\n", mtt_seg,
 > +				  list_len * sizeof u64, mr->ibmr.lkey);
 > +			goto err_out_free_mtt;
 > +		}
 > +	}
 > +
 > +	mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
 > +			  GFP_KERNEL);
 > +	if (!mailbox)
 > +		goto err_out_free_mtt;
 > +
 > +	mpt_entry = MAILBOX_ALIGN(mailbox);
 > +
 > +	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
 > +				       MTHCA_MPT_FLAG_MIO         |
 > +				       MTHCA_MPT_FLAG_REGION      |
 > +				       access);
 > +
 > +	mpt_entry->page_size = cpu_to_be32(mr->attr.page_size - 12);
 > +	mpt_entry->key       = cpu_to_be32(key);
 > +	mpt_entry->pd        = cpu_to_be32(pd);
 > +	memset(&mpt_entry->start, 0,
 > +	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
 > +	mpt_entry->mtt_seg   = cpu_to_be64(mtt_seg);
 > +
 > +	if (0) {
 > +		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
 > +		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
 > +			if (i % 4 == 0)
 > +				printk("[%02x] ", i * 4);
 > +			printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
 > +			if ((i + 1) % 4 == 0)
 > +				printk("\n");
 > +		}
 > +	}
 > +
 > +	err = mthca_SW2HW_MPT(dev, mpt_entry,
 > +			      key & (dev->limits.num_mpts - 1),
 > +			      &status);
 > +	if (err)
 > +		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
 > +	else if (status) {
 > +		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
 > +			   status);
 > +		err = -EINVAL;
 > +	}
 > +
 > +	kfree(mailbox);
 > +	return err;
 > +
 > +err_out_free_mtt:
 > +	if (mr->mtts)
 > +		iounmap(mr->mtts);
 > +	if (mr->mpt)
 > +		iounmap(mr->mpt);
 > +	mthca_free_mtt(dev, mr->first_seg, mr->order);
 > +
 > +err_out_table:
 > +	if (dev->hca_type == ARBEL_NATIVE)
 > +		mthca_table_put(dev, dev->mr_table.mpt_table, key);
 > +
 > +err_out_mpt_free:
 > +	mthca_free(&dev->mr_table.mpt_alloc, key);
 > +	return err;
 > +}
 > +
 > +void mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
 > +{
 > +	mthca_free_region(dev, fmr->ibmr.lkey, fmr->order, fmr->first_seg);
 > +}
 > +
 > +#define MTHCA_MPT_STATUS_SW 0xF
 > +#define MTHCA_MPT_STATUS_HW 0x0
 > +
 > +static void mthca_tavor_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +		  u64 *page_list, int list_len, u64 iova, u32 key)
 > +{
 > +	struct mthca_mpt_entry mpt_entry;
 > +	int i;
 > +
 > +	writeb(MTHCA_MPT_STATUS_SW,fmr->mpt);

It's a minor nitpick, but please put a space after the ",".  (This
appears a few more times below too.)

 > +
 > +	wmb();

Are the wmb()s required in this function?  writeX() is already ordered.

 > +
 > +	for (i = 0; i < list_len; ++i) {
 > +		u64 mtt_entry = cpu_to_be64(page_list[i] |
 > +					    MTHCA_MTT_FLAG_PRESENT);
 > +		writeq(mtt_entry, fmr->mtts + i);

Don't use writeq() unconditionally.  32 bit archs don't have it.
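
A small helper along these lines would work everywhere (sketch, made-up
name; __raw_writel/__raw_writeq are fine here since the value is already
byte-swapped and ordering doesn't matter while the MPT is in SW
ownership):

	static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
	{
	#if BITS_PER_LONG == 64
		__raw_writeq((__force u64) val, dest);
	#else
		__raw_writel(((__force u32 *) &val)[0], dest);
		__raw_writel(((__force u32 *) &val)[1], dest + 4);
	#endif
	}

called as mthca_write64_raw(cpu_to_be64(...), fmr->mtts + i).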

 > +	}
 > +	mpt_entry.lkey = cpu_to_be32(key);
 > +	mpt_entry.length = cpu_to_be64(((u64)list_len) *
 > +				       (1 << fmr->attr.page_size));
 > +	mpt_entry.start = cpu_to_be64(iova);
 > +
 > +	writel(mpt_entry.lkey, &fmr->mpt->key);
 > +	memcpy_toio(&fmr->mpt->start, &mpt_entry.start, 
 > +		    offsetof(struct mthca_mpt_entry, window_count) -
 > +		    offsetof(struct mthca_mpt_entry, start));
 > +
 > +	wmb();
 > +
 > +	writeb(MTHCA_MPT_STATUS_SW,fmr->mpt);

Should this be STATUS_HW here?  It seems we exit the function with the
MPT marked invalid.

 > +
 > +	wmb();
 > +}
 > +
 > +static int mthca_arbel_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +		  u64 *page_list, int list_len, u64 iova, u32 key)
 > +{
 > +	void* mpt;

Write this as "void *mpt" instead please.

 > +	struct mthca_mpt_entry *mpt_entry;
 > +	u8 *mpt_status;
 > +	int i;
 > +
 > +	mpt = mthca_table_find(dev, dev->mr_table.mpt_table, key);
 > +	if (!mpt)
 > +		return -EINVAL;
 > +
 > +	mpt_status = mpt;
 > +	*mpt_status = MTHCA_MPT_STATUS_SW;
 > +
 > +	wmb();
 > +
 > +	/* This is really dumb. We are rescanning the ICM on
 > +	 * each mpt entry. We want some kind of iterator here.
 > +	 * May be fine meanwhile, while things are small. */
 > +	for (i = 0; i < list_len; ++i) {
 > +		u64 *mtt_entry = mthca_table_find(dev, dev->mr_table.mtt_table,
 > +						  fmr->first_seg + i);
 > +		if (!mtt_entry)
 > +			return -EINVAL;
 > +
 > +		*mtt_entry = cpu_to_be64(page_list[i] | MTHCA_MTT_FLAG_PRESENT);
 > +	}
 > +
 > +
 > +	mpt_entry = mpt;
 > +	mpt_entry->lkey = mpt_entry->key = cpu_to_be32(key);
 > +	mpt_entry->length = cpu_to_be64(((u64)list_len) *
 > +				       	(1 << fmr->attr.page_size));
 > +	mpt_entry->start = cpu_to_be64(iova);
 > +
 > +	wmb();
 > +
 > +	*mpt_status = MTHCA_MPT_STATUS_HW;
 > +
 > +	wmb();
 > +	return 0;
 > +}
 > +
 > +
 > +int mthca_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +		  u64 *page_list, int list_len, u64 iova)
 > +{
 > +	u32 key;
 > +
 > +	if (fmr->maps >= fmr->attr.max_maps) {
 > +		mthca_warn(dev, "Attempt to map fmr %d times, max_maps is %d\n",
 > +			   fmr->maps, fmr->attr.max_maps);
 > +		return -EINVAL;
 > +	}
 > +
 > +	key = key_to_hw_index(dev, fmr->ibmr.lkey) + dev->limits.num_mpts;
 > +	fmr->ibmr.lkey = fmr->ibmr.rkey = hw_index_to_key(dev, key);
 > +	fmr->maps++;

Maybe put this common part in a static function and have
mthca_{arbel,tavor}_fmr_map() call it.  Then we could just set the
map_phys_fmr method in the device struct to the right function at
initialization time.
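
I.e. something like (sketch, names made up):

	static int mthca_check_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
	{
		u32 key;

		if (fmr->maps >= fmr->attr.max_maps)
			return -EINVAL;

		key = key_to_hw_index(dev, fmr->ibmr.lkey) + dev->limits.num_mpts;
		fmr->ibmr.lkey = fmr->ibmr.rkey = hw_index_to_key(dev, key);
		fmr->maps++;

		return 0;
	}

with mthca_{tavor,arbel}_fmr_map() calling it first, and then in
mthca_register_device():

	dev->ib_dev.map_phys_fmr = dev->hca_type == ARBEL_NATIVE ?
		mthca_arbel_map_phys_fmr : mthca_tavor_map_phys_fmr;

(where those two are thin ib_fmr-level wrappers around the map functions).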

 > +
 > +	if (dev->hca_type == ARBEL_NATIVE) {
 > +		return mthca_arbel_fmr_map(dev, fmr, page_list, list_len,
 > +					   iova, key);
 > +	} else {
 > +		mthca_tavor_fmr_map(dev, fmr, page_list, list_len,
 > +					   iova, key);
 > +		return 0;
 > +	}
 > +}
 > +
 > +void mthca_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
 > +{
 > +	if (!fmr->maps) {
 > +		return;
 > +	}

Nitpick -- no { } needed here.

 > +
 > +	fmr->maps = 0;
 > +
 > +	if (dev->hca_type == ARBEL_NATIVE) {
 > +		u32 key = key_to_hw_index(dev, fmr->ibmr.lkey);
 > +		u8 *mpt_status = mthca_table_find(dev, dev->mr_table.mpt_table,
 > +						  key);
 > +		if (!mpt_status)
 > +			return;
 > +
 > +		*mpt_status = MTHCA_MPT_STATUS_SW;
 > +		wmb();
 > +	} else {
 > +		writeb(MTHCA_MPT_STATUS_SW,fmr->mpt);
 > +		wmb();
 > +	}
 >  }
 >  
 > +
 >  int __devinit mthca_init_mr_table(struct mthca_dev *dev)
 >  {
 >  	int err;
 > Index: hw/mthca/mthca_memfree.c
 > ===================================================================
 > --- hw/mthca/mthca_memfree.c	(revision 2012)
 > +++ hw/mthca/mthca_memfree.c	(working copy)
 > @@ -192,6 +192,47 @@ void mthca_table_put(struct mthca_dev *d
 >  	up(&table->mutex);
 >  }
 >  
 > +/* Nonblocking. Callers must make sure the object exists by serializing against
 > + * callers of get/put. */
 > +void *mthca_table_find(struct mthca_dev *dev, struct mthca_icm_table *table,
 > +		       int obj)
 > +{
 > +	int idx, offset, i;
 > +	struct mthca_icm_chunk *chunk;
 > +	struct mthca_icm *icm;
 > +	struct page *page = NULL;
 > +
 > +	/* Supported only for low mem tables for now. */
 > +	if (!table->lowmem)
 > +		return NULL;
 > +
 > +       	idx = (obj & (table->num_obj - 1)) * table->obj_size;
 > +       	icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
 > +	offset = idx % MTHCA_TABLE_CHUNK_SIZE;

What happened to the indentation here?

 > +
 > +	if(!icm)
 > +		return NULL;
 > +
 > +	/* Linear scan of ICM on each access. Since this is called on fmr
 > +	 * registration which is on data path, eventually we may want to
 > +	 * rearrange things to use some kind of tree. */
 > +
 > +	list_for_each_entry(chunk, &icm->chunk_list, list) {
 > +		for (i = 0; i < chunk->npages; ++i) {
 > +			if (chunk->mem[i].length >= offset) {
 > +				page = chunk->mem[i].page;
 > +				break;
 > +			}
 > +			offset -= chunk->mem[i].length;
 > +		}
 > +	}
 > +
 > +	if (!page)
 > +		return NULL;
 > +
 > +	return lowmem_page_address(page) + offset;
 > +}
 > +
 >  int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 >  			  int start, int end)
 >  {


