[openib-general] [PATCH 3 of 4] IB/mthca: fix non-cache-coherent CPUs with memfree
Michael S. Tsirkin
mst at mellanox.co.il
Sat Feb 10 13:15:08 PST 2007
Fix non-cache-coherent CPUs with memfree HCAs.
We allocate the MTT table with alloc_pages() and then do
pci_map_sg(), so we must call pci_dma_sync_sg after the CPU
writes to the MTT table (this works since device never writes the
MTTs on memfree).
For MPTs, both the device and CPU might write there, so we must
allocate dma coherent memory for these.
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
---
Index: linux-2.6/drivers/infiniband/hw/mthca/mthca_memfree.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ linux-2.6/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -35,6 +35,8 @@
*/
#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <asm/page.h>
#include "mthca_memfree.h"
#include "mthca_dev.h"
@@ -58,22 +60,31 @@ struct mthca_user_db_table {
} page[0];
};
-void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm)
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
{
struct mthca_icm_chunk *chunk, *tmp;
+ void *buf;
int i;
if (!icm)
return;
list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
- if (chunk->nsg > 0)
- pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
- PCI_DMA_BIDIRECTIONAL);
-
- for (i = 0; i < chunk->npages; ++i)
- __free_pages(chunk->mem[i].page,
- get_order(chunk->mem[i].length));
+ if (coherent)
+ for (i = 0; i < chunk->npages; ++i) {
+ buf = lowmem_page_address(chunk->mem[i].page);
+ dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+ buf, sg_dma_address(&chunk->mem[i]));
+ }
+ else {
+ if (chunk->nsg > 0)
+ pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ for (i = 0; i < chunk->npages; ++i)
+ __free_pages(chunk->mem[i].page,
+ get_order(chunk->mem[i].length));
+ }
kfree(chunk);
}
@@ -81,12 +92,41 @@ void mthca_free_icm(struct mthca_dev *de
kfree(icm);
}
+static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+ mem->page = alloc_pages(gfp_mask, order);
+ if (!mem->page)
+ return -ENOMEM;
+
+ mem->length = PAGE_SIZE << order;
+ mem->offset = 0;
+ return 0;
+}
+
+static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+ int order, gfp_t gfp_mask)
+{
+ void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem),
+ gfp_mask);
+ if (!buf)
+ return -ENOMEM;
+
+ sg_set_buf(mem, buf, PAGE_SIZE << order);
+ BUG_ON(mem->offset);
+ sg_dma_len(mem) = PAGE_SIZE << order;
+ return 0;
+}
+
struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
- gfp_t gfp_mask)
+ gfp_t gfp_mask, int coherent)
{
struct mthca_icm *icm;
struct mthca_icm_chunk *chunk = NULL;
int cur_order;
+ int ret;
+
+ /* We use sg_set_buf for coherent allocs, which assumes low memory */
+ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
if (!icm)
@@ -112,21 +152,28 @@ struct mthca_icm *mthca_alloc_icm(struct
while (1 << cur_order > npages)
--cur_order;
- chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
- if (chunk->mem[chunk->npages].page) {
- chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
- chunk->mem[chunk->npages].offset = 0;
+ if (coherent)
+ ret = mthca_alloc_icm_coherent(&dev->pdev->dev,
+ &chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+ else
+ ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
- if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+ if (!ret) {
+ ++chunk->npages;
+
+ if (!coherent && chunk->npages == MTHCA_ICM_CHUNK_LEN) {
chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
chunk->npages,
PCI_DMA_BIDIRECTIONAL);
if (chunk->nsg <= 0)
goto fail;
+ }
+ if (chunk->npages == MTHCA_ICM_CHUNK_LEN)
chunk = NULL;
- }
npages -= 1 << cur_order;
} else {
@@ -136,7 +183,7 @@ struct mthca_icm *mthca_alloc_icm(struct
}
}
- if (chunk) {
+ if (!coherent && chunk) {
chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
chunk->npages,
PCI_DMA_BIDIRECTIONAL);
@@ -148,7 +195,7 @@ struct mthca_icm *mthca_alloc_icm(struct
return icm;
fail:
- mthca_free_icm(dev, icm);
+ mthca_free_icm(dev, icm, coherent);
return NULL;
}
@@ -167,7 +214,7 @@ int mthca_table_get(struct mthca_dev *de
table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
(table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
- __GFP_NOWARN);
+ __GFP_NOWARN, table->coherent);
if (!table->icm[i]) {
ret = -ENOMEM;
goto out;
@@ -175,7 +222,7 @@ int mthca_table_get(struct mthca_dev *de
if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
&status) || status) {
- mthca_free_icm(dev, table->icm[i]);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
table->icm[i] = NULL;
ret = -ENOMEM;
goto out;
@@ -204,16 +251,16 @@ void mthca_table_put(struct mthca_dev *d
mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
&status);
- mthca_free_icm(dev, table->icm[i]);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
table->icm[i] = NULL;
}
mutex_unlock(&table->mutex);
}
-void *mthca_table_find(struct mthca_icm_table *table, int obj)
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle)
{
- int idx, offset, i;
+ int idx, offset, dma_offset, i;
struct mthca_icm_chunk *chunk;
struct mthca_icm *icm;
struct page *page = NULL;
@@ -225,13 +272,22 @@ void *mthca_table_find(struct mthca_icm_
idx = (obj & (table->num_obj - 1)) * table->obj_size;
icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
- offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+ dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE;
if (!icm)
goto out;
list_for_each_entry(chunk, &icm->chunk_list, list) {
for (i = 0; i < chunk->npages; ++i) {
+ if (dma_handle && dma_offset >= 0) {
+ if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+ *dma_handle = sg_dma_address(&chunk->mem[i]) +
+ dma_offset;
+ dma_offset -= sg_dma_len(&chunk->mem[i]);
+ }
+ /* DMA mapping can merge pages but not split them,
+ * so if we found the page, dma_handle has already
+ * been assigned to. */
if (chunk->mem[i].length > offset) {
page = chunk->mem[i].page;
goto out;
@@ -283,7 +339,7 @@ void mthca_table_put_range(struct mthca_
struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
u64 virt, int obj_size,
int nobj, int reserved,
- int use_lowmem)
+ int use_lowmem, int use_coherent)
{
struct mthca_icm_table *table;
int num_icm;
@@ -302,6 +358,7 @@ struct mthca_icm_table *mthca_alloc_icm_
table->num_obj = nobj;
table->obj_size = obj_size;
table->lowmem = use_lowmem;
+ table->coherent = use_coherent;
mutex_init(&table->mutex);
for (i = 0; i < num_icm; ++i)
@@ -314,12 +371,12 @@ struct mthca_icm_table *mthca_alloc_icm_
table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
(use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
- __GFP_NOWARN);
+ __GFP_NOWARN, use_coherent);
if (!table->icm[i])
goto err;
if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE,
&status) || status) {
- mthca_free_icm(dev, table->icm[i]);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
table->icm[i] = NULL;
goto err;
}
@@ -339,7 +396,7 @@ err:
mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
&status);
- mthca_free_icm(dev, table->icm[i]);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
}
kfree(table);
@@ -357,7 +414,7 @@ void mthca_free_icm_table(struct mthca_d
mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
&status);
- mthca_free_icm(dev, table->icm[i]);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
}
kfree(table);
Index: linux-2.6/drivers/infiniband/hw/mthca/mthca_main.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/hw/mthca/mthca_main.c
+++ linux-2.6/drivers/infiniband/hw/mthca/mthca_main.c
@@ -379,7 +379,7 @@ static int mthca_load_fw(struct mthca_de
mdev->fw.arbel.fw_icm =
mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
- GFP_HIGHUSER | __GFP_NOWARN);
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
if (!mdev->fw.arbel.fw_icm) {
mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
return -ENOMEM;
@@ -412,7 +412,7 @@ err_unmap_fa:
mthca_UNMAP_FA(mdev, &status);
err_free:
- mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
return err;
}
@@ -441,7 +441,7 @@ static int mthca_init_icm(struct mthca_d
(unsigned long long) aux_pages << 2);
mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
- GFP_HIGHUSER | __GFP_NOWARN);
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
if (!mdev->fw.arbel.aux_icm) {
mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
return -ENOMEM;
@@ -467,7 +467,8 @@ static int mthca_init_icm(struct mthca_d
mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
MTHCA_MTT_SEG_SIZE,
mdev->limits.num_mtt_segs,
- mdev->limits.reserved_mtts, 1);
+ mdev->limits.reserved_mtts,
+ 1, 0);
if (!mdev->mr_table.mtt_table) {
mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
err = -ENOMEM;
@@ -477,7 +478,8 @@ static int mthca_init_icm(struct mthca_d
mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
dev_lim->mpt_entry_sz,
mdev->limits.num_mpts,
- mdev->limits.reserved_mrws, 1);
+ mdev->limits.reserved_mrws,
+ 1, 1);
if (!mdev->mr_table.mpt_table) {
mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
err = -ENOMEM;
@@ -487,7 +489,8 @@ static int mthca_init_icm(struct mthca_d
mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
dev_lim->qpc_entry_sz,
mdev->limits.num_qps,
- mdev->limits.reserved_qps, 0);
+ mdev->limits.reserved_qps,
+ 0, 0);
if (!mdev->qp_table.qp_table) {
mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
err = -ENOMEM;
@@ -497,7 +500,8 @@ static int mthca_init_icm(struct mthca_d
mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
dev_lim->eqpc_entry_sz,
mdev->limits.num_qps,
- mdev->limits.reserved_qps, 0);
+ mdev->limits.reserved_qps,
+ 0, 0);
if (!mdev->qp_table.eqp_table) {
mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
err = -ENOMEM;
@@ -507,7 +511,7 @@ static int mthca_init_icm(struct mthca_d
mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base,
MTHCA_RDB_ENTRY_SIZE,
mdev->limits.num_qps <<
- mdev->qp_table.rdb_shift,
+ mdev->qp_table.rdb_shift, 0,
0, 0);
if (!mdev->qp_table.rdb_table) {
mthca_err(mdev, "Failed to map RDB context memory, aborting\n");
@@ -518,7 +522,8 @@ static int mthca_init_icm(struct mthca_d
mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
dev_lim->cqc_entry_sz,
mdev->limits.num_cqs,
- mdev->limits.reserved_cqs, 0);
+ mdev->limits.reserved_cqs,
+ 0, 0);
if (!mdev->cq_table.table) {
mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
err = -ENOMEM;
@@ -530,7 +535,8 @@ static int mthca_init_icm(struct mthca_d
mthca_alloc_icm_table(mdev, init_hca->srqc_base,
dev_lim->srq_entry_sz,
mdev->limits.num_srqs,
- mdev->limits.reserved_srqs, 0);
+ mdev->limits.reserved_srqs,
+ 0, 0);
if (!mdev->srq_table.table) {
mthca_err(mdev, "Failed to map SRQ context memory, "
"aborting.\n");
@@ -550,7 +556,7 @@ static int mthca_init_icm(struct mthca_d
mdev->limits.num_amgms,
mdev->limits.num_mgms +
mdev->limits.num_amgms,
- 0);
+ 0, 0);
if (!mdev->mcg_table.table) {
mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
err = -ENOMEM;
@@ -588,7 +594,7 @@ err_unmap_aux:
mthca_UNMAP_ICM_AUX(mdev, &status);
err_free_aux:
- mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
return err;
}
@@ -609,7 +615,7 @@ static void mthca_free_icms(struct mthca
mthca_unmap_eq_icm(mdev);
mthca_UNMAP_ICM_AUX(mdev, &status);
- mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
}
static int mthca_init_arbel(struct mthca_dev *mdev)
@@ -693,7 +699,7 @@ err_free_icm:
err_stop_fw:
mthca_UNMAP_FA(mdev, &status);
- mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
err_disable:
if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
@@ -712,7 +718,7 @@ static void mthca_close_hca(struct mthca
mthca_free_icms(mdev);
mthca_UNMAP_FA(mdev, &status);
- mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
mthca_DISABLE_LAM(mdev, &status);
Index: linux-2.6/drivers/infiniband/hw/mthca/mthca_memfree.h
===================================================================
--- linux-2.6.orig/drivers/infiniband/hw/mthca/mthca_memfree.h
+++ linux-2.6/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -69,6 +69,7 @@ struct mthca_icm_table {
int num_obj;
int obj_size;
int lowmem;
+ int coherent;
struct mutex mutex;
struct mthca_icm *icm[0];
};
@@ -82,17 +83,17 @@ struct mthca_icm_iter {
struct mthca_dev;
struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
- gfp_t gfp_mask);
-void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm);
+ gfp_t gfp_mask, int coherent);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent);
struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
u64 virt, int obj_size,
int nobj, int reserved,
- int use_lowmem);
+ int use_lowmem, int use_coherent);
void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
-void *mthca_table_find(struct mthca_icm_table *table, int obj);
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle);
int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
int start, int end);
void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
Index: linux-2.6/drivers/infiniband/hw/mthca/mthca_mr.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/hw/mthca/mthca_mr.c
+++ linux-2.6/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -315,6 +315,7 @@ void mthca_arbel_write_mtt_seg(struct mt
int start_index, u64 *buffer_list, int list_len)
{
__be64 *mtts;
+ dma_addr_t dma_handle;
int i;
int s = start_index * sizeof (u64);
@@ -324,12 +325,14 @@ void mthca_arbel_write_mtt_seg(struct mt
BUG_ON(s % MTHCA_MTT_SEG_SIZE);
mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg +
- s / MTHCA_MTT_SEG_SIZE);
+ s / MTHCA_MTT_SEG_SIZE, &dma_handle);
BUG_ON(!mtts);
for (i = 0; i < list_len; ++i)
mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT);
+
+ dma_sync_single(&dev->pdev->dev, dma_handle, list_len * sizeof(u64), DMA_TO_DEVICE);
}
int mthca_write_mtt_size(struct mthca_dev *dev)
@@ -602,7 +605,7 @@ int mthca_fmr_alloc(struct mthca_dev *de
if (err)
goto err_out_mpt_free;
- mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key);
+ mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
BUG_ON(!mr->mem.arbel.mpt);
} else
mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
@@ -616,7 +619,8 @@ int mthca_fmr_alloc(struct mthca_dev *de
if (mthca_is_memfree(dev)) {
mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
- mr->mtt->first_seg);
+ mr->mtt->first_seg,
+ &mr->mem.arbel.dma_handle);
BUG_ON(!mr->mem.arbel.mtts);
} else
mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
@@ -790,6 +794,9 @@ int mthca_arbel_map_phys_fmr(struct ib_f
fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
MTHCA_MTT_FLAG_PRESENT);
+ dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+ list_len * sizeof(u64), DMA_TO_DEVICE);
+
fmr->mem.arbel.mpt->key = cpu_to_be32(key);
fmr->mem.arbel.mpt->lkey = cpu_to_be32(key);
fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
Index: linux-2.6/drivers/infiniband/hw/mthca/mthca_provider.h
===================================================================
--- linux-2.6.orig/drivers/infiniband/hw/mthca/mthca_provider.h
+++ linux-2.6/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -89,6 +89,7 @@ struct mthca_fmr {
struct {
struct mthca_mpt_entry *mpt;
__be64 *mtts;
+ dma_addr_t dma_handle;
} arbel;
} mem;
};
--
MST
More information about the general
mailing list