Each DMA transfer is 12 MB. A test of 1000 loops took 304.715996480 s, which is only about 39.38 MB/s?
The snippet is:
/* Size in bytes (12 MiB) of each IOMMU mapping and of each DMA memcpy transfer. */
#define BUFF_SIZE (12 * 1024 * 1024)
static int do_memcpy_with_dma_iommu(void)
{
struct dma_device *dev;
struct dma_chan *chan = NULL;
dma_cap_mask_t mask;
struct device *cdev;
struct iommu_domain *domain;
dma_addr_t dst_iova;
dma_addr_t src_iova;
struct dma_async_tx_descriptor *tx = NULL;
dma_cookie_t dma_cookie;
int i;
int ret;
ktime_t k0, k1;
dma_cap_zero(mask);
dma_cap_set(DMA_MEMCPY, mask);
chan = dma_request_channel(mask, NULL, NULL);
if(NULL == chan )
{
printk("request channel fail\n");
return -1;
}
dev = chan->device;
cdev = dev->dev;
domain = iommu_get_domain_for_dev(cdev);
printk("%s domain %p\n", __func__, domain);
if (!domain)
return 0;
dst_iova = iommu_dma_alloc_iova(cdev, BUFF_SIZE,
cdev->coherent_dma_mask);
if (!dst_iova) {
dev_err(cdev, "dst iommu_dma_alloc_iova() failed\n");
goto out;
}
dev_info(cdev, "dst IOVA: 0x%08llx\n", dst_iova);
ret = iommu_map(domain, dst_iova,
dst_phys,
BUFF_SIZE, IOMMU_READ | IOMMU_WRITE);
src_iova = iommu_dma_alloc_iova(cdev, BUFF_SIZE,
cdev->coherent_dma_mask);
if (!src_iova) {
dev_err(cdev, "src iommu_dma_alloc_iova() failed\n");
goto out;
}
dev_info(cdev, "src IOVA: 0x%08llx\n", src_iova);
ret = iommu_map(domain, src_iova,
src_phys,
BUFF_SIZE, IOMMU_READ | IOMMU_WRITE);
k0 = ktime_get();
for(i = 0; i < 1000; i++)
{
dma_finished = 0;
//tx = dev->device_prep_dma_memcpy(chan, dst_phys, src_phys, BUFF_SIZE, DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
tx = dev->device_prep_dma_memcpy(chan, dst_iova, src_iova, BUFF_SIZE, DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
if(NULL == tx)
{
printk("prep_dma_memcpy fail\n");
dma_release_channel(chan);
return -1;
}
tx->callback = tx_callback;
dma_cookie = dmaengine_submit(tx);
if (dma_submit_error(dma_cookie))
{
printk("submit fail\n");
}
dma_async_issue_pending(chan);
wait_event_interruptible(wq, dma_finished);
}
k1 = ktime_get();
printk("%s cost: d1=%lld \n", __func__, (k1.tv64 - k0.tv64));
dma_release_channel(chan);
out:
return 0;
}
Any ideas? Thanks.