arm Linux中dma的cache管理

judy 在周四, 12/22/2022 - 14:50 提交

作者：付汉杰 hankf@amd.com，来源：博客园

概述

前两周有人询问DMA下的cache操作和dma-coherent。以前零碎看过代码。临时找，还没有找到。

这两天整理了调用流程，也找到了dma-coherent的用法。Linux的文档里没有详细说明dma-coherent的用法。根据代码，如果dma的设备树里有dma-coherent，Linux则认为硬件会维护cache一致性，不会在dma运行过程中执行cache操作。

dma_map_single/dma_unmap_single的使用

设备驱动里一般调用dma_map_single()/dma_unmap_single()处理cache。调用dma_map_single函数时需要指定DMA的方向，DMA_TO_DEVICE或者DMA_FROM_DEVICE。Linux会根据direction的值invalidate或者clean cache。

drivers\net\ethernet\cadence\macb_main.c的函数macb_tx_map()里，调用dma_map_single()刷新cache，macb_tx_interrupt()的macb_tx_unmap()再调用dma_unmap_single()。

代码简化后如下：

macb_tx_map( )
{
.......
	mapping = dma_map_single(&bp->pdev->dev,
			 skb->data + offset,
			 size, DMA_TO_DEVICE);
	.......		
}
			 
macb_tx_unmap( )
{
	.......				
	 
	dma_unmap_single(&bp->pdev->dev, tx_skb->mapping,
			 tx_skb->size, DMA_TO_DEVICE);
	.......		
}
	 
					 					
gem_rx( )
{
.......
	dma_unmap_single(&bp->pdev->dev, addr,
		bp->rx_buffer_size, DMA_FROM_DEVICE);
	.......		
}					
					
gem_rx_refill()
{
.......
	/* now fill corresponding descriptor entry */
	paddr = dma_map_single(&bp->pdev->dev, skb->data,
					bp->rx_buffer_size,
					DMA_FROM_DEVICE);
	.......		
}

dma_map_single/dma_unmap_single的定义

dma_map_single()和dma_unmap_single（）都在include\linux\dma-mapping.h里定义。如果没有特殊情况，会调用dma_direct_map_page（）、dma_direct_unmap_page（）。
arm64的特殊情况包括iommu和Xen虚拟机。 iommu和Xen虚拟机都需要提供dma_map_ops，于是使用其中的map、unmap函数。iommu的dma_map_ops是drivers\iommu\Dma-iommu.c中定义的iommu_dma_ops。 iommu的dma_map_ops是drivers/xen/swiotlb-xen.c中定义的xen_swiotlb_dma_ops。

#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0)
#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)

static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	debug_dma_map_single(dev, ptr, size);
	return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr),
			size, dir, attrs);
}

static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
}


static inline dma_addr_t dma_map_page_attrs(struct device *dev,
		struct page *page, size_t offset, size_t size,
		enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	dma_addr_t addr;

	BUG_ON(!valid_dma_direction(dir));
	if (dma_is_direct(ops))
		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
	else
		addr = ops->map_page(dev, page, offset, size, dir, attrs);
	debug_dma_map_page(dev, page, offset, size, dir, addr);

	return addr;
}

static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	BUG_ON(!valid_dma_direction(dir));
	if (dma_is_direct(ops))
		dma_direct_unmap_page(dev, addr, size, dir, attrs);
	else if (ops->unmap_page)
		ops->unmap_page(dev, addr, size, dir, attrs);
	debug_dma_unmap_page(dev, addr, size, dir);
}

dma_direct_map_page/dma_direct_unmap_page的定义

dma_direct_map_page()、dma_direct_unmap_page()在kernel\dma\direct.c中定义。

dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
		unsigned long offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	phys_addr_t phys = page_to_phys(page) + offset;
	dma_addr_t dma_addr = phys_to_dma(dev, phys);

	if (unlikely(!dma_direct_possible(dev, dma_addr, size)) &&
	    !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) {
		report_addr(dev, dma_addr, size);
		return DMA_MAPPING_ERROR;
	}

	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		arch_sync_dma_for_device(dev, phys, size, dir);
	return dma_addr;
}
EXPORT_SYMBOL(dma_direct_map_page);


void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	phys_addr_t phys = dma_to_phys(dev, addr);

	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		dma_direct_sync_single_for_cpu(dev, addr, size, dir);

	if (unlikely(is_swiotlb_buffer(phys)))
		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
}
EXPORT_SYMBOL(dma_direct_unmap_page);


void dma_direct_sync_single_for_cpu(struct device *dev,
        dma_addr_t addr, size_t size, enum dma_data_direction dir)
{
    phys_addr_t paddr = dma_to_phys(dev, addr);
 
    if (!dev_is_dma_coherent(dev)) {
        arch_sync_dma_for_cpu(dev, paddr, size, dir);
        arch_sync_dma_for_cpu_all(dev);
    }
 
    if (unlikely(is_swiotlb_buffer(paddr)))
        swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}

一路跟踪，dma_map_single()会最终调用到arch_sync_dma_for_device()， dma_unmap_single()会最终调用到arch_sync_dma_for_cpu()，和arch_sync_dma_for_cpu_all()。而arch_sync_dma_for_cpu_all()对Arm64是空函数。

arch_sync_dma_for_device/arch_sync_dma_for_cpu的定义

arch_sync_dma_for_device/arch_sync_dma_for_cpu的定义在文件arch\arm64\mm\dma-mapping.c中。

void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
		size_t size, enum dma_data_direction dir)
{
	__dma_map_area(phys_to_virt(paddr), size, dir);
}


void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
		size_t size, enum dma_data_direction dir)
{
	__dma_unmap_area(phys_to_virt(paddr), size, dir);
}

__dma_map_area/__dma_unmap_area的定义

__dma_map_area/__dma_unmap_area的定义在文件arch\arm64\mm\cache.S中。

所有汇编实现，也在文件arch\arm64\mm\cache.S中。

/*
 *	__dma_map_area(start, size, dir)
 *	- start	- kernel virtual start address
 *	- size	- size of region
 *	- dir	- DMA direction
 */
ENTRY(__dma_map_area)
	cmp	w2, #DMA_FROM_DEVICE
	b.eq	__dma_inv_area
	b	__dma_clean_area
ENDPIPROC(__dma_map_area)

/*
 *	__dma_unmap_area(start, size, dir)
 *	- start	- kernel virtual start address
 *	- size	- size of region
 *	- dir	- DMA direction
 */
ENTRY(__dma_unmap_area)
	cmp	w2, #DMA_TO_DEVICE
	b.ne	__dma_inv_area
	ret
ENDPIPROC(__dma_unmap_area)

可以看到，map系列函数调用的__dma_map_area，方向如果是DMA_FROM_DEVICE，执行__dma_inv_area；否则执行 __dma_clean_area。
unmap系列函数调用的__dma_unmap_area，方向如果不是DMA_TO_DEVICE，执行__dma_inv_area；否则执行__dma_clean_area。

总结如下：

Operation	map	unmap
DMA_FROM_DEVICE	__dma_inv_area	__dma_inv_area
DMA_TO_DEVICE	__dma_clean_area	__dma_clean_area

__dma_inv_area 完成invalidate操作，丢弃cache数据。它的注释是：

关于Invalidate， ARM的手册"Arm Architecture Reference Manual for A-profile architecture"说明如下：

Invalidate  A cache invalidate instruction ensures that updates made visible by observers that access memory 
at the point to which the invalidate is defined, are made visible to an observer that controls the cache. 
This might result in the loss of updates to the locations affected by the invalidate instruction that 
have been written by observers that access the cache, if those updates have not been cleaned from 
the cache since they were made.
If the address of an entry on which the invalidate instruction operates is Normal, Non-cacheable or 
any type of Device memory then an invalidate instruction also ensures that this address is not 
present in the cache.

__dma_clean_area 完成clean操作，把接收数据更新到DDR。它的的注释是：

 Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
 * 	are cleaned to the PoC.

关于Clean， ARM的手册说明如下：

Clean  A cache clean instruction ensures that updates made by an observer that controls the cache are made 
visible to other observers that can access memory at the point to which the instruction is performed. 
Once the Clean has completed, the new memory values are guaranteed to be visible to the point to 
which the instruction is performed, for example to the Point of Unification.
The cleaning of a cache entry from a cache can overwrite memory that has been written by another 
observer only if the entry contains a location that has been written to by an observer in the 
shareability domain of that memory location.

以太网发送前是 map DMA_TO_DEVICE; 发送后是 unmap DMA_TO_DEVICE；都执行的 __dma_clean_area，把数据从cache更新到DDR。

以太网接收前是 map DMA_FROM_DEVICE，执行的 __dma_inv_area，丢弃cache数据；接收后是unmap DMA_FROM_DEVICE，执行的 __dma_clean_area，把数据从cache更新到DDR。

更新表格：

Operation	map，for_device	unmap，for_cpu
DMA_FROM_DEVICE	接收前，__dma_inv_area	接收后，__dma_inv_area
DMA_TO_DEVICE	发送前，__dma_clean_area	发送后，__dma_clean_area

一直没有想通linux驱动里为什么要做两次cache操作，而且名字也有点费解，先map，再unmap。Standalone的驱动里，只需要发送前flush cashe，接收后invalidate cache；发送后和接收前不需要操作cache。

接收做两次操作还稍微可以理解，可能是别的模块导致旧数据又被加载到cache。发送做两次cache操作很难理解。也许这里面还更改了MMU table的设置。

以前也处理一个问题，arm的预测执行会导致软件完全没有使用的ddr被读取，必须在mmu table里设置对应地址的表项完全无效，才能杜绝这种情况。也许Linux之前遇到一些问题，才改成这样的操作。

dma-coherent

DMA的设备树里可以配置属性"dma-coherent"。

drivers\of\address.c里的of_dma_is_coherent( )读取属性"dma-coherent"。

bool of_dma_is_coherent(struct device_node *np)
{
    struct device_node *node = of_node_get(np);
 
    while (node) {
        if (of_property_read_bool(node, "dma-coherent")) {
            of_node_put(node);
            return true;
        }
        node = of_get_next_parent(node);
    }
    of_node_put(node);
    return false;
}

drivers\of\Device.c中的of_dma_configure( )调用of_dma_is_coherent( )读取属性"dma-coherent"，然后再调用arch_setup_dma_ops( )，保存在变量“dev->dma_coherent”中。

/**
 * of_dma_configure - Setup DMA configuration
 * @dev:	Device to apply DMA configuration
 * @np:		Pointer to OF node having DMA configuration
 * @force_dma:  Whether device is to be set up by of_dma_configure() even if
 *		DMA capability is not explicitly described by firmware.
 *
 * Try to get devices's DMA configuration from DT and update it
 * accordingly.
 *
 * If platform code needs to use its own special DMA configuration, it
 * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events
 * to fix up DMA configuration.
 */
int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
{
    ........

	coherent = of_dma_is_coherent(np);
	dev_dbg(dev, "device is%sdma coherent\n",
		coherent ? " " : " not ");

	iommu = of_iommu_configure(dev, np);
	if (IS_ERR(iommu) && PTR_ERR(iommu) == -EPROBE_DEFER)
		return -EPROBE_DEFER;

	dev_dbg(dev, "device is%sbehind an iommu\n",
		iommu ? " " : " not ");

	arch_setup_dma_ops(dev, dma_addr, size, iommu, coherent);

	return 0;
}

arch\arm64\mm\Dma-mapping.c中的arch_setup_dma_ops( )，把设置保存在变量“dev->dma_coherent”中。

void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
			const struct iommu_ops *iommu, bool coherent)
{
	int cls = cache_line_size_of_cpu();

	WARN_TAINT(!coherent && cls > ARCH_DMA_MINALIGN,
		   TAINT_CPU_OUT_OF_SPEC,
		   "%s %s: ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
		   dev_driver_string(dev), dev_name(dev),
		   ARCH_DMA_MINALIGN, cls);

	dev->dma_coherent = coherent;
	if (iommu)
		iommu_setup_dma_ops(dev, dma_base, size);

#ifdef CONFIG_XEN
	if (xen_initial_domain())
		dev->dma_ops = &xen_swiotlb_dma_ops;
#endif

dev_is_dma_coherent的定义和使用

在dma_direct_map_page和中，调用dev_is_dma_coherent（），检查上述变量dev->dma_coherent，检查是否支持硬件cache同步。如果是，则不进行cache操作。

dev_is_dma_coherent的定义在文件include\linux\dma-noncoherent.h中。

static inline bool dev_is_dma_coherent(struct device *dev)
{
    return dev->dma_coherent;
}

DMA

LINUX

为什么当linux环境中多个MAC在同一子网域内时内核总是通过某一个MAC来发送数据包	【工程师分享】快速运行AMD Xilinx KR260的Linux	Linux_GUI加速(3)——加速模块设计
ZYNQ跑系统系列（四） AXI-DMA的linux下运行	【工程师分享】快速去掉不用的linux kernel模块	在PetaLinux外部创建Linux 模块