
The Linux Kernel's New Contiguous Memory Allocator (CMA): Avoiding Large Up-Front Memory Reservations

Published: 2014-11-22 09:47:37 | Source: Linux website | Author: 宋宝华 (Barry Song)

When working with ARM and other embedded Linux systems, a long-standing headache is that the GPU, camera, HDMI and similar devices each need a large block of physically contiguous memory. This memory sits idle most of the time, yet the usual approach is to carve it out at boot anyway. Marek Szyprowski and Michal Nazarewicz have now implemented a brand new Contiguous Memory Allocator (CMA). With it, nothing has to be reserved permanently: the pages stay available to the rest of the system and are only handed to the camera, HDMI and other devices when they are actually needed. The rest of this article walks through the basic code flow.


Declaring contiguous memory

During boot, arm_memblock_init() in arch/arm/mm/init.c calls dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

This function lives in drivers/base/dma-contiguous.c:

/**
 * dma_contiguous_reserve() - reserve area for contiguous memory handling
 * @limit: End address of the reserved memory (optional, 0 for any).
 *
 * This function reserves memory from early allocator. It should be
 * called by arch specific code once the early allocator (memblock or bootmem)
 * has been activated and all other subsystems have already allocated/reserved
 * memory.
 */ 
void __init dma_contiguous_reserve(phys_addr_t limit)
{
    unsigned long selected_size = 0;

    pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);

    if (size_cmdline != -1) {
        selected_size = size_cmdline;
    } else {
#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
        selected_size = size_bytes;
#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
        selected_size = cma_early_percent_memory();
#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
        selected_size = min(size_bytes, cma_early_percent_memory());
#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
        selected_size = max(size_bytes, cma_early_percent_memory());
#endif
    }

    if (selected_size) {
        pr_debug("%s: reserving %ld MiB for global area\n", __func__,
                 selected_size / SZ_1M);

        dma_declare_contiguous(NULL, selected_size, 0, limit);
    }
};


Here size_bytes is defined as:

static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;

By default, CMA_SIZE_MBYTES is 16 MB, coming from CONFIG_CMA_SIZE_MBYTES=16.
->

int __init dma_declare_contiguous(struct device *dev, unsigned long size,
                                  phys_addr_t base, phys_addr_t limit)
{
    ...
    /* Reserve memory */
    if (base) {
        if (memblock_is_region_reserved(base, size) ||
            memblock_reserve(base, size) < 0) {
            base = -EBUSY;
            goto err;
        }
    } else {
        /*
         * Use __memblock_alloc_base() since
         * memblock_alloc_base() panic()s.
         */
        phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
        if (!addr) {
            base = -ENOMEM;
            goto err;
        } else if (addr + size > ~(unsigned long)0) {
            memblock_free(addr, size);
            base = -EINVAL;
            goto err;
        } else {
            base = addr;
        }
    }

    /*
     * Each reserved area must be initialised later, when more kernel
     * subsystems (like slab allocator) are available.
     */
    r->start = base;
    r->size = size;
    r->dev = dev;
    cma_reserved_count++;
    pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
            (unsigned long)base);

    /* Architecture specific contiguous memory fixup. */
    dma_contiguous_early_fixup(base, size);
    return 0;
err:
    pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
    return base;
}


As we can see, the contiguous region is itself obtained early in boot, via __memblock_alloc_base().
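In practice, the size of the global CMA area comes either from the Kconfig options quoted above or from the cma= parameter on the kernel command line (which is what fills in size_cmdline). A minimal configuration might look like the following sketch; the option names are the ones referenced in this article and may differ slightly in other kernel versions:

# .config fragment
CONFIG_CMA=y
CONFIG_CMA_SIZE_MBYTES=16
CONFIG_CMA_SIZE_SEL_MBYTES=y

# or override the size at boot time on the kernel command line:
cma=64M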

In addition, the core_initcall() at the bottom of drivers/base/dma-contiguous.c causes cma_init_reserved_areas() to be run later during boot:

static int __init cma_init_reserved_areas(void)
{
    struct cma_reserved *r = cma_reserved;
    unsigned i = cma_reserved_count;

    pr_debug("%s()\n", __func__);

    for (; i; --i, ++r) {
        struct cma *cma;
        cma = cma_create_area(PFN_DOWN(r->start),
                              r->size >> PAGE_SHIFT);
        if (!IS_ERR(cma))
            dev_set_cma_area(r->dev, cma);
    }
    return 0;
}
core_initcall(cma_init_reserved_areas);

cma_create_area() calls cma_activate_area(), which walks the reserved area one pageblock at a time and calls, for the first page of each pageblock:

init_cma_reserved_pageblock(pfn_to_page(base_pfn));
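For reference, cma_activate_area() in this generation of the patch set looks roughly like the sketch below (reconstructed, so details may differ between versions): it steps through the area in pageblock-sized chunks, checks that every pfn is valid and that the whole area lives in one zone, and feeds the first page of each pageblock to init_cma_reserved_pageblock().

static __init int cma_activate_area(unsigned long base_pfn, unsigned long count)
{
    unsigned long pfn = base_pfn;
    unsigned i = count >> pageblock_order;
    struct zone *zone;

    WARN_ON_ONCE(!pfn_valid(pfn));
    zone = page_zone(pfn_to_page(pfn));

    do {
        unsigned j;
        base_pfn = pfn;
        for (j = pageblock_nr_pages; j; --j, pfn++) {
            WARN_ON_ONCE(!pfn_valid(pfn));
            if (page_zone(pfn_to_page(pfn)) != zone)
                return -EINVAL;
        }
        /* hand the first page of this pageblock over as MIGRATE_CMA */
        init_cma_reserved_pageblock(pfn_to_page(base_pfn));
    } while (--i);

    return 0;
}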


init_cma_reserved_pageblock() in turn marks the whole pageblock as MIGRATE_CMA via set_pageblock_migratetype(page, MIGRATE_CMA):

#ifdef CONFIG_CMA
/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
    unsigned i = pageblock_nr_pages;
    struct page *p = page;

    do {
        __ClearPageReserved(p);
        set_page_count(p, 0);
    } while (++p, --i);

    set_page_refcounted(page);
    set_pageblock_migratetype(page, MIGRATE_CMA);
    __free_pages(page, pageblock_order);
    totalram_pages += pageblock_nr_pages;
}
#endif


The __free_pages(page, pageblock_order) call above eventually reaches __free_one_page(page, zone, order, migratetype), and the pages are added to the MIGRATE_CMA free_list:

list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
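It is worth pausing on why typing the pageblocks as MIGRATE_CMA keeps them usable by the rest of the system: in this generation of kernels only movable allocations are allowed to fall back into MIGRATE_CMA pageblocks, so everything that lands there can later be migrated away again when a device asks for the contiguous range. A sketch of the relevant definitions (reconstructed from the mm code of that era; exact contents vary by version):

/* include/linux/mmzone.h (sketch) */
enum {
    MIGRATE_UNMOVABLE,
    MIGRATE_RECLAIMABLE,
    MIGRATE_MOVABLE,
    MIGRATE_PCPTYPES,    /* number of types on the pcp lists */
    MIGRATE_RESERVE = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
    MIGRATE_CMA,         /* only movable allocations may fall back here */
#endif
    MIGRATE_ISOLATE,     /* can't allocate from here */
    MIGRATE_TYPES
};

/* mm/page_alloc.c (sketch): which other free_lists an allocation may steal from */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
#ifdef CONFIG_CMA
    [MIGRATE_MOVABLE]     = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_CMA]         = { MIGRATE_RESERVE },  /* never used */
#else
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },  /* never used */
    [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE },  /* never used */
};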


Allocating contiguous memory

Contiguous memory is still allocated through the standard dma_alloc_coherent() and dma_alloc_writecombine() defined in arch/arm/mm/dma-mapping.c; both end up, indirectly, in dma_alloc_from_contiguous() in drivers/base/dma-contiguous.c, shown below.
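From a driver's point of view nothing changes. A typical call is sketched here; dev stands for the driver's struct device and the 4 MiB size is just an example:

/* sketch: ask for a 4 MiB physically contiguous, write-combined buffer */
dma_addr_t phys;
void *virt = dma_alloc_writecombine(dev, 4 * SZ_1M, &phys, GFP_KERNEL);
if (!virt)
    return -ENOMEM;
/* give 'phys' to the device, use 'virt' from the CPU, free with dma_free_writecombine() */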

struct page *dma_alloc_from_contiguous(struct device *dev, int count,
                                       unsigned int align)
{
    ...
    for (;;) {
        pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
                                            start, count, mask);
        if (pageno >= cma->count) {
            ret = -ENOMEM;
            goto error;
        }

        pfn = cma->base_pfn + pageno;
        ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
        if (ret == 0) {
            bitmap_set(cma->bitmap, pageno, count);
            break;
        } else if (ret != -EBUSY) {
            goto error;
        }
        pr_debug("%s(): memory range at %p is busy, retrying\n",
                 __func__, pfn_to_page(pfn));
        /* try again with a bit different memory target */
        start = pageno + mask + 1;
    }
    ...

->

int alloc_contig_range(unsigned long start, unsigned long end,
   unsigned migratetype)


alloc_contig_range() has to isolate the pages first; the purpose of this isolation is spelled out by the comment in the code:

/*
 * What we do here is we mark all pageblocks in range as
 * MIGRATE_ISOLATE.  Because of the way page allocator work, we
 * align the range to MAX_ORDER pages so that page allocator
 * won't try to merge buddies from different pageblocks and
 * change MIGRATE_ISOLATE to some other migration type.
 *
 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
 * migrate the pages from an unaligned range (ie. pages that
 * we are interested in).  This will put all the pages in
 * range back to page allocator as MIGRATE_ISOLATE.
 *
 * When this is done, we take the pages in range from page
 * allocator removing them from the buddy system.  This way
 * page allocator will never consider using them.
 *
 * This lets us mark the pageblocks back as
 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
 * MAX_ORDER aligned range but not in the unaligned, original
 * range are put back to page allocator so that buddy can use
 * them. 
 */
 
ret = start_isolate_page_range(pfn_align_to_maxpage_down(start), 
   pfn_align_to_maxpage_up(end), 
   migratetype);


Put simply, the pageblocks in question are marked MIGRATE_ISOLATE so that the buddy system will stop handing them out.

/*  
 * start_isolate_page_range() -- make page-allocation-type of range of pages
 * to be MIGRATE_ISOLATE.
 * @start_pfn: The lower PFN of the range to be isolated.
 * @end_pfn: The upper PFN of the range to be isolated.
 * @migratetype: migrate type to set in error recovery.
 *
 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
 * the range will never be allocated. Any free pages and pages freed in the
 * future will not be allocated again.
 *
 * start_pfn/end_pfn must be aligned to pageblock_order.
 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
 */ 
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
                             unsigned migratetype)
{
    unsigned long pfn;
    unsigned long undo_pfn;
    struct page *page;

    BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
    BUG_ON((end_pfn) & (pageblock_nr_pages - 1));

    for (pfn = start_pfn;
         pfn < end_pfn;
         pfn += pageblock_nr_pages) {
        page = __first_valid_page(pfn, pageblock_nr_pages);
        if (page && set_migratetype_isolate(page)) {
            undo_pfn = pfn;
            goto undo;
        }
    }
    return 0;
undo:
    for (pfn = start_pfn;
         pfn < undo_pfn;
         pfn += pageblock_nr_pages)
        unset_migratetype_isolate(pfn_to_page(pfn), migratetype);

    return -EBUSY;
}


Next, __alloc_contig_migrate_range() is called to isolate and migrate the pages currently in use within the range:

static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
{
    /* This function is based on compact_zone() from compaction.c. */

    unsigned long pfn = start;
    unsigned int tries = 0;
    int ret = 0;

    struct compact_control cc = {
        .nr_migratepages = 0,
        .order = -1,
        .zone = page_zone(pfn_to_page(start)),
        .sync = true,
    };
    INIT_LIST_HEAD(&cc.migratepages);

    migrate_prep_local();

    while (pfn < end || !list_empty(&cc.migratepages)) {
        if (fatal_signal_pending(current)) {
            ret = -EINTR;
            break;
        }

        if (list_empty(&cc.migratepages)) {
            cc.nr_migratepages = 0;
            pfn = isolate_migratepages_range(cc.zone, &cc,
                                             pfn, end);
            if (!pfn) {
                ret = -EINTR;
                break;
            }
            tries = 0;
        } else if (++tries == 5) {
            ret = ret < 0 ? ret : -EBUSY;
            break;
        }

        ret = migrate_pages(&cc.migratepages,
                            __alloc_contig_migrate_alloc,
                            0, false, true);
    }

    putback_lru_pages(&cc.migratepages);
    return ret > 0 ? 0 : ret;
}


migrate_pages() does the actual migration. During the process it calls the __alloc_contig_migrate_alloc() callback passed in above to allocate replacement pages, and the contents of the old pages are copied over to the new ones:

int migrate_pages(struct list_head *from,
                  new_page_t get_new_page, unsigned long private, bool offlining,
                  bool sync)
{
    int retry = 1;
    int nr_failed = 0;
    int pass = 0;
    struct page *page;
    struct page *page2;
    int swapwrite = current->flags & PF_SWAPWRITE;
    int rc;

    if (!swapwrite)
        current->flags |= PF_SWAPWRITE;

    for (pass = 0; pass < 10 && retry; pass++) {
        retry = 0;

        list_for_each_entry_safe(page, page2, from, lru) {
            cond_resched();

            rc = unmap_and_move(get_new_page, private,
                                page, pass > 2, offlining,
                                sync);

            switch (rc) {
            case -ENOMEM:
                goto out;
            case -EAGAIN:
                retry++;
                break;
            case 0:
                break;
            default:
                /* Permanent failure */
                nr_failed++;
                break;
            }
        }
    }
    rc = 0;
out:
    ...
}
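The get_new_page callback used here, __alloc_contig_migrate_alloc(), is essentially a thin wrapper around alloc_page(); roughly (again a sketch of that era's mm/page_alloc.c, so details may differ):

static struct page *
__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
                             int **resultp)
{
    gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;

    /* keep highmem pages in highmem */
    if (PageHighMem(page))
        gfp_mask |= __GFP_HIGHMEM;

    return alloc_page(gfp_mask);
}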

The key helper is unmap_and_move(), defined in mm/migrate.c:

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */ 
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                          struct page *page, int force, bool offlining, bool sync)
{
    int rc = 0;
    int *result = NULL;
    struct page *newpage = get_new_page(page, private, &result);
    int remap_swapcache = 1;
    int charge = 0;
    struct mem_cgroup *mem = NULL;
    struct anon_vma *anon_vma = NULL;

    ...

    /* charge against new page */
    charge = mem_cgroup_prepare_migration(page, newpage, &mem);
    ...

    if (PageWriteback(page)) {
        if (!force || !sync)
            goto uncharge;
        wait_on_page_writeback(page);
    }
    /*
     * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
     * we cannot notice that anon_vma is freed while we migrates a page.
     * This get_anon_vma() delays freeing anon_vma pointer until the end
     * of migration. File cache pages are no problem because of page_lock()
     * File Caches may use write_page() or lock_page() in migration, then,
     * just care Anon page here.
     */
    if (PageAnon(page)) {
        /*
         * Only page_lock_anon_vma() understands the subtleties of
         * getting a hold on an anon_vma from outside one of its mms.
         */
        anon_vma = page_lock_anon_vma(page);
        if (anon_vma) {
            /*
             * Take a reference count on the anon_vma if the
             * page is mapped so that it is guaranteed to
             * exist when the page is remapped later
             */
            get_anon_vma(anon_vma);
            page_unlock_anon_vma(anon_vma);
        } else if (PageSwapCache(page)) {
            /*
             * We cannot be sure that the anon_vma of an unmapped
             * swapcache page is safe to use because we don't
             * know in advance if the VMA that this page belonged
             * to still exists. If the VMA and others sharing the
             * data have been freed, then the anon_vma could
             * already be invalid.
             *
             * To avoid this possibility, swapcache pages get
             * migrated but are not remapped when migration
             * completes
             */
            remap_swapcache = 0;
        } else {
            goto uncharge;
        }
    }

    ...
    /* Establish migration ptes or remove ptes */
    try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:
    if (!page_mapped(page))
        rc = move_to_new_page(newpage, page, remap_swapcache);

    if (rc && remap_swapcache)
        remove_migration_ptes(page, page);

    /* Drop an anon_vma reference if we took one */
    if (anon_vma)
        drop_anon_vma(anon_vma);

uncharge:
    if (!charge)
        mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
    unlock_page(page);

move_newpage:
    ...
}


Through unmap_and_move(), the contents of the old pages are migrated over to the newly allocated ones.

Next comes page reclaim. Its purpose is to make sure that grabbing a large contiguous chunk does not leave the system starved of memory:

->

/*
 * Reclaim enough pages to make sure that contiguous allocation
 * will not starve the system.
 */ 
__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 

->

/*
 * Trigger memory pressure bump to reclaim some pages in order to be able to
 * allocate 'count' pages in single page units. Does similar work as
 * __alloc_pages_slowpath() function.
 */
static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    struct zonelist *zonelist = node_zonelist(0, gfp_mask);
    int did_some_progress = 0;
    int order = 1;
    unsigned long watermark;

    /*
     * Increase level of watermarks to force kswapd do his job
     * to stabilise at new watermark level.
     */
    __update_cma_watermarks(zone, count);

    /* Obey watermarks as if the page was being allocated */
    watermark = low_wmark_pages(zone) + count;
    while (!zone_watermark_ok(zone, 0, watermark, 0, 0)) {
        wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));

        did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
                                              NULL);
        if (!did_some_progress) {
            /* Exhausted what can be done so it's blamo time */
            out_of_memory(zonelist, gfp_mask, order, NULL);
        }
    }

    /* Restore original watermark levels. */
    __update_cma_watermarks(zone, -count);

    return count;
}


Releasing contiguous memory
Freeing the memory is straightforward; the path is simply:

arch/arm/mm/dma-mapping.c:

void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle) 

->

arch/arm/mm/dma-mapping.c:

static void __free_from_contiguous(struct device *dev, struct page *page,
                                   size_t size)
{
    __dma_remap(page, size, pgprot_kernel);
    dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
}

->

bool dma_release_from_contiguous(struct device *dev, struct page *pages,
                                 int count)
{
    ...
    free_contig_range(pfn, count);
    ...
}

void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
    for (; nr_pages--; ++pfn)
        __free_page(pfn_to_page(pfn));
}


This hands the pages straight back to the buddy allocator.

The migratetype of kernel memory allocations

Kernel memory allocations carry GFP_ flags, and these flags are converted into a migratetype:

static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
    WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

    if (unlikely(page_group_by_mobility_disabled))
        return MIGRATE_UNMOVABLE;

    /* Group based on mobility */
    return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
        ((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
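Bit 0 of the result comes from __GFP_RECLAIMABLE and bit 1 from __GFP_MOVABLE, so with MIGRATE_UNMOVABLE = 0, MIGRATE_RECLAIMABLE = 1 and MIGRATE_MOVABLE = 2 (the ordering used in this kernel generation) the common cases map as follows:

/* a few examples of the GFP -> migratetype mapping */
allocflags_to_migratetype(GFP_KERNEL);                    /* 0b00 -> MIGRATE_UNMOVABLE   */
allocflags_to_migratetype(GFP_NOFS | __GFP_RECLAIMABLE);  /* 0b01 -> MIGRATE_RECLAIMABLE */
allocflags_to_migratetype(GFP_HIGHUSER_MOVABLE);          /* 0b10 -> MIGRATE_MOVABLE     */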


Later, when memory is actually allocated, the allocator picks from the free_list that matches this migratetype:

page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 
preferred_zone, migratetype);


Finally, the author has written a small test module that can be used to exercise CMA at any time:

/*
 * kernel module helper for testing CMA
 *
 * Licensed under GPLv2 or later.
 */ 
 
#include <linux/module.h>  
#include <linux/device.h>  
#include <linux/fs.h>  
#include <linux/miscdevice.h>  
#include <linux/dma-mapping.h>  
 
#define CMA_NUM  10  
static struct device *cma_dev; 
static dma_addr_t dma_phys[CMA_NUM]; 
static void *dma_virt[CMA_NUM]; 
 
/* any read request will free coherent memory, eg.
 * cat /dev/cma_test
 */ 
static ssize_t
cma_test_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
    int i;

    for (i = 0; i < CMA_NUM; i++) {
        if (dma_virt[i]) {
            dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
            _dev_info(cma_dev, "free virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);
            dma_virt[i] = NULL;
            break;
        }
    }
    return 0;
}

/*
 * any write request will alloc coherent memory, eg.
 * echo 0 > /dev/cma_test
 */
static ssize_t
cma_test_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
    int i;
    int ret;

    for (i = 0; i < CMA_NUM; i++) {
        if (!dma_virt[i]) {
            dma_virt[i] = dma_alloc_coherent(cma_dev, (i + 1) * SZ_1M, &dma_phys[i], GFP_KERNEL);

            if (dma_virt[i]) {
                void *p;
                /* touch every page in the allocated memory */
                for (p = dma_virt[i]; p < dma_virt[i] + (i + 1) * SZ_1M; p += PAGE_SIZE)
                    *(u32 *)p = 0;

                _dev_info(cma_dev, "alloc virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);
            } else {
                dev_err(cma_dev, "no mem in CMA area\n");
                ret = -ENOMEM;
            }
            break;
        }
    }

    return count;
}

static const struct file_operations cma_test_fops = {
    .owner = THIS_MODULE,
    .read  = cma_test_read,
    .write = cma_test_write,
};

static struct miscdevice cma_test_misc = {
    .name = "cma_test",
    .fops = &cma_test_fops,
};

static int __init cma_test_init(void)
{
    int ret = 0;

    ret = misc_register(&cma_test_misc);
    if (unlikely(ret)) {
        pr_err("failed to register cma test misc device!\n");
        return ret;
    }
    cma_dev = cma_test_misc.this_device;
    cma_dev->coherent_dma_mask = ~0;
    _dev_info(cma_dev, "registered.\n");

    return ret;
}
module_init(cma_test_init);

static void __exit cma_test_exit(void)
{
    misc_deregister(&cma_test_misc);
}
module_exit(cma_test_exit);
 
MODULE_LICENSE("GPL"); 
MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>"); 
MODULE_DESCRIPTION("kernel module to help the test of CMA"); 
MODULE_ALIAS("CMA test");
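
To try the module on a CMA-enabled kernel, an ordinary out-of-tree module build is enough. Assuming the source above is saved as cma_test.c, a minimal Makefile could look like this (for an ARM target, add the usual ARCH= and CROSS_COMPILE= variables):

obj-m += cma_test.o

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean

After insmod cma_test.ko, the misc device shows up as /dev/cma_test, and every allocation or free is logged with its virtual and physical address.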


To allocate memory:

# echo 0 > /dev/cma_test 

To free memory:

# cat /dev/cma_test
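
To watch the effect from user space, the kernel log shows each buffer's addresses, and on kernels built with CONFIG_CMA the per-migratetype free page counts (including the CMA pageblocks) can be seen changing in:

# cat /proc/pagetypeinfo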