您的位置：首页 > 其它

内存管理-----伙伴系统---2

2016-05-07 23:53 323 查看

伙伴系统分配器大体上分为两类。__get_free_pages()类函数返回分配的第一个页面的线性地址；alloc_pages()类函数返回页面描述符地址。不管以哪种函数进行分配，最终会调用alloc_pages()进行分配页面；

alloc_pages()最终会调用统一的接口：__alloc_pages_nodemask()

* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);/*根据gfp_mask确定分配页所处的管理区*/  
struct zone *preferred_zone;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask); /*根据gfp_mask得到迁移类分配页的型*/  
unsigned int cpuset_mems_cookie;

gfp_mask &= gfp_allowed_mask;

lockdep_trace_alloc(gfp_mask);

might_sleep_if(gfp_mask & __GFP_WAIT);/*如果__GFP_WAIT标志设置了，需要等待和重新调度*/

if (should_fail_alloc_page(gfp_mask, order))
return NULL;

/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
* of GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;

retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();

/* The preferred zone is used for statistics later */ //从zonelist中找到zone_idx与high_zoneidx相同的管理区，
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;

/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
preferred_zone, migratetype);
if (unlikely(!page))/*第一次分配失败的话则会用通过一条低速路径来进行第二次分配，包括唤醒页换出守护进程等等*/  
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);

trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;

return page;

首先要做的就是找到指定的分配管理区，管理区的编号保存在high_zoneidx中

然后就是尝试第一次分配，流程是从指定的管理区开始扫描管理区-->找到充足的管理区-->从指定的迁移类型链表中分配内存-->如果在指定迁移类型中找不到则到其他的迁移类型中去寻找

如果第二步在各个区域都找不到可以满足分配的内存了，那么说明管理区的内存已经确实不够了，于是开始启用一条慢速的途径来分配，包括尝试去换出一些不经常使用的页等等

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 *
 * Returns the page obtained from the first zone that passes the checks
 * below and can satisfy the request, or NULL when no zone could.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);	/* index of the preferred zone */
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 *
	 * Every zone is visited until one with enough free space is found.
	 * Article author's example: if high_zoneidx corresponds to
	 * ZONE_HIGHMEM the order is HIGHMEM --> NORMAL --> DMA; if it
	 * corresponds to ZONE_NORMAL the order is NORMAL --> DMA.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		/* Never taken on UMA builds, where NUMA_BUILD is false. */
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
		/*
		 * When ALLOC_CPUSET is requested, skip zones the cpuset
		 * check rejects for this allocation.
		 */
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				continue;
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a zone that is within its dirty
		 * limit, such that no single zone holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the zone's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * This may look like it could increase pressure on
		 * lower zones by failing allocations in higher zones
		 * before they are full.  But the pages that do spill
		 * over are limited as the lower zones are protected
		 * by this very same mechanism.  It should not become
		 * a practical burden to them.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-zone dirty limit in the slowpath
		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * zones are together not big enough to reach the
		 * global limit.  The proper fix for these situations
		 * will require awareness of zones in the
		 * dirty-throttling and the flusher threads.
		 */
		if ((alloc_flags & ALLOC_WMARK_LOW) &&
		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
			goto this_zone_full;

		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			int ret;

			/*
			 * alloc_flags selects which watermark applies
			 * (min, low or high).  The allocation may proceed
			 * only if the zone passes the check against the
			 * selected watermark.
			 */
			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			/* Watermark is fine: allocate from this zone. */
			if (zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags))
				goto try_this_zone;

			if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
				/*
				 * we do zlc_setup if there are multiple nodes
				 * and before considering the first zone allowed
				 * by the cpuset.
				 */
				allowednodes = zlc_setup(zonelist, alloc_flags);
				zlc_active = 1;
				did_zlc_setup = 1;
			}

			/*
			 * The watermark check failed and zone reclaim is
			 * not enabled: give up on this zone.
			 */
			if (zone_reclaim_mode == 0)
				goto this_zone_full;

			/*
			 * As we may have just activated ZLC, check if the first
			 * eligible zone has failed zone_reclaim recently.
			 */
			if (NUMA_BUILD && zlc_active &&
				!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;

			ret = zone_reclaim(zone, gfp_mask, order);
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (!zone_watermark_ok(zone, order, mark,
						classzone_idx, alloc_flags))
					goto this_zone_full;
			}
		}

try_this_zone:
		/*
		 * This zone passed the checks.  buffered_rmqueue() tries the
		 * per-cpu lists first (order 0) and otherwise the buddy lists.
		 */
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
	}

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}

	if (page)
		/*
		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
		 * necessary to allocate the page. The expectation is
		 * that the caller is taking steps that will free more
		 * memory. The caller should avoid the page being used
		 * for !PFMEMALLOC purposes.
		 */
		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

	return page;
}

从指定的管理区开始按照zonelist中定义的顺序来遍历管理区
如果该管理区的水位线正常，则调用buffered_rmqueue()在该管理区中分配
如果管理区的水位线过低，则在NUMA架构下会申请页面回收

*
* Really, prep_compound_page() should be called from __rmqueue_bulk().  But
* we cheat by calling it from here, in the order > 0 path.  Saves a branch
* or two.
*//*先考虑从pcp中分配空间，当order大于0时再考虑从伙伴系统中分配*/ 
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);//cold  or  hot

again:
if (likely(order == 0)) {/*order为0，即要求分配一个页*/  
struct per_cpu_pages *pcp;
struct list_head *list;

local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp; /*找到zone对应的cpu的pcp*/  
list = &pcp->lists[migratetype];/*获取和迁移类型对应的链表*/  
if (list_empty(list)) {  /*如果链表为空，则表示没有可分配的页，需要补充，从伙伴系统中获得batch个页面给list*/  
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
goto failed;
}

if (cold)/*如果是需要冷页，则从链表的尾部获取*/  
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);

list_del(&page->lru);
pcp->count--;
} else {/*当order为大于1时，不从pcp中分配，直接考虑从伙伴系统中分配*/  
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);/* 这里仅仅打开自旋锁，待后面统计计数设置完毕后再开中断*/  
if (!page)
goto failed;
<span>			/* 已经分配了1 << order个页面，这里进行管理区空闲页面统计计数*/  </span>
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}

__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);/*恢复中断*/  
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags)) /* 这里进行安全性检查，并进行一些善后工作。如果页面标志破坏，返回的页面出现了问题，则返回试图分配其他页面*/  
goto again;
return page;

failed:
local_irq_restore(flags);
return NULL;

该函数分两种情况进行处理，一种是只要求分配单个页框，另一种是要求分配多个连续页框

对于单个页面，内核选择从每CPU页框高速缓存中分配，它的核心描述结构也是MIGRATE_TYPES个链表，只不过链表中的元素都是单个页。这些页分为热页和冷页，所谓热页就是还处在CPU高速缓存中的页，相反，冷页就是不存在于高速缓存中的页。对于单个页框的申请，分配热页可以提高效率。需要注意的是，越靠近链表头的页越热，越靠近链表尾的页越冷，因为每次释放单个页框的时候，页框是插入到链表的头部的，也就是说靠近头部的页框是最近才释放的，因此最有可能存在于高速缓存当中

对于连续的页框分配，通过调用__rmqueue()来完成分配

*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency.  Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/*该函数返回的是1<<order个页面，但是在pcp 
 处理中调用，其他地方没看到，order为0 
  也就是说返回的是页面数，加入的链表为 
  对应调用pcp的链表*/  
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
{
int mt = migratetype, i;

spin_lock(&zone->lock);/* 上层函数已经关了中断，这里需要操作管理区，获取管理区的自旋锁 */  
for (i = 0; i < count; ++i) {/* 重复指定的次数，从伙伴系统中分配页面*/ 
struct page *page = __rmqueue(zone, order, migratetype);  /* 从伙伴系统中取出页面 */  
if (unlikely(page == NULL))
break;

/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
* list and the list head then moves forward. From the callers
* perspective, the linked list is ordered by page number in
* some conditions. This is useful for IO devices that can
* merge IO requests if the physical pages are ordered
* properly.
*//*根据调用者的要求，将页面放到每CPU缓存链表的头部或者尾部*/
if (likely(cold == 0))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
if (IS_ENABLED(CONFIG_CMA)) {
mt = get_pageblock_migratetype(page);
if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
mt = migratetype;
}
set_page_private(page, mt);/*设置private属性为页面的迁移类型*/ 
list = &page->lru;
}
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));/*递减管理区的空闲页面计数*/
spin_unlock(&zone->lock);/*释放管理区的子璇锁*/
return i;

/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;

retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);/*从指定order开始从小到达遍历,优先从指定的迁移类型链表中分配页面*/
<span>		</span>/* 
         * 如果满足以下两个条件,就从备用链表中分配页面: 
         *        快速流程没有分配到页面,需要从备用迁移链表中分配. 
         *        当前不是从保留的链表中分配.因为保留的链表是最后可用的链表, 
             *  不能从该链表分配的话,说明本管理区真的没有可用内存了. 
         */   
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
page = __rmqueue_fallback(zone, order, migratetype); /*order从大到小遍历，从备用（从各种迁移类型中寻找）链表中分配页面*/  

/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
* is used because __rmqueue_smallest is an inline function
* and we want just one call site
*/
if (!page) {
migratetype = MIGRATE_RESERVE;/* 备用链表中没有分配到页面,从保留链表中分配页面了 更换迁移类型 */  
goto retry_reserve;
}
}

trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}

从指定的迁移类型链表中分配页面

从指定order开始从小到大遍历,优先从指定的迁移类型链表中分配页面

/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
 /*从给定的order开始，从小到大遍历； 
  找到后返回页面基址，合并分割后2^h-2^k的空间*/
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area * area;
struct page *page;

/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
continue;
<span>		 /*对应的链表不空，得到链表中数据*/  </span>
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
<span>			/*进行</span>合并2^h-2^k空间<span>(在current_order>order的情况下)*/  </span>
expand(zone, page, order, current_order, area, migratetype); /*合并2^h-2^k空间*/  
return page;
}

return NULL;
}

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 *
 * Used after a block of order 'high' has been taken to satisfy a request
 * of order 'low': the block is halved repeatedly, and each upper half is
 * put on the free list of its own order, until only a block of order
 * 'low' remains at 'page'.  'area' must be the free_area for order 'high'
 * on entry.
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long span = 1 << high;

	while (low < high) {
		struct page *upper;

		/* Step down one order: matching free_area, half the size. */
		--area;
		--high;
		span >>= 1;
		upper = &page[span];
		VM_BUG_ON(bad_range(zone, upper));

#ifdef CONFIG_DEBUG_PAGEALLOC
		if (high < debug_guardpage_minorder()) {
			/*
			 * Mark as guard pages (or page), that will allow to
			 * merge back to allocator when buddy will be freed.
			 * Corresponding page table entries will not be touched,
			 * pages will stay not present in virtual address space
			 */
			INIT_LIST_HEAD(&upper->lru);
			set_page_guard_flag(upper);
			set_page_private(upper, high);
			/* Guard pages are not available for any usage */
			__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
			continue;
		}
#endif
		/* Hand the upper half back to the free list of its order. */
		list_add(&upper->lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(upper, high);
	}
}

从备用链表中分配页面

/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area * area;
int current_order;
struct page *page;
int migratetype, i;

/* Find the largest possible block of pages in the other list */ /* 从最高阶搜索,这样可以尽量的将其他迁移列表中的大块分割,避免形成过多的碎片 */  
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];   /*回调到下一个migratetype*/  
<span style="white-space:pre">			 /* 本函数不处理MIGRATE_RESERVE类型的迁移链表,如果本函数返回NULL, 
            则上层函数直接从MIGRATE_RESERVE中分配 */  </span>
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
break;

area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))  /*如果指定order和类型的链表为空*/  
continue;
<span style="white-space:pre">			  /*得到指定类型和order的页面基址*/  </span>
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;

/*
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
* aggressive about taking ownership of free pages
*
* On the other hand, never change migration
* type of MIGRATE_CMA pageblocks nor move CMA
* pages on different free lists. We don't
* want unmovable pages to be allocated from
* MIGRATE_CMA areas.
*/             
if (!is_migrate_cma(migratetype) &&
(unlikely(current_order >= pageblock_order / 2) ||/* 要分割的页面是一个大页面,则将整个页面全部迁移到当前迁移类型的链表中, 
                这样可以避免过多的碎片 */  
start_migratetype == MIGRATE_RECLAIMABLE ||/* 目前分配的是可回收页面,这类页面有突发的特点,将页面全部迁移到可回收链表中, 
                可以避免将其他迁移链表分割成太多的碎片 */    
page_group_by_mobility_disabled)) {/* 指定了迁移策略,总是将被分割的页面迁移 */
int pages;
pages = move_freepages_block(zone, page,start_migratetype);   /*移动到先前类型的伙伴系统中*/  

/* Claim the whole block if over half of it is free */ /* pages是移动的页面数,如果可移动的页面数量较多, 
                则将整个大内存块的迁移类型修改 */  
if (pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page,start_migratetype); /*设置页面标示*/  

migratetype = start_migratetype;
}

/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);

/* Take ownership for orders >= pageblock_order *///大于pageblock_order的部分设置相应标示  
if (current_order >= pageblock_order &&
!is_migrate_cma(migratetype))
change_pageblock_range(page, current_order,
start_migratetype);

expand(zone, page, order, current_order, area,
is_migrate_cma(migratetype)
? migratetype : start_migratetype); /*拆分和合并*/  

trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);

return page;
}
}

return NULL;
}

备用链表

/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
/* When the free list of the requested type is empty, the row for that type
 * gives the order in which the other types' free lists are tried. */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
};

移动到指定类型的伙伴系统中

/*
 * 将指定区域段的页面移动到指定类型的伙伴系统中，其实就是将页面的类型做了更改，但采用的是移动的方式。
 * 功能和下面的move_freepages()类似，但要求按页面块（pageblock）对齐。
 */
/*
 * Move the free pages of the pageblock that contains 'page' to the free
 * lists of 'migratetype'.  The range is aligned to the pageblock first and
 * then handed to move_freepages().
 *
 * Returns what move_freepages() returns, or 0 when the end of the
 * pageblock lies beyond the end of the zone.
 */
static int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	/*
	 * Alignment: masking with ~(pageblock_nr_pages-1) rounds start_pfn
	 * down to a multiple of pageblock_nr_pages.
	 * NOTE(review): pageblock_nr_pages is used here as a power-of-two
	 * page count, not as an order such as MAX_ORDER-1.
	 */
	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (start_pfn < zone->zone_start_pfn)
		start_page = page;
	/* End-of-zone check: bail out if the block runs past the zone. */
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
		return 0;

	/* Let move_freepages() do the actual work on the range. */
	return move_freepages(zone, start_page, end_page, migratetype);
}

/*
 * Article note: move_freepages(), which follows, moves the free pages of a
 * given range onto the free lists of the requested type; in effect this
 * changes the type of those pages.
 */

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 *
 * Walks [start_page, end_page]; every buddy block found is unlinked from
 * its current free list and linked onto the list of the same order for
 * 'migratetype'.  Returns the number of pages moved.
 */
static int move_freepages(struct zone *zone,
			  struct page *start_page, struct page *end_page,
			  int migratetype)
{
	struct page *page = start_page;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility
	 */
	BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	while (page <= end_page) {
		unsigned long order;
		struct list_head *target;

		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

		/* Step over invalid pfns and pages that are not buddy heads. */
		if (!pfn_valid_within(page_to_pfn(page)) || !PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		target = &zone->free_area[order].free_list[migratetype];

		/*
		 * 'page' stands for a whole buddy block of 1 << order pages,
		 * not a single page: relink the block, then jump past it.
		 */
		list_del(&page->lru);
		list_add(&page->lru, target);
		page += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

慢速分配，允许等待和回收
当无法快速分配页面时，如果调用者允许等待则通过本函数进行慢速分配。此时允许进行内存回收。

/*
 * Slow allocation path, entered when the first freelist attempt fails.
 * It wakes kswapd, retries the freelists with alloc_flags computed by
 * gfp_to_alloc_flags(), and, when the caller may wait, goes on to direct
 * compaction, direct reclaim and finally the OOM path.
 *
 * Returns the allocated page, or NULL after warning about the failure.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
	struct zonelist *zonelist, enum zone_type high_zoneidx,
	nodemask_t *nodemask, struct zone *preferred_zone,
	int migratetype)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct page *page = NULL;
	int alloc_flags;
	unsigned long pages_reclaimed = 0;
	unsigned long did_some_progress;
	bool sync_migration = false;
	bool deferred_compaction = false;
	bool contended_compaction = false;

	/*
	 * In the slowpath, we sanity check order to avoid ever trying to
	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
	 * be using allocators in order of preference for an area that is
	 * too large.
	 */
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 *
	 * In other words: with GFP_THISNODE no reclaim is done here; the
	 * caller is expected to retry with other flags after the failure.
	 */
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

restart:
	/* Unless the caller forbade it, wake kswapd to reclaim memory. */
	if (!(gfp_mask & __GFP_NO_KSWAPD))
		wake_all_kswapd(order, zonelist, high_zoneidx,
						zone_idx(preferred_zone));

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 */
	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/*
	 * Find the true preferred zone if the allocation is unconstrained by
	 * cpusets.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
		first_zones_zonelist(zonelist, high_zoneidx, NULL,
					&preferred_zone);

rebalance:
	/*
	 * This is the last chance, in general, before the goto nopage.
	 *
	 * Before any reclaim is attempted, the freelists are tried again with
	 * the slow-path alloc_flags.  ALLOC_NO_WATERMARKS is masked out for
	 * this attempt whether or not the context would allow it.
	 */
	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
			preferred_zone, migratetype);
	if (page)
		goto got_pg;

	/*
	 * Allocate without watermarks if the context allows.  Per the
	 * article, contexts such as the memory-reclaim task or a task being
	 * killed may ignore the watermarks entirely.
	 */
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
		/*
		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
		 * the allocation is high priority and these type of
		 * allocations are system rather than user orientated
		 */
		zonelist = node_zonelist(numa_node_id(), gfp_mask);

		page = __alloc_pages_high_priority(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
		if (page) {
			/* Got memory while ignoring the watermarks. */
			goto got_pg;
		}
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	/*
	 * Avoid recursion of direct reclaim: the caller is itself reclaiming
	 * memory, and entering the reclaim code below could deadlock.
	 */
	if (current->flags & PF_MEMALLOC)
		goto nopage;

	/*
	 * Avoid allocations with no watermarks from looping endlessly.
	 * The current thread is being killed; NULL is returned so the system
	 * does not spin here.  A __GFP_NOFAIL caller is the exception and
	 * keeps going, waiting for other threads to free some memory.
	 */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/*
	 * Try direct compaction. The first pass is asynchronous. Subsequent
	 * attempts after direct reclaim are synchronous
	 */
	page = __alloc_pages_direct_compact(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, sync_migration,
					&contended_compaction,
					&deferred_compaction,
					&did_some_progress);
	if (page)
		goto got_pg;
	sync_migration = true;

	/*
	 * If compaction is deferred for high-order allocations, it is because
	 * sync compaction recently failed. In this is the case and the caller
	 * requested a movable allocation that does not heavily disrupt the
	 * system then fail the allocation instead of entering direct reclaim.
	 */
	if ((deferred_compaction || contended_compaction) &&
						(gfp_mask & __GFP_NO_KSWAPD))
		goto nopage;

	/*
	 * Try direct reclaim and then allocating: memory is reclaimed
	 * directly in the context of the allocating task.
	 */
	page = __alloc_pages_direct_reclaim(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, &did_some_progress);
	if (page)
		goto got_pg;

	/*
	 * If we failed to make any progress reclaiming, then we are
	 * running out of options and have to consider going OOM
	 */
	if (!did_some_progress) {
		/*
		 * Only callers that allow filesystem operations (__GFP_FS)
		 * and allow retrying get here.
		 * NOTE(review): the article suggests __GFP_FS is required
		 * because the OOM path may kill a task or panic, which needs
		 * file operations - confirm.
		 */
		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
			/* The OOM killer is disabled: report the failure. */
			if (oom_killer_disabled)
				goto nopage;
			/* Coredumps can quickly deplete all memory reserves */
			if ((current->flags & PF_DUMPCORE) &&
			    !(gfp_mask & __GFP_NOFAIL))
				goto nopage;
			/* Try again after the OOM path has had its chance. */
			page = __alloc_pages_may_oom(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask, preferred_zone,
					migratetype);
			if (page)
				goto got_pg;

			if (!(gfp_mask & __GFP_NOFAIL)) {
				/*
				 * The oom killer is not called for high-order
				 * allocations that may fail, so if no progress
				 * is being made, there are no other options and
				 * retrying is unlikely to help.
				 *
				 * (A large request: retrying is pointless.)
				 */
				if (order > PAGE_ALLOC_COSTLY_ORDER)
					goto nopage;
				/*
				 * The oom killer is not called for lowmem
				 * allocations to prevent needlessly killing
				 * innocent tasks.
				 */
				if (high_zoneidx < ZONE_NORMAL)
					goto nopage;
			}

			goto restart;
		}
	}

	/*
	 * Check if we should retry the allocation.  Reclaim freed some
	 * memory; decide whether another round is worthwhile.
	 */
	pages_reclaimed += did_some_progress;
	if (should_alloc_retry(gfp_mask, order, did_some_progress,
						pages_reclaimed)) {
		/* Wait for some write requests to complete then retry */
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
		goto rebalance;
	} else {
		/*
		 * High-order allocations do not necessarily loop after
		 * direct reclaim and reclaim/compaction depends on compaction
		 * being called after reclaim so call directly if necessary
		 */
		page = __alloc_pages_direct_compact(gfp_mask, order,
					zonelist, high_zoneidx,
					nodemask,
					alloc_flags, preferred_zone,
					migratetype, sync_migration,
					&contended_compaction,
					&deferred_compaction,
					&did_some_progress);
		if (page)
			goto got_pg;
	}

nopage:
	/* The allocation failed: emit the allocation-failure warning. */
	warn_alloc_failed(gfp_mask, order, NULL);
	return page;
got_pg:
	/* Allocation succeeded: let kmemcheck track the page if enabled. */
	if (kmemcheck_enabled)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	return page;
}

正常分配（或叫快速分配）流程：

1，如果分配的是单个页面，考虑从per CPU缓存中分配空间，如果缓存中没有页面，从伙伴系统中提取页面做补充。

2，分配多个页面时，从指定类型中分配，如果指定类型中没有足够的页面，从备用类型链表中分配。最后会试探保留类型链表。

慢速（允许等待和页面回收）分配：

3，当上面两种分配方案都不能满足要求时，考虑页面回收、杀死进程等操作后再试。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航