
Linux memory allocation: alloc_page and __get_free_page annotated in detail (the Buddy allocator)

2014-02-23 00:10
alloc_page and __get_free_page both allocate pages from the Buddy allocator; they differ only in what they return: the former returns a struct page pointer, the latter returns the kernel virtual address of that page.

Both ultimately end up in the core function __alloc_pages_nodemask; its processing flow is described in detail below.
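For reference, the wrapper relationship looks roughly like the sketch below (simplified and reconstructed from mm/page_alloc.c of kernels of this era; exact definitions vary by version): alloc_page(mask) is just alloc_pages(mask, 0), and __get_free_page(s) converts the returned struct page into a virtual address with page_address().

/* Simplified sketch of the wrapper; treat it as illustrative, not a verbatim copy */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/* a plain virtual-address return value cannot represent a highmem page */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);	/* ends up in __alloc_pages_nodemask */
	if (!page)
		return 0;
	return (unsigned long) page_address(page);	/* struct page -> kernel virtual address */
}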

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	struct zone *preferred_zone;
	struct page *page;
	int migratetype = allocflags_to_migratetype(gfp_mask);	// gfp flags and migrate types are not a 1:1 mapping, so convert here

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);	// if this allocation is allowed to wait (sleep), check whether the task should yield and, if so, schedule() voluntarily

	if (should_fail_alloc_page(gfp_mask, order))	// with CONFIG_FAIL_PAGE_ALLOC enabled, inject allocation failures for fault-injection debugging
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	get_mems_allowed();	// pin the allocation policy (cpuset mems_allowed) so it cannot change under us
	/* The preferred zone is used for statistics later */
	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);	// find the first usable zone and record it in preferred_zone
	if (!preferred_zone) {	// no usable zone: unpin the policy and bail out
		put_mems_allowed();
		return NULL;
	}

	/* First allocation attempt */
	// Fast path: honour cpuset affinity and use the low watermark. Walk the zonelist,
	// pick a suitable zone, check its watermark; if the watermark is not met, kick off
	// reclaim, and finally call buffered_rmqueue() to allocate from that zone.
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
			preferred_zone, migratetype);
	if (unlikely(!page))	// fast path failed: enter the slow path, which starts reclaim, lowers the watermark and retries the fast path
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
	put_mems_allowed();	// unpin the allocation policy

	trace_mm_page_alloc(page, order, gfp_mask, migratetype);	// tracing only
	return page;
}
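From a caller's point of view, only the thin wrappers are visible. The snippet below is a hypothetical usage example (demo_alloc_buffer and demo_free_buffer are made-up names): a sleeping GFP_KERNEL allocation of an order-2 block, which exercises exactly the fast path / slow path sequence above.

#include <linux/gfp.h>
#include <linux/mm.h>

/* hypothetical caller: four contiguous, zeroed pages (order-2); GFP_KERNEL may sleep,
 * so this must not be used in atomic context */
static void *demo_alloc_buffer(void)
{
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);

	if (!page)	/* both the fast path and the slow path failed */
		return NULL;
	return page_address(page);	/* kernel virtual address of the block */
}

static void demo_free_buffer(void *buf)
{
	if (buf)
		free_pages((unsigned long)buf, 2);	/* return the pages to the buddy system */
}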


/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;	/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,	// walk the zonelist
						high_zoneidx, nodemask) {
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))	// consult the zonelist cache: skip zones already known not to be worth trying
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))	// check the cpuset / allowed-nodes restriction (that function's header comment explains the details)
				goto try_next_zone;	// this zone is not allowed, check the next one

		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {	// watermarks must be checked; otherwise jump straight to the actual allocation
			unsigned long mark;
			int ret;

			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			if (zone_watermark_ok(zone, order, mark,	// check whether this zone meets the watermark; the threshold is computed
					classzone_idx, alloc_flags))	// differently under ALLOC_HIGH and ALLOC_HARDER; if it passes, go allocate, otherwise keep checking
				goto try_this_zone;

			if (zone_reclaim_mode == 0)	// reclaim is disabled for this zone: mark it full in the zonelist cache to save time on the next scan
				goto this_zone_full;

			ret = zone_reclaim(zone, gfp_mask, order);	// start reclaim; if waiting is not allowed it returns ZONE_RECLAIM_NOSCAN,
									// otherwise __zone_reclaim() is run on the local zone or a zone not tied to another processor
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:	// nothing was scanned, just try the next zone
				/* did not scan */
				goto try_next_zone;
			case ZONE_RECLAIM_FULL:		// scanned, but nothing could be reclaimed
				/* scanned but unreclaimable */
				goto this_zone_full;
			default:
				/* did we reclaim enough */
				if (!zone_watermark_ok(zone, order, mark,	// reclaim did run: re-check the watermark
						classzone_idx, alloc_flags))
					goto this_zone_full;
			}
		}

try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,	// all checks passed: do the actual allocation from this zone
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
try_next_zone:
		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
			/*
			 * we do zlc_setup after the first zone is tried but only
			 * if there are multiple nodes make it worthwhile
			 */
			allowednodes = zlc_setup(zonelist, alloc_flags);	// set up the zonelist cache
			zlc_active = 1;
			did_zlc_setup = 1;
		}
	}

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}
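The watermark test used above is zone_watermark_ok(). The sketch below is reconstructed from kernels of this vintage and is meant as an illustration rather than an authoritative copy: ALLOC_HIGH and ALLOC_HARDER shrink the threshold so the allocation can dip further into the reserves, and for order > 0 the free pages sitting on lower orders are progressively discounted, since they cannot satisfy the request.

/* Illustrative sketch of zone_watermark_ok(); helper and field names follow kernels of this era */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	long min = mark;
	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
	int o;

	if (alloc_flags & ALLOC_HIGH)		/* e.g. __GFP_HIGH: allowed to eat into the reserves */
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)		/* e.g. rt tasks / atomic allocations: press even harder */
		min -= min / 4;

	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* pages at lower orders cannot satisfy this request, so discount them */
		free_pages -= z->free_area[o].nr_free << o;

		/* require fewer free pages at each higher order */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}
	return 1;
}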


/*
* Really, prep_compound_page() should be called from __rmqueue_bulk().  But
* we cheat by calling it from here, in the order > 0 path.  Saves a branch
* or two.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {	// single-page allocation: take it straight from the per-CPU page (pcp) lists
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);	// disable interrupts and save the irq state
		pcp = &this_cpu_ptr(zone->pageset)->pcp;	// get this CPU's pcp
		list = &pcp->lists[migratetype];	// pick the pcp list for the requested migrate type
		if (list_empty(list)) {	// the list is empty:
			pcp->count += rmqueue_bulk(zone, 0,	// refill it with 'batch' pages pulled from the buddy free_area
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)	// a cold page is one unlikely to be cache-hot, e.g. destined for DMA
			page = list_entry(list->prev, struct page, lru);	// cold pages are taken from the tail of the list
		else		// a hot page is likely still in the CPU cache, which is cheaper to use
			page = list_entry(list->next, struct page, lru);	// hot pages are taken from the head of the list

		list_del(&page->lru);	// unlink the page from the pcp list
		pcp->count--;	// pcp->count is the number of pages currently held by this pcp
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);	// take the zone lock before touching the buddy lists
		page = __rmqueue(zone, order, migratetype);
		// the real allocation from the buddy free_area lists, in two cases:
		// 1. __rmqueue_smallest(): if the free list for the requested migrate type has a block at this
		//    order (or a higher one), allocate from the first suitable block and put the remainder
		//    back onto the lower-order lists
		// 2. __rmqueue_fallback(): if that migrate type's lists are empty, fall back to other types in a
		//    fixed order, move some of their space over to this migrate type, then split and merge in
		//    the same way as __rmqueue_smallest
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);	// update this CPU's vm event counters
	zone_statistics(preferred_zone, zone);	// update per-zone statistics
	local_irq_restore(flags);	// re-enable interrupts and restore the irq state

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))	// the page failed its sanity checks (e.g. it is still mapped): retry the allocation
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
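To make the __rmqueue() step above more concrete, here is a sketch of __rmqueue_smallest(), again reconstructed from kernels of this era and meant as an illustration rather than a verbatim copy: it walks upward from the requested order until it finds a non-empty free list of the right migrate type, removes the first block, and lets expand() push the unused halves back onto the lower-order lists; only when every list of that migrate type is empty does __rmqueue() fall back to __rmqueue_fallback().

/* Sketch of __rmqueue_smallest(), the common case of a buddy allocation */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* find a block of at least the requested size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;	/* nothing at this order, try a bigger block */

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);	/* clear the buddy metadata on the head page */
		area->nr_free--;
		/* split the block: the halves we do not need go back onto lower-order lists */
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;	/* caller (__rmqueue) will then try __rmqueue_fallback() */
}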