您的位置:首页 > 运维架构 > Linux

Linux内核源代码情景分析-内存管理之用户堆栈的扩展

2016-09-08 16:00 246 查看
 在下面几种情况下会发生页面出错异常(也叫缺页中断):

    1、相应的页面目录项或者页面表项为空,也就是该线性地址与物理地址的映射关系尚未建立,或者已经撤销。本文讨论的就是这种情况。

    2、相应的物理页面不在内存中。

    3、指令中规定的访问方式与页面的权限不符,例如企图写一个“只读”的页面。

    首先看下进程地址空间示意图:

    假设现在需要调用某个子程序,因此CPU需将返回地址压入堆栈,也就是要将返回地址写入虚拟空间地址为(%esp-4)的地方。可是,在我们这个情景中地址(%esp-4)落入了空洞中,这是尚未映射的地址,因此必然要引起一次页面出错异常。

  

    这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。代码如下:

    arch/i386/mm/fault.c

/*
 * Page-fault exception handler (excerpt; the error-path labels such as
 * bad_area, no_context, do_sigbus, out_of_memory and vmalloc_fault are
 * elided from this listing).
 *
 * regs:       saved register state of the context that faulted
 * error_code: bit 0 == 0 means no page found, 1 means protection fault
 *             bit 1 == 0 means read, 1 means write
 *             bit 2 == 0 means kernel, 1 means user-mode
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct * vma;
unsigned long address;
unsigned long page;
unsigned long fixup;
int write;
siginfo_t info;

/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));// faulting address read from %cr2; in this walkthrough it is %esp-4

tsk = current;// task_struct of the current task

/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*/
if (address >= TASK_SIZE)
goto vmalloc_fault;

mm = tsk->mm;// mm_struct of that task
info.si_code = SEGV_MAPERR;

/*
* If we're in an interrupt or have no user
* context, we must not take the fault..
*/
if (in_interrupt() || !mm)
goto no_context;

down(&mm->mmap_sem);

vma = find_vma(mm, address);// first region whose end address is above the given address
if (!vma)// none found: no region ends above address, so it belongs to no mapping at all
goto bad_area;
if (vma->vm_start <= address)// region starts at or below address: address lies inside it, go check the access type
goto good_area;
if (!(vma->vm_flags & VM_GROWSDOWN))// region starts above address, so address is in a hole; only a VM_GROWSDOWN (stack) region may continue
goto bad_area;
if (error_code & 4) {// fault was raised in user mode
/*
* accessing the stack below %esp is always a bug.
* The "+ 32" is there due to some instructions (like
* pusha) doing post-decrement on the stack and that
* doesn't show up until later..
*/
if (address + 32 < regs->esp)// more than 32 bytes below %esp cannot be a push: a push stores 4 bytes, pusha at most 32
goto bad_area;
}
if (expand_stack(vma, address))// non-zero return: the region could not be grown down to address
goto bad_area;
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area:
info.si_code = SEGV_ACCERR;
write = 0;
switch (error_code & 3) {// in this walkthrough: 110 & 011 = 2
default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
if (regs->cs == KERNEL_CS)
printk("WP fault at %08lx\n", regs->eip);
#endif
/* fall through */
case 2:		/* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
write++;// the walkthrough's scenario reaches this line
break;
case 1:		/* read, present */
goto bad_area;
case 0:		/* read, not present */
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}

/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
switch (handle_mm_fault(mm, vma, address, write)) {
case 1:
tsk->min_flt++;
break;
case 2:
tsk->maj_flt++;
break;
case 0:
goto do_sigbus;
default:
goto out_of_memory;
}

/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
if (regs->eflags & VM_MASK) {
unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
tsk->thread.screen_bitmap |= 1 << bit;
}
up(&mm->mmap_sem);
return;
.......
}


    内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs,它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。

    error_code:

  bit 0 == 0 means no page found, 1 means protection fault

  bit 1 == 0 means read, 1 means write
bit 2 == 0 means kernel, 1 means user-mode

    此时,error_code为二进制110:bit 2为1表示用户态,bit 1为1表示写操作,bit 0为0表示页面尚未映射。

    expand_stack函数,代码如下:

/*
 * Grow a region downwards so that it starts at the page containing
 * `address`.
 *
 * Returns -ENOMEM when the grown region would exceed the RLIMIT_STACK
 * soft limit, or when the address space total would exceed the RLIMIT_AS
 * soft limit; returns 0 after updating the region and the mm counters.
 */
static inline int expand_stack(struct vm_area_struct * vma, unsigned long address)
{
	unsigned long new_start = address & PAGE_MASK;	/* round down to a page boundary */
	unsigned long nr_pages = (vma->vm_start - new_start) >> PAGE_SHIFT;

	/* Size of the region after growing, checked against the stack limit. */
	if (vma->vm_end - new_start > current->rlim[RLIMIT_STACK].rlim_cur)
		return -ENOMEM;

	/* Total address-space size after growing, checked against the AS limit. */
	if (((vma->vm_mm->total_vm + nr_pages) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	/* Move the start down and keep the page offset consistent with it. */
	vma->vm_start = new_start;
	vma->vm_pgoff -= nr_pages;

	vma->vm_mm->total_vm += nr_pages;
	if (vma->vm_flags & VM_LOCKED)
		vma->vm_mm->locked_vm += nr_pages;

	return 0;
}


    handle_mm_fault函数,代码如下:

/*
 * Locate (allocating where needed) the page-table entry for `address`
 * and pass it to handle_pte_fault().
 *
 * Returns the result of handle_pte_fault(), or -1 when pmd_alloc() or
 * pte_alloc() does not produce a pointer.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
unsigned long address, int write_access)
{
	/* Page-directory entry covering this address. */
	pgd_t *dir_entry = pgd_offset(mm, address);
	/* Middle-level entry for the same address. */
	pmd_t *mid_entry = pmd_alloc(dir_entry, address);
	pte_t *table_entry;

	if (!mid_entry)
		return -1;

	/* Page-table entry; pte_alloc() may set up a new page table. */
	table_entry = pte_alloc(mid_entry, address);
	if (!table_entry)
		return -1;

	return handle_pte_fault(mm, vma, address, write_access, table_entry);
}


    pgd_offset函数,如下:

/* Pointer to the page-directory entry for `address` within mm's page directory. */
#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address))


    pmd_alloc函数,如下:

/*
 * Return the middle-level entry for a page-directory entry. In this
 * version nothing is allocated: the pgd pointer is returned cast to
 * pmd_t *, and `address` is not used. A NULL pgd triggers BUG().
 */
extern inline pmd_t * pmd_alloc(pgd_t *pgd, unsigned long address)
{
	pmd_t *same_slot = (pmd_t *) pgd;

	if (!pgd) {
		BUG();
	}
	return same_slot;
}


    pte_alloc函数,如下:

/*
 * Return a pointer to the page-table entry for `address` below the given
 * page-directory entry, installing a new page table first when the
 * directory entry is empty.
 *
 * Returns NULL when the directory entry is bad (after passing it to
 * __handle_bad_pmd()). When get_pte_fast() yields no page, the result is
 * whatever get_pte_slow() returns.
 */
extern inline pte_t * pte_alloc(pmd_t * pmd, unsigned long address)
{
	/* From here on `address` is the index of the entry inside the page table. */
	address = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

	if (pmd_none(*pmd))	/* directory entry is empty: build a page table */
		goto getnew;
	if (pmd_bad(*pmd))
		goto fix;
	/* A page table already exists: index into it. */
	return (pte_t *)pmd_page(*pmd) + address;
getnew:
{
	unsigned long page = (unsigned long) get_pte_fast();	/* page for the new page table */

	if (!page)
		return get_pte_slow(pmd, address);
	/* Point the directory entry at the new page table. */
	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(page)));
	return (pte_t *)page + address;
}
fix:
	__handle_bad_pmd(pmd);
	return NULL;
}


    handle_pte_fault函数,如下:

/*
 * Resolve a fault on a single page-table entry.
 *
 * A not-present entry is passed to do_no_page() when it is empty and to
 * do_swap_page() otherwise. A write to a present but non-writable entry
 * is passed to do_wp_page(). In the remaining case the entry is marked
 * young (and dirty for a write), re-established, and 1 is returned.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t * pte)
{
pte_t entry;

/*
* We need the page table lock to synchronize with kswapd
* and the SMP-safe atomic PTE updates.
*/
spin_lock(&mm->page_table_lock);
entry = *pte;// snapshot of the page-table entry
if (!pte_present(entry)) {// entry is not marked present
/*
* If it truly wasn't present, we know that kswapd
* and the PTE updates will not touch it later. So
* drop the lock.
*/
spin_unlock(&mm->page_table_lock);
if (pte_none(entry))// entry is empty: no mapping recorded for this address
return do_no_page(mm, vma, address, write_access, pte);
return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
}

if (write_access) {
if (!pte_write(entry))
/* NOTE(review): returns with page_table_lock still held here; presumably do_wp_page() releases it - confirm */
return do_wp_page(mm, vma, address, pte, entry);

entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
establish_pte(vma, address, pte, entry);
spin_unlock(&mm->page_table_lock);
return 1;
}


    do_no_page函数,如下:

/*
 * Handle a fault on an empty page-table entry (excerpt; the path that
 * uses the region's nopage operation is elided from this listing).
 *
 * When the region has no vm_ops or no nopage operation, the work is
 * delegated to do_anonymous_page() and its result is returned.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
{
struct page * new_page;
pte_t entry;

if (!vma->vm_ops || !vma->vm_ops->nopage)// no vm_ops or no nopage operation: both are empty in this walkthrough
return do_anonymous_page(mm, vma, page_table, write_access, address);

.......
return 2;	/* Major fault */
}


    do_anonymous_page函数,如下:

/*
 * Install a page-table entry for an anonymous page.
 *
 * For a read, the entry is a write-protected mapping of ZERO_PAGE(addr).
 * For a write, a new page is allocated, cleared, and mapped dirty and
 * writable, and mm->rss is incremented.
 *
 * Returns 1 (minor fault), or -1 when the page allocation fails.
 */
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
	struct page *fresh = NULL;
	pte_t new_entry;

	/* Start from a write-protected mapping of the zero page. */
	new_entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
	new_entry = pte_wrprotect(new_entry);

	if (write_access) {
		/* A write needs a page of its own. */
		fresh = alloc_page(GFP_HIGHUSER);
		if (!fresh)
			return -1;
		clear_user_highpage(fresh, addr);

		/* Map the new page with the region's protection, dirty and writable. */
		new_entry = mk_pte(fresh, vma->vm_page_prot);
		new_entry = pte_mkdirty(new_entry);
		new_entry = pte_mkwrite(new_entry);

		mm->rss++;
		flush_page_to_ram(fresh);
	}

	/* Store the prepared entry into the page table. */
	set_pte(page_table, new_entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, new_entry);

	return 1;	/* Minor fault */
}


    各级函数依次返回。从异常处理返回以后,堆栈区已经扩展了,CPU会重新执行一遍先前夭折的那条压栈指令,然后就可以继续往下执行了。对于用户程序来说,这整个过程都是“透明”的,就像什么事也没有发生,而堆栈区间就仿佛从一开始就已经分配好了足够大的空间一样。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: