您的位置:首页 > 运维架构 > Linux

7、Linux内核如何装载和启动一个可执行程序

2017-04-09 19:19 543 查看
姓名:周毅原创作品转载请注明出处 《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

一、可执行文件的创建——预处理、编译和链接



预处理,替换宏定义等等:gcc -E -o hello.cpp hello.c

编译,编译成汇编文本代码:gcc -x cpp-output -S -o hello.s hello.cpp

汇编,汇编成目标文件:gcc -x assembler -c hello.s -o hello.o

链接共享库,得到可以执行文件:gcc -o hello hello.o

静态链接:gcc -o hello.static hello.o -static

二、查看ELF目标文件格式

是UNIX系统实验室(USL)作为应用程序二进制接口(Application Binary Interface,ABI)而开发和发布的,也是Linux的主要可执行文件格式。

#define EI_NIDENT
  typedef struct{
  unsigned char e_ident[EI_NIDENT];
  Elf32_Half e_type;
  Elf32_Half e_machine;
  Elf32_Word e_version;
  Elf32_Addr e_entry;
  Elf32_Off e_phoff;
  Elf32_Off e_shoff;
  Elf32_Word e_flags;
  Elf32_Half e_ehsize;
  Elf32_Half e_phentsize;
  Elf32_Half e_phnum;
  Elf32_Half e_shentsize;
  Elf32_Half e_shnum;
  Elf32_Half e_shstrndx;
  }Elf32_Ehdr;


e_type 它标识的是该文件的类型。

e_machine 表明运行该程序需要的体系结构。

e_version 表示文件的版本。

e_entry 程序的入口地址。

e_phoff 表示Program header table 在文件中的偏移量(以字节计数)。

e_shoff 表示Section header table 在文件中的偏移量(以字节计数)。

e_flags 对IA32而言,此项为0。

e_ehsize 表示ELF header大小(以字节计数)。

e_phentsize 表示Program header table中每一个条目的大小。

e_phnum 表示Program header table中有多少个条目。

e_shentsize 表示Section header table中的每一个条目的大小。 

e_shnum 表示Section header table中有多少个条目。

e_shstrndx 包含节名称的字符串是第几个节(从零开始计数)



可以看到elf头的各种信息。

三、可执行程序的装载

Shell本身不限制命令行参数的个数, 命令行参数的个数受限于命令自身

例如,int main(int argc, char *argv[])

又如, int main(int argc, char *argv[], char *envp[])

Shell会调用execve将命令行参数和环境参数传递给可执行程序的main函数

int execve(const char * filename,char * const argv[ ],char * const envp[ ]);

库函数exec*都是execve的封装例程

下面通过内核代码看看sys_execve的执行过程:

在linux-3.18.6/fs/exec.c中我们找到sys_execve的系统调用,实际上执行的是do_execve:

SYSCALL_DEFINE3(execve,
const char __user *, filename,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
return do_execve(getname(filename), argv, envp);
}


继续找到do_execve,最终执行的是do_execve_common,:

int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
return do_execve_common(filename, argv, envp);
}


找到do_execve_common,:

static int do_execve_common(struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp)
{
struct linux_binprm *bprm;// linux_binprm结构体用于维护程序执行过程中所使用的各种数据。
struct file *file;
struct files_struct *displaced;
int retval;

if (IS_ERR(filename))
return PTR_ERR(filename);

/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
* don't check setuid() return code.  Here we additionally recheck
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) {
retval = -EAGAIN;
goto out_ret;
}
//下面是初始化bprm的过程
/* We're below the limit (still or again), so we don't want to make
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;

retval = unshare_files(&displaced);
if (retval)
goto out_ret;

retval = -ENOMEM;
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
if (!bprm)
goto out_files;

retval = prepare_bprm_creds(bprm);
if (retval)
goto out_free;

check_unsafe_exec(bprm);
current->in_execve = 1;

file = do_open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;

sched_exec();

bprm->file = file;
bprm->filename = bprm->interp = filename->name;

retval = bprm_mm_init(bprm);
if (retval)
goto out_unmark;

bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
goto out;

bprm->envc = count(envp, MAX_ARG_STRINGS);
if ((retval = bprm->envc) < 0)
goto out;

retval = prepare_binprm(bprm);
if (retval < 0)
goto out;

retval = copy_strings_kernel(1, &bprm->filename, bprm);
if (retval < 0)
goto out;

bprm->exec = bprm->p;
retval = copy_strings(bprm->envc, envp, bprm);
if (retval < 0)
goto out;

retval = copy_strings(bprm->argc, argv, bprm);
if (retval < 0)
goto out;

retval = exec_binprm(bprm);//加载程序
if (retval < 0)
goto out;

/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;

out:
if (bprm->mm) {
acct_arg_size(bprm, 0);
mmput(bprm->mm);
}

out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;

out_free:
free_bprm(bprm);

out_files:
if (displaced)
reset_files_struct(displaced);
out_ret:
putname(filename);
return retval;
}


我们找到exec_binprm:

static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
int ret;

/* Need to fetch pid before load_binary changes it */
old_pid = current->pid;
rcu_read_lock();
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();

ret = search_binary_handler(bprm);//寻找文件格式对应的解析模块
if (ret >= 0) {
audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
proc_exec_connector(current);
}

return ret;
}


找到search_binary_handler:

int search_binary_handler(struct linux_binprm *bprm)
{
bool need_retry = IS_ENABLED(CONFIG_MODULES);
struct linux_binfmt *fmt;
int retval;

/* This allows 4 levels of binfmt rewrites before failing hard. */
if (bprm->recursion_depth > 5)
return -ELOOP;

retval = security_bprm_check(bprm);
if (retval)
return retval;

retval = -ENOENT;
retry:
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
bprm->recursion_depth++;
retval = fmt->load_binary(bprm);//寻找文件格式对应的解析模块
read_lock(&binfmt_lock);
put_binfmt(fmt);
bprm->recursion_depth--;
if (retval < 0 && !bprm->mm) {
/* we got to flush_old_exec() and failed after it */
read_unlock(&binfmt_lock);
force_sigsegv(SIGSEGV, current);
return retval;
}
if (retval != -ENOEXEC || !bprm->file) {
read_unlock(&binfmt_lock);
return retval;
}
}
read_unlock(&binfmt_lock);

if (need_retry) {
if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
printable(bprm->buf[2]) && printable(bprm->buf[3]))
return retval;
if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
return retval;
need_retry = false;
goto retry;
}

return retval;
}


于ELF格式的可执行文件fmt->load_binary(bprm);执行的应该是load_elf_binary其内部是和ELF文件格式解析:

static int load_elf_binary(struct linux_binprm *bprm)
{
///解析elf文件格式
struct file *interpreter = NULL; /* to shut gcc up */
unsigned long load_addr = 0, load_bias = 0;
int load_addr_set = 0;
char * elf_interpreter = NULL;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata;
unsigned long elf_bss, elf_brk;
int retval, i;
unsigned int size;
unsigned long elf_entry;
unsigned long interp_load_addr = 0;
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
} *loc;

loc = kmalloc(sizeof(*loc), GFP_KERNEL);
if (!loc) {
retval = -ENOMEM;
goto out_ret;
}

/* Get the exec-header */
loc->elf_ex = *((struct elfhdr *)bprm->buf);

retval = -ENOEXEC;
/* First of all, some simple consistency checks */
if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
goto out;

if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
goto out;
if (!elf_check_arch(&loc->elf_ex))
goto out;
if (!bprm->file->f_op->mmap)
goto out;

/* Now read in all of the header information */
if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
goto out;
if (loc->elf_ex.e_phnum < 1 ||
loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
goto out;
size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
retval = -ENOMEM;
elf_phdata = kmalloc(size, GFP_KERNEL);
if (!elf_phdata)
goto out;

retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
(char *)elf_phdata, size);
if (retval != size) {
if (retval >= 0)
retval = -EIO;
goto out_free_ph;
}

elf_ppnt = elf_phdata;
elf_bss = 0;
elf_brk = 0;

start_code = ~0UL;
end_code = 0;
start_data = 0;
end_data = 0;

for (i = 0; i < loc->elf_ex.e_phnum; i++) {
if (elf_ppnt->p_type == PT_INTERP) {
/* This is the program interpreter used for
* shared libraries - for now assume that this
* is an a.out format binary
*/
retval = -ENOEXEC;
if (elf_ppnt->p_filesz > PATH_MAX ||
elf_ppnt->p_filesz < 2)
goto out_free_ph;

retval = -ENOMEM;
elf_interpreter = kmalloc(elf_ppnt->p_filesz,
GFP_KERNEL);
if (!elf_interpreter)
goto out_free_ph;

retval = kernel_read(bprm->file, elf_ppnt->p_offset,
elf_interpreter,
elf_ppnt->p_filesz);
if (retval != elf_ppnt->p_filesz) {
if (retval >= 0)
retval = -EIO;
goto out_free_interp;
}
/* make sure path is NULL terminated */
retval = -ENOEXEC;
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
goto out_free_interp;

interpreter = open_exec(elf_interpreter);
retval = PTR_ERR(interpreter);
if (IS_ERR(interpreter))
goto out_free_interp;

/*
* If the binary is not readable then enforce
* mm->dumpable = 0 regardless of the interpreter's
* permissions.
*/
would_dump(bprm, interpreter);

retval = kernel_read(interpreter, 0, bprm->buf,
BINPRM_BUF_SIZE);
if (retval != BINPRM_BUF_SIZE) {
if (retval >= 0)
retval = -EIO;
goto out_free_dentry;
}

/* Get the exec headers */
loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
elf_ppnt++;
}

elf_ppnt = elf_phdata;
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
if (elf_ppnt->p_type == PT_GNU_STACK) {
if (elf_ppnt->p_flags & PF_X)
executable_stack = EXSTACK_ENABLE_X;
else
executable_stack = EXSTACK_DISABLE_X;
break;
}

/* Some simple consistency checks for the interpreter */
if (elf_interpreter) {
retval = -ELIBBAD;
/* Not an ELF interpreter */
if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
goto out_free_dentry;
/* Verify the interpreter has a valid arch */
if (!elf_check_arch(&loc->interp_elf_ex))
goto out_free_dentry;
}

/* Flush all traces of the currently running executable */
retval = flush_old_exec(bprm);
if (retval)
goto out_free_dentry;

/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality.  */
SET_PERSONALITY(loc->elf_ex);
if (elf_read_implies_exec(loc->elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;

if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
current->flags |= PF_RANDOMIZE;

setup_new_exec(bprm);

/* Do this so that we can load the interpreter, if need be.  We will
change some of these later */
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval
d555
< 0)
goto out_free_dentry;

current->mm->start_stack = bprm->p;

/* Now we do a little grungy work by mmapping the ELF image into
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
int elf_prot = 0, elf_flags;
unsigned long k, vaddr;

if (elf_ppnt->p_type != PT_LOAD)
continue;

if (unlikely (elf_brk > elf_bss)) {
unsigned long nbyte;

/* There was a PT_LOAD segment with p_memsz > p_filesz
before this one. Map anonymous pages, if needed,
and clear the area.  */
retval = set_brk(elf_bss + load_bias,
elf_brk + load_bias);
if (retval)
goto out_free_dentry;
nbyte = ELF_PAGEOFFSET(elf_bss);
if (nbyte) {
nbyte = ELF_MIN_ALIGN - nbyte;
if (nbyte > elf_brk - elf_bss)
nbyte = elf_brk - elf_bss;
if (clear_user((void __user *)elf_bss +
load_bias, nbyte)) {
/*
* This bss-zeroing can fail if the ELF
* file specifies odd protections. So
* we don't check the return value
*/
}
}
}

if (elf_ppnt->p_flags & PF_R)
elf_prot |= PROT_READ;
if (elf_ppnt->p_flags & PF_W)
elf_prot |= PROT_WRITE;
if (elf_ppnt->p_flags & PF_X)
elf_prot |= PROT_EXEC;

elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;

vaddr = elf_ppnt->p_vaddr;
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
elf_flags |= MAP_FIXED;
} else if (loc->elf_ex.e_type == ET_DYN) {
/* Try and get dynamic programs out of the way of the
* default mmap base, as well as whatever program they
* might try to exec.  This is because the brk will
* follow the loader, and is not movable.  */
#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
/* Memory randomization might have been switched off
* in runtime via sysctl or explicit setting of
* personality flags.
* If that is the case, retain the original non-zero
* load_bias value in order to establish proper
* non-randomized mappings.
*/
if (current->flags & PF_RANDOMIZE)
load_bias = 0;
else
load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#else
load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#endif
}

error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, 0); //目标文件映射到地址空间
if (BAD_ADDR(error)) {
retval = IS_ERR((void *)error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
}

if (!load_addr_set) {
load_addr_set = 1;
load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
if (loc->elf_ex.e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
k = elf_ppnt->p_vaddr;
if (k < start_code)
start_code = k;
if (start_data < k)
start_data = k;

/*
* Check to see if the section's size will overflow the
* allowed task size. Note that p_filesz must always be
* <= p_memsz so it is only necessary to check p_memsz.
*/
if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
elf_ppnt->p_memsz > TASK_SIZE ||
TASK_SIZE - elf_ppnt->p_memsz < k) {
/* set_brk can never work. Avoid overflows. */
retval = -EINVAL;
goto out_free_dentry;
}

k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;

if (k > elf_bss)
elf_bss = k;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
if (k > elf_brk)
elf_brk = k;
}

loc->elf_ex.e_entry += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;

/* Calling set_brk effectively mmaps the pages that we need
* for the bss and break sections.  We must do this before
* mapping in the interpreter, to make sure it doesn't wind
* up getting placed where the bss needs to go.
*/
retval = set_brk(elf_bss, elf_brk);
if (retval)
goto out_free_dentry;
if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
retval = -EFAULT; /* Nobody gets to see this, but.. */
goto out_free_dentry;
}

if (elf_interpreter) {//动态链接时的入口
unsigned long interp_map_addr = 0;

elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
&interp_map_addr,
load_bias);
if (!IS_ERR((void *)elf_entry)) {
/*
* load_elf_interp() returns relocation
* adjustment
*/
interp_load_addr = elf_entry;
elf_entry += loc->interp_elf_ex.e_entry;
}
if (BAD_ADDR(elf_entry)) {
retval = IS_ERR((void *)elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
}
reloc_func_desc = interp_load_addr;

allow_write_access(interpreter);
fput(interpreter);
kfree(elf_interpreter);
} else {
elf_entry = loc->elf_ex.e_entry;//无动态链接的程序入口
if (BAD_ADDR(elf_entry)) {
retval = -EINVAL;
goto out_free_dentry;
}
}

kfree(elf_phdata);

set_binfmt(&elf_format);

#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
if (retval < 0)
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */

install_exec_creds(bprm);
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
if (retval < 0)
goto out;
/* N.B. passed_fileno might not be initialized? */
current->mm->end_code = end_code;
current->mm->start_code = start_code;
current->mm->start_data = start_data;
current->mm->end_data = end_data;
current->mm->start_stack = bprm->p;

#ifdef arch_randomize_brk
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
#ifdef CONFIG_COMPAT_BRK
current->brk_randomized = 1;
#endif
}
#endif

if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
and some applications "depend" upon this behavior.
Since we do not have the power to recompile these, we
emulate the SVr4 behavior. Sigh. */
error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE, 0);
}

#ifdef ELF_PLAT_INIT
/*
* The ABI may specify that certain registers be set up in special
* ways (on i386 %edx is the address of a DT_FINI function, for
* example.  In addition, it may also specify (eg, PowerPC64 ELF)
* that the e_entry field is the address of the function descriptor
* for the startup routine, rather than the address of the startup
* routine itself.  This macro performs whatever initialization to
* the regs structure is required as well as any relocations to the
* function descriptor entries when executing dynamically links apps.
*/
ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

start_thread(regs, elf_entry, bprm->p);//执行程序
retval = 0;
out:
kfree(loc);
out_ret:
return retval;

/* error cleanup */
out_free_dentry:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
out_free_interp:
kfree(elf_interpreter);
out_free_ph:
kfree(elf_phdata);
goto out;
}


start_thread修改了寄存器的值:

start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
set_user_gs(regs, 0);
regs->fs        = 0;
regs->ds        = __USER_DS;
regs->es        = __USER_DS;
regs->ss        = __USER_DS;
regs->cs        = __USER_CS;
regs->ip        = new_ip;//返回用户态的位置从int 0x80的下一条指令的位置变成新加载的可执行文件的entry位置(new_ip)。
regs->sp        = new_sp;
regs->flags     = X86_EFLAGS_IF;
/*
* force it to the iret return path by making it look as if there was
* some work pending.
*/
set_thread_flag(TIF_NOTIFY_RESUME);
}


这样程序在执行时,从new_ip(加载的程序)开始执行。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  linux kernel