kernel 3.10内核源码分析--KVM相关--虚拟机运行
2015-11-19 17:17
1541 查看
1、基本原理
KVM虚拟机通过字符设备/dev/kvm的ioctl接口创建和运行,相关原理见之前的文章说明。
虚拟机的运行通过/dev/kvm设备ioctl VCPU接口的KVM_RUN指令实现,在VM和VCPU创建好并完成初始化后,就可以调度该虚拟机运行了,通常,一个VCPU对应于一个线程,虚拟机运行的本质为调度该虚拟机相关的VCPU所在线程运行。虚拟机(VCPU)的运行主要任务是要进行上下文切换,上下文主要包括相关寄存器、APIC状态、TLB等,通常上下文切换的过程如下:
1、 保存当前的上下文。
2、 使用kvm_vcpu结构体中的上下文信息,加载到物理CPU中。
3、 执行kvm_x86_ops中的run函数(Intel平台上为vmx_vcpu_run),调用硬件相关的指令(如VMLAUNCH),进入虚拟机运行环境中。
虚拟机运行于qemu-kvm的进程上下文中,从硬件的角度看,虚拟机的运行过程,实质为相关指令的执行过程,虚拟机中的程序编译后也就是相应的CPU指令序列,而虚拟机的指令跟Host机的指令执行过程并没有太多的差别,最关键的差别为“敏感指令”(通常为IO、内存等关键操作)的执行,这也是虚拟化实现的本质所在,当在虚拟机中(Guest模式)执行“敏感指令”时,会触发(由硬件触发)VM-exit,使当前CPU从Guest模式(non-root模式)切换到root模式,当前CPU的控制权随之转交给VMM(Hypervisor,KVM中即Host),由VMM进行相应的处理,处理完成后再次通过相应的硬件指令(首次进入用VMLAUNCH,之后用VMRESUME),重新进入到Guest模式,从而进入虚拟机运行环境中继续运行。
本文简单解释及分析在3.10版本内核代码中的相关流程,用户态qemu-kvm部分暂不包括。
2、大致流程:
Qemu-kvm可以通过ioctl(KVM_RUN…)使虚拟机运行,最终进入内核态,由KVM相关内核流程处理,在内核态执行的大致过程如下:
kvm_vcpu_ioctl -->
kvm_arch_vcpu_ioctl_run
具体由内核函数kvm_arch_vcpu_ioctl_run完成相关工作。主要流程如下:
1、 Sigprocmask()屏蔽信号,防止在此过程中受到信号的干扰。
2、 检查当前VCPU状态:若为KVM_MP_STATE_UNINITIALIZED(尚未初始化),则阻塞该VCPU,被唤醒后返回-EAGAIN,不进入Guest
3、 配置APIC和mmio相关信息
4、 将VCPU中保存的上下文信息写入指定位置
5、 然后的工作交由__vcpu_run完成
6、 __vcpu_run最终调用vcpu_enter_guest,该函数实现了进入Guest,并执行Guest
OS具体指令的操作。
7、 vcpu_enter_guest最终调用kvm_x86_ops中的run函数运行。对应于Intel平台,该函数为vmx_vcpu_run(设置Guest
CR3和其他寄存器、EPT/影子页表相关设置、汇编代码VMLAUNCH切换到非根模式,执行Guest目标代码)。
8、 Guest代码执行到敏感指令或因其他原因(比如中断/异常),VM-Exit退出非根模式,返回到vcpu_enter_guest函数继续执行。
9、 vcpu_enter_guest函数中会判断VM-Exit原因,并进行相应处理。
10、处理完成后VM-Entry到Guest重新执行Guest代码,或重新等待下次调度。
3、代码分析
kvm_vcpu_ioctl():
点击(此处)折叠或打开
/*
* kvm
ioctl VCPU指令的入口,传入的fd为KVM_CREATE_VCPU中返回的fd。
* 主要针对具体的VCPU进行参数设置。如:相关寄存器的读
* 写、中断控制等
*/
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned
long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void
__user *)arg;
int r;
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;
if (vcpu->kvm->mm != current->mm)
return -EIO;
#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
* Special
cases: vcpu ioctls that are asynchronous to vcpu execution,
* so
vcpu_load() would
break it.
*/
if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif
// KVM虚拟机VCPU数据结构载入物理CPU
r = vcpu_load(vcpu);
if (r)
return r;
switch (ioctl) {
/*
* 运行虚拟机,最终通过执行VMLAUNCH指令进入non
root模式,
* 进入虚拟机运行。当虚拟机内部执行敏感指令时,由硬
* 件触发VM-exit,返回到root模式
*/
case KVM_RUN:
r = -EINVAL;
// 不能带参数。
if (arg)
goto out;
// 运行VCPU(即运行虚拟机)的入口函数
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
...
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run():
点击(此处)折叠或打开
/*
 * __vcpu_run() - main run loop of a vCPU on the KVM_RUN path
 * (kvm_vcpu_ioctl() --> kvm_arch_vcpu_ioctl_run() --> __vcpu_run()).
 *
 * Holds kvm->srcu for reading (index kept in vcpu->srcu_idx) while looping.
 * Each iteration either enters the guest through vcpu_enter_guest() or
 * blocks the vCPU, then looks at pending timers, userspace interrupt
 * injection requests, async page fault completions, signals and
 * rescheduling.
 *
 * Returns the non-zero result of vapic_enter() if that call fails;
 * otherwise the value of r that ended the loop (always <= 0), e.g. -EINTR
 * when a signal is pending for the current task.
 */
static int __vcpu_run(struct
kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
/* Per the article author: sets up vcpu->arch.apic->vapic_page. */
r = vapic_enter(vcpu);
if (r) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
return r;
}
r = 1;
while (r > 0) {
/* Only a RUNNABLE vCPU that is not halted for an async page fault may enter the guest. */
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
/* Enter guest mode (ends in VMLAUNCH/VMRESUME on Intel, see vmx_vcpu_run()). */
r = vcpu_enter_guest(vcpu);
else {/* Reached when mp_state is not RUNNABLE or apf.halted is set. */
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
/*
 * Block the vCPU with the SRCU read lock dropped.  Per the article
 * author this essentially schedule()s the thread out, but returns
 * without sleeping when e.g. a timer or a signal is pending.
 */
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
kvm_apic_accept_events(vcpu);
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
/* fall through: a vCPU leaving HALTED also clears apf.halted */
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_INIT_RECEIVED:
break;
default:
r = -EINTR;
break;
}
}
}
/* Leave the loop before touching timers/signals once r says stop. */
if (r <= 0)
break;
clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
/* Userspace asked for an interrupt injection: exit with KVM_EXIT_INTR. */
if (dm_request_for_irq_injection(vcpu)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
kvm_check_async_pf_completion(vcpu);
/* A pending signal for this task also ends the loop with -EINTR. */
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
/*
 * Scheduling point: when a reschedule is needed, drop the SRCU read
 * lock around kvm_resched() and take it again afterwards.
 */
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
vapic_exit(vcpu);
return r;
}
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest():
点击(此处)折叠或打开
/*
 * vcpu_enter_guest() - run one guest entry / VM-exit round trip for @vcpu.
 *
 * Services pending vcpu->requests, injects pending events, loads the MMU and
 * guest FPU/XCR0 state, then calls kvm_x86_ops->run() (vmx_vcpu_run() on
 * Intel) with preemption and local interrupts disabled.  After the VM-exit it
 * restores host state and hands the exit to kvm_x86_ops->handle_exit().
 *
 * Return: the value of kvm_x86_ops->handle_exit() after a completed entry;
 * 1 when the entry was skipped or aborted (synthetic halt, INIT received,
 * pending request/resched/signal); 0 with vcpu->run->exit_reason set for
 * KVM_REQ_REPORT_TPR_ACCESS and KVM_REQ_TRIPLE_FAULT; or the non-zero result
 * of kvm_guest_time_update() / kvm_mmu_reload() on failure.  The caller,
 * __vcpu_run(), keeps looping only while the result is > 0.
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
	int r;
	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
		vcpu->run->request_interrupt_window;
	bool req_immediate_exit = false;

	/* Service the requests posted in vcpu->requests before entering. */
	if (vcpu->requests) {
		/* Unload the MMU. */
		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
			kvm_mmu_unload(vcpu);
		/* Migrate timers. */
		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
			__kvm_migrate_timers(vcpu);
		/* Update the master clock. */
		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
			kvm_gen_update_masterclock(vcpu->kvm);
		/* Update the global clock. */
		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
			kvm_gen_kvmclock_update(vcpu);
		/* Update the guest clock; a failure aborts the entry. */
		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
			r = kvm_guest_time_update(vcpu);
			if (unlikely(r))
				goto out;
		}
		/* Sync the MMU roots. */
		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
			kvm_mmu_sync_roots(vcpu);
		/* Flush the TLB. */
		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
			kvm_x86_ops->tlb_flush(vcpu);
		/* The next two requests return 0 with exit_reason set. */
		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
			r = 0;
			goto out;
		}
		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
			r = 0;
			goto out;
		}
		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
			vcpu->fpu_active = 0;
			kvm_x86_ops->fpu_deactivate(vcpu);
		}
		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
			/* Page is swapped out. Do synthetic halt */
			vcpu->arch.apf.halted = true;
			r = 1;
			goto out;
		}
		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
			record_steal_time(vcpu);
		if (kvm_check_request(KVM_REQ_NMI, vcpu))
			process_nmi(vcpu);
		if (kvm_check_request(KVM_REQ_PMU, vcpu))
			kvm_handle_pmu_event(vcpu);
		if (kvm_check_request(KVM_REQ_PMI, vcpu))
			kvm_deliver_pmi(vcpu);
		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
			vcpu_scan_ioapic(vcpu);
	}

	/* Handle an event request, or a userspace interrupt-window request. */
	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
		kvm_apic_accept_events(vcpu);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			r = 1;
			goto out;
		}

		/* Inject pending events: interrupts, exceptions, NMIs. */
		inject_pending_event(vcpu);

		/*
		 * enable NMI/IRQ window open exits if needed
		 *
		 * With interrupt-window or NMI-window exiting enabled in the
		 * VMCS controls, the CPU takes a VM-exit as soon as the guest
		 * is able to accept the interrupt/NMI (see the Intel SDM
		 * chapter on VM-execution controls).  A non-zero return from
		 * the hook requests an immediate exit instead.
		 */
		if (vcpu->arch.nmi_pending)
			req_immediate_exit =
				kvm_x86_ops->enable_nmi_window(vcpu) != 0;
		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
			req_immediate_exit =
				kvm_x86_ops->enable_irq_window(vcpu) != 0;

		if (kvm_lapic_enabled(vcpu)) {
			/*
			 * Update architecture specific hints for APIC
			 * virtual interrupt delivery.
			 */
			if (kvm_x86_ops->hwapic_irr_update)
				kvm_x86_ops->hwapic_irr_update(vcpu,
					kvm_lapic_find_highest_irr(vcpu));
			update_cr8_intercept(vcpu);
			kvm_lapic_sync_to_vapic(vcpu);
		}
	}

	/* Load the MMU; on failure undo any event injection done above. */
	r = kvm_mmu_reload(vcpu);
	if (unlikely(r)) {
		goto cancel_injection;
	}

	preempt_disable();

	/* Architecture-specific preparation for the guest switch. */
	kvm_x86_ops->prepare_guest_switch(vcpu);
	if (vcpu->fpu_active)
		kvm_load_guest_fpu(vcpu);
	kvm_load_guest_xcr0(vcpu);

	vcpu->mode = IN_GUEST_MODE;

	/* We should set ->mode before check ->requests,
	 * see the comment in make_all_cpus_request.
	 */
	smp_mb();

	local_irq_disable();

	/*
	 * Abort the entry if the vCPU was asked to leave guest mode, new
	 * requests arrived, a reschedule is needed, or a signal is pending
	 * for the current task.
	 */
	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
	    || need_resched() || signal_pending(current)) {
		vcpu->mode = OUTSIDE_GUEST_MODE;
		smp_wmb();
		local_irq_enable();
		preempt_enable();
		r = 1;
		goto cancel_injection;
	}

	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);

	/*
	 * req_immediate_exit was set above when enable_nmi_window() or
	 * enable_irq_window() returned non-zero.  Send a reschedule IPI to
	 * this vCPU's CPU; presumably the pending IPI forces a VM-exit right
	 * after the entry below.
	 */
	if (req_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	/* Guest-enter accounting. */
	kvm_guest_enter();

	/* Debugging: load the effective guest debug registers. */
	if (unlikely(vcpu->arch.switch_db_regs)) {
		set_debugreg(0, 7);
		set_debugreg(vcpu->arch.eff_db[0], 0);
		set_debugreg(vcpu->arch.eff_db[1], 1);
		set_debugreg(vcpu->arch.eff_db[2], 2);
		set_debugreg(vcpu->arch.eff_db[3], 3);
	}

	trace_kvm_entry(vcpu->vcpu_id);
	/* Architecture run hook (vmx_vcpu_run() on Intel): enters the guest. */
	kvm_x86_ops->run(vcpu);
	/* From here on a VM-exit has happened and we are back in the host. */

	/*
	 * If the guest has used debug registers, at least dr7
	 * will be disabled while returning to the host.
	 * If we don't have active breakpoints in the host, we don't
	 * care about the messed up debug address registers. But if
	 * we have some of them active, restore the old state.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();

	/* Record the TSC value at the time the guest exited. */
	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
							   native_read_tsc());

	vcpu->mode = OUTSIDE_GUEST_MODE;
	smp_wmb();

	/* Interrupt is enabled by handle_external_intr() */
	kvm_x86_ops->handle_external_intr(vcpu);

	++vcpu->stat.exits;

	/*
	 * We must have an instruction between local_irq_enable() and
	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
	 * the interrupt shadow.  The stat.exits increment will do nicely.
	 * But we need to prevent reordering, hence this barrier():
	 */
	barrier();

	/*
	 * Guest-exit accounting.  Interrupts are already back on at this
	 * point: they were enabled by handle_external_intr() above.
	 */
	kvm_guest_exit();

	preempt_enable();

	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * Profile KVM exit RIPs (sample counting for performance analysis):
	 */
	if (unlikely(prof_on == KVM_PROFILING)) {
		unsigned long rip = kvm_rip_read(vcpu);
		profile_hit(KVM_PROFILING, (void *)rip);
	}

	if (unlikely(vcpu->arch.tsc_always_catchup))
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

	if (vcpu->arch.apic_attention)
		kvm_lapic_sync_from_vapic(vcpu);

	/*
	 * Dispatch the VM-exit to the architecture handler
	 * (vmx_handle_exit() on Intel).  The exit reason and related
	 * information were already captured by the run hook.
	 */
	r = kvm_x86_ops->handle_exit(vcpu);
	return r;

cancel_injection:
	kvm_x86_ops->cancel_injection(vcpu);
	if (unlikely(vcpu->arch.apic_attention))
		kvm_lapic_sync_from_vapic(vcpu);
out:
	return r;
}
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest()-->vmx_vcpu_run():
点击(此处)折叠或打开
/*
 * vmx_vcpu_run() - Intel VMX implementation of kvm_x86_ops->run(): enter
 * the guest (VMX non-root mode) and return after the next VM-exit.
 *
 * Writes dirty RSP/RIP values into the VMCS, saves host registers on the
 * stack, loads the guest general-purpose registers and CR2 from @vcpu, and
 * executes VMLAUNCH (first entry on this VMCS) or VMRESUME (later entries).
 * After the VM-exit it stores the guest registers back, restores the host
 * registers and reads the exit information from the VMCS into the vcpu_vmx.
 *
 * Returns early, without entering the guest, when vmx->emulation_required
 * is set.
 */
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long debugctlmsr;

	/*
	 * Record the guest's net vcpu time for enforced NMI injections.
	 * Only needed when the CPU has no virtual-NMI support and NMIs are
	 * currently soft-blocked.
	 */
	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
		vmx->entry_time = ktime_get();

	/* Don't enter VMX if guest state is invalid, let the exit handler
	   start emulation until we arrive back to a valid state */
	if (vmx->emulation_required)
		return;

	if (vmx->nested.sync_shadow_vmcs) {
		copy_vmcs12_to_shadow(vmx);
		vmx->nested.sync_shadow_vmcs = false;
	}

	/* Write the guest RSP into the VMCS if the cached copy is dirty. */
	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	/* Write the guest RIP into the VMCS if the cached copy is dirty. */
	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	/* I.e. this clears the interrupt shadow; it does not mask interrupts. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	atomic_switch_perf_msrs(vmx);
	debugctlmsr = get_debugctlmsr();

	/*
	 * Snapshot whether this VMCS has already been launched; the asm
	 * below picks VMLAUNCH (not launched) or VMRESUME (launched) from it.
	 */
	vmx->__launched = vmx->loaded_vmcs->launched;
	asm(
		/* Store host registers */
		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
		/*
		 * rcx is pushed twice: one slot keeps the vmx pointer, the
		 * other is a placeholder that receives the guest rcx after
		 * the VM-exit.
		 */
		"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
		"push %%" _ASM_CX " \n\t"
		/*
		 * %c prints a constant operand without the '$' immediate
		 * prefix of AT&T syntax, so it can be used as a displacement.
		 * [host_rsp] is a named input operand defined below and %0 is
		 * the first operand, vmx (passed in rcx/ecx).  Hence
		 * %c[host_rsp](%0) addresses vmx->host_rsp.
		 *
		 * Compare the current stack pointer with vmx->host_rsp; when
		 * they are equal skip the update, otherwise save the new
		 * value in vmx->host_rsp and in the VMCS.
		 */
		"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		"je 1f \n\t"
		"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		/*
		 * VMWRITE rsp into the VMCS field whose encoding is in rdx.
		 * rdx holds HOST_RSP through the "d" input constraint below,
		 * so this stores rsp in the HOST_RSP field.  __ex() is defined
		 * elsewhere; per the article author it handles a faulting
		 * instruction by rebooting.
		 */
		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
		"1: \n\t"
		/*
		 * Reload cr2 if changed: load the CR2 value saved in vmx and
		 * write it to the CR2 register only when it differs from the
		 * current one.
		 */
		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
		"mov %%cr2, %%" _ASM_DX " \n\t"
		"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
		"je 2f \n\t"
		"mov %%" _ASM_AX", %%cr2 \n\t"
		"2: \n\t"
		/*
		 * Check if vmlaunch or vmresume is needed: test
		 * vmx->__launched here, branch on the result further down.
		 */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers from vmx.  Don't clobber flags. */
		"mov %c[rax](%0), %%" _ASM_AX " \n\t"
		"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
		"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
		"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
		"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
		"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
		"mov %c[r8](%0), %%r8 \n\t"
		"mov %c[r9](%0), %%r9 \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
#endif
		"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

		/* Enter guest mode */
		"jne 1f \n\t"
		/* Not launched yet: VMLAUNCH enters the guest. */
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp 2f \n\t"
		/* Already launched: VMRESUME re-enters the guest. */
		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
		"2: "
		/*
		 * Save guest registers, load host registers, keep flags.
		 * The guest rcx goes into the stack placeholder first, then
		 * the pop restores the vmx pointer into %0.
		 */
		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
		"pop %0 \n\t"
		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
		/* Pop the placeholder (guest rcx) straight into vmx. */
		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
		"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
		"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
		"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
		"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
		"mov %%r8, %c[r8](%0) \n\t"
		"mov %%r9, %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
#endif
		"mov %%cr2, %%" _ASM_AX " \n\t"
		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

		"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
		/*
		 * Record in vmx->fail whether CF or ZF is set, i.e. whether
		 * the VMLAUNCH/VMRESUME itself failed (the flags were kept
		 * intact by the mov/pop sequence above).
		 */
		"setbe %c[fail](%0) \n\t"
		".pushsection .rodata \n\t"
		".global vmx_return \n\t"
		"vmx_return: " _ASM_PTR " 2b \n\t"
		".popsection"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		/* Named operand: referenced above as %c[host_rsp]. */
		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
		[wordsize]"i"(sizeof(ulong))
	      /*
	       * Clobber list: "cc" = condition-code flags are modified,
	       * "memory" = memory is read/written behind the compiler's back.
	       */
	      : "cc", "memory"
#ifdef CONFIG_X86_64
		, "rax", "rbx", "rdi", "rsi"
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
		, "eax", "ebx", "edi", "esi"
#endif
	      );

	/* Reaching this point means a VM-exit happened: back in root mode. */

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (debugctlmsr)
		update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_load_host_state() since that function
	 * may be executed in interrupt context, which saves and restore segments
	 * around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	/*
	 * Mark RIP, RSP and the listed extended registers as not available
	 * in the register cache (presumably they are re-read from the VMCS
	 * on demand), and clear all dirty bits.
	 */
	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
				  | (1 << VCPU_EXREG_RFLAGS)
				  | (1 << VCPU_EXREG_CPL)
				  | (1 << VCPU_EXREG_PDPTR)
				  | (1 << VCPU_EXREG_SEGMENTS)
				  | (1 << VCPU_EXREG_CR3));
	vcpu->arch.regs_dirty = 0;

	/*
	 * Read the IDT-vectoring information field from the VMCS: it
	 * describes an event that was being delivered through the IDT when
	 * the VM-exit occurred (it is not the interrupt vector table).
	 */
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	/* This VMCS is launched now; the next entry takes the VMRESUME path. */
	vmx->loaded_vmcs->launched = 1;

	/* Read the VM-exit reason that the hardware stored in the VMCS. */
	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

	/* Per the article author: handle machine-check exceptions and NMIs. */
	vmx_complete_atomic_exit(vmx);
	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);
}
KVM虚拟机通过字符设备/dev/kvm的ioctl接口创建和运行,相关原理见之前的文章说明。
虚拟机的运行通过/dev/kvm设备ioctl VCPU接口的KVM_RUN指令实现,在VM和VCPU创建好并完成初始化后,就可以调度该虚拟机运行了,通常,一个VCPU对应于一个线程,虚拟机运行的本质为调度该虚拟机相关的VCPU所在线程运行。虚拟机(VCPU)的运行主要任务是要进行上下文切换,上下文主要包括相关寄存器、APIC状态、TLB等,通常上下文切换的过程如下:
1、 保存当前的上下文。
2、 使用kvm_vcpu结构体中的上下文信息,加载到物理CPU中。
3、 执行kvm_x86_ops中的run函数(Intel平台上为vmx_vcpu_run),调用硬件相关的指令(如VMLAUNCH),进入虚拟机运行环境中。
虚拟机运行于qemu-kvm的进程上下文中,从硬件的角度看,虚拟机的运行过程,实质为相关指令的执行过程,虚拟机中的程序编译后也就是相应的CPU指令序列,而虚拟机的指令跟Host机的指令执行过程并没有太多的差别,最关键的差别为“敏感指令”(通常为IO、内存等关键操作)的执行,这也是虚拟化实现的本质所在,当在虚拟机中(Guest模式)执行“敏感指令”时,会触发(由硬件触发)VM-exit,使当前CPU从Guest模式(non-root模式)切换到root模式,当前CPU的控制权随之转交给VMM(Hypervisor,KVM中即Host),由VMM进行相应的处理,处理完成后再次通过相应的硬件指令(首次进入用VMLAUNCH,之后用VMRESUME),重新进入到Guest模式,从而进入虚拟机运行环境中继续运行。
本文简单解释及分析在3.10版本内核代码中的相关流程,用户态qemu-kvm部分暂不包括。
2、大致流程:
Qemu-kvm可以通过ioctl(KVM_RUN…)使虚拟机运行,最终进入内核态,由KVM相关内核流程处理,在内核态执行的大致过程如下:
kvm_vcpu_ioctl -->
kvm_arch_vcpu_ioctl_run
具体由内核函数kvm_arch_vcpu_ioctl_run完成相关工作。主要流程如下:
1、 Sigprocmask()屏蔽信号,防止在此过程中受到信号的干扰。
2、 检查当前VCPU状态:若为KVM_MP_STATE_UNINITIALIZED(尚未初始化),则阻塞该VCPU,被唤醒后返回-EAGAIN,不进入Guest
3、 配置APIC和mmio相关信息
4、 将VCPU中保存的上下文信息写入指定位置
5、 然后的工作交由__vcpu_run完成
6、 __vcpu_run最终调用vcpu_enter_guest,该函数实现了进入Guest,并执行Guest
OS具体指令的操作。
7、 vcpu_enter_guest最终调用kvm_x86_ops中的run函数运行。对应于Intel平台,该函数为vmx_vcpu_run(设置Guest
CR3和其他寄存器、EPT/影子页表相关设置、汇编代码VMLAUNCH切换到非根模式,执行Guest目标代码)。
8、 Guest代码执行到敏感指令或因其他原因(比如中断/异常),VM-Exit退出非根模式,返回到vcpu_enter_guest函数继续执行。
9、 vcpu_enter_guest函数中会判断VM-Exit原因,并进行相应处理。
10、处理完成后VM-Entry到Guest重新执行Guest代码,或重新等待下次调度。
3、代码分析
kvm_vcpu_ioctl():
点击(此处)折叠或打开
/*
* kvm
ioctl VCPU指令的入口,传入的fd为KVM_CREATE_VCPU中返回的fd。
* 主要针对具体的VCPU进行参数设置。如:相关寄存器的读
* 写、中断控制等
*/
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned
long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void
__user *)arg;
int r;
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;
if (vcpu->kvm->mm != current->mm)
return -EIO;
#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
* Special
cases: vcpu ioctls that are asynchronous to vcpu execution,
* so
vcpu_load() would
break it.
*/
if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif
// KVM虚拟机VCPU数据结构载入物理CPU
r = vcpu_load(vcpu);
if (r)
return r;
switch (ioctl) {
/*
* 运行虚拟机,最终通过执行VMLAUNCH指令进入non
root模式,
* 进入虚拟机运行。当虚拟机内部执行敏感指令时,由硬
* 件触发VM-exit,返回到root模式
*/
case KVM_RUN:
r = -EINVAL;
// 不能带参数。
if (arg)
goto out;
// 运行VCPU(即运行虚拟机)的入口函数
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
...
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run():
点击(此处)折叠或打开
/*
 * __vcpu_run() - main run loop of a vCPU on the KVM_RUN path
 * (kvm_vcpu_ioctl() --> kvm_arch_vcpu_ioctl_run() --> __vcpu_run()).
 *
 * Holds kvm->srcu for reading (index kept in vcpu->srcu_idx) while looping.
 * Each iteration either enters the guest through vcpu_enter_guest() or
 * blocks the vCPU, then looks at pending timers, userspace interrupt
 * injection requests, async page fault completions, signals and
 * rescheduling.
 *
 * Returns the non-zero result of vapic_enter() if that call fails;
 * otherwise the value of r that ended the loop (always <= 0), e.g. -EINTR
 * when a signal is pending for the current task.
 */
static int __vcpu_run(struct
kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
/* Per the article author: sets up vcpu->arch.apic->vapic_page. */
r = vapic_enter(vcpu);
if (r) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
return r;
}
r = 1;
while (r > 0) {
/* Only a RUNNABLE vCPU that is not halted for an async page fault may enter the guest. */
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
/* Enter guest mode (ends in VMLAUNCH/VMRESUME on Intel, see vmx_vcpu_run()). */
r = vcpu_enter_guest(vcpu);
else {/* Reached when mp_state is not RUNNABLE or apf.halted is set. */
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
/*
 * Block the vCPU with the SRCU read lock dropped.  Per the article
 * author this essentially schedule()s the thread out, but returns
 * without sleeping when e.g. a timer or a signal is pending.
 */
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
kvm_apic_accept_events(vcpu);
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
/* fall through: a vCPU leaving HALTED also clears apf.halted */
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_INIT_RECEIVED:
break;
default:
r = -EINTR;
break;
}
}
}
/* Leave the loop before touching timers/signals once r says stop. */
if (r <= 0)
break;
clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
/* Userspace asked for an interrupt injection: exit with KVM_EXIT_INTR. */
if (dm_request_for_irq_injection(vcpu)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
kvm_check_async_pf_completion(vcpu);
/* A pending signal for this task also ends the loop with -EINTR. */
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
/*
 * Scheduling point: when a reschedule is needed, drop the SRCU read
 * lock around kvm_resched() and take it again afterwards.
 */
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
vapic_exit(vcpu);
return r;
}
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest():
点击(此处)折叠或打开
/*
 * vcpu_enter_guest() - run one guest entry / VM-exit round trip for @vcpu.
 *
 * Services pending vcpu->requests, injects pending events, loads the MMU and
 * guest FPU/XCR0 state, then calls kvm_x86_ops->run() (vmx_vcpu_run() on
 * Intel) with preemption and local interrupts disabled.  After the VM-exit it
 * restores host state and hands the exit to kvm_x86_ops->handle_exit().
 *
 * Return: the value of kvm_x86_ops->handle_exit() after a completed entry;
 * 1 when the entry was skipped or aborted (synthetic halt, INIT received,
 * pending request/resched/signal); 0 with vcpu->run->exit_reason set for
 * KVM_REQ_REPORT_TPR_ACCESS and KVM_REQ_TRIPLE_FAULT; or the non-zero result
 * of kvm_guest_time_update() / kvm_mmu_reload() on failure.  The caller,
 * __vcpu_run(), keeps looping only while the result is > 0.
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
	int r;
	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
		vcpu->run->request_interrupt_window;
	bool req_immediate_exit = false;

	/* Service the requests posted in vcpu->requests before entering. */
	if (vcpu->requests) {
		/* Unload the MMU. */
		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
			kvm_mmu_unload(vcpu);
		/* Migrate timers. */
		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
			__kvm_migrate_timers(vcpu);
		/* Update the master clock. */
		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
			kvm_gen_update_masterclock(vcpu->kvm);
		/* Update the global clock. */
		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
			kvm_gen_kvmclock_update(vcpu);
		/* Update the guest clock; a failure aborts the entry. */
		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
			r = kvm_guest_time_update(vcpu);
			if (unlikely(r))
				goto out;
		}
		/* Sync the MMU roots. */
		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
			kvm_mmu_sync_roots(vcpu);
		/* Flush the TLB. */
		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
			kvm_x86_ops->tlb_flush(vcpu);
		/* The next two requests return 0 with exit_reason set. */
		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
			r = 0;
			goto out;
		}
		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
			r = 0;
			goto out;
		}
		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
			vcpu->fpu_active = 0;
			kvm_x86_ops->fpu_deactivate(vcpu);
		}
		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
			/* Page is swapped out. Do synthetic halt */
			vcpu->arch.apf.halted = true;
			r = 1;
			goto out;
		}
		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
			record_steal_time(vcpu);
		if (kvm_check_request(KVM_REQ_NMI, vcpu))
			process_nmi(vcpu);
		if (kvm_check_request(KVM_REQ_PMU, vcpu))
			kvm_handle_pmu_event(vcpu);
		if (kvm_check_request(KVM_REQ_PMI, vcpu))
			kvm_deliver_pmi(vcpu);
		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
			vcpu_scan_ioapic(vcpu);
	}

	/* Handle an event request, or a userspace interrupt-window request. */
	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
		kvm_apic_accept_events(vcpu);
		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
			r = 1;
			goto out;
		}

		/* Inject pending events: interrupts, exceptions, NMIs. */
		inject_pending_event(vcpu);

		/*
		 * enable NMI/IRQ window open exits if needed
		 *
		 * With interrupt-window or NMI-window exiting enabled in the
		 * VMCS controls, the CPU takes a VM-exit as soon as the guest
		 * is able to accept the interrupt/NMI (see the Intel SDM
		 * chapter on VM-execution controls).  A non-zero return from
		 * the hook requests an immediate exit instead.
		 */
		if (vcpu->arch.nmi_pending)
			req_immediate_exit =
				kvm_x86_ops->enable_nmi_window(vcpu) != 0;
		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
			req_immediate_exit =
				kvm_x86_ops->enable_irq_window(vcpu) != 0;

		if (kvm_lapic_enabled(vcpu)) {
			/*
			 * Update architecture specific hints for APIC
			 * virtual interrupt delivery.
			 */
			if (kvm_x86_ops->hwapic_irr_update)
				kvm_x86_ops->hwapic_irr_update(vcpu,
					kvm_lapic_find_highest_irr(vcpu));
			update_cr8_intercept(vcpu);
			kvm_lapic_sync_to_vapic(vcpu);
		}
	}

	/* Load the MMU; on failure undo any event injection done above. */
	r = kvm_mmu_reload(vcpu);
	if (unlikely(r)) {
		goto cancel_injection;
	}

	preempt_disable();

	/* Architecture-specific preparation for the guest switch. */
	kvm_x86_ops->prepare_guest_switch(vcpu);
	if (vcpu->fpu_active)
		kvm_load_guest_fpu(vcpu);
	kvm_load_guest_xcr0(vcpu);

	vcpu->mode = IN_GUEST_MODE;

	/* We should set ->mode before check ->requests,
	 * see the comment in make_all_cpus_request.
	 */
	smp_mb();

	local_irq_disable();

	/*
	 * Abort the entry if the vCPU was asked to leave guest mode, new
	 * requests arrived, a reschedule is needed, or a signal is pending
	 * for the current task.
	 */
	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
	    || need_resched() || signal_pending(current)) {
		vcpu->mode = OUTSIDE_GUEST_MODE;
		smp_wmb();
		local_irq_enable();
		preempt_enable();
		r = 1;
		goto cancel_injection;
	}

	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);

	/*
	 * req_immediate_exit was set above when enable_nmi_window() or
	 * enable_irq_window() returned non-zero.  Send a reschedule IPI to
	 * this vCPU's CPU; presumably the pending IPI forces a VM-exit right
	 * after the entry below.
	 */
	if (req_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	/* Guest-enter accounting. */
	kvm_guest_enter();

	/* Debugging: load the effective guest debug registers. */
	if (unlikely(vcpu->arch.switch_db_regs)) {
		set_debugreg(0, 7);
		set_debugreg(vcpu->arch.eff_db[0], 0);
		set_debugreg(vcpu->arch.eff_db[1], 1);
		set_debugreg(vcpu->arch.eff_db[2], 2);
		set_debugreg(vcpu->arch.eff_db[3], 3);
	}

	trace_kvm_entry(vcpu->vcpu_id);
	/* Architecture run hook (vmx_vcpu_run() on Intel): enters the guest. */
	kvm_x86_ops->run(vcpu);
	/* From here on a VM-exit has happened and we are back in the host. */

	/*
	 * If the guest has used debug registers, at least dr7
	 * will be disabled while returning to the host.
	 * If we don't have active breakpoints in the host, we don't
	 * care about the messed up debug address registers. But if
	 * we have some of them active, restore the old state.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();

	/* Record the TSC value at the time the guest exited. */
	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
							   native_read_tsc());

	vcpu->mode = OUTSIDE_GUEST_MODE;
	smp_wmb();

	/* Interrupt is enabled by handle_external_intr() */
	kvm_x86_ops->handle_external_intr(vcpu);

	++vcpu->stat.exits;

	/*
	 * We must have an instruction between local_irq_enable() and
	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
	 * the interrupt shadow.  The stat.exits increment will do nicely.
	 * But we need to prevent reordering, hence this barrier():
	 */
	barrier();

	/*
	 * Guest-exit accounting.  Interrupts are already back on at this
	 * point: they were enabled by handle_external_intr() above.
	 */
	kvm_guest_exit();

	preempt_enable();

	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * Profile KVM exit RIPs (sample counting for performance analysis):
	 */
	if (unlikely(prof_on == KVM_PROFILING)) {
		unsigned long rip = kvm_rip_read(vcpu);
		profile_hit(KVM_PROFILING, (void *)rip);
	}

	if (unlikely(vcpu->arch.tsc_always_catchup))
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

	if (vcpu->arch.apic_attention)
		kvm_lapic_sync_from_vapic(vcpu);

	/*
	 * Dispatch the VM-exit to the architecture handler
	 * (vmx_handle_exit() on Intel).  The exit reason and related
	 * information were already captured by the run hook.
	 */
	r = kvm_x86_ops->handle_exit(vcpu);
	return r;

cancel_injection:
	kvm_x86_ops->cancel_injection(vcpu);
	if (unlikely(vcpu->arch.apic_attention))
		kvm_lapic_sync_from_vapic(vcpu);
out:
	return r;
}
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest()-->vmx_vcpu_run():
点击(此处)折叠或打开
/*
 * vmx_vcpu_run() - Intel VMX implementation of kvm_x86_ops->run(): enter
 * the guest (VMX non-root mode) and return after the next VM-exit.
 *
 * Writes dirty RSP/RIP values into the VMCS, saves host registers on the
 * stack, loads the guest general-purpose registers and CR2 from @vcpu, and
 * executes VMLAUNCH (first entry on this VMCS) or VMRESUME (later entries).
 * After the VM-exit it stores the guest registers back, restores the host
 * registers and reads the exit information from the VMCS into the vcpu_vmx.
 *
 * Returns early, without entering the guest, when vmx->emulation_required
 * is set.
 */
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long debugctlmsr;

	/*
	 * Record the guest's net vcpu time for enforced NMI injections.
	 * Only needed when the CPU has no virtual-NMI support and NMIs are
	 * currently soft-blocked.
	 */
	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
		vmx->entry_time = ktime_get();

	/* Don't enter VMX if guest state is invalid, let the exit handler
	   start emulation until we arrive back to a valid state */
	if (vmx->emulation_required)
		return;

	if (vmx->nested.sync_shadow_vmcs) {
		copy_vmcs12_to_shadow(vmx);
		vmx->nested.sync_shadow_vmcs = false;
	}

	/* Write the guest RSP into the VMCS if the cached copy is dirty. */
	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	/* Write the guest RIP into the VMCS if the cached copy is dirty. */
	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	/* I.e. this clears the interrupt shadow; it does not mask interrupts. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	atomic_switch_perf_msrs(vmx);
	debugctlmsr = get_debugctlmsr();

	/*
	 * Snapshot whether this VMCS has already been launched; the asm
	 * below picks VMLAUNCH (not launched) or VMRESUME (launched) from it.
	 */
	vmx->__launched = vmx->loaded_vmcs->launched;
	asm(
		/* Store host registers */
		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
		/*
		 * rcx is pushed twice: one slot keeps the vmx pointer, the
		 * other is a placeholder that receives the guest rcx after
		 * the VM-exit.
		 */
		"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
		"push %%" _ASM_CX " \n\t"
		/*
		 * %c prints a constant operand without the '$' immediate
		 * prefix of AT&T syntax, so it can be used as a displacement.
		 * [host_rsp] is a named input operand defined below and %0 is
		 * the first operand, vmx (passed in rcx/ecx).  Hence
		 * %c[host_rsp](%0) addresses vmx->host_rsp.
		 *
		 * Compare the current stack pointer with vmx->host_rsp; when
		 * they are equal skip the update, otherwise save the new
		 * value in vmx->host_rsp and in the VMCS.
		 */
		"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		"je 1f \n\t"
		"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
		/*
		 * VMWRITE rsp into the VMCS field whose encoding is in rdx.
		 * rdx holds HOST_RSP through the "d" input constraint below,
		 * so this stores rsp in the HOST_RSP field.  __ex() is defined
		 * elsewhere; per the article author it handles a faulting
		 * instruction by rebooting.
		 */
		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
		"1: \n\t"
		/*
		 * Reload cr2 if changed: load the CR2 value saved in vmx and
		 * write it to the CR2 register only when it differs from the
		 * current one.
		 */
		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
		"mov %%cr2, %%" _ASM_DX " \n\t"
		"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
		"je 2f \n\t"
		"mov %%" _ASM_AX", %%cr2 \n\t"
		"2: \n\t"
		/*
		 * Check if vmlaunch or vmresume is needed: test
		 * vmx->__launched here, branch on the result further down.
		 */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers from vmx.  Don't clobber flags. */
		"mov %c[rax](%0), %%" _ASM_AX " \n\t"
		"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
		"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
		"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
		"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
		"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
		"mov %c[r8](%0), %%r8 \n\t"
		"mov %c[r9](%0), %%r9 \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
#endif
		"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

		/* Enter guest mode */
		"jne 1f \n\t"
		/* Not launched yet: VMLAUNCH enters the guest. */
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp 2f \n\t"
		/* Already launched: VMRESUME re-enters the guest. */
		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
		"2: "
		/*
		 * Save guest registers, load host registers, keep flags.
		 * The guest rcx goes into the stack placeholder first, then
		 * the pop restores the vmx pointer into %0.
		 */
		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
		"pop %0 \n\t"
		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
		/* Pop the placeholder (guest rcx) straight into vmx. */
		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
		"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
		"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
		"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
		"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
		"mov %%r8, %c[r8](%0) \n\t"
		"mov %%r9, %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
#endif
		"mov %%cr2, %%" _ASM_AX " \n\t"
		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

		"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
		/*
		 * Record in vmx->fail whether CF or ZF is set, i.e. whether
		 * the VMLAUNCH/VMRESUME itself failed (the flags were kept
		 * intact by the mov/pop sequence above).
		 */
		"setbe %c[fail](%0) \n\t"
		".pushsection .rodata \n\t"
		".global vmx_return \n\t"
		"vmx_return: " _ASM_PTR " 2b \n\t"
		".popsection"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		/* Named operand: referenced above as %c[host_rsp]. */
		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
		[wordsize]"i"(sizeof(ulong))
	      /*
	       * Clobber list: "cc" = condition-code flags are modified,
	       * "memory" = memory is read/written behind the compiler's back.
	       */
	      : "cc", "memory"
#ifdef CONFIG_X86_64
		, "rax", "rbx", "rdi", "rsi"
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
		, "eax", "ebx", "edi", "esi"
#endif
	      );

	/* Reaching this point means a VM-exit happened: back in root mode. */

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (debugctlmsr)
		update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_load_host_state() since that function
	 * may be executed in interrupt context, which saves and restore segments
	 * around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	/*
	 * Mark RIP, RSP and the listed extended registers as not available
	 * in the register cache (presumably they are re-read from the VMCS
	 * on demand), and clear all dirty bits.
	 */
	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
				  | (1 << VCPU_EXREG_RFLAGS)
				  | (1 << VCPU_EXREG_CPL)
				  | (1 << VCPU_EXREG_PDPTR)
				  | (1 << VCPU_EXREG_SEGMENTS)
				  | (1 << VCPU_EXREG_CR3));
	vcpu->arch.regs_dirty = 0;

	/*
	 * Read the IDT-vectoring information field from the VMCS: it
	 * describes an event that was being delivered through the IDT when
	 * the VM-exit occurred (it is not the interrupt vector table).
	 */
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	/* This VMCS is launched now; the next entry takes the VMRESUME path. */
	vmx->loaded_vmcs->launched = 1;

	/* Read the VM-exit reason that the hardware stored in the VMCS. */
	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

	/* Per the article author: handle machine-check exceptions and NMIs. */
	vmx_complete_atomic_exit(vmx);
	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);
}
相关文章推荐
- placeholder在不同浏览器下的兼容
- centos编辑/etc/crontab加计划任务不生效问题
- javaScript数组操作--有道笔记整理
- openssl 生成私钥、申请文件,证书导入jks说明
- Qt环境下利用OpenGL显示三维模型
- 基于thinkphp 的mysql主从配置
- highcharts 常用配置
- ps还能用脚本切片?
- 【转】C语言产生随机数
- Linux查看CPU信息
- kernel 3.10代码分析--KVM-KVM_SET_USER_MEMORY_REGION流程
- Xcode 中 NSAssert() 的用法
- 解决手机关闭屏幕线程挂起问题
- 使用pull方式解析xml文件
- C++中for循环的5种语法,你可知道?
- STL Container和ATL智能包裹类的冲突
- JavaScript学习----初步
- UUID详解
- Kettle优化就这么多
- 猫猫学iOS之去除服务器返回数据中的html标签,去除指定字符串,替换字符串