Newer
Older
{
struct kvm_segment kvm_seg;
kvm_get_segment(vcpu, &kvm_seg, seg);
return kvm_seg.selector;
}
static void emulator_set_segment_selector(u16 sel, int seg,
struct kvm_vcpu *vcpu)
{
struct kvm_segment kvm_seg;
kvm_get_segment(vcpu, &kvm_seg, seg);
kvm_seg.selector = sel;
kvm_set_segment(vcpu, &kvm_seg, seg);
}
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
.pio_in_emulated = emulator_pio_in_emulated,
.pio_out_emulated = emulator_pio_out_emulated,
.get_cached_descriptor = emulator_get_cached_descriptor,
.set_cached_descriptor = emulator_set_cached_descriptor,
.get_segment_selector = emulator_get_segment_selector,
.set_segment_selector = emulator_set_segment_selector,
.get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_idt = emulator_get_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
.set_msr = kvm_set_msr,
.get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
{
kvm_register_read(vcpu, VCPU_REGS_RAX);
kvm_register_read(vcpu, VCPU_REGS_RSP);
kvm_register_read(vcpu, VCPU_REGS_RIP);
vcpu->arch.regs_dirty = ~0;
}
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
* not, left the system with the INT_STI flag enabled, it
* means that the last instruction is an sti. We should not
* leave the flag on in this case. The same goes for mov ss
*/
if (!(int_shadow & mask))
kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
}
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception == PF_VECTOR)
kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
else if (ctxt->error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
else
kvm_queue_exception(vcpu, ctxt->exception);
}
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int cs_db, cs_l;
cache_all_regs(vcpu);
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
vcpu->arch.emulate_ctxt.vcpu = vcpu;
vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
vcpu->arch.emulate_ctxt.mode =
(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
memset(c, 0, sizeof(struct decode_cache));
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
}
static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
kvm_queue_exception(vcpu, UD_VECTOR);
return EMULATE_FAIL;
}

Gleb Natapov
committed
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
if (tdp_enabled)
return false;

Gleb Natapov
committed
/*
* if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-entetr the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
return true;
gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
if (gpa == UNMAPPED_GVA)
return true; /* let cpu generate fault */
if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
return true;
return false;
}
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
* TODO: fix emulate.c to use guest_read/write_register
* instead of direct ->regs accesses, can save hundred cycles
* on Intel for instructions that don't read/change RSP, for
* for example.
*/
cache_all_regs(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.interruptibility = 0;
vcpu->arch.emulate_ctxt.exception = -1;
vcpu->arch.emulate_ctxt.perm_ok = false;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
switch (c->b) {
case 0x01: /* VMMCALL */
if (c->modrm_mod != 3 || c->modrm_rm != 1)
return EMULATE_FAIL;
break;
case 0x34: /* sysenter */
case 0x35: /* sysexit */
if (c->modrm_mod != 0 || c->modrm_rm != 0)
return EMULATE_FAIL;
break;
case 0x05: /* syscall */
if (c->modrm_mod != 0 || c->modrm_rm != 0)
return EMULATE_FAIL;
break;
default:
return EMULATE_FAIL;
}
if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
return EMULATE_FAIL;
}
if (r) {

Gleb Natapov
committed
if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu);
}
}
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
return EMULATE_DONE;
}
/* this is needed for vmware backdor interface to work since it
changes registers values during IO operation */
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
if (r == EMULATION_FAILED) {

Gleb Natapov
committed
if (reexecute_instruction(vcpu, cr2))

Gleb Natapov
committed
return EMULATE_DONE;
return handle_emulation_failure(vcpu);
}
if (vcpu->arch.emulate_ctxt.exception >= 0) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
r = EMULATE_DO_MMIO;
} else if (vcpu->mmio_needed) {
if (vcpu->mmio_is_write)
vcpu->mmio_needed = 0;
r = EMULATE_DO_MMIO;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;
toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
return r;
EXPORT_SYMBOL_GPL(emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
/* do not return to emulator after return from userspace */
vcpu->arch.pio.count = 0;
return ret;
}
EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static void tsc_bad(void *info)
{
__get_cpu_var(cpu_tsc_khz) = 0;
}
static void tsc_khz_changed(void *data)
struct cpufreq_freqs *freq = data;
unsigned long khz = 0;
if (data)
khz = freq->new;
else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
khz = tsc_khz;
__get_cpu_var(cpu_tsc_khz) = khz;
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
/*
* We allow guests to temporarily run on slowing clocks,
* provided we notify them after, or to run on accelerating
* clocks, provided we notify them before. Thus time never
* goes backwards.
*
* However, we have a problem. We can't atomically update
* the frequency of a given CPU from this function; it is
* merely a notifier, which can be called from any CPU.
* Changing the TSC frequency at arbitrary points in time
* requires a recomputation of local variables related to
* the TSC for each VCPU. We must flag these local variables
* to be updated and be sure the update takes place with the
* new frequency before any guests proceed.
*
* Unfortunately, the combination of hotplug CPU and frequency
* change creates an intractable locking scenario; the order
* of when these callouts happen is undefined with respect to
* CPU hotplug, and they can race with each other. As such,
* merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
* undefined; you can actually have a CPU frequency change take
* place in between the computation of X and the setting of the
* variable. To protect against this problem, all updates of
* the per_cpu tsc_khz variable are done in an interrupt
* protected IPI, and all callers wishing to update the value
* must wait for a synchronous IPI to complete (which is trivial
* if the caller is on the CPU already). This establishes the
* necessary total order on variable updates.
*
* Note that because a guest time update may take place
* anytime after the setting of the VCPU's request bit, the
* correct TSC value must be set before the request. However,
* to ensure the update actually makes it to any guest which
* starts running in hardware virtualization between the set
* and the acquisition of the spinlock, we must also ping the
* CPU after setting the request bit.
*
*/
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
if (vcpu->cpu != freq->cpu)
continue;
if (!kvm_request_guest_time_update(vcpu))
continue;
if (vcpu->cpu != smp_processor_id())
}
}
spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
* We upscale the frequency. Must make the guest
* doesn't see old kvmclock values while running with
* the new frequency, otherwise we risk the guest sees
* time go backwards.
*
* In case we update the frequency for another cpu
* (which might be in guest context) send an interrupt
* to kick the cpu out of guest context. Next time
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
}
return 0;
}
static struct notifier_block kvmclock_cpufreq_notifier_block = {
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
.notifier_call = kvmclock_cpufreq_notifier
};
static int kvmclock_cpu_notifier(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, tsc_bad, NULL, 1);
break;
}
return NOTIFY_OK;
}
static struct notifier_block kvmclock_cpu_notifier_block = {
.notifier_call = kvmclock_cpu_notifier,
.priority = -INT_MAX
static void kvm_timer_init(void)
{
int cpu;
register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
for_each_online_cpu(cpu)
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
static int kvm_is_in_guest(void)
{
return percpu_read(current_vcpu) != NULL;
}
static int kvm_is_user_mode(void)
{
int user_mode = 3;
if (percpu_read(current_vcpu))
user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
return user_mode != 0;
}
static unsigned long kvm_get_guest_ip(void)
{
unsigned long ip = 0;
if (percpu_read(current_vcpu))
ip = kvm_rip_read(percpu_read(current_vcpu));
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
return ip;
}
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
};
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
{
percpu_write(current_vcpu, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
{
percpu_write(current_vcpu, NULL);
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
int kvm_arch_init(void *opaque)
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
printk(KERN_ERR "kvm: no hardware support\n");
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
printk(KERN_ERR "kvm: disabled by bios\n");
r = -EOPNOTSUPP;
goto out;
}
r = kvm_mmu_module_init();
if (r)
goto out;
kvm_init_msr_list();
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
if (cpu_has_xsave)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
return 0;
out:
return r;
void kvm_arch_exit(void)
{
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
return 0;
}
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
unsigned long a1)
{
if (is_long_mode(vcpu))
return a0;
else
return a0 | ((gpa_t)a1 << 32);
}
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
bool fast, longmode;
int cs_db, cs_l;
/*
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 0;
}
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
longmode = is_long_mode(vcpu) && cs_l == 1;
if (!longmode) {
param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
(kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
(kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
(kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
}
#ifdef CONFIG_X86_64
else {
param = kvm_register_read(vcpu, VCPU_REGS_RCX);
ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
}
#endif
code = param & 0xffff;
fast = (param >> 16) & 0x1;
rep_cnt = (param >> 32) & 0xfff;
rep_idx = (param >> 48) & 0xfff;
trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
switch (code) {
case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
kvm_vcpu_on_spin(vcpu);
break;
default:
res = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
ret = res | (((u64)rep_done & 0xfff) << 32);
if (longmode) {
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
} else {
kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
}
return 1;
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
a2 &= 0xFFFFFFFF;
a3 &= 0xFFFFFFFF;
}
if (kvm_x86_ops->get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
case KVM_HC_MMU_OP:
r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
break;
default:
ret = -KVM_ENOSYS;
break;
}
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
* to ensure that the updated hypercall appears atomically across all
* VCPUs.
*/
kvm_mmu_zap_all(vcpu->kvm);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_gdt(vcpu, &dt);
}
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_idt(vcpu, &dt);
}
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
int j, nent = vcpu->arch.cpuid_nent;
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
for (j = i + 1; ; j = (j + 1) % nent) {
struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
if (ej->function == e->function) {
ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
return j;
}
}
return 0; /* silence gcc, even though control never reaches here */
}
/* find an entry with matching function, matching index (if needed), and that
* should be read next (if it's stateful) */
static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
u32 function, u32 index)
{
if (e->function != function)
return 0;
if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
return 0;
if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
!(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
return 0;
return 1;
}
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index)
{
int i;
struct kvm_cpuid_entry2 *best = NULL;
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
struct kvm_cpuid_entry2 *e;
e = &vcpu->arch.cpuid_entries[i];
if (is_matching_cpuid_entry(e, function, index)) {
if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
move_to_next_stateful_cpuid_entry(vcpu, i);
best = e;
break;
}
/*
* Both basic or both extended?
*/
if (((e->function ^ function) & 0x80000000) == 0)
if (!best || e->function > best->function)
best = e;
}
return best;
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
if (!best || best->eax < 0x80000008)
goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best)
return best->eax & 0xff;
return 36;
}
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 function, index;
struct kvm_cpuid_entry2 *best;
function = kvm_register_read(vcpu, VCPU_REGS_RAX);
index = kvm_register_read(vcpu, VCPU_REGS_RCX);
kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = kvm_find_cpuid_entry(vcpu, function, index);
if (best) {
kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
trace_kvm_cpuid(function,
kvm_register_read(vcpu, VCPU_REGS_RAX),
kvm_register_read(vcpu, VCPU_REGS_RBX),
kvm_register_read(vcpu, VCPU_REGS_RCX),
kvm_register_read(vcpu, VCPU_REGS_RDX));
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
/*
* Check if userspace requested an interrupt window, and that the
* interrupt window is open.
*
* No need to exit to userspace if we already have an interrupt queued.
*/
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&

Gleb Natapov
committed
kvm_arch_interrupt_allowed(vcpu));
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection = 1;
kvm_run->ready_for_interrupt_injection =

Gleb Natapov
committed
kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu);
static void vapic_enter(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct page *page;
if (!apic || !apic->vapic_addr)
return;
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
vcpu->arch.apic->vapic_page = page;
}
static void vapic_exit(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic || !apic->vapic_addr)
return;
idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
{
int max_irr, tpr;
if (!kvm_x86_ops->update_cr8_intercept)
return;
if (!vcpu->arch.apic)
return;
if (!vcpu->arch.apic->vapic_addr)
max_irr = kvm_lapic_find_highest_irr(vcpu);
else
max_irr = -1;
if (max_irr != -1)
max_irr >>= 4;
tpr = kvm_lapic_get_cr8(vcpu);
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
trace_kvm_inj_exception(vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.reinject);
return;
}
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
return;
}
if (vcpu->arch.interrupt.pending) {
return;
}
/* try to inject new event if pending */
if (vcpu->arch.nmi_pending) {
if (kvm_x86_ops->nmi_allowed(vcpu)) {
vcpu->arch.nmi_pending = false;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
} else if (kvm_cpu_has_interrupt(vcpu)) {
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
kvm_x86_ops->set_irq(vcpu);
static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
!vcpu->guest_xcr0_loaded) {
/* kvm_set_xcr() also depends on this */
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
vcpu->guest_xcr0_loaded = 1;
}
}
static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_xcr0_loaded) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
vcpu->guest_xcr0_loaded = 0;
}
}
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
if (vcpu->requests) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) {
r = kvm_write_guest_time(vcpu);
if (unlikely(r))
goto out;
}
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_x86_ops->tlb_flush(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
atomic_set(&vcpu->guest_mode, 1);
smp_wmb();
if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
|| need_resched() || signal_pending(current)) {
atomic_set(&vcpu->guest_mode, 0);
smp_wmb();
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
/* enable NMI/IRQ window open exits if needed */
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
}
trace_kvm_entry(vcpu->vcpu_id);

Frederic Weisbecker
committed
/*
* If the guest has used debug registers, at least dr7
* will be disabled while returning to the host.
* If we don't have active breakpoints in the host, we don't
* care about the messed up debug address registers. But if
* we have some of them active, restore the old state.
*/
if (hw_breakpoint_active())

Frederic Weisbecker
committed
hw_breakpoint_restore();
kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);