Newer
Older
return __kvm_set_dr(vcpu, dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
unsigned long value;
switch (cr) {
case 0:
value = kvm_read_cr0(vcpu);
break;
case 2:
value = vcpu->arch.cr2;
break;
case 3:
value = vcpu->arch.cr3;
break;
case 4:
value = kvm_read_cr4(vcpu);
break;
case 8:
value = kvm_get_cr8(vcpu);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
return value;
}
static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
int res = 0;
switch (cr) {
case 0:
res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
break;
case 4:
res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
res = __kvm_set_cr8(vcpu, val & 0xfUL);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
res = -1;
return res;
}
static int emulator_get_cpl(struct kvm_vcpu *vcpu)
{
return kvm_x86_ops->get_cpl(vcpu);
}
static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
{
kvm_x86_ops->get_gdt(vcpu, dt);
}
static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
{
kvm_x86_ops->get_idt(vcpu, dt);
}
static unsigned long emulator_get_cached_segment_base(int seg,
struct kvm_vcpu *vcpu)
{
return get_segment_base(vcpu, seg);
}
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
struct kvm_vcpu *vcpu)
{
struct kvm_segment var;
kvm_get_segment(vcpu, &var, seg);
if (var.unusable)
return false;
if (var.g)
var.limit >>= 12;
set_desc_limit(desc, var.limit);
set_desc_base(desc, (unsigned long)var.base);
desc->type = var.type;
desc->s = var.s;
desc->dpl = var.dpl;
desc->p = var.present;
desc->avl = var.avl;
desc->l = var.l;
desc->d = var.db;
desc->g = var.g;
return true;
}
static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
struct kvm_vcpu *vcpu)
{
struct kvm_segment var;
/* needed to preserve selector */
kvm_get_segment(vcpu, &var, seg);
var.base = get_desc_base(desc);
var.limit = get_desc_limit(desc);
if (desc->g)
var.limit = (var.limit << 12) | 0xfff;
var.type = desc->type;
var.present = desc->p;
var.dpl = desc->dpl;
var.db = desc->d;
var.s = desc->s;
var.l = desc->l;
var.g = desc->g;
var.avl = desc->avl;
var.present = desc->p;
var.unusable = !var.present;
var.padding = 0;
kvm_set_segment(vcpu, &var, seg);
return;
}
static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
{
struct kvm_segment kvm_seg;
kvm_get_segment(vcpu, &kvm_seg, seg);
return kvm_seg.selector;
}
static void emulator_set_segment_selector(u16 sel, int seg,
struct kvm_vcpu *vcpu)
{
struct kvm_segment kvm_seg;
kvm_get_segment(vcpu, &kvm_seg, seg);
kvm_seg.selector = sel;
kvm_set_segment(vcpu, &kvm_seg, seg);
}
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
.pio_in_emulated = emulator_pio_in_emulated,
.pio_out_emulated = emulator_pio_out_emulated,
.get_cached_descriptor = emulator_get_cached_descriptor,
.set_cached_descriptor = emulator_set_cached_descriptor,
.get_segment_selector = emulator_get_segment_selector,
.set_segment_selector = emulator_set_segment_selector,
.get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_idt = emulator_get_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
.set_msr = kvm_set_msr,
.get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
{
kvm_register_read(vcpu, VCPU_REGS_RAX);
kvm_register_read(vcpu, VCPU_REGS_RSP);
kvm_register_read(vcpu, VCPU_REGS_RIP);
vcpu->arch.regs_dirty = ~0;
}
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
* not, left the system with the INT_STI flag enabled, it
* means that the last instruction is an sti. We should not
* leave the flag on in this case. The same goes for mov ss
*/
if (!(int_shadow & mask))
kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
}
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception == PF_VECTOR)
kvm_propagate_fault(vcpu);
else if (ctxt->error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
else
kvm_queue_exception(vcpu, ctxt->exception);
}
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int cs_db, cs_l;
cache_all_regs(vcpu);
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
vcpu->arch.emulate_ctxt.vcpu = vcpu;
vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
vcpu->arch.emulate_ctxt.mode =
(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
memset(c, 0, sizeof(struct decode_cache));
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
}
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
{
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int ret;
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
if (ret != X86EMUL_CONTINUE)
return EMULATE_FAIL;
vcpu->arch.emulate_ctxt.eip = c->eip;
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (irq == NMI_VECTOR)
vcpu->arch.nmi_pending = false;
else
vcpu->arch.interrupt.pending = false;
return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
kvm_queue_exception(vcpu, UD_VECTOR);
return EMULATE_FAIL;
}

Gleb Natapov
committed
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
if (tdp_enabled)
return false;

Gleb Natapov
committed
/*
* if emulation was due to access to shadowed page table
* and it failed try to unshadow page and re-entetr the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
return true;
gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
if (gpa == UNMAPPED_GVA)
return true; /* let cpu generate fault */
if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
return true;
return false;
}
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
* TODO: fix emulate.c to use guest_read/write_register
* instead of direct ->regs accesses, can save hundred cycles
* on Intel for instructions that don't read/change RSP, for
* for example.
*/
cache_all_regs(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.interruptibility = 0;
vcpu->arch.emulate_ctxt.exception = -1;
vcpu->arch.emulate_ctxt.perm_ok = false;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
if (r == X86EMUL_PROPAGATE_FAULT)
goto done;
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
switch (c->b) {
case 0x01: /* VMMCALL */
if (c->modrm_mod != 3 || c->modrm_rm != 1)
return EMULATE_FAIL;
break;
case 0x34: /* sysenter */
case 0x35: /* sysexit */
if (c->modrm_mod != 0 || c->modrm_rm != 0)
return EMULATE_FAIL;
break;
case 0x05: /* syscall */
if (c->modrm_mod != 0 || c->modrm_rm != 0)
return EMULATE_FAIL;
break;
default:
return EMULATE_FAIL;
}
if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
return EMULATE_FAIL;
}
if (r) {

Gleb Natapov
committed
if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu);
}
}
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
return EMULATE_DONE;
}
/* this is needed for vmware backdor interface to work since it
changes registers values during IO operation */
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
if (r == EMULATION_FAILED) {

Gleb Natapov
committed
if (reexecute_instruction(vcpu, cr2))

Gleb Natapov
committed
return EMULATE_DONE;
return handle_emulation_failure(vcpu);
}
if (vcpu->arch.emulate_ctxt.exception >= 0) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
r = EMULATE_DO_MMIO;
} else if (vcpu->mmio_needed) {
if (vcpu->mmio_is_write)
vcpu->mmio_needed = 0;
r = EMULATE_DO_MMIO;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;
toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
return r;
EXPORT_SYMBOL_GPL(emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
/* do not return to emulator after return from userspace */
vcpu->arch.pio.count = 0;
return ret;
}
EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static void tsc_bad(void *info)
{
__get_cpu_var(cpu_tsc_khz) = 0;
}
static void tsc_khz_changed(void *data)
struct cpufreq_freqs *freq = data;
unsigned long khz = 0;
if (data)
khz = freq->new;
else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
khz = tsc_khz;
__get_cpu_var(cpu_tsc_khz) = khz;
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
/*
* We allow guests to temporarily run on slowing clocks,
* provided we notify them after, or to run on accelerating
* clocks, provided we notify them before. Thus time never
* goes backwards.
*
* However, we have a problem. We can't atomically update
* the frequency of a given CPU from this function; it is
* merely a notifier, which can be called from any CPU.
* Changing the TSC frequency at arbitrary points in time
* requires a recomputation of local variables related to
* the TSC for each VCPU. We must flag these local variables
* to be updated and be sure the update takes place with the
* new frequency before any guests proceed.
*
* Unfortunately, the combination of hotplug CPU and frequency
* change creates an intractable locking scenario; the order
* of when these callouts happen is undefined with respect to
* CPU hotplug, and they can race with each other. As such,
* merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
* undefined; you can actually have a CPU frequency change take
* place in between the computation of X and the setting of the
* variable. To protect against this problem, all updates of
* the per_cpu tsc_khz variable are done in an interrupt
* protected IPI, and all callers wishing to update the value
* must wait for a synchronous IPI to complete (which is trivial
* if the caller is on the CPU already). This establishes the
* necessary total order on variable updates.
*
* Note that because a guest time update may take place
* anytime after the setting of the VCPU's request bit, the
* correct TSC value must be set before the request. However,
* to ensure the update actually makes it to any guest which
* starts running in hardware virtualization between the set
* and the acquisition of the spinlock, we must also ping the
* CPU after setting the request bit.
*
*/
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
if (vcpu->cpu != freq->cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != smp_processor_id())
}
}
spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
* We upscale the frequency. Must make the guest
* doesn't see old kvmclock values while running with
* the new frequency, otherwise we risk the guest sees
* time go backwards.
*
* In case we update the frequency for another cpu
* (which might be in guest context) send an interrupt
* to kick the cpu out of guest context. Next time
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
}
return 0;
}
static struct notifier_block kvmclock_cpufreq_notifier_block = {
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
.notifier_call = kvmclock_cpufreq_notifier
};
static int kvmclock_cpu_notifier(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, tsc_bad, NULL, 1);
break;
}
return NOTIFY_OK;
}
static struct notifier_block kvmclock_cpu_notifier_block = {
.notifier_call = kvmclock_cpu_notifier,
.priority = -INT_MAX
static void kvm_timer_init(void)
{
int cpu;
register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
memset(&policy, 0, sizeof(policy));
cpu = get_cpu();
cpufreq_get_policy(&policy, cpu);
if (policy.cpuinfo.max_freq)
max_tsc_khz = policy.cpuinfo.max_freq;
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
for_each_online_cpu(cpu)
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
static int kvm_is_in_guest(void)
{
return percpu_read(current_vcpu) != NULL;
}
static int kvm_is_user_mode(void)
{
int user_mode = 3;
if (percpu_read(current_vcpu))
user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
return user_mode != 0;
}
static unsigned long kvm_get_guest_ip(void)
{
unsigned long ip = 0;
if (percpu_read(current_vcpu))
ip = kvm_rip_read(percpu_read(current_vcpu));
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
return ip;
}
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
};
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
{
percpu_write(current_vcpu, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
{
percpu_write(current_vcpu, NULL);
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
int kvm_arch_init(void *opaque)
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
printk(KERN_ERR "kvm: no hardware support\n");
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
printk(KERN_ERR "kvm: disabled by bios\n");
r = -EOPNOTSUPP;
goto out;
}
r = kvm_mmu_module_init();
if (r)
goto out;
kvm_init_msr_list();
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
if (cpu_has_xsave)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
return 0;
out:
return r;
void kvm_arch_exit(void)
{
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
return 0;
}
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
unsigned long a1)
{
if (is_long_mode(vcpu))
return a0;
else
return a0 | ((gpa_t)a1 << 32);
}
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
bool fast, longmode;
int cs_db, cs_l;
/*
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 0;
}
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
longmode = is_long_mode(vcpu) && cs_l == 1;
if (!longmode) {
param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
(kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
(kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
(kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
}
#ifdef CONFIG_X86_64
else {
param = kvm_register_read(vcpu, VCPU_REGS_RCX);
ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
}
#endif
code = param & 0xffff;
fast = (param >> 16) & 0x1;
rep_cnt = (param >> 32) & 0xfff;
rep_idx = (param >> 48) & 0xfff;
trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
switch (code) {
case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
kvm_vcpu_on_spin(vcpu);
break;
default:
res = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
ret = res | (((u64)rep_done & 0xfff) << 32);
if (longmode) {
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
} else {
kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
}
return 1;
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
trace_kvm_hypercall(nr, a0, a1, a2, a3);
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
a2 &= 0xFFFFFFFF;
a3 &= 0xFFFFFFFF;
}
if (kvm_x86_ops->get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
case KVM_HC_MMU_OP:
r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
break;
default:
ret = -KVM_ENOSYS;
break;
}
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
* to ensure that the updated hypercall appears atomically across all
* VCPUs.
*/
kvm_mmu_zap_all(vcpu->kvm);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_gdt(vcpu, &dt);
}
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_idt(vcpu, &dt);
}
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
int j, nent = vcpu->arch.cpuid_nent;
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
for (j = i + 1; ; j = (j + 1) % nent) {
struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
if (ej->function == e->function) {
ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
return j;
}
}
return 0; /* silence gcc, even though control never reaches here */
}
/* find an entry with matching function, matching index (if needed), and that
* should be read next (if it's stateful) */
static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
u32 function, u32 index)
{
if (e->function != function)
return 0;
if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
return 0;
if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
!(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
return 0;
return 1;
}
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index)
{
int i;
struct kvm_cpuid_entry2 *best = NULL;
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
struct kvm_cpuid_entry2 *e;
e = &vcpu->arch.cpuid_entries[i];
if (is_matching_cpuid_entry(e, function, index)) {
if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
move_to_next_stateful_cpuid_entry(vcpu, i);
best = e;
break;
}
/*
* Both basic or both extended?
*/
if (((e->function ^ function) & 0x80000000) == 0)
if (!best || e->function > best->function)
best = e;
}
return best;
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
if (!best || best->eax < 0x80000008)
goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best)
return best->eax & 0xff;
return 36;
}
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 function, index;
struct kvm_cpuid_entry2 *best;
function = kvm_register_read(vcpu, VCPU_REGS_RAX);
index = kvm_register_read(vcpu, VCPU_REGS_RCX);
kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = kvm_find_cpuid_entry(vcpu, function, index);
if (best) {
kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
trace_kvm_cpuid(function,
kvm_register_read(vcpu, VCPU_REGS_RAX),
kvm_register_read(vcpu, VCPU_REGS_RBX),
kvm_register_read(vcpu, VCPU_REGS_RCX),
kvm_register_read(vcpu, VCPU_REGS_RDX));
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
/*
* Check if userspace requested an interrupt window, and that the
* interrupt window is open.
*
* No need to exit to userspace if we already have an interrupt queued.
*/
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&

Gleb Natapov
committed
kvm_arch_interrupt_allowed(vcpu));
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection = 1;
kvm_run->ready_for_interrupt_injection =

Gleb Natapov
committed
kvm_arch_interrupt_allowed(vcpu) &&
!kvm_cpu_has_interrupt(vcpu) &&
!kvm_event_needs_reinjection(vcpu);
static void vapic_enter(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct page *page;
if (!apic || !apic->vapic_addr)
return;
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
vcpu->arch.apic->vapic_page = page;
}
static void vapic_exit(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic || !apic->vapic_addr)
return;
idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
srcu_read_unlock(&vcpu->kvm->srcu, idx);