Newer
Older
/*
* Kernel-based Virtual Machine driver for Linux
*
* derived from drivers/kvm/kvm_main.c
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <linux/kvm_host.h>
#include "mmu.h"
#include <linux/clocksource.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#define CREATE_TRACE_POINTS
#include "trace.h"

Frederic Weisbecker
committed
#include <asm/debugreg.h>
#include <asm/desc.h>
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
*/
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
int nr;
struct kvm_shared_msr {
u32 msr;
u64 value;
} msrs[KVM_NR_SHARED_MSRS];
};
struct kvm_shared_msrs {
struct user_return_notifier urn;
bool registered;
u64 current_value[KVM_NR_SHARED_MSRS];
};
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
{ "tlb_flush", VCPU_STAT(tlb_flush) },
{ "invlpg", VCPU_STAT(invlpg) },
{ "exits", VCPU_STAT(exits) },
{ "io_exits", VCPU_STAT(io_exits) },
{ "mmio_exits", VCPU_STAT(mmio_exits) },
{ "signal_exits", VCPU_STAT(signal_exits) },
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ NULL }
};
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
struct kvm_shared_msr *global;
struct kvm_shared_msrs *locals
= container_of(urn, struct kvm_shared_msrs, urn);
for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
global = &shared_msrs_global.msrs[slot];
if (global->value != locals->current_value[slot]) {
wrmsrl(global->msr, global->value);
locals->current_value[slot] = global->value;
}
}
locals->registered = false;
user_return_notifier_unregister(urn);
}
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
int cpu;
u64 value;
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
shared_msrs_global.msrs[slot].msr = msr;
rdmsrl_safe(msr, &value);
shared_msrs_global.msrs[slot].value = value;
for_each_online_cpu(cpu)
per_cpu(shared_msrs, cpu).current_value[slot] = value;
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
static void kvm_shared_msr_cpu_online(void)
{
unsigned i;
struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
for (i = 0; i < shared_msrs_global.nr; ++i)
locals->current_value[i] = shared_msrs_global.msrs[i].value;
}
void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
if (((value ^ smsr->current_value[slot]) & mask) == 0)
return;
smsr->current_value[slot] = value;
wrmsrl(shared_msrs_global.msrs[slot].msr, value);
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
smsr->registered = true;
}
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
static void drop_user_return_notifiers(void *ignore)
{
struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
if (smsr->registered)
kvm_on_user_return(&smsr->urn);
}
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
struct desc_struct *d;
unsigned long table_base;
unsigned long v;
if (selector == 0)
return 0;
table_base = gdt.base;
if (selector & 4) { /* from ldt */
u16 ldt_selector = kvm_read_ldt();
table_base = segment_base(ldt_selector);
}
d = (struct desc_struct *)(table_base + (selector & ~7));
if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
return v;
}
EXPORT_SYMBOL_GPL(segment_base);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
/* TODO: reserve bits check */
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_base(vcpu, data);
else
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
WARN_ON(vcpu->arch.exception.pending);
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = false;
vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
switch(vcpu->arch.exception.nr) {
case DF_VECTOR:
/* triple fault -> shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
return;
case PF_VECTOR:
vcpu->arch.exception.nr = DF_VECTOR;
vcpu->arch.exception.error_code = 0;
return;
default:
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
exception */
vcpu->arch.exception.pending = false;
break;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
WARN_ON(vcpu->arch.exception.pending);
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = true;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
* a #GP and return false.
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
EXPORT_SYMBOL_GPL(kvm_require_cpl);
/*
* Load the pae pdptrs. Return true is they are all valid.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
ret = 0;
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if (is_present_gpte(pdpte[i]) &&
(pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
}
ret = 1;
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty);
out:
return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
bool changed = true;
int r;
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
return true;
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
return changed;
}
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (cr0 & CR0_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
kvm_inject_gp(vcpu, 0);
return;
}
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
kvm_inject_gp(vcpu, 0);
return;
}
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
"and a clear PE flag\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.shadow_efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu)) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while PAE is disabled\n");
kvm_inject_gp(vcpu, 0);
return;
}
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
if (cs_l) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while CS.L == 1\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
"reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
}
kvm_x86_ops->set_cr0(vcpu, cr0);
kvm_mmu_reset_context(vcpu);
return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = vcpu->arch.cr4;
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE)) {
printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
"in long mode\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (cr4 & X86_CR4_VMXE) {
printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
kvm_inject_gp(vcpu, 0);
return;
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_flush_tlb(vcpu);
return;
}
if (is_long_mode(vcpu)) {
if (cr3 & CR3_L_MODE_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else {
if (is_pae(vcpu)) {
if (cr3 & CR3_PAE_RESERVED_BITS) {
printk(KERN_DEBUG
"set_cr3: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
"reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
}
/*
* We don't check reserved bits in nonpae mode, because
* this isn't enforced, and VMware depends on this.
*/
}
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
* cause trouble later on when we turn on paging anyway.)
*
* A real CPU would silently accept an invalid cr3 and would
* attempt to use it - with largely undefined (and often hard
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
kvm_inject_gp(vcpu, 0);
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
kvm_inject_gp(vcpu, 0);
return;
}
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
return kvm_lapic_get_cr8(vcpu);
else
EXPORT_SYMBOL_GPL(kvm_get_cr8);
static inline u32 bit(int bitno)
{
return 1 << (bitno & 31);
}
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
* kvm-specific. Those are put in the beginning of the list.
#define KVM_SAVE_MSRS_BEGIN 2
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
};
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits) {
printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
efer);
kvm_inject_gp(vcpu, 0);
return;
}
if (is_paging(vcpu)
&& (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (efer & EFER_FFXSR) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
kvm_inject_gp(vcpu, 0);
return;
}
}
if (efer & EFER_SVME) {
struct kvm_cpuid_entry2 *feat;
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
kvm_inject_gp(vcpu, 0);
return;
}
}
kvm_x86_ops->set_efer(vcpu, efer);
efer &= ~EFER_LMA;
efer |= vcpu->arch.shadow_efer & EFER_LMA;
vcpu->arch.shadow_efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
void kvm_enable_efer_bits(u64 mask)
{
efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
return kvm_set_msr(vcpu, index, *data);
}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
struct pvclock_wall_clock wc;
struct timespec now, sys, boot;
if (!wall_clock)
return;
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_write_guest_time below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
now = current_kernel_time();
ktime_get_ts(&sys);
boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
uint32_t quotient, remainder;
/* Don't try to replace with do_div(), this one calculates
* "(dividend << 32) / divisor" */
__asm__ ( "divl %4"
: "=a" (quotient), "=d" (remainder)
: "0" (0), "1" (dividend), "r" (divisor) );
return quotient;
}
static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
uint64_t nsecs = 1000000000LL;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
tps64 = tsc_khz * 1000LL;
while (tps64 > nsecs*2) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
while (tps32 <= (uint32_t)nsecs) {
tps32 <<= 1;
shift++;
}
hv_clock->tsc_shift = shift;
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
__func__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
unsigned long this_tsc_khz;
if ((!vcpu->time_page))
return;
this_tsc_khz = get_cpu_var(cpu_tsc_khz);
if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = this_tsc_khz;
put_cpu_var(cpu_tsc_khz);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);

Jaswinder Singh Rajput
committed
kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
local_irq_restore(flags);
/* With all the info we got, fill in the values */
vcpu->hv_clock.system_time = ts.tv_nsec +
(NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
* state, we just increase by 2 at the end.
vcpu->hv_clock.version += 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
static int kvm_request_guest_time_update(struct kvm_vcpu *v)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
if (!vcpu->time_page)
return 0;
set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
return 1;
}
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
case MSR_MTRRfix64K_00000:
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
case MSR_MTRRfix4K_D8000:
case MSR_MTRRfix4K_E0000:
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
case MSR_MTRRdefType:
case MSR_IA32_CR_PAT:
return true;
case 0x2f8:
return true;
}
return false;
}
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
static bool valid_pat_type(unsigned t)
{
return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
}
static bool valid_mtrr_type(unsigned t)
{
return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
}
static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
int i;
if (!msr_mtrr_valid(msr))
return false;
if (msr == MSR_IA32_CR_PAT) {
for (i = 0; i < 8; i++)
if (!valid_pat_type((data >> (i * 8)) & 0xff))
return false;
return true;
} else if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
return valid_mtrr_type(data & 0xff);
} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
for (i = 0; i < 8 ; i++)
if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
return false;
return true;
}
/* variable MTRRs */
return valid_mtrr_type(data & 0xff);
}
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
if (msr == MSR_MTRRdefType) {
vcpu->arch.mtrr_state.def_type = data;
vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
} else if (msr == MSR_MTRRfix64K_00000)
p[0] = data;
else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
p[1 + msr - MSR_MTRRfix16K_80000] = data;
else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
p[3 + msr - MSR_MTRRfix4K_C0000] = data;
else if (msr == MSR_IA32_CR_PAT)
vcpu->arch.pat = data;
else { /* Variable MTRRs */
int idx, is_mtrr_mask;
u64 *pt;
idx = (msr - 0x200) / 2;
is_mtrr_mask = msr - 0x200 - 2 * idx;
if (!is_mtrr_mask)
pt =
(u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
else
pt =
(u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
*pt = data;
}
kvm_mmu_reset_context(vcpu);
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
switch (msr) {
case MSR_IA32_MCG_STATUS:
if (!(mcg_cap & MCG_CTL_P))
return 1;
if (data != 0 && data != ~(u64)0)
return -1;
vcpu->arch.mcg_ctl = data;
break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
u32 offset = msr - MSR_IA32_MC0_CTL;
/* only 0 or all 1s can be written to IA32_MCi_CTL */
if ((offset & 0x3) == 0 &&
data != 0 && data != ~(u64)0)
return -1;
vcpu->arch.mce_banks[offset] = data;
break;
}
return 1;
}
return 0;
}
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
int lm = is_long_mode(vcpu);
u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
: kvm->arch.xen_hvm_config.blob_size_32;
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
u8 *page;
int r;
r = -E2BIG;
if (page_num >= blob_size)
goto out;
r = -ENOMEM;
page = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!page)
goto out;
r = -EFAULT;
if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
goto out_free;
if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
out_free:
kfree(page);
out:
return r;
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
set_efer(vcpu, data);
break;
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
return 1;
}
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
"0x%llx\n", data);
return 1;
}
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
break;
} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
}
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE: