Newer
Older
/*
* Kernel-based Virtual Machine driver for Linux
*
* derived from drivers/kvm/kvm_main.c
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <linux/kvm_host.h>
#include "mmu.h"
#include <linux/clocksource.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
*/
#ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
#else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
#endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
{ "tlb_flush", VCPU_STAT(tlb_flush) },
{ "invlpg", VCPU_STAT(invlpg) },
{ "exits", VCPU_STAT(exits) },
{ "io_exits", VCPU_STAT(io_exits) },
{ "mmio_exits", VCPU_STAT(mmio_exits) },
{ "signal_exits", VCPU_STAT(signal_exits) },
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ NULL }
};
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
int assigned_dev_id)
{
struct list_head *ptr;
struct kvm_assigned_dev_kernel *match;
list_for_each(ptr, head) {
match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
if (match->assigned_dev_id == assigned_dev_id)
return match;
}
return NULL;
}
static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
{
struct kvm_assigned_dev_kernel *assigned_dev;
assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
interrupt_work);
/* This is taken to safely inject irq inside the guest. When
* the interrupt injection (or the ioapic code) uses a
* finer-grained lock, update this
*/
mutex_lock(&assigned_dev->kvm->lock);
kvm_set_irq(assigned_dev->kvm,
assigned_dev->guest_irq, 1);
mutex_unlock(&assigned_dev->kvm->lock);
kvm_put_kvm(assigned_dev->kvm);
}
/* FIXME: Implement the OR logic needed to make shared interrupts on
* this line behave properly
*/
static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev =
(struct kvm_assigned_dev_kernel *) dev_id;
kvm_get_kvm(assigned_dev->kvm);
schedule_work(&assigned_dev->interrupt_work);
disable_irq_nosync(irq);
return IRQ_HANDLED;
}
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_assigned_dev_kernel *dev;
if (kian->gsi == -1)
return;
dev = container_of(kian, struct kvm_assigned_dev_kernel,
ack_notifier);
kvm_set_irq(dev->kvm, dev->guest_irq, 0);
enable_irq(dev->host_irq);
}
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
static void kvm_free_assigned_device(struct kvm *kvm,
struct kvm_assigned_dev_kernel
*assigned_dev)
{
if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
free_irq(assigned_dev->host_irq, (void *)assigned_dev);
kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
if (cancel_work_sync(&assigned_dev->interrupt_work))
/* We had pending work. That means we will have to take
* care of kvm_put_kvm.
*/
kvm_put_kvm(kvm);
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
pci_dev_put(assigned_dev->dev);
list_del(&assigned_dev->list);
kfree(assigned_dev);
}
static void kvm_free_all_assigned_devices(struct kvm *kvm)
{
struct list_head *ptr, *ptr2;
struct kvm_assigned_dev_kernel *assigned_dev;
list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
assigned_dev = list_entry(ptr,
struct kvm_assigned_dev_kernel,
list);
kvm_free_assigned_device(kvm, assigned_dev);
}
}
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
struct kvm_assigned_irq
*assigned_irq)
{
int r = 0;
struct kvm_assigned_dev_kernel *match;
mutex_lock(&kvm->lock);
match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_irq->assigned_dev_id);
if (!match) {
mutex_unlock(&kvm->lock);
return -EINVAL;
}
if (match->irq_requested) {
match->guest_irq = assigned_irq->guest_irq;
match->ack_notifier.gsi = assigned_irq->guest_irq;
mutex_unlock(&kvm->lock);
return 0;
}
INIT_WORK(&match->interrupt_work,
kvm_assigned_dev_interrupt_work_handler);
if (irqchip_in_kernel(kvm)) {
if (!capable(CAP_SYS_RAWIO)) {
r = -EPERM;
goto out_release;
if (assigned_irq->host_irq)
match->host_irq = assigned_irq->host_irq;
else
match->host_irq = match->dev->irq;
match->guest_irq = assigned_irq->guest_irq;
match->ack_notifier.gsi = assigned_irq->guest_irq;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
/* Even though this is PCI, we don't want to use shared
* interrupts. Sharing host devices with guest-assigned devices
* on the same interrupt line is not a happy situation: there
* are going to be long delays in accepting, acking, etc.
*/
if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
"kvm_assigned_device", (void *)match)) {
r = -EIO;
goto out_release;
}
}
match->irq_requested = true;
mutex_unlock(&kvm->lock);
return r;
out_release:
mutex_unlock(&kvm->lock);
kvm_free_assigned_device(kvm, match);
return r;
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
}
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
{
int r = 0;
struct kvm_assigned_dev_kernel *match;
struct pci_dev *dev;
mutex_lock(&kvm->lock);
match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_dev->assigned_dev_id);
if (match) {
/* device already assigned */
r = -EINVAL;
goto out;
}
match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
if (match == NULL) {
printk(KERN_INFO "%s: Couldn't allocate memory\n",
__func__);
r = -ENOMEM;
goto out;
}
dev = pci_get_bus_and_slot(assigned_dev->busnr,
assigned_dev->devfn);
if (!dev) {
printk(KERN_INFO "%s: host device not found\n", __func__);
r = -EINVAL;
goto out_free;
}
if (pci_enable_device(dev)) {
printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
r = -EBUSY;
goto out_put;
}
r = pci_request_regions(dev, "kvm_assigned_device");
if (r) {
printk(KERN_INFO "%s: Could not get access to device regions\n",
__func__);
goto out_disable;
}
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_busnr = assigned_dev->busnr;
match->host_devfn = assigned_dev->devfn;
match->dev = dev;
match->kvm = kvm;
list_add(&match->list, &kvm->arch.assigned_dev_head);
if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
r = kvm_iommu_map_guest(kvm, match);
if (r)
goto out_list_del;
}
out:
mutex_unlock(&kvm->lock);
return r;
out_list_del:
list_del(&match->list);
pci_release_regions(dev);
out_disable:
pci_disable_device(dev);
out_put:
pci_dev_put(dev);
out_free:
kfree(match);
mutex_unlock(&kvm->lock);
return r;
}
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
struct desc_struct *d;
unsigned long table_base;
unsigned long v;
if (selector == 0)
return 0;
asm("sgdt %0" : "=m"(gdt));
table_base = gdt.base;
if (selector & 4) { /* from ldt */
u16 ldt_selector;
asm("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
d = (struct desc_struct *)(table_base + (selector & ~7));
v = d->base0 | ((unsigned long)d->base1 << 16) |
((unsigned long)d->base2 << 24);
if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
return v;
}
EXPORT_SYMBOL_GPL(segment_base);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
/* TODO: reserve bits check */
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_base(vcpu, data);
else
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
WARN_ON(vcpu->arch.exception.pending);
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = false;
vcpu->arch.exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
if (vcpu->arch.exception.pending) {
if (vcpu->arch.exception.nr == PF_VECTOR) {
printk(KERN_DEBUG "kvm: inject_page_fault:"
" double fault 0x%lx\n", addr);
vcpu->arch.exception.nr = DF_VECTOR;
vcpu->arch.exception.error_code = 0;
} else if (vcpu->arch.exception.nr == DF_VECTOR) {
/* triple fault -> shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
}
return;
}
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
WARN_ON(vcpu->arch.exception.pending);
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = true;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
static void __queue_exception(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
/*
* Load the pae pdptrs. Return true is they are all valid.
*/
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
ret = 0;
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
ret = 0;
goto out;
}
}
ret = 1;
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
bool changed = true;
int r;
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
return changed;
}
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (cr0 & CR0_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
kvm_inject_gp(vcpu, 0);
return;
}
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
kvm_inject_gp(vcpu, 0);
return;
}
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
"and a clear PE flag\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.shadow_efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu)) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while PAE is disabled\n");
kvm_inject_gp(vcpu, 0);
return;
}
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
if (cs_l) {
printk(KERN_DEBUG "set_cr0: #GP, start paging "
"in long mode while CS.L == 1\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
"reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
}
kvm_x86_ops->set_cr0(vcpu, cr0);
kvm_mmu_reset_context(vcpu);
return;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
KVMTRACE_1D(LMSW, vcpu,
(u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
handler);
EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE)) {
printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
"in long mode\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
&& !load_pdptrs(vcpu, vcpu->arch.cr3)) {
printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (cr4 & X86_CR4_VMXE) {
printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
kvm_inject_gp(vcpu, 0);
return;
}
kvm_x86_ops->set_cr4(vcpu, cr4);
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_flush_tlb(vcpu);
return;
}
if (is_long_mode(vcpu)) {
if (cr3 & CR3_L_MODE_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else {
if (is_pae(vcpu)) {
if (cr3 & CR3_PAE_RESERVED_BITS) {
printk(KERN_DEBUG
"set_cr3: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
"reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
}
/*
* We don't check reserved bits in nonpae mode, because
* this isn't enforced, and VMware depends on this.
*/
}
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
* cause trouble later on when we turn on paging anyway.)
*
* A real CPU would silently accept an invalid cr3 and would
* attempt to use it - with largely undefined (and often hard
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
kvm_inject_gp(vcpu, 0);
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
kvm_inject_gp(vcpu, 0);
return;
}
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
return kvm_lapic_get_cr8(vcpu);
else
EXPORT_SYMBOL_GPL(kvm_get_cr8);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu.
*/
static u32 msrs_to_save[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
};
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
};
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits) {
printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
efer);
kvm_inject_gp(vcpu, 0);
return;
}
if (is_paging(vcpu)
&& (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
kvm_inject_gp(vcpu, 0);
return;
}
kvm_x86_ops->set_efer(vcpu, efer);
efer &= ~EFER_LMA;
efer |= vcpu->arch.shadow_efer & EFER_LMA;
vcpu->arch.shadow_efer = efer;
void kvm_enable_efer_bits(u64 mask)
{
efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
return kvm_set_msr(vcpu, index, *data);
}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
struct pvclock_wall_clock wc;
struct timespec now, sys, boot;
if (!wall_clock)
return;
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_write_guest_time below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
now = current_kernel_time();
ktime_get_ts(&sys);
boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
uint32_t quotient, remainder;
/* Don't try to replace with do_div(), this one calculates
* "(dividend << 32) / divisor" */
__asm__ ( "divl %4"
: "=a" (quotient), "=d" (remainder)
: "0" (0), "1" (dividend), "r" (divisor) );
return quotient;
}
static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
uint64_t nsecs = 1000000000LL;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
tps64 = tsc_khz * 1000LL;
while (tps64 > nsecs*2) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
while (tps32 <= (uint32_t)nsecs) {
tps32 <<= 1;
shift++;
}
hv_clock->tsc_shift = shift;
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
__FUNCTION__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
if ((!vcpu->time_page))
return;
if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = tsc_khz;
}
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
&vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
local_irq_restore(flags);
/* With all the info we got, fill in the values */
vcpu->hv_clock.system_time = ts.tv_nsec +
(NSEC_PER_SEC * (u64)ts.tv_sec);
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
* state, we just increase by 2 at the end.
vcpu->hv_clock.version += 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
case MSR_MTRRfix64K_00000:
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
case MSR_MTRRfix4K_D8000:
case MSR_MTRRfix4K_E0000:
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
case MSR_MTRRdefType:
case MSR_IA32_CR_PAT:
return true;
case 0x2f8:
return true;
}
return false;
}
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
if (!msr_mtrr_valid(msr))
return 1;
vcpu->arch.mtrr[msr - 0x200] = data;
return 0;
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
case MSR_EFER:
set_efer(vcpu, data);
break;
case MSR_IA32_MC0_STATUS:
pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
break;
case MSR_IA32_MCG_STATUS:
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
case MSR_IA32_MCG_CTL:
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
case MSR_IA32_DEBUGCTLMSR:
if (!data) {
/* We support the non-activated case already */
break;
} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
}
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
__func__, data);
break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
break;
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
kvm_set_apic_base(vcpu, data);
break;
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data);
break;
case MSR_KVM_SYSTEM_TIME: {
if (vcpu->arch.time_page) {
kvm_release_page_dirty(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
vcpu->arch.time = data;
/* we verify if the enable bit is set... */
if (!(data & 1))
break;
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
if (is_error_page(vcpu->arch.time_page)) {
kvm_release_page_clean(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
kvm_write_guest_time(vcpu);
break;
}
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
if (!msr_mtrr_valid(msr))
return 1;
*pdata = vcpu->arch.mtrr[msr - 0x200];
return 0;
}
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
switch (msr) {
case 0xc0010010: /* SYSCFG */
case 0xc0010015: /* HWCR */
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MC0_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MCG_CAP: