/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
    
    
    #include <linux/kvm_host.h>
    
    #include "irq.h"
    
    #include "i8254.h"
    
    #include "tss.h"
    
    #include "kvm_cache_regs.h"
    
    #include <linux/clocksource.h>
    
    #include <linux/interrupt.h>
    
    #include <linux/kvm.h>
    #include <linux/fs.h>
    
    #include <linux/pci.h>
    
    #include <linux/vmalloc.h>
    
    #include <linux/module.h>
    
    #include <linux/highmem.h>
    
    #include <linux/intel-iommu.h>
    
    
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
    
    #define MAX_IO_MSRS 256
    
    #define CR0_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
    			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
    			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
    #define CR4_RESERVED_BITS						\
    	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
    			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
    			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
    			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
    
    #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
    
/* EFER defaults:
 * - enable syscall per default because it is emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
    #ifdef CONFIG_X86_64
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
    #else
    static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
    #endif
    
    #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
    #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
    
    static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
    				    struct kvm_cpuid_entry2 __user *entries);
    
    
    struct kvm_x86_ops *kvm_x86_ops;
    
    EXPORT_SYMBOL_GPL(kvm_x86_ops);
    
    struct kvm_stats_debugfs_item debugfs_entries[] = {
    
    	{ "pf_fixed", VCPU_STAT(pf_fixed) },
    	{ "pf_guest", VCPU_STAT(pf_guest) },
    	{ "tlb_flush", VCPU_STAT(tlb_flush) },
    	{ "invlpg", VCPU_STAT(invlpg) },
    	{ "exits", VCPU_STAT(exits) },
    	{ "io_exits", VCPU_STAT(io_exits) },
    	{ "mmio_exits", VCPU_STAT(mmio_exits) },
    	{ "signal_exits", VCPU_STAT(signal_exits) },
    	{ "irq_window", VCPU_STAT(irq_window_exits) },
    
    	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
    
    	{ "halt_exits", VCPU_STAT(halt_exits) },
    	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
    
    	{ "hypercalls", VCPU_STAT(hypercalls) },
    
    	{ "request_irq", VCPU_STAT(request_irq_exits) },
    	{ "irq_exits", VCPU_STAT(irq_exits) },
    	{ "host_state_reload", VCPU_STAT(host_state_reload) },
    	{ "efer_reload", VCPU_STAT(efer_reload) },
    	{ "fpu_reload", VCPU_STAT(fpu_reload) },
    	{ "insn_emulation", VCPU_STAT(insn_emulation) },
    	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
    
    	{ "irq_injections", VCPU_STAT(irq_injections) },
    
    	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
    	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
    	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
    	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
    	{ "mmu_flooded", VM_STAT(mmu_flooded) },
    	{ "mmu_recycled", VM_STAT(mmu_recycled) },
    
    	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
    
    	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
    
    	{ "largepages", VM_STAT(lpages) },
    
    static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
    
    						      int assigned_dev_id)
    {
    	struct list_head *ptr;
    	struct kvm_assigned_dev_kernel *match;
    
    	list_for_each(ptr, head) {
    		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
    		if (match->assigned_dev_id == assigned_dev_id)
    			return match;
    	}
    	return NULL;
    }
    
    static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
    {
    	struct kvm_assigned_dev_kernel *assigned_dev;
    
    	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
    				    interrupt_work);
    
	/* The kvm->lock mutex is taken to safely inject the irq into the
	 * guest. Once the interrupt injection (or the ioapic code) uses a
	 * finer-grained lock, update this.
	 */
    	mutex_lock(&assigned_dev->kvm->lock);
    	kvm_set_irq(assigned_dev->kvm,
    		    assigned_dev->guest_irq, 1);
    	mutex_unlock(&assigned_dev->kvm->lock);
    	kvm_put_kvm(assigned_dev->kvm);
    }
    
    /* FIXME: Implement the OR logic needed to make shared interrupts on
     * this line behave properly
     */
    static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
    {
    	struct kvm_assigned_dev_kernel *assigned_dev =
    		(struct kvm_assigned_dev_kernel *) dev_id;
    
    	kvm_get_kvm(assigned_dev->kvm);
    	schedule_work(&assigned_dev->interrupt_work);
    	disable_irq_nosync(irq);
    	return IRQ_HANDLED;
    }
    
    /* Ack the irq line for an assigned device */
    static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
    {
    	struct kvm_assigned_dev_kernel *dev;
    
    	if (kian->gsi == -1)
    		return;
    
    	dev = container_of(kian, struct kvm_assigned_dev_kernel,
    			   ack_notifier);
    	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
    	enable_irq(dev->host_irq);
    }
    
    
    static void kvm_free_assigned_device(struct kvm *kvm,
    				     struct kvm_assigned_dev_kernel
    				     *assigned_dev)
    {
    	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
    		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
    
    	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
    
    	if (cancel_work_sync(&assigned_dev->interrupt_work))
    		/* We had pending work. That means we will have to take
    		 * care of kvm_put_kvm.
    		 */
    		kvm_put_kvm(kvm);
    
    	pci_release_regions(assigned_dev->dev);
    	pci_disable_device(assigned_dev->dev);
    	pci_dev_put(assigned_dev->dev);
    
    	list_del(&assigned_dev->list);
    	kfree(assigned_dev);
    }
    
    static void kvm_free_all_assigned_devices(struct kvm *kvm)
    {
    	struct list_head *ptr, *ptr2;
    	struct kvm_assigned_dev_kernel *assigned_dev;
    
    	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
    		assigned_dev = list_entry(ptr,
    					  struct kvm_assigned_dev_kernel,
    					  list);
    
    		kvm_free_assigned_device(kvm, assigned_dev);
    	}
    }
    
    
    static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
    				   struct kvm_assigned_irq
    				   *assigned_irq)
    {
    	int r = 0;
    	struct kvm_assigned_dev_kernel *match;
    
    	mutex_lock(&kvm->lock);
    
    	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
    				      assigned_irq->assigned_dev_id);
    	if (!match) {
    		mutex_unlock(&kvm->lock);
    		return -EINVAL;
    	}
    
    	if (match->irq_requested) {
    		match->guest_irq = assigned_irq->guest_irq;
    		match->ack_notifier.gsi = assigned_irq->guest_irq;
    		mutex_unlock(&kvm->lock);
    		return 0;
    	}
    
    	INIT_WORK(&match->interrupt_work,
    		  kvm_assigned_dev_interrupt_work_handler);
    
    	if (irqchip_in_kernel(kvm)) {
    
		if (!capable(CAP_SYS_RAWIO)) {
			r = -EPERM;
			goto out_release;
		}

		if (assigned_irq->host_irq)
    			match->host_irq = assigned_irq->host_irq;
    		else
    			match->host_irq = match->dev->irq;
    		match->guest_irq = assigned_irq->guest_irq;
    		match->ack_notifier.gsi = assigned_irq->guest_irq;
    		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
    		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
    
    		/* Even though this is PCI, we don't want to use shared
    		 * interrupts. Sharing host devices with guest-assigned devices
    		 * on the same interrupt line is not a happy situation: there
    		 * are going to be long delays in accepting, acking, etc.
    		 */
    		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
    				"kvm_assigned_device", (void *)match)) {
			r = -EIO;
			goto out_release;
		}
    	}
    
    	match->irq_requested = true;
    	mutex_unlock(&kvm->lock);
    	return r;
    
    out_release:
    	mutex_unlock(&kvm->lock);
    	kvm_free_assigned_device(kvm, match);
    	return r;
    
    }
    
    static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
    				      struct kvm_assigned_pci_dev *assigned_dev)
    {
    	int r = 0;
    	struct kvm_assigned_dev_kernel *match;
    	struct pci_dev *dev;
    
    	mutex_lock(&kvm->lock);
    
    	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
    				      assigned_dev->assigned_dev_id);
    	if (match) {
    		/* device already assigned */
    		r = -EINVAL;
    		goto out;
    	}
    
    	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
    	if (match == NULL) {
    		printk(KERN_INFO "%s: Couldn't allocate memory\n",
    		       __func__);
    		r = -ENOMEM;
    		goto out;
    	}
    	dev = pci_get_bus_and_slot(assigned_dev->busnr,
    				   assigned_dev->devfn);
    	if (!dev) {
    		printk(KERN_INFO "%s: host device not found\n", __func__);
    		r = -EINVAL;
    		goto out_free;
    	}
    	if (pci_enable_device(dev)) {
    		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
    		r = -EBUSY;
    		goto out_put;
    	}
    	r = pci_request_regions(dev, "kvm_assigned_device");
    	if (r) {
    		printk(KERN_INFO "%s: Could not get access to device regions\n",
    		       __func__);
    		goto out_disable;
    	}
    	match->assigned_dev_id = assigned_dev->assigned_dev_id;
    	match->host_busnr = assigned_dev->busnr;
    	match->host_devfn = assigned_dev->devfn;
    	match->dev = dev;
    
    	match->kvm = kvm;
    
    	list_add(&match->list, &kvm->arch.assigned_dev_head);
    
    
    	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
    		r = kvm_iommu_map_guest(kvm, match);
    		if (r)
    			goto out_list_del;
    	}
    
    
    out:
    	mutex_unlock(&kvm->lock);
    	return r;
    
    out_list_del:
    	list_del(&match->list);
    	pci_release_regions(dev);
    
    out_disable:
    	pci_disable_device(dev);
    out_put:
    	pci_dev_put(dev);
    out_free:
    	kfree(match);
    	mutex_unlock(&kvm->lock);
    	return r;
    }
    
    
    unsigned long segment_base(u16 selector)
    {
	struct descriptor_table gdt;
	struct desc_struct *d;
	unsigned long table_base;
    	unsigned long v;
    
    	if (selector == 0)
    		return 0;
    
    	asm("sgdt %0" : "=m"(gdt));
    	table_base = gdt.base;
    
    	if (selector & 4) {           /* from ldt */
    		u16 ldt_selector;
    
    		asm("sldt %0" : "=g"(ldt_selector));
    		table_base = segment_base(ldt_selector);
    	}
    
    	d = (struct desc_struct *)(table_base + (selector & ~7));
    	v = d->base0 | ((unsigned long)d->base1 << 16) |
    		((unsigned long)d->base2 << 24);
    
    #ifdef CONFIG_X86_64
    
    	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
    		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
    
    #endif
    	return v;
    }
    EXPORT_SYMBOL_GPL(segment_base);
    
    
    u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
    {
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->arch.apic_base;
	else
		return vcpu->arch.apic_base;
    
    }
    EXPORT_SYMBOL_GPL(kvm_get_apic_base);
    
    void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
    {
    	/* TODO: reserve bits check */
    	if (irqchip_in_kernel(vcpu->kvm))
    		kvm_lapic_set_base(vcpu, data);
    	else
    
    		vcpu->arch.apic_base = data;
    
    }
    EXPORT_SYMBOL_GPL(kvm_set_apic_base);
    
    
    void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
    {
    
    	WARN_ON(vcpu->arch.exception.pending);
    	vcpu->arch.exception.pending = true;
    	vcpu->arch.exception.has_error_code = false;
    	vcpu->arch.exception.nr = nr;
    
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception);
    
    
    void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
    			   u32 error_code)
    {
    	++vcpu->stat.pf_guest;
    
    	if (vcpu->arch.exception.pending) {
    		if (vcpu->arch.exception.nr == PF_VECTOR) {
    			printk(KERN_DEBUG "kvm: inject_page_fault:"
    					" double fault 0x%lx\n", addr);
    			vcpu->arch.exception.nr = DF_VECTOR;
    			vcpu->arch.exception.error_code = 0;
    		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
    			/* triple fault -> shutdown */
    			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
		}
		return;
	}

	vcpu->arch.cr2 = addr;
    
    	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
    }
    
    
    void kvm_inject_nmi(struct kvm_vcpu *vcpu)
    {
    	vcpu->arch.nmi_pending = 1;
    }
    EXPORT_SYMBOL_GPL(kvm_inject_nmi);
    
    
    void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
    {
    
    	WARN_ON(vcpu->arch.exception.pending);
    	vcpu->arch.exception.pending = true;
    	vcpu->arch.exception.has_error_code = true;
    	vcpu->arch.exception.nr = nr;
    	vcpu->arch.exception.error_code = error_code;
    
    }
    EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
    
    static void __queue_exception(struct kvm_vcpu *vcpu)
    {
    
    	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
    				     vcpu->arch.exception.has_error_code,
				     vcpu->arch.exception.error_code);
}

    /*
 * Load the pae pdptrs.  Return true if they are all valid.
     */
    int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
    {
    	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
    	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
    	int i;
    	int ret;
    
    	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
    
    
    	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
    				  offset * sizeof(u64), sizeof(pdpte));
    	if (ret < 0) {
    		ret = 0;
    		goto out;
    	}
    	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
    		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
    			ret = 0;
    			goto out;
    		}
    	}
    	ret = 1;
    
    
	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
	return ret;
}
    EXPORT_SYMBOL_GPL(load_pdptrs);
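
/*
 * Worked example (editor's illustration, not part of the original file):
 * in PAE mode, cr3 points at a 32-byte-aligned table of four PDPTEs inside
 * a page.  For cr3 = 0x12340a0 the code above reads guest frame 0x1234 at
 * byte offset ((0xa0 >> 5) << 2) * sizeof(u64) = 160 = 0xa0, i.e. exactly
 * the four 8-byte entries of the table that cr3 selects.
 */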
    
    static bool pdptrs_changed(struct kvm_vcpu *vcpu)
    {
    
    	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
    
    	bool changed = true;
    	int r;
    
    	if (is_long_mode(vcpu) || !is_pae(vcpu))
    		return false;
    
    
	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
	return changed;
}

void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;

	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
	KVMTRACE_1D(LMSW, vcpu,
		    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
		    handler);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
    
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
    
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
		vcpu->arch.cr3 = cr3;
		vcpu->arch.mmu.new_cr3(vcpu);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
    
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		kvm_inject_gp(vcpu, 0);
		return;
	}
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
    
    /*
     * List of msr numbers which we expose to userspace through KVM_GET_MSRS
     * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
     *
     * This list is modified at module load time to reflect the
     * capabilities of the host cpu.
     */
    static u32 msrs_to_save[] = {
    	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
    	MSR_K6_STAR,
    #ifdef CONFIG_X86_64
    	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
    #endif
    
    	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
    
    	MSR_IA32_PERF_STATUS,
    
    };
    
    static unsigned num_msrs_to_save;
    
    static u32 emulated_msrs[] = {
    	MSR_IA32_MISC_ENABLE,
    };
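
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * "modified at module load time" means the msrs_to_save list above is probed
 * on the host and entries the host cannot read are dropped before the list
 * is reported to userspace.  A minimal version of that filtering, assuming
 * the names declared above, could look roughly like this:
 */
#if 0
static void example_filter_msr_list(void)
{
	unsigned i, j;
	u32 dummy[2];

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;	/* host cannot read this MSR, drop it */
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}
#endif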
    
    
    static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
    {
    
    	if (efer & efer_reserved_bits) {
    
    		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
    		       efer);
    
    	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
    
    		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
    
    		return;
    	}
    
    	kvm_x86_ops->set_efer(vcpu, efer);
    
    	efer &= ~EFER_LMA;
    
    	efer |= vcpu->arch.shadow_efer & EFER_LMA;
    
	vcpu->arch.shadow_efer = efer;
}

    void kvm_enable_efer_bits(u64 mask)
    {
           efer_reserved_bits &= ~mask;
    }
    EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
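
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * kvm_enable_efer_bits() is meant to be called by the vendor backend at
 * hardware-setup time to clear a bit out of efer_reserved_bits once the host
 * is known to support it; the feature check below is only an example.
 */
#if 0
static void example_enable_nx(void)
{
	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);
}
#endif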
    
    
    
    /*
 * Writes msr value into the appropriate "register".
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
    {
    	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
    }
    
    
    /*
     * Adapt set_msr() to msr_io()'s calling convention
     */
    static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
    {
    	return kvm_set_msr(vcpu, index, *data);
    }
    
    
    static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
    {
    	static int version;
    
    	struct pvclock_wall_clock wc;
    	struct timespec now, sys, boot;
    
    
    	if (!wall_clock)
    		return;
    
    	version++;
    
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    
    
    	/*
    	 * The guest calculates current wall clock time by adding
    	 * system time (updated by kvm_write_guest_time below) to the
    	 * wall clock specified here.  guest system time equals host
    	 * system time for us, thus we must fill in host boot time here.
    	 */
    	now = current_kernel_time();
    	ktime_get_ts(&sys);
    	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
    
    	wc.sec = boot.tv_sec;
    	wc.nsec = boot.tv_nsec;
    	wc.version = version;
    
    
    	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
    
    	version++;
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    }
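
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * from the guest's side, current wall clock time is the boot-time value
 * written above plus the system time published in the per-vcpu time page by
 * kvm_write_guest_time() below.  Roughly, with error handling omitted:
 */
#if 0
static u64 example_guest_wall_clock_ns(struct pvclock_wall_clock *wc,
				       u64 system_time_ns)
{
	return (u64)wc->sec * NSEC_PER_SEC + wc->nsec + system_time_ns;
}
#endif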
    
    
    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {
    	uint32_t quotient, remainder;
    
    	/* Don't try to replace with do_div(), this one calculates
    	 * "(dividend << 32) / divisor" */
    	__asm__ ( "divl %4"
    		  : "=a" (quotient), "=d" (remainder)
    		  : "0" (0), "1" (dividend), "r" (divisor) );
    	return quotient;
    }
    
    static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
    {
    	uint64_t nsecs = 1000000000LL;
    	int32_t  shift = 0;
    	uint64_t tps64;
    	uint32_t tps32;
    
    	tps64 = tsc_khz * 1000LL;
    	while (tps64 > nsecs*2) {
    		tps64 >>= 1;
    		shift--;
    	}
    
    	tps32 = (uint32_t)tps64;
    	while (tps32 <= (uint32_t)nsecs) {
    		tps32 <<= 1;
    		shift++;
    	}
    
    	hv_clock->tsc_shift = shift;
    	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
    
    	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
    		 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
    		 hv_clock->tsc_to_system_mul);
    }
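
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the (tsc_shift, tsc_to_system_mul) pair computed above lets the guest turn
 * a TSC delta into nanoseconds using only shifts and one multiply:
 * ns = ((delta << tsc_shift) * tsc_to_system_mul) >> 32, shifting right
 * instead when tsc_shift is negative.  A minimal version of that scaling:
 */
#if 0
static u64 example_scale_tsc_delta(u64 delta, s8 tsc_shift, u32 mul)
{
	if (tsc_shift >= 0)
		delta <<= tsc_shift;
	else
		delta >>= -tsc_shift;
	/* (delta * mul) >> 32, without losing the high 32 bits of delta */
	return ((delta >> 32) * mul) + (((delta & 0xffffffffULL) * mul) >> 32);
}
#endif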
    
    
    static void kvm_write_guest_time(struct kvm_vcpu *v)
    {
    	struct timespec ts;
    	unsigned long flags;
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    	void *shared_kaddr;
    
    	if ((!vcpu->time_page))
    		return;
    
    
    	if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
    		kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
    		vcpu->hv_clock_tsc_khz = tsc_khz;
    	}
    
    
    	/* Keep irq disabled to prevent changes to the clock */
    	local_irq_save(flags);
    	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
    			  &vcpu->hv_clock.tsc_timestamp);
    	ktime_get_ts(&ts);
    	local_irq_restore(flags);
    
    	/* With all the info we got, fill in the values */
    
    	vcpu->hv_clock.system_time = ts.tv_nsec +
    				     (NSEC_PER_SEC * (u64)ts.tv_sec);
    	/*
    	 * The interface expects us to write an even number signaling that the
    	 * update is finished. Since the guest won't see the intermediate
    
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;
    
    
    	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
    
    	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
    
    	       sizeof(vcpu->hv_clock));
    
    
    	kunmap_atomic(shared_kaddr, KM_USER0);
    
    	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
    }
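
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the version field behaves like a seqlock.  A guest reading the time page
 * is expected to retry while the version is odd (an update is in flight) or
 * changes across the copy, roughly:
 */
#if 0
static void example_guest_read_time(struct pvclock_vcpu_time_info *src,
				    struct pvclock_vcpu_time_info *dst)
{
	u32 version;

	do {
		version = src->version;
		rmb();
		*dst = *src;
		rmb();
	} while ((src->version & 1) || (version != src->version));
}
#endif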
    
    
    static bool msr_mtrr_valid(unsigned msr)
    {
    	switch (msr) {
    	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
    	case MSR_MTRRfix64K_00000:
    	case MSR_MTRRfix16K_80000:
    	case MSR_MTRRfix16K_A0000:
    	case MSR_MTRRfix4K_C0000:
    	case MSR_MTRRfix4K_C8000:
    	case MSR_MTRRfix4K_D0000:
    	case MSR_MTRRfix4K_D8000:
    	case MSR_MTRRfix4K_E0000:
    	case MSR_MTRRfix4K_E8000:
    	case MSR_MTRRfix4K_F0000:
    	case MSR_MTRRfix4K_F8000:
    	case MSR_MTRRdefType:
    	case MSR_IA32_CR_PAT:
    		return true;
    	case 0x2f8:
    		return true;
    	}
    	return false;
    }
    
    static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	if (!msr_mtrr_valid(msr))
    		return 1;
    
    	vcpu->arch.mtrr[msr - 0x200] = data;
    	return 0;
    }
    
    
    int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	switch (msr) {
    	case MSR_EFER:
    		set_efer(vcpu, data);
    		break;
    	case MSR_IA32_MC0_STATUS:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
    
    		break;
    	case MSR_IA32_MCG_STATUS:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
    
    	case MSR_IA32_MCG_CTL:
    		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
    
    	case MSR_IA32_DEBUGCTLMSR:
    		if (!data) {
    			/* We support the non-activated case already */
    			break;
    		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
    			/* Values other than LBR and BTF are vendor-specific,
    			   thus reserved and should throw a #GP */
    			return 1;
    		}
    		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
    			__func__, data);
    		break;
    
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_UCODE_WRITE:
    		break;
    
    	case 0x200 ... 0x2ff:
    		return set_msr_mtrr(vcpu, msr, data);
    
    	case MSR_IA32_APICBASE:
    		kvm_set_apic_base(vcpu, data);
    		break;
    	case MSR_IA32_MISC_ENABLE:
    
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
    	case MSR_KVM_WALL_CLOCK:
    		vcpu->kvm->arch.wall_clock = data;
    		kvm_write_wall_clock(vcpu->kvm, data);
    		break;
    	case MSR_KVM_SYSTEM_TIME: {
    		if (vcpu->arch.time_page) {
    			kvm_release_page_dirty(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    
    		vcpu->arch.time = data;
    
    		/* we verify if the enable bit is set... */
    		if (!(data & 1))
    			break;
    
    		/* ...but clean it before doing the actual write */
    		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
    
    		vcpu->arch.time_page =
    				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
    
    		if (is_error_page(vcpu->arch.time_page)) {
    			kvm_release_page_clean(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    
    		kvm_write_guest_time(vcpu);
    		break;
    	}
    
    		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
    
    		return 1;
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_msr_common);
    
    
    /*
     * Reads an msr value (of 'msr_index') into 'pdata'.
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
    {
    	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
    }
    
    
    static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	if (!msr_mtrr_valid(msr))
    		return 1;
    
    	*pdata = vcpu->arch.mtrr[msr - 0x200];
    	return 0;
    }
    
    
    int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data;
    
    	switch (msr) {
    	case 0xc0010010: /* SYSCFG */
    	case 0xc0010015: /* HWCR */
    	case MSR_IA32_PLATFORM_ID:
    	case MSR_IA32_P5_MC_ADDR:
    	case MSR_IA32_P5_MC_TYPE:
    	case MSR_IA32_MC0_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MCG_CAP: