void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    
    	u64 offset, ns, elapsed;
    
    	unsigned long flags;
    
    	s64 sdiff;
    
    
    	spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
	offset = data - native_read_tsc();
	ns = get_kernel_ns();
	elapsed = ns - kvm->arch.last_tsc_nsec;
    
    	sdiff = data - kvm->arch.last_tsc_write;
    	if (sdiff < 0)
    		sdiff = -sdiff;
    
	/*
	 * Special case: a close write to the TSC within 5 seconds of a write
	 * on another CPU is interpreted as an attempt to synchronize.
	 * The 5 seconds is to accommodate host load / swapping as
	 * well as any reset of TSC during the boot process.
	 *
	 * In that case, for a reliable TSC, we can match TSC offsets,
	 * or make a best guess using the elapsed value.
	 */
    	if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
    	    elapsed < 5ULL * NSEC_PER_SEC) {
    
    		if (!check_tsc_unstable()) {
    			offset = kvm->arch.last_tsc_offset;
    			pr_debug("kvm: matched tsc offset for %llu\n", data);
    		} else {
    
    			u64 delta = nsec_to_cycles(elapsed);
    			offset += delta;
    			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
    
    		}
    		ns = kvm->arch.last_tsc_nsec;
    	}
    	kvm->arch.last_tsc_nsec = ns;
    	kvm->arch.last_tsc_write = data;
    	kvm->arch.last_tsc_offset = offset;
    
    	kvm_x86_ops->write_tsc_offset(vcpu, offset);
    	spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
    
    	/* Reset of TSC must disable overshoot protection below */
    	vcpu->arch.hv_clock.tsc_timestamp = 0;
    
    	vcpu->arch.last_tsc_write = data;
    	vcpu->arch.last_tsc_nsec = ns;
    
    }
    EXPORT_SYMBOL_GPL(kvm_write_tsc);
    
    
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
    	unsigned long flags;
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    	void *shared_kaddr;
    
	unsigned long this_tsc_khz;
	s64 kernel_ns, max_kernel_ns;
    	u64 tsc_timestamp;
    
    
    	/* Keep irq disabled to prevent changes to the clock */
    	local_irq_save(flags);
    
    	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
    
    	kernel_ns = get_kernel_ns();
    
    	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
    
    	if (unlikely(this_tsc_khz == 0)) {
    
    		local_irq_restore(flags);
    
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
		return 1;
	}

    	/*
    	 * We may have to catch up the TSC to match elapsed wall clock
    	 * time for two reasons, even if kvmclock is used.
    	 *   1) CPU could have been running below the maximum TSC rate
    	 *   2) Broken TSC compensation resets the base at each VCPU
    	 *      entry to avoid unknown leaps of TSC even when running
    	 *      again on the same CPU.  This may cause apparent elapsed
    	 *      time to disappear, and the guest to stand still or run
    	 *	very slowly.
    	 */
    	if (vcpu->tsc_catchup) {
    		u64 tsc = compute_guest_tsc(v, kernel_ns);
    		if (tsc > tsc_timestamp) {
    			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
    			tsc_timestamp = tsc;
		}
	}

	local_irq_restore(flags);

    	if (!vcpu->time_page)
    		return 0;
    
    	/*
    	 * Time as measured by the TSC may go backwards when resetting the base
    	 * tsc_timestamp.  The reason for this is that the TSC resolution is
    	 * higher than the resolution of the other clock scales.  Thus, many
	 * possible measurements of the TSC correspond to one measurement of any
    	 * other clock, and so a spread of values is possible.  This is not a
    	 * problem for the computation of the nanosecond clock; with TSC rates
    	 * around 1GHZ, there can only be a few cycles which correspond to one
    	 * nanosecond value, and any path through this code will inevitably
    	 * take longer than that.  However, with the kernel_ns value itself,
    	 * the precision may be much lower, down to HZ granularity.  If the
    	 * first sampling of TSC against kernel_ns ends in the low part of the
    	 * range, and the second in the high end of the range, we can get:
    	 *
    	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
    	 *
    	 * As the sampling errors potentially range in the thousands of cycles,
    	 * it is possible such a time value has already been observed by the
    	 * guest.  To protect against this, we must compute the system time as
    	 * observed by the guest and ensure the new system time is greater.
    	 */
    	max_kernel_ns = 0;
    	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
    		max_kernel_ns = vcpu->last_guest_tsc -
    				vcpu->hv_clock.tsc_timestamp;
    		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
    				    vcpu->hv_clock.tsc_to_system_mul,
    				    vcpu->hv_clock.tsc_shift);
    		max_kernel_ns += vcpu->last_kernel_ns;
    	}
    
    	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
    
    		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
    				   &vcpu->hv_clock.tsc_shift,
    				   &vcpu->hv_clock.tsc_to_system_mul);
    
		vcpu->hw_tsc_khz = this_tsc_khz;
	}

    	if (max_kernel_ns > kernel_ns)
    		kernel_ns = max_kernel_ns;
    
    
    	/* With all the info we got, fill in the values */
    
    	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
    
    	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
    
    	vcpu->last_kernel_ns = kernel_ns;
    
    	vcpu->last_guest_tsc = tsc_timestamp;
    
    	/*
    	 * The interface expects us to write an even number signaling that the
    	 * update is finished. Since the guest won't see the intermediate
    
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;
    
    
    	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
    
    	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
    
    	       sizeof(vcpu->hv_clock));
    
    
    	kunmap_atomic(shared_kaddr, KM_USER0);
    
	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
	return 0;
}

    static bool msr_mtrr_valid(unsigned msr)
    {
    	switch (msr) {
    	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
    	case MSR_MTRRfix64K_00000:
    	case MSR_MTRRfix16K_80000:
    	case MSR_MTRRfix16K_A0000:
    	case MSR_MTRRfix4K_C0000:
    	case MSR_MTRRfix4K_C8000:
    	case MSR_MTRRfix4K_D0000:
    	case MSR_MTRRfix4K_D8000:
    	case MSR_MTRRfix4K_E0000:
    	case MSR_MTRRfix4K_E8000:
    	case MSR_MTRRfix4K_F0000:
    	case MSR_MTRRfix4K_F8000:
    	case MSR_MTRRdefType:
    	case MSR_IA32_CR_PAT:
    		return true;
    	case 0x2f8:
    		return true;
    	}
    	return false;
    }
    
    
    static bool valid_pat_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
    }
    
    static bool valid_mtrr_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
    }
    
    static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	int i;
    
    	if (!msr_mtrr_valid(msr))
    		return false;
    
    	if (msr == MSR_IA32_CR_PAT) {
    		for (i = 0; i < 8; i++)
    			if (!valid_pat_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;
    	} else if (msr == MSR_MTRRdefType) {
    		if (data & ~0xcff)
    			return false;
    		return valid_mtrr_type(data & 0xff);
    	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
    		for (i = 0; i < 8 ; i++)
    			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;
    	}
    
    	/* variable MTRRs */
    	return valid_mtrr_type(data & 0xff);
    }
    
    
    static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
    
    
	if (!mtrr_valid(vcpu, msr, data))
		return 1;
    
    
    	if (msr == MSR_MTRRdefType) {
    		vcpu->arch.mtrr_state.def_type = data;
    		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
    	} else if (msr == MSR_MTRRfix64K_00000)
    		p[0] = data;
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		p[1 + msr - MSR_MTRRfix16K_80000] = data;
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
    	else if (msr == MSR_IA32_CR_PAT)
    		vcpu->arch.pat = data;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
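		/*
		 * MSRs 0x200 and up come in base/mask pairs: even-numbered
		 * MSRs hold MTRRphysBase, odd-numbered MSRs MTRRphysMask.
		 */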
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pt = data;
    	}
    
    	kvm_mmu_reset_context(vcpu);
    
    	return 0;
    }
    
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	u64 mcg_cap = vcpu->arch.mcg_cap;
    	unsigned bank_num = mcg_cap & 0xff;
    
    
    	switch (msr) {
	case MSR_IA32_MCG_STATUS:
		vcpu->arch.mcg_status = data;
		break;
	case MSR_IA32_MCG_CTL:
		if (!(mcg_cap & MCG_CTL_P))
    			return 1;
    		if (data != 0 && data != ~(u64)0)
    			return -1;
    		vcpu->arch.mcg_ctl = data;
    		break;
    	default:
    		if (msr >= MSR_IA32_MC0_CTL &&
    		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
    			u32 offset = msr - MSR_IA32_MC0_CTL;
    
			/* only 0 or all 1s can be written to IA32_MCi_CTL
			 * some Linux kernels though clear bit 10 in bank 4 to
			 * work around a BIOS/GART TBL issue on AMD K8s, ignore
			 * this to avoid an uncaught #GP in the guest
			 */
			if ((offset & 0x3) == 0 &&
			    data != 0 && (data | (1 << 10)) != ~(u64)0)
				return -1;
    			vcpu->arch.mce_banks[offset] = data;
    			break;
    		}
    		return 1;
    	}
    	return 0;
    }
    
    
    static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    	int lm = is_long_mode(vcpu);
    	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
    		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
    	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
    		: kvm->arch.xen_hvm_config.blob_size_32;
    	u32 page_num = data & ~PAGE_MASK;
    	u64 page_addr = data & PAGE_MASK;
    	u8 *page;
    	int r;
    
    	r = -E2BIG;
    	if (page_num >= blob_size)
    		goto out;
    	r = -ENOMEM;
    	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
    	if (!page)
    		goto out;
    	r = -EFAULT;
    	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
    		goto out_free;
    	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
    		goto out_free;
    	r = 0;
    out_free:
    	kfree(page);
    out:
    	return r;
    }
    
    
    static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
    {
    	return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
    }
    
    static bool kvm_hv_msr_partition_wide(u32 msr)
    {
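	/*
	 * Partition-wide Hyper-V MSRs are stored in struct kvm and are
	 * written under kvm->lock; the rest are per-vcpu state.
	 */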
    	bool r = false;
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    	case HV_X64_MSR_HYPERCALL:
    		r = true;
    		break;
    	}
    
    	return r;
    }
    
    static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    		kvm->arch.hv_guest_os_id = data;
    		/* setting guest os id to zero disables hypercall page */
    		if (!kvm->arch.hv_guest_os_id)
    			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
    		break;
    	case HV_X64_MSR_HYPERCALL: {
    		u64 gfn;
    		unsigned long addr;
    		u8 instructions[4];
    
    		/* if guest os id is not set hypercall should remain disabled */
    		if (!kvm->arch.hv_guest_os_id)
    			break;
    		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
    			kvm->arch.hv_hypercall = data;
    			break;
    		}
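		/*
		 * Enable path: the MSR encodes the GFN of the hypercall page.
		 * Patch the vendor-specific hypercall instruction plus a 'ret'
		 * into that guest page.
		 */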
    		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
    		addr = gfn_to_hva(kvm, gfn);
    		if (kvm_is_error_hva(addr))
    			return 1;
    		kvm_x86_ops->patch_hypercall(vcpu, instructions);
    		((unsigned char *)instructions)[3] = 0xc3; /* ret */
    		if (copy_to_user((void __user *)addr, instructions, 4))
    			return 1;
    		kvm->arch.hv_hypercall = data;
    		break;
    	}
    	default:
    		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
    			  "data 0x%llx\n", msr, data);
    		return 1;
    	}
    	return 0;
    }
    
    static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    	switch (msr) {
    	case HV_X64_MSR_APIC_ASSIST_PAGE: {
    		unsigned long addr;
    
    		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
    			vcpu->arch.hv_vapic = data;
    			break;
    		}
    		addr = gfn_to_hva(vcpu->kvm, data >>
    				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
    		if (kvm_is_error_hva(addr))
    			return 1;
    		if (clear_user((void __user *)addr, PAGE_SIZE))
    			return 1;
    		vcpu->arch.hv_vapic = data;
    		break;
    	}
    	case HV_X64_MSR_EOI:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
    	case HV_X64_MSR_ICR:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
    	case HV_X64_MSR_TPR:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
    	default:
    		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
    			  "data 0x%llx\n", msr, data);
    		return 1;
    	}
    
	return 0;
}

    int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	switch (msr) {
    	case MSR_EFER:
    
    		return set_efer(vcpu, data);
    
    	case MSR_K7_HWCR:
    		data &= ~(u64)0x40;	/* ignore flush filter disable */
    
    		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
    
    		if (data != 0) {
    			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
    				data);
    			return 1;
		}
		break;

    	case MSR_FAM10H_MMIO_CONF_BASE:
    		if (data != 0) {
    			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
    				"0x%llx\n", data);
    			return 1;
		}
		break;
	case MSR_AMD64_NB_CFG:
		break;
    	case MSR_IA32_DEBUGCTLMSR:
    		if (!data) {
    			/* We support the non-activated case already */
    			break;
    		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
    			/* Values other than LBR and BTF are vendor-specific,
    			   thus reserved and should throw a #GP */
    			return 1;
    		}
    		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
    			__func__, data);
    		break;
    
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_UCODE_WRITE:
    
    	case MSR_VM_HSAVE_PA:
    
	case MSR_AMD64_PATCH_LOADER:
		break;
    	case 0x200 ... 0x2ff:
    		return set_msr_mtrr(vcpu, msr, data);
    
    	case MSR_IA32_APICBASE:
    		kvm_set_apic_base(vcpu, data);
    		break;
    
    	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
    		return kvm_x2apic_msr_write(vcpu, msr, data);
    
    	case MSR_IA32_MISC_ENABLE:
    
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
    	case MSR_KVM_WALL_CLOCK_NEW:
    
    	case MSR_KVM_WALL_CLOCK:
    		vcpu->kvm->arch.wall_clock = data;
    		kvm_write_wall_clock(vcpu->kvm, data);
    		break;
    
    	case MSR_KVM_SYSTEM_TIME_NEW:
    
    	case MSR_KVM_SYSTEM_TIME: {
    		if (vcpu->arch.time_page) {
    			kvm_release_page_dirty(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    
    		vcpu->arch.time = data;
    
    		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
    
    
    		/* we verify if the enable bit is set... */
    		if (!(data & 1))
    			break;
    
    		/* ...but clean it before doing the actual write */
    		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
    
    		vcpu->arch.time_page =
    				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
    
    		if (is_error_page(vcpu->arch.time_page)) {
    			kvm_release_page_clean(vcpu->arch.time_page);
    			vcpu->arch.time_page = NULL;
    		}
    		break;
    	}
    
    	case MSR_IA32_MCG_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
    		return set_msr_mce(vcpu, msr, data);
    
    
    	/* Performance counters are not protected by a CPUID bit,
    	 * so we should check all of them in the generic path for the sake of
    	 * cross vendor migration.
    	 * Writing a zero into the event select MSRs disables them,
    	 * which we perfectly emulate ;-). Any other value should be at least
    	 * reported, some guests depend on them.
    	 */
    	case MSR_P6_EVNTSEL0:
    	case MSR_P6_EVNTSEL1:
    	case MSR_K7_EVNTSEL0:
    	case MSR_K7_EVNTSEL1:
    	case MSR_K7_EVNTSEL2:
    	case MSR_K7_EVNTSEL3:
    		if (data != 0)
    			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
    				"0x%x data 0x%llx\n", msr, data);
    		break;
    	/* at least RHEL 4 unconditionally writes to the perfctr registers,
    	 * so we ignore writes to make it happy.
    	 */
    	case MSR_P6_PERFCTR0:
    	case MSR_P6_PERFCTR1:
    	case MSR_K7_PERFCTR0:
    	case MSR_K7_PERFCTR1:
    	case MSR_K7_PERFCTR2:
    	case MSR_K7_PERFCTR3:
    		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
    			"0x%x data 0x%llx\n", msr, data);
    		break;
    
    	case MSR_K7_CLK_CTL:
    		/*
    		 * Ignore all writes to this no longer documented MSR.
    		 * Writes are only relevant for old K7 processors,
    		 * all pre-dating SVM, but a recommended workaround from
		 * AMD for these chips. It is possible to specify the
    		 * affected processor models on the command line, hence
    		 * the need to ignore the workaround.
    		 */
    		break;
    
    	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
    		if (kvm_hv_msr_partition_wide(msr)) {
    			int r;
    			mutex_lock(&vcpu->kvm->lock);
    			r = set_msr_hyperv_pw(vcpu, msr, data);
    			mutex_unlock(&vcpu->kvm->lock);
    			return r;
    		} else
    			return set_msr_hyperv(vcpu, msr, data);
		break;
	default:
		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
    			return xen_hvm_config(vcpu, data);
    
    		if (!ignore_msrs) {
    			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
    				msr, data);
    			return 1;
    		} else {
    			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
    				msr, data);
    			break;
    		}
    
    	}
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_set_msr_common);
    
    
    /*
     * Reads an msr value (of 'msr_index') into 'pdata'.
     * Returns 0 on success, non-0 otherwise.
     * Assumes vcpu_load() was already called.
     */
    int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
    {
    	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
    }
    
    
    static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    
    	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
    
    
    	if (!msr_mtrr_valid(msr))
    		return 1;
    
    
    	if (msr == MSR_MTRRdefType)
    		*pdata = vcpu->arch.mtrr_state.def_type +
    			 (vcpu->arch.mtrr_state.enabled << 10);
    	else if (msr == MSR_MTRRfix64K_00000)
    		*pdata = p[0];
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
    	else if (msr == MSR_IA32_CR_PAT)
    		*pdata = vcpu->arch.pat;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pdata = *pt;
    	}
    
    
    	return 0;
    }
    
    
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;
	u64 mcg_cap = vcpu->arch.mcg_cap;
    	unsigned bank_num = mcg_cap & 0xff;
    
    
    	switch (msr) {
    	case MSR_IA32_P5_MC_ADDR:
    	case MSR_IA32_P5_MC_TYPE:
    
		data = 0;
		break;
	case MSR_IA32_MCG_CAP:
		data = vcpu->arch.mcg_cap;
    		break;
    
    	case MSR_IA32_MCG_CTL:
    
    		if (!(mcg_cap & MCG_CTL_P))
    			return 1;
    		data = vcpu->arch.mcg_ctl;
    		break;
    	case MSR_IA32_MCG_STATUS:
    		data = vcpu->arch.mcg_status;
    		break;
    	default:
    		if (msr >= MSR_IA32_MC0_CTL &&
    		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
    			u32 offset = msr - MSR_IA32_MC0_CTL;
    			data = vcpu->arch.mce_banks[offset];
    			break;
    		}
    		return 1;
    	}
    	*pdata = data;
    	return 0;
    }
    
    
    static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data = 0;
    	struct kvm *kvm = vcpu->kvm;
    
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    		data = kvm->arch.hv_guest_os_id;
    		break;
    	case HV_X64_MSR_HYPERCALL:
    		data = kvm->arch.hv_hypercall;
    		break;
    	default:
    		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
    		return 1;
    	}
    
    	*pdata = data;
    	return 0;
    }
    
    static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data = 0;
    
    	switch (msr) {
    	case HV_X64_MSR_VP_INDEX: {
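		/* Report this vcpu's position in the kvm->vcpus array. */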
    		int r;
    		struct kvm_vcpu *v;
    		kvm_for_each_vcpu(r, v, vcpu->kvm)
    			if (v == vcpu)
    				data = r;
    		break;
    	}
    
    	case HV_X64_MSR_EOI:
    		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
    	case HV_X64_MSR_ICR:
    		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
    	case HV_X64_MSR_TPR:
    		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
    
    	default:
    		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
    		return 1;
    	}
    	*pdata = data;
    	return 0;
    }
    
    
    int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
    {
    	u64 data;
    
    	switch (msr) {
    	case MSR_IA32_PLATFORM_ID:
    
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_EBL_CR_POWERON:
    
    	case MSR_IA32_DEBUGCTLMSR:
    	case MSR_IA32_LASTBRANCHFROMIP:
    	case MSR_IA32_LASTBRANCHTOIP:
    	case MSR_IA32_LASTINTFROMIP:
    	case MSR_IA32_LASTINTTOIP:
    
    	case MSR_K8_SYSCFG:
    	case MSR_K7_HWCR:
    
    	case MSR_VM_HSAVE_PA:
    
    	case MSR_P6_PERFCTR0:
    	case MSR_P6_PERFCTR1:
    
    	case MSR_P6_EVNTSEL0:
    	case MSR_P6_EVNTSEL1:
    
    	case MSR_K7_EVNTSEL0:
    
    	case MSR_K7_PERFCTR0:
    
    	case MSR_K8_INT_PENDING_MSG:
    
    	case MSR_AMD64_NB_CFG:
    
	case MSR_FAM10H_MMIO_CONF_BASE:
		data = 0;
		break;
    	case MSR_MTRRcap:
    		data = 0x500 | KVM_NR_VAR_MTRR;
    		break;
    	case 0x200 ... 0x2ff:
    		return get_msr_mtrr(vcpu, msr, pdata);
    
    	case 0xcd: /* fsb frequency */
    		data = 3;
    		break;
    
    		/*
    		 * MSR_EBC_FREQUENCY_ID
    		 * Conservative value valid for even the basic CPU models.
    		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
    		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
    		 * and 266MHz for model 3, or 4. Set Core Clock
    		 * Frequency to System Bus Frequency Ratio to 1 (bits
    		 * 31:24) even though these are only valid for CPU
    		 * models > 2, however guests may end up dividing or
    		 * multiplying by zero otherwise.
    		 */
    	case MSR_EBC_FREQUENCY_ID:
    		data = 1 << 24;
    		break;
    
    	case MSR_IA32_APICBASE:
    		data = kvm_get_apic_base(vcpu);
    		break;
    
    	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
    		return kvm_x2apic_msr_read(vcpu, msr, pdata);
    		break;
    
    	case MSR_IA32_MISC_ENABLE:
    
		data = vcpu->arch.ia32_misc_enable_msr;
		break;
    	case MSR_IA32_PERF_STATUS:
    		/* TSC increment by tick */
    		data = 1000ULL;
    		/* CPU multiplier */
    		data |= (((uint64_t)4ULL) << 40);
    		break;
    
	case MSR_EFER:
		data = vcpu->arch.efer;
		break;
    	case MSR_KVM_WALL_CLOCK:
    
    	case MSR_KVM_WALL_CLOCK_NEW:
    
    		data = vcpu->kvm->arch.wall_clock;
    		break;
    	case MSR_KVM_SYSTEM_TIME:
    
    	case MSR_KVM_SYSTEM_TIME_NEW:
    
    		data = vcpu->arch.time;
    		break;
    
    	case MSR_IA32_P5_MC_ADDR:
    	case MSR_IA32_P5_MC_TYPE:
    	case MSR_IA32_MCG_CAP:
    	case MSR_IA32_MCG_CTL:
    	case MSR_IA32_MCG_STATUS:
    	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
    		return get_msr_mce(vcpu, msr, pdata);
    
    	case MSR_K7_CLK_CTL:
    		/*
    		 * Provide expected ramp-up count for K7. All other
    		 * are set to zero, indicating minimum divisors for
    		 * every field.
    		 *
    		 * This prevents guest kernels on AMD host with CPU
    		 * type 6, model 8 and higher from exploding due to
    		 * the rdmsr failing.
    		 */
    		data = 0x20000000;
    		break;
    
    	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
    		if (kvm_hv_msr_partition_wide(msr)) {
    			int r;
    			mutex_lock(&vcpu->kvm->lock);
    			r = get_msr_hyperv_pw(vcpu, msr, pdata);
    			mutex_unlock(&vcpu->kvm->lock);
    			return r;
    		} else
    			return get_msr_hyperv(vcpu, msr, pdata);
		break;
	default:
		if (!ignore_msrs) {
    			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
    			return 1;
    		} else {
    			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
    			data = 0;
    		}
    		break;
    
    	}
    	*pdata = data;
    	return 0;
    }
    EXPORT_SYMBOL_GPL(kvm_get_msr_common);
    
    
    /*
     * Read or write a bunch of msrs. All parameters are kernel addresses.
     *
     * @return number of msrs set successfully.
     */
    static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
    		    struct kvm_msr_entry *entries,
    		    int (*do_msr)(struct kvm_vcpu *vcpu,
    				  unsigned index, u64 *data))
{
	int i, idx;

	idx = srcu_read_lock(&vcpu->kvm->srcu);
    
    	for (i = 0; i < msrs->nmsrs; ++i)
    		if (do_msr(vcpu, entries[i].index, &entries[i].data))
    			break;
    
    	srcu_read_unlock(&vcpu->kvm->srcu, idx);
    
    
    	return i;
    }
    
    /*
     * Read or write a bunch of msrs. Parameters are user addresses.
     *
     * @return number of msrs set successfully.
     */
    static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
    		  int (*do_msr)(struct kvm_vcpu *vcpu,
    				unsigned index, u64 *data),
    		  int writeback)
    {
    	struct kvm_msrs msrs;
    	struct kvm_msr_entry *entries;
    	int r, n;
    	unsigned size;
    
    	r = -EFAULT;
    	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
    		goto out;
    
    	r = -E2BIG;
    	if (msrs.nmsrs >= MAX_IO_MSRS)
    		goto out;
    
    	r = -ENOMEM;
    	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
    
    	entries = kmalloc(size, GFP_KERNEL);
    
    	if (!entries)
    		goto out;
    
    	r = -EFAULT;
    	if (copy_from_user(entries, user_msrs->entries, size))
    		goto out_free;
    
    	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
    	if (r < 0)
    		goto out_free;
    
    	r = -EFAULT;
    	if (writeback && copy_to_user(user_msrs->entries, entries, size))
    		goto out_free;
    
    	r = n;
    
out_free:
	kfree(entries);
out:
	return r;
}

    int kvm_dev_ioctl_check_extension(long ext)
    {
    	int r;
    
    	switch (ext) {
    	case KVM_CAP_IRQCHIP:
    	case KVM_CAP_HLT:
    	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
    	case KVM_CAP_SET_TSS_ADDR:
    
    	case KVM_CAP_EXT_CPUID:
    
    	case KVM_CAP_CLOCKSOURCE:
    
    	case KVM_CAP_PIT:
    
    	case KVM_CAP_NOP_IO_DELAY:
    
    	case KVM_CAP_MP_STATE:
    
    	case KVM_CAP_SYNC_MMU:
    
    	case KVM_CAP_REINJECT_CONTROL:
    
    	case KVM_CAP_IRQ_INJECT_STATUS:
    
    	case KVM_CAP_ASSIGN_DEV_IRQ:
    
    	case KVM_CAP_IRQFD:
    
    	case KVM_CAP_IOEVENTFD:
    
    	case KVM_CAP_PIT2:
    
    	case KVM_CAP_PIT_STATE2:
    
    	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
    
    	case KVM_CAP_XEN_HVM:
    
    	case KVM_CAP_ADJUST_CLOCK:
    
    	case KVM_CAP_VCPU_EVENTS:
    
    	case KVM_CAP_HYPERV:
    
    	case KVM_CAP_HYPERV_VAPIC:
    
    	case KVM_CAP_HYPERV_SPIN:
    
    	case KVM_CAP_PCI_SEGMENT:
    
    	case KVM_CAP_X86_ROBUST_SINGLESTEP:
    
	case KVM_CAP_XSAVE:
		r = 1;
		break;
    	case KVM_CAP_COALESCED_MMIO:
    		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
    		break;
    
    	case KVM_CAP_VAPIC:
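		/* The software VAPIC is only useful when the CPU lacks
		 * hardware TPR acceleration. */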
    		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
    		break;
    
    	case KVM_CAP_NR_VCPUS:
    		r = KVM_MAX_VCPUS;
    		break;
    
    	case KVM_CAP_NR_MEMSLOTS:
    		r = KVM_MEMORY_SLOTS;
    		break;
    
    	case KVM_CAP_PV_MMU:	/* obsolete */
		r = 0;
		break;
    	case KVM_CAP_IOMMU:
    
		r = iommu_found();
		break;
    	case KVM_CAP_MCE:
    		r = KVM_MAX_MCE_BANKS;
    		break;
    
    	case KVM_CAP_XCRS:
    		r = cpu_has_xsave;
		break;
	default:
		r = 0;
		break;
	}
	return r;
}

    long kvm_arch_dev_ioctl(struct file *filp,
    			unsigned int ioctl, unsigned long arg)
    {
    	void __user *argp = (void __user *)arg;
    	long r;
    
    	switch (ioctl) {
    	case KVM_GET_MSR_INDEX_LIST: {
    		struct kvm_msr_list __user *user_msr_list = argp;
    		struct kvm_msr_list msr_list;
    		unsigned n;
    
    		r = -EFAULT;
    		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
    			goto out;
    		n = msr_list.nmsrs;
    		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
    		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
    			goto out;
    		r = -E2BIG;
    
    		if (n < msr_list.nmsrs)
    
    			goto out;
    		r = -EFAULT;
    		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
    				 num_msrs_to_save * sizeof(u32)))
    			goto out;
    
    		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
    
    				 &emulated_msrs,
    				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
    			goto out;
    		r = 0;
    		break;
    	}
    
    	case KVM_GET_SUPPORTED_CPUID: {
    		struct kvm_cpuid2 __user *cpuid_arg = argp;
    		struct kvm_cpuid2 cpuid;
    
    		r = -EFAULT;
    		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
    			goto out;
    		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
    
    						      cpuid_arg->entries);