	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
    	if (r)
    		return;
    
    	if (version & 1)
    		++version;  /* first time write, random junk */
    
    	++version;
    
    
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    
    
	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_guest_time_update below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
	 */
	getboottime(&boot);

    	if (kvm->arch.kvmclock_offset) {
    		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
    		boot = timespec_sub(boot, ts);
    	}
    
    	wc.sec = boot.tv_sec;
    	wc.nsec = boot.tv_nsec;
    	wc.version = version;
    
    
    	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
    
    	version++;
    	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
    }
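
/*
 * To make the comment above concrete: the guest is expected to combine the
 * structure written here with its kvmclock system time, roughly as in this
 * guest-side sketch (read_kvmclock_ns() is a stand-in for the guest's own
 * kvmclock read, not a function in this file):
 *
 *	struct pvclock_wall_clock wc;	// read back from the wall_clock GPA
 *	u64 wall_ns;
 *
 *	wall_ns = (u64)wc.sec * NSEC_PER_SEC + wc.nsec + read_kvmclock_ns();
 *
 * Since the host publishes its boot time (minus kvmclock_offset) above, and
 * kvmclock system time counts nanoseconds from that same point, the sum is
 * the current wall clock time.
 */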
    
    
    static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
    {
    	uint32_t quotient, remainder;
    
    	/* Don't try to replace with do_div(), this one calculates
    	 * "(dividend << 32) / divisor" */
    	__asm__ ( "divl %4"
    		  : "=a" (quotient), "=d" (remainder)
    		  : "0" (0), "1" (dividend), "r" (divisor) );
    	return quotient;
    }
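
/*
 * In other words, div_frac(dividend, divisor) returns dividend/divisor as a
 * 0.32 fixed-point fraction, i.e. the same value as the 64-bit expression
 *
 *	(uint32_t)(((uint64_t)dividend << 32) / divisor)
 *
 * For example, div_frac(1, 3) == 0x55555555, roughly one third of 2^32.
 */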
    
    
    static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
			       s8 *pshift, u32 *pmultiplier)
{
	uint64_t scaled64;
    
    	int32_t  shift = 0;
    	uint64_t tps64;
    	uint32_t tps32;
    
    
    	tps64 = base_khz * 1000LL;
    	scaled64 = scaled_khz * 1000LL;
    
    	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
    
    		tps64 >>= 1;
    		shift--;
    	}
    
    	tps32 = (uint32_t)tps64;
    
    	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
    		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
    
    			scaled64 >>= 1;
    		else
			tps32 <<= 1;
		shift++;
	}

	*pshift = shift;
    	*pmultiplier = div_frac(scaled64, tps32);
    
	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
}
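
/*
 * Worked example: for a 2 GHz clock, converting TSC cycles to nanoseconds
 * uses kvm_get_time_scale(NSEC_PER_SEC / 1000, 2000000, &shift, &mult),
 * which yields shift == 0 and mult == 0x80000000, so
 *
 *	pvclock_scale_delta(cycles, 0x80000000, 0) == cycles / 2
 *
 * i.e. half a nanosecond per 2 GHz cycle.  The opposite direction,
 * kvm_get_time_scale(2000000, NSEC_PER_SEC / 1000, ...), yields shift == 2
 * and mult == 0x80000000, i.e. two cycles per elapsed nanosecond.
 */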
    
    static inline u64 get_kernel_ns(void)
    {
    	struct timespec ts;
    
    	WARN_ON(preemptible());
    	ktime_get_ts(&ts);
    	monotonic_to_bootbased(&ts);
	return timespec_to_ns(&ts);
}
    
    static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
    
    static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
    
    unsigned long max_tsc_khz;
    
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
				   vcpu->arch.virtual_tsc_shift);
}
    
static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
	u64 v = (u64)khz * (1000000 + ppm);
	do_div(v, 1000000);
	return v;
}
    
static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
	u32 thresh_lo, thresh_hi;
    	int use_scaling = 0;
    
    	/* tsc_khz can be zero if TSC calibration fails */
    	if (this_tsc_khz == 0)
    		return;
    
    
	/* Compute a scale to convert nanoseconds to TSC cycles */
    	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
    
    			   &vcpu->arch.virtual_tsc_shift,
    			   &vcpu->arch.virtual_tsc_mult);
    	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
    
    	/*
    	 * Compute the variation in TSC rate which is acceptable
    	 * within the range of tolerance and decide if the
	 * rate being applied is within those bounds of the hardware
    	 * rate.  If so, no scaling or compensation need be done.
    	 */
    	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
    	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
    	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
    		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
    		use_scaling = 1;
    	}
    	kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
    
    }
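
/*
 * Example: with a 2500000 kHz host TSC and the default tsc_tolerance_ppm
 * of 250, the acceptable window is [2499375, 2500625] kHz.  A guest rate
 * inside that window is treated as the hardware rate; anything outside it
 * (say 2400000 kHz) sets use_scaling above.
 */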
    
    static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
    {
    
    	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
    
    				      vcpu->arch.virtual_tsc_mult,
    				      vcpu->arch.virtual_tsc_shift);
    
	tsc += vcpu->arch.this_tsc_write;
	return tsc;
}
    
    void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
    {
    #ifdef CONFIG_X86_64
    	bool vcpus_matched;
    	bool do_request = false;
    	struct kvm_arch *ka = &vcpu->kvm->arch;
    	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
    
    	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
    			 atomic_read(&vcpu->kvm->online_vcpus));
    
    	if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
    		if (!ka->use_master_clock)
    			do_request = 1;
    
    	if (!vcpus_matched && ka->use_master_clock)
    			do_request = 1;
    
    	if (do_request)
    		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
    
    	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
    			    atomic_read(&vcpu->kvm->online_vcpus),
    		            ka->use_master_clock, gtod->clock.vclock_mode);
    #endif
    }
    
    
    static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
    {
    	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
    	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
    }
    
    
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	struct kvm *kvm = vcpu->kvm;
	u64 offset, ns, elapsed;
	unsigned long flags;
	s64 usdiff;
	bool matched;
	u64 data = msr->data;

	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
	ns = get_kernel_ns();
	elapsed = ns - kvm->arch.last_tsc_nsec;
    
	if (vcpu->arch.virtual_tsc_khz) {
		int faulted = 0;

		/* n.b - signed multiplication and division required */
		usdiff = data - kvm->arch.last_tsc_write;
    
    #ifdef CONFIG_X86_64
    
		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
#else
		/* do_div() only does unsigned */
		asm("1: idivl %[divisor]\n"
    		    "2: xor %%edx, %%edx\n"
    		    "   movl $0, %[faulted]\n"
    		    "3:\n"
    		    ".section .fixup,\"ax\"\n"
    		    "4: movl $1, %[faulted]\n"
    		    "   jmp  3b\n"
    		    ".previous\n"
    
		    _ASM_EXTABLE(1b, 4b)

		: "=A"(usdiff), [faulted] "=r" (faulted)
		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));

#endif
		do_div(elapsed, 1000);
    		usdiff -= elapsed;
    		if (usdiff < 0)
    			usdiff = -usdiff;
    
    
    		/* idivl overflow => difference is larger than USEC_PER_SEC */
    		if (faulted)
    			usdiff = USEC_PER_SEC;
    
    	} else
    		usdiff = USEC_PER_SEC; /* disable TSC match window below */
    
	/*
	 * Special case: TSC write with a small delta (1 second) of virtual
    	 * cycle time against real time is interpreted as an attempt to
    	 * synchronize the CPU.
             *
    	 * For a reliable TSC, we can match TSC offsets, and for an unstable
    	 * TSC, we add elapsed time in this computation.  We could let the
    	 * compensation code attempt to catch up if we fall behind, but
    	 * it's better to try to match offsets from the beginning.
             */
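
	/*
	 * Example: on a 2000000 kHz virtual TSC, a write that advances the
	 * TSC by 2e9 cycles corresponds to 1000000 us of guest time.  If the
	 * previous write happened 0.9s of host time earlier (elapsed ~=
	 * 900000 us), usdiff ~= 100000 us < USEC_PER_SEC, so the write is
	 * treated as a synchronization attempt and the offsets are matched
	 * below rather than starting a new generation.
	 */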
    
    	if (usdiff < USEC_PER_SEC &&
    
    	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
    
    		if (!check_tsc_unstable()) {
    
    			offset = kvm->arch.cur_tsc_offset;
    
    			pr_debug("kvm: matched tsc offset for %llu\n", data);
    		} else {
    
    			u64 delta = nsec_to_cycles(vcpu, elapsed);
    
    			data += delta;
    			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
    
			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
		}
		matched = true;
	} else {
    		/*
    		 * We split periods of matched TSC writes into generations.
    		 * For each generation, we track the original measured
    		 * nanosecond time, offset, and write, so if TSCs are in
		 * sync, we can match exact offset, and if not, we can match
		 * exact software computation in compute_guest_tsc()
    		 *
    		 * These values are tracked in kvm->arch.cur_xxx variables.
    		 */
    		kvm->arch.cur_tsc_generation++;
    		kvm->arch.cur_tsc_nsec = ns;
    		kvm->arch.cur_tsc_write = data;
		kvm->arch.cur_tsc_offset = offset;
		matched = false;
		pr_debug("kvm: new tsc generation %u, clock %llu\n",
			 kvm->arch.cur_tsc_generation, data);
	}
    
    
    	/*
	 * We also track the most recent recorded KHZ, write and time to
    	 * allow the matching interval to be extended at each write.
    	 */
    
    	kvm->arch.last_tsc_nsec = ns;
    	kvm->arch.last_tsc_write = data;
    
    	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
    
    
    	/* Reset of TSC must disable overshoot protection below */
    	vcpu->arch.hv_clock.tsc_timestamp = 0;
    
    	vcpu->arch.last_guest_tsc = data;
    
    
    	/* Keep track of which generation this VCPU has synchronized to */
    	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
    	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
    	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
    
    
    	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
    		update_ia32_tsc_adjust_msr(vcpu, offset);
    
    	kvm_x86_ops->write_tsc_offset(vcpu, offset);
    	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
    
    
    	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
    	if (matched)
    		kvm->arch.nr_vcpus_matched_tsc++;
    	else
    		kvm->arch.nr_vcpus_matched_tsc = 0;
    
    	kvm_track_tsc_matching(vcpu);
	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
    
    EXPORT_SYMBOL_GPL(kvm_write_tsc);
    
    
    #ifdef CONFIG_X86_64
    
    static cycle_t read_tsc(void)
    {
    	cycle_t ret;
    	u64 last;
    
    	/*
    	 * Empirically, a fence (of type that depends on the CPU)
    	 * before rdtsc is enough to ensure that rdtsc is ordered
    	 * with respect to loads.  The various CPU manuals are unclear
    	 * as to whether rdtsc can be reordered with later loads,
    	 * but no one has ever seen it happen.
    	 */
    	rdtsc_barrier();
    	ret = (cycle_t)vget_cycles();
    
    	last = pvclock_gtod_data.clock.cycle_last;
    
    	if (likely(ret >= last))
    		return ret;
    
    	/*
    	 * GCC likes to generate cmov here, but this branch is extremely
	 * predictable (it's just a function of time and the likely is
    	 * very likely) and there's a data dependence, so force GCC
    	 * to generate a branch instead.  I don't barrier() because
    	 * we don't actually need a barrier, and if this function
    	 * ever gets inlined it will generate worse code.
    	 */
    	asm volatile ("");
    	return last;
    }
    
    static inline u64 vgettsc(cycle_t *cycle_now)
    {
    	long v;
    	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
    
    	*cycle_now = read_tsc();
    
    	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
    	return v * gtod->clock.mult;
    }
    
    static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
    {
    	unsigned long seq;
    	u64 ns;
    	int mode;
    	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
    
    	ts->tv_nsec = 0;
    	do {
    		seq = read_seqcount_begin(&gtod->seq);
    		mode = gtod->clock.vclock_mode;
    		ts->tv_sec = gtod->monotonic_time_sec;
    		ns = gtod->monotonic_time_snsec;
    		ns += vgettsc(cycle_now);
    		ns >>= gtod->clock.shift;
    	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
    	timespec_add_ns(ts, ns);
    
    	return mode;
    }
    
    /* returns true if host is using tsc clocksource */
    static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
    {
    	struct timespec ts;
    
    	/* checked again under seqlock below */
    	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
    		return false;
    
    	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
    		return false;
    
    	monotonic_to_bootbased(&ts);
    	*kernel_ns = timespec_to_ns(&ts);
    
    	return true;
    }
    #endif
    
    /*
     *
    
 * Assuming a stable TSC across physical CPUs, and a stable TSC
 * across virtual CPUs, the following condition is possible.
 * Each numbered line represents an event visible to both
 * CPUs at the next numbered event.
     *
     * "timespecX" represents host monotonic time. "tscX" represents
     * RDTSC value.
     *
     * 		VCPU0 on CPU0		|	VCPU1 on CPU1
     *
     * 1.  read timespec0,tsc0
     * 2.					| timespec1 = timespec0 + N
     * 					| tsc1 = tsc0 + M
     * 3. transition to guest		| transition to guest
     * 4. ret0 = timespec0 + (rdtsc - tsc0) |
     * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
     * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
     *
     * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
     *
     * 	- ret0 < ret1
     *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
     *		...
     *	- 0 < N - M => M < N
     *
     * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
     * always the case (the difference between two distinct xtime instances
 * might be smaller than the difference between corresponding TSC reads,
     * when updating guest vcpus pvclock areas).
     *
     * To avoid that problem, do not allow visibility of distinct
     * system_timestamp/tsc_timestamp values simultaneously: use a master
     * copy of host monotonic time values. Update that master copy
     * in lockstep.
     *
    
     * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
    
     *
     */
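
/*
 * Numeric instance of the problem: suppose the update of VCPU1's pvclock
 * area samples host time 10us after VCPU0's (N = 10000ns), but the two TSC
 * samples happen to lie 15us worth of cycles apart (M ~ 15000ns).  Then
 *
 *	ret1 = timespec0 + N + (rdtsc - tsc0 - M) = ret0 + (N - M) = ret0 - 5us
 *
 * and a thread migrating from VCPU0 to VCPU1 can observe time jumping back
 * by 5us.  Using one master <kernel_ns, tsc> pair for all vcpus removes the
 * N != M skew entirely.
 */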
    
    static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
    {
    #ifdef CONFIG_X86_64
    	struct kvm_arch *ka = &kvm->arch;
    	int vclock_mode;
    
    	bool host_tsc_clocksource, vcpus_matched;
    
    	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
    			atomic_read(&kvm->online_vcpus));
    
    
    	/*
    	 * If the host uses TSC clock, then passthrough TSC as stable
    	 * to the guest.
    	 */
    
    	host_tsc_clocksource = kvm_get_time_and_clockread(
    
    					&ka->master_kernel_ns,
    					&ka->master_cycle_now);
    
    
    	ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
    
    
    	if (ka->use_master_clock)
    		atomic_set(&kvm_guest_has_master_clock, 1);
    
    	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
    
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
					vcpus_matched);
#endif
}
    
    static void kvm_gen_update_masterclock(struct kvm *kvm)
    {
    #ifdef CONFIG_X86_64
    	int i;
    	struct kvm_vcpu *vcpu;
    	struct kvm_arch *ka = &kvm->arch;
    
    	spin_lock(&ka->pvclock_gtod_sync_lock);
    	kvm_make_mclock_inprogress_request(kvm);
    	/* no guest entries from this point */
    	pvclock_update_vm_gtod_copy(kvm);
    
    	kvm_for_each_vcpu(i, vcpu, kvm)
    		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
    
    	/* guest entries allowed */
    	kvm_for_each_vcpu(i, vcpu, kvm)
    		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
    
    	spin_unlock(&ka->pvclock_gtod_sync_lock);
    #endif
    }
    
    
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
	unsigned long flags, this_tsc_khz;
    
    	struct kvm_vcpu_arch *vcpu = &v->arch;
    
    	struct kvm_arch *ka = &v->kvm->arch;
    
    	s64 kernel_ns, max_kernel_ns;
    
    	u64 tsc_timestamp, host_tsc;
    
	struct pvclock_vcpu_time_info guest_hv_clock;
	u8 pvclock_flags;
	bool use_master_clock;
    
    	kernel_ns = 0;
    	host_tsc = 0;
    
    	/*
    	 * If the host uses TSC clock, then passthrough TSC as stable
    	 * to the guest.
    	 */
    	spin_lock(&ka->pvclock_gtod_sync_lock);
    	use_master_clock = ka->use_master_clock;
    	if (use_master_clock) {
    		host_tsc = ka->master_cycle_now;
    		kernel_ns = ka->master_kernel_ns;
    	}
    	spin_unlock(&ka->pvclock_gtod_sync_lock);
    
    
    	/* Keep irq disabled to prevent changes to the clock */
    	local_irq_save(flags);
    	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
    	if (unlikely(this_tsc_khz == 0)) {
    		local_irq_restore(flags);
    		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
    		return 1;
    	}
    
    	if (!use_master_clock) {
    		host_tsc = native_read_tsc();
    		kernel_ns = get_kernel_ns();
    	}
    
    	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
    
    
    	/*
    	 * We may have to catch up the TSC to match elapsed wall clock
    	 * time for two reasons, even if kvmclock is used.
    	 *   1) CPU could have been running below the maximum TSC rate
    	 *   2) Broken TSC compensation resets the base at each VCPU
    	 *      entry to avoid unknown leaps of TSC even when running
    	 *      again on the same CPU.  This may cause apparent elapsed
    	 *      time to disappear, and the guest to stand still or run
    	 *	very slowly.
    	 */
    	if (vcpu->tsc_catchup) {
    		u64 tsc = compute_guest_tsc(v, kernel_ns);
    		if (tsc > tsc_timestamp) {
    
    			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
    
    			tsc_timestamp = tsc;
		}
	}

	local_irq_restore(flags);

	if (!vcpu->pv_time_enabled)
		return 0;
    
    	/*
    	 * Time as measured by the TSC may go backwards when resetting the base
    	 * tsc_timestamp.  The reason for this is that the TSC resolution is
    	 * higher than the resolution of the other clock scales.  Thus, many
	 * possible measurements of the TSC correspond to one measurement of any
    	 * other clock, and so a spread of values is possible.  This is not a
    	 * problem for the computation of the nanosecond clock; with TSC rates
    	 * around 1GHZ, there can only be a few cycles which correspond to one
    	 * nanosecond value, and any path through this code will inevitably
    	 * take longer than that.  However, with the kernel_ns value itself,
    	 * the precision may be much lower, down to HZ granularity.  If the
    	 * first sampling of TSC against kernel_ns ends in the low part of the
    	 * range, and the second in the high end of the range, we can get:
    	 *
    	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
    	 *
    	 * As the sampling errors potentially range in the thousands of cycles,
    	 * it is possible such a time value has already been observed by the
    	 * guest.  To protect against this, we must compute the system time as
    	 * observed by the guest and ensure the new system time is greater.
    	 */
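
	/*
	 * Example: if kernel_ns only has HZ resolution (HZ=250 -> 4ms steps)
	 * while the TSC advances every cycle, two samples taken on opposite
	 * sides of a tick can make the new (tsc_timestamp, kernel_ns) pair
	 * yield a system time below a value the guest already computed from
	 * the old pair.  max_kernel_ns below is that highest value the guest
	 * may already have observed, used as a floor.
	 */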
    	max_kernel_ns = 0;
    
    	if (vcpu->hv_clock.tsc_timestamp) {
    
    		max_kernel_ns = vcpu->last_guest_tsc -
    				vcpu->hv_clock.tsc_timestamp;
    		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
    				    vcpu->hv_clock.tsc_to_system_mul,
    				    vcpu->hv_clock.tsc_shift);
    		max_kernel_ns += vcpu->last_kernel_ns;
    	}
    
    	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
    
    		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
    				   &vcpu->hv_clock.tsc_shift,
    				   &vcpu->hv_clock.tsc_to_system_mul);
    
		vcpu->hw_tsc_khz = this_tsc_khz;
	}
    
    	/* with a master <monotonic time, tsc value> tuple,
    	 * pvclock clock reads always increase at the (scaled) rate
    	 * of guest TSC - no need to deal with sampling errors.
    	 */
    	if (!use_master_clock) {
    		if (max_kernel_ns > kernel_ns)
    			kernel_ns = max_kernel_ns;
    	}
    
    	/* With all the info we got, fill in the values */
    
    	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
    
    	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
    
    	vcpu->last_kernel_ns = kernel_ns;
    
    	vcpu->last_guest_tsc = tsc_timestamp;
    
    	/*
    	 * The interface expects us to write an even number signaling that the
	 * update is finished. Since the guest won't see the intermediate
	 * state, we just increase by 2 at the end.
	 */
	vcpu->hv_clock.version += 2;
    
    	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
    		&guest_hv_clock, sizeof(guest_hv_clock))))
    		return 0;
    
    
    	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
    
    	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
    
    
    	if (vcpu->pvclock_set_guest_stopped_request) {
    		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
    		vcpu->pvclock_set_guest_stopped_request = false;
    	}
    
    
    	/* If the host uses TSC clocksource, then it is stable */
    	if (use_master_clock)
    		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
    
    
    	vcpu->hv_clock.flags = pvclock_flags;
    
    
	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				&vcpu->hv_clock,
				sizeof(vcpu->hv_clock));
	return 0;
}
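
/*
 * For reference, the guest consumes the structure written above roughly as
 * in this simplified sketch (guest_tsc stands for the guest's own TSC read):
 *
 *	do {
 *		version = src->version;
 *		rmb();
 *		ns = src->system_time +
 *		     pvclock_scale_delta(guest_tsc - src->tsc_timestamp,
 *					 src->tsc_to_system_mul,
 *					 src->tsc_shift);
 *		rmb();
 *	} while ((version & 1) || version != src->version);
 *
 * The retry loop is why version must always end up even and strictly
 * increasing; the update above can skip the intermediate odd value only
 * because the guest never sees the half-written state.
 */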
    
    /*
     * kvmclock updates which are isolated to a given vcpu, such as
     * vcpu->cpu migration, should not allow system_timestamp from
     * the rest of the vcpus to remain static. Otherwise ntp frequency
     * correction applies to one vcpu's system_timestamp but not
     * the others.
     *
     * So in those cases, request a kvmclock update for all vcpus.
     * The worst case for a remote vcpu to update its kvmclock
     * is then bounded by maximum nohz sleep latency.
     */
    
    static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
    {
    	int i;
    	struct kvm *kvm = v->kvm;
    	struct kvm_vcpu *vcpu;
    
    	kvm_for_each_vcpu(i, vcpu, kvm) {
    		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
    		kvm_vcpu_kick(vcpu);
    	}
    }
    
    
    static bool msr_mtrr_valid(unsigned msr)
    {
    	switch (msr) {
    	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
    	case MSR_MTRRfix64K_00000:
    	case MSR_MTRRfix16K_80000:
    	case MSR_MTRRfix16K_A0000:
    	case MSR_MTRRfix4K_C0000:
    	case MSR_MTRRfix4K_C8000:
    	case MSR_MTRRfix4K_D0000:
    	case MSR_MTRRfix4K_D8000:
    	case MSR_MTRRfix4K_E0000:
    	case MSR_MTRRfix4K_E8000:
    	case MSR_MTRRfix4K_F0000:
    	case MSR_MTRRfix4K_F8000:
    	case MSR_MTRRdefType:
    	case MSR_IA32_CR_PAT:
    		return true;
    	case 0x2f8:
    		return true;
    	}
    	return false;
    }
    
    
    static bool valid_pat_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
    }
    
    static bool valid_mtrr_type(unsigned t)
    {
    	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
    }
    
    static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	int i;
    
    	if (!msr_mtrr_valid(msr))
    		return false;
    
    	if (msr == MSR_IA32_CR_PAT) {
    		for (i = 0; i < 8; i++)
    			if (!valid_pat_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;
    	} else if (msr == MSR_MTRRdefType) {
    		if (data & ~0xcff)
    			return false;
    		return valid_mtrr_type(data & 0xff);
    	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
    		for (i = 0; i < 8 ; i++)
    			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
    				return false;
    		return true;
    	}
    
    	/* variable MTRRs */
    	return valid_mtrr_type(data & 0xff);
    }
    
    
    static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
    
    
	if (!mtrr_valid(vcpu, msr, data))
		return 1;
    
    
    	if (msr == MSR_MTRRdefType) {
    		vcpu->arch.mtrr_state.def_type = data;
    		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
    	} else if (msr == MSR_MTRRfix64K_00000)
    		p[0] = data;
    	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
    		p[1 + msr - MSR_MTRRfix16K_80000] = data;
    	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
    		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
    	else if (msr == MSR_IA32_CR_PAT)
    		vcpu->arch.pat = data;
    	else {	/* Variable MTRRs */
    		int idx, is_mtrr_mask;
    		u64 *pt;
    
    		idx = (msr - 0x200) / 2;
    		is_mtrr_mask = msr - 0x200 - 2 * idx;
    		if (!is_mtrr_mask)
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
    		else
    			pt =
    			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
    		*pt = data;
    	}
    
    	kvm_mmu_reset_context(vcpu);
    
    	return 0;
    }
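
/*
 * The variable-range decode above relies on the MSR numbering: 0x200 + 2*i
 * is MTRRphysBase(i) and 0x200 + 2*i + 1 is MTRRphysMask(i).  For example,
 * msr == 0x203 gives idx = 1 and is_mtrr_mask = 1, i.e. the mask register
 * of variable range 1.
 */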
    
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	u64 mcg_cap = vcpu->arch.mcg_cap;
    	unsigned bank_num = mcg_cap & 0xff;
    
    
    	switch (msr) {
    	case MSR_IA32_MCG_STATUS:
    
		vcpu->arch.mcg_status = data;
		break;
	case MSR_IA32_MCG_CTL:
    
    		if (!(mcg_cap & MCG_CTL_P))
    			return 1;
    		if (data != 0 && data != ~(u64)0)
    			return -1;
    		vcpu->arch.mcg_ctl = data;
    		break;
    	default:
    		if (msr >= MSR_IA32_MC0_CTL &&
    		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
    			u32 offset = msr - MSR_IA32_MC0_CTL;
    
    			/* only 0 or all 1s can be written to IA32_MCi_CTL
    			 * some Linux kernels though clear bit 10 in bank 4 to
    			 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
    			 * this to avoid an uncatched #GP in the guest
    			 */
    
    			if ((offset & 0x3) == 0 &&
    
    			    data != 0 && (data | (1 << 10)) != ~(u64)0)
    
    				return -1;
    			vcpu->arch.mce_banks[offset] = data;
    			break;
    		}
    		return 1;
    	}
    	return 0;
    }
    
    
    static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    	int lm = is_long_mode(vcpu);
    	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
    		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
    	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
    		: kvm->arch.xen_hvm_config.blob_size_32;
    	u32 page_num = data & ~PAGE_MASK;
    	u64 page_addr = data & PAGE_MASK;
    	u8 *page;
    	int r;
    
    	r = -E2BIG;
    	if (page_num >= blob_size)
    		goto out;
    	r = -ENOMEM;
    
    	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
    	if (IS_ERR(page)) {
    		r = PTR_ERR(page);
    
		goto out;
	}

	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
    		goto out_free;
    	r = 0;
    out_free:
    	kfree(page);
    out:
    	return r;
    }
    
    
    static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
    {
    	return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
    }
    
    static bool kvm_hv_msr_partition_wide(u32 msr)
    {
    	bool r = false;
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    	case HV_X64_MSR_HYPERCALL:
    		r = true;
    		break;
    	}
    
    	return r;
    }
    
    static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    	struct kvm *kvm = vcpu->kvm;
    
    	switch (msr) {
    	case HV_X64_MSR_GUEST_OS_ID:
    		kvm->arch.hv_guest_os_id = data;
    		/* setting guest os id to zero disables hypercall page */
    		if (!kvm->arch.hv_guest_os_id)
    			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
    		break;
    	case HV_X64_MSR_HYPERCALL: {
    		u64 gfn;
    		unsigned long addr;
    		u8 instructions[4];
    
    		/* if guest os id is not set hypercall should remain disabled */
    		if (!kvm->arch.hv_guest_os_id)
    			break;
    		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
    			kvm->arch.hv_hypercall = data;
    			break;
    		}
    		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
    		addr = gfn_to_hva(kvm, gfn);
    		if (kvm_is_error_hva(addr))
    			return 1;
    		kvm_x86_ops->patch_hypercall(vcpu, instructions);
    		((unsigned char *)instructions)[3] = 0xc3; /* ret */
    
    		if (__copy_to_user((void __user *)addr, instructions, 4))
    
    			return 1;
    		kvm->arch.hv_hypercall = data;
    		break;
    	}
    	default:
    
    		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
    			    "data 0x%llx\n", msr, data);
    
    		return 1;
    	}
    	return 0;
    }
    
    static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
    {
    
    	switch (msr) {
    	case HV_X64_MSR_APIC_ASSIST_PAGE: {
    		unsigned long addr;
    
    		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
    			vcpu->arch.hv_vapic = data;
    			break;
    		}
    		addr = gfn_to_hva(vcpu->kvm, data >>
    				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
    		if (kvm_is_error_hva(addr))
    			return 1;
    
    		if (__clear_user((void __user *)addr, PAGE_SIZE))
    
    			return 1;
    		vcpu->arch.hv_vapic = data;
    		break;
    	}
    	case HV_X64_MSR_EOI:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
    	case HV_X64_MSR_ICR:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
    	case HV_X64_MSR_TPR:
    		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
    	default:
    
		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
			    "data 0x%llx\n", msr, data);
		return 1;
	}

	return 0;
}

    static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
    {
    	gpa_t gpa = data & ~0x3f;
    
    
	/* Bits 2:5 are reserved, should be zero */
	if (data & 0x3c)
		return 1;
    
    	vcpu->arch.apf.msr_val = data;
    
    	if (!(data & KVM_ASYNC_PF_ENABLED)) {
    		kvm_clear_async_pf_completion_queue(vcpu);
    		kvm_async_pf_hash_reset(vcpu);
    		return 0;
    	}
    
    
    	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
					sizeof(u32)))
		return 1;

	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
    
    	kvm_async_pf_wakeup_all(vcpu);
    	return 0;
    }
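
/*
 * Layout of the MSR_KVM_ASYNC_PF_EN value handled above: bit 0 enables the
 * mechanism (KVM_ASYNC_PF_ENABLED), bit 1 requests delivery even in kernel
 * mode (KVM_ASYNC_PF_SEND_ALWAYS), bits 2:5 are reserved, and bits 6:63
 * give the 64-byte aligned address of the guest's 32-bit "reason" word,
 * recovered here as gpa = data & ~0x3f.
 */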
    
    
    static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
	vcpu->arch.pv_time_enabled = false;
}

    static void accumulate_steal_time(struct kvm_vcpu *vcpu)
    {
    	u64 delta;
    
    	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
    		return;
    
    	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
    	vcpu->arch.st.last_steal = current->sched_info.run_delay;
    	vcpu->arch.st.accum_steal = delta;
    }
    
    static void record_steal_time(struct kvm_vcpu *vcpu)
    {
    	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
    		return;
    
    	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
    		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
    		return;
    
    	vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
    	vcpu->arch.st.steal.version += 2;
    	vcpu->arch.st.accum_steal = 0;
    
    	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
    		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
    }
    
    
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	u32 msr = msr_info->index;
	u64 data = msr_info->data;

	switch (msr) {
	case MSR_AMD64_NB_CFG:
    	case MSR_IA32_UCODE_REV:
    	case MSR_IA32_UCODE_WRITE:
    	case MSR_VM_HSAVE_PA:
    	case MSR_AMD64_PATCH_LOADER:
    	case MSR_AMD64_BU_CFG2:
    		break;
    
    
	case MSR_EFER:
		return set_efer(vcpu, data);
    
    	case MSR_K7_HWCR:
    		data &= ~(u64)0x40;	/* ignore flush filter disable */
    
    		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
    
		data &= ~(u64)0x8;	/* ignore TLB cache disable */
		if (data != 0) {
			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				    data);
			return 1;
		}
		break;
    
    	case MSR_FAM10H_MMIO_CONF_BASE:
    		if (data != 0) {
    
    			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "