[kernel] r18378 - in dists/sid/linux-2.6/debian: . patches/bugfix/x86 patches/series

Aurelien Jarno aurel32 at alioth.debian.org
Sun Dec 11 16:53:06 UTC 2011


Author: aurel32
Date: Sun Dec 11 16:53:04 2011
New Revision: 18378

Log:
* [x86] Fix issues in KVM nVMX implementation by backporting changes 
    from 3.2.

Added:
   dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Add-KVM_REQ_IMMEDIATE_EXIT.patch
   dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Fix-nested-VMX-TSC-emulation.patch
   dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Fix-warning-causing-idt-vectoring-info-behavior.patch
Modified:
   dists/sid/linux-2.6/debian/changelog
   dists/sid/linux-2.6/debian/patches/series/base

Modified: dists/sid/linux-2.6/debian/changelog
==============================================================================
--- dists/sid/linux-2.6/debian/changelog	Sat Dec 10 19:27:16 2011	(r18377)
+++ dists/sid/linux-2.6/debian/changelog	Sun Dec 11 16:53:04 2011	(r18378)
@@ -22,6 +22,10 @@
     PCH_GBE, PCH_PHUB, SERIAL_PCH_UART, SPI_TOPCLIFF_PCH, USB_GADGET,
     USB_EG20T as modules
 
+  [ Aurelien Jarno ]
+  * [x86] Fix issues in KVM nVMX implementation by backporting changes 
+    from 3.2.
+
  -- Bastian Blank <waldi at debian.org>  Thu, 01 Dec 2011 13:17:34 +0100
 
 linux-2.6 (3.1.4-1) unstable; urgency=low

Added: dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Add-KVM_REQ_IMMEDIATE_EXIT.patch
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Add-KVM_REQ_IMMEDIATE_EXIT.patch	Sun Dec 11 16:53:04 2011	(r18378)
@@ -0,0 +1,103 @@
+commit a3c6d93b2c5b8724e46ee3335f65bfc75c675090
+Author: Nadav Har'El <nyh at il.ibm.com>
+Date:   Thu Sep 22 13:52:56 2011 +0300
+
+    KVM: nVMX: Add KVM_REQ_IMMEDIATE_EXIT
+    
+    This patch adds a new vcpu->requests bit, KVM_REQ_IMMEDIATE_EXIT.
+    This bit requests that when next entering the guest, we should run it only
+    for as little as possible, and exit again.
+    
+    We use this new option in nested VMX: When L1 launches L2, but L0 wishes L1
+    to continue running so it can inject an event to it, we unfortunately cannot
+    just pretend to have run L2 for a little while - We must really launch L2,
+    otherwise certain one-off vmcs12 parameters (namely, L1 injection into L2)
+    will be lost. So the existing code runs L2 in this case.
+    But L2 could potentially run for a long time until it exits, and the
+    injection into L1 will be delayed. The new KVM_REQ_IMMEDIATE_EXIT allows us
+    to request that L2 will be entered, as necessary, but will exit as soon as
+    possible after entry.
+    
+    Our implementation of this request uses smp_send_reschedule() to send a
+    self-IPI, with interrupts disabled. The interrupts remain disabled until the
+    guest is entered, and then, after the entry is complete (often including
+    processing an injection and jumping to the relevant handler), the physical
+    interrupt is noticed and causes an exit.
+    
+    On recent Intel processors, we could have achieved the same goal by using
+    MTF instead of a self-IPI. Another technique worth considering in the future
+    is to use VM_EXIT_ACK_INTR_ON_EXIT and a highest-priority vector IPI - to
+    slightly improve performance by avoiding the useless interrupt handler
+    which ends up being called when smp_send_reschedule() is used.
+    
+    Signed-off-by: Nadav Har'El <nyh at il.ibm.com>
+    Signed-off-by: Avi Kivity <avi at redhat.com>
+
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index a0d6bd9..4693d54 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -3858,12 +3858,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+ static void enable_irq_window(struct kvm_vcpu *vcpu)
+ {
+ 	u32 cpu_based_vm_exec_control;
+-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+-		/* We can get here when nested_run_pending caused
+-		 * vmx_interrupt_allowed() to return false. In this case, do
+-		 * nothing - the interrupt will be injected later.
++	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
++		/*
++		 * We get here if vmx_interrupt_allowed() said we can't
++		 * inject to L1 now because L2 must run. Ask L2 to exit
++		 * right after entry, so we can inject to L1 more promptly.
+ 		 */
++		kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
+ 		return;
++	}
+ 
+ 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 41234ff..83b839f 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -5559,6 +5559,7 @@ static int vcpu_enter_guest(struct kvm_v
+ 	bool nmi_pending;
+ 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+ 		vcpu->run->request_interrupt_window;
++	bool req_immediate_exit = 0;
+ 
+ 	if (vcpu->requests) {
+ 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+@@ -5596,7 +5597,8 @@ static int vcpu_enter_guest(struct kvm_v
+ 		}
+ 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ 			record_steal_time(vcpu);
+-
++		req_immediate_exit =
++			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
+ 	}
+ 
+ 	r = kvm_mmu_reload(vcpu);
+@@ -5655,6 +5657,9 @@ static int vcpu_enter_guest(struct kvm_v
+ 
+ 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ 
++	if (req_immediate_exit)
++		smp_send_reschedule(vcpu->cpu);
++
+ 	kvm_guest_enter();
+ 
+ 	if (unlikely(vcpu->arch.switch_db_regs)) {
+diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
+index d526231..9fedeb3 100644
+--- a/include/linux/kvm_host.h
++++ b/include/linux/kvm_host.h
+@@ -50,6 +50,7 @@
+ #define KVM_REQ_EVENT             11
+ #define KVM_REQ_APF_HALT          12
+ #define KVM_REQ_STEAL_UPDATE      13
++#define KVM_REQ_IMMEDIATE_EXIT    15
+ 
+ #define KVM_USERSPACE_IRQ_SOURCE_ID	0
+ 

Added: dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Fix-nested-VMX-TSC-emulation.patch
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Fix-nested-VMX-TSC-emulation.patch	Sun Dec 11 16:53:04 2011	(r18378)
@@ -0,0 +1,77 @@
+commit 27fc51b21cea3386a6672699631975d1097f9d39
+Author: Nadav Har'El <nyh at il.ibm.com>
+Date:   Tue Aug 2 15:54:52 2011 +0300
+
+    KVM: nVMX: Fix nested VMX TSC emulation
+    
+    This patch fixes two corner cases in nested (L2) handling of TSC-related
+    issues:
+    
+    1. Somewhat surprisingly, according to the Intel spec, if L1 allows WRMSR to
+    the TSC MSR without an exit, then this should set L1's TSC value itself - not
+    offset by vmcs12.TSC_OFFSET (like was wrongly done in the previous code).
+    
+    2. Allow L1 to disable the TSC_OFFSETING control, and then correctly ignore
+    the vmcs12.TSC_OFFSET.
+    
+    Signed-off-by: Nadav Har'El <nyh at il.ibm.com>
+    Signed-off-by: Avi Kivity <avi at redhat.com>
+
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index 97b6454..5e8d411 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -1777,15 +1777,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+  */
+ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+ {
+-	vmcs_write64(TSC_OFFSET, offset);
+-	if (is_guest_mode(vcpu))
++	if (is_guest_mode(vcpu)) {
+ 		/*
+-		 * We're here if L1 chose not to trap the TSC MSR. Since
+-		 * prepare_vmcs12() does not copy tsc_offset, we need to also
+-		 * set the vmcs12 field here.
++		 * We're here if L1 chose not to trap WRMSR to TSC. According
++		 * to the spec, this should set L1's TSC; The offset that L1
++		 * set for L2 remains unchanged, and still needs to be added
++		 * to the newly set TSC to get L2's TSC.
+ 		 */
+-		get_vmcs12(vcpu)->tsc_offset = offset -
+-			to_vmx(vcpu)->nested.vmcs01_tsc_offset;
++		struct vmcs12 *vmcs12;
++		to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
++		/* recalculate vmcs02.TSC_OFFSET: */
++		vmcs12 = get_vmcs12(vcpu);
++		vmcs_write64(TSC_OFFSET, offset +
++			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
++			 vmcs12->tsc_offset : 0));
++	} else {
++		vmcs_write64(TSC_OFFSET, offset);
++	}
+ }
+ 
+ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+@@ -6485,8 +6493,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+ 
+ 	set_cr4_guest_host_mask(vmx);
+ 
+-	vmcs_write64(TSC_OFFSET,
+-		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
++	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
++		vmcs_write64(TSC_OFFSET,
++			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
++	else
++		vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ 
+ 	if (enable_vpid) {
+ 		/*
+@@ -6893,7 +6904,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+ 
+ 	load_vmcs12_host_state(vcpu, vmcs12);
+ 
+-	/* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
++	/* Update TSC_OFFSET if TSC was changed while L2 ran */
+ 	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ 
+ 	/* This is needed for same reason as it was needed in prepare_vmcs02 */

Added: dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Fix-warning-causing-idt-vectoring-info-behavior.patch
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ dists/sid/linux-2.6/debian/patches/bugfix/x86/KVM-nVMX-Fix-warning-causing-idt-vectoring-info-behavior.patch	Sun Dec 11 16:53:04 2011	(r18378)
@@ -0,0 +1,66 @@
+commit ea926dcf58278fa05f1873491ad298e67951c80f
+Author: Nadav Har'El <nyh at il.ibm.com>
+Date:   Thu Sep 22 13:53:26 2011 +0300
+
+    KVM: nVMX: Fix warning-causing idt-vectoring-info behavior
+    
+    When L0 wishes to inject an interrupt while L2 is running, it emulates an exit
+    to L1 with EXIT_REASON_EXTERNAL_INTERRUPT. This was explained in the original
+    nVMX patch 23, titled "Correct handling of interrupt injection".
+    
+    Unfortunately, it is possible (though rare) that at this point there is valid
+    idt_vectoring_info in vmcs02. For example, L1 injected some interrupt to L2,
+    and when L2 tried to run this interrupt's handler, it got a page fault - so
+    it returns the original interrupt vector in idt_vectoring_info. The problem
+    is that if this is the case, we cannot exit to L1 with EXTERNAL_INTERRUPT
+    like we wished to, because the VMX spec guarantees that idt_vectoring_info
+    and exit_reason_external_interrupt can never happen together. This is not
+    just specified in the spec - a KVM L1 actually prints a kernel warning
+    "unexpected, valid vectoring info" if we violate this guarantee, and some
+    users noticed these warnings in L1's logs.
+    
+    In order to better emulate a processor, which would never return the external
+    interrupt and the idt-vectoring-info together, we need to separate the two
+    injection steps: First, complete L1's injection into L2 (i.e., enter L2,
+    injecting to it the idt-vectoring-info); Second, after entry into L2 succeeds
+    and it exits back to L0, exit to L1 with the EXIT_REASON_EXTERNAL_INTERRUPT.
+    Most of this is already in the code - the only change we need is to remain
+    in L2 (and not exit to L1) in this case.
+    
+    Note that the previous patch ensures (by using KVM_REQ_IMMEDIATE_EXIT) that
+    although we do enter L2 first, it will exit immediately after processing its
+    injection, allowing us to promptly inject to L1.
+    
+    Note how we test vmcs12->idt_vectoring_info_field; This isn't really the
+    vmcs12 value (we haven't exited to L1 yet, so vmcs12 hasn't been updated),
+    but rather the place we save, at the end of vmx_vcpu_run, the vmcs02 value
+    of this field. This was explained in patch 25 ("Correct handling of idt
+    vectoring info") of the original nVMX patch series.
+    
+    Thanks to Dave Allan and to Federico Simoncelli for reporting this bug,
+    to Abel Gordon for helping me figure out the solution, and to Avi Kivity
+    for helping to improve it.
+    
+    Signed-off-by: Nadav Har'El <nyh at il.ibm.com>
+    Signed-off-by: Avi Kivity <avi at redhat.com>
+
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index 4693d54..f3ec38f 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -3993,11 +3993,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+ {
+ 	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+-		struct vmcs12 *vmcs12;
+-		if (to_vmx(vcpu)->nested.nested_run_pending)
++		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
++		if (to_vmx(vcpu)->nested.nested_run_pending ||
++		    (vmcs12->idt_vectoring_info_field &
++		     VECTORING_INFO_VALID_MASK))
+ 			return 0;
+ 		nested_vmx_vmexit(vcpu);
+-		vmcs12 = get_vmcs12(vcpu);
+ 		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+ 		vmcs12->vm_exit_intr_info = 0;
+ 		/* fall through to normal code, but now in L1, not L2 */

Modified: dists/sid/linux-2.6/debian/patches/series/base
==============================================================================
--- dists/sid/linux-2.6/debian/patches/series/base	Sat Dec 10 19:27:16 2011	(r18377)
+++ dists/sid/linux-2.6/debian/patches/series/base	Sun Dec 11 16:53:04 2011	(r18378)
@@ -86,3 +86,8 @@
 + bugfix/all/0004-staging-brcm80211-restrict-register-access-method-fo.patch
 + bugfix/all/0005-staging-brcm80211-restrict-MIPS-dma-bug-workaround-t.patch
 + debian/inetpeer-hide-ABI-change-in-3.1.5.patch
+
++ bugfix/x86/KVM-nVMX-Fix-nested-VMX-TSC-emulation.patch
++ bugfix/x86/KVM-nVMX-Add-KVM_REQ_IMMEDIATE_EXIT.patch
++ bugfix/x86/KVM-nVMX-Fix-warning-causing-idt-vectoring-info-behavior.patch
+



More information about the Kernel-svn-changes mailing list