KVM: SEV: Allow for VMPL level specification in AP create
Update AP creation to support ADD/DESTROY of VMSAs at levels other than
VMPL0 in order to run under an SVSM at VMPL1 or lower. To maintain
backwards compatibility, the VMPL is specified in bits 16 to 19 of the
AP Creation request in SW_EXITINFO1 of the GHCB.
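
For illustration only, a hypervisor-side decode of this encoding looks roughly like the
following minimal sketch, using the mask/shift definitions added below (exit_info_1 stands
in for the raw SW_EXITINFO1 value):

	/* Sketch: split an AP Creation request into its fields */
	u32 lo               = lower_32_bits(exit_info_1);
	unsigned int vmpl    = (lo & SVM_VMGEXIT_AP_VMPL_MASK) >> SVM_VMGEXIT_AP_VMPL_SHIFT;
	unsigned int request = lo & ~SVM_VMGEXIT_AP_VMPL_MASK;	/* CREATE_ON_INIT/CREATE/DESTROY */
	unsigned int apic_id = upper_32_bits(exit_info_1);	/* target vCPU APIC ID */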

In order to track the VMSAs at different levels, create arrays for the
VMSAs, GHCBs, registered GHCBs and others. When switching VMPL levels,
these entries will be used to set the VMSA and GHCB physical addresses
in the VMCB for the VMPL level.
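
The header changes that define this per-VMPL state are in files not loaded in this excerpt;
based on the accessors used in the diff (vmpl_vmsa(), vmpl_vmsa_hpa(), vmpl_vmsa_gpa(),
cur_vmpl(), tgt_vmpl(), and the vmsa_info[]/ghcb_gpa[] arrays), the shape is roughly the
following hypothetical sketch, not the actual layout:

	/* Hypothetical per-VMPL tracking, indexed by SVM_SEV_VMPL0..SVM_SEV_VMPL3 */
	struct sev_vmsa_info {
		struct sev_es_save_area *vmsa;	/* VMSA page for this VMPL */
		hpa_t hpa;			/* VMSA HPA programmed into vmcb->control.vmsa_pa */
		gpa_t gpa;			/* guest-supplied VMSA GPA from AP create */
		bool ap_waiting_for_reset;
		bool has_guest_vmsa;
	};

	struct vcpu_sev_es_state {
		struct sev_vmsa_info vmsa_info[SVM_SEV_VMPL_MAX];
		u64 ghcb_gpa[SVM_SEV_VMPL_MAX];
		u64 ghcb_registered_gpa[SVM_SEV_VMPL_MAX];
		unsigned int snp_current_vmpl;	/* backs cur_vmpl(svm) */
		unsigned int snp_target_vmpl;	/* backs tgt_vmpl(svm) */
		/* ... existing SEV-ES fields ... */
	};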

In order to ensure that the proper responses are returned in the proper GHCB,
the GHCB must be unmapped at the current level and saved for restoration
later when switching back to that VMPL level.

Additional checks are applied to prevent a non-VMPL0 vCPU from performing
an AP creation request at VMPL0. A vCPU also cannot replace its own VMSA.

Signed-off-by: Tom Lendacky <[email protected]>
tlendacky authored and roy-hopkins committed Oct 1, 2024
1 parent c957831 commit 641ca6d
Showing 6 changed files with 169 additions and 48 deletions.
9 changes: 9 additions & 0 deletions arch/x86/include/asm/svm.h
@@ -294,6 +294,15 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
(SVM_SEV_FEAT_RESTRICTED_INJECTION | \
SVM_SEV_FEAT_ALTERNATE_INJECTION)

enum {
SVM_SEV_VMPL0 = 0,
SVM_SEV_VMPL1,
SVM_SEV_VMPL2,
SVM_SEV_VMPL3,

SVM_SEV_VMPL_MAX
};

struct vmcb_seg {
u16 selector;
u16 attrib;
2 changes: 2 additions & 0 deletions arch/x86/include/uapi/asm/svm.h
@@ -115,6 +115,8 @@
#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
#define SVM_VMGEXIT_AP_VMPL_MASK GENMASK(19, 16)
#define SVM_VMGEXIT_AP_VMPL_SHIFT 16
#define SVM_VMGEXIT_GET_APIC_IDS 0x80000017
#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
146 changes: 107 additions & 39 deletions arch/x86/kvm/svm/sev.c
@@ -807,7 +807,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
struct sev_es_save_area *save = svm->sev_es.vmsa;
struct sev_es_save_area *save = vmpl_vmsa(svm, SVM_SEV_VMPL0);
struct xregs_state *xsave;
const u8 *s;
u8 *d;
@@ -920,11 +920,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
* the VMSA memory content (i.e it will write the same memory region
* with the guest's key), so invalidate it first.
*/
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
clflush_cache_range(vmpl_vmsa(svm, SVM_SEV_VMPL0), PAGE_SIZE);

vmsa.reserved = 0;
vmsa.handle = to_kvm_sev_info(kvm)->handle;
vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.address = __sme_pa(vmpl_vmsa(svm, SVM_SEV_VMPL0));
vmsa.len = PAGE_SIZE;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
if (ret)
@@ -2453,7 +2453,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)

kvm_for_each_vcpu(i, vcpu, kvm) {
struct vcpu_svm *svm = to_svm(vcpu);
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
u64 pfn = __pa(vmpl_vmsa(svm, SVM_SEV_VMPL0)) >> PAGE_SHIFT;

ret = sev_es_sync_vmsa(svm);
if (ret)
Expand All @@ -2465,7 +2465,7 @@ static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
return ret;

/* Issue the SNP command to encrypt the VMSA */
data.address = __sme_pa(svm->sev_es.vmsa);
data.address = __sme_pa(vmpl_vmsa(svm, SVM_SEV_VMPL0));
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
&data, &argp->error);
if (ret) {
@@ -3179,16 +3179,16 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
* releasing it back to the system.
*/
if (sev_snp_guest(vcpu->kvm)) {
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
u64 pfn = __pa(vmpl_vmsa(svm, SVM_SEV_VMPL0)) >> PAGE_SHIFT;

if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
goto skip_vmsa_free;
}

if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
sev_flush_encrypted_page(vcpu, vmpl_vmsa(svm, SVM_SEV_VMPL0));

__free_page(virt_to_page(svm->sev_es.vmsa));
__free_page(virt_to_page(vmpl_vmsa(svm, SVM_SEV_VMPL0)));

skip_vmsa_free:
if (svm->sev_es.ghcb_sa_free)
@@ -3386,13 +3386,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_AP_CREATION:
case SVM_VMGEXIT_AP_CREATION: {
unsigned int request;

if (!sev_snp_guest(vcpu->kvm))
goto vmgexit_err;
if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)

request = lower_32_bits(control->exit_info_1);
request &= ~SVM_VMGEXIT_AP_VMPL_MASK;
if (request != SVM_VMGEXIT_AP_DESTROY)
if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
break;
}
case SVM_VMGEXIT_GET_APIC_IDS:
if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
@@ -3851,9 +3857,10 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)

/* Clear use of the VMSA */
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
tgt_vmpl_vmsa_hpa(svm) = INVALID_PAGE;

if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
if (VALID_PAGE(tgt_vmpl_vmsa_gpa(svm))) {
gfn_t gfn = gpa_to_gfn(tgt_vmpl_vmsa_gpa(svm));
struct kvm_memory_slot *slot;
kvm_pfn_t pfn;

@@ -3871,32 +3878,54 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
/*
* From this point forward, the VMSA will always be a
* guest-mapped page rather than the initial one allocated
* by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
* could be free'd and cleaned up here, but that involves
* cleanups like wbinvd_on_all_cpus() which would ideally
* be handled during teardown rather than guest boot.
* Deferring that also allows the existing logic for SEV-ES
* VMSAs to be re-used with minimal SNP-specific changes.
* by KVM in svm->sev_es.vmsa_info[vmpl].vmsa. In theory,
* svm->sev_es.vmsa_info[vmpl].vmsa could be free'd and cleaned
* up here, but that involves cleanups like wbinvd_on_all_cpus()
* which would ideally be handled during teardown rather than
* guest boot. Deferring that also allows the existing logic for
* SEV-ES VMSAs to be re-used with minimal SNP-specific changes.
*/
svm->sev_es.snp_has_guest_vmsa = true;
tgt_vmpl_has_guest_vmsa(svm) = true;

/* Use the new VMSA */
svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
tgt_vmpl_vmsa_hpa(svm) = pfn_to_hpa(pfn);

/*
* Since the vCPU may not have gone through the LAUNCH_UPDATE_VMSA path,
* be sure to mark the guest state as protected and enable LBR virtualization.
*/
vcpu->arch.guest_state_protected = true;
svm_enable_lbrv(vcpu);

/* Mark the vCPU as runnable */
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
tgt_vmpl_vmsa_gpa(svm) = INVALID_PAGE;

/*
* gmem pages aren't currently migratable, but if this ever
* changes then care should be taken to ensure
* svm->sev_es.vmsa is pinned through some other means.
* svm->sev_es.vmsa_info[vmpl].vmsa is pinned through some other
* means.
*/
kvm_release_pfn_clean(pfn);
}

if (cur_vmpl(svm) != tgt_vmpl(svm)) {
/* Unmap the current GHCB */
sev_es_unmap_ghcb(svm);

/* Save the GHCB GPA of the current VMPL */
svm->sev_es.ghcb_gpa[cur_vmpl(svm)] = svm->vmcb->control.ghcb_gpa;

/* Set the GHCB_GPA for the target VMPL and make it the current VMPL */
svm->vmcb->control.ghcb_gpa = svm->sev_es.ghcb_gpa[tgt_vmpl(svm)];

cur_vmpl(svm) = tgt_vmpl(svm);
}

/*
* When replacing the VMSA during SEV-SNP AP creation,
* mark the VMCB dirty so that full state is always reloaded.
@@ -3919,10 +3948,10 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)

mutex_lock(&svm->sev_es.snp_vmsa_mutex);

if (!svm->sev_es.snp_ap_waiting_for_reset)
if (!tgt_vmpl_ap_waiting_for_reset(svm))
goto unlock;

svm->sev_es.snp_ap_waiting_for_reset = false;
tgt_vmpl_ap_waiting_for_reset(svm) = false;

ret = __sev_snp_update_protected_guest_state(vcpu);
if (ret)
@@ -3940,12 +3969,24 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
struct vcpu_svm *target_svm;
unsigned int request;
unsigned int apic_id;
unsigned int vmpl;
bool kick;
int ret;

request = lower_32_bits(svm->vmcb->control.exit_info_1);
apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);

vmpl = (request & SVM_VMGEXIT_AP_VMPL_MASK) >> SVM_VMGEXIT_AP_VMPL_SHIFT;
request &= ~SVM_VMGEXIT_AP_VMPL_MASK;

/* Validate the requested VMPL level */
if (vmpl >= SVM_SEV_VMPL_MAX) {
vcpu_unimpl(vcpu, "vmgexit: invalid VMPL level [%u] from guest\n",
vmpl);
return -EINVAL;
}
vmpl = array_index_nospec(vmpl, SVM_SEV_VMPL_MAX);

/* Validate the APIC ID */
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
if (!target_vcpu) {
@@ -3967,13 +4008,22 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)

mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);

target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
target_svm->sev_es.snp_ap_waiting_for_reset = true;
vmpl_vmsa_gpa(target_svm, vmpl) = INVALID_PAGE;
vmpl_ap_waiting_for_reset(target_svm, vmpl) = true;

/* Interrupt injection mode shouldn't change for AP creation */
/* VMPL0 can only be replaced by another vCPU running VMPL0 */
if (vmpl == SVM_SEV_VMPL0 &&
(vcpu == target_vcpu ||
vmpl_vmsa_hpa(svm, SVM_SEV_VMPL0) != svm->vmcb->control.vmsa_pa)) {
ret = -EINVAL;
goto out;
}

/* Perform common AP creation validation */
if (request < SVM_VMGEXIT_AP_DESTROY) {
u64 sev_features;

/* Interrupt injection mode shouldn't change for AP creation */
sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
sev_features ^= sev->vmsa_features;

@@ -3983,13 +4033,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
ret = -EINVAL;
goto out;
}
}

switch (request) {
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
kick = false;
fallthrough;
case SVM_VMGEXIT_AP_CREATE:
/* Validate the input VMSA page */
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
svm->vmcb->control.exit_info_2);
@@ -4011,8 +4056,17 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
ret = -EINVAL;
goto out;
}
}

target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
switch (request) {
case SVM_VMGEXIT_AP_CREATE_ON_INIT:
/* Delay switching to the new VMSA */
kick = false;
fallthrough;
case SVM_VMGEXIT_AP_CREATE:
/* Switch to new VMSA on the next VMRUN */
target_svm->sev_es.snp_target_vmpl = vmpl;
vmpl_vmsa_gpa(target_svm, vmpl) = svm->vmcb->control.exit_info_2 & PAGE_MASK;
break;
case SVM_VMGEXIT_AP_DESTROY:
break;
@@ -4299,7 +4353,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
GHCB_MSR_GPA_VALUE_POS);

svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
svm->sev_es.ghcb_registered_gpa[cur_vmpl(svm)] = gfn_to_gpa(gfn);

set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
GHCB_MSR_GPA_VALUE_POS);
@@ -4580,8 +4634,8 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
if (cur_vmpl_vmsa(svm) && !cur_vmpl_has_guest_vmsa(svm))
svm->vmcb->control.vmsa_pa = __pa(cur_vmpl_vmsa(svm));

/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -4644,16 +4698,30 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
unsigned int i;
u64 sev_info;

/*
* Set the GHCB MSR value as per the GHCB specification when emulating
* vCPU RESET for an SEV-ES guest.
*/
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
sev_info = GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN,
sev_enc_bit);
set_ghcb_msr(svm, sev_info);
svm->sev_es.ghcb_gpa[SVM_SEV_VMPL0] = sev_info;

mutex_init(&svm->sev_es.snp_vmsa_mutex);

/*
* When not running under SNP, the "current VMPL" tracking for a guest
* is always 0 and the base tracking of GPAs and SPAs will be as before
* multiple VMPL support. However, under SNP, multiple VMPL levels can
* be run, so initialize these values appropriately.
*/
for (i = 1; i < SVM_SEV_VMPL_MAX; i++) {
svm->sev_es.vmsa_info[i].hpa = INVALID_PAGE;
svm->sev_es.ghcb_gpa[i] = sev_info;
}
}

void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
6 changes: 4 additions & 2 deletions arch/x86/kvm/svm/svm.c
@@ -1463,8 +1463,10 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
svm_switch_vmcb(svm, &svm->vmcb01);

if (vmsa_page)
svm->sev_es.vmsa = page_address(vmsa_page);
if (vmsa_page) {
vmpl_vmsa(svm, SVM_SEV_VMPL0) = page_address(vmsa_page);
vmpl_vmsa_hpa(svm, SVM_SEV_VMPL0) = __pa(page_address(vmsa_page));
}

svm->guest_state_loaded = false;

(Diffs for the remaining two changed files were not loaded.)
