| /* |
| * Copyright 2018 The Hafnium Authors. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * https://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <stdnoreturn.h> |
| |
| #include "hf/arch/barriers.h" |
| #include "hf/arch/init.h" |
| #include "hf/arch/mm.h" |
| |
| #include "hf/api.h" |
| #include "hf/check.h" |
| #include "hf/cpu.h" |
| #include "hf/dlog.h" |
| #include "hf/panic.h" |
| #include "hf/spci.h" |
| #include "hf/vm.h" |
| |
| #include "vmapi/hf/call.h" |
| |
| #include "debug_el1.h" |
| #include "msr.h" |
| #include "perfmon.h" |
| #include "psci.h" |
| #include "psci_handler.h" |
| #include "smc.h" |
| #include "sysregs.h" |
| |
| /** |
| * Gets the Exception Class, bits [31:26], from the ESR. |
| */ |
| #define GET_EC(esr) ((esr) >> 26) |
| |
| /** |
| * Gets the number of bytes by which to advance the PC past the trapped |
| * instruction. The IL bit of the ESR encodes whether the instruction is |
| * 2 or 4 bytes long. |
| */ |
| #define GET_NEXT_PC_INC(esr) (((esr) & (1u << 25)) ? 4 : 2) |
| |
| /** |
| * The Client ID field within X7 for an SMC64 call. |
| */ |
| #define CLIENT_ID_MASK UINT64_C(0xffff) |
| |
| /** |
| * Returns a reference to the currently executing vCPU. |
| */ |
| static struct vcpu *current(void) |
| { |
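| /* |
| * The pointer to the running vCPU is kept in tpidr_el2 while its state |
| * is loaded onto this physical CPU, so reading it back yields the vCPU |
| * currently being serviced. |
| */ |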
| return (struct vcpu *)read_msr(tpidr_el2); |
| } |
| |
| /** |
| * Saves the state of per-vCPU peripherals, such as the virtual timer, and |
| * informs the arch-independent code that registers have been saved. |
| */ |
| void complete_saving_state(struct vcpu *vcpu) |
| { |
| vcpu->regs.peripherals.cntv_cval_el0 = read_msr(cntv_cval_el0); |
| vcpu->regs.peripherals.cntv_ctl_el0 = read_msr(cntv_ctl_el0); |
| |
| api_regs_state_saved(vcpu); |
| |
| /* |
| * If switching away from the primary, copy the current EL0 virtual |
| * timer registers to the corresponding EL2 physical timer registers. |
| * This is used to emulate the virtual timer for the primary in case it |
| * should fire while a secondary is running. |
| */ |
| if (vcpu->vm->id == HF_PRIMARY_VM_ID) { |
| /* |
| * Clear the timer control register before copying the compare |
| * value, to avoid a spurious timer interrupt. This could be a |
| * problem if the interrupt is configured as edge-triggered, as |
| * it would then be latched in. |
| */ |
| write_msr(cnthp_ctl_el2, 0); |
| write_msr(cnthp_cval_el2, read_msr(cntv_cval_el0)); |
| write_msr(cnthp_ctl_el2, read_msr(cntv_ctl_el0)); |
| } |
| } |
| |
| /** |
| * Restores the state of per-vCPU peripherals, such as the virtual timer. |
| */ |
| void begin_restoring_state(struct vcpu *vcpu) |
| { |
| /* |
| * Clear the timer control register before restoring the compare value, |
| * to avoid a spurious timer interrupt. This could be a problem if the |
| * interrupt is configured as edge-triggered, as it would then be |
| * latched in. |
| */ |
| write_msr(cntv_ctl_el0, 0); |
| write_msr(cntv_cval_el0, vcpu->regs.peripherals.cntv_cval_el0); |
| write_msr(cntv_ctl_el0, vcpu->regs.peripherals.cntv_ctl_el0); |
| |
| /* |
| * If we are switching (back) to the primary, disable the EL2 physical |
| * timer which was being used to emulate the EL0 virtual timer, as the |
| * virtual timer is now running for the primary again. |
| */ |
| if (vcpu->vm->id == HF_PRIMARY_VM_ID) { |
| write_msr(cnthp_ctl_el2, 0); |
| write_msr(cnthp_cval_el2, 0); |
| } |
| } |
| |
| /** |
| * Invalidates all stage-1 TLB entries on the current (physical) CPU for the |
| * current VMID. |
| */ |
| static void invalidate_vm_tlb(void) |
| { |
| /* |
| * Ensure that the last VTTBR write has taken effect so we invalidate |
| * the right set of TLB entries. |
| */ |
| isb(); |
| |
| __asm__ volatile("tlbi vmalle1"); |
| |
| /* |
| * Ensure that no instructions are fetched for the VM until after the |
| * TLB invalidation has taken effect. |
| */ |
| isb(); |
| |
| /* |
| * Ensure that no data reads or writes for the VM happen until after the |
| * TLB invalidation has taken effect. Non-shareable is enough because the |
| * TLB is local to the CPU. |
| */ |
| dsb(nsh); |
| } |
| |
| /** |
| * Invalidates the TLB if a different vCPU is being run than the last vCPU of |
| * the same VM which was run on the current pCPU. |
| * |
| * This is necessary because VMs may (contrary to the architecture |
| * specification) use inconsistent ASIDs across vCPUs. Cf. KVM's similar |
| * workaround: |
| * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=94d0e5980d6791b9 |
| */ |
| void maybe_invalidate_tlb(struct vcpu *vcpu) |
| { |
| size_t current_cpu_index = cpu_index(vcpu->cpu); |
| spci_vcpu_index_t new_vcpu_index = vcpu_index(vcpu); |
| |
| if (vcpu->vm->arch.last_vcpu_on_cpu[current_cpu_index] != |
| new_vcpu_index) { |
| /* |
| * The vCPU has changed since the last time this VM was run on |
| * this pCPU, so we need to invalidate the TLB. |
| */ |
| invalidate_vm_tlb(); |
| |
| /* Record the fact that this vCPU is now running on this CPU. */ |
| vcpu->vm->arch.last_vcpu_on_cpu[current_cpu_index] = |
| new_vcpu_index; |
| } |
| } |
| |
| noreturn void irq_current_exception(uintreg_t elr, uintreg_t spsr) |
| { |
| (void)elr; |
| (void)spsr; |
| |
| panic("IRQ from current"); |
| } |
| |
| noreturn void fiq_current_exception(uintreg_t elr, uintreg_t spsr) |
| { |
| (void)elr; |
| (void)spsr; |
| |
| panic("FIQ from current"); |
| } |
| |
| noreturn void serr_current_exception(uintreg_t elr, uintreg_t spsr) |
| { |
| (void)elr; |
| (void)spsr; |
| |
| panic("SERR from current"); |
| } |
| |
| noreturn void sync_current_exception(uintreg_t elr, uintreg_t spsr) |
| { |
| uintreg_t esr = read_msr(esr_el2); |
| uintreg_t ec = GET_EC(esr); |
| |
| (void)spsr; |
| |
| switch (ec) { |
| case 0x25: /* EC = 100101, Data abort. */ |
| dlog("Data abort: pc=%#x, esr=%#x, ec=%#x", elr, esr, ec); |
| if (!(esr & (1U << 10))) { /* FnV bit clear: FAR_EL2 is valid. */ |
| dlog(", far=%#x", read_msr(far_el2)); |
| } else { |
| dlog(", far=invalid"); |
| } |
| |
| dlog("\n"); |
| break; |
| |
| default: |
| dlog("Unknown current sync exception pc=%#x, esr=%#x, " |
| "ec=%#x\n", |
| elr, esr, ec); |
| break; |
| } |
| |
| panic("EL2 exception"); |
| } |
| |
| /** |
| * Sets or clears the VI bit in the HCR_EL2 register saved in the given |
| * arch_regs. |
| */ |
| static void set_virtual_interrupt(struct arch_regs *r, bool enable) |
| { |
| if (enable) { |
| r->lazy.hcr_el2 |= HCR_EL2_VI; |
| } else { |
| r->lazy.hcr_el2 &= ~HCR_EL2_VI; |
| } |
| } |
| |
| /** |
| * Sets or clears the VI bit in the HCR_EL2 register. |
| */ |
| static void set_virtual_interrupt_current(bool enable) |
| { |
| uintreg_t hcr_el2 = read_msr(hcr_el2); |
| |
| if (enable) { |
| hcr_el2 |= HCR_EL2_VI; |
| } else { |
| hcr_el2 &= ~HCR_EL2_VI; |
| } |
| write_msr(hcr_el2, hcr_el2); |
| } |
| |
| /** |
| * Checks whether to block an SMC being forwarded from a VM. |
| */ |
| static bool smc_is_blocked(const struct vm *vm, uint32_t func) |
| { |
| bool block_by_default = !vm->smc_whitelist.permissive; |
| |
| for (size_t i = 0; i < vm->smc_whitelist.smc_count; ++i) { |
| if (func == vm->smc_whitelist.smcs[i]) { |
| return false; |
| } |
| } |
| |
| dlog("SMC %#010x attempted from VM %d, blocked=%d\n", func, vm->id, |
| block_by_default); |
| |
| /* Access is still allowed in permissive mode. */ |
| return block_by_default; |
| } |
| |
| /** |
| * Applies SMC access control according to the VM's manifest and forwards the |
| * call if access is granted. |
| */ |
| static void smc_forwarder(const struct vcpu *vcpu, struct smc_result *ret) |
| { |
| uint32_t func = vcpu->regs.r[0]; |
| uint32_t client_id = vcpu->vm->id; |
| uintreg_t arg7; |
| |
| if (smc_is_blocked(vcpu->vm, func)) { |
| ret->res0 = SMCCC_ERROR_UNKNOWN; |
| return; |
| } |
| |
| /* |
| * Set the Client ID but keep the existing Secure OS ID and anything |
| * else (currently unspecified) that the client may have passed in the |
| * upper bits. |
| */ |
| arg7 = client_id | (vcpu->regs.r[7] & ~CLIENT_ID_MASK); |
| *ret = smc_forward(func, vcpu->regs.r[1], vcpu->regs.r[2], |
| vcpu->regs.r[3], vcpu->regs.r[4], vcpu->regs.r[5], |
| vcpu->regs.r[6], arg7); |
| |
| /* |
| * Preserve the value the caller passed in x7, rather than the client_id |
| * we generated. Note that this also overwrites any return value that may |
| * be in x7, but the SMCs being forwarded are legacy calls from before |
| * SMCCC 1.2 so will not have more than 4 return values anyway. |
| */ |
| ret->res7 = vcpu->regs.r[7]; |
| } |
| |
| static bool spci_handler(struct spci_value *args, struct vcpu **next) |
| { |
| /* |
| * NOTE: When adding new methods to this handler update |
| * api_spci_features accordingly. |
| */ |
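| /* |
| * Mask off the calling-convention bit so that the SMC32 and SMC64 |
| * variants of each function ID are handled by the same case. |
| */ |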
| switch (args->func & ~SMCCC_CONVENTION_MASK) { |
| case SPCI_VERSION_32: |
| *args = api_spci_version(); |
| return true; |
| case SPCI_ID_GET_32: |
| *args = api_spci_id_get(current()); |
| return true; |
| case SPCI_FEATURES_32: |
| *args = api_spci_features(args->arg1); |
| return true; |
| case SPCI_YIELD_32: |
| api_yield(current(), next); |
| |
| /* SPCI_YIELD always returns SPCI_SUCCESS. */ |
| *args = (struct spci_value){.func = SPCI_SUCCESS_32}; |
| |
| return true; |
| case SPCI_MSG_SEND_32: |
| *args = api_spci_msg_send(spci_msg_send_sender(*args), |
| spci_msg_send_receiver(*args), |
| spci_msg_send_size(*args), |
| spci_msg_send_attributes(*args), |
| current(), next); |
| return true; |
| case SPCI_MSG_WAIT_32: |
| *args = api_spci_msg_recv(true, current(), next); |
| return true; |
| case SPCI_MSG_POLL_32: |
| *args = api_spci_msg_recv(false, current(), next); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Sets or clears the VI bit according to pending interrupts. |
| */ |
| static void update_vi(struct vcpu *next) |
| { |
| if (next == NULL) { |
| /* |
| * Not switching vCPUs, set the bit for the current vCPU |
| * directly in the register. |
| */ |
| struct vcpu *vcpu = current(); |
| |
| sl_lock(&vcpu->lock); |
| set_virtual_interrupt_current( |
| vcpu->interrupts.enabled_and_pending_count > 0); |
| sl_unlock(&vcpu->lock); |
| } else { |
| /* |
| * About to switch vCPUs, set the bit for the vCPU to which we |
| * are switching in the saved copy of the register. |
| */ |
| sl_lock(&next->lock); |
| set_virtual_interrupt( |
| &next->regs, |
| next->interrupts.enabled_and_pending_count > 0); |
| sl_unlock(&next->lock); |
| } |
| } |
| |
| /** |
| * Processes SMC instruction calls. |
| */ |
| static void smc_handler(struct vcpu *vcpu, struct smc_result *ret, |
| struct vcpu **next) |
| { |
| uint32_t func = vcpu->regs.r[0]; |
| |
| if (psci_handler(vcpu, func, vcpu->regs.r[1], vcpu->regs.r[2], |
| vcpu->regs.r[3], &ret->res0, next)) { |
| return; |
| } |
| |
| switch (func & ~SMCCC_CONVENTION_MASK) { |
| case HF_DEBUG_LOG: |
| ret->res0 = api_debug_log(vcpu->regs.r[1], vcpu); |
| return; |
| } |
| |
| smc_forwarder(vcpu, ret); |
| } |
| |
| struct vcpu *hvc_handler(struct vcpu *vcpu) |
| { |
| struct spci_value args = { |
| .func = vcpu->regs.r[0], |
| .arg1 = vcpu->regs.r[1], |
| .arg2 = vcpu->regs.r[2], |
| .arg3 = vcpu->regs.r[3], |
| .arg4 = vcpu->regs.r[4], |
| .arg5 = vcpu->regs.r[5], |
| .arg6 = vcpu->regs.r[6], |
| .arg7 = vcpu->regs.r[7], |
| }; |
| struct vcpu *next = NULL; |
| |
| if (psci_handler(vcpu, args.func, args.arg1, args.arg2, args.arg3, |
| &vcpu->regs.r[0], &next)) { |
| return next; |
| } |
| |
| if (spci_handler(&args, &next)) { |
| vcpu->regs.r[0] = args.func; |
| vcpu->regs.r[1] = args.arg1; |
| vcpu->regs.r[2] = args.arg2; |
| vcpu->regs.r[3] = args.arg3; |
| vcpu->regs.r[4] = args.arg4; |
| vcpu->regs.r[5] = args.arg5; |
| vcpu->regs.r[6] = args.arg6; |
| vcpu->regs.r[7] = args.arg7; |
| update_vi(next); |
| return next; |
| } |
| |
| switch (args.func) { |
| case HF_VM_GET_COUNT: |
| vcpu->regs.r[0] = api_vm_get_count(); |
| break; |
| |
| case HF_VCPU_GET_COUNT: |
| vcpu->regs.r[0] = api_vcpu_get_count(args.arg1, vcpu); |
| break; |
| |
| case HF_VCPU_RUN: |
| vcpu->regs.r[0] = hf_vcpu_run_return_encode( |
| api_vcpu_run(args.arg1, args.arg2, vcpu, &next)); |
| break; |
| |
| case HF_VM_CONFIGURE: |
| vcpu->regs.r[0] = api_vm_configure( |
| ipa_init(args.arg1), ipa_init(args.arg2), vcpu, &next); |
| break; |
| |
| case HF_MAILBOX_CLEAR: |
| vcpu->regs.r[0] = api_mailbox_clear(vcpu, &next); |
| break; |
| |
| case HF_MAILBOX_WRITABLE_GET: |
| vcpu->regs.r[0] = api_mailbox_writable_get(vcpu); |
| break; |
| |
| case HF_MAILBOX_WAITER_GET: |
| vcpu->regs.r[0] = api_mailbox_waiter_get(args.arg1, vcpu); |
| break; |
| |
| case HF_INTERRUPT_ENABLE: |
| vcpu->regs.r[0] = |
| api_interrupt_enable(args.arg1, args.arg2, vcpu); |
| break; |
| |
| case HF_INTERRUPT_GET: |
| vcpu->regs.r[0] = api_interrupt_get(vcpu); |
| break; |
| |
| case HF_INTERRUPT_INJECT: |
| vcpu->regs.r[0] = api_interrupt_inject(args.arg1, args.arg2, |
| args.arg3, vcpu, &next); |
| break; |
| |
| case HF_SHARE_MEMORY: |
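| /* |
| * arg1 packs the target VM id in its upper 32 bits and the share |
| * mode in its lower 32 bits. |
| */ |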
| vcpu->regs.r[0] = api_share_memory( |
| args.arg1 >> 32, ipa_init(args.arg2), args.arg3, |
| args.arg1 & 0xffffffff, vcpu); |
| break; |
| |
| case HF_DEBUG_LOG: |
| vcpu->regs.r[0] = api_debug_log(args.arg1, vcpu); |
| break; |
| |
| default: |
| vcpu->regs.r[0] = SMCCC_ERROR_UNKNOWN; |
| } |
| |
| update_vi(next); |
| |
| return next; |
| } |
| |
| struct vcpu *irq_lower(void) |
| { |
| /* |
| * Switch back to primary VM, interrupts will be handled there. |
| * |
| * If the VM has aborted, this vCPU will be aborted when the scheduler |
| * tries to run it again. This means the interrupt will not be delayed |
| * by the aborted VM. |
| * |
| * TODO: Only switch when the interrupt isn't for the current VM. |
| */ |
| return api_preempt(current()); |
| } |
| |
| struct vcpu *fiq_lower(void) |
| { |
| return irq_lower(); |
| } |
| |
| struct vcpu *serr_lower(void) |
| { |
| dlog("SERR from lower\n"); |
| return api_abort(current()); |
| } |
| |
| /** |
| * Initialises a fault info structure. It assumes that an FnV bit exists at |
| * bit offset 10 of the ESR, and that it is only valid when the bottom 6 bits of |
| * the ESR (the fault status code) are 010000; this is the case for both |
| * instruction and data aborts, but not necessarily for other exception reasons. |
| */ |
| static struct vcpu_fault_info fault_info_init(uintreg_t esr, |
| const struct vcpu *vcpu, |
| uint32_t mode) |
| { |
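| /* The fault status code (DFSC/IFSC) is in the bottom 6 bits of the ESR. */ |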
| uint32_t fsc = esr & 0x3f; |
| struct vcpu_fault_info r; |
| |
| r.mode = mode; |
| r.pc = va_init(vcpu->regs.pc); |
| |
| /* |
| * Check the FnV bit, which is only valid if dfsc/ifsc is 010000. When |
| * set, it indicates that we cannot rely on far_el2. |
| */ |
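| /* |
| * hpfar_el2 holds bits [47:12] of the faulting IPA, so shifting it left |
| * by 8 recovers the page-aligned intermediate physical address. |
| */ |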
| if (fsc == 0x10 && (esr & (1U << 10))) { |
| r.vaddr = va_init(0); |
| r.ipaddr = ipa_init(read_msr(hpfar_el2) << 8); |
| } else { |
| r.vaddr = va_init(read_msr(far_el2)); |
| r.ipaddr = ipa_init((read_msr(hpfar_el2) << 8) | |
| (read_msr(far_el2) & (PAGE_SIZE - 1))); |
| } |
| |
| return r; |
| } |
| |
| struct vcpu *sync_lower_exception(uintreg_t esr) |
| { |
| struct vcpu *vcpu = current(); |
| struct vcpu_fault_info info; |
| struct vcpu *new_vcpu; |
| uintreg_t ec = GET_EC(esr); |
| |
| switch (ec) { |
| case 0x01: /* EC = 000001, WFI or WFE. */ |
| /* Skip the instruction. */ |
| vcpu->regs.pc += GET_NEXT_PC_INC(esr); |
| /* Check TI bit of ISS, 0 = WFI, 1 = WFE. */ |
| if (esr & 1) { |
| /* WFE */ |
| /* |
| * TODO: consider giving the scheduler more context, |
| * somehow. |
| */ |
| api_yield(vcpu, &new_vcpu); |
| return new_vcpu; |
| } |
| /* WFI */ |
| return api_wait_for_interrupt(vcpu); |
| |
| case 0x24: /* EC = 100100, Data abort. */ |
| info = fault_info_init( |
| esr, vcpu, (esr & (1U << 6)) ? MM_MODE_W : MM_MODE_R); |
| if (vcpu_handle_page_fault(vcpu, &info)) { |
| return NULL; |
| } |
| break; |
| |
| case 0x20: /* EC = 100000, Instruction abort. */ |
| info = fault_info_init(esr, vcpu, MM_MODE_X); |
| if (vcpu_handle_page_fault(vcpu, &info)) { |
| return NULL; |
| } |
| break; |
| |
| case 0x16: /* EC = 010110, HVC instruction. */ |
| return hvc_handler(vcpu); |
| |
| case 0x17: /* EC = 010111, SMC instruction. */ { |
| uintreg_t smc_pc = vcpu->regs.pc; |
| struct vcpu *next = NULL; |
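| /* |
| * Pre-load x4-x7 into the result so that they are returned unchanged |
| * unless the handler explicitly overwrites them. |
| */ |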
| struct smc_result ret = {.res4 = vcpu->regs.r[4], |
| .res5 = vcpu->regs.r[5], |
| .res6 = vcpu->regs.r[6], |
| .res7 = vcpu->regs.r[7]}; |
| |
| smc_handler(vcpu, &ret, &next); |
| |
| /* Skip the SMC instruction. */ |
| vcpu->regs.pc = smc_pc + GET_NEXT_PC_INC(esr); |
| vcpu->regs.r[0] = ret.res0; |
| vcpu->regs.r[1] = ret.res1; |
| vcpu->regs.r[2] = ret.res2; |
| vcpu->regs.r[3] = ret.res3; |
| vcpu->regs.r[4] = ret.res4; |
| vcpu->regs.r[5] = ret.res5; |
| vcpu->regs.r[6] = ret.res6; |
| vcpu->regs.r[7] = ret.res7; |
| return next; |
| } |
| |
| /* |
| * EC = 011000, MSR, MRS or System instruction execution that is not |
| * reported using EC 000000, 000001 or 000111. |
| */ |
| case 0x18: |
| /* |
| * NOTE: This should never be reached because it goes through a |
| * separate path handled by handle_system_register_access(). |
| */ |
| panic("Handled by handle_system_register_access()."); |
| |
| default: |
| dlog("Unknown lower sync exception pc=%#x, esr=%#x, " |
| "ec=%#x\n", |
| vcpu->regs.pc, esr, ec); |
| break; |
| } |
| |
| /* The exception wasn't handled so abort the VM. */ |
| return api_abort(vcpu); |
| } |
| |
| /** |
| * Handles EC = 011000, MSR, MRS or System instruction traps. |
| * Returns non-NULL only if the access failed and the vCPU is changing. |
| */ |
| struct vcpu *handle_system_register_access(uintreg_t esr) |
| { |
| struct vcpu *vcpu = current(); |
| spci_vm_id_t vm_id = vcpu->vm->id; |
| uintreg_t ec = GET_EC(esr); |
| const char *direction_str; |
| |
| CHECK(ec == 0x18); |
| |
| /* |
| * Handle accesses to debug and performance monitor registers. |
| * Abort when encountering unhandled register accesses. |
| */ |
| if (debug_el1_is_register_access(esr)) { |
| if (!debug_el1_process_access(vcpu, vm_id, esr)) { |
| goto fail; |
| } |
| } else if (perfmon_is_register_access(esr)) { |
| if (!perfmon_process_access(vcpu, vm_id, esr)) { |
| goto fail; |
| } |
| } else { |
| goto fail; |
| } |
| |
| /* Instruction was fulfilled. Skip it and run the next one. */ |
| vcpu->regs.pc += GET_NEXT_PC_INC(esr); |
| return NULL; |
| |
| fail: |
| direction_str = ISS_IS_READ(esr) ? "read" : "write"; |
| |
| dlog("Unhandled system register %s: op0=%d, op1=%d, crn=%d, " |
| "crm=%d, op2=%d, rt=%d.\n", |
| direction_str, GET_ISS_OP0(esr), GET_ISS_OP1(esr), |
| GET_ISS_CRN(esr), GET_ISS_CRM(esr), GET_ISS_OP2(esr), |
| GET_ISS_RT(esr)); |
| |
| /* Abort if unable to fulfill the register access. */ |
| return api_abort(vcpu); |
| } |